def train_ql(size, lr, rd, eps_start=1.0, eps_end=0.05, eps_decay=0.999):
    """Train a tabular Q-learning agent on the 2048 gym environment.

    Args:
        size: board side length (the board has size ** 2 tiles).
        lr: learning rate forwarded to model.QLearning.
        rd: reward decay (discount factor) forwarded to model.QLearning.
        eps_start: initial epsilon for epsilon-greedy action selection.
        eps_end: floor for epsilon.
        eps_decay: per-episode multiplicative decay applied to epsilon.

    Side effects: prints running statistics, runs a 1000-episode eval at
    the end, and prints a final summary.
    """
    env = gym.make('game2048-v0', size=size)
    agent = model.QLearning(env.action_space, learning_rate=lr, reward_decay=rd)
    total_steps = 0
    total_scores = 0
    highest_score = 0
    # trials = 1 * 100000 * (size ** 2)
    trials = 400000
    # Sliding statistics over the most recent 100 episodes.
    rewards_window = deque(maxlen=100)
    scores_window = deque(maxlen=100)
    eps = eps_start
    for trial in range(1, trials + 1):
        obs = env.reset()
        # States are keyed in the Q-table by the stringified flat board.
        obs = str(obs.reshape(size ** 2).tolist())
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            total_steps += 1
            # FIX: obs is already a string key here; the original wrapped
            # it in str() a second time, which was redundant.
            action = agent.choose_action(obs, eps)
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            if done:
                # Collapse all terminal boards into one table entry.
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_)
            obs = obs_
            rewards += reward
            if done:
                break
            # env.render()
        eps = max(eps_end, eps_decay * eps)
        rewards_window.append(rewards)
        scores_window.append(env.get_score())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
        # Overwrite the progress line in place; persist it every 100 episodes.
        print('\rEpisode {}\t total_steps: {}\t Average Rewards: {:.2f}\t Average Scores: {:.2f} {}'.
              format(trial, total_steps, np.mean(rewards_window),
                     np.mean(scores_window), eps), end="")
        if trial % 100 == 0:
            print('\rEpisode {}\t total_steps: {}\t Average Rewards: {:.2f}\t Average Scores: {:.2f} {}'.
                  format(trial, total_steps, np.mean(rewards_window),
                         np.mean(scores_window), eps))
    eval(env, agent, 1000, render=False)
    print(f'table_len: {len(agent.q_table)} steps: {total_steps} avg_score: {total_scores / trials} \
highest_score: {highest_score} at size: {size} lr: {lr} reward_decay: {rd}')
    print(f'table_len: {len(agent.q_table)} steps: {total_steps}')
def train_sarsa(size, lr, rd):
    """Train a tabular SARSA agent on the 2048 gym environment.

    Args:
        size: board side length (the board has size ** 2 tiles).
        lr: learning rate forwarded to model.Sarsa.
        rd: reward decay (discount factor) forwarded to model.Sarsa.

    Side effects: prints per-episode progress, runs eval() at the end, and
    prints summary statistics including how often target tiles were reached.
    """
    env = gym.make('game2048-v0', size=size)
    agent = model.Sarsa(env.action_space, learning_rate=lr, reward_decay=rd)
    total_steps = 0
    total_scores = 0
    highest_score = 0
    # FIX: these dicts were read and written below but never initialized in
    # this function, so the first episode reaching a target tile raised
    # NameError (and len(highest)/len(targets) at the end would too).
    # NOTE(review): if they were intended as module-level globals, confirm
    # and remove these local initializations.
    highest = {}
    targets = {}
    trials = 1 * 1000 * (size ** 2)
    for trial in range(trials):
        obs = env.reset()
        # States are keyed in the Q-table by the stringified flat board.
        obs = str(obs.reshape(size ** 2).tolist())
        action = agent.choose_action(obs)
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            total_steps += 1
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            # SARSA is on-policy: pick the successor action before learning.
            action_ = agent.choose_action(obs_, True)
            if done:
                # Collapse all terminal boards into one table entry.
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_, action_)
            obs = obs_
            action = action_
            rewards += reward
            if done:
                break
            # env.render()
        print(f'Completed in {trial} use {stepno} steps highest: \
{env.highest()} rewards: {rewards}', end="")
        # Record trials that reached the half-target / full-target tile.
        if env.highest() >= 2 ** (size ** 2 - 1):
            highest[trial] = env.highest()
        if env.highest() >= 2 ** (size ** 2):
            targets[trial] = env.highest()
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
        stepno = 0
        rewards = 0
    eval(env, agent, render=False)
    print(f'table_len: {len(agent.q_table)} steps: {total_steps} avg_score: {total_scores / trials} \
highest_score: {highest_score} at size: {size} lr: {lr} reward_decay: {rd}')
    print(f'highest len: {len(highest)} prob: {len(highest) * 1.0 / trials} \
target len: {len(targets)} prob: {len(targets) * 1.0 / trials}')
def eval(env, agent, times=1000, render=False):
    """Evaluate a tabular agent for `times` episodes and report scores.

    Args:
        env: the 2048 gym environment.
        agent: tabular agent exposing choose_action(obs) and learn(...).
        times: number of evaluation episodes.
        render: when True, render the board and print each transition.

    Side effects: plots the score curve via plot_score() and prints the
    average and highest score.
    """
    if False:  # flip to dump the exploration table before evaluating
        write_explore(agent, 'explore_old.file')
    highest_score = 0
    total_scores = 0
    size = env.get_size()
    scores = []
    max_tiles = []
    for i in range(times):
        obs = env.reset()
        # States are keyed by the stringified flat board.
        obs = str(obs.reshape(size ** 2).tolist())
        while True:
            action = agent.choose_action(obs)
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            if render:
                print(f'action is: {action} {obs} {obs_}')
                env.render()
            if obs_ == obs:
                # The move left the board unchanged: keep learning here so
                # the agent stops repeatedly picking a no-op action.
                agent.learn(obs, action, reward, obs_)
            obs = obs_
            if done:
                break
        if render:
            # FIX: this per-episode render was unconditional in the
            # original, ignoring the render flag; now honors it.
            env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
    if times > 0:
        plot_score(scores, max_tiles)
        print(f'eval avg_score: {total_scores / times} highest_score: {highest_score}')
    if False:  # flip to dump the exploration table after evaluating
        write_explore(agent, 'explore_new.file')
def test_env(model, vis=False):
    """Roll out one episode with the given policy network on the global env.

    Args:
        model: policy network; called with a batched FloatTensor observation
            and expected to return (action distribution, value).
        vis: when True, render the environment each step.

    Returns:
        (total_reward, final score) for the episode.
    """
    obs = env.reset()
    if vis:
        env.render()
    finished = False
    episode_reward = 0
    while not finished:
        # Batch the observation and move it to the active device.
        obs_tensor = torch.FloatTensor(obs).unsqueeze(0).to(device)
        dist, _ = model(obs_tensor)
        # Sample an action from the policy distribution.
        chosen_action = dist.sample().cpu().numpy()[0]
        next_obs, reward, finished, _ = env.step(chosen_action)
        obs = next_obs
        if vis:
            env.render()
        episode_reward += reward
    return episode_reward, env.get_score()
def eval(env, agent, times=1000, render=False):
    """Evaluate a DQN-style agent for `times` episodes and report scores.

    Args:
        env: the 2048 gym environment.
        agent: agent exposing choose_action(obs, eps, rand=...) returning
            (action, action_values).
        times: number of evaluation episodes.
        render: when True, render the board each step.

    Side effects: plots the score curve via plot_score() and prints the
    average and highest score.
    """
    highest_score = 0
    scores = []
    max_tiles = []
    eps = 0.0  # greedy evaluation: no exploration
    random = False
    for i in range(times):
        obs = env.reset()
        while True:
            action, action_values = agent.choose_action(obs, eps, rand=random)
            obs_, reward, done, _ = env.step(action)
            if render:
                env.render()
            if str(obs_) == str(obs):
                # The greedy move left the board unchanged: fall back to a
                # random action next step to break the deadlock.
                random = True
                # NOTE(review): this debug print fires unconditionally on
                # every stuck step — consider guarding it with `render`.
                print(
                    f'action is: {action} {reward} {action_values} {obs} {obs_}'
                )
            else:
                random = False
            obs = obs_
            if done:
                break
        if render:
            # FIX: this per-episode render was unconditional in the
            # original, ignoring the render flag; now honors it.
            env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
    if times > 0:
        plot_score(scores, max_tiles)
        print(
            f'eval avg_score: {np.mean(scores)} highest_score: {highest_score}'
        )
def train_dqn(size, agt, eps_start=1.0, eps_end=0.05, eps_decay=0.999):
    """Train a DQN agent on the 2048 gym environment.

    Args:
        size: board side length (the board has size ** 2 tiles).
        agt: unused in this body — presumably an agent-type selector;
            TODO confirm against callers.
        eps_start: initial epsilon for epsilon-greedy action selection.
        eps_end: floor for epsilon.
        eps_decay: per-episode multiplicative decay applied to epsilon.

    Side effects: periodically checkpoints the model, runs a 1000-episode
    eval at the end, prints progress, and plots the reward curve.
    """
    env = gym.make('game2048-v0', size=size, norm=FLAGS.norm)
    env.seed(1)  # fixed seed for reproducible training runs
    if FLAGS.norm:
        # Normalized observations: one channel per tile plus two extras.
        channels = size * size + 2
    else:
        channels = 1
    agent = model.DQNAgent(size, channels, 4, 0, FLAGS.double_q, FLAGS.dueling)
    if FLAGS.model_file:
        # Resume from an existing checkpoint if one was supplied.
        print(f'load {FLAGS.model_file}')
        agent.load(FLAGS.model_file)
    total_steps = 0
    total_scores = 0
    highest_score = 0
    trials = 10000
    eps = eps_start
    # Sliding statistics over the most recent WINDOWS_SIZE episodes.
    scores_window = deque(maxlen=WINDOWS_SIZE)
    rewards_window = deque(maxlen=WINDOWS_SIZE)
    scores = []
    sd_name = 'model_%dx%d.checkpoint' % (size, size)
    random = False
    for trial in range(1, trials + 1):
        obs = env.reset()
        stepno = 0
        rewards = 0
        loss = 0
        while True:
            stepno += 1
            total_steps += 1
            action, _ = agent.choose_action(obs, eps, rand=random)
            obs_, reward, done, _ = env.step(action)
            # If the board did not change, force a random action next step
            # to break out of a no-op loop.
            random = np.all(obs == obs_)
            loss = agent.step(obs, action, reward, obs_, done)
            obs = obs_
            rewards += reward
            if done:
                break
        # Decay exploration once per episode, floored at eps_end.
        eps = max(eps_end, eps * eps_decay)
        rewards_window.append(rewards)
        scores_window.append(env.get_score())
        scores.append(rewards)
        # env.render()
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
        # Overwrite the progress line in place; persist it every window.
        print(
            '\rEpisode {}\t Steps: {}\t\t Average Reward: {:.2f}\t\t Average Scores: {:.2f}\t loss: {:.2f}\t highest: {}\t eps: {:.4f}'
            .format(trial, total_steps, np.mean(rewards_window),
                    np.mean(scores_window), loss, highest_score, eps),
            end="")
        if trial % WINDOWS_SIZE == 0:
            print(
                '\rEpisode {}\t Steps: {}\t\t Average Reward: {:.2f}\t\t Average Scores: {:.2f}\t loss: {:.2f}\t highest: {}\t eps: {:.4f}'
                .format(trial, total_steps, np.mean(rewards_window),
                        np.mean(scores_window), loss, highest_score, eps))
        if trial % 1000 == 0:
            # Periodic checkpoint.
            agent.save(sd_name)
    eval(env, agent, 1000, render=False)
    print(f'steps: {total_steps} avg_score: {total_scores / trials} \
highest_score: {highest_score} at size: {size}')
    plot_score(scores, [])