def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    net.to(device)
    net.train()
    running_score = 0
    steps = 0
    loss = 0

    for e in range(3000):
        done = False
        # A fresh Memory per episode: the whole episode is trained on at once.
        memory = Memory()

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            # Penalize early termination, except for a full 500-step episode.
            reward = reward if not done or score == 499 else -1

            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        # Train once per episode on the full trajectory.
        loss = QNet.train_model(net, memory.sample(), optimizer)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
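# The loop above assumes a `Memory` container with `push` and `sample`.
# A minimal sketch of one plausible implementation, assuming `sample`
# returns the whole stored episode as a single batch (the actual class
# lives elsewhere in this repo; the names here are illustrative):
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'next_state', 'action', 'reward', 'mask'))

class MemorySketch:
    def __init__(self):
        self.memory = []

    def push(self, state, next_state, action, reward, mask):
        self.memory.append(Transition(state, next_state, action, reward, mask))

    def sample(self):
        # Not a random minibatch: returns every stored transition,
        # transposed into one Transition of tuples.
        return Transition(*zip(*self.memory))

    def __len__(self):
        return len(self.memory)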
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network.
    ### For a given input it outputs pi(a|s) and Q(s, a);
    ### the two heads have the same dimensions and unit counts.
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Initialize bookkeeping.
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0
    df = pd.DataFrame(index=range(10000),
                      columns=["steps", "loss_policy", "loss_value"])

    for e in range(10000):
        done = False
        ### Learn step by step, without keeping memory for even one episode.

        ### Reset the environment to its initial state.
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        lp = []
        lv = []

        while not done:
            steps += 1

            ### No epsilon: each action's evaluation is converted directly
            ### into a probability, and the action is sampled from those.
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            transition = [state, next_state, action, reward, mask]

            score += reward
            state = next_state

            ### Train at every step, on the result of that step only.
            loss, loss_policy, loss_value = QNet.train_model(net, optimizer, transition)
            # loss = QNet.train_model(net, optimizer, transition)
            lp.append(loss_policy.item())
            lv.append(loss_value.item())

        # Average the per-step losses, excluding the terminal step.
        lp = np.asarray(lp[:-1]).sum() / (len(lp) - 1)
        lv = np.asarray(lv[:-1]).sum() / (len(lv) - 1)
        print("Ep {0:04d}: {1} step, loss_policy: {2}, loss_value: {3}".format(
            e, steps - steps_before, lp, lv))
        # print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        df.loc[e, "steps"] = steps - steps_before
        df.loc[e, "loss_policy"] = lp
        df.loc[e, "loss_value"] = lv
        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))

        if running_score > goal_score:
            break

    df.to_csv("loss.csv")
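# The per-step update above calls QNet.train_model(net, optimizer, transition)
# and gets back (loss, loss_policy, loss_value). A sketch of what such an
# update might look like for a net whose heads output pi(a|s) and Q(s, a),
# as the comment above describes -- an illustration under those assumptions,
# not the repo's actual implementation:
import torch
import torch.nn.functional as F

def train_model_sketch(net, optimizer, transition, gamma=0.99):
    state, next_state, action, reward, mask = transition
    policy, q_values = net(state)                # assumed heads: pi(.|s), Q(s, .)
    with torch.no_grad():
        next_policy, next_q = net(next_state.to(q_values.device))
        # Expected value of the next state under the current policy.
        next_value = (next_policy * next_q).sum(dim=-1)

    target = reward + mask * gamma * next_value  # one-step bootstrap target
    q_sa = q_values.squeeze(0)[action]
    log_prob = torch.log(policy.squeeze(0)[action] + 1e-8)

    loss_policy = -log_prob * q_sa.detach()      # actor follows the critic
    loss_value = F.mse_loss(q_sa, target.squeeze())
    loss = loss_policy + loss_value

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss, loss_policy, loss_value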
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network.
    ### For a given input it outputs pi(a|s) and V(s).
    ### V has a single output, but the advantage function is computed
    ### from it at training time.
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Initialize bookkeeping.
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0
    df = pd.DataFrame(index=range(10000),
                      columns=["steps", "loss_policy", "loss_value"])
    memory = Memory()

    for e in range(10000):
        done = False
        ### Transitions accumulate in `memory` across episodes;
        ### see the batched update below.

        ### Reset the environment to its initial state.
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### No epsilon: each action's evaluation is converted directly
            ### into a probability, and the action is sampled from those.
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score

        if e % 16 == 0:
            ### Every 16 episodes, train on the accumulated transitions at once.
            loss, loss_policy, loss_value = QNet.train_model(
                net, optimizer, memory.sample())
            ### Reset the memory.
            memory = Memory()
            df.loc[e, "steps"] = running_score  # logs the running score, not a raw step count
            df.loc[e, "loss_policy"] = float(loss_policy)
            df.loc[e, "loss_value"] = float(loss_value)
            print(
                "Ep {0:04d}: score: {1:02d}, loss_policy: {2}, loss_value: {3}"
                .format(e, int(running_score), loss_policy, loss_value))

        if running_score > goal_score:
            break

    df.to_csv("loss.csv")
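# A sketch of the batched advantage-actor-critic update assumed by
# QNet.train_model(net, optimizer, memory.sample()) above, where the batch
# holds the transitions accumulated since the last update (possibly spanning
# several episodes). This is an assumption about the loss, not the repo's
# actual code: discounted returns are compared against V(s) to form the
# advantage that weights the policy gradient.
import torch
import torch.nn.functional as F

def train_model_batch_sketch(net, optimizer, batch, gamma=0.99):
    states = torch.cat(batch.state)
    actions = torch.stack(batch.action)          # one-hot actions, as pushed above
    rewards = torch.Tensor(batch.reward)
    masks = torch.Tensor(batch.mask)

    policy, value = net(states)                  # assumed heads: pi(.|s), V(s)
    value = value.squeeze(-1)

    # Discounted return G_t, accumulated backwards; mask == 0 at a terminal
    # step cuts the accumulation at episode boundaries.
    returns = torch.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * masks[t]
        returns[t] = running

    advantage = (returns - value).detach()
    log_prob = torch.log((policy * actions).sum(dim=-1) + 1e-8)

    loss_policy = -(log_prob * advantage).mean()
    loss_value = F.mse_loss(value, returns)
    loss = loss_policy + loss_value

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss, loss_policy, loss_value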
def test(level_list, render=True):
    online_net = QNet(h=84, w=84, outputs=36)
    online_net.load_state_dict(torch.load('saved/online_net.pt'))
    online_net.to(device)

    cnt = 0
    death = 0
    total_reward = 0.0
    # Map 1-based level indices to their state names.
    str_level_list = [LEVEL_SET[idx - 1] for idx in level_list]

    for level in str_level_list:
        env = make_retro(game=env_name, state=level,
                         use_restricted_actions=retro.Actions.DISCRETE)
        obs = env.reset()
        state = torch.Tensor(obs).to(device).permute(2, 0, 1)  # HWC -> CHW
        #state = state.view(state.size()[0], -1)
        state = state.unsqueeze(0)

        previous_lives = 3
        previous_level = level_list[cnt]
        cnt += 1
        if death >= 3:
            break

        for t in count():
            action = online_net.get_action(state.to(device))
            if render:
                env.render()
                time.sleep(0.02)

            next_state, reward, done, info = env.step(action)
            next_state = torch.Tensor(next_state).permute(2, 0, 1)
            #next_state = next_state.view(next_state.size()[0], -1)
            next_state = next_state.unsqueeze(0)
            total_reward += reward

            current_lives = info['lives']
            current_level = info['level']
            if current_lives != previous_lives:
                print('Dead')
                previous_lives = info['lives']
                death += 1
                #if death >= 3:
                #    print("Finished ", level, " Total reward: {}".format(total_reward))
                #    break
            if current_level != previous_level:
                print('Stage changed')
                print("Finished ", level, " Total reward: {}".format(total_reward))
                break

            state = next_state
            if done:
                print('All lives gone')
                print("Finished ", level, " Total reward: {}".format(total_reward))
                break

        env.close()
    return
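# `make_retro` above is a helper defined elsewhere in the repo. A minimal
# sketch of what it plausibly does -- construct the gym-retro environment
# with a discrete action space; any observation preprocessing (resize to
# 84x84, grayscale, frame skip) would also live here but is omitted:
import retro

def make_retro_sketch(game, state, use_restricted_actions=retro.Actions.DISCRETE):
    env = retro.make(game=game, state=state,
                     use_restricted_actions=use_restricted_actions)
    return env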
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    steps = 0
    beta = beta_start
    loss = 0

    for e in range(3000):
        done = False
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = target_net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = np.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                # Anneal the importance-sampling exponent beta toward 1.
                beta += 0.00005
                beta = min(1, beta)
                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights)

                # Periodically sync the target network with the online one.
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | beta: {:.2f}'.format(
                e, running_score, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
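# Two helpers the loop above relies on. `update_target_model` is assumed to
# be the standard hard sync used with DQN target networks:
def update_target_model_sketch(online_net, target_net):
    # Overwrite every target parameter with the current online value.
    target_net.load_state_dict(online_net.state_dict())

# And a sketch of the importance-sampling weights that `memory.sample`
# presumably returns for prioritized replay (an assumption; the real
# Memory_With_TDError is defined elsewhere). With sampling probabilities
# P(i) proportional to TD-error priorities, each weight is
# (N * P(i))^(-beta), normalized by the maximum for stability:
import numpy as np

def is_weights_sketch(probabilities, beta):
    n = len(probabilities)
    weights = (n * np.asarray(probabilities)) ** (-beta)
    return weights / weights.max()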
def main():
    ### Initialize the environment.
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network.
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Initialize bookkeeping.
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0

    for e in range(10000):
        done = False
        ### The Memory is emptied at every episode
        ### (so there is effectively no experience replay).
        memory = Memory()

        ### Reset the environment to its initial state.
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### No epsilon: each action's evaluation is converted directly
            ### into a probability, and the action is sampled from those.
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        ### Train on the whole episode at once.
        ### memory.sample does not pick at random: it returns the episode's
        ### entire memory.
        loss = QNet.train_model(net, optimizer, memory.sample())
        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))

        if running_score > goal_score:
            break
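# The "no epsilon" comments above imply actions are sampled from the policy
# distribution itself rather than chosen epsilon-greedily. A sketch of such
# a `get_action` for a policy-only net (an assumption about the repo's
# method, shown for the softmax-policy case):
import numpy as np
import torch

def get_action_sketch(net, state):
    with torch.no_grad():
        policy = net(state)                    # assumed softmax output pi(.|s)
    probs = policy.squeeze(0).cpu().numpy()
    probs = probs / probs.sum()                # guard against float32 rounding
    # Sample an action index with probability pi(a|s).
    return int(np.random.choice(len(probs), p=probs))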