def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)

    optimizer = optim.Adam(net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    net.to(device)
    net.train()
    running_score = 0
    steps = 0
    loss = 0

    for e in range(3000):
        done = False
        memory = Memory()

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        loss = QNet.train_model(net, memory.sample(), optimizer)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
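
### The Memory class used above is not part of this excerpt. A minimal sketch that
### matches how it is used here (one push per step, sample() returning the whole
### episode as a batch of fields) could look like the hypothetical EpisodeMemory
### below; the class and field names are assumptions, not the project's actual code.
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'next_state', 'action', 'reward', 'mask'))

class EpisodeMemory:
    """Hypothetical stand-in for Memory: stores one episode, returns it as one batch."""
    def __init__(self):
        self.memory = []

    def push(self, state, next_state, action, reward, mask):
        # Append one transition per environment step.
        self.memory.append(Transition(state, next_state, action, reward, mask))

    def sample(self):
        # Return every stored transition, grouped field-by-field
        # (e.g. batch.state is a tuple of all state tensors).
        return Transition(*zip(*self.memory))

    def __len__(self):
        return len(self.memory)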

### Example #2 (File: train.py, Project: kaznyan/temp)
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network
    ### For a given input it outputs both pi(a|s) and Q(s, a)
    ### The two output heads share the same dimensionality and number of units
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Miscellaneous initialization
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0

    df = pd.DataFrame(index=range(10000), columns=["steps", "loss_policy", "loss_value"])

    for e in range(10000):
        done = False
        ### Learn one step at a time, without even keeping a memory of the episode

        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        lp = []
        lv = []
        while not done:
            steps += 1

            ### No epsilon-greedy: the per-action values are converted directly into probabilities to pick the action
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            transition = [state, next_state, action, reward, mask]

            score += reward
            state = next_state

            ### Train on this single step's transition immediately, every step
            loss, loss_policy, loss_value = QNet.train_model(net, optimizer, transition)
            # loss = QNet.train_model(net, optimizer, transition)
            lp.append(loss_policy.item())
            lv.append(loss_value.item())

        lp = np.asarray(lp[:-1]).sum() / (len(lp) - 1)
        lv = np.asarray(lv[:-1]).sum() / (len(lv) - 1)
        print("Ep {0:04d}: {1} step, loss_policy: {2}, loss_value: {3}".format(e, steps - steps_before, lp, lv))
        # print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        df.loc[e, "steps"]       = steps - steps_before
        df.loc[e, "loss_policy"] = lp
        df.loc[e, "loss_value"]  = lv
        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))

        if running_score > goal_score:
            break
    df.to_csv("loss.csv")
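
### QNet.train_model is not shown in this excerpt. Given the comments above (the
### network outputs pi(a|s) and Q(s, a), and a single transition is learned per step),
### a one-step actor-critic style update is one plausible shape for it. Everything in
### the sketch below is an assumption: the forward pass returning (policy, q_values),
### the discount factor gamma, and the exact loss terms.
import torch
import torch.nn.functional as F

def train_on_transition(net, optimizer, transition, gamma=0.99):
    state, next_state, action, reward, mask = transition

    policy, q_values = net(state)              # assumed: pi(a|s) -> (1, A), Q(s, .) -> (1, A)
    _, next_q_values = net(next_state)

    # One-step TD target for the chosen action; mask zeroes the bootstrap term
    # on the terminal transition.
    q_sa = q_values.squeeze(0)[action]
    td_target = reward + mask * gamma * next_q_values.squeeze(0).max().detach()
    loss_value = F.mse_loss(q_sa, td_target)

    # Policy term: increase log pi(a|s) in proportion to the (detached) action value.
    log_prob = torch.log(policy.squeeze(0)[action] + 1e-8)
    loss_policy = -log_prob * q_sa.detach()

    loss = loss_policy + loss_value
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss, loss_policy, loss_value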

### Example #3
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network
    ### For a given input it outputs pi(a|s) and V(s)
    ### V has a single output; the advantage function is computed at training time
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Miscellaneous initialization
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0

    df = pd.DataFrame(index=range(10000),
                      columns=["steps", "loss_policy", "loss_value"])

    memory = Memory()

    for e in range(10000):
        done = False
        ### Unlike the previous example, transitions are accumulated in a shared memory across episodes

        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### No epsilon-greedy: the per-action values are converted directly into probabilities to pick the action
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1

            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score

        if e % 16 == 0:
            ### Every 16 episodes, train on everything accumulated in memory
            loss, loss_policy, loss_value = QNet.train_model(
                net, optimizer, memory.sample())
            ### Reset the memory
            memory = Memory()

            df.loc[e, "steps"] = running_score
            df.loc[e, "loss_policy"] = loss_policy
            df.loc[e, "loss_value"] = loss_value

            print(
                "Ep {0:04d}: score: {1:02d}, loss_policy: {2}, loss_value: {3}"
                .format(e, int(running_score), loss_policy, loss_value))

        if running_score > goal_score:
            break
    df.to_csv("loss.csv")
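
### QNet.train_model for this variant is also not shown. The comments say the network
### outputs pi(a|s) and a single V(s) and that the advantage is computed at training
### time, so a batched advantage-actor-critic update is a plausible shape for it. The
### field layout mirrors the hypothetical EpisodeMemory/Transition sketch above, and
### gamma and the forward-pass signature are assumptions.
import torch

def train_on_batch(net, optimizer, batch, gamma=0.99):
    states = torch.cat(batch.state)                    # (N, num_inputs)
    next_states = torch.cat(batch.next_state)
    actions = torch.stack(batch.action)                # one-hot, (N, num_actions)
    rewards = torch.Tensor(batch.reward)               # (N,)
    masks = torch.Tensor(batch.mask)

    policy, value = net(states)                        # assumed: pi(a|s) -> (N, A), V(s) -> (N, 1)
    _, next_value = net(next_states)

    # Advantage A(s, a) = r + gamma * V(s') - V(s); mask drops the bootstrap at episode end.
    td_target = rewards + gamma * masks * next_value.squeeze(-1).detach()
    advantage = td_target - value.squeeze(-1)

    log_prob = torch.log((policy * actions).sum(dim=1) + 1e-8)
    loss_policy = -(log_prob * advantage.detach()).mean()
    loss_value = advantage.pow(2).mean()

    loss = loss_policy + loss_value
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss, loss_policy, loss_value

### Example #4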
def test(level_list, render=True):
    online_net = QNet(h=84, w=84, outputs=36)
    online_net.load_state_dict(torch.load('saved/online_net.pt'))

    online_net.to(device)

    cnt = 0
    death = 0
    total_reward = 0.0

    str_level_list = [LEVEL_SET[idx - 1] for idx in level_list]
    for level in str_level_list:
        env = make_retro(game=env_name,
                         state=level,
                         use_restricted_actions=retro.Actions.DISCRETE)

        obs = env.reset()
        state = torch.Tensor(obs).to(device).permute(2, 0, 1)
        #state = state.view(state.size()[0], -1)
        state = state.unsqueeze(0)

        previous_lives = 3
        previous_level = level_list[cnt]
        cnt += 1
        if death >= 3:
            break

        for t in count():
            action = online_net.get_action(state.to(device))

            if render:
                env.render()
                time.sleep(0.02)

            next_state, reward, done, info = env.step(action)

            next_state = torch.Tensor(next_state).permute(2, 0, 1)
            #next_state = next_state.view(next_state.size()[0], -1)
            next_state = next_state.unsqueeze(0)

            total_reward += reward

            current_lives = info['lives']
            current_level = info['level']

            if current_lives != previous_lives:
                print('Dead')
                previous_lives = info['lives']
                death += 1
                #if death >= 3:
                #    print("Finished ", level, " Total reward: {}".format(total_reward))
                #    break

            if current_level != previous_level:
                print('Stage changed')
                print("Finished ", level,
                      " Total reward: {}".format(total_reward))
                break

            state = next_state

            if done:
                print('All lives gone')
                print("Finished ", level,
                      " Total reward: {}".format(total_reward))
                break

        env.close()
    return
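
### make_retro and LEVEL_SET are defined elsewhere in this project and are not shown
### here. A minimal stand-in is sketched below, assuming make_retro simply forwards its
### arguments to retro.make (the real helper probably adds preprocessing wrappers) and
### LEVEL_SET is an ordered list of the game's save-state names; the placeholder names
### below are not the real states.
import retro

LEVEL_SET = ['Level1', 'Level2', 'Level3']     # placeholder state names

def make_retro(game, state, use_restricted_actions=retro.Actions.DISCRETE):
    # Bare environment construction; wrappers (frame skip, resizing, etc.) omitted.
    return retro.make(game=game, state=state,
                      use_restricted_actions=use_restricted_actions)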

### Example #5
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    steps = 0
    beta = beta_start
    loss = 0

    for e in range(3000):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = target_net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = np.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                beta += 0.00005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} |  beta: {:.2f}'.format(
                e, running_score, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
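
### Memory_With_TDError is a prioritized replay buffer that is not shown here:
### transitions are sampled with probability proportional to their TD error, and beta
### anneals the importance-sampling correction toward 1. The sketch below shows how the
### returned per-sample weights typically enter the TD loss inside train_model; the
### field layout, forward-pass signature, and gamma are assumptions.
import numpy as np
import torch

def weighted_td_loss(online_net, target_net, batch, weights, gamma=0.99):
    states = torch.cat(batch.state)                       # (N, num_inputs)
    next_states = torch.cat(batch.next_state)
    actions = torch.Tensor(np.vstack(batch.action))       # one-hot, (N, num_actions)
    rewards = torch.Tensor(batch.reward)                  # (N,)
    masks = torch.Tensor(batch.mask)
    weights = torch.Tensor(weights)                       # importance-sampling weights, (N,)

    q_values = online_net(states)                         # assumed: (N, num_actions)
    next_q_values = target_net(next_states).detach()

    q_sa = (q_values * actions).sum(dim=1)
    target = rewards + gamma * masks * next_q_values.max(dim=1)[0]

    # Per-sample squared TD error, re-weighted by the IS weights before averaging.
    td_error = target - q_sa
    return (weights * td_error.pow(2)).mean()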

### Example #6
def main():
    ### Initialize the environment
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    ### Build the policy network
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    net.to(device)
    net.train()

    ### Miscellaneous initialization
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0

    for e in range(10000):
        done = False
        ### The memory is emptied every episode (so effectively there is no experience replay)
        memory = Memory()

        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            ### No epsilon-greedy: the per-action values are converted directly into probabilities to pick the action
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        ### Train on the whole episode at once
        ### memory.sample does not pick random samples; it returns the entire episode's memory
        loss = QNet.train_model(net, optimizer, memory.sample())

        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))

        if running_score > goal_score:
            break
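
### QNet.train_model here consumes the whole episode at once, and actions are sampled
### from the network's output probabilities, so a REINFORCE-style update over discounted
### returns is one plausible shape for it. Everything below is an assumption: the field
### layout (mirroring the EpisodeMemory/Transition sketch above), gamma, and the forward
### pass returning action probabilities.
import torch

def reinforce_update(net, optimizer, batch, gamma=0.99):
    states = torch.cat(batch.state)                # (T, num_inputs)
    actions = torch.stack(batch.action)            # one-hot, (T, num_actions)

    # Discounted return G_t for every step, computed backwards from the end of the episode.
    returns = []
    G = 0.0
    for r in reversed(batch.reward):
        G = r + gamma * G
        returns.insert(0, G)
    returns = torch.Tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)   # optional normalization

    probs = net(states)                            # assumed: pi(a|s), (T, num_actions)
    log_prob = torch.log((probs * actions).sum(dim=1) + 1e-8)
    loss = -(log_prob * returns).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss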