Example #1
File: dqn_main.py  Project: waderaku/DQN
def dqn_argo(param_set: Parameter_Set, max_reward):
    # Create the agent (online and target Q-networks)
    network = Network(action_dim=2)
    target_network = Network(action_dim=2)
    agent = Agent(network=network,
                  target_network=target_network,
                  eps_start=param_set.eps_init,
                  eps_anneal=param_set.eps_anneal,
                  eps_min=param_set.eps_min,
                  lr=param_set.lr,
                  gamma=param_set.gamma)

    # Create the CartPole environment
    env = gym.make('CartPole-v0')

    # Replay buffer with capacity param_set.cap
    replay_buffer = Replay_Buffer(param_set.cap)

    save_reward_list = [0] * REWARD_SAVE_EVALUATION_SIZE
    reward_list = [0] * REWARD_EVALUATION_SIZE

    # Collect data (number of episodes to play)
    for i in range(EPISODE_NUM):

        # Reset the environment and get the initial state
        state = env.reset()
        done = False

        # Initialize the episode reward
        episode_reward = 0

        # Play one full episode (the environment signals when it is done)
        while not done:

            if i > INIT_EXPLORATION:
                # Choose an action with the ε-greedy policy
                action = agent.get_action(state)
            else:
                action = env.action_space.sample()

            # Step the environment with the action to get next state, reward, and done flag
            next_state, reward, done, info = env.step(action)

            # Accumulate the episode reward
            episode_reward += reward

            # Add the transition to the replay buffer
            replay_buffer.add(state, action, next_state, reward, done)

            # Update the state: the next state becomes the current state
            state = next_state

        # Default loss for episodes in which no training step is performed
        loss = tf.constant(0)

        if i > INIT_EXPLORATION:
            # Train the Q-network on a minibatch from the replay buffer
            sample = replay_buffer.sample(BATCH_SIZE)
            if sample:
                loss = agent.update(sample)

            # Periodically copy the online network weights into the target network
            if i % param_set.q_update == 0:
                agent.network_synchronize()

            reward_list[i % REWARD_EVALUATION_SIZE] = episode_reward

            save_reward_list[i % REWARD_SAVE_EVALUATION_SIZE] = episode_reward

            # Save the model whenever the running average reward sets a new best
            avg_reward = sum(save_reward_list) / len(save_reward_list)
            if avg_reward >= max_reward:
                print("New best average reward!")
                agent.save(SAVE_DIRECTORY + SAVE_FILE)
                max_reward = avg_reward
    return sum(reward_list) / len(reward_list), max_reward
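
The excerpt above relies on module-level pieces not shown here: the constants (EPISODE_NUM, INIT_EXPLORATION, BATCH_SIZE, REWARD_EVALUATION_SIZE, REWARD_SAVE_EVALUATION_SIZE, SAVE_DIRECTORY, SAVE_FILE), the project's own Network, Agent, and Replay_Buffer classes, plus `import gym` and `import tensorflow as tf`. Below is a minimal sketch of how dqn_argo might be driven over a small hyperparameter grid; the Parameter_Set definition and all values are illustrative assumptions, not taken from the project.

# Hypothetical driver; the field names mirror those read by dqn_argo above.
from dataclasses import dataclass

@dataclass
class Parameter_Set:
    eps_init: float = 1.0
    eps_anneal: float = 1e-4
    eps_min: float = 0.05
    lr: float = 1e-3
    gamma: float = 0.99
    cap: int = 10_000      # replay buffer capacity
    q_update: int = 10     # target network sync interval (episodes)


if __name__ == "__main__":
    max_reward = 0.0
    for lr in (1e-3, 5e-4):
        for gamma in (0.95, 0.99):
            params = Parameter_Set(lr=lr, gamma=gamma)
            avg_reward, max_reward = dqn_argo(params, max_reward)
            print(f"lr={lr} gamma={gamma} -> avg reward {avg_reward:.1f}")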
Example #2
        #     render()
        #     print("Collisions detected by agent(s)", ', '.join(str(a) for a in obs if is_collision(a)))
        #     break
        if done['__all__']: break

    # Epsilon decay
    if flags.train: eps = max(0.01, flags.epsilon_decay * eps)

    # Save some training statistics in their respective deques
    tasks_finished = sum(done[i] for i in range(flags.num_agents))
    done_window.append(tasks_finished / max(1, flags.num_agents))
    collisions_window.append(1. if collision else 0.)
    scores_window.append(score / max_steps)
    steps_window.append(steps_taken)

    # Generate training reports, saving our progress every so often
    print(get_report(), end=" ")
    if episode % flags.report_interval == 0:
        print(get_report(show_time=True))
        start_time = time.time()
        if flags.train: agent.save(project_root / 'checkpoints', episode, eps)

    # Add stats to the tensorboard summary
    summary.add_scalar('performance/avg_score', np.mean(scores_window),
                       episode)
    summary.add_scalar('performance/avg_steps', np.mean(steps_window), episode)
    summary.add_scalar('performance/completions', np.mean(done_window),
                       episode)
    summary.add_scalar('performance/collisions', np.mean(collisions_window),
                       episode)