import gym
import tensorflow as tf

# Network, Agent, Replay_Buffer, Parameter_Set and the upper-case constants
# (EPISODE_NUM, INIT_EXPLORATION, BATCH_SIZE, ...) are defined elsewhere in
# this project.


def dqn_argo(param_set: Parameter_Set, max_reward):
    # Create the agent (online and target networks)
    network = Network(action_dim=2)
    target_network = Network(action_dim=2)
    agent = Agent(network=network,
                  target_network=target_network,
                  eps_start=param_set.eps_init,
                  eps_anneal=param_set.eps_anneal,
                  eps_min=param_set.eps_min,
                  lr=param_set.lr,
                  gamma=param_set.gamma)

    # Create the environment and the replay buffer
    env = gym.make('CartPole-v0')
    replay_buffer = Replay_Buffer(param_set.cap)

    # Rolling windows of episode rewards: one for the return value,
    # one for deciding when to save the model
    save_reward_list = [0] * REWARD_SAVE_EVALUATION_SIZE
    reward_list = [0] * REWARD_EVALUATION_SIZE

    # Data collection (how many games to play)
    for i in range(EPISODE_NUM):
        # Get the environment's initial state
        state = env.reset()
        done = False
        # Initialise the episode reward
        episode_reward = 0

        # Play one game to completion (the env provides the done flag)
        while not done:
            if i > INIT_EXPLORATION:
                # Choose the action epsilon-greedily
                action = agent.get_action(state)
            else:
                # Pure random exploration during the first episodes
                action = env.action_space.sample()

            # Step the env with the action; pull back S', r, d
            next_state, reward, done, info = env.step(action)
            # Accumulate the episode reward
            episode_reward += reward
            # Add the transition to the replay buffer
            replay_buffer.add(state, action, next_state, reward, done)
            # Assign S_{t+1} to S_t (state update)
            state = next_state

            loss = tf.constant(0)
            if i > INIT_EXPLORATION:
                # Train the neural network: sample one minibatch and reuse it
                sample = replay_buffer.sample(BATCH_SIZE)
                if sample:
                    loss = agent.update(sample)

        # Periodically synchronise the target network
        if i % param_set.q_update == 0:
            agent.network_synchronize()

        reward_list[i % REWARD_EVALUATION_SIZE] = episode_reward
        save_reward_list[i % REWARD_SAVE_EVALUATION_SIZE] = episode_reward

        # Save the model whenever the rolling average sets a new record
        if sum(save_reward_list) / len(save_reward_list) >= max_reward:
            print("New best record!!!")
            agent.save(SAVE_DIRECTORY + SAVE_FILE)
            max_reward = sum(save_reward_list) / len(save_reward_list)

    return sum(reward_list) / len(reward_list), max_reward
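# A minimal usage sketch, not from the original source: it assumes
# Parameter_Set is a simple container accepting as keyword arguments the
# fields that dqn_argo reads (eps_init, eps_anneal, eps_min, lr, gamma,
# cap, q_update); the hyperparameter values below are illustrative only.
if __name__ == "__main__":
    best = 0.0
    for lr in (1e-3, 5e-4):
        params = Parameter_Set(eps_init=1.0, eps_anneal=0.001, eps_min=0.05,
                               lr=lr, gamma=0.99, cap=10000, q_update=10)
        avg_reward, best = dqn_argo(params, best)
        print(f"lr={lr}: average reward over the last window = {avg_reward:.1f}")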
            # render()
            # print("Collisions detected by agent(s)",
            #       ', '.join(str(a) for a in obs if is_collision(a)))
            # break
            if done['__all__']:
                break

        # Epsilon decay
        if flags.train:
            eps = max(0.01, flags.epsilon_decay * eps)

        # Save some training statistics in their respective deques
        tasks_finished = sum(done[i] for i in range(flags.num_agents))
        done_window.append(tasks_finished / max(1, flags.num_agents))
        collisions_window.append(1. if collision else 0.)
        scores_window.append(score / max_steps)
        steps_window.append(steps_taken)

        # Generate training reports, saving our progress every so often
        print(get_report(), end=" ")
        if episode % flags.report_interval == 0:
            print(get_report(show_time=True))
            start_time = time.time()
            if flags.train:
                agent.save(project_root / 'checkpoints', episode, eps)

            # Add stats to the tensorboard summary
            summary.add_scalar('performance/avg_score', np.mean(scores_window), episode)
            summary.add_scalar('performance/avg_steps', np.mean(steps_window), episode)
            summary.add_scalar('performance/completions', np.mean(done_window), episode)
            summary.add_scalar('performance/collisions', np.mean(collisions_window), episode)
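# Hypothetical setup for the rolling-statistics windows referenced above; the
# real definitions live earlier in the script. Fixed-length deques make the
# np.mean(...) calls average over only the most recent episodes, so the
# tensorboard curves track a moving window rather than the whole run.
from collections import deque

WINDOW_SIZE = 100  # assumed window length, for illustration only
scores_window = deque(maxlen=WINDOW_SIZE)      # normalised episode scores
steps_window = deque(maxlen=WINDOW_SIZE)       # steps taken per episode
done_window = deque(maxlen=WINDOW_SIZE)        # fraction of agents finished
collisions_window = deque(maxlen=WINDOW_SIZE)  # 1.0 if any collision occurred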