import os
import time

import numpy as np
import matplotlib.pyplot as plt
from gym.wrappers import Monitor

## Minimum replay memory size before learning starts. The value here is an
## assumption; the DQN paper uses 50,000.
REPLAY_START_SIZE = 50000


def test_one(agent, dir_record, itr):
    agent.env.seed(itr)
    env_record = Monitor(agent.env, directory=dir_record)
    ob = env_record.reset()
    ## Reset the frame window between episodes (mirrors train_one; without
    ## this, stale frames from a previous episode would leak into fs1)
    agent.frame_sequence.reset()
    agent.frame_sequence.insert(atari_img_preprocess(ob))
    while True:
        fs1 = agent.frame_sequence.memory_as_array()
        ## Find next action
        action = agent.next_action()
        ob, reward, done, _ = env_record.step(action)
        agent.frame_sequence.insert(atari_img_preprocess(ob))
        fs2 = agent.frame_sequence.memory_as_array()
        ## Save results into the replay memory, with rewards clipped to [-1, 1]
        agent.replay_memory.insert(fs1, action, np.clip(reward, -1, 1), fs2, done)
        if done:
            break
    #end
    total_reward = env_record.get_episode_rewards()[0]
    env_record.close()
    return total_reward
#end
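## test_one (and train_one below) call atari_img_preprocess, which is not
## defined in this section. The sketch below is a hypothetical stand-in,
## assuming the standard DQN preprocessing: RGB -> grayscale, then resize
## to 84x84 with OpenCV.
import cv2


def atari_img_preprocess(frame):
    ## Convert the raw RGB Atari frame to grayscale
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    ## Downsample to the 84x84 input size used by DQN-style networks
    ## (cv2.resize takes (width, height))
    return cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)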
def train_one(agent, dir_record, seed=None):
    if seed is not None:
        agent.env.seed(seed)
    env_record = Monitor(agent.env, directory=dir_record)
    ob = env_record.reset()
    agent.frame_sequence.reset()
    agent.frame_sequence.insert(atari_img_preprocess(ob))
    while True:
        fs1 = agent.frame_sequence.memory_as_array()
        ## Find next action
        action = agent.next_action()
        ob, reward, done, _ = env_record.step(action)
        agent.frame_sequence.insert(atari_img_preprocess(ob))
        fs2 = agent.frame_sequence.memory_as_array()
        ## Save results into the replay memory
        agent.replay_memory.insert(fs1, action, reward, fs2, done)
        ## Perform learning once the replay memory is warm enough
        if len(agent.replay_memory.memory) >= REPLAY_START_SIZE:
            agent.learn()
        ## If done == True, then this game is finished
        if done:
            break
    #end

    ## Save the model
    agent.save_model(os.path.join(dir_record, 'model.ckpt'))
    total_reward = env_record.get_episode_rewards()[0]
    env_record.close()

    ## Save cost graph per iteration (agent.costs holds (iteration, cost) pairs)
    iterations, costs = zip(*agent.costs)
    fig = plt.figure()
    plt.plot(iterations, costs)
    plt.title('Costs during training the agent')
    plt.xlabel('Iteration')
    plt.ylabel('Cost')
    fig.savefig(os.path.join(dir_record, 'costs.png'))
    plt.close(fig)

    ## Save error graph per iteration
    iterations, errors = zip(*agent.errors)
    fig = plt.figure()
    plt.plot(iterations, errors)
    plt.title('Errors during training the agent')
    plt.xlabel('Iteration')
    plt.ylabel('Error')
    fig.savefig(os.path.join(dir_record, 'errors.png'))
    plt.close(fig)

    return total_reward
#end
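## The agent's frame_sequence is used above only through insert(), reset(),
## and memory_as_array(). A minimal sketch of such a buffer, assuming a fixed
## window over the last 4 preprocessed frames, zero-padded after reset:
from collections import deque


class FrameSequence:
    def __init__(self, length=4, frame_shape=(84, 84)):
        self.length = length
        self.frame_shape = frame_shape
        self.reset()

    def reset(self):
        ## Start each episode from an all-zero window
        self.memory = deque(
            [np.zeros(self.frame_shape, dtype=np.float32)] * self.length,
            maxlen=self.length)

    def insert(self, frame):
        ## Appending to a full deque drops the oldest frame automatically
        self.memory.append(frame)

    def memory_as_array(self):
        ## Stack the window along the last axis: shape (H, W, length)
        return np.stack(self.memory, axis=-1)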
def run(seed, episodes, batch_size, gamma, inverting_gradients,
        initial_memory_threshold, replay_memory_size, epsilon_steps,
        tau_actor, tau_actor_param, use_ornstein_noise, learning_rate_actor,
        learning_rate_actor_param, title, epsilon_final, clip_grad, beta,
        scale_actions, split, indexed, zero_index_gradients,
        action_input_layer, evaluation_episodes, multipass, weighted,
        average, random_weighted, update_ratio, save_freq, save_dir, layers):
    if save_freq > 0 and save_dir:
        save_dir = os.path.join(save_dir, title + "{}".format(str(seed)))
        os.makedirs(save_dir, exist_ok=True)

    env = make_env(scale_actions)
    log_dir = os.path.join(save_dir, title)
    env = Monitor(env, directory=os.path.join(log_dir, str(seed)),
                  video_callable=False, write_upon_reset=False, force=True)
    # env.seed(seed)  # doesn't work on HFO
    np.random.seed(seed)

    from agents.pdqn_nstep import PDQNNStepAgent
    from agents.pdqn_split_nstep import PDQNNStepSplitAgent
    from agents.pdqn_multipass_nstep import MultiPassPDQNNStepAgent

    ## Select the agent variant; split and multipass are mutually exclusive
    assert not (split and multipass)
    agent_class = PDQNNStepAgent
    if split:
        agent_class = PDQNNStepSplitAgent
    elif multipass:
        agent_class = MultiPassPDQNNStepAgent
    assert action_input_layer >= 0
    if action_input_layer > 0:
        assert split

    agent = agent_class(
        env.observation_space, env.action_space,
        actor_kwargs={
            'hidden_layers': layers,
            'action_input_layer': action_input_layer,
            'activation': 'leaky_relu',
            'output_layer_init_std': 0.01,
        },
        actor_param_kwargs={
            'hidden_layers': layers,
            'activation': 'leaky_relu',
            'output_layer_init_std': 0.01,
        },
        batch_size=batch_size,
        learning_rate_actor=learning_rate_actor,              # 0.0001
        learning_rate_actor_param=learning_rate_actor_param,  # 0.001
        epsilon_steps=epsilon_steps,
        epsilon_final=epsilon_final,
        gamma=gamma,  # 0.99
        tau_actor=tau_actor,
        tau_actor_param=tau_actor_param,
        clip_grad=clip_grad,
        beta=beta,
        indexed=indexed,
        weighted=weighted,
        average=average,
        random_weighted=random_weighted,
        initial_memory_threshold=initial_memory_threshold,
        use_ornstein_noise=use_ornstein_noise,
        replay_memory_size=replay_memory_size,
        inverting_gradients=inverting_gradients,
        zero_index_gradients=zero_index_gradients,
        seed=seed)
    print(agent)

    network_trainable_parameters = sum(
        p.numel() for p in agent.actor.parameters() if p.requires_grad)
    network_trainable_parameters += sum(
        p.numel() for p in agent.actor_param.parameters() if p.requires_grad)
    print("Total Trainable Network Parameters: %d" % network_trainable_parameters)

    max_steps = 15000
    total_reward = 0.
    returns = []
    timesteps = []
    goals = []
    start_time_train = time.time()

    for i in range(episodes):
        if save_freq > 0 and save_dir and i % save_freq == 0:
            agent.save_models(os.path.join(save_dir, str(i)))
        info = {'status': "NOT_SET"}
        state = env.reset()
        state = np.array(state, dtype=np.float32, copy=False)

        act, act_param, all_action_parameters = agent.act(state)
        action = pad_action(act, act_param)

        episode_reward = 0.
        agent.start_episode()
        transitions = []
        for j in range(max_steps):
            next_state, reward, terminal, info = env.step(action)
            next_state = np.array(next_state, dtype=np.float32, copy=False)

            # status = info['status']
            # if status != 'IN_GAME':
            #     print(status)

            next_act, next_act_param, next_all_action_parameters = agent.act(next_state)
            next_action = pad_action(next_act, next_act_param)
            ## Buffer the transition; n-step returns are computed after the episode
            transitions.append([
                state,
                np.concatenate(([act], all_action_parameters.data)).ravel(),
                reward,
                next_state,
                np.concatenate(([next_act], next_all_action_parameters.data)).ravel(),
                terminal])

            act, act_param, all_action_parameters = \
                next_act, next_act_param, next_all_action_parameters
            action = next_action
            state = next_state

            episode_reward += reward
            # env.render()

            if terminal:
                break
        agent.end_episode()

        ## Calculate n-step returns and push the whole episode into replay memory
        n_step_returns = compute_n_step_returns(transitions, gamma)
        for t, nsr in zip(transitions, n_step_returns):
            agent.replay_memory.append(state=t[0], action=t[1], reward=t[2],
                                       next_state=t[3], next_action=t[4],
                                       terminal=t[5], time_steps=None,
                                       n_step_return=nsr)

        ## Number of gradient updates proportional to episode length
        n_updates = int(update_ratio * j)
        for _ in range(n_updates):
            agent._optimize_td_loss()

        returns.append(episode_reward)
        timesteps.append(j)
        goals.append(info['status'] == 'GOAL')

        total_reward += episode_reward
        if i % 100 == 0:
            print('{0:5s} R:{1:.4f} r100:{2:.4f}'.format(
                str(i + 1), total_reward / (i + 1),
                np.array(returns[-100:]).mean()))
    end_time_train = time.time()

    if save_freq > 0 and save_dir:
        agent.save_models(os.path.join(save_dir, str(i)))

    returns = env.get_episode_rewards()
    np.save(os.path.join(log_dir, title + "{}".format(str(seed))),
            np.column_stack((returns, timesteps, goals)))

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        ## Disable exploration for evaluation
        agent.epsilon_final = 0.
        agent.epsilon = 0.
        agent.noise = None
        agent.actor.eval()
        agent.actor_param.eval()
        start_time_eval = time.time()
        evaluation_results = evaluate(env, agent, evaluation_episodes)  # returns, timesteps, goals
        end_time_eval = time.time()
        print("Ave. evaluation return =",
              sum(evaluation_results[:, 0]) / evaluation_results.shape[0])
        print("Ave. timesteps =",
              sum(evaluation_results[:, 1]) / evaluation_results.shape[0])
        goal_timesteps = evaluation_results[:, 1][evaluation_results[:, 2] == 1]
        if len(goal_timesteps) > 0:
            ## Average over episodes that ended in a goal, not all episodes
            print("Ave. timesteps per goal =",
                  sum(goal_timesteps) / len(goal_timesteps))
        else:
            print("Ave. timesteps per goal = N/A (no goals scored)")
        print("Ave. goal prob. =",
              sum(evaluation_results[:, 2]) / evaluation_results.shape[0])
        np.save(os.path.join(log_dir, title + "{}e".format(str(seed))),
                evaluation_results)
        print("Evaluation time: %.2f seconds" % (end_time_eval - start_time_eval))
    print("Training time: %.2f seconds" % (end_time_train - start_time_train))
    print(agent)
    env.close()
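## compute_n_step_returns and pad_action are used in run() but not defined in
## this section. Below is a hypothetical sketch of the former, assuming each
## transition is [state, action, reward, next_state, next_action, terminal]
## and that the "n-step return" is the gamma-discounted return-to-go within
## the episode, accumulated backwards from the final step:


def compute_n_step_returns(transitions, gamma):
    returns = np.zeros(len(transitions))
    g = 0.
    for t in reversed(range(len(transitions))):
        ## G_t = r_t + gamma * G_{t+1}, starting from the episode's last step
        g = transitions[t][2] + gamma * g
        returns[t] = g
    return returns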