import numpy as np
import torch
import gym
from collections import deque

# Policy, PPO, Storage, test_env, device and the hyperparameters (env_name, seed, n_eps,
# n_steps, clip_param, ppo_epoch, mini_batch_size, value_loss_coef, entropy_coef,
# learning_rate, max_grad_norm) are defined elsewhere in the project.


def main():
    torch.set_num_threads(1)
    torch.manual_seed(0)
    env = gym.make(env_name)
    env.seed(seed)
    print('New model')
    policy = Policy('actor_critic', env.observation_space.shape[0], env.action_space.n)
    policy.to(device)
    optimizer = PPO(policy, clip_param, ppo_epoch, mini_batch_size, value_loss_coef,
                    entropy_coef, learning_rate, max_grad_norm)

    episode_rewards = deque(maxlen=50)
    for eps in range(0, n_eps + 1):
        state = env.reset()
        storage = Storage(device=device)

        # Evaluate the current policy on a fresh environment and report a running average.
        policy.eval()
        episode_rewards.append(test_env(policy, gym.make(env_name)))
        if eps % 5 == 0:
            print('Avg reward', np.mean(episode_rewards))

        # Collect a rollout of n_steps transitions with the current policy.
        for step in range(n_steps):
            state = torch.FloatTensor(state).to(device)
            with torch.no_grad():
                value, action, log_prob = policy.act(state)
            next_state, reward, done, _ = env.step(action.item())
            storage.push(state, action, log_prob, value, reward, done)
            state = next_state
            if done:
                state = env.reset()

        # Bootstrap from the value of the last observed state and compute returns.
        next_state = torch.FloatTensor(next_state).to(device)
        with torch.no_grad():
            next_value = policy.get_value(next_state).detach()
        storage.compute(next_value)

        # Run the PPO update on the collected rollout and log the losses.
        policy.train()
        value_loss, action_loss, dist_entropy = optimizer.update(storage)
        with open('metrics.csv', 'a') as metrics:
            metrics.write('{},{},{}\n'.format(value_loss, action_loss, dist_entropy))
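# The loop above depends on a Storage rollout buffer whose implementation is not shown in
# this excerpt. The class below is only an illustrative sketch of what push()/compute()
# might do, assuming plain discounted returns with a hypothetical `gamma` argument; the
# actual class may instead compute GAE advantages and batch tensors for the PPO update.
class Storage:
    def __init__(self, device, gamma=0.99):
        self.device = device
        self.gamma = gamma
        self.states, self.actions, self.log_probs = [], [], []
        self.values, self.rewards, self.dones = [], [], []
        self.returns = []

    def push(self, state, action, log_prob, value, reward, done):
        # Store one transition from the rollout.
        self.states.append(state)
        self.actions.append(action)
        self.log_probs.append(log_prob)
        self.values.append(value)
        self.rewards.append(reward)
        self.dones.append(done)

    def compute(self, next_value):
        # Walk the rollout backwards, bootstrapping from next_value and resetting the
        # return at episode boundaries (done == True).
        R = next_value
        self.returns = [None] * len(self.rewards)
        for t in reversed(range(len(self.rewards))):
            R = self.rewards[t] + self.gamma * R * (1.0 - float(self.dones[t]))
            self.returns[t] = R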
import argparse
import logging
from typing import Dict, Tuple

# ParallelAgentsWrapper and Policy are imported from the project's own modules.


def play_full_episode(agents: ParallelAgentsWrapper, policy: Policy, step: int, params: argparse.Namespace,
                      is_train: bool) \
        -> Tuple[ParallelAgentsWrapper, int, bool, bool, float, int, Dict[str, float]]:
    eval_required = False
    checkpoint_reached = False
    epoch_reward = 0
    # Restart all the agents.
    rewards, terminals, states, terminals_due_to_timeout, success = agents.perform_actions(
        ['new game' for _ in range(params.number_of_agents)], is_train)
    log_dict = {}
    start_step = step
    successful_agents = [0 for _ in range(params.number_of_agents)]
    while not all([t or t is None for t in terminals]):  # Loop ends only when all agents have terminated.
        action = policy.get_action(states, is_train)
        rewards, terminals, states, terminals_due_to_timeout, success = agents.perform_actions(action, is_train)

        # rewards is a list. Passing it to update_observation changes its values, hence all references should be
        # performed prior to calling update_observation.
        for idx, reward in enumerate(rewards):
            if reward is not None:
                epoch_reward += reward
                if success[idx]:
                    successful_agents[idx] = 1

        logging.debug('step: %s, reward: %s, terminal: %s, terminal_due_to_timeout: %s, success: %s',
                      step, rewards, terminals, terminals_due_to_timeout, success)
        policy.update_observation(rewards, terminals, terminals_due_to_timeout, success, is_train)
        if is_train:
            single_log_dict = policy.train(states)
        else:
            single_log_dict = {}

        step += 1
        if step % params.eval_frequency == 0:
            eval_required = True
        if step % params.checkpoint_interval == 0:
            checkpoint_reached = True

        # Accumulate per-step training statistics.
        for item in single_log_dict:
            if item in log_dict:
                log_dict[item] = log_dict[item] + single_log_dict[item]
            else:
                log_dict[item] = single_log_dict[item]

    # Average the accumulated statistics over the number of steps taken this episode.
    for item in log_dict:
        log_dict[item] = log_dict[item] * 1.0 / (step - start_step)

    return agents, step, eval_required, checkpoint_reached, epoch_reward, sum(successful_agents), log_dict
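# A possible driver for play_full_episode (not part of the original code): `run_training`,
# `params.max_steps` and the checkpoint handling below are assumptions used purely for
# illustration of how the returned flags might be consumed.
def run_training(agents, policy, params):
    step = 0
    while step < params.max_steps:
        agents, step, eval_required, checkpoint_reached, epoch_reward, n_success, log_dict = \
            play_full_episode(agents, policy, step, params, is_train=True)
        logging.info('step %d | epoch reward %.2f | successful agents %d', step, epoch_reward, n_success)
        if eval_required:
            # Run one episode without training updates to measure current performance.
            agents, step, _, _, eval_reward, _, _ = play_full_episode(agents, policy, step, params, is_train=False)
            logging.info('eval reward %.2f', eval_reward)
        if checkpoint_reached:
            pass  # persist the policy here (project-specific)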