def train(env, hparams):
    """Train one ``Agent`` per environment agent, then save model and scores.

    Args:
        env: Environment object exposing ``brain_names``, ``brains``,
            ``reset(train_mode=...)`` and ``step(actions)``; the latter two
            are indexed by brain name to obtain the per-brain info object.
        hparams: Mapping of hyper-parameters. Keys read here are ``'seed'``,
            ``'scores'`` (itself holding ``'expectation'``, ``'window_size'``
            and ``'check_solved'``), ``'output'``, ``'epoch'`` and
            ``'t_max'``. The whole mapping is also handed to
            ``Agent.set_hparams``.

    Returns:
        None. Results are written through ``Agent.save`` and
        ``scores.FlushLog`` using the prefix ``result/<hparams['output']>``.
    """
    # Seed every RNG in use so that runs are repeatable
    # (https://pytorch.org/docs/stable/notes/randomness.html).
    random_seed = hparams['seed']
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)

    # Get the default brain.
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    scores_hparams = hparams['scores']
    scores = Scores(
        scores_hparams['expectation'],
        size=scores_hparams['window_size'],
        check_solved=scores_hparams['check_solved'])

    # Initial reset, used only to discover the problem dimensions.
    env_info = env.reset(train_mode=True)[brain_name]
    num_agents = len(env_info.agents)
    action_size = brain.vector_action_space_size
    state_size = env_info.vector_observations.shape[1]

    Agent.set_hparams(state_size, action_size, hparams)
    agents = [Agent(action_size) for _ in range(num_agents)]

    prefix = f'result/{hparams["output"]}'

    for _epoch in range(hparams['epoch']):
        env_info = env.reset(train_mode=True)[brain_name]
        num_agents = len(env_info.agents)
        for agent in agents:
            agent.reset()
        states = env_info.vector_observations
        # Accumulated reward of the current epoch, one entry per agent.
        epoch_score = np.zeros(num_agents)

        for t in range(1, hparams['t_max'] + 1):
            actions = np.array(
                [agent.act(state) for agent, state in zip(agents, states)])
            # Send all actions to the environment.
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            for agent, state, action, reward, next_state, done in zip(
                    agents, states, actions, rewards, next_states, dones):
                agent.step(t, state, action, reward, next_state, done)

            states = next_states
            epoch_score += rewards

            if t % 20 == 0:
                # '\r' with end='' rewrites the same console line in place.
                print('\rTimestep {}\tScore: {:.2f}\tmin: {:.2f}\tmax: {:.2f}'
                      .format(t, np.mean(epoch_score), np.min(epoch_score),
                              np.max(epoch_score)), end='')
            # The epoch ends as soon as any agent reports completion.
            if np.any(dones):
                break

        print('')  # terminate the in-place progress line
        # NOTE(review): identity check kept as in the original; it only fires
        # if AddScore returns the builtin True (not e.g. a numpy bool) - confirm.
        if scores.AddScore(np.mean(epoch_score)) is True:
            break

    Agent.save(prefix)
    scores.FlushLog(prefix, False)
# Single-agent episode driver: runs `args.episodes` episodes, feeding every
# transition to the agent and saving it after each episode.
b_agent = Agent(args.model_name, state_size, action_size)
try:
    b_agent.load()  # try to load to continue training
except Exception as err:
    # Best effort: a failed load must not abort the run, but it should be
    # visible in the log instead of being silently swallowed.
    log.info("could not load saved agent ({}); starting fresh".format(err))

for epx in range(1, args.episodes + 1):
    at_step = 0
    env_info = env.reset(train_mode=False)[brain_name]
    # Start each episode from the observation returned by the reset rather
    # than from whatever `state` held at the end of the previous episode.
    state = env_info.vector_observations[0]
    b_agent.reset_episode()
    while True:
        action = b_agent.act(state)
        env_info = env.step(action)[brain_name]
        at_step += 1
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        if at_step % 100 == 0:
            log.info("ep:{} step:{} r:{} l:{}".format(
                epx, at_step, b_agent.cum_rewards(), b_agent.ave_loss()))
        # Hand over the transition before leaving the loop so the final
        # (done=True) step and its reward are not dropped.
        b_agent.sense(state, action, reward, next_state, done)
        if done:
            break
        state = next_state
    print("{},{}".format(epx, b_agent.cum_rewards()))
    b_agent.save()

log.info("finished.")
# Build the DDPG agent from the module-level hyper-parameters.
agent = Agent(
    state_space,
    HIDDEN_SIZE,
    action_dim,
    1,
    seed=SEED,
    buffer_size=MEMORY_BUFFER_SIZE,
    actor_lr=ACTOR_LR,
    actor_hidden_sizes=ACTOR_HIDDEN_UNITS,
    actor_weight_decay=ACTOR_WEIGHT_DECAY,
    critic_lr=CRITIC_LR,
    critic_hidden_sizes=CRITIC_HIDDEN_UNITS,
    critic_weight_decay=CRITIC_WEIGHT_DECAY,
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    tau=TAU,
)
print(agent)
agent.load()

# Outside of training, epsilon starts directly at its final value.
if is_training:
    initial_eps = EPS_START
else:
    initial_eps = EPS_END

scores, actor_losses, critic_losses = run_ddpg(
    n_episodes=N_EPISODES,
    is_training=is_training,
    eps_start=initial_eps,
    eps_decay=EPS_DECAY,
    eps_end=EPS_END,
    max_t=MAX_STEPS,
    learn_every_step=LEARN_EVERY_STEP,
)

# Only persist the agent after a training run.
if is_training:
    agent.save()

# Three stacked panels: score, actor loss and critic loss per episode.
fig = plt.figure()
ax1 = fig.add_subplot(311)
ax2 = fig.add_subplot(312)
ax3 = fig.add_subplot(313)

# Episodes are numbered from 1 on the x axis.
score_episodes = np.arange(1, len(scores) + 1)
ax1.plot(score_episodes, scores)
ax1.set_ylabel('Score')
ax1.set_xlabel('Episode #')

actor_episodes = np.arange(1, len(actor_losses) + 1)
ax2.plot(actor_episodes, actor_losses)
ax2.set_ylabel('Actor Loss')
ax2.set_xlabel('Episode #')

critic_episodes = np.arange(1, len(critic_losses) + 1)
ax3.plot(critic_episodes, critic_losses)