def testAgent():
    print("Testing the Agent")
    agent = DDPGAgent(state_size=state_size, action_size=action_size,
                      n_agents=n_agents, seed=48, train=False)
    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    states = env_info.vector_observations               # get the current state
    score = np.zeros(n_agents)                          # initialize the score
    while True:
        actions = agent.act(states)                     # select an action
        env_info = env.step(actions)[brain_name]        # send the action to the environment
        next_states = env_info.vector_observations      # get the next state
        rewards = env_info.rewards                      # get the reward
        dones = env_info.local_done                     # see if episode has finished
        score += np.array(rewards)                      # update the score
        states = next_states                            # roll over the state to next time step
        if np.any(dones):                               # exit loop if episode finished
            break
    print("Score: {}".format(np.mean(score)))
    return score
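# testAgent() relies on module-level names (env, brain_name, state_size, action_size,
# n_agents) that are not defined in this file. A minimal sketch of that setup is shown
# below, assuming the unityagents API used elsewhere in this repo; the build path is
# borrowed from main_single_agent() and may need to be adjusted for this script.
import numpy as np
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64")  # path is an assumption
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=True)[brain_name]
n_agents = len(env_info.agents)                      # number of parallel agents
action_size = brain.vector_action_space_size         # size of each action
state_size = env_info.vector_observations.shape[1]   # size of each observation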
                  mem_size=50000, n_actions=1, batch_size=64)

train_score_history = []
avg_train_score_history = []
test_score_history = []
avg_test_score_history = []

for i in range(5000):
    obs = env.reset()
    done = False
    train_score = 0
    while not done:
        act = agent.act(obs)
        new_state, reward, done, _ = env.step(act)
        agent.record(obs, act, reward, new_state, done)
        agent.learn()
        train_score += reward
        obs = new_state
    train_score_history.append(train_score)
    avg_train_score_history.append(np.mean(train_score_history[-100:]))
    print('episode %s score %d last 100 games avg reward %.2f'
          % (i, train_score, float(avg_train_score_history[-1])))

    # testing
    if i % 10 == 0:
        test_score_list = []
        for j in range(3):
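# The loop above drives an agent through agent.record()/agent.learn(), whose internals
# are not shown here. Below is a minimal replay-buffer sketch of the kind such an agent
# typically wraps; the class name and method signatures are illustrative assumptions,
# not this repository's actual implementation.
import random
from collections import deque

import numpy as np


class SimpleReplayBuffer:
    """Fixed-size buffer storing (state, action, reward, next_state, done) tuples."""

    def __init__(self, mem_size=50000, batch_size=64):
        self.memory = deque(maxlen=mem_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        # called once per environment step, e.g. from agent.record()
        self.memory.append((state, action, reward, next_state, done))

    def sample(self):
        # uniform random minibatch, e.g. consumed by agent.learn()
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)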
def main_single_agent():
    env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64", worker_id=1, seed=1)
    env_date = str(datetime.datetime.now())
    file_path = os.path.join('data_single', env_date)
    os.makedirs(file_path, exist_ok=True)
    save_config(file_path)

    brain_name = env.brain_names[0]
    buffer = ReplayBuffer(Config.buffer_size)
    agent = DDPGAgent(in_actor=48,
                      hidden_in_actor=Config.actor_hidden[0],
                      hidden_out_actor=Config.actor_hidden[1],
                      out_actor=2,
                      in_critic=50,
                      hidden_in_critic=Config.critic_hidden[0],
                      hidden_out_critic=Config.critic_hidden[1],
                      lr_actor=Config.actor_lr,
                      lr_critic=Config.critic_lr,
                      noise_dist=Config.noise_distribution,
                      checkpoint_path=Config.checkpoint_path)

    agent_reward, all_rewards_mean = [], []
    batchsize = Config.batchsize
    max_reward = Config.max_reward
    # amplitude of OU noise; this slowly decreases to 0
    noise = Config.noise_beginning
    logger = logging.getLogger('Tennis MADDPG')
    all_rewards = []

    for episode in range(Config.n_episodes):
        reward_this_episode = 0
        env_info = env.reset(train_mode=True)[brain_name]
        states = torch.from_numpy(
            np.concatenate(env_info.vector_observations))  # get the current state (for each agent)
        scores = np.zeros(2)  # initialize the score (for each agent)
        n_of_steps = 0
        # linearly decay the exploration noise towards Config.min_noise
        noise = max(Config.min_noise,
                    Config.noise_beginning * (1 - episode / Config.n_episodes))
        while True:
            n_of_steps += 1
            states_tensor = torch.tensor(states).float()
            actions = agent.act(states_tensor, noise=noise)
            actions_array = actions.detach().numpy()
            actions_for_env = np.clip(actions_array, -1, 1)  # all actions between -1 and 1
            env_info = env.step(np.array([
                actions_for_env, actions_for_env
            ]))[brain_name]  # send all actions to the environment
            states_next = torch.from_numpy(
                np.concatenate(env_info.vector_observations))
            reward = np.sum(np.array(env_info.rewards))
            reward_this_episode += reward
            # if replay_buffer_raward_min is defined, add to the replay buffer only
            # observations from episodes whose reward is above that minimum
            if (not Config.replay_buffer_raward_min
                    or reward_this_episode >= Config.replay_buffer_raward_min):
                buffer_data = (states, torch.from_numpy(actions_for_env), reward,
                               states_next, env_info.local_done[0])
                buffer.push(buffer_data)
            dones = env_info.local_done  # see if episode finished
            scores += np.sum(env_info.rewards)  # update the score (for each agent)
            states = states_next  # roll over states to next time step
            if np.any(dones):  # exit loop if episode finished
                break

        all_rewards.append(reward_this_episode)
        all_rewards_mean.append(np.mean(all_rewards[-100:]))

        if len(buffer) > Config.warmup:
            agent.update(buffer, batchsize=batchsize, tau=Config.tau,
                         discount=Config.discount_factor)
            if episode % Config.update_episode_n == 0:
                agent.update_targets(tau=Config.tau)

        if (episode + 1) % 100 == 0 or episode == Config.n_episodes - 1:
            logger.info(
                f'Episode {episode}: Average reward over 100 episodes is {all_rewards_mean[-1]}')

        if all_rewards_mean and all_rewards_mean[-1] > max_reward:
            logger.info('Found best model. Saving model into file: ...')
            save_dict_list = []
            save_dict = {
                'actor_params': agent.actor.state_dict(),
                'actor_optim_params': agent.actor_optimizer.state_dict(),
                'critic_params': agent.critic.state_dict(),
                'critic_optim_params': agent.critic_optimizer.state_dict()
            }
            save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(file_path, 'episode-{}.pt'.format(episode)))
            max_reward = all_rewards_mean[-1]

    plt.plot(all_rewards_mean)
    plt.xlabel('N of episodes')
    plt.ylabel('Reward')
    plt.title('Final rewards of single agent for tennis collaboration task')
    plt.savefig(os.path.join(file_path, 'result_plot.png'))

    save_dict = {
        'actor_params': agent.actor.state_dict(),
        'actor_target_params': agent.target_actor.state_dict(),
        'actor_optim_params': agent.actor_optimizer.state_dict(),
        'critic_params': agent.critic.state_dict(),
        'critic_target_params': agent.target_critic.state_dict(),
        'critic_optim_params': agent.critic_optimizer.state_dict()
    }
    torch.save(save_dict,
               os.path.join(file_path, 'episode-{}.pt'.format(episode)))
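# agent.update_targets(tau=Config.tau) is called in the training loop above but is not
# defined in this file. DDPG implementations commonly realise it as a Polyak (soft)
# update of the target networks; the helper below is a sketch under that assumption,
# not necessarily the exact method of this DDPGAgent.
def soft_update(target_net, source_net, tau):
    """Blend source parameters into target parameters: theta_t <- tau*theta_s + (1-tau)*theta_t."""
    for target_param, source_param in zip(target_net.parameters(),
                                          source_net.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)

# Hypothetical usage inside update_targets():
#   soft_update(self.target_actor, self.actor, tau)
#   soft_update(self.target_critic, self.critic, tau)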
                      in_critic=50,
                      hidden_in_critic=Config.critic_hidden[0],
                      hidden_out_critic=Config.critic_hidden[1],
                      lr_actor=Config.actor_lr,
                      lr_critic=Config.critic_lr,
                      noise_dist=Config.noise_distribution,
                      checkpoint_path=Config.checkpoint_path)

    for episode in range(args.n_episodes):
        env_info = env.reset(train_mode=False)[brain_name]
        states = torch.from_numpy(
            np.concatenate(env_info.vector_observations))  # get the current state (for each agent)
        scores = np.zeros(2)  # initialize the score (for each agent)
        while True:
            states_tensor = torch.tensor(states).float()
            actions = agent.act(states_tensor, noise=0)
            actions_array = actions.detach().numpy()
            actions_for_env = np.clip(actions_array, -1, 1)  # all actions between -1 and 1
            env_info = env.step(np.array([
                actions_for_env, actions_for_env
            ]))[brain_name]  # send all actions to the environment
            states_next = torch.from_numpy(
                np.concatenate(env_info.vector_observations))
            reward = np.sum(np.array(env_info.rewards))
            dones = env_info.local_done  # see if episode finished
            scores += np.sum(env_info.rewards)  # update the score (for each agent)
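# The evaluation loop above runs the agent with noise=0, but how the trained weights
# are restored is not shown (the constructor only receives checkpoint_path=Config.checkpoint_path).
# A plausible loading step is sketched below, assuming the dict-style checkpoint layout
# written at the end of main_single_agent(); the best-model checkpoints are saved as a
# list of dicts, in which case an index such as checkpoint[0] would be needed instead.
import torch

checkpoint = torch.load(Config.checkpoint_path, map_location='cpu')
agent.actor.load_state_dict(checkpoint['actor_params'])
agent.critic.load_state_dict(checkpoint['critic_params'])
agent.actor.eval()
agent.critic.eval()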
def main(args):
    env = gym.make(args.env)
    outdir = '/tmp/ddpg'
    env = wrappers.Monitor(env, outdir, force=True)

    assert (env.action_space.high == -env.action_space.low).all(), \
        'action_space bound should be symmetric'
    assert (env.action_space.high == env.action_space.high[0]).all(), \
        'all action dims should have the same bound'

    agent = DDPGAgent(env.observation_space.shape[0],
                      env.action_space.shape[0],
                      float(env.action_space.high[0]))
    optimizer = DDPGOptimizer(agent, args.capacity, args.batch_size, args.gamma,
                              args.tau, args.init_lr, args.weight_decay,
                              args.crayon_vis)

    for episode in range(args.num_episode):
        agent.ou_noise.reset()
        state = env.reset().astype(NUMPY_PRECISION)
        running_loss = 0.
        training_total_reward = 0.
        for step in count():
            action = agent.noisy_act(state)
            next_state, reward, done, _ = env.step(action)
            # env.render()
            state, action, reward, next_state = map(
                lambda x: NUMPY_PRECISION(x),
                (state, action, reward, next_state))
            if done:
                next_state = None
            optimizer.memory.push_back(SARS(state, action, reward, next_state))
            if optimizer.memory.trainable:
                loss = optimizer.step()
                running_loss += loss
            state = next_state
            training_total_reward += reward
            if done:
                if args.crayon_vis:
                    optimizer.stats.add_scalar_value('average loss',
                                                     running_loss / step)
                    optimizer.stats.add_scalar_value('step', step)
                    optimizer.stats.add_scalar_value('training total reward',
                                                     training_total_reward)
                break

        if episode % 100 == 99:
            total_reward = 0.
            for _ in range(args.num_test):
                # agent.ou_noise.reset()
                state = env.reset().astype(NUMPY_PRECISION)
                for step in count():
                    # action = agent.noisy_act(state)
                    action = agent.act(state)
                    # print(action)
                    next_state, reward, done, _ = env.step(action)
                    env.render()
                    state, action, reward, next_state = map(
                        lambda x: NUMPY_PRECISION(x),
                        (state, action, reward, next_state))
                    state = next_state
                    total_reward += reward
                    if done:
                        break
            print('[Eval] {}th episode, total reward: {}, average reward: {}'.format(
                episode, total_reward, total_reward / args.num_test))

    env.close()
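# agent.ou_noise and agent.noisy_act() are used above but not defined in this snippet.
# DDPG commonly perturbs actions with an Ornstein-Uhlenbeck process for temporally
# correlated exploration; the class below is a sketch of that idea, with parameter
# values chosen as assumptions rather than taken from this repository.
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: mean-reverting, temporally correlated noise."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # restart the process at its mean at the beginning of each episode
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta*(mu - x) + sigma*N(0, 1); drift towards mu plus Gaussian kick
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state

# Hypothetical usage inside noisy_act():
#   action = self.act(state) + self.ou_noise.sample()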