import numpy as np
from tqdm import tqdm

def learn(self):
    agent = DDPGAgent(
        env=self.env,
        replay_memory_size=REPLAY_MEMORY_SIZE,
        learning_rate=LEARNING_RATE,
        batch_size=MINIBATCH_SIZE,
        gamma=GAMMA,
        tau=TAU
    )
    stats = {'scores': [], 'avg': [], 'min': [], 'max': []}

    for ep in tqdm(range(1, self.episodes + 1), ascii=True, unit='episodes'):
        print(self.epsilon)
        action_stats = [0, 0]  # [policy actions, random actions]
        current_state = self.env.reset()
        current_state = self.convert_gray(current_state)
        done = False
        score = 0
        steps = 0

        while not done:
            steps += 1
            # Epsilon-greedy exploration: act from the policy with
            # probability (1 - epsilon), otherwise sample a random action.
            if np.random.random() > self.epsilon:
                action_stats[0] += 1
                action = agent.get_action(current_state)
            else:
                action_stats[1] += 1
                action = self.env.action_space.sample()
                action[2] = min(action[2], 0.2)  # cap the brake
                action[1] = action[1] * 2        # boost the throttle

            new_state, reward, done, _ = self.env.step(action)
            if ep % self.results_every_n_episodes == 0:
                self.env.render()

            score += reward
            new_state = self.convert_gray(new_state)
            agent.memory.push(current_state, action, reward, new_state)

            # Optimize the agent every 64 environment steps.
            if steps % 64 == 0:
                agent.update()

            current_state = new_state
            if self.epsilon > 0.1:
                self.epsilon -= self.epsilon_decay_value
            # Abort episodes that have drifted into negative total reward.
            if score < 0:
                break

        print(action_stats)
        print(score)
        stats['scores'].append(score)

    self.env.close()
    return agent.actor
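
# self.convert_gray() is not defined in this snippet. A minimal sketch of
# such a preprocessing step, assuming the observations are RGB frames of
# shape (H, W, 3) from a Gym environment (hypothetical helper, not the
# original code):
import numpy as np

def convert_gray(self, state):
    # Collapse RGB to one grayscale channel using the standard luminance
    # weights, then rescale pixels to [0, 1] for the network input.
    gray = np.dot(state[..., :3], [0.299, 0.587, 0.114])
    return (gray / 255.0).astype(np.float32)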
## ... inside the per-step loop of one episode, after env.step(action):
env.render()
episode_reward += reward

## store transition - Standard Experience Replay;
## the goal is appended to the observation to form the network input
state_rep = np.concatenate((state, goal), axis=0)
next_state_rep = np.concatenate((next_state["observation"], goal), axis=0)
standard_transition = [state_rep, action, reward, next_state_rep]
# agent.memory.store(standard_transition)
standard_replay.append(standard_transition)
state = next_state["observation"]

if len(agent.memory) > BATCH_SIZE:
    ## perform one-step optimization on a sampled batch
    agent.update(batch_size=BATCH_SIZE)

## the episode is now over:
## create normalized HER transitions using the chosen strategy
her_replay = strategy.get_her_transitions(standard_replay)

## normalize the standard transitions as well; the last three entries of
## each state representation are the goal
normalized_stnd_replay = []
for transition in standard_replay:
    normalized_state = normalizer(transition[0][:-3], 5.0)
    normalized_goal = normalizer(transition[0][-3:], 5.0)
    normalized_next_state = normalizer(transition[3][:-3], 5.0)
    normalized_action = normalizer(transition[1], 5.0)
    ## reassemble in the same [state, action, reward, next_state] layout
    ## as standard_transition above
    normalized_stnd_replay.append([
        np.concatenate((normalized_state, normalized_goal), axis=0),
        normalized_action,
        transition[2],
        np.concatenate((normalized_next_state, normalized_goal), axis=0)
    ])
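
## The normalizer() helper used above is not shown in this snippet. A
## minimal sketch, assuming it clips each component to a fixed symmetric
## range and rescales to [-1, 1] (hypothetical implementation, not the
## original one):
import numpy as np

def normalizer(values, clip_range):
    # Clip each component to [-clip_range, clip_range], then rescale so
    # every network input shares the same [-1, 1] scale.
    values = np.clip(np.asarray(values, dtype=np.float32),
                     -clip_range, clip_range)
    return values / clip_range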
import datetime
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import torch
from unityagents import UnityEnvironment
# project-local: Config, DDPGAgent, ReplayBuffer, save_config

def main_single_agent():
    env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64",
                           worker_id=1, seed=1)
    env_date = str(datetime.datetime.now())
    file_path = os.path.join('data_single', env_date)
    os.makedirs(file_path, exist_ok=True)
    save_config(file_path)

    brain_name = env.brain_names[0]
    buffer = ReplayBuffer(Config.buffer_size)
    agent = DDPGAgent(in_actor=48,
                      hidden_in_actor=Config.actor_hidden[0],
                      hidden_out_actor=Config.actor_hidden[1],
                      out_actor=2,
                      in_critic=50,
                      hidden_in_critic=Config.critic_hidden[0],
                      hidden_out_critic=Config.critic_hidden[1],
                      lr_actor=Config.actor_lr,
                      lr_critic=Config.critic_lr,
                      noise_dist=Config.noise_distribution,
                      checkpoint_path=Config.checkpoint_path)

    agent_reward, all_rewards_mean = [], []
    batchsize = Config.batchsize
    max_reward = Config.max_reward

    # Amplitude of the OU exploration noise; decayed towards zero below.
    noise = Config.noise_beginning
    logger = logging.getLogger('Tennis MADDPG')
    all_rewards = []

    for episode in range(Config.n_episodes):
        reward_this_episode = 0
        env_info = env.reset(train_mode=True)[brain_name]
        # Concatenate both agents' observations into one state vector.
        states = torch.from_numpy(np.concatenate(env_info.vector_observations))
        scores = np.zeros(2)  # initialize the score (for each agent)
        n_of_steps = 0
        # Linearly decay the noise amplitude over training, never below min_noise.
        noise = max(Config.min_noise,
                    Config.noise_beginning * (1 - episode / Config.n_episodes))

        while True:
            n_of_steps += 1
            states_tensor = states.float()
            actions = agent.act(states_tensor, noise=noise)
            actions_array = actions.detach().numpy()
            actions_for_env = np.clip(actions_array, -1, 1)  # all actions between -1 and 1
            # The single agent controls both rackets, so the same action is
            # sent to the environment for each of the two players.
            env_info = env.step(np.array([actions_for_env,
                                          actions_for_env]))[brain_name]
            states_next = torch.from_numpy(
                np.concatenate(env_info.vector_observations))

            reward = np.sum(np.array(env_info.rewards))
            reward_this_episode += reward

            # If replay_buffer_reward_min is defined, only store transitions
            # once the episode's cumulative reward has reached that minimum.
            buffer_data = (states, torch.from_numpy(actions_for_env), reward,
                           states_next, env_info.local_done[0])
            if Config.replay_buffer_reward_min:
                if reward_this_episode >= Config.replay_buffer_reward_min:
                    buffer.push(buffer_data)
            else:
                buffer.push(buffer_data)

            dones = env_info.local_done  # see if the episode finished
            scores += np.sum(env_info.rewards)  # update the score (for each agent)
            states = states_next  # roll over states to the next time step
            if np.any(dones):  # exit loop if the episode finished
                break

        all_rewards.append(reward_this_episode)
        all_rewards_mean.append(np.mean(all_rewards[-100:]))

        if len(buffer) > Config.warmup:
            agent.update(buffer, batchsize=batchsize, tau=Config.tau,
                         discount=Config.discount_factor)
            if episode % Config.update_episode_n == 0:
                agent.update_targets(tau=Config.tau)

        if (episode + 1) % 100 == 0 or episode == Config.n_episodes - 1:
            logger.info(f'Episode {episode}: Average reward over 100 '
                        f'episodes is {all_rewards_mean[-1]}')

        if all_rewards_mean and all_rewards_mean[-1] > max_reward:
            logger.info('Found best model. Saving model into file: ...')
            save_dict = {
                'actor_params': agent.actor.state_dict(),
                'actor_optim_params': agent.actor_optimizer.state_dict(),
                'critic_params': agent.critic.state_dict(),
                'critic_optim_params': agent.critic_optimizer.state_dict()
            }
            # One checkpoint dict per racket; the single agent drives both,
            # so the same dict is stored twice.
            save_dict_list = [save_dict, save_dict]
            torch.save(save_dict_list,
                       os.path.join(file_path, 'episode-{}.pt'.format(episode)))
            max_reward = all_rewards_mean[-1]

    plt.plot(all_rewards_mean)
    plt.xlabel('N of episodes')
    plt.ylabel('Reward')
    plt.title('Final rewards of single agent for tennis collaboration task')
    plt.savefig(os.path.join(file_path, 'result_plot.png'))

    # Final checkpoint, including the target networks.
    save_dict = {
        'actor_params': agent.actor.state_dict(),
        'actor_target_params': agent.target_actor.state_dict(),
        'actor_optim_params': agent.actor_optimizer.state_dict(),
        'critic_params': agent.critic.state_dict(),
        'critic_target_params': agent.target_critic.state_dict(),
        'critic_optim_params': agent.critic_optimizer.state_dict()
    }
    torch.save(save_dict,
               os.path.join(file_path, 'episode-{}.pt'.format(episode)))
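
# agent.update_targets(tau=Config.tau) performs the DDPG soft (Polyak)
# target update. A minimal sketch of that step, assuming the actor/critic
# and their targets are ordinary torch.nn.Module pairs on the agent (a
# sketch of the technique, not the project's actual implementation):
import torch

def update_targets(self, tau):
    # Polyak averaging: target <- tau * online + (1 - tau) * target.
    # A small tau makes the targets track the online networks slowly,
    # which stabilizes the critic's bootstrapped TD targets.
    for online, target in ((self.actor, self.target_actor),
                           (self.critic, self.target_critic)):
        for p, p_t in zip(online.parameters(), target.parameters()):
            p_t.data.copy_(tau * p.data + (1.0 - tau) * p_t.data)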