Example #1
DDPG training loop with epsilon-greedy exploration on an image-based Gym environment; frames are converted to grayscale before being stored in the replay memory.
    def learn(self):
        agent = DDPGAgent(
            env=self.env,
            replay_memory_size=REPLAY_MEMORY_SIZE,
            learning_rate=LEARNING_RATE,
            batch_size=MINIBATCH_SIZE,
            gamma=GAMMA,
            tau=TAU
        )

        stats = {'scores': [], 'avg': [], 'min': [], 'max': []}
        for ep in tqdm(range(1, self.episodes + 1), ascii=True, unit='episodes'):

            print(self.epsilon)
            action_stats = [0, 0]
            current_state = self.env.reset()
            current_state = self.convert_gray(current_state)

            done = False
            score = 0
            steps = 0

            while not done:
                steps += 1

                # epsilon-greedy exploration: act with the learned policy with
                # probability (1 - epsilon), otherwise sample a random action
                if np.random.random() > self.epsilon:
                    action_stats[0] += 1
                    action = agent.get_action(current_state)
                else:
                    action_stats[1] += 1
                    action = self.env.action_space.sample()
                    # tame the random action: cap component 2 and double
                    # component 1 (brake and throttle in a CarRacing-style layout)
                    action[2] = min(action[2], 0.2)
                    action[1] = action[1] * 2

                new_state, reward, done, _ = self.env.step(action)
                if ep % self.results_every_n_episodes == 0:
                    self.env.render()

                score += reward

                new_state = self.convert_gray(new_state)

                # store the transition and run one optimization step every 64 env steps
                agent.memory.push(current_state, action, reward, new_state)

                if steps % 64 == 0:
                    agent.update()

                current_state = new_state

                # decay epsilon per step down to a floor of 0.1
                if self.epsilon > 0.1:
                    self.epsilon -= self.epsilon_decay_value

                # cut the episode short once the cumulative reward turns negative
                if score < 0:
                    break

            print(action_stats)
            print(score)
            stats['scores'].append(score)
        self.env.close()
        return agent.actor
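
Example #1 only returns agent.actor. Below is a minimal evaluation sketch, assuming the returned actor is a PyTorch module that maps a grayscale frame to a continuous action and that convert_gray and the environment match the training setup above; the evaluate helper itself is not part of the original snippet.

import torch


def evaluate(actor, env, convert_gray, episodes=5):
    # Greedy rollouts with the trained actor: no exploration noise, no updates.
    actor.eval()
    for _ in range(episodes):
        state = convert_gray(env.reset())
        done, score = False, 0.0
        while not done:
            with torch.no_grad():
                # add a batch dimension, query the deterministic policy,
                # then strip the batch dimension again
                action = actor(torch.as_tensor(state, dtype=torch.float32)
                               .unsqueeze(0)).squeeze(0).numpy()
            state, reward, done, _ = env.step(action)
            state = convert_gray(state)
            score += reward
        print(f"episode score: {score:.1f}")
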
Example #2
Fragment of a goal-conditioned DDPG loop with Hindsight Experience Replay: standard transitions are collected during the episode, HER transitions are generated afterwards by the relabelling strategy, and states, goals and actions are passed through a normalizer.
			env.render()

			episode_reward += reward

			## store transition - Standard Experience Replay
			state_rep = np.concatenate((state, goal), axis=0)
			next_state_rep = np.concatenate((next_state["observation"], goal), axis=0)
			standard_transition = [state_rep, action, reward, next_state_rep]
			# agent.memory.store(standard_transition)
			standard_replay.append(standard_transition)

			state = next_state["observation"]

			if len(agent.memory) > BATCH_SIZE:
				## perform one-step optimization on a sampled batch
				agent.update(batch_size=BATCH_SIZE)

		## the episode is now over
		## need to create normalized HER transitions using Strategy
		her_replay = strategy.get_her_transitions(standard_replay)

		## normalize standard transitions as well
		normalized_stnd_replay = []
		for transition in standard_replay:
			## the last 3 entries of each stored state are the goal (see the
			## concatenation above), so split observation and goal before normalizing
			normalized_state = normalizer(transition[0][:-3], 5.0)
			normalized_goal = normalizer(transition[0][-3:], 5.0)
			normalized_next_state = normalizer(transition[3][:-3], 5.0)
			normalized_action = normalizer(transition[1], 5.0)

			normalized_stnd_replay.append([(np.concatenate((normalized_state, normalized_goal), axis=0)),
										   normalized_action, 
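
The fragment above is cut off mid-statement, and the normalizer helper it calls is not shown. The following is a hypothetical clip-and-scale normalizer, consistent only with how it is called here (an array of values plus a 5.0 range argument); the real helper in the source repository may differ.

import numpy as np


def normalizer(values, clip_range=5.0):
    # Hypothetical: clip to [-clip_range, clip_range] and scale into [-1, 1].
    values = np.asarray(values, dtype=np.float32)
    return np.clip(values, -clip_range, clip_range) / clip_range
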
Example #3
Single-agent DDPG on the Unity ML-Agents Tennis environment: one policy drives both rackets, rewards are averaged over a 100-episode window, and checkpoints plus a reward plot are written to a timestamped output directory.
import datetime
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import torch
# UnityEnvironment is assumed to come from the "unityagents" package shipped
# with the Udacity DRLND Tennis build (it exposes brain_names / train_mode).
from unityagents import UnityEnvironment

# Project-local helpers (Config, DDPGAgent, ReplayBuffer, save_config) are
# assumed to be importable from the surrounding repository.


def main_single_agent():
    env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64",
                           worker_id=1,
                           seed=1)
    env_date = str(datetime.datetime.now())
    file_path = os.path.join('data_single', env_date)

    os.makedirs(file_path, exist_ok=True)
    save_config(file_path)

    brain_name = env.brain_names[0]

    buffer = ReplayBuffer(Config.buffer_size)
    agent = DDPGAgent(in_actor=48,
                      hidden_in_actor=Config.actor_hidden[0],
                      hidden_out_actor=Config.actor_hidden[1],
                      out_actor=2,
                      in_critic=50,
                      hidden_in_critic=Config.critic_hidden[0],
                      hidden_out_critic=Config.critic_hidden[1],
                      lr_actor=Config.actor_lr,
                      lr_critic=Config.critic_lr,
                      noise_dist=Config.noise_distribution,
                      checkpoint_path=Config.checkpoint_path)

    agent_reward, all_rewards_mean = [], []
    batchsize = Config.batchsize
    max_reward = Config.max_reward
    # amplitude of the OU exploration noise; it decays linearly towards
    # Config.min_noise over the course of training
    noise = Config.noise_beginning

    logger = logging.getLogger('Tennis MADDPG')
    all_rewards = []
    for episode in range(Config.n_episodes):
        reward_this_episode = 0
        env_info = env.reset(train_mode=True)[brain_name]
        # current state: both agents' observations concatenated into one vector
        states = torch.from_numpy(np.concatenate(env_info.vector_observations))
        scores = np.zeros(2)  # initialize the score (for each agent)
        n_of_steps = 0
        # linear decay from noise_beginning down to min_noise
        noise = max(
            Config.min_noise,
            Config.noise_beginning *
            (Config.n_episodes - episode) / Config.n_episodes)
        while True:
            n_of_steps += 1

            states_tensor = states.float()  # states is already a tensor; just cast to float32
            actions = agent.act(states_tensor, noise=noise)
            actions_array = actions.detach().numpy()
            actions_for_env = np.clip(actions_array, -1, 1)  # keep all actions in [-1, 1]

            env_info = env.step(np.array([
                actions_for_env, actions_for_env
            ]))[brain_name]  # send the same action to both rackets in the environment

            states_next = torch.from_numpy(
                np.concatenate(env_info.vector_observations))

            reward = np.sum(np.array(env_info.rewards))
            reward_this_episode += reward

            # if replay_buffer_raward_min is set, only store transitions once the
            # episode reward has reached that threshold; otherwise store everything
            if (not Config.replay_buffer_raward_min
                    or reward_this_episode >= Config.replay_buffer_raward_min):
                buffer_data = (states, torch.from_numpy(actions_for_env),
                               reward, states_next, env_info.local_done[0])
                buffer.push(buffer_data)

            dones = env_info.local_done  # see if episode finished
            scores += np.sum(env_info.rewards)  # add the summed reward to the running scores
            states = states_next  # roll over states to next time step
            if np.any(dones):  # exit loop if episode finished
                break

        all_rewards.append(reward_this_episode)
        all_rewards_mean.append(np.mean(all_rewards[-100:]))
        if len(buffer) > Config.warmup:
            agent.update(buffer,
                         batchsize=batchsize,
                         tau=Config.tau,
                         discount=Config.discount_factor)
            if episode % Config.update_episode_n == 0:
                agent.update_targets(tau=Config.tau)

        if (episode + 1) % 100 == 0 or episode == Config.n_episodes - 1:
            logger.info(
                f'Episode {episode}:  Average reward over 100 episodes is {all_rewards_mean[-1]}'
            )
            if all_rewards_mean and all_rewards_mean[-1] > max_reward:
                logger.info('Found best model. Saving model into file: ...')

                save_dict_list = []
                save_dict = {
                    'actor_params': agent.actor.state_dict(),
                    'actor_optim_params': agent.actor_optimizer.state_dict(),
                    'critic_params': agent.critic.state_dict(),
                    'critic_optim_params': agent.critic_optimizer.state_dict()
                }

                # stored twice, presumably so the checkpoint keeps one entry per racket
                save_dict_list.append(save_dict)
                save_dict_list.append(save_dict)

                torch.save(
                    save_dict_list,
                    os.path.join(file_path, 'episode-{}.pt'.format(episode)))
                max_reward = all_rewards_mean[-1]
            # redraw the learning curve from scratch and overwrite the plot file
            plt.clf()
            plt.plot(all_rewards_mean)
            plt.xlabel('N of episodes')
            plt.ylabel('Reward')
            plt.title(
                'Final rewards of single agent for tennis collaboration task')
            plt.savefig(os.path.join(file_path, 'result_plot.png'))

    save_dict = {
        'actor_params': agent.actor.state_dict(),
        'actor_target_params': agent.target_actor.state_dict(),
        'actor_optim_params': agent.actor_optimizer.state_dict(),
        'critic_params': agent.critic.state_dict(),
        'critic_target_params': agent.target_critic.state_dict(),
        'critic_optim_params': agent.critic_optimizer.state_dict()
    }

    torch.save(save_dict,
               os.path.join(file_path, 'episode-{}.pt'.format(episode)))
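
Example #3 pulls every hyperparameter from a Config object that is not shown. The sketch below lists only the attributes referenced above, with placeholder values, to make the control flow easier to follow; the actual settings in the source repository may differ.

class Config:
    # Placeholder values for illustration only; attribute names match the usage above.
    buffer_size = int(1e6)
    batchsize = 256
    warmup = 1000                    # minimum buffer size before updates start
    n_episodes = 5000
    update_episode_n = 2             # soft-update the target networks every n episodes
    tau = 1e-3
    discount_factor = 0.99
    actor_hidden = (256, 128)
    critic_hidden = (256, 128)
    actor_lr = 1e-4
    critic_lr = 1e-3
    noise_distribution = "normal"
    noise_beginning = 2.0
    min_noise = 0.1
    max_reward = 0.5                 # 100-episode mean needed before saving a checkpoint
    replay_buffer_raward_min = None  # optional reward threshold for storing transitions
    checkpoint_path = None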