Example #1
    def compute_payoff_matrix(self, pelicans, panthers):
        """
        - Pelican strategies are rows; panthers are columns
        - Payoffs are all to the pelican
        """

        # Resize the payoff matrix to make room for the new strategies,
        # keeping previously computed entries and zero-filling the new ones
        self.payoffs = np.pad(self.payoffs,
                              [(0, len(pelicans) - self.payoffs.shape[0]),
                               (0, len(panthers) - self.payoffs.shape[1])],
                              mode='constant')

        # Fill in the payoffs for the newest pelican strategy (last row)
        if self.pelican_model is not None:
            for i, opponent in enumerate(panthers):
                self.pelican_env.env_method('set_panther_using_path', opponent)
                victory_prop, avg_reward = helper.check_victory(self.pelican_model,
                                                                self.pelican_env,
                                                                trials=self.payoff_matrix_trials)
                self.payoffs[-1, i] = victory_prop

        # Fill in the payoffs for the newest panther strategy (last column)
        if self.panther_model is not None:
            for i, opponent in enumerate(pelicans):
                self.panther_env.env_method('set_pelican_using_path', opponent)
                victory_prop, avg_reward = helper.check_victory(self.panther_model,
                                                                self.panther_env,
                                                                trials=self.payoff_matrix_trials)
                self.payoffs[i, -1] = 1 - victory_prop  # convert to the pelican's perspective
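
A note on the resizing step: np.pad copies the old payoff matrix into a larger one, so all previously computed payoffs are kept and only the new last row/column (zero-filled by mode='constant') still needs to be evaluated. A minimal standalone sketch of just that behaviour, with placeholder strategy lists that are not part of the original class:

import numpy as np

# Existing 2x2 payoff matrix (pelican strategies as rows, panther strategies as columns)
payoffs = np.array([[0.6, 0.4],
                    [0.7, 0.5]])

# One new strategy has been added on each side
pelicans = ['pelican_0', 'pelican_1', 'pelican_2']  # placeholder strategy paths
panthers = ['panther_0', 'panther_1', 'panther_2']

# Pad with one zero row and one zero column; old entries are preserved
payoffs = np.pad(payoffs,
                 [(0, len(pelicans) - payoffs.shape[0]),
                  (0, len(panthers) - payoffs.shape[1])],
                 mode='constant')

print(payoffs.shape)  # (3, 3): the new row and column start at 0 and get filled next
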
Example #2
def train_agent(exp_path,
                model,
                env,
                testing_interval,
                max_steps,
                model_type,
                basicdate,
                tb_writer,
                tb_log_name,
                early_stopping=True,
                previous_steps=0):
    steps = 0
    logger.info("Beginning training for {} steps".format(max_steps))
    model.set_env(env)

    while steps < max_steps:
        logger.info("Training for {} steps".format(testing_interval))
        model.learn(testing_interval)
        steps = steps + testing_interval
        # Checkpoint the model and its env settings after each interval
        agent_filepath, _, _ = helper.save_model_with_env_settings(
            exp_path, model, model_type, env, basicdate)
        if early_stopping:
            # Evaluate the current model over 10 trials to decide whether to stop early
            victory_count, avg_reward = helper.check_victory(model,
                                                             env,
                                                             trials=10)
            if tb_writer is not None and tb_log_name is not None:
                tb_steps = steps + previous_steps
                logger.info(
                    "Writing to tensorboard for {} after {} steps".format(
                        tb_log_name, tb_steps))
                tb_writer.add_scalar('{}_avg_reward'.format(tb_log_name),
                                     avg_reward, tb_steps)
                tb_writer.add_scalar('{}_victory_count'.format(tb_log_name),
                                     victory_count, tb_steps)
            # Stop once the agent wins more than 7 of the 10 evaluation trials
            if victory_count > 7:
                logger.info("Stopping training early")
                break
    # Save the final agent
    logger.info('steps = {}'.format(steps))
    agent_filepath, _, _ = helper.save_model_with_env_settings(
        exp_path, model, model_type, env, basicdate)
    agent_filepath = os.path.dirname(agent_filepath)
    return agent_filepath, steps
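
Both this function and the payoff-matrix code above delegate evaluation to helper.check_victory, whose body is not part of this listing. A minimal sketch of what such a helper could look like, assuming a single non-vectorized Gym-style env and a hypothetical info['agent_won'] flag set at the end of an episode; the proportion-style first return value matches how Examples #1 and #3 use it, whereas this example compares it against a count, so the real helper may differ:

def check_victory(model, env, trials):
    """Roll out `trials` episodes and report (victory proportion, average reward).

    Sketch only: the info['agent_won'] key and the Gym-style single-env loop
    are assumptions, not the repository's actual implementation.
    """
    wins = 0
    total_reward = 0.0
    for _ in range(trials):
        obs = env.reset()
        done = False
        episode_reward = 0.0
        info = {}
        while not done:
            action, _ = model.predict(obs)        # stable-baselines style predict
            obs, reward, done, info = env.step(action)
            episode_reward += reward
        wins += int(bool(info.get('agent_won')))  # hypothetical end-of-episode flag
        total_reward += episode_reward
    return wins / trials, total_reward / trials
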
Example #3
    def eval_agent_against_mixture(self,
                                   exp_path,
                                   driving_agent, # the agent that we train
                                   model,
                                   env, # either a single env or a SubprocVecEnv
                                   opponent_policy_fpaths, # policy paths of the driving agent's opponent
                                   opponent_mixture, # mixture weights over the opponent's policies
                                   n_eps): # number of evaluation episodes

        ################################################################
        # Heuristic to compute number of opponents to sample as mixture
        ################################################################
        # Smallest positive probability in the mixture
        min_prob = min([pr for pr in opponent_mixture if pr > 0])
        # Enough samples that the rarest opponent is expected to appear roughly num_parallel_envs times
        target_n_opponents = self.num_parallel_envs * int(1.0 / min_prob)
        n_opponents = min(target_n_opponents, self.max_n_opponents_to_sample)

        if self.parallel:
            # Ensure that n_opponents is a multiple of self.num_parallel_envs
            n_opponents = self.num_parallel_envs * round(n_opponents / self.num_parallel_envs)

        logger.info("=============================================")
        logger.info("Sampling %d opponents" % n_opponents)
        logger.info("=============================================")

        # Sample n_opponents policies according to the mixture
        opponents = np.random.choice(opponent_policy_fpaths,
                                     size=n_opponents,
                                     p=opponent_mixture)

        logger.info("=============================================")
        logger.info("Opponents has %d elements" % len(opponents))
        logger.info("=============================================")

        victories = []
        avg_rewards = []
        # With parallel envs, we run the evaluations against the sampled opponents in parallel
        if self.parallel:
            # Method used to load a new opponent from its filepath
            setter = 'set_panther_using_path' if driving_agent == 'pelican' else 'set_pelican_using_path'
            for i, opponent in enumerate(opponents):
                # Put this opponent in the right slot, looping back after self.num_parallel_envs
                env.env_method(setter, opponent, indices=[i % self.num_parallel_envs])
                # Once all self.num_parallel_envs slots are filled, run the evaluation
                if i > 0 and (i + 1) % self.num_parallel_envs == 0:
                    logger.info("Beginning parallel eval for {} steps".format(self.training_steps))
                    model.set_env(env)

                    victory_prop, avg_reward = helper.check_victory(model, env, trials=n_eps)

                    victories.append(victory_prop)
                    avg_rewards.append(avg_reward)
        
        # Otherwise we load each sampled opponent in turn and evaluate against it sequentially
        else:
            for opponent in opponents:
                if driving_agent == 'pelican':
                    env.set_panther_using_path(opponent)
                else:
                    env.set_pelican_using_path(opponent)
                logger.info("Beginning sequential eval for {} steps".format(self.training_steps))
                model.set_env(env)
                victory_prop, avg_reward = helper.check_victory(model, env, trials=n_eps)
                victories.append(victory_prop)
                avg_rewards.append(avg_reward)

        return np.mean(victories)  # , np.mean(avg_rewards)
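
The opponent-sampling heuristic at the top of this method can be checked in isolation: draw enough opponents that the least likely policy in the mixture is still expected to be sampled about num_parallel_envs times, cap that at max_n_opponents_to_sample, and round to a multiple of the number of parallel envs so every slot gets filled. A standalone sketch with made-up numbers (the mixture weights, caps, and file names below are illustrative only):

import numpy as np

num_parallel_envs = 4
max_n_opponents_to_sample = 64

opponent_policy_fpaths = ['opp_0.zip', 'opp_1.zip', 'opp_2.zip']  # placeholder policy paths
opponent_mixture = [0.5, 0.25, 0.25]                              # must sum to 1

min_prob = min(pr for pr in opponent_mixture if pr > 0)           # 0.25
target_n_opponents = num_parallel_envs * int(1.0 / min_prob)      # 4 * 4 = 16
n_opponents = min(target_n_opponents, max_n_opponents_to_sample)  # 16

# Round to a multiple of num_parallel_envs so the parallel slots divide evenly
n_opponents = num_parallel_envs * round(n_opponents / num_parallel_envs)  # 16

opponents = np.random.choice(opponent_policy_fpaths,
                             size=n_opponents,
                             p=opponent_mixture)
print(len(opponents), opponents[:4])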