def compute_payoff_matrix(self, pelicans, panthers):
    """
    - Pelican strategies are rows; panther strategies are columns
    - Payoffs are all to the pelican
    """
    # Resize the payoff matrix to make room for the new strategies.
    self.payoffs = np.pad(self.payoffs,
                          [(0, len(pelicans) - self.payoffs.shape[0]),
                           (0, len(panthers) - self.payoffs.shape[1])],
                          mode='constant')

    # Add payoffs for the newest pelican strategy (last row).
    if self.pelican_model is not None:
        for i, opponent in enumerate(panthers):
            self.pelican_env.env_method('set_panther_using_path', opponent)
            victory_prop, avg_reward = helper.check_victory(self.pelican_model,
                                                            self.pelican_env,
                                                            trials=self.payoff_matrix_trials)
            self.payoffs[-1, i] = victory_prop

    # Add payoffs for the newest panther strategy (last column).
    if self.panther_model is not None:
        for i, opponent in enumerate(pelicans):
            self.panther_env.env_method('set_pelican_using_path', opponent)
            victory_prop, avg_reward = helper.check_victory(self.panther_model,
                                                            self.panther_env,
                                                            trials=self.payoff_matrix_trials)
            # Payoffs are stored in terms of the pelican, so invert the panther's victory proportion.
            self.payoffs[i, -1] = 1 - victory_prop
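
# Illustrative sketch, not part of the original code: one common way to use the
# payoff matrix built above is to solve the (constant-sum) pelican-vs-panther game
# for a mixed strategy with a linear program. The helper below and its use of
# scipy are assumptions for illustration, not this module's API.
def solve_pelican_mixture(payoffs):
    """Return (pelican_mixture, game_value) for the pelican (row) player.

    payoffs[i, j] is the pelican's payoff when pelican strategy i meets panther
    strategy j; the panther is assumed to receive 1 - payoffs[i, j].
    """
    from scipy.optimize import linprog

    payoffs = np.asarray(payoffs, dtype=float)
    n_rows, n_cols = payoffs.shape
    # Variables are [x_1, ..., x_n, v]: the row mixture x and the game value v.
    # Maximise v subject to the pelican's expected payoff against every panther
    # column being at least v, with x a probability distribution.
    c = np.zeros(n_rows + 1)
    c[-1] = -1.0                                          # maximise v == minimise -v
    A_ub = np.hstack([-payoffs.T, np.ones((n_cols, 1))])  # v - x . payoffs[:, j] <= 0
    b_ub = np.zeros(n_cols)
    A_eq = np.ones((1, n_rows + 1))
    A_eq[0, -1] = 0.0                                     # probabilities sum to 1
    b_eq = np.ones(1)
    bounds = [(0, None)] * n_rows + [(None, None)]        # x >= 0, v unbounded
    res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds)
    return res.x[:n_rows], res.x[-1]
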
def train_agent(exp_path,
                model,
                env,
                testing_interval,
                max_steps,
                model_type,
                basicdate,
                tb_writer,
                tb_log_name,
                early_stopping=True,
                previous_steps=0):
    steps = 0
    logger.info("Beginning training for {} steps".format(max_steps))
    model.set_env(env)

    while steps < max_steps:
        logger.info("Training for {} steps".format(testing_interval))
        model.learn(testing_interval)
        steps = steps + testing_interval
        # Checkpoint the agent after every testing interval.
        agent_filepath, _, _ = helper.save_model_with_env_settings(exp_path, model, model_type, env, basicdate)

        if early_stopping:
            victory_count, avg_reward = helper.check_victory(model, env, trials=10)
            if tb_writer is not None and tb_log_name is not None:
                tb_steps = steps + previous_steps
                logger.info("Writing to tensorboard for {} after {} steps".format(tb_log_name, tb_steps))
                tb_writer.add_scalar('{}_avg_reward'.format(tb_log_name), avg_reward, tb_steps)
                tb_writer.add_scalar('{}_victory_count'.format(tb_log_name), victory_count, tb_steps)
            if victory_count > 7:
                logger.info("Stopping training early")
                break  # Stop training early: the agent is already winning consistently.

    # Save the trained agent.
    logger.info('steps = ' + str(steps))
    agent_filepath, _, _ = helper.save_model_with_env_settings(exp_path, model, model_type, env, basicdate)
    agent_filepath = os.path.dirname(agent_filepath)
    return agent_filepath, steps
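
# Illustrative sketch, not part of the original code: train_agent only assumes
# that tb_writer exposes add_scalar(tag, value, step), which a PyTorch
# SummaryWriter satisfies. The log directory below is hypothetical.
def make_tb_writer(log_dir='runs/example'):
    from torch.utils.tensorboard import SummaryWriter
    return SummaryWriter(log_dir=log_dir)
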
def eval_agent_against_mixture(self,
                               exp_path,
                               driving_agent,           # agent that we train
                               model,
                               env,                     # either a single env or a SubprocVecEnv
                               opponent_policy_fpaths,  # policies of the driving agent's opponent
                               opponent_mixture,        # mixture over the opponent's policies
                               n_eps):                  # number of evaluation episodes
    ################################################################
    # Heuristic to compute number of opponents to sample as mixture
    ################################################################
    # Minimum positive probability in the mixture.
    min_prob = min([pr for pr in opponent_mixture if pr > 0])
    # E.g. a smallest positive weight of 0.1 with 4 parallel envs targets about 4 * 10 = 40 opponents.
    target_n_opponents = self.num_parallel_envs * int(1.0 / min_prob)
    n_opponents = min(target_n_opponents, self.max_n_opponents_to_sample)

    if self.parallel:
        # Ensure that n_opponents is a multiple of self.num_parallel_envs.
        n_opponents = self.num_parallel_envs * round(n_opponents / self.num_parallel_envs)

    logger.info("=============================================")
    logger.info("Sampling %d opponents" % n_opponents)
    logger.info("=============================================")

    # Sample n_opponents opponents from the mixture.
    opponents = np.random.choice(opponent_policy_fpaths,
                                 size=n_opponents,
                                 p=opponent_mixture)
    logger.info("=============================================")
    logger.info("Opponents has %d elements" % len(opponents))
    logger.info("=============================================")

    victories = []
    avg_rewards = []

    # If we use parallel envs, we run the evaluation against the different sampled opponents in parallel.
    if self.parallel:
        # Method used to load new opponents via their filepath.
        setter = 'set_panther_using_path' if driving_agent == 'pelican' else 'set_pelican_using_path'
        for i, opponent in enumerate(opponents):
            # Place this opponent in the right slot, looping back after self.num_parallel_envs.
            env.env_method(setter, opponent, indices=[i % self.num_parallel_envs])
            # Once all self.num_parallel_envs slots have been filled, evaluate.
            if i > 0 and (i + 1) % self.num_parallel_envs == 0:
                logger.info("Beginning parallel eval for {} steps".format(self.training_steps))
                model.set_env(env)
                victory_prop, avg_reward = helper.check_victory(model, env, trials=n_eps)
                victories.append(victory_prop)
                avg_rewards.append(avg_reward)
    # Otherwise we sample different opponents and evaluate against each of them separately.
    else:
        for opponent in opponents:
            if driving_agent == 'pelican':
                env.set_panther_using_path(opponent)
            else:
                env.set_pelican_using_path(opponent)
            logger.info("Beginning sequential eval for {} steps".format(self.training_steps))
            model.set_env(env)
            victory_prop, avg_reward = helper.check_victory(model, env, trials=n_eps)
            victories.append(victory_prop)
            avg_rewards.append(avg_reward)

    return np.mean(victories)  # , np.mean(avg_rewards)
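
# Illustrative sketch, not part of the original code: eval_agent_against_mixture
# is a Monte Carlo estimate of the exact expected payoff against the opponent
# mixture, which for a known payoff row is just the weighted average below.
# payoff_row[j] is assumed to hold the driving agent's payoff against opponent
# policy j.
def expected_payoff_against_mixture(payoff_row, opponent_mixture):
    return float(np.dot(payoff_row, opponent_mixture))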