Example #1
def save_model(exp_path, model, model_type, env, basicdate):
    logger.info("Saving model")

    helper.save_model_with_env_settings(exp_path, model, model_type, env, basicdate)

    video_path = os.path.join(exp_path, 'training.mp4')
    helper.make_video(model, env, video_path)
Example #2
def save_model(exp_path, model, model_type, env, basicdate):
    logger.info("Saving model")

    # helper.save_model(exp_path, model, model_type, env.driving_agent, env.render_height, env.render_width, image_based, basicdate)
    helper.save_model_with_env_settings(exp_path, model, model_type, env, basicdate)

    video_path = os.path.join(exp_path, 'training.mp4')
    helper.make_video(model, env, video_path)
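A minimal usage sketch for the save_model wrapper above; the timestamp, output directory, and model type below are illustrative placeholders, and the trained model and env are assumed to come from earlier training code.

# Hypothetical usage sketch (placeholder values, not from the original source):
import os
from datetime import datetime

basicdate = datetime.now().strftime("%Y%m%d_%H%M%S")                  # timestamp label, as in the other examples
exp_path = os.path.join("/data/agents", "dqn_pelican_" + basicdate)   # assumed output directory
# `model` and `env` are assumed to exist already, e.g.:
# model = helper.make_new_model("DQN", "CnnPolicy", env)
# save_model(exp_path, model, "DQN", env, basicdate)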
Example #3
def evaluate_algorithms(exp_name, base_path, tb_enabled, algorithms,
                        victory_threshold, victory_trials, max_seconds,
                        testing_interval, use_non_image):
    basicdate = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
    exp_name = "{}_{}".format(exp_name, basicdate)
    exp_path = os.path.join(base_path, exp_name)
    logger.info("Storing results in {}".format(exp_path))

    writer = None
    if tb_enabled:
        writer = SummaryWriter(exp_path)

    for algo in algorithms:
        tb_log_name = "{}_non_image".format(algo) if use_non_image else algo
        logger.info("Evaluating algorithm: {}; non-image: {}".format(
            algo, use_non_image))
        if use_non_image:
            image_based = False
            env = plark_env_non_image_state.PlarkEnvNonImageState(
                driving_agent='pelican',
                config_file_path=
                '/Components/plark-game/plark_game/game_config/10x10/panther_easy.json'
            )
            policy = "MlpPolicy"  # CnnPolicy doesn't work with MultiDiscrete observation space
        else:
            image_based = True
            env = plark_env.PlarkEnv(
                driving_agent='pelican',
                config_file_path=
                '/Components/plark-game/plark_game/game_config/10x10/panther_easy.json'
            )
            policy = "CnnPolicy"

        model = helper.make_new_model(algo, policy, env)
        helper.train_until(model,
                           env,
                           victory_threshold,
                           victory_trials,
                           max_seconds,
                           testing_interval,
                           tb_writer=writer,
                           tb_log_name=tb_log_name)
        helper.save_model_with_env_settings(exp_path, model, algo, env,
                                            image_based, basicdate)

    if writer is not None:
        writer.close()
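For reference, a hypothetical invocation of evaluate_algorithms; every argument value below is an illustrative placeholder, not taken from the original experiment configuration.

# Hypothetical call (all values are placeholders):
evaluate_algorithms(exp_name="algo_comparison",
                    base_path="/data/experiments",      # assumed output root
                    tb_enabled=True,                     # create a SummaryWriter for TensorBoard logging
                    algorithms=["PPO2", "DQN"],          # assumed to be names accepted by helper.make_new_model
                    victory_threshold=0.8,
                    victory_trials=10,
                    max_seconds=3600,
                    testing_interval=1000,
                    use_non_image=False)                 # image-based env with CnnPolicy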
Example #4
def train_agent(exp_path,
                model,
                env,
                testing_interval,
                max_steps,
                model_type,
                basicdate,
                tb_writer,
                tb_log_name,
                early_stopping=True,
                previous_steps=0):
    steps = 0
    logger.info("Beginning training for {} steps".format(max_steps))
    model.set_env(env)

    while steps < max_steps:
        logger.info("Training for {} steps".format(testing_interval))
        model.learn(testing_interval)
        steps = steps + testing_interval
        agent_filepath, _, _ = helper.save_model_with_env_settings(
            exp_path, model, model_type, env, basicdate)
        if early_stopping:
            victory_count, avg_reward = helper.check_victory(model,
                                                             env,
                                                             trials=10)
            if tb_writer is not None and tb_log_name is not None:
                tb_steps = steps + previous_steps
                logger.info(
                    "Writing to tensorboard for {} after {} steps".format(
                        tb_log_name, tb_steps))
                tb_writer.add_scalar('{}_avg_reward'.format(tb_log_name),
                                     avg_reward, tb_steps)
                tb_writer.add_scalar('{}_victory_count'.format(tb_log_name),
                                     victory_count, tb_steps)
            if victory_count > 7:
                logger.info("Stopping training early")
                break  # Stop training early once the agent is consistently winning
    # Save the final agent
    logger.info('steps = ' + str(steps))
    agent_filepath, _, _ = helper.save_model_with_env_settings(
        exp_path, model, model_type, env, basicdate)
    agent_filepath = os.path.dirname(agent_filepath)
    return agent_filepath, steps
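# --- Hedged usage sketch for train_agent (not part of the original source) ---
# The values below are illustrative placeholders; `model`, `env`, `exp_path` and
# `basicdate` are assumed to come from the surrounding experiment setup.
#
# writer = SummaryWriter(exp_path)          # optional; pass tb_writer=None to skip TensorBoard logging
# agent_path, steps_done = train_agent(exp_path,
#                                      model,
#                                      env,
#                                      testing_interval=1000,   # checkpoint and evaluate every 1000 steps
#                                      max_steps=20000,
#                                      model_type="PPO2",
#                                      basicdate=basicdate,
#                                      tb_writer=writer,
#                                      tb_log_name="PPO2_pelican",
#                                      early_stopping=True)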
def compare_envs(exp_name, base_path, tb_enabled, victory_threshold,
                 victory_trials, max_seconds, testing_interval,
                 num_parallel_envs, non_image):
    basicdate = str(datetime.now().strftime("%Y%m%d_%H%M%S"))
    exp_name = "{}_{}".format(exp_name, basicdate)
    exp_path = os.path.join(base_path, exp_name)
    logger.info("Storing results in {}".format(exp_path))

    writer = None
    if tb_enabled:
        writer = SummaryWriter(exp_path)

    for parallel in [False, True]:
        algo = "PPO2"
        policy = "MlpPolicy" if non_image else "CnnPolicy"
        tb_log_name = "{}_parallel".format(algo) if parallel else algo
        logger.info("Evaluating {}; parallel: {}".format(algo, parallel))
        if parallel:
            logger.info("Evaluating using {} parallel environments".format(
                num_parallel_envs))
            env_fn = createNonImageEnv if non_image else createImageEnv
            env = SubprocVecEnv([env_fn for _ in range(num_parallel_envs)])
        else:
            env = createNonImageEnv() if non_image else createImageEnv()

        model = helper.make_new_model(algo, policy, env)
        helper.train_until(model,
                           env,
                           victory_threshold,
                           victory_trials,
                           max_seconds,
                           testing_interval,
                           tb_writer=writer,
                           tb_log_name=tb_log_name)
        helper.save_model_with_env_settings(exp_path, model, algo, env,
                                            basicdate)

    if writer is not None:
        writer.close()
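The createImageEnv and createNonImageEnv factories passed to SubprocVecEnv above are not shown in this listing; a plausible sketch, assuming the same environment constructors and config path as Example #3, might look like the following. SubprocVecEnv needs picklable, argument-free callables, which is why the factories are passed uninvoked in the parallel branch.

# Plausible sketch of the env factories (assumed, modelled on the constructors in Example #3):
def createImageEnv():
    return plark_env.PlarkEnv(
        driving_agent='pelican',
        config_file_path='/Components/plark-game/plark_game/game_config/10x10/panther_easy.json')

def createNonImageEnv():
    return plark_env_non_image_state.PlarkEnvNonImageState(
        driving_agent='pelican',
        config_file_path='/Components/plark-game/plark_game/game_config/10x10/panther_easy.json')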
Example #6
    def train_agent(self,
                    exp_path, # Path for saving the agent
                    model,
                    env):     # Can be either single env or vec env
                     
        logger.info("Beginning individual training for {} steps".format(self.training_steps))
        model.set_env(env)
        model.learn(self.training_steps)

        logger.info('Finished train agent')
        savepath = self.basicdate + '_pnm_iteration_' + str(self.pnm_iteration)
        agent_filepath, _, _ = helper.save_model_with_env_settings(exp_path, model, self.model_type, env, savepath)
        agent_filepath = os.path.dirname(agent_filepath)
        return agent_filepath
Example #7
def save():
    logger.info(str(retrain_iter))
    logger.info(str(retrain_values))

    plt.figure(figsize=(9, 3))
    plt.subplot(131)
    plt.bar(retrain_iter, retrain_values)
    plt.subplot(132)
    plt.scatter(retrain_iter, retrain_values)
    plt.subplot(133)
    plt.plot(retrain_iter, retrain_values)
    plt.suptitle('Retraining Progress')
    image_based = False
    model_path, model_dir, modellabel = helper.save_model_with_env_settings(
        basepath, model, modeltype, env, image_based, basicdate)
    fig_path = os.path.join(model_dir, 'Training_Progress.png')
    plt.savefig(fig_path)
    print('Model saved to ', model_path)
Example #8
    def run_pnm(self):

        panther_agent_filepath, pelican_agent_filepath = self.initialAgents()

        # Initialize old NE stuff for stopping criterion
        value_to_pelican = 0.
        mixture_pelicans = np.array([1.])
        mixture_panthers = np.array([1.])

        # Create DataFrames for plotting purposes
        df_cols = ["NE_Payoff", "Pelican_BR_Payoff", "Panther_BR_Payoff", "Pelican_supp_size", "Panther_supp_size"]
        df = pd.DataFrame(columns = df_cols)
        # second df for periodic, more rigorous exploitability checks
        exploit_df_cols = ["iter",  "NE_Payoff", "Pelican_BR_Payoffs", "Panther_BR_Payoffs"]
        exploit_df = pd.DataFrame(columns = exploit_df_cols)

        # Train best responses until Nash equilibrium is found or max_iterations are reached
        logger.info('Parallel Nash Memory (PNM)')
        for self.pnm_iteration in range(self.max_pnm_iterations):
            start = time.time()

            logger.info("*********************************************************")
            logger.info('PNM iteration ' + str(self.pnm_iteration + 1) + ' of ' + str(self.max_pnm_iterations))
            logger.info("*********************************************************")

            self.pelicans.append(pelican_agent_filepath)
            self.panthers.append(panther_agent_filepath)

            if self.pnm_iteration == 0:
                self.compute_initial_payoffs()

            # Compute the payoff matrices and solve the corresponding LPs
            # Only compute the pelican's payoffs in the sparse env; the panther's matrix is the negative transpose (the game is zero-sum)
            logger.info('Computing payoffs and mixtures')
            self.compute_payoff_matrix(self.pelicans, self.panthers)
            logger.info("=================================================")
            logger.info("New matrix game:")
            logger.info("As numpy array:")
            logger.info('\n' + str(self.payoffs))
            logger.info("As dataframe:")
            tmp_df = pd.DataFrame(self.payoffs).rename_axis('Pelican', axis = 0).rename_axis('Panther', axis = 1)
            logger.info('\n' + str(tmp_df))

            # save payoff matrix
            np.save('%s/payoffs_%d.npy' % (self.pnm_logs_exp_path, self.pnm_iteration), self.payoffs)

            def get_support_size(mixture):
                # Return the support size of the mixed strategy, i.e. the number of pure strategies with positive probability
                return sum([1 if m > 0 else 0 for m in mixture])

            # Check whether we have found a stable NE; if so we are done (after updating the DataFrame)
            if self.pnm_iteration > 0:
                # Both BR payoffs (from against last time's NE) in terms of pelican payoff
                br_value_pelican = np.dot(mixture_pelicans, self.payoffs[-1, :-1])
                br_value_panther = np.dot(mixture_panthers, self.payoffs[:-1, -1])

                ssize_pelican = get_support_size(mixture_pelicans)
                ssize_panther = get_support_size(mixture_panthers)

                logger.info("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
                logger.info("\n\
                             Pelican BR payoff: %.3f,\n\
                             Value of Game: %.3f,\n\
                             Panther BR payoff: %.3f,\n\
                             Pelican Supp Size: %d,\n\
                             Panther Supp Size: %d,\n" % (
                                                          br_value_pelican,
                                                          value_to_pelican,
                                                          br_value_panther,
                                                          ssize_pelican,
                                                          ssize_panther
                                                          ))
                logger.info("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
                values = dict(zip(df_cols, [value_to_pelican, br_value_pelican,
                                                              br_value_panther,
                                                              ssize_pelican,
                                                              ssize_panther]))
                df = df.append(values, ignore_index = True)

                # Write to csv file
                df_path =  os.path.join(self.exp_path, 'values_iter_%02d.csv' % self.pnm_iteration)
                df.to_csv(df_path, index = False)
                helper.get_fig(df)
                fig_path = os.path.join(self.exp_path, 'values_iter_%02d.pdf' % self.pnm_iteration)
                plt.savefig(fig_path)
                print("==========================================")
                print("WRITTEN VALUES DF TO CSV: %s" % df_path)
                print("==========================================")

                # here value_to_pelican is from the last time the subgame was solved
                if abs(br_value_pelican - value_to_pelican) < self.stopping_eps and\
                   abs(br_value_panther - value_to_pelican) < self.stopping_eps:

                    print('Stable Nash Equilibrium found')
                    break

            logger.info("SOLVING NEW GAME:")
            # solve game for pelican
            (mixture_pelicans, value_to_pelican) = lp_solve.solve_zero_sum_game(self.payoffs)
            # with np.printoptions(precision=3):
            logger.info(mixture_pelicans)
            mixture_pelicans /= np.sum(mixture_pelicans)
            # with np.printoptions(precision=3):
            logger.info("After normalisation:")
            logger.info(mixture_pelicans)
            np.save('%s/mixture_pelicans_%d.npy' % (self.pnm_logs_exp_path, self.pnm_iteration), mixture_pelicans)

            # solve game for panther
            (mixture_panthers, value_panthers) = lp_solve.solve_zero_sum_game(-self.payoffs.transpose())
            # with np.printoptions(precision=3):
            logger.info(mixture_panthers)
            mixture_panthers /= np.sum(mixture_panthers)
            # with np.printoptions(precision=3):
            logger.info("After normalisation:")
            logger.info(mixture_panthers)
            np.save('%s/mixture_panthers_%d.npy' % (self.pnm_logs_exp_path, self.pnm_iteration), mixture_panthers)

            # end of logging matrix game and solution
            logger.info("=================================================")

            # Train from scratch or retrain an existing model for pelican
            logger.info('Training pelican')
            
            self.pelican_model = self.bootstrap(self.pelicans, self.pelican_env, mixture_pelicans)
                
            pelican_agent_filepath = self.train_agent_against_mixture('pelican',
                                                                      self.pelicans_tmp_exp_path,
                                                                      self.pelican_model,
                                                                      self.pelican_env,
                                                                      self.panthers,
                                                                      mixture_panthers,
                                                                      self.training_steps)

            
            
            
            # Train from scratch or retrain an existing model for panther
            logger.info('Training panther')
            
            self.panther_model = self.bootstrap(self.panthers, self.panther_env, mixture_panthers)
            
            panther_agent_filepath = self.train_agent_against_mixture('panther',
                                                                     self.panthers_tmp_exp_path,
                                                                     self.panther_model,
                                                                     self.panther_env,
                                                                     self.pelicans,
                                                                     mixture_pelicans,
                                                                     self.training_steps)

            logger.info("PNM iteration lasted: %d seconds" % (time.time() - start))

            if self.pnm_iteration  > 0 and self.pnm_iteration  % self.testing_interval == 0:
                # Find best pelican (protagonist) against panther (opponent) mixture
                candidate_pelican_rbbr_fpaths, candidate_pelican_rbbr_win_percentages = self.iter_train_against_mixture(
                                                self.exploit_n_rbbrs, # Number of resource bounded best responses
                                                self.pelicans_tmp_exp_path,
                                                self.pelican_model, # driving_agent, # agent that we train
                                                self.pelican_env, # env, # Can either be a single env or subvecproc
                                                self.pelicans, # Filepaths to existing models
                                                mixture_pelicans, # mixture for bootstrapping
                                                self.panthers, # opponent_policy_fpaths, # policies of opponent of driving agent
                                                mixture_panthers) # opponent_mixture)

                logger.info("################################################")
                logger.info('candidate_pelican_rbbr_win_percentages: %s' %  np.round(candidate_pelican_rbbr_win_percentages,2))
                logger.info("################################################")
                br_values_pelican = np.round(candidate_pelican_rbbr_win_percentages,2).tolist()

                candidate_panther_rbbr_fpaths, candidate_panther_rbbr_win_percentages = self.iter_train_against_mixture(
                                                self.exploit_n_rbbrs, # Number of resource bounded best responses
                                                self.panthers_tmp_exp_path,
                                                self.panther_model, # driving_agent, # agent that we train
                                                self.panther_env, # env, # Can either be a single env or subvecproc
                                                self.panthers, # Filepaths to existing models
                                                mixture_panthers, # mixture for bootstrapping
                                                self.pelicans, # opponent_policy_fpaths, # policies of opponent of driving agent
                                                mixture_pelicans) # opponent_mixture)

                logger.info("################################################")
                logger.info('candidate_panther_rbbr_win_percentages: %s' % np.round(candidate_panther_rbbr_win_percentages,2))
                logger.info("################################################")
                br_values_panther = [1-p for p in np.round(candidate_panther_rbbr_win_percentages,2)]

                values = dict(zip(exploit_df_cols, [self.pnm_iteration,
                                                    value_to_pelican, 
                                                    br_values_pelican,
                                                    br_values_panther]))
                exploit_df = exploit_df.append(values, ignore_index = True)

                # add medians
                exploit_df['pelican_median'] = exploit_df['Pelican_BR_Payoffs'].apply(np.median)
                exploit_df['panther_median'] = exploit_df['Panther_BR_Payoffs'].apply(np.median)

                # Write to csv file
                df_path =  os.path.join(self.exp_path, 'exploit_iter_%02d.csv' % self.pnm_iteration)

                tmp_df = exploit_df.set_index('iter')
                tmp_df.to_csv(df_path, index = True)

                helper.get_fig_with_exploit(df, tmp_df)
                fig_path = os.path.join(self.exp_path, 'values_with_exploit_iter_%02d.pdf' % self.pnm_iteration)
                plt.savefig(fig_path)
                print("==========================================")
                print("WRITTEN EXPLOIT DF TO CSV: %s" % df_path)
                print("==========================================")

                if self.video_flag:
                    # Occasionally output useful things along the way
                    # Make videos
                    verbose = False
                    video_path =  os.path.join(self.exp_path, 'pelican_pnm_iter_%02d.mp4' % self.pnm_iteration)
                    basewidth,hsize = helper.make_video_VEC_ENV(self.pelican_model, 
                                                                self.pelican_env, 
                                                                video_path,
                                                                fps=self.fps,
                                                                basewidth=self.basewidth,
                                                                n_steps=self.video_steps,
                                                                verbose=verbose)
                                                                
                    video_path =  os.path.join(self.exp_path, 'panther_pnm_iter_%02d.mp4' % self.pnm_iteration)
                    basewidth,hsize = helper.make_video_VEC_ENV(self.panther_model, 
                                                                self.panther_env, 
                                                                video_path,
                                                                fps=self.fps,
                                                                basewidth=self.basewidth,
                                                                n_steps=self.video_steps,
                                                                verbose=verbose)


        # Saving final mixture and corresponding agents
        logger.info("################################################")
        logger.info("Saving final pelican mixtures and agents:")
        support_pelicans = np.nonzero(mixture_pelicans)[0]
        mixture_pelicans = mixture_pelicans[support_pelicans]
        np.save(self.exp_path + '/final_mixture_pelicans.npy', mixture_pelicans)
        logger.info("Final pelican mixture saved to: %s" % self.exp_path + '/final_mixture_pelicans.npy')
        for i, idx in enumerate(support_pelicans):
            self.pelican_model = helper.loadAgent(glob.glob(self.pelicans[idx] + "/*.zip")[0], self.model_type)
            agent_filepath ,_, _= helper.save_model_with_env_settings(self.pelicans_tmp_exp_path,
                                                                      self.pelican_model,
                                                                      self.model_type,
                                                                      self.pelican_env,
                                                                      self.basicdate + "_ps_" + str(i))
            logger.info("Saving  pelican %d to %s" % (i, agent_filepath))
        support_panthers = np.nonzero(mixture_panthers)[0]
        mixture_panthers = mixture_panthers[support_panthers]
        np.save(self.exp_path + '/final_mixture_panthers.npy', mixture_panthers)
        logger.info("Final panther mixture saved to: %s" % self.exp_path + '/final_mixture_panthers.npy')
        for i, idx in enumerate(support_panthers):
            self.panther_model = helper.loadAgent(glob.glob(self.panthers[idx] + "/*.zip")[0], self.model_type)
            agent_filepath ,_, _= helper.save_model_with_env_settings(self.panthers_tmp_exp_path,
                                                                      self.panther_model,
                                                                      self.model_type,
                                                                      self.panther_env,
                                                                      self.basicdate + "_ps_" + str(i))

            logger.info("Saving  panther %d to %s" % (i, agent_filepath))
Example #9
    def train_agent_against_mixture(self,
                                    driving_agent, # agent that we train
                                    exp_path,
                                    model,
                                    env, # Can either be a single env or subvecproc
                                    opponent_policy_fpaths, # policies of opponent of driving agent
                                    opponent_mixture, # mixture of opponent of driving agent
                                    training_steps,
                                    filepath_addon=''):
                                    
        ################################################################
        # Heuristic to compute number of opponents to sample as mixture
        ################################################################
        # Min positive probability
        min_prob = min([pr for pr in opponent_mixture if pr > 0])
        target_n_opponents = self.num_parallel_envs * int(1.0 / min_prob)
        n_opponents = min(target_n_opponents, self.max_n_opponents_to_sample)

        if self.parallel:
            # Ensure that n_opponents is a multiple of num_parallel_envs
            n_opponents = self.num_parallel_envs * round(n_opponents / self.num_parallel_envs)

        logger.info("=============================================")
        logger.info("Sampling %d opponents" % n_opponents)
        logger.info("=============================================")

        # Sample n_opponents
        opponents = np.random.choice(opponent_policy_fpaths,
                                     size = n_opponents,
                                     p = opponent_mixture)

        logger.info("=============================================")
        logger.info("Opponents has %d elements" % len(opponents))
        logger.info("=============================================")

        # If we use parallel envs, we run all the training against different sampled opponents in parallel
        if self.parallel:
            # Method to load new opponents via filepath
            setter = 'set_panther_using_path' if driving_agent == 'pelican' else 'set_pelican_using_path'
            for i, opponent in enumerate(opponents):
                # Stick this in the right slot, looping back after self.num_parallel_envs
                env.env_method(setter, opponent, indices = [i % self.num_parallel_envs])
                # When we have filled all self.num_parallel_envs, then train
                if i > 0 and (i + 1) % self.num_parallel_envs == 0:
                    logger.info("Beginning parallel training for {} steps".format(self.training_steps))
                    model.set_env(env)
                    model.learn(training_steps)

        # Otherwise we sample different opponents and we train against each of them separately
        else:
            for opponent in opponents:
                if driving_agent == 'pelican':
                    env.set_panther_using_path(opponent)
                else:
                    env.set_pelican_using_path(opponent)
                logger.info("Beginning sequential training for {} steps".format(self.training_steps))
                model.set_env(env)
                model.learn(self.training_steps)

        # Save agent
        logger.info('Finished train agent')
        savepath = self.basicdate + '_pnm_iteration_' + str(self.pnm_iteration) + filepath_addon
        agent_filepath, _, _ = helper.save_model_with_env_settings(exp_path, model, self.model_type, env, savepath)
        agent_filepath = os.path.dirname(agent_filepath)
        return agent_filepath
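A small self-contained illustration of the opponent-sampling heuristic above, with made-up numbers: the rarest opponent in the mixture sets how many opponents are drawn, capped by a maximum and rounded to a multiple of the number of parallel environments.

# Worked example of the sampling heuristic (all numbers and paths are made up):
import numpy as np

opponent_mixture = np.array([0.5, 0.3, 0.2, 0.0])
opponent_policy_fpaths = ["opp_a", "opp_b", "opp_c", "opp_d"]    # placeholder file paths
num_parallel_envs = 4
max_n_opponents_to_sample = 64

min_prob = min(pr for pr in opponent_mixture if pr > 0)                    # 0.2
target_n_opponents = num_parallel_envs * int(1.0 / min_prob)               # 4 * 5 = 20
n_opponents = min(target_n_opponents, max_n_opponents_to_sample)           # 20
n_opponents = num_parallel_envs * round(n_opponents / num_parallel_envs)   # keep it a multiple of 4

opponents = np.random.choice(opponent_policy_fpaths, size=n_opponents, p=opponent_mixture)
print(n_opponents, list(opponents[:5]))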
Example #10
    model = DQN('CnnPolicy', env)
    model.learn(50)
    logger.info('STARTING STAGE 1 INITIAL EVALUATION')
    stg1_mean_reward, n_steps = evaluate_policy(model, env, n_eval_episodes=1, deterministic=False, render=False, callback=None, reward_threshold=None, return_episode_rewards=False)
    logger.info('FINISHING STAGE 1 INITIAL EVALUATION')
    stage1result = retrain(stg1_mean_reward, stage_one_threshold, 0, env, model)
    if stage1result == True:
        logger.info("Stage One Threshold Met")
        logger.info("Stage 2 Training Started")
        env = plark_env_guided_reward.PlarkEnvGuidedReward(config_file_path=easy_config)
        model.set_env(env)
        model.learn(50)
        logger.info('STARTING STAGE 2 INITIAL EVALUATION')
        stg2_mean_reward, n_steps = evaluate_policy(model, env, n_eval_episodes=1, deterministic=False, render=False, callback=None, reward_threshold=None, return_episode_rewards=False)
        logger.info('FINISHING STAGE 2 INITIAL EVALUATION')
        stage2result = retrain(stg2_mean_reward, stage_two_threshold, 0, env, model)
        if stage2result == True:
            logger.info("Stage Two Threshold Met")
            logger.info("Stage 3 Training Started")
            env = plark_env_guided_reward.PlarkEnvGuidedReward(config_file_path=medium_config)
            model.set_env(env)
            model.learn(50)
            logger.info('STARTING STAGE 3 EVALUATION')
            stg3_mean_reward, n_steps = evaluate_policy(model, env, n_eval_episodes=1, deterministic=False, render=False, callback=None, reward_threshold=None, return_episode_rewards=False)
            logger.info('FINISHED STAGE 3 EVALUATION')
            stage3result = retrain(stg3_mean_reward, stage_three_threshold, 0, env, model)
            if stage3result == True:
                logger.info("Stage Three Threshold Met")
                logger.info("Multi-Stage-Training-Complete")
    model_path, model_dir, modellabel = helper.save_model_with_env_settings(basepath, model, modeltype, env, basicdate)
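The retrain helper called at each stage is not included in this listing; a minimal sketch of what such a function might do, assuming it keeps training in short bursts until the stage's reward threshold is met or a retry budget runs out, is shown below. The retry limit and step count are assumptions, and logger / evaluate_policy are assumed to be the same objects already used in the example.

# Hypothetical sketch of the retrain helper (not the original implementation):
def retrain(mean_reward, threshold, attempt, env, model, max_attempts=20, steps_per_attempt=50):
    # Keep training until the evaluation reward clears the stage threshold or we give up.
    while mean_reward < threshold and attempt < max_attempts:
        model.learn(steps_per_attempt)
        mean_reward, n_steps = evaluate_policy(model, env, n_eval_episodes=1, deterministic=False,
                                               render=False, callback=None, reward_threshold=None,
                                               return_episode_rewards=False)
        attempt += 1
        logger.info("Retrain attempt {}: mean reward {}".format(attempt, mean_reward))
    return mean_reward >= threshold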