Example #1
    def rollout_agent(self, agent_policy, reference=True, eval_episodes=None):
        """Rolls out agent_policy in the specified environment
        """
        if reference:
            if eval_episodes is None:
                eval_episodes = self.episodes_per_instance
            trajectory = evaluate_policy(nagents=self.nagents,
                                         env=self.reference_env,
                                         agent_policy=agent_policy,
                                         replay_buffer=None,
                                         eval_episodes=eval_episodes,
                                         max_steps=self.max_env_timesteps,
                                         freeze_agent=True,
                                         add_noise=False,
                                         log_distances=self.log_distances)
        else:
            trajectory = evaluate_policy(
                nagents=self.nagents,
                env=self.randomized_env,
                agent_policy=agent_policy,
                replay_buffer=self.replay_buffer,
                eval_episodes=self.episodes_per_instance,
                max_steps=self.max_env_timesteps,
                freeze_agent=self.freeze_agent,
                add_noise=True,
                log_distances=self.log_distances)

        return trajectory

    def generate_ground_truth(self, simulator_agent, agent_policy, timesteps,
                              log_path):
        logger.debug('Generating ground truth...')

        self.evaluation_scores = [None] * simulator_agent.nparams
        # Build each row independently: repeating a nested list with * would
        # alias the same inner list across rows.
        default_values = [['default'] * simulator_agent.nparams
                          for _ in range(self.neval_eps)]

        for randomized_dimension in range(simulator_agent.nparams):
            evaluation_array = []
            for i, x in enumerate(self.ground_truth_x):
                if i % DISPLAY_FREQUENCY == 0:
                    logger.info("Dim: {}, Index: {}/{}".format(
                        randomized_dimension, i, len(self.ground_truth_x)))

                # Copy the defaults so this sweep does not mutate them in place.
                values = [row[:] for row in default_values]
                for index in range(self.neval_eps):
                    values[index][randomized_dimension] = x

                self.randomized_env.randomize(values)

                randomized_rewards, final_distances = evaluate_policy(
                    nagents=self.neval_eps,
                    env=self.randomized_env,
                    agent_policy=agent_policy,
                    replay_buffer=None,
                    eval_episodes=1,
                    max_steps=self.max_steps,
                    return_rewards=True,
                    add_noise=False,
                    log_distances=self.log_distances)

                if self.log_distances:
                    evaluation_array.append(
                        np.array([
                            np.mean(final_distances),
                            np.std(final_distances)
                        ]))
                else:
                    evaluation_array.append(
                        np.array([
                            np.mean(randomized_rewards),
                            np.std(randomized_rewards)
                        ]))

            self.evaluation_scores[randomized_dimension] = np.array(
                evaluation_array)

        self.evaluation_scores = np.array(self.evaluation_scores)

        for randomized_dimension in range(simulator_agent.nparams):
            name = self.randomized_env.get_dimension_name(randomized_dimension)

        np.savez('{}.npz'.format(
            os.path.join(log_path, 'raw_rewards-{}'.format(timesteps))),
                 raw_rewards=self.evaluation_scores)

        logger.info('Ground truth generated.')
        return self.evaluation_scores
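
# A standalone sketch (an illustration, not part of the example above) of the
# list-aliasing pitfall that the `default_values` construction has to avoid:
# repeating a nested list with * replicates references to the same inner list,
# so one in-place write shows up in every row.
aliased = [['default'] * 3] * 2
aliased[0][1] = 0.5
assert aliased[1][1] == 0.5            # both rows changed

# Building each row independently (as done above) keeps the rows separate.
independent = [['default'] * 3 for _ in range(2)]
independent[0][1] = 0.5
assert independent[1][1] == 'default'
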
    def get_ref_reward(self):
        ref_rewards = []
        for _ in range(5):
            trajectory = evaluate_policy(nagents=10,
                                         env=self.reference_env,
                                         agent_policy=self.agent_policy,
                                         replay_buffer=None,
                                         eval_episodes=10,
                                         max_steps=self.max_env_timesteps,
                                         freeze_agent=True,
                                         add_noise=False,
                                         log_distances=False)
            for roll in trajectory:
                ref_rewards.append(np.sum(roll[:, -1].squeeze()))
        self.ref_rewards = np.mean(ref_rewards)
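
# A minimal sketch (illustrative only) of the per-rollout return computed above.
# It assumes, as the usage in get_ref_reward implies, that each trajectory is an
# array whose final column holds the per-step reward.
import numpy as np

roll = np.array([[0.1, 0.2, 1.0],
                 [0.3, 0.1, 0.5],
                 [0.2, 0.4, 2.0]])
episode_return = np.sum(roll[:, -1].squeeze())
print(episode_return)  # 3.5
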
    def generate_ground_truth(self,
                              simulator_agent,
                              agent_policy,
                              timesteps,
                              log_path,
                              plot_path,
                              record_video=False):
        logger.debug('Generating ground truth...')

        self.evaluation_scores = [None] * simulator_agent.nparams
        # One row of per-dimension defaults per evaluation episode, built
        # independently so the rows are not aliased.
        default_values = [[
            self.randomized_env.unwrapped.dimensions[dimension].default_value
            for dimension in range(simulator_agent.nparams)
        ] for _ in range(self.neval_eps)]

        for randomized_dimension in range(simulator_agent.nparams):
            evaluation_array = []
            for i, x in enumerate(self.ground_truth_x):
                if i % DISPLAY_FREQUENCY == 0:
                    logger.info("Dim: {}, Index: {}/{}".format(
                        randomized_dimension, i, len(self.ground_truth_x)))

                # Copy the defaults so this sweep does not mutate them in place.
                values = [row[:] for row in default_values]
                for index in range(self.neval_eps):
                    values[index][randomized_dimension] = x

                self.randomized_env.randomize(values)

                randomized_rewards, final_distances = evaluate_policy(
                    nagents=self.neval_eps,
                    env=self.randomized_env,
                    agent_policy=agent_policy,
                    replay_buffer=None,
                    eval_episodes=1,
                    max_steps=self.max_steps,
                    return_rewards=True,
                    record_video=record_video and i % DISPLAY_FREQUENCY == 0,
                    add_noise=False,
                    log_distances=self.log_distances,
                    video_path=os.path.join(
                        plot_path, 'raw_video-%d-dim%d-index-%dof%d' %
                        (timesteps, randomized_dimension, i + 1,
                         len(self.ground_truth_x))))

                if self.log_distances:
                    evaluation_array.append(
                        np.array([
                            np.mean(final_distances),
                            np.std(final_distances)
                        ]))
                else:
                    evaluation_array.append(
                        np.array([
                            np.mean(randomized_rewards),
                            np.std(randomized_rewards)
                        ]))

            self.evaluation_scores[randomized_dimension] = np.array(
                evaluation_array)

        self.evaluation_scores = np.array(self.evaluation_scores)

        for randomized_dimension in range(simulator_agent.nparams):
            name = self.randomized_env.get_dimension_name(randomized_dimension)

        np.savez('{}.npz'.format(
            os.path.join(log_path, 'raw_rewards-{}'.format(timesteps))),
                 raw_rewards=self.evaluation_scores)

        # TODO: save video of rollouts -> in evaluate_policy

        logger.info('Ground truth generated.')
        return self.evaluation_scores
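
# A sketch (illustrative; path and timestep values are placeholders) of reloading
# the raw_rewards file written above. Each sweep point stores [mean, std], so the
# saved array has shape (nparams, len(ground_truth_x), 2).
import os
import numpy as np

log_path = 'logs'        # placeholder for the log_path used above
timesteps = 100000       # placeholder for the timesteps value used above

data = np.load(os.path.join(log_path, 'raw_rewards-{}.npz'.format(timesteps)))
raw = data['raw_rewards']
means, stds = raw[..., 0], raw[..., 1]
print(means.shape, stds.shape)
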
    def plot_discriminator_reward(self, simulator_agent, agent_policy,
                                  timesteps, plot_path, log_path):
        logger.debug('Plotting discriminator reward...')

        # One row of per-dimension defaults per evaluation episode, built
        # independently so the rows are not aliased.
        default_values = [[
            self.randomized_env.unwrapped.dimensions[dimension].default_value
            for dimension in range(simulator_agent.nparams)
        ] for _ in range(self.neval_eps)]

        for randomized_dimension in range(simulator_agent.nparams):
            evaluation_array_mean = []
            evaluation_array_median = []
            for i, x in enumerate(self.ground_truth_x):
                if i % DISPLAY_FREQUENCY == 0:
                    logger.info("Dim: {}, Index: {}/{}".format(
                        randomized_dimension, i, len(self.ground_truth_x)))

                # Copy the defaults so this sweep does not mutate them in place.
                values = [row[:] for row in default_values]
                for index in range(self.neval_eps):
                    values[index][randomized_dimension] = x

                self.randomized_env.randomize(values)
                trajectory = evaluate_policy(nagents=self.neval_eps,
                                             env=self.randomized_env,
                                             agent_policy=agent_policy,
                                             replay_buffer=None,
                                             eval_episodes=1,
                                             max_steps=self.max_steps,
                                             freeze_agent=True,
                                             add_noise=False,
                                             log_distances=self.log_distances)

                trajectory = [trajectory[i] for i in range(self.neval_eps)]
                trajectory = np.concatenate(trajectory)

                randomized_discrim_score_mean, randomized_discrim_score_median, _ = \
                    simulator_agent.discriminator_rewarder.get_score(trajectory)

                evaluation_array_mean.append(randomized_discrim_score_mean)
                evaluation_array_median.append(randomized_discrim_score_median)

            ground_truth_scaled = self.randomized_env.rescale(
                randomized_dimension, self.ground_truth_x)
            name = self.randomized_env.get_dimension_name(randomized_dimension)
            print('MeanDR', evaluation_array_mean[::10])
            print('MedianDR', evaluation_array_median[::10])

            plt.plot(ground_truth_scaled, evaluation_array_mean, c="green")
            plt.savefig('{}.png'.format(
                os.path.join(plot_path,
                             'mean-discrimrew-{}-{}'.format(name, timesteps))))
            plt.close()

            plt.plot(ground_truth_scaled, evaluation_array_median, c="green")
            plt.savefig('{}.png'.format(
                os.path.join(plot_path,
                             'med-discrimrew-{}-{}'.format(name, timesteps))))
            plt.close()

            np.savez('{}.npz'.format(
                os.path.join(log_path,
                             'discriminator_rewards-{}'.format(timesteps))),
                     discriminator_mean=evaluation_array_mean,
                     discriminator_median=evaluation_array_median)
Example #6
    def select_action(self, agent_policy):
        """Select an action based on the SVPG policy, where an action is the delta
        in each randomization dimension. Updates counts and statistics after
        training the agent, rolling out policies, and calculating the simulator
        reward.
        """
        if self.svpg_timesteps >= self.initial_svpg_steps:
            # Get sim instances from SVPG policy
            simulation_instances = self.svpg.step()

            index = self.svpg_timesteps % self.svpg_horizon
            self.simulation_instances_full_horizon[:,
                                                   index, :, :] = simulation_instances

        else:
            # Creates completely randomized environment
            simulation_instances = np.ones(
                (self.nagents, self.svpg.svpg_rollout_length,
                 self.svpg.nparams)) * -1
            small_ranges = np.linspace(0, 1, self.nagents + 1)
            for i in range(self.nagents):
                mu = (small_ranges[i] + small_ranges[i + 1]) / 2
                sigma = (small_ranges[1] - small_ranges[0]) / 6
                row = np.random.normal(
                    mu, sigma, (self.svpg.svpg_rollout_length, self.nparams))
                simulation_instances[i] = np.clip(row, 0, 1)

        assert (self.nagents, self.svpg.svpg_rollout_length,
                self.svpg.nparams) == simulation_instances.shape

        # Create placeholders for trajectories
        randomized_trajectories = [[] for _ in range(self.nagents)]
        reference_trajectories = [[] for _ in range(self.nagents)]

        # Create placeholder for rewards
        rewards = np.zeros(simulation_instances.shape[:2])

        # Discriminator debugging
        randomized_discrim_score_mean = 0
        reference_discrim_score_mean = 0
        randomized_discrim_score_median = 0
        reference_discrim_score_median = 0

        # Reshape to work with vectorized environments
        simulation_instances = np.transpose(simulation_instances, (1, 0, 2))
        log_path = os.path.join(PARA_LOG,
                                'parameter_log_{}'.format(self.svpg_timesteps))
        log_file = open(log_path, 'w', 1)

        # Create environment instances with vectorized env, and rollout agent_policy in both
        for t in range(self.svpg.svpg_rollout_length):
            agent_timesteps_current_iteration = 0
            logger.info('Iteration t: {}/{}'.format(
                t, self.svpg.svpg_rollout_length))

            reference_trajectory = self.rollout_agent(agent_policy)

            self.randomized_env.randomize(
                randomized_values=simulation_instances[t])
            env_params = self.randomized_env.get_current_params()
            log_file.write(' '.join([str(val)
                                     for val in env_params[:, 0]]) + '\n')

            randomized_trajectory = self.rollout_agent(agent_policy,
                                                       reference=False)

            for i in range(self.nagents):
                agent_timesteps_current_iteration += len(
                    randomized_trajectory[i])

                reference_trajectories[i].append(reference_trajectory[i])
                randomized_trajectories[i].append(randomized_trajectory[i])

                self.agent_timesteps += len(randomized_trajectory[i])
                self.agent_timesteps_since_eval += len(
                    randomized_trajectory[i])

                simulator_reward = self.discriminator_rewarder.calculate_rewards(
                    randomized_trajectories[i][t])
                rewards[i][t] = simulator_reward

                logger.info('Setting: {}, Score: {}'.format(
                    simulation_instances[t][i], simulator_reward))

            if not self.freeze_discriminator:
                # flatten and combine all randomized and reference trajectories for discriminator
                flattened_randomized = [
                    randomized_trajectories[i][t] for i in range(self.nagents)
                ]
                flattened_randomized = np.concatenate(flattened_randomized)

                flattened_reference = [
                    reference_trajectories[i][t] for i in range(self.nagents)
                ]
                flattened_reference = np.concatenate(flattened_reference)

                randomized_discrim_score_mean, randomized_discrim_score_median, randomized_discrim_score_sum = \
                    self.discriminator_rewarder.get_score(flattened_randomized)
                reference_discrim_score_mean, reference_discrim_score_median, reference_discrim_score_sum = \
                    self.discriminator_rewarder.get_score(flattened_reference)

                # Train discriminator based on state action pairs for agent env. steps
                # TODO: Train more?
                self.discriminator_rewarder.train_discriminator(
                    flattened_reference,
                    flattened_randomized,
                    iterations=agent_timesteps_current_iteration)

                randomized_discrim_score_mean, randomized_discrim_score_median, randomized_discrim_score_sum = \
                    self.discriminator_rewarder.get_score(flattened_randomized)
                reference_discrim_score_mean, reference_discrim_score_median, reference_discrim_score_sum = \
                    self.discriminator_rewarder.get_score(flattened_reference)

        # Calculate discriminator based reward, pass it back to SVPG policy
        if self.svpg_timesteps >= self.initial_svpg_steps:
            if self.train_svpg:
                self.svpg.train(rewards)

            for dimension in range(self.nparams):
                self.sampled_regions[dimension] = np.concatenate([
                    self.sampled_regions[dimension],
                    simulation_instances[:, :, dimension].flatten()
                ])

        solved_reference = info = None
        if self.agent_timesteps_since_eval > self.agent_eval_frequency:
            self.agent_timesteps_since_eval %= self.agent_eval_frequency
            logger.info(
                "Evaluating for {} episodes afer timesteps: {} (SVPG), {} (Agent)"
                .format(self.randomized_eval_episodes * self.nagents,
                        self.svpg_timesteps, self.agent_timesteps))

            agent_reference_eval_rewards = []
            agent_randomized_eval_rewards = []

            final_dist_ref = []
            final_dist_rand = []

            for _ in range(self.randomized_eval_episodes):
                rewards_ref, dist_ref = evaluate_policy(
                    nagents=self.nagents,
                    env=self.reference_env,
                    agent_policy=agent_policy,
                    replay_buffer=None,
                    eval_episodes=1,
                    max_steps=self.max_env_timesteps,
                    return_rewards=True,
                    add_noise=False,
                    log_distances=self.log_distances)

                full_random_settings = np.ones(
                    (self.nagents, self.nparams)) * -1
                self.randomized_env.randomize(
                    randomized_values=full_random_settings)

                rewards_rand, dist_rand = evaluate_policy(
                    nagents=self.nagents,
                    env=self.randomized_env,
                    agent_policy=agent_policy,
                    replay_buffer=None,
                    eval_episodes=1,
                    max_steps=self.max_env_timesteps,
                    return_rewards=True,
                    add_noise=False,
                    log_distances=self.log_distances)

                agent_reference_eval_rewards += list(rewards_ref)
                agent_randomized_eval_rewards += list(rewards_rand)
                final_dist_ref += [dist_ref]
                final_dist_rand += [dist_rand]

            evaluation_criteria_reference = agent_reference_eval_rewards
            evaluation_criteria_randomized = agent_randomized_eval_rewards

            if self.log_distances:
                evaluation_criteria_reference = final_dist_ref
                evaluation_criteria_randomized = final_dist_rand

            solved_reference = check_solved(self.reference_env_id,
                                            evaluation_criteria_reference)
            solved_randomized = check_solved(self.randomized_eval_env_id,
                                             evaluation_criteria_randomized)

            info = {
                'solved':
                str(solved_reference),
                'solved_randomized':
                str(solved_randomized),
                'svpg_steps':
                self.svpg_timesteps,
                'agent_timesteps':
                self.agent_timesteps,
                'final_dist_ref_mean':
                np.mean(final_dist_ref),
                'final_dist_ref_std':
                np.std(final_dist_ref),
                'final_dist_ref_median':
                np.median(final_dist_ref),
                'final_dist_rand_mean':
                np.mean(final_dist_rand),
                'final_dist_rand_std':
                np.std(final_dist_rand),
                'final_dist_rand_median':
                np.median(final_dist_rand),
                'agent_reference_eval_rewards_mean':
                np.mean(agent_reference_eval_rewards),
                'agent_reference_eval_rewards_std':
                np.std(agent_reference_eval_rewards),
                'agent_reference_eval_rewards_median':
                np.median(agent_reference_eval_rewards),
                'agent_reference_eval_rewards_min':
                np.min(agent_reference_eval_rewards),
                'agent_reference_eval_rewards_max':
                np.max(agent_reference_eval_rewards),
                'agent_randomized_eval_rewards_mean':
                np.mean(agent_randomized_eval_rewards),
                'agent_randomized_eval_rewards_std':
                np.std(agent_randomized_eval_rewards),
                'agent_randomized_eval_rewards_median':
                np.median(agent_randomized_eval_rewards),
                'agent_randomized_eval_rewards_min':
                np.min(agent_randomized_eval_rewards),
                'agent_randomized_eval_rewards_max':
                np.max(agent_randomized_eval_rewards),
                'randomized_discrim_score_mean':
                str(randomized_discrim_score_mean),
                'reference_discrim_score_mean':
                str(reference_discrim_score_mean),
                'randomized_discrim_score_median':
                str(randomized_discrim_score_median),
                'reference_discrim_score_median':
                str(reference_discrim_score_median),
            }

            agent_hard_eval_rewards, final_dist_hard = evaluate_policy(
                nagents=self.nagents,
                env=self.hard_env,
                agent_policy=agent_policy,
                replay_buffer=None,
                eval_episodes=1,
                max_steps=self.max_env_timesteps,
                return_rewards=True,
                add_noise=False,
                log_distances=self.log_distances)
            info_hard = {
                'final_dist_hard_mean':
                np.mean(final_dist_hard),
                'final_dist_hard_std':
                np.std(final_dist_hard),
                'final_dist_hard_median':
                np.median(final_dist_hard),
                'agent_hard_eval_rewards_median':
                np.median(agent_hard_eval_rewards),
                'agent_hard_eval_rewards_mean':
                np.mean(agent_hard_eval_rewards),
                'agent_hard_eval_rewards_std':
                np.std(agent_hard_eval_rewards),
            }

            info.update(info_hard)

        self.svpg_timesteps += 1
        return solved_reference, info
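
# A standalone sketch (dummy shapes, illustrative only) of the axis swap performed
# by np.transpose(simulation_instances, (1, 0, 2)) in select_action: settings
# sampled as (nagents, rollout_length, nparams) are re-indexed by rollout step
# first, so simulation_instances[t] yields one setting per agent.
import numpy as np

nagents, rollout_length, nparams = 4, 3, 2
instances = np.random.uniform(0, 1, (nagents, rollout_length, nparams))

per_step = np.transpose(instances, (1, 0, 2))
assert per_step.shape == (rollout_length, nagents, nparams)
assert np.allclose(per_step[1][2], instances[2][1])  # step 1 of agent 2
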
            actor_paths = glob.glob(os.path.join(os.getcwd(), paths['paper'], 'best-seed*_actor.pth'))
            print(actor_paths)
            for actor_idx, actor_path in enumerate(actor_paths):
                agent_policy = DDPGActor(
                    state_dim=reference_env.observation_space.shape[0], 
                    action_dim=reference_env.action_space.shape[0], 
                    agent_name=args.agent_name,
                    load_agent=True,
                    model_path=actor_path
                )
    
                rewards_rand, dist_rand = evaluate_policy(nagents=N_PROCESSES,
                                                      env=randomized_env,
                                                      agent_policy=agent_policy,
                                                      replay_buffer=None,
                                                      eval_episodes=NEVAL_EPISODES // N_PROCESSES,
                                                      max_steps=args.max_env_timesteps,
                                                      return_rewards=True,
                                                      add_noise=False,
                                                      log_distances=True)

                rewards_grid[i, j, actor_idx, :] = rewards_rand
                finaldists_grid[i, j, actor_idx, :] = dist_rand

    reshow_hyperparameters(args, paths)
    print(finaldists_grid)

    np.savez(os.path.join(paths['paper'], 'grid_generalization.npz'), 
        rewards_grid=rewards_grid,
        finaldists_grid=finaldists_grid
    )
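
# A sketch (illustrative; the directory is a placeholder for paths['paper']) of
# reloading the grid saved above. Given the indexing rewards_grid[i, j, actor_idx, :],
# axes 2 and 3 correspond to the actor checkpoint and the evaluation episode.
import os
import numpy as np

paper_dir = 'results/paper'  # placeholder

grid = np.load(os.path.join(paper_dir, 'grid_generalization.npz'))
rewards_grid = grid['rewards_grid']
finaldists_grid = grid['finaldists_grid']

# Per-cell summary, averaged over actor checkpoints and episodes.
print(finaldists_grid.mean(axis=(2, 3)))
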
    def plot_discriminator_reward(self, simulator_agent, agent_policy,
                                  timesteps, plot_path, log_path):
        logger.debug('Plotting Discriminator Reward...')

        # Build each row independently: repeating a nested list with * would
        # alias the same inner list across rows.
        default_values = [['default'] * simulator_agent.nparams
                          for _ in range(self.neval_eps)]

        for randomized_dimension in range(simulator_agent.nparams):
            evaluation_array_mean = []
            evaluation_array_median = []
            simulator_agent.discriminator_rewarder.get_ref_reward()
            for i, x in enumerate(self.ground_truth_x):
                if i % DISPLAY_FREQUENCY == 0:
                    logger.info("Dim: {}, Index: {}/{}".format(
                        randomized_dimension, i, len(self.ground_truth_x)))

                # Copy the defaults so this sweep does not mutate them in place.
                values = [row[:] for row in default_values]
                for index in range(self.neval_eps):
                    values[index][randomized_dimension] = x

                self.randomized_env.randomize(values)
                trajectory = evaluate_policy(nagents=self.neval_eps,
                                             env=self.randomized_env,
                                             agent_policy=agent_policy,
                                             replay_buffer=None,
                                             eval_episodes=1,
                                             max_steps=self.max_steps,
                                             freeze_agent=True,
                                             add_noise=False,
                                             log_distances=self.log_distances)

                trajectory = [trajectory[i] for i in range(self.neval_eps)]
                trajectory = np.concatenate(trajectory)

                randomized_discrim_score_mean, randomized_discrim_score_median, _ = \
                    simulator_agent.discriminator_rewarder.get_score(trajectory)

                evaluation_array_mean.append(randomized_discrim_score_mean)
                evaluation_array_median.append(randomized_discrim_score_median)

            ground_truth_scaled = self.randomized_env.rescale(
                randomized_dimension, self.ground_truth_x)
            name = self.randomized_env.get_dimension_name(randomized_dimension)
            print('MeanDR', evaluation_array_mean[::10])
            print('MedianDR', evaluation_array_median[::10])

            plt.plot(ground_truth_scaled, evaluation_array_mean, c="green")
            plt.savefig('{}.png'.format(
                os.path.join(plot_path,
                             'mean-discrimrew-{}-{}'.format(name, timesteps))))
            plt.close()
            data = [[x, y] for x, y in
                    zip(ground_truth_scaled, evaluation_array_mean)]
            table = wandb.Table(data=data,
                                columns=[f"Value of {name}", "Reward"])
            wandb.log({
                "Mean Discriminator Reward":
                wandb.plot.line(table,
                                f"Value of {name}",
                                "Reward",
                                title="Mean Discriminator Reward")
            })

            plt.plot(ground_truth_scaled, evaluation_array_median, c="green")
            plt.savefig('{}.png'.format(
                os.path.join(plot_path,
                             'med-discrimrew-{}-{}'.format(name, timesteps))))
            plt.close()

            np.savez('{}.npz'.format(
                os.path.join(log_path,
                             'discriminator_rewards-{}'.format(timesteps))),
                     discriminator_mean=evaluation_array_mean,
                     discriminator_median=evaluation_array_median)
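
# The wandb.Table / wandb.plot.line calls above assume an active run. A minimal
# sketch of the setup that would typically precede them (project and run names
# are placeholders, not taken from the example):
import wandb

wandb.init(project="active-domain-randomization", name="discriminator-reward-eval")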