def rollout_agent(self, agent_policy, reference=True, eval_episodes=None):
    """Rolls out agent_policy in the specified environment."""
    if reference:
        if eval_episodes is None:
            eval_episodes = self.episodes_per_instance

        trajectory = evaluate_policy(nagents=self.nagents,
                                     env=self.reference_env,
                                     agent_policy=agent_policy,
                                     replay_buffer=None,
                                     eval_episodes=eval_episodes,
                                     max_steps=self.max_env_timesteps,
                                     freeze_agent=True,
                                     add_noise=False,
                                     log_distances=self.log_distances)
    else:
        trajectory = evaluate_policy(nagents=self.nagents,
                                     env=self.randomized_env,
                                     agent_policy=agent_policy,
                                     replay_buffer=self.replay_buffer,
                                     eval_episodes=self.episodes_per_instance,
                                     max_steps=self.max_env_timesteps,
                                     freeze_agent=self.freeze_agent,
                                     add_noise=True,
                                     log_distances=self.log_distances)

    return trajectory
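# Illustrative sketch (not part of the original code): how a caller might pair
# the two modes of rollout_agent to collect matched reference/randomized
# trajectories for the discriminator. `svpg_agent` and `ddpg_policy` are
# hypothetical names standing in for the object that owns this method and the
# policy being evaluated.
reference_trajectory = svpg_agent.rollout_agent(ddpg_policy)                    # reference_env, agent frozen, no noise
randomized_trajectory = svpg_agent.rollout_agent(ddpg_policy, reference=False)  # randomized_env, fills the replay buffer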
def generate_ground_truth(self, simulator_agent, agent_policy, timesteps,
                          log_path):
    logger.debug('Generating ground truth...')
    self.evaluation_scores = [None] * simulator_agent.nparams

    default_values = [['default'] * simulator_agent.nparams
                      for _ in range(self.neval_eps)]

    for randomized_dimension in range(simulator_agent.nparams):
        evaluation_array = []
        for i, x in enumerate(self.ground_truth_x):
            if i % DISPLAY_FREQUENCY == 0:
                logger.info("Dim: {}, Index: {}/{}".format(
                    randomized_dimension, i, len(self.ground_truth_x)))

            # Copy the default rows so that sweeping one dimension does not
            # leak the swept value into the sweeps of later dimensions.
            values = [list(row) for row in default_values]
            for index in range(self.neval_eps):
                values[index][randomized_dimension] = x

            self.randomized_env.randomize(values)
            randomized_rewards, final_distances = evaluate_policy(
                nagents=self.neval_eps,
                env=self.randomized_env,
                agent_policy=agent_policy,
                replay_buffer=None,
                eval_episodes=1,
                max_steps=self.max_steps,
                return_rewards=True,
                add_noise=False,
                log_distances=self.log_distances)

            if self.log_distances:
                evaluation_array.append(
                    np.array([np.mean(final_distances),
                              np.std(final_distances)]))
            else:
                evaluation_array.append(
                    np.array([np.mean(randomized_rewards),
                              np.std(randomized_rewards)]))

        self.evaluation_scores[randomized_dimension] = np.array(
            evaluation_array)

    self.evaluation_scores = np.array(self.evaluation_scores)

    # All dimensions are stored in one array, so a single savez suffices.
    np.savez('{}.npz'.format(
        os.path.join(log_path, 'raw_rewards-{}'.format(timesteps))),
             raw_rewards=self.evaluation_scores)

    logger.info('Ground truth generated.')
    return self.evaluation_scores
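# Illustrative sketch (not in the original code): why the per-episode value
# rows above are built with a list comprehension and copied before being
# modified. List multiplication shares one inner list across rows, so sweeping
# a single dimension would silently overwrite the "default" entry seen by
# every later sweep.
shared = [['default'] * 3] * 2          # both rows are the same list object
shared[0][1] = 0.7
assert shared[1][1] == 0.7              # the "other" row changed too

independent = [['default'] * 3 for _ in range(2)]
independent[0][1] = 0.7
assert independent[1][1] == 'default'   # rows stay independent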
def get_ref_reward(self):
    ref_rewards = []
    for _ in range(5):
        trajectory = evaluate_policy(nagents=10,
                                     env=self.reference_env,
                                     agent_policy=self.agent_policy,
                                     replay_buffer=None,
                                     eval_episodes=10,
                                     max_steps=self.max_env_timesteps,
                                     freeze_agent=True,
                                     add_noise=False,
                                     log_distances=False)
        for roll in trajectory:
            # Sum the per-step rewards (last column of each rollout) to get
            # the episodic return.
            ref_rewards.append(np.sum(roll[:, -1].squeeze()))

    self.ref_rewards = np.mean(ref_rewards)
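# Illustrative sketch (an assumption, not original code): the rollouts
# returned by evaluate_policy are treated above as 2-D arrays of shape
# (timesteps, features) with the per-step reward stored in the last column,
# so the episodic return is just the sum of that column.
import numpy as np

fake_rollout = np.array([[0.1, 0.2, 1.0],   # [state/action features..., reward]
                         [0.3, 0.1, 0.5],
                         [0.0, 0.4, 2.0]])
episodic_return = np.sum(fake_rollout[:, -1])  # 3.5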
def generate_ground_truth(self, simulator_agent, agent_policy, timesteps,
                          log_path, plot_path, record_video=False):
    logger.debug('Generating ground truth...')
    self.evaluation_scores = [None] * simulator_agent.nparams

    default_values = [[
        self.randomized_env.unwrapped.dimensions[dimension].default_value
        for dimension in range(simulator_agent.nparams)
    ] for _ in range(self.neval_eps)]

    for randomized_dimension in range(simulator_agent.nparams):
        evaluation_array = []
        for i, x in enumerate(self.ground_truth_x):
            if i % DISPLAY_FREQUENCY == 0:
                logger.info("Dim: {}, Index: {}/{}".format(
                    randomized_dimension, i, len(self.ground_truth_x)))

            # Copy the default rows so that sweeping one dimension does not
            # leak the swept value into the sweeps of later dimensions.
            values = [list(row) for row in default_values]
            for index in range(self.neval_eps):
                values[index][randomized_dimension] = x

            self.randomized_env.randomize(values)
            randomized_rewards, final_distances = evaluate_policy(
                nagents=self.neval_eps,
                env=self.randomized_env,
                agent_policy=agent_policy,
                replay_buffer=None,
                eval_episodes=1,
                max_steps=self.max_steps,
                return_rewards=True,
                record_video=record_video and i % DISPLAY_FREQUENCY == 0,
                add_noise=False,
                log_distances=self.log_distances,
                video_path=os.path.join(
                    plot_path, 'raw_video-%d-dim%d-index-%dof%d' %
                    (timesteps, randomized_dimension, i + 1,
                     len(self.ground_truth_x))))

            if self.log_distances:
                evaluation_array.append(
                    np.array([np.mean(final_distances),
                              np.std(final_distances)]))
            else:
                evaluation_array.append(
                    np.array([np.mean(randomized_rewards),
                              np.std(randomized_rewards)]))

        self.evaluation_scores[randomized_dimension] = np.array(
            evaluation_array)

    self.evaluation_scores = np.array(self.evaluation_scores)

    # All dimensions are stored in one array, so a single savez suffices.
    np.savez('{}.npz'.format(
        os.path.join(log_path, 'raw_rewards-{}'.format(timesteps))),
             raw_rewards=self.evaluation_scores)

    # TODO: save video of rollouts -> in evaluate_policy
    logger.info('Ground truth generated.')
    return self.evaluation_scores
def plot_discriminator_reward(self, simulator_agent, agent_policy, timesteps,
                              plot_path, log_path):
    logger.debug('Plotting discriminator reward...')

    default_values = [[
        self.randomized_env.unwrapped.dimensions[dimension].default_value
        for dimension in range(simulator_agent.nparams)
    ] for _ in range(self.neval_eps)]

    for randomized_dimension in range(simulator_agent.nparams):
        evaluation_array_mean = []
        evaluation_array_median = []
        for i, x in enumerate(self.ground_truth_x):
            if i % DISPLAY_FREQUENCY == 0:
                logger.info("Dim: {}, Index: {}/{}".format(
                    randomized_dimension, i, len(self.ground_truth_x)))

            # Copy the default rows so the swept value does not leak into
            # later sweeps.
            values = [list(row) for row in default_values]
            for index in range(self.neval_eps):
                values[index][randomized_dimension] = x

            self.randomized_env.randomize(values)
            trajectory = evaluate_policy(nagents=self.neval_eps,
                                         env=self.randomized_env,
                                         agent_policy=agent_policy,
                                         replay_buffer=None,
                                         eval_episodes=1,
                                         max_steps=self.max_steps,
                                         freeze_agent=True,
                                         add_noise=False,
                                         log_distances=self.log_distances)

            trajectory = np.concatenate(
                [trajectory[n] for n in range(self.neval_eps)])
            randomized_discrim_score_mean, randomized_discrim_score_median, _ = \
                simulator_agent.discriminator_rewarder.get_score(trajectory)

            evaluation_array_mean.append(randomized_discrim_score_mean)
            evaluation_array_median.append(randomized_discrim_score_median)

        ground_truth_scaled = self.randomized_env.rescale(
            randomized_dimension, self.ground_truth_x)
        name = self.randomized_env.get_dimension_name(randomized_dimension)

        print('MeanDR', evaluation_array_mean[::10])
        print('MedianDR', evaluation_array_median[::10])

        plt.plot(ground_truth_scaled, evaluation_array_mean, c="green")
        plt.savefig('{}.png'.format(
            os.path.join(plot_path,
                         'mean-discrimrew-{}-{}'.format(name, timesteps))))
        plt.close()

        plt.plot(ground_truth_scaled, evaluation_array_median, c="green")
        plt.savefig('{}.png'.format(
            os.path.join(plot_path,
                         'med-discrimrew-{}-{}'.format(name, timesteps))))
        plt.close()

        np.savez('{}.npz'.format(
            os.path.join(log_path,
                         'discriminator_rewards-{}'.format(timesteps))),
                 discriminator_mean=evaluation_array_mean,
                 discriminator_median=evaluation_array_median)
def select_action(self, agent_policy):
    """Select an action based on the SVPG policy, where an action is the delta
    in each randomized dimension.

    Update the counts and statistics after training the agent, rolling out
    policies, and calculating the simulator reward.
    """
    if self.svpg_timesteps >= self.initial_svpg_steps:
        # Get sim instances from SVPG policy
        simulation_instances = self.svpg.step()

        index = self.svpg_timesteps % self.svpg_horizon
        self.simulation_instances_full_horizon[:, index, :, :] = simulation_instances
    else:
        # Create a completely randomized environment: split [0, 1] into
        # nagents bins and sample each agent's coefficients from a Gaussian
        # centred on its bin.
        simulation_instances = np.ones(
            (self.nagents, self.svpg.svpg_rollout_length,
             self.svpg.nparams)) * -1

        small_ranges = np.linspace(0, 1, self.nagents + 1)
        for i in range(self.nagents):
            mu = (small_ranges[i] + small_ranges[i + 1]) / 2
            # sigma = bin width / 6, so roughly +/-3 sigma stays inside the bin
            sigma = (small_ranges[i + 1] - small_ranges[i]) / 6
            row = np.random.normal(
                mu, sigma, (self.svpg.svpg_rollout_length, self.nparams))
            row[row < 0] = 0
            row[row > 1] = 1
            simulation_instances[i] = row

    assert (self.nagents, self.svpg.svpg_rollout_length,
            self.svpg.nparams) == simulation_instances.shape

    # Create placeholders for trajectories
    randomized_trajectories = [[] for _ in range(self.nagents)]
    reference_trajectories = [[] for _ in range(self.nagents)]

    # Create placeholder for rewards
    rewards = np.zeros(simulation_instances.shape[:2])

    # Discriminator debugging
    randomized_discrim_score_mean = 0
    reference_discrim_score_mean = 0
    randomized_discrim_score_median = 0
    reference_discrim_score_median = 0

    # Reshape to work with vectorized environments
    simulation_instances = np.transpose(simulation_instances, (1, 0, 2))

    log_path = os.path.join(PARA_LOG,
                            'parameter_log_{}'.format(self.svpg_timesteps))
    log_file = open(log_path, 'w', 1)

    # Create environment instances with the vectorized env, and roll out
    # agent_policy in both the reference and randomized environments.
    for t in range(self.svpg.svpg_rollout_length):
        agent_timesteps_current_iteration = 0
        logger.info('Iteration t: {}/{}'.format(
            t, self.svpg.svpg_rollout_length))

        reference_trajectory = self.rollout_agent(agent_policy)

        self.randomized_env.randomize(
            randomized_values=simulation_instances[t])
        env_params = self.randomized_env.get_current_params()
        log_file.write(' '.join([str(val) for val in env_params[:, 0]]) + '\n')
        randomized_trajectory = self.rollout_agent(agent_policy,
                                                   reference=False)

        for i in range(self.nagents):
            agent_timesteps_current_iteration += len(randomized_trajectory[i])

            reference_trajectories[i].append(reference_trajectory[i])
            randomized_trajectories[i].append(randomized_trajectory[i])

            self.agent_timesteps += len(randomized_trajectory[i])
            self.agent_timesteps_since_eval += len(randomized_trajectory[i])

            simulator_reward = self.discriminator_rewarder.calculate_rewards(
                randomized_trajectories[i][t])
            rewards[i][t] = simulator_reward
            logger.info('Setting: {}, Score: {}'.format(
                simulation_instances[t][i], simulator_reward))

        if not self.freeze_discriminator:
            # Flatten and combine all randomized and reference trajectories
            # for the discriminator
            flattened_randomized = np.concatenate(
                [randomized_trajectories[i][t] for i in range(self.nagents)])
            flattened_reference = np.concatenate(
                [reference_trajectories[i][t] for i in range(self.nagents)])

            randomized_discrim_score_mean, randomized_discrim_score_median, randomized_discrim_score_sum = \
                self.discriminator_rewarder.get_score(flattened_randomized)
            reference_discrim_score_mean, reference_discrim_score_median, reference_discrim_score_sum = \
                self.discriminator_rewarder.get_score(flattened_reference)

            # Train discriminator based on state-action pairs for the agent
            # environment steps taken this iteration
            # TODO: Train more?
            self.discriminator_rewarder.train_discriminator(
                flattened_reference,
                flattened_randomized,
                iterations=agent_timesteps_current_iteration)

            # Re-score after the discriminator update; these post-update
            # scores are what end up in the `info` dict below.
            randomized_discrim_score_mean, randomized_discrim_score_median, randomized_discrim_score_sum = \
                self.discriminator_rewarder.get_score(flattened_randomized)
            reference_discrim_score_mean, reference_discrim_score_median, reference_discrim_score_sum = \
                self.discriminator_rewarder.get_score(flattened_reference)

    log_file.close()

    # Calculate discriminator-based reward and pass it back to the SVPG policy
    if self.svpg_timesteps >= self.initial_svpg_steps:
        if self.train_svpg:
            self.svpg.train(rewards)

        for dimension in range(self.nparams):
            self.sampled_regions[dimension] = np.concatenate([
                self.sampled_regions[dimension],
                simulation_instances[:, :, dimension].flatten()
            ])

    solved_reference = info = None
    if self.agent_timesteps_since_eval > self.agent_eval_frequency:
        self.agent_timesteps_since_eval %= self.agent_eval_frequency
        logger.info(
            "Evaluating for {} episodes after timesteps: {} (SVPG), {} (Agent)"
            .format(self.randomized_eval_episodes * self.nagents,
                    self.svpg_timesteps, self.agent_timesteps))

        agent_reference_eval_rewards = []
        agent_randomized_eval_rewards = []

        final_dist_ref = []
        final_dist_rand = []

        for _ in range(self.randomized_eval_episodes):
            rewards_ref, dist_ref = evaluate_policy(
                nagents=self.nagents,
                env=self.reference_env,
                agent_policy=agent_policy,
                replay_buffer=None,
                eval_episodes=1,
                max_steps=self.max_env_timesteps,
                return_rewards=True,
                add_noise=False,
                log_distances=self.log_distances)

            full_random_settings = np.ones((self.nagents, self.nparams)) * -1
            self.randomized_env.randomize(
                randomized_values=full_random_settings)

            rewards_rand, dist_rand = evaluate_policy(
                nagents=self.nagents,
                env=self.randomized_env,
                agent_policy=agent_policy,
                replay_buffer=None,
                eval_episodes=1,
                max_steps=self.max_env_timesteps,
                return_rewards=True,
                add_noise=False,
                log_distances=self.log_distances)

            agent_reference_eval_rewards += list(rewards_ref)
            agent_randomized_eval_rewards += list(rewards_rand)
            final_dist_ref += [dist_ref]
            final_dist_rand += [dist_rand]

        evaluation_criteria_reference = agent_reference_eval_rewards
        evaluation_criteria_randomized = agent_randomized_eval_rewards

        if self.log_distances:
            evaluation_criteria_reference = final_dist_ref
            evaluation_criteria_randomized = final_dist_rand

        solved_reference = check_solved(self.reference_env_id,
                                        evaluation_criteria_reference)
        solved_randomized = check_solved(self.randomized_eval_env_id,
                                         evaluation_criteria_randomized)

        info = {
            'solved': str(solved_reference),
            'solved_randomized': str(solved_randomized),
            'svpg_steps': self.svpg_timesteps,
            'agent_timesteps': self.agent_timesteps,
            'final_dist_ref_mean': np.mean(final_dist_ref),
            'final_dist_ref_std': np.std(final_dist_ref),
            'final_dist_ref_median': np.median(final_dist_ref),
            'final_dist_rand_mean': np.mean(final_dist_rand),
            'final_dist_rand_std': np.std(final_dist_rand),
            'final_dist_rand_median': np.median(final_dist_rand),
            'agent_reference_eval_rewards_mean':
            np.mean(agent_reference_eval_rewards),
            'agent_reference_eval_rewards_std':
            np.std(agent_reference_eval_rewards),
            'agent_reference_eval_rewards_median':
            np.median(agent_reference_eval_rewards),
            'agent_reference_eval_rewards_min':
            np.min(agent_reference_eval_rewards),
            'agent_reference_eval_rewards_max':
            np.max(agent_reference_eval_rewards),
            'agent_randomized_eval_rewards_mean':
            np.mean(agent_randomized_eval_rewards),
            'agent_randomized_eval_rewards_std':
            np.std(agent_randomized_eval_rewards),
            'agent_randomized_eval_rewards_median':
            np.median(agent_randomized_eval_rewards),
            'agent_randomized_eval_rewards_min':
            np.min(agent_randomized_eval_rewards),
            'agent_randomized_eval_rewards_max':
            np.max(agent_randomized_eval_rewards),
            'randomized_discrim_score_mean': str(randomized_discrim_score_mean),
            'reference_discrim_score_mean': str(reference_discrim_score_mean),
            'randomized_discrim_score_median': str(randomized_discrim_score_median),
            'reference_discrim_score_median': str(reference_discrim_score_median),
        }

        agent_hard_eval_rewards, final_dist_hard = evaluate_policy(
            nagents=self.nagents,
            env=self.hard_env,
            agent_policy=agent_policy,
            replay_buffer=None,
            eval_episodes=1,
            max_steps=self.max_env_timesteps,
            return_rewards=True,
            add_noise=False,
            log_distances=self.log_distances)

        info_hard = {
            'final_dist_hard_mean': np.mean(final_dist_hard),
            'final_dist_hard_std': np.std(final_dist_hard),
            'final_dist_hard_median': np.median(final_dist_hard),
            'agent_hard_eval_rewards_median': np.median(agent_hard_eval_rewards),
            'agent_hard_eval_rewards_mean': np.mean(agent_hard_eval_rewards),
            'agent_hard_eval_rewards_std': np.std(agent_hard_eval_rewards),
        }
        info.update(info_hard)

    self.svpg_timesteps += 1
    return solved_reference, info
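# Illustrative sketch (not in the original code) of the warm-up sampling used
# in select_action before the SVPG particles take over: [0, 1] is split into
# `nagents` equal bins, and each agent draws its randomization coefficients
# from a Gaussian centred on its bin with sigma = bin_width / 6, so roughly
# +/-3 sigma stays inside the bin before clipping. All names here are local to
# the sketch.
import numpy as np

nagents, rollout_length, nparams = 4, 5, 2
edges = np.linspace(0, 1, nagents + 1)
instances = np.empty((nagents, rollout_length, nparams))
for agent in range(nagents):
    mu = (edges[agent] + edges[agent + 1]) / 2
    sigma = (edges[agent + 1] - edges[agent]) / 6
    instances[agent] = np.clip(
        np.random.normal(mu, sigma, (rollout_length, nparams)), 0, 1)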
actor_paths = glob.glob(
    os.path.join(os.getcwd(), paths['paper'], 'best-seed*_actor.pth'))
print(actor_paths)

for actor_idx, actor_path in enumerate(actor_paths):
    agent_policy = DDPGActor(
        state_dim=reference_env.observation_space.shape[0],
        action_dim=reference_env.action_space.shape[0],
        agent_name=args.agent_name,
        load_agent=True,
        model_path=actor_path)

    rewards_rand, dist_rand = evaluate_policy(
        nagents=N_PROCESSES,
        env=randomized_env,
        agent_policy=agent_policy,
        replay_buffer=None,
        eval_episodes=NEVAL_EPISODES // N_PROCESSES,
        max_steps=args.max_env_timesteps,
        return_rewards=True,
        add_noise=False,
        log_distances=True)

    rewards_grid[i, j, actor_idx, :] = rewards_rand
    finaldists_grid[i, j, actor_idx, :] = dist_rand

reshow_hyperparameters(args, paths)

print(finaldists_grid)
np.savez(os.path.join(paths['paper'], 'grid_generalization.npz'),
         rewards_grid=rewards_grid,
         finaldists_grid=finaldists_grid)
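# Illustrative sketch (not part of the original script): how the saved grid
# could be loaded back and reduced for plotting. The array axes are assumed to
# follow the indexing above, i.e. (grid_i, grid_j, actor, episode).
import numpy as np

data = np.load('grid_generalization.npz')  # saved under paths['paper'] in the script above
mean_final_dist = data['finaldists_grid'].mean(axis=(2, 3))  # average over actors and episodes
mean_reward = data['rewards_grid'].mean(axis=(2, 3))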
def plot_discriminator_reward(self, simulator_agent, agent_policy, timesteps,
                              plot_path, log_path):
    logger.debug('Plotting Discriminator Reward...')

    default_values = [['default'] * simulator_agent.nparams
                      for _ in range(self.neval_eps)]

    for randomized_dimension in range(simulator_agent.nparams):
        evaluation_array_mean = []
        evaluation_array_median = []
        simulator_agent.discriminator_rewarder.get_ref_reward()
        for i, x in enumerate(self.ground_truth_x):
            if i % DISPLAY_FREQUENCY == 0:
                logger.info("Dim: {}, Index: {}/{}".format(
                    randomized_dimension, i, len(self.ground_truth_x)))

            # Copy the default rows so the swept value does not leak into
            # later sweeps.
            values = [list(row) for row in default_values]
            for index in range(self.neval_eps):
                values[index][randomized_dimension] = x

            self.randomized_env.randomize(values)
            trajectory = evaluate_policy(nagents=self.neval_eps,
                                         env=self.randomized_env,
                                         agent_policy=agent_policy,
                                         replay_buffer=None,
                                         eval_episodes=1,
                                         max_steps=self.max_steps,
                                         freeze_agent=True,
                                         add_noise=False,
                                         log_distances=self.log_distances)

            trajectory = np.concatenate(
                [trajectory[n] for n in range(self.neval_eps)])
            randomized_discrim_score_mean, randomized_discrim_score_median, _ = \
                simulator_agent.discriminator_rewarder.get_score(trajectory)

            evaluation_array_mean.append(randomized_discrim_score_mean)
            evaluation_array_median.append(randomized_discrim_score_median)

        ground_truth_scaled = self.randomized_env.rescale(
            randomized_dimension, self.ground_truth_x)
        name = self.randomized_env.get_dimension_name(randomized_dimension)

        print('MeanDR', evaluation_array_mean[::10])
        print('MedianDR', evaluation_array_median[::10])

        plt.plot(ground_truth_scaled, evaluation_array_mean, c="green")
        plt.savefig('{}.png'.format(
            os.path.join(plot_path,
                         'mean-discrimrew-{}-{}'.format(name, timesteps))))
        plt.close()

        data = [[x, y]
                for (x, y) in zip(ground_truth_scaled, evaluation_array_mean)]
        table = wandb.Table(data=data, columns=[f"Value of {name}", "Reward"])
        wandb.log({
            "Mean Discrimination Reward":
            wandb.plot.line(table,
                            f"Value of {name}",
                            "Reward",
                            title="Mean Discrim Reward")
        })

        plt.plot(ground_truth_scaled, evaluation_array_median, c="green")
        plt.savefig('{}.png'.format(
            os.path.join(plot_path,
                         'med-discrimrew-{}-{}'.format(name, timesteps))))
        plt.close()

        np.savez('{}.npz'.format(
            os.path.join(log_path,
                         'discriminator_rewards-{}'.format(timesteps))),
                 discriminator_mean=evaluation_array_mean,
                 discriminator_median=evaluation_array_median)