def pre_replay(self, logger=logging):
    # Create PsychSim model
    logger.info('Creating world with "{}" map'.format(self.map_table.name))
    try:
        self.world, self.triage_agent, self.observer, self.victims, self.world_map = \
            make_single_player_world(self.parser.player_name(), self.map_table.init_loc,
                                     self.map_table.adjacency, self.map_table.victims, False, True, {},
                                     self.create_observer, logger.getChild('make_single_player_world'))
    except Exception:
        logger.error(traceback.format_exc())
        logger.error('Unable to create world')
        return False

    # Last-minute filling in of models. Would do it earlier if we extracted triage_agent's name.
    # Builds one candidate model per combination of the dimension values in self.models.
    features = None
    self.model_list = [{dimension: value[index] for index, dimension in enumerate(self.models)}
                       for value in itertools.product(*self.models.values()) if len(value) > 0]
    for index, model in enumerate(self.model_list):
        if 'name' not in model:
            model['name'] = '{}_{}'.format(
                self.triage_agent.name, '_'.join([model[dimension] for dimension in self.models]))
        for dimension in self.models:
            # replaces each dimension key with its actual parameter value
            model[dimension] = self.models[dimension][model[dimension]]
            if dimension == 'reward' and not isinstance(model[dimension], dict):
                # lazily creates the reward feature vector, then maps weights onto features
                if features is None:
                    import atomic.model_learning.linear.rewards as rewards
                    features = rewards.create_reward_vector(
                        self.triage_agent, self.world_map.all_locations,
                        self.world_map.moveActions[self.triage_agent.name])
                model[dimension] = {feature: model[dimension][i]
                                    for i, feature in enumerate(features)}
    if len(self.model_list) > 0:
        set_player_models(self.world, self.observer.name, self.triage_agent.name,
                          self.victims, self.model_list)
    # self.parser.victimsObj = self.victims
    return True
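
# A minimal sketch (not from the source) of how the model-list expansion above
# behaves: itertools.product takes the Cartesian product of the value lists in
# a `models` dict, yielding one candidate model per combination of dimension
# values. The dimension names and values below are hypothetical placeholders.
import itertools

models = {'reward': ['cluster0', 'cluster1'], 'rationality': ['low', 'high']}
model_list = [{dim: combo[i] for i, dim in enumerate(models)}
              for combo in itertools.product(*models.values())]
# -> 4 models: {'reward': 'cluster0', 'rationality': 'low'}, ...
print(model_list)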
def pre_replay(self, logger=logging):
    result = super().pre_replay(logger)
    if result is not True:
        # Failed
        return result
    try:
        if self.rddl_file:
            # Team mission
            self.observer = make_observer(self.world, self.parser.players)
        else:
            self.observer = None
    except Exception:
        logger.error('Unable to create ATOMIC agent')
        logger.error(traceback.format_exc())
        return False

    # Last-minute filling in of models. Would do it earlier if we extracted triage_agent's name.
    features = None
    self.model_list = [{dimension: value[index] for index, dimension in enumerate(self.models)}
                       for value in itertools.product(*self.models.values()) if len(value) > 0]
    for player in self.parser.players:
        for index, model in enumerate(self.model_list):
            # works on a per-player copy so players do not share mutable model state
            model = copy.deepcopy(model)
            model['name'] = '{}_{}'.format(
                player, '_'.join([model[dimension] for dimension in self.models]))
            for dimension in self.models:
                model[dimension] = self.models[dimension][model[dimension]]
                if dimension == 'reward' and not isinstance(model[dimension], dict):
                    if features is None:
                        import atomic.model_learning.linear.rewards as rewards
                        features = rewards.create_reward_vector(
                            self.world.agents[player], self.world_map.all_locations,
                            self.world_map.moveActions[player])
                    model[dimension] = {feature: model[dimension][i]
                                        for i, feature in enumerate(features)}
    # if len(self.model_list) > 0:
    #     set_player_models(self.world, self.observer.name, self.triage_agent.name,
    #                       self.victims, self.model_list)
    # self.parser.victimsObj = self.victims
    return True
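
# A minimal sketch (hypothetical names) of why the loop above deep-copies each
# model before specializing it per player: the templates in model_list contain
# nested dicts, so mutating a shallow reference would leak one player's reward
# mapping into every other player's model.
import copy

template = {'name': None, 'reward': {'triage': 1.0}}
for player in ['player1', 'player2']:
    model = copy.deepcopy(template)      # independent nested dicts per player
    model['name'] = '{}_{}'.format(player, 'myopic')
    model['reward']['triage'] *= 0.5     # safe: does not touch the template
print(template['reward']['triage'])      # still 1.0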
def post_replay(self):
    """
    Performs linear reward model learning using the Maximum Entropy IRL algorithm.
    """
    # checks result and avoids performing IRL
    if self._check_results():
        return

    # checks trajectory
    trajectory = self.processor.trajectory
    if len(trajectory) <= self.length + self.num_trajectories - 1:
        logging.info('Could not process datapoint, empty or very short trajectory: {}'.format(
            self.parser.filename))
        return

    # deletes observer if present (not needed)
    if self.observer is not None:
        del self.world.agents[self.observer.name]
        logging.info('Removed observer agent from PsychSim world.')

    # sets general random seeds
    random.seed(self.seed)
    np.random.seed(self.seed)

    neighbors = self.map_table.adjacency
    locations = self.map_table.rooms_list
    coordinates = self.map_table.coordinates

    # prints map
    plot_environment(self.world, locations, neighbors,
                     os.path.join(self._output_dir, 'env.{}'.format(self.img_format)), coordinates)

    self.plot_player_data(coordinates, locations, neighbors, trajectory)
    trajectories = self.collect_sub_trajectories(coordinates, locations, neighbors, trajectory)

    # creates reward vector and optimizes reward weights via MaxEnt IRL
    logging.info('=================================')
    logging.info('Starting Maximum Entropy IRL optimization using {} processes...'.format(
        os.cpu_count() if self.processes is None else self.processes))
    rwd_vector = create_reward_vector(
        self.triage_agent, locations, self.world_map.moveActions[self.triage_agent.name])
    alg = MaxEntRewardLearning('max-ent', self.triage_agent.name, rwd_vector,
                               self.processes, self.normalize, self.learn_rate, self.epochs,
                               self.diff, True, self.prune, self.horizon, self.seed)
    result = alg.learn(trajectories, self.parser.filename, self.verbosity > 0)

    self.save_results(alg, result, rwd_vector, trajectory)

    logging.info('Finished processing {}!'.format(self.parser.filename))
    logging.info('=================================\n\n')
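
# A minimal, generic sketch of the Maximum Entropy IRL update that a learner
# like MaxEntRewardLearning performs (this is the textbook algorithm, not the
# library's actual implementation): with a linear reward r(s) = theta . phi(s),
# the gradient is the difference between the empirical feature expectations of
# the demonstrated trajectories and those of the current policy.
import numpy as np

def maxent_step(theta, demo_features, policy_features, learn_rate=0.1):
    """One gradient-ascent step on the MaxEnt IRL log-likelihood."""
    empirical_fe = demo_features.mean(axis=0)    # mean features over demos
    expected_fe = policy_features.mean(axis=0)   # mean features under policy
    return theta + learn_rate * (empirical_fe - expected_fe)

# toy usage: 3-dimensional feature vectors from 5 demo and 5 policy rollouts
rng = np.random.default_rng(0)
theta = np.zeros(3)
theta = maxent_step(theta, rng.random((5, 3)), rng.random((5, 3)))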
    logging.info(msg)
else:
    # create world, agent and observer
    map_table = default_maps[args.map_name]
    world, agent, observer, victims, world_map = make_single_player_world(
        PLAYER_NAME, map_table.init_loc, map_table.adjacency, map_table.victims,
        False, FULL_OBS, False)

    # agent params
    agent.setAttribute('rationality', args.rationality)
    agent.setAttribute('selection', args.selection)
    agent.setAttribute('horizon', args.horizon)

    # set agent rwd function
    rwd_vector = create_reward_vector(agent, map_table.rooms_list,
                                      world_map.moveActions[agent.name])
    rwd_vector.set_rewards(agent, REWARD_WEIGHTS)
    logging.info('Set reward vector: {}'.format(dict(zip(rwd_vector.names, REWARD_WEIGHTS))))

    # generate trajectories
    logging.info('Generating {} trajectories of length {} using {} parallel processes...'.format(
        args.trajectories, args.length,
        args.processes if args.processes is not None else mp.cpu_count()))
    start = timer()
    trajectories = generate_trajectories(agent, args.trajectories, args.length,
                                         threshold=args.prune,
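
# A minimal sketch (hypothetical feature names) of what setting a linear reward
# vector amounts to: each named feature gets a scalar weight, and the agent's
# reward in a state is the weighted sum of the features active there.
import numpy as np

names = ['triage-victim', 'search-room', 'move']   # hypothetical features
weights = np.array([1.0, 0.2, -0.05])
phi = np.array([1, 0, 1])                          # features active in a state
reward = float(weights @ phi)                      # 1.0 + 0.0 - 0.05 = 0.95
print(dict(zip(names, weights)), 'reward:', reward)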
def cluster_reward_weights(analyzer, output_dir, linkage='ward',
                           dist_threshold=DEF_DIST_THRESHOLD, stds=DEF_STDS,
                           clear=False, verbosity=1):
    """
    Analyzes the reward functions resulting from IRL optimization for each player log file.
    Performs clustering of reward functions based on the weight vectors and computes the
    mean rewards in each cluster.
    :param RewardModelAnalyzer analyzer: the reward model analyzer containing the necessary data.
    :param str output_dir: the directory in which to save the results.
    :param str linkage: the clustering linkage criterion.
    :param float dist_threshold: the distance above which clusters are not merged.
    :param float stds: the number of standard deviations above the gradient mean used for automatic cluster detection.
    :param bool clear: whether to clear the directory before processing.
    :param int verbosity: the verbosity level of the log file.
    :return:
    """
    create_clear_dir(output_dir, clear)
    change_log_handler(os.path.join(output_dir, 'post-process.log'), verbosity)

    file_names = list(analyzer.results)
    logging.info('\n=================================')
    logging.info('Analyzing models\' reward weights for {} results...'.format(len(file_names)))

    # performs clustering of reward weights
    results = [analyzer.results[filename] for filename in file_names]
    clustering, thetas = cluster_linear_rewards(results, linkage, dist_threshold, stds)

    # gets rwd feature names with dummy info
    agent_name = analyzer.agent_names[file_names[0]]
    agent = analyzer.trajectories[file_names[0]][-1][0].agents[agent_name]
    locations = analyzer.map_tables[file_names[0]].rooms_list
    rwd_feat_names = create_reward_vector(agent, locations, WorldMap.get_move_actions(agent)).names

    # overall weight mean
    data = np.array([np.mean(thetas, axis=0), np.std(thetas, axis=0) / len(thetas)]).T.tolist()
    plot_bar(OrderedDict(zip(rwd_feat_names, data)), 'Overall Mean Weights',
             os.path.join(output_dir, 'weights-mean.{}'.format(analyzer.img_format)),
             plot_mean=False)

    # mean weights within each cluster
    clusters, cluster_weights = get_clusters_means(clustering, thetas)
    logging.info('Found {} clusters at max. distance: {:.2f}'.format(
        clustering.n_clusters_, clustering.distance_threshold))
    for cluster in sorted(cluster_weights.keys()):
        idxs = clusters[cluster]
        data = cluster_weights[cluster]
        data[1] = data[1] / len(idxs)
        with np.printoptions(precision=2, suppress=True):
            logging.info('\tCluster {}: {}, \n\tmean: {}\n'.format(cluster, idxs, data[0]))
        plot_bar(OrderedDict(zip(rwd_feat_names, data.T.tolist())),
                 'Mean Weights for Cluster {}'.format(cluster),
                 os.path.join(output_dir, 'weights-mean-{}.{}'.format(cluster, analyzer.img_format)),
                 plot_mean=False)

    subject_ids = [analyzer.get_player_name(file_name) for file_name in file_names]
    player_names = [analyzer.agent_names[file_name] for file_name in file_names]
    save_mean_cluster_weights(cluster_weights, os.path.join(output_dir, 'cluster-weights.csv'),
                              rwd_feat_names)
    extra_info = OrderedDict({
        'Internal subject ID': subject_ids,
        'File name': file_names,
        'Game player name': player_names})
    save_clusters_info(clustering, extra_info, thetas,
                       os.path.join(output_dir, 'clusters.csv'), rwd_feat_names)

    # individual rwd weights
    thetas = np.array([result.stats[THETA_STR] for result in results])
    ind_df = pd.DataFrame(list(zip(file_names, *thetas.T.tolist())),
                          columns=['File name'] + rwd_feat_names)
    ind_df.to_csv(os.path.join(output_dir, 'individual-weights.csv'), index=False)

    # cluster sizes
    cluster_sizes = OrderedDict(
        {str(cluster): len(clusters[cluster]) for cluster in sorted(clusters.keys())})
    plot_bar(cluster_sizes, 'Clusters Size',
             os.path.join(output_dir, 'sizes.{}'.format(analyzer.img_format)))

    # dendrogram
    plot_clustering_dendrogram(
        clustering, os.path.join(output_dir, 'weights-dendrogram.{}'.format(analyzer.img_format)))
    plot_clustering_distances(
        clustering, os.path.join(output_dir, 'weights-distance.{}'.format(analyzer.img_format)))

    # gets different data partitions according to maps, conditions, subjects, etc.
    gt_labels = {
        'Subject': [analyzer.trial_conditions[file_name][SUBJECT_ID_TAG] for file_name in file_names],
        'Map Condition': [analyzer.trial_conditions[file_name][COND_MAP_TAG][0] for file_name in file_names],
        'Dynamic Map Cond.': [analyzer.trial_conditions[file_name][COND_MAP_TAG][1] for file_name in file_names],
        'Train Condition': [analyzer.trial_conditions[file_name][COND_TRAIN_TAG] for file_name in file_names]
    }
    subject_min_trials = {}
    for i, file_name in enumerate(file_names):
        subj_label = gt_labels['Subject'][i]
        subj_trial = int(analyzer.trial_conditions[file_name][TRIAL_TAG])
        if subj_label not in subject_min_trials or subj_trial < subject_min_trials[subj_label]:
            subject_min_trials[subj_label] = subj_trial
    gt_labels['Trial'] = [
        int(analyzer.trial_conditions[file_name][TRIAL_TAG]) - subject_min_trials[gt_labels['Subject'][i]]
        for i, file_name in enumerate(file_names)]

    # performs clustering evaluation according to the different gt partitions and combinations thereof
    evaluate_clustering(clustering, gt_labels, output_dir, analyzer.img_format, 3)
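
# A minimal sketch of the clustering step used above, in scikit-learn terms
# (this is not the library's cluster_linear_rewards): ward-linkage agglomerative
# clustering over the reward weight vectors, cut by a distance threshold rather
# than a fixed number of clusters. The toy thetas are random placeholders.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

thetas = np.random.default_rng(0).random((10, 5))    # 10 weight vectors, 5 features
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1.0,
                                     linkage='ward')
clustering.fit(thetas)
print(clustering.n_clusters_, clustering.labels_)

# One way (an assumption about what the stds parameter controls) to pick the
# threshold automatically: cut at the first merge whose jump in distance
# exceeds the mean jump by `stds` standard deviations.
dists = np.sort(clustering.distances_)
grads = np.diff(dists)
cutoff = dists[np.argmax(grads > grads.mean() + 1.0 * grads.std())]
print('auto cutoff:', cutoff)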
def evaluate_reward_models(analyzer, output_dir, cluster_rwds_file=None,
                           datapoint_clusters_file=None, clear=False, verbosity=1):
    """
    Evaluates the learned reward functions by using internal evaluation metrics.
    It mainly computes the mismatch between observed player policies and policies resulting
    from different reward functions, including the ones resulting from IRL for each player
    and the means for each reward cluster.
    :param RewardModelAnalyzer analyzer: the reward model analyzer containing the necessary data.
    :param str output_dir: the directory in which to save the results.
    :param str cluster_rwds_file: the path to the file from which to load the clusters' reward weights.
    :param str datapoint_clusters_file: the path to the file from which to load the datapoints' clusters.
    :param bool clear: whether to clear the directory before processing.
    :param int verbosity: the verbosity level of the log file.
    :return:
    """
    create_clear_dir(output_dir, clear)
    change_log_handler(os.path.join(output_dir, 'post-process.log'), verbosity)

    file_names = list(analyzer.results)

    # tries to load cluster info and sorts datapoints by cluster
    if datapoint_clusters_file is not None and os.path.isfile(datapoint_clusters_file):
        clusters = load_datapoints_clusters(datapoint_clusters_file)
        file_names.sort(key=lambda f: clusters[f] if f in clusters else -1)

    logging.info('\n=================================')
    logging.info('Performing cross-evaluation of reward functions for {} results...'.format(
        len(file_names)))

    # first gets data needed to compute players' "observed" policies
    trajectories = [analyzer.trajectories[filename] for filename in file_names]
    agent_names = [analyzer.agent_names[filename] for filename in file_names]
    agents = [trajectories[i][-1][0].agents[agent_names[i]] for i in range(len(trajectories))]
    map_locs = [analyzer.map_tables[filename].rooms_list for filename in file_names]
    rwd_vectors = [create_reward_vector(agents[i], map_locs[i], WorldMap.get_move_actions(agents[i]))
                   for i in range(len(agents))]

    # saves nominal weight vectors/profiles
    save_mean_cluster_weights({k: v.reshape(1, -1) for k, v in REWARD_MODELS.items()},
                              os.path.join(output_dir, 'nominal-weights.csv'), rwd_vectors[0].names)

    # calculates eval metrics for each player policy against nominal and cluster-based policies
    num_states = analyzer.num_trajectories * analyzer.length
    rwd_weights = OrderedDict(REWARD_MODELS)
    if cluster_rwds_file is not None and os.path.isfile(cluster_rwds_file):
        rwd_weights.update({'Cluster {}'.format(k): v
                            for k, v in load_cluster_reward_weights(cluster_rwds_file).items()})
    eval_matrix = cross_evaluation(trajectories, agent_names, rwd_vectors,
                                   list(rwd_weights.values()), AGENT_RATIONALITY, analyzer.horizon,
                                   analyzer.prune, analyzer.processes, num_states, analyzer.seed)

    # saves confusion matrix for cross-evaluation of each metric
    x_labels = [analyzer.get_player_name(filename) for filename in file_names]
    y_labels = list(rwd_weights.keys())
    for metric_name, matrix in eval_matrix.items():
        file_path = os.path.join(output_dir, '{}-cross-eval-matrix.{}'.format(
            metric_name.lower().replace(' ', '-'), analyzer.img_format))
        plot_confusion_matrix(matrix, file_path, x_labels, y_labels, CONF_MAT_COLOR_MAP,
                              '{} Cross-Evaluation'.format(metric_name),
                              'Agent Policy Using Player\'s Optimal Reward Function',
                              'Player\'s Observed Policy', 0, 1)

    # calculates eval metrics for each player policy against its optimal reward weights
    # discovered via IRL (self-eval)
    metrics_values = {}
    for i, filename in enumerate(file_names):
        n_states = min(num_states, len(trajectories[i]))
        eval_matrix = cross_evaluation([trajectories[i]], [agent_names[i]], [rwd_vectors[i]],
                                       [analyzer.results[filename].stats[THETA_STR]],
                                       AGENT_RATIONALITY, analyzer.horizon, analyzer.prune,
                                       analyzer.processes, n_states, analyzer.seed + i)

        # organizes by metric name and then by player
        player_name = analyzer.get_player_name(filename)
        for metric_name, matrix in eval_matrix.items():
            if metric_name not in metrics_values:
                metrics_values[metric_name] = {}
            metrics_values[metric_name][player_name] = matrix[0, 0]

    # plots mean self-eval performance
    for metric_name, metric_values in metrics_values.items():
        plot_bar(metric_values, metric_name.title(),
                 os.path.join(output_dir, '{}-self-eval.{}'.format(
                     metric_name.lower().replace(' ', '-'), analyzer.img_format)),
                 None, y_label=metric_name)
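
# A minimal, generic sketch (not the library's cross_evaluation) of the kind of
# policy-mismatch metric computed above: for each visited state, compare the
# action that maximizes a linear reward model against the observed action, and
# report the agreement ratio. The feature tensors and actions are toy data.
import numpy as np

def policy_agreement(theta, state_action_feats, observed_actions):
    """Fraction of states where argmax_a theta . phi(s, a) matches the observation."""
    # state_action_feats: shape (num_states, num_actions, num_features)
    predicted = np.argmax(state_action_feats @ theta, axis=1)
    return float(np.mean(predicted == observed_actions))

rng = np.random.default_rng(1)
feats = rng.random((20, 4, 3))        # 20 states, 4 actions, 3 features
obs = rng.integers(0, 4, size=20)     # observed action indices
print(policy_agreement(np.array([1.0, -0.5, 0.2]), feats, obs))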
loc_neighbors = MAP_TABLE.adjacency
locations = MAP_TABLE.rooms_list
coords = MAP_TABLE.coordinates
world, agent, observer, victims, world_map = \
    make_single_player_world(AGENT_NAME, MAP_TABLE.init_loc, loc_neighbors,
                             MAP_TABLE.victims, False, FULL_OBS)
plot_environment(world, locations, loc_neighbors,
                 os.path.join(OUTPUT_DIR, 'env.pdf'), coords)

# set agent params
agent.setAttribute('horizon', HORIZON)
agent.setAttribute('selection', SELECTION)
agent.setAttribute('rationality', RATIONALITY)

# set agent rwd function
rwd_vector = create_reward_vector(agent, locations, world_map.moveActions[agent.name])
rwd_weights = random.sample(list(cluster_weights.values()), 1)[0]
rwd_vector.set_rewards(agent, rwd_weights)
logging.info('Set reward vector: {}'.format(dict(zip(rwd_vector.names, rwd_weights))))

# generates trajectory
logging.info('Generating trajectory of length {}...'.format(NUM_STEPS))
trajectory = generate_trajectory(agent, NUM_STEPS)
save_object(trajectory, os.path.join(OUTPUT_DIR, 'trajectory.pkl.gz'), True)

# print stats
plot_trajectories(agent, [trajectory], locations, loc_neighbors,
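
# A minimal sketch (an assumption: save_object above likely behaves roughly
# like this) of persisting a trajectory as a gzip-compressed pickle, plus the
# matching loader, using only the standard library.
import gzip
import pickle

def save_object(obj, path, compress=True):
    opener = gzip.open if compress else open
    with opener(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_object(path, compressed=True):
    opener = gzip.open if compressed else open
    with opener(path, 'rb') as f:
        return pickle.load(f)

save_object([('state0', 'action0')], 'trajectory.pkl.gz')
print(load_object('trajectory.pkl.gz'))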