def log_diagnostics(self, paths, **kwargs):
    """Log per-evaluation statistics for a batch of rollout paths.

    Records summary statistics (via ``create_stats_ordered_dict``) for the
    undiscounted returns, per-step rewards, and actions, plus the mean of
    observation component 1, then emits everything through
    ``logger.record_tabular``.

    :param paths: Rollout paths accepted by ``split_paths``.
    :param kwargs: Ignored; accepted for interface compatibility.
    :return: List of undiscounted returns, one per path.
    """
    list_of_rewards, terminals, obs, actions, next_obs = split_paths(paths)
    # Undiscounted return = plain sum of rewards along each path.
    # (Idiom fix: comprehension instead of loop-and-append.)
    returns = [np.sum(rewards) for rewards in list_of_rewards]
    statistics = OrderedDict()
    statistics.update(
        create_stats_ordered_dict(
            'Undiscounted Returns',
            returns,
        ))
    statistics.update(
        create_stats_ordered_dict(
            'Rewards',
            list_of_rewards,
        ))
    statistics.update(create_stats_ordered_dict(
        'Actions',
        actions,
    ))
    # Observation index 1 is treated as an on-platform indicator — assumes
    # the environment lays out its observation that way (TODO confirm).
    fraction_of_time_on_platform = [o[1] for o in obs]
    statistics['Fraction of time on platform'] = np.mean(
        fraction_of_time_on_platform)
    for key, value in statistics.items():
        logger.record_tabular(key, value)
    return returns
def log_diagnostics(self, paths, **kwargs):
    """Log per-evaluation statistics for a batch of rollout paths.

    Records summary statistics (via ``create_stats_ordered_dict``) for the
    undiscounted returns, per-step rewards, and actions, then emits them
    through ``logger.record_tabular``.

    :param paths: Rollout paths accepted by ``split_paths``.
    :param kwargs: Ignored; accepted for interface compatibility.
    :return: List of undiscounted returns, one per path.
    """
    list_of_rewards, terminals, obs, actions, next_obs = split_paths(paths)
    # Undiscounted return = plain sum of rewards along each path.
    # (Idiom fix: comprehension instead of loop-and-append.)
    returns = [np.sum(rewards) for rewards in list_of_rewards]
    last_statistics = OrderedDict()
    # NOTE(review): key has no space ('UndiscountedReturns') unlike sibling
    # implementations that log 'Undiscounted Returns'; kept as-is because it
    # is runtime-visible logged output that downstream tooling may parse.
    last_statistics.update(
        create_stats_ordered_dict(
            'UndiscountedReturns',
            returns,
        ))
    last_statistics.update(
        create_stats_ordered_dict(
            'Rewards',
            list_of_rewards,
        ))
    last_statistics.update(create_stats_ordered_dict(
        'Actions',
        actions,
    ))
    for key, value in last_statistics.items():
        logger.record_tabular(key, value)
    return returns
def _statistics_from_paths(self, paths, stat_prefix):
    """Build evaluation statistics for a set of rollout paths.

    Splits the paths into flat transition arrays, converts them into a
    torch batch, delegates to ``self._statistics_from_batch``, and appends
    a 'Num Paths' entry.

    :param paths: Rollout paths accepted by ``split_paths``.
    :param stat_prefix: Prefix string forwarded to the stat builders.
    :return: Ordered mapping of statistic name to value.
    """
    rewards, terminals, observations, actions, next_observations = (
        split_paths(paths)
    )
    torch_batch = np_to_pytorch_batch({
        'rewards': rewards,
        'terminals': terminals,
        'observations': observations,
        'actions': actions,
        'next_observations': next_observations,
    })
    stats = self._statistics_from_batch(torch_batch, stat_prefix)
    stats.update(create_stats_ordered_dict(
        'Num Paths',
        len(paths),
        stat_prefix=stat_prefix,
    ))
    return stats
def test_split_paths(self):
    """split_paths should concatenate the transitions of all paths in
    order, one row per time step, with each field shaped as a column
    vector (rewards, terminals, obs, actions, next_obs)."""
    paths = [create_path(0), create_path(1)]
    actual = split_paths(paths)
    expected = (
        np.array([[-1], [0], [1], [0], [1], [2]]),   # rewards
        np.array([[0], [0], [1], [0], [0], [1]]),    # terminals
        np.array([[2], [4], [8], [3], [5], [9]]),    # observations
        np.array([[5], [7], [9], [6], [8], [10]]),   # actions
        np.array([[4], [5], [9], [5], [6], [10]]),   # next observations
    )
    for actual_array, expected_array in zip(actual, expected):
        self.assertNpEqual(actual_array, expected_array)