def process_samples(self, itr, paths):
    baselines = []
    returns = []
    for path in paths:
        path_baselines = np.append(self.algo.baseline.predict(path), 0)
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
        advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
        ev = special.explained_variance_1d(
            np.concatenate(baselines),
            np.concatenate(returns)
        )

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = np.array([tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.array([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = np.array([tensor_utils.pad_tensor(a, max_path_length) for a in actions])

        rewards = [path["rewards"] for path in paths]
        rewards = np.array([tensor_utils.pad_tensor(r, max_path_length) for r in rewards])

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos]
        )

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos]
        )

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = np.array([tensor_utils.pad_tensor(v, max_path_length) for v in valids])

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)
        ev = special.explained_variance_1d(
            np.concatenate(baselines),
            np.concatenate(returns)
        )

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    self.algo.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
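
# --- Illustrative helper (not part of the original file) ---
# Every variant in this file leans on special.discount_cumsum to turn rewards and
# TD deltas into discounted sums. A minimal sketch of such a helper is shown below
# for reference; the exact rllab implementation may differ, so treat the body and
# the name discount_cumsum_sketch as assumptions.
import numpy as np
import scipy.signal

def discount_cumsum_sketch(x, discount):
    # y[t] = x[t] + discount * x[t + 1] + discount**2 * x[t + 2] + ...
    # computed with a linear filter over the reversed sequence.
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

# Example: discount_cumsum_sketch(np.array([1., 1., 1.]), 0.5) -> [1.75, 1.5, 1.]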
def process_samples(self, itr, paths, prefix='', log=True, task_idx=0,
                    noise_opt=False, joint_opt=False, sess=None):
    baselines = []
    returns = []
    for idx, path in enumerate(paths):
        path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount)

    if log:
        logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        # NOTE: samples_data is not defined yet at this point, so this branch
        # would raise a NameError if it were ever taken.
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths, log=log)
    if log:
        logger.log("fitted")

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.algo.baseline.predict(path) for path in paths]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if joint_opt is True:
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
        noises = tensor_utils.concat_tensor_list([path["noises"] for path in paths])
        task_idxs = task_idx * np.ones((len(noises),), dtype=np.int32)
        actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list([path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        debug_avg_ret = np.mean(undiscounted_returns)
        # mean = sess.run(self.algo.policy.all_params["latent_means"])
        # std = sess.run(self.algo.policy.all_params["latent_stds"])
        # import ipdb
        # ipdb.set_trace()
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            noises=noises,
            task_idxs=task_idxs,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )

        observations_latent = tensor_utils.concat_tensor_list([path["observations"][0:1] for path in paths])
        noises_latent = tensor_utils.concat_tensor_list([path["noises"][0:1] for path in paths])
        task_idxs_latent = task_idx * np.ones((len(noises_latent),), dtype=np.int32)
        actions_latent = tensor_utils.concat_tensor_list([path["actions"][0:1] for path in paths])
        rewards_latent = tensor_utils.concat_tensor_list([path["rewards"][0:1] for path in paths])
        returns_latent = tensor_utils.concat_tensor_list([path["returns"][0:1] for path in paths])
        advantages_latent = tensor_utils.concat_tensor_list([path["advantages"][0:1] for path in paths])
        env_infos_latent = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        agent_infos_latent = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages_latent = util.center_advantages(advantages_latent)
        if self.algo.positive_adv:
            advantages_latent = util.shift_advantages_to_positive(advantages_latent)

        samples_data_latent = dict(
            observations=observations_latent,
            noises=noises_latent,
            task_idxs=task_idxs_latent,
            actions=actions_latent,
            rewards=rewards_latent,
            returns=returns_latent,
            advantages=advantages_latent,
            env_infos=env_infos_latent,
            agent_infos=agent_infos_latent,
            paths=paths,
        )
    elif noise_opt is False:
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
        noises = tensor_utils.concat_tensor_list([path["noises"] for path in paths])
        task_idxs = task_idx * np.ones((len(noises),), dtype=np.int32)
        actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list([path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])

        for path in paths:
            for key in path['agent_infos']:
                if key == 'prob' and len(path['agent_infos'][key].shape) == 3:
                    path['agent_infos'][key] = path['agent_infos'][key][:, 0]
        agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            noises=noises,
            task_idxs=task_idxs,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    elif noise_opt is True:
        observations = tensor_utils.concat_tensor_list([path["observations"][0:1] for path in paths])
        noises = tensor_utils.concat_tensor_list([path["noises"][0:1] for path in paths])
        task_idxs = task_idx * np.ones((len(noises),), dtype=np.int32)
        actions = tensor_utils.concat_tensor_list([path["actions"][0:1] for path in paths])
        rewards = tensor_utils.concat_tensor_list([path["rewards"][0:1] for path in paths])
        returns = tensor_utils.concat_tensor_list([path["returns"][0:1] for path in paths])
        advantages = tensor_utils.concat_tensor_list([path["advantages"][0:1] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            noises=noises,
            task_idxs=task_idxs,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )

    if log:
        # logger.record_tabular('Iteration', itr)
        # logger.record_tabular('AverageDiscountedReturn',
        #                       average_discounted_return)
        # for key in path['env_infos']:
        #     info_returns = [sum(path["env_infos"][key]) for path in paths]
        #     logger.record_tabular(prefix + 'Average' + key, np.mean(info_returns))
        #     logger.record_tabular(prefix + 'Max' + key, np.max(info_returns))
        logger.record_tabular(prefix + 'AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular(prefix + 'ExplainedVariance', ev)
        logger.record_tabular(prefix + 'NumTrajs', len(paths))
        logger.record_tabular(prefix + 'Entropy', ent)
        logger.record_tabular(prefix + 'Perplexity', np.exp(ent))
        logger.record_tabular(prefix + 'StdReturn', np.std(undiscounted_returns))
        logger.record_tabular(prefix + 'MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular(prefix + 'MinReturn', np.min(undiscounted_returns))

    if joint_opt is True:
        return samples_data, samples_data_latent
    else:
        return samples_data
def create_samples_dict(self, paths):
    if self.algo.safety_constraint:
        if self.use_safety_bonus:
            safety_key = 'safety_robust' + self.algo.safety_key[6:]
        else:
            safety_key = self.algo.safety_key
        logger.log("Policy optimization is using safety_key=%s." % safety_key)

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list([path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])
        weights = tensor_utils.concat_tensor_list([path["weights"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            weights=weights,
            paths=paths,
        )

        if self.algo.safety_constraint:
            safety_vals = tensor_utils.concat_tensor_list([path[safety_key] for path in paths])
            samples_data['safety_values'] = safety_vals  # for gradient calculation
            if self.algo.center_safety_vals:
                samples_data['safety_offset'] = np.mean(safety_vals)
                samples_data['safety_values'] = samples_data['safety_values'] - samples_data['safety_offset']
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos]
        )

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos]
        )

        weights = [path["weights"] for path in paths]
        weights = tensor_utils.pad_tensor_n(weights, max_path_length)

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            weights=weights,
            paths=paths,
        )

        if self.algo.safety_constraint:
            safety_vals = [path[safety_key] for path in paths]
            if self.algo.center_safety_vals:
                samples_data['safety_offset'] = np.mean(safety_vals)
                safety_vals = safety_vals - samples_data['safety_offset']
            safety_vals = tensor_utils.pad_tensor_n(safety_vals, max_path_length)
            samples_data['safety_values'] = safety_vals

    if self.algo.safety_constraint:
        if self.algo.safety_key == 'safety_rewards':
            if self.use_safety_bonus:
                key = 'safety_robust_rewards'
            else:
                key = 'safety_rewards'
            safety_eval = np.mean(tensor_utils.concat_tensor_list(
                [path[key] for path in self.experience_replay[-1]]
            ))
        else:
            if self.use_safety_bonus:
                key = 'safety_robust_returns'
            else:
                key = 'safety_returns'
            safety_eval = np.mean(
                [path[key][0] for path in self.experience_replay[-1]]
            )
        samples_data['safety_eval'] = safety_eval  # linearization constant
        samples_data['safety_rescale'] = len(samples_data['safety_values']) / sum(
            [len(paths) for paths in self.experience_replay])

    return samples_data
def process_samples(self, itr, paths):
    baselines = []
    returns = []

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.algo.baseline.predict(path) for path in paths]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list([path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
def process_samples(self, itr, paths):
    # IMPORTANT:
    # Rewards accrued from a_t to a_t+1 are expected to be discounted by
    # the environment to values at time t
    # paths = list(itertools.chain.from_iterable(paths))
    baselines = []
    returns = []

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.algo.baseline.predict(path) for path in paths]

    for idx, path in enumerate(paths):
        t_sojourn = path["offset_t_sojourn"]
        gamma = self.algo.discount
        lamda = self.algo.gae_lambda
        discount_gamma = np.exp(-gamma * t_sojourn)
        discount_gamma_lambda = np.exp(-gamma * lamda * t_sojourn)

        path_baselines = np.append(all_path_baselines[idx], 0)
        if len(path["rewards"]) != len(t_sojourn):
            # TODO HANDLE INFINITE HORIZON GAMES
            pdb.set_trace()
        deltas = path["rewards"] + \
                 discount_gamma * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = variable_discount_cumsum(deltas, discount_gamma_lambda)
        path["returns"] = variable_discount_cumsum(path["rewards"], discount_gamma)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list([path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
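
# --- Illustrative helper (not part of the original file) ---
# The semi-Markov variant above discounts by exp(-gamma * t_sojourn), so each step
# carries its own discount factor and variable_discount_cumsum replaces
# special.discount_cumsum. A plausible reference implementation is sketched below
# under the assumption that it realizes y[t] = x[t] + d[t] * y[t + 1]; the actual
# helper in the source repository may differ.
import numpy as np

def variable_discount_cumsum_sketch(x, discounts):
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discounts[t] * running  # per-step discount d[t]
        out[t] = running
    return out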
def process_samples(self, itr, paths, prefix='', log=True, fast_process=False,
                    testitr=False, metalearn_baseline=False, isExpertTraj=False):
    baselines = []
    returns = []
    if testitr:
        metalearn_baseline = False
    train_baseline = (itr in BASELINE_TRAINING_ITRS)

    if not fast_process:
        for idx, path in enumerate(paths):
            path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount)

    if not fast_process and not metalearn_baseline:
        if log:
            pass  # logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)  # TODO: doesn't seem like this is ever used
        else:
            # print("debug21 baseline before fitting", self.algo.baseline.predict(paths[0])[0:2], "...", self.algo.baseline.predict(paths[0])[-3:-1])
            # print("debug23 predloss before fitting", np.mean([np.mean(np.square(p['returns'] - self.algo.baseline.predict(p))) for p in paths]))
            self.algo.baseline.fit(paths, log=log)
            # print("debug25 predloss AFTER fitting", np.mean([np.mean(np.square(p['returns'] - self.algo.baseline.predict(p))) for p in paths]))
            # print("debug22 returns ", paths[0]['returns'][0:2], "...", paths[0]['returns'][-3:-1])
            # print("debug24 baseline after fitting", self.algo.baseline.predict(paths[0])[0:2], "...", self.algo.baseline.predict(paths[0])[-3:-1])
        if log:
            pass  # logger.log("fitted")
        if 'switch_to_init_dist' in dir(self.algo.baseline):
            self.algo.baseline.switch_to_init_dist()
        if train_baseline:
            self.algo.baseline.fit_train_baseline(paths)
        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [self.algo.baseline.predict(path) for path in paths]

    for idx, path in enumerate(paths):
        if not fast_process and not metalearn_baseline:
            # if idx == 0:
            #     print("debug22", all_path_baselines[idx])
            #     print("debug23", path['returns'])
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            baselines.append(path_baselines[:-1])
        if not fast_process:
            returns.append(path["returns"])
        if "expert_actions" not in path.keys():
            if "expert_actions" in path["env_infos"].keys():
                path["expert_actions"] = path["env_infos"]["expert_actions"]
            else:
                # assert False, "you shouldn't need expert_actions"
                path["expert_actions"] = np.array([[None] * len(path['actions'][0])] * len(path['actions']))

    if not fast_process and not metalearn_baseline:  # TODO: we want the ev eventually
        ev = special.explained_variance_1d(
            np.concatenate(baselines),
            np.concatenate(returns)
        )
        l2 = np.linalg.norm(np.array(baselines) - np.array(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
        if not fast_process:
            rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list([path["returns"] for path in paths])
            if "env_infos" in paths[0].keys():
                env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        if not fast_process and not metalearn_baseline:
            advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
            # print("debug, advantages are", advantages,)
            # print("debug, shape of advantages is", type(advantages), np.shape(advantages))
        expert_actions = tensor_utils.concat_tensor_list([path["expert_actions"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        if not fast_process and not metalearn_baseline:
            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)
            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)
            if "meta_predict" in dir(self.algo.baseline):
                # print("debug, advantages are", advantages, )
                advantages = advantages + self.algo.baseline.meta_predict(observations)
                print("debug, metalearned baseline constant is",
                      self.algo.baseline.meta_predict(observations)[0:2], "...",
                      self.algo.baseline.meta_predict(observations)[-3:-1])
                # print("debug, metalearned baseline constant shape is", np.shape(self.algo.baseline.meta_predict(observations)))
                # print("debug, advantages are", advantages[0:2], "...", advantages[-3:-1])
                # print("debug, advantages shape is", np.shape(advantages))

        # average_discounted_return = \
        #     np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path.get("rewards", [0])) for path in paths]
        # ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        if fast_process:
            samples_data = dict(
                observations=observations,
                actions=actions,
                agent_infos=agent_infos,
                paths=paths,
                expert_actions=expert_actions,
            )
        elif metalearn_baseline:
            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                agent_infos=agent_infos,
                paths=paths,
                expert_actions=expert_actions,
            )
            if 'agent_infos_orig' in paths[0].keys():
                agent_infos_orig = tensor_utils.concat_tensor_dict_list([path["agent_infos_orig"] for path in paths])
                samples_data["agent_infos_orig"] = agent_infos_orig
        else:
            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
                expert_actions=expert_actions,
            )
            if 'agent_infos_orig' in paths[0].keys():
                agent_infos_orig = tensor_utils.concat_tensor_dict_list([path["agent_infos_orig"] for path in paths])
                samples_data["agent_infos_orig"] = agent_infos_orig
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos]
        )

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos]
        )

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path.get("rewards", [0])) for path in paths]
        # ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    if log:
        # logger.record_tabular('Iteration', itr)
        # logger.record_tabular('AverageDiscountedReturn',
        #                       average_discounted_return)
        logger.record_tabular(prefix + 'NumTrajs', len(paths))
        if testitr and prefix == "1":  # TODO make this functional for more than 1 iteration
            self.memory["AverageReturnLastTest"] = np.mean(undiscounted_returns)
            self.memory["AverageReturnBestTest"] = max(self.memory["AverageReturnLastTest"],
                                                       self.memory["AverageReturnBestTest"])
            if self.memory["AverageReturnBestTest"] == 0.0:
                self.memory["AverageReturnBestTest"] = self.memory["AverageReturnLastTest"]
        if not testitr and prefix == '1':
            logger.record_tabular(prefix + 'AverageExpertReturn', np.mean(undiscounted_returns))
        # if testitr:
        logger.record_tabular(prefix + 'AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular(prefix + 'StdReturn', np.std(undiscounted_returns))
        logger.record_tabular(prefix + 'MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular(prefix + 'MinReturn', np.min(undiscounted_returns))
        if not fast_process and not metalearn_baseline:
            logger.record_tabular(prefix + 'ExplainedVariance', ev)
            logger.record_tabular(prefix + 'BaselinePredLoss', l2)
        # logger.record_tabular(prefix + 'Entropy', ent)
        # logger.record_tabular(prefix + 'Perplexity', np.exp(ent))
        # if "env_infos" in paths[0].keys() and "success_left" in paths[0]["env_infos"].keys():
        #     logger.record_tabular(prefix + 'success_left', eval_success_left(paths))
        #     logger.record_tabular(prefix + 'success_right', eval_success_right(paths))
        # else:
        #     logger.record_tabular(prefix + 'success_left', -1.0)
        #     logger.record_tabular(prefix + 'success_right', -1.0)

    # if metalearn_baseline:
    #     if hasattr(self.algo.baseline, "revert"):
    #         self.algo.baseline.revert()
    return samples_data
def process_samples(self, itr, paths, update_baseline=True):
    baselines = []
    returns = []

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.algo.baseline.predict(path) for path in paths]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    if hasattr(self.algo, 'epopt_epsilon'):
        if self.algo.epopt_epsilon < 1.0 and self.algo.epopt_after_iter <= itr:
            # prune the paths
            target_path_size = len(paths) * self.algo.epopt_epsilon
            sorted_indices = np.argsort([path["returns"][0] for path in paths])
            idx = 0
            si_idx = 0
            while True:
                if sorted_indices[si_idx] > target_path_size:
                    paths.pop(idx)
                    idx -= 1
                idx += 1
                si_idx += 1
                if idx >= len(paths):
                    break

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list([path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = []
        ct = 0
        for path in paths:
            if path['env_infos']['dyn_model_id'][-1] == 0:
                undiscounted_returns.append(sum(path["rewards"]))
            if path['env_infos']['dyn_model_id'][-1] == 1:
                ct += 1
        print('path count with fake dynamics: ', ct, len(undiscounted_returns), len(paths))

        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    if update_baseline:
        logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
def process_samples_skill_dependent(self, itr, paths):
    # need to generate the correct observations using the outer product
    new_paths = []
    for i in range(len(paths)):
        latents = paths[i]['agent_infos']['latents']
        observations = paths[i]['observations']
        # insert the time_remaining
        time_remaining = paths[i]['agent_infos']['time_remaining'].reshape(len(observations), 1)
        extended_obs = np.concatenate([observations, time_remaining], axis=1)
        # new_observations = np.matmul(observations[:, :, np.newaxis], latents[:, np.newaxis, :]).reshape(observations.shape[0], -1)
        new_observations = np.matmul(extended_obs[:, :, np.newaxis],
                                     latents[:, np.newaxis, :]).reshape(extended_obs.shape[0], -1)
        new_observations = np.concatenate([new_observations, extended_obs, latents], axis=1)
        new_paths.append(
            dict(observations=new_observations,
                 rewards=paths[i]['rewards'],
                 returns=paths[i]['returns']))
    paths = new_paths

    baselines = []
    returns = []
    if hasattr(self.algo.skill_dependent_baseline, "predict_n"):
        all_path_baselines = self.algo.skill_dependent_baseline.predict_n(paths)
    else:
        all_path_baselines = [self.algo.skill_dependent_baseline.predict(path) for path in paths]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)
        samples_data = dict(advantages=advantages, )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])
        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray([tensor_utils.pad_tensor(a, max_path_length) for a in adv])
        samples_data = dict(advantages=adv, )

    logger.log("fitting skill-dependent baseline...")
    if hasattr(self.algo.skill_dependent_baseline, 'fit_with_samples'):
        self.algo.skill_dependent_baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.skill_dependent_baseline.fit(paths)
    logger.log("fitted skill-dependent baseline")

    logger.record_tabular('SkillBaselineExplainedVariance', ev)
    return samples_data
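
# --- Shape check for the outer-product observations (illustrative, assumed shapes) ---
# process_samples_skill_dependent above builds skill-conditioned features as the
# outer product of the (observation, time_remaining) vector with the latent vector.
# For N steps, D-dimensional observations and K-dimensional latents, the result has
# (D + 1) * K + (D + 1) + K columns. A tiny standalone check of that construction:
import numpy as np

N, D, K = 5, 3, 2
extended_obs = np.random.randn(N, D + 1)   # observations plus time_remaining column
latents = np.random.randn(N, K)            # per-step skill latents
outer = np.matmul(extended_obs[:, :, np.newaxis],
                  latents[:, np.newaxis, :]).reshape(N, -1)  # (N, (D + 1) * K)
new_obs = np.concatenate([outer, extended_obs, latents], axis=1)
assert new_obs.shape == (N, (D + 1) * K + (D + 1) + K)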
def process_samples(self, itr, paths):
    if self.normalize_reward:
        # Update reward mean/std Q.
        rewards = []
        for i in xrange(len(paths)):
            rewards.append(paths[i]['rewards'])
        rewards_flat = np.hstack(rewards)
        self._reward_mean.append(np.mean(rewards_flat))
        self._reward_std.append(np.std(rewards_flat))

        # Normalize rewards.
        reward_mean = np.mean(np.asarray(self._reward_mean))
        reward_std = np.mean(np.asarray(self._reward_std))
        for i in xrange(len(paths)):
            paths[i]['rewards'] = (paths[i]['rewards'] - reward_mean) / (reward_std + 1e-8)

    if itr > 0:
        kls = []
        for i in xrange(len(paths)):
            kls.append(paths[i]['KL'])
        kls_flat = np.hstack(kls)

        logger.record_tabular('Expl_MeanKL', np.mean(kls_flat))
        logger.record_tabular('Expl_StdKL', np.std(kls_flat))
        logger.record_tabular('Expl_MinKL', np.min(kls_flat))
        logger.record_tabular('Expl_MaxKL', np.max(kls_flat))

        # Perform normalization of the intrinsic rewards.
        if self.use_kl_ratio:
            if self.use_kl_ratio_q:
                # Update KL Q
                self.kl_previous.append(np.median(np.hstack(kls)))
                previous_mean_kl = np.mean(np.asarray(self.kl_previous))
                for i in xrange(len(kls)):
                    kls[i] = kls[i] / previous_mean_kl

        # Add KL as intrinsic reward to external reward
        for i in xrange(len(paths)):
            paths[i]['rewards'] = paths[i]['rewards'] + self.eta * kls[i]

        # Discount eta
        self.eta *= self.eta_discount
    else:
        logger.record_tabular('Expl_MeanKL', 0.)
        logger.record_tabular('Expl_StdKL', 0.)
        logger.record_tabular('Expl_MinKL', 0.)
        logger.record_tabular('Expl_MaxKL', 0.)

    baselines = []
    returns = []
    for path in paths:
        path_baselines = np.append(self.baseline.predict(path), 0)
        deltas = path["rewards"] + \
                 self.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards_orig"], self.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    if not self.policy.recurrent:
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
        advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        if self.center_adv:
            advantages = util.center_advantages(advantages)
        if self.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards_orig"]) for path in paths]
        ent = np.mean(self.policy.distribution.entropy(agent_infos))
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = np.array([tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

        if self.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.array([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = np.array([tensor_utils.pad_tensor(a, max_path_length) for a in actions])

        rewards = [path["rewards"] for path in paths]
        rewards = np.array([tensor_utils.pad_tensor(r, max_path_length) for r in rewards])

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = np.array([tensor_utils.pad_tensor(v, max_path_length) for v in valids])

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.policy.distribution.entropy(agent_infos))
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    self.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
def process_samples(self, itr, paths):
    baselines = []
    returns = []
    for idx, path in enumerate(paths):
        path["returns"] = special.discount_cumsum(path["rewards"], self.algo.discount)
        path["advantages"] = path['returns']

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list([path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
def process_samples(self, itr, paths):
    if itr > 0:
        surprise = []
        for i in range(len(paths)):
            surprise.append(paths[i]['surprise'])
        surprise_flat = np.hstack(surprise)

        logger.record_tabular('Surprise_Mean', np.mean(surprise_flat))
        logger.record_tabular('Surprise_Std', np.std(surprise_flat))
        logger.record_tabular('Surprise_Min', np.min(surprise_flat))
        logger.record_tabular('Surprise_Max', np.max(surprise_flat))

        for i in range(len(paths)):
            paths[i]['rewards'] = paths[i]['rewards'] + self.eta * surprise[i]
    else:
        logger.record_tabular('Surprise_Mean', 0.)
        logger.record_tabular('Surprise_Std', 0.)
        logger.record_tabular('Surprise_Min', 0.)
        logger.record_tabular('Surprise_Max', 0.)

    baselines = []
    returns = []
    for path in paths:
        path_baselines = np.append(self.baseline.predict(path), 0)
        deltas = path["rewards"] + \
                 self.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards_extrinsic"], self.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
    actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
    rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
    advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
    env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
    agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

    if self.center_adv:
        advantages = util.center_advantages(advantages)
    if self.positive_adv:
        advantages = util.shift_advantages_to_positive(advantages)

    average_discounted_return = \
        np.mean([path["returns"][0] for path in paths])
    undiscounted_returns = [sum(path["rewards_extrinsic"]) for path in paths]
    ent = np.mean(self.policy.distribution.entropy(agent_infos))
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    samples_data = dict(
        observations=observations,
        actions=actions,
        rewards=rewards,
        advantages=advantages,
        env_infos=env_infos,
        agent_infos=agent_infos,
        paths=paths,
    )

    logger.log("fitting baseline...")
    self.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
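
# --- Usage sketch (illustrative, assumed names) ---
# All of the process_samples variants above sit between sampling and policy
# optimization in an rllab-style batch training loop. The surrounding algorithm
# and sampler classes are not shown in this file, so the loop below is an
# assumption about how such a sampler is typically driven, not the original code.
def train_loop_sketch(algo, sampler, n_itr):
    for itr in range(n_itr):
        paths = sampler.obtain_samples(itr)                  # roll out the current policy
        samples_data = sampler.process_samples(itr, paths)   # returns, advantages, diagnostics
        algo.optimize_policy(itr, samples_data)              # e.g. a TRPO/PPO update
        logger.dump_tabular(with_prefix=False)               # flush the recorded tabular stats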