# The excerpts below assume rllab-style module-level imports (not shown in the
# original; exact paths may differ between forks), e.g.:
#   import itertools
#   import pdb
#   import numpy as np
#   import rllab.misc.logger as logger
#   from rllab.misc import special, tensor_utils
#   from rllab.algos import util


def process_samples(self, itr, paths, prefix='', log=True):
    baselines = []
    returns = []
    for idx, path in enumerate(paths):
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
    if log:
        logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        # BUG in the original: this branch called
        # fit_with_samples(paths, samples_data), but samples_data is only
        # built further down, so it would raise a NameError. Fall back to the
        # plain fit() to keep the method runnable.
        self.algo.baseline.fit(paths, log=log)
    else:
        self.algo.baseline.fit(paths, log=log)
    if log:
        logger.log("fitted")

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.algo.baseline.predict(path)
                              for path in paths]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    # ev = special.explained_variance_1d(
    #     np.concatenate(baselines),
    #     np.concatenate(returns)
    # )

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        # ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in env_infos])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos)
                     * valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    if log:
        # logger.record_tabular('Iteration', itr)
        # logger.record_tabular('AverageDiscountedReturn',
        #                       average_discounted_return)
        logger.record_tabular(prefix + 'AverageReturn',
                              np.mean(undiscounted_returns))
        # logger.record_tabular(prefix + 'ExplainedVariance', ev)
        logger.record_tabular(prefix + 'NumTrajs', len(paths))
        # logger.record_tabular(prefix + 'Entropy', ent)
        # logger.record_tabular(prefix + 'Perplexity', np.exp(ent))
        logger.record_tabular(prefix + 'StdReturn',
                              np.std(undiscounted_returns))
        logger.record_tabular(prefix + 'MaxReturn',
                              np.max(undiscounted_returns))
        logger.record_tabular(prefix + 'MinReturn',
                              np.min(undiscounted_returns))
    return samples_data

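
# --- Added sketch (not in the original source): a minimal, self-contained
# illustration of the GAE computation used in process_samples above. It
# assumes special.discount_cumsum computes out[t] = sum_l discount**l * x[t+l];
# the names below are illustrative stand-ins, not rllab's actual API.

def _discount_cumsum_sketch(x, discount):
    """Reverse accumulation: out[t] = x[t] + discount * out[t + 1]."""
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out


def _gae_advantages_sketch(rewards, baselines, discount, gae_lambda):
    """Mirror of the loop above: append a terminal baseline of 0, form the
    TD residuals, then discount them by discount * gae_lambda."""
    b = np.append(baselines, 0.0)
    deltas = rewards + discount * b[1:] - b[:-1]
    return _discount_cumsum_sketch(deltas, discount * gae_lambda)


# Example: constant rewards and a zero baseline on a 3-step path.
# _gae_advantages_sketch(np.ones(3), np.zeros(3), 0.99, 0.97)
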
def process_samples(self, itr, paths, prefix='', log=True, fast_process=False,
                    testitr=False, metalearn_baseline=False, comet_logger=None):
    baselines = []
    returns = []
    if testitr:
        metalearn_baseline = False
    train_baseline = (itr in BASELINE_TRAINING_ITRS)

    if not fast_process:
        for idx, path in enumerate(paths):
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)

    if not fast_process and not metalearn_baseline:
        if log:
            logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            # BUG in the original: it called fit_with_samples(paths,
            # samples_data) here, but samples_data is only built further down,
            # so this branch would raise a NameError. (A TODO in the source
            # noted it never seems to be used.) Fall back to the plain fit().
            self.algo.baseline.fit(paths, log=log)
        else:
            self.algo.baseline.fit(paths, log=log)
        if log:
            logger.log("fitted")
        if 'switch_to_init_dist' in dir(self.algo.baseline):
            self.algo.baseline.switch_to_init_dist()
        if train_baseline:
            self.algo.baseline.fit_train_baseline(paths)
        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [self.algo.baseline.predict(path)
                                  for path in paths]

    for idx, path in enumerate(paths):
        if not fast_process and not metalearn_baseline:
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                self.algo.discount * path_baselines[1:] - \
                path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            baselines.append(path_baselines[:-1])
        if not fast_process:
            returns.append(path["returns"])
        if "expert_actions" not in path.keys():
            if "expert_actions" in path["env_infos"].keys():
                path["expert_actions"] = path["env_infos"]["expert_actions"]
            else:
                # assert False, "you shouldn't need expert_actions"
                path["expert_actions"] = np.array(
                    [[None] * len(path['actions'][0])] * len(path['actions']))

    if not fast_process and not metalearn_baseline:
        # TODO: we want the ev eventually
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))
        # NOTE: this norm only works when all paths have equal length.
        l2 = np.linalg.norm(np.array(baselines) - np.array(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        if not fast_process:
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path["returns"] for path in paths])
            if "env_infos" in paths[0].keys():
                env_infos = tensor_utils.concat_tensor_dict_list(
                    [path["env_infos"] for path in paths])
        if not fast_process and not metalearn_baseline:
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
        expert_actions = tensor_utils.concat_tensor_list(
            [path["expert_actions"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if not fast_process and not metalearn_baseline:
            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)
            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)
            if "meta_predict" in dir(self.algo.baseline):
                advantages = advantages + self.algo.baseline.meta_predict(
                    observations)
                print("debug, metalearned baseline constant is",
                      self.algo.baseline.meta_predict(observations)[0:2], "...",
                      self.algo.baseline.meta_predict(observations)[-3:-1])

        # average_discounted_return = \
        #     np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path.get("rewards", [0]))
                                for path in paths]
        # ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        if fast_process:
            samples_data = dict(
                observations=observations,
                actions=actions,
                agent_infos=agent_infos,
                paths=paths,
                expert_actions=expert_actions,
            )
        elif metalearn_baseline:
            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                agent_infos=agent_infos,
                paths=paths,
                expert_actions=expert_actions,
            )
            if 'agent_infos_orig' in paths[0].keys():
                agent_infos_orig = tensor_utils.concat_tensor_dict_list(
                    [path["agent_infos_orig"] for path in paths])
                samples_data["agent_infos_orig"] = agent_infos_orig
        else:
            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
                expert_actions=expert_actions,
            )
            if 'agent_infos_orig' in paths[0].keys():
                agent_infos_orig = tensor_utils.concat_tensor_dict_list(
                    [path["agent_infos_orig"] for path in paths])
                samples_data["agent_infos_orig"] = agent_infos_orig
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in env_infos])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path.get("rewards", [0]))
                                for path in paths]
        # ent = np.sum(self.algo.policy.distribution.entropy(agent_infos)
        #              * valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    if log and comet_logger:
        comet_logger.log_metric('StdReturn', np.std(undiscounted_returns))
        comet_logger.log_metric('MaxReturn', np.max(undiscounted_returns))
        comet_logger.log_metric('MinReturn', np.min(undiscounted_returns))
        comet_logger.log_metric('AverageReturn',
                                np.mean(undiscounted_returns))

    if log:
        # logger.record_tabular('Iteration', itr)
        # logger.record_tabular('AverageDiscountedReturn',
        #                       average_discounted_return)
        logger.record_tabular(prefix + 'AverageReturn',
                              np.mean(undiscounted_returns))
        if testitr and prefix == "1":
            # TODO: make this functional for more than 1 iteration
            self.memory["AverageReturnLastTest"] = np.mean(
                undiscounted_returns)
            self.memory["AverageReturnBestTest"] = max(
                self.memory["AverageReturnLastTest"],
                self.memory["AverageReturnBestTest"])
            if self.memory["AverageReturnBestTest"] == 0.0:
                self.memory["AverageReturnBestTest"] = \
                    self.memory["AverageReturnLastTest"]
        if not fast_process and not metalearn_baseline:
            logger.record_tabular(prefix + 'ExplainedVariance', ev)
            logger.record_tabular(prefix + 'BaselinePredLoss', l2)
            if comet_logger:
                comet_logger.log_metric('ExplainedVariance', ev)
                comet_logger.log_metric('BaselinePredLoss', l2)
        logger.record_tabular(prefix + 'NumTrajs', len(paths))
        # logger.record_tabular(prefix + 'Entropy', ent)
        # logger.record_tabular(prefix + 'Perplexity', np.exp(ent))
        logger.record_tabular(prefix + 'StdReturn',
                              np.std(undiscounted_returns))
        logger.record_tabular(prefix + 'MaxReturn',
                              np.max(undiscounted_returns))
        logger.record_tabular(prefix + 'MinReturn',
                              np.min(undiscounted_returns))
        if ("env_infos" in paths[0].keys()
                and "success_left" in paths[0]["env_infos"].keys()):
            logger.record_tabular(prefix + 'success_left',
                                  eval_success_left(paths))
            logger.record_tabular(prefix + 'success_right',
                                  eval_success_right(paths))
            if comet_logger:
                comet_logger.log_metric('success_left',
                                        eval_success_left(paths))
                comet_logger.log_metric('success_right',
                                        eval_success_right(paths))
        # else:
        #     logger.record_tabular(prefix + 'success_left', -1.0)
        #     logger.record_tabular(prefix + 'success_right', -1.0)
    # if metalearn_baseline:
    #     if hasattr(self.algo.baseline, "revert"):
    #         self.algo.baseline.revert()
    return samples_data

def create_samples_dict(self, paths):
    if self.algo.safety_constraint:
        if self.use_safety_bonus:
            safety_key = 'safety_robust' + self.algo.safety_key[6:]
        else:
            safety_key = self.algo.safety_key
        logger.log("Policy optimization is using safety_key=%s." % safety_key)

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])
        weights = tensor_utils.concat_tensor_list(
            [path["weights"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            weights=weights,
            paths=paths,
        )

        if self.algo.safety_constraint:
            safety_vals = tensor_utils.concat_tensor_list(
                [path[safety_key] for path in paths])
            samples_data['safety_values'] = safety_vals  # for gradient calculation
            if self.algo.center_safety_vals:
                samples_data['safety_offset'] = np.mean(safety_vals)
                samples_data['safety_values'] = (
                    samples_data['safety_values']
                    - samples_data['safety_offset'])
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in env_infos])

        weights = [path["weights"] for path in paths]
        weights = tensor_utils.pad_tensor_n(weights, max_path_length)

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            weights=weights,
            paths=paths,
        )

        if self.algo.safety_constraint:
            safety_vals = [path[safety_key] for path in paths]
            if self.algo.center_safety_vals:
                samples_data['safety_offset'] = np.mean(safety_vals)
                safety_vals = safety_vals - samples_data['safety_offset']
            safety_vals = tensor_utils.pad_tensor_n(safety_vals,
                                                    max_path_length)
            samples_data['safety_values'] = safety_vals

    if self.algo.safety_constraint:
        # logic currently only supports a linearization constant calculated
        # on the most recent batch of data, because importance sampling is
        # complicated
        if self.algo.safety_key == 'safety_rewards':
            if self.use_safety_bonus:
                key = 'safety_robust_rewards'
            else:
                key = 'safety_rewards'
            safety_eval = np.mean(tensor_utils.concat_tensor_list(
                [path[key] for path in self.experience_replay[-1]]))
        else:
            if self.use_safety_bonus:
                key = 'safety_robust_returns'
            else:
                key = 'safety_returns'
            safety_eval = np.mean(
                [path[key][0] for path in self.experience_replay[-1]])
        samples_data['safety_eval'] = safety_eval  # linearization constant
        samples_data['safety_rescale'] = (
            len(samples_data['safety_values'])
            / sum([len(paths) for paths in self.experience_replay]))

    return samples_data

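
# --- Added sketch (not in the original source): how the safety-value
# centering above behaves on toy numpy data; the array here is a stand-in
# for the concatenated per-step safety values.

def _center_safety_values_sketch(safety_vals):
    """Returns (centered values, offset); the offset is what the method
    stores as samples_data['safety_offset']."""
    offset = np.mean(safety_vals)
    return safety_vals - offset, offset


# Example: the centered values are zero-mean by construction.
# centered, offset = _center_safety_values_sketch(np.array([0.2, 0.5, 0.8]))
# assert abs(np.mean(centered)) < 1e-12
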
def process_samples(self, itr, paths):
    baselines = []
    returns = []
    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.algo.baseline.predict(path)
                              for path in paths]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        baselines_tensor = tensor_utils.concat_tensor_list(baselines)
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        etas = None
        if (hasattr(self.algo, 'qprop') and self.algo.qprop
                and self.algo.qprop_enable):
            old_advantages = np.copy(advantages)
            old_advantages = self.process_advantages(old_advantages)
            old_advantages_scale = np.abs(old_advantages).mean()
            logger.record_tabular("AbsLearnSignalOld", old_advantages_scale)
            logger.log("Qprop, subtracting control variate")
            advantages_bar = self.algo.get_control_variate(
                observations=observations, actions=actions)
            if self.algo.qprop_eta_option == 'ones':
                etas = np.ones_like(advantages)
            elif self.algo.qprop_eta_option == 'adapt1':  # conservative
                etas = (advantages * advantages_bar) > 0
                etas = etas.astype(advantages.dtype)
                logger.log("Qprop, etas: %d 1s, %d 0s" % (
                    (etas == 1).sum(), (etas == 0).sum()))
            elif self.algo.qprop_eta_option == 'adapt2':  # aggressive
                etas = np.sign(advantages * advantages_bar)
                etas = etas.astype(advantages.dtype)
                logger.log("Qprop, etas: %d 1s, %d -1s" % (
                    (etas == 1).sum(), (etas == -1).sum()))
            else:
                raise NotImplementedError(self.algo.qprop_eta_option)
            advantages -= etas * advantages_bar
            advantages = self.process_advantages(advantages)
            advantages_scale = np.abs(advantages).mean()
            logger.record_tabular("AbsLearnSignalNew", advantages_scale)
            logger.record_tabular("AbsLearnSignal", advantages_scale)
        else:
            advantages = self.process_advantages(advantages)
            advantages_scale = np.abs(advantages).mean()
            logger.record_tabular("AbsLearnSignal", advantages_scale)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
            baselines=baselines_tensor,
            etas=etas,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in env_infos])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        baselines_tensor = tensor_utils.pad_tensor_n(baselines,
                                                     max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos)
                     * valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            baselines=baselines_tensor,
        )

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    return samples_data

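
# --- Added sketch (not in the original source): the two adaptive eta rules
# from the Q-Prop branch above, isolated as a pure function. advantages is
# the Monte-Carlo/GAE estimate; advantages_bar is the control-variate
# estimate returned by get_control_variate.

def _qprop_etas_sketch(advantages, advantages_bar, option):
    if option == 'ones':
        return np.ones_like(advantages)
    if option == 'adapt1':  # conservative: eta in {0, 1}
        return ((advantages * advantages_bar) > 0).astype(advantages.dtype)
    if option == 'adapt2':  # aggressive: eta in {-1, 0, 1}
        return np.sign(advantages * advantages_bar).astype(advantages.dtype)
    raise NotImplementedError(option)


# Example: the corrected learning signal is advantages - etas * advantages_bar.
# etas = _qprop_etas_sketch(np.array([0.5, -0.3]), np.array([0.4, 0.2]), 'adapt1')
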
def process_samples(self, itr, paths):
    if self.algo.ma_mode == 'centralized':
        return super().process_samples(itr, paths)
    elif self.algo.ma_mode == 'decentralized':
        return super().process_samples(
            itr, list(itertools.chain.from_iterable(paths)))
    elif self.algo.ma_mode == 'concurrent':
        processed_samples = []
        for ps, policy, baseline in zip(paths, self.algo.policies,
                                        self.algo.baselines):
            baselines = []
            returns = []
            if hasattr(baseline, "predict_n"):
                all_path_baselines = baseline.predict_n(ps)
            else:
                all_path_baselines = [baseline.predict(path) for path in ps]

            for idx, path in enumerate(ps):
                path_baselines = np.append(all_path_baselines[idx], 0)
                deltas = path["rewards"] + \
                    self.algo.discount * path_baselines[1:] - \
                    path_baselines[:-1]
                path["advantages"] = special.discount_cumsum(
                    deltas, self.algo.discount * self.algo.gae_lambda)
                path["returns"] = special.discount_cumsum(
                    path["rewards"], self.algo.discount)
                baselines.append(path_baselines[:-1])
                returns.append(path["returns"])

            ev = special.explained_variance_1d(np.concatenate(baselines),
                                               np.concatenate(returns))

            if not policy.recurrent:
                observations = tensor_utils.concat_tensor_list(
                    [path["observations"] for path in ps])
                actions = tensor_utils.concat_tensor_list(
                    [path["actions"] for path in ps])
                rewards = tensor_utils.concat_tensor_list(
                    [path["rewards"] for path in ps])
                returns = tensor_utils.concat_tensor_list(
                    [path["returns"] for path in ps])
                advantages = tensor_utils.concat_tensor_list(
                    [path["advantages"] for path in ps])
                env_infos = tensor_utils.concat_tensor_dict_list(
                    [path["env_infos"] for path in ps])
                agent_infos = tensor_utils.concat_tensor_dict_list(
                    [path["agent_infos"] for path in ps])

                if self.algo.center_adv:
                    advantages = util.center_advantages(advantages)
                if self.algo.positive_adv:
                    advantages = util.shift_advantages_to_positive(advantages)

                average_discounted_return = \
                    np.mean([path["returns"][0] for path in ps])
                undiscounted_returns = [sum(path["rewards"]) for path in ps]
                ent = np.mean(policy.distribution.entropy(agent_infos))

                samples_data = dict(
                    observations=observations,
                    actions=actions,
                    rewards=rewards,
                    returns=returns,
                    advantages=advantages,
                    env_infos=env_infos,
                    agent_infos=agent_infos,
                    ps=ps,
                )
            else:
                max_path_length = max(
                    [len(path["advantages"]) for path in ps])

                # make all ps the same length (pad extra advantages with 0)
                obs = [path["observations"] for path in ps]
                obs = tensor_utils.pad_tensor_n(obs, max_path_length)

                if self.algo.center_adv:
                    raw_adv = np.concatenate(
                        [path["advantages"] for path in ps])
                    adv_mean = np.mean(raw_adv)
                    adv_std = np.std(raw_adv) + 1e-8
                    adv = [(path["advantages"] - adv_mean) / adv_std
                           for path in ps]
                else:
                    adv = [path["advantages"] for path in ps]
                adv = np.asarray(
                    [tensor_utils.pad_tensor(a, max_path_length)
                     for a in adv])

                actions = [path["actions"] for path in ps]
                actions = tensor_utils.pad_tensor_n(actions, max_path_length)

                rewards = [path["rewards"] for path in ps]
                rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

                returns = [path["returns"] for path in ps]
                returns = tensor_utils.pad_tensor_n(returns, max_path_length)

                agent_infos = [path["agent_infos"] for path in ps]
                agent_infos = tensor_utils.stack_tensor_dict_list(
                    [tensor_utils.pad_tensor_dict(p, max_path_length)
                     for p in agent_infos])

                env_infos = [path["env_infos"] for path in ps]
                env_infos = tensor_utils.stack_tensor_dict_list(
                    [tensor_utils.pad_tensor_dict(p, max_path_length)
                     for p in env_infos])

                valids = [np.ones_like(path["returns"]) for path in ps]
                valids = tensor_utils.pad_tensor_n(valids, max_path_length)

                average_discounted_return = \
                    np.mean([path["returns"][0] for path in ps])
                undiscounted_returns = [sum(path["rewards"]) for path in ps]
                ent = np.sum(policy.distribution.entropy(agent_infos)
                             * valids) / np.sum(valids)

                samples_data = dict(
                    observations=obs,
                    actions=actions,
                    advantages=adv,
                    rewards=rewards,
                    returns=returns,
                    valids=valids,
                    agent_infos=agent_infos,
                    env_infos=env_infos,
                    ps=ps,
                )

            logger.log("fitting baseline...")
            if hasattr(baseline, 'fit_with_samples'):
                baseline.fit_with_samples(ps, samples_data)
            else:
                baseline.fit(ps)
            logger.log("fitted")

            logger.record_tabular('Iteration', itr)
            logger.record_tabular('AverageDiscountedReturn',
                                  average_discounted_return)
            logger.record_tabular('AverageReturn',
                                  np.mean(undiscounted_returns))
            logger.record_tabular('ExplainedVariance', ev)
            logger.record_tabular('NumTrajs', len(ps))
            logger.record_tabular('Entropy', ent)
            logger.record_tabular('Perplexity', np.exp(ent))
            logger.record_tabular('StdReturn',
                                  np.std(undiscounted_returns))
            logger.record_tabular('MaxReturn',
                                  np.max(undiscounted_returns))
            logger.record_tabular('MinReturn',
                                  np.min(undiscounted_returns))
            processed_samples.append(samples_data)
        return processed_samples

def process_samples(self, itr, paths):
    baselines = []
    returns = []
    # --------------- Updated by myself ---------------
    violation_cost = []
    boundary_violation_cost = []
    succ_rate = 0
    succ_return = []
    # log_liklihood = []

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.algo.baseline.predict(path)
                              for path in paths]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])
        # -------- Updated by myself ----------------
        violation_cost.append(path["violation_cost"])
        boundary_violation_cost.append(path["boundary_violation_cost"])
        # log_liklihood.append(path["log_likelihood"])
        succ_rate += path["succ_rate"]
        if not (path["succ_return"] == 0):
            succ_return.append(path["succ_return"])

    succ_rate = succ_rate / len(paths)

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
            violation_cost=np.array(violation_cost),
            boundary_violation_cost=np.array(boundary_violation_cost),
            success_rate=succ_rate,
            successful_AverageReturn=np.array(succ_return),
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in env_infos])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos)
                     * valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            violation_cost=np.array(violation_cost),
            boundary_violation_cost=np.array(boundary_violation_cost),
            success_rate=succ_rate,
            successful_AverageReturn=np.array(succ_return),
        )

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    analysis_data = dict(
        Iteration=itr,
        AverageDiscountedReturn=average_discounted_return,
        AverageReturn=np.mean(undiscounted_returns),
        ExplainedVariance=ev,
        NumTrajs=len(paths),
        Entropy=ent,
        Perplexity=np.exp(ent),
        StdReturn=np.std(undiscounted_returns),
        MaxReturn=np.max(undiscounted_returns),
        MinReturn=np.min(undiscounted_returns),
    )
    return samples_data, analysis_data

def process_samples(self, itr, paths):
    if not self.initialized:
        self.initialize()
    baselines = []
    returns = []
    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.algo.baseline.predict(path)
                              for path in paths]

    for idx, path in enumerate(paths):
        if hasattr(self.algo, '_kwargs'):
            if self.mode.startswith('inception'):
                if idx % 100 == 0:
                    print("paths", idx)
                imgs = [img[0] for img in path['env_infos']['imgs']
                        if img is not None]
                feat = self.sess.run(self.model[1][self.layer],
                                     {self.image: imgs})
                diff = self.means - feat
                diff[self.std == 0] = 0
                diff = diff**2 / (self.std + 1e-5)
                means = np.mean(diff, axis=(1, 2, 3))
                for j in range(25):
                    path["rewards"][j * 2 + 1] -= means[j] * (j**2)
            elif self.mode == 'oracle':
                path["rewards"] += path["env_infos"]["reward_true"]
            elif self.mode.startswith('ours'):
                imgs = [img for img in path['env_infos']['imgs']
                        if img is not None]
                if not hasattr(self, 'means'):
                    self.means = []
                    self.imgs = []
                    validdata = np.load(self.algo._kwargs['modeldata'])
                    for vp in range(self.nvp):
                        context = imgs[0][vp]
                        timgs = []
                        tfeats = []
                        nvideos = validdata.shape[1]
                        if self.mode == 'oursinception':
                            nvideos = 50
                        for i in range(nvideos):
                            if i % 10 == 0:
                                print("feats", i)
                            skip = 1
                            if self.name == 'real' or self.name == 'sweep':
                                skip = 2
                            if self.mode == 'oursinception':
                                input_img = validdata[::skip, i]
                            else:
                                input_img = ((validdata[::skip, i] + 1)
                                             * 127.5).astype(np.uint8)
                            tfeat, timg = self.sess.run(
                                [self.model.translated_z, self.model.out],
                                {self.image: [input_img,
                                              [context] * self.batch_size,
                                              [context] * self.batch_size]})
                            timgs.append(timg)
                            tfeats.append(tfeat)
                        self.means.append(np.mean(tfeats, axis=0))
                        meanimgs = np.mean(timgs, axis=0)
                        self.imgs.append(meanimgs)
                        # for j in range(25):
                        #     scipy.misc.imsave(
                        #         'test/%d_%d.png' % (vp, j),
                        #         arm_shaping.inverse_transform(meanimgs[j]))
                if idx % 10 == 0:
                    print("feats", idx)
                costs = 0
                for vp in range(self.nvp):
                    curimgs = [img[vp] for img in imgs]
                    feats, image_trans = self.sess.run(
                        [self.model.input_z, self.image_trans],
                        {self.image: [curimgs,
                                      [curimgs[0]] * self.batch_size,
                                      curimgs]})
                    if self.ablation_type == "None":
                        costs += np.sum((self.means[vp] - feats)**2, axis=1) + \
                            self.algo._kwargs['scale'] * np.sum(
                                (self.imgs[vp] - image_trans[0])**2,
                                axis=(1, 2, 3))
                    elif self.ablation_type == "nofeat":
                        costs = self.algo._kwargs['scale'] * np.sum(
                            (self.imgs - image_trans[0])**2, axis=(1, 2, 3))
                    elif self.ablation_type == "noimage":
                        costs = np.sum((self.means - feats)**2, axis=1)
                    elif self.ablation_type == 'recon':
                        # NOTE: image_recon is not defined anywhere in this
                        # excerpt, so this ablation branch would raise a
                        # NameError as written.
                        costs = np.sum((self.means - feats)**2, axis=1) + \
                            self.algo._kwargs['scale'] * np.sum(
                                (image_recon - image_trans[0])**2,
                                axis=(1, 2, 3))
                for j in range(25):
                    path["rewards"][j * 2 + 1] -= costs[j] * (j**2)

        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in env_infos])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos)
                     * valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    if 'reward_true' in paths[0]['env_infos']:
        trues = [sum(path["env_infos"]["reward_true"]) for path in paths]
        logger.record_tabular('ReturnTrue', np.mean(trues))
        logger.record_tabular('MinTrue', np.min(trues))
        logger.record_tabular('MaxTrue', np.max(trues))
        logger.record_tabular('ArgmaxTrueReturn',
                              trues[np.argmax(undiscounted_returns)])
    # logger.record_tabular('Shaping',
    #     np.mean([path["shaping_reward"] for path in paths]))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    return samples_data

def create_samples_dict(self, paths):
    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])
        weights = tensor_utils.concat_tensor_list(
            [path["weights"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            weights=weights,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in env_infos])

        weights = [path["weights"] for path in paths]
        weights = tensor_utils.pad_tensor_n(weights, max_path_length)

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            weights=weights,
            paths=paths,
        )
    return samples_data

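
# --- Added sketch (not in the original source): a minimal stand-in for the
# right-zero-padding that tensor_utils.pad_tensor / pad_tensor_n perform in
# the recurrent branches above (the real utilities also handle dicts).

def _pad_tensor_sketch(x, max_len):
    x = np.asarray(x)
    out = np.zeros((max_len,) + x.shape[1:], dtype=x.dtype)
    out[:len(x)] = x
    return out


def _pad_tensor_n_sketch(xs, max_len):
    return np.stack([_pad_tensor_sketch(x, max_len) for x in xs])


# Example: two paths of lengths 2 and 3, padded to a common length of 3.
# _pad_tensor_n_sketch([np.ones(2), np.ones(3)], 3).shape == (2, 3)
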
def process_samples(self, itr, paths):
    # IMPORTANT:
    # Rewards accrued from a_t to a_t+1 are expected to be discounted by
    # the environment to values at time t
    # paths = list(itertools.chain.from_iterable(paths))
    baselines = []
    returns = []
    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.algo.baseline.predict(path)
                              for path in paths]

    for idx, path in enumerate(paths):
        t_sojourn = path["offset_t_sojourn"]
        gamma = self.algo.discount
        lamda = self.algo.gae_lambda
        discount_gamma = np.exp(-gamma * t_sojourn)
        discount_gamma_lambda = np.exp(-gamma * lamda * t_sojourn)
        path_baselines = np.append(all_path_baselines[idx], 0)
        if len(path["rewards"]) != len(t_sojourn):
            # TODO: handle infinite-horizon games
            pdb.set_trace()
        deltas = path["rewards"] + \
            discount_gamma * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = variable_discount_cumsum(
            deltas, discount_gamma_lambda)
        path["returns"] = variable_discount_cumsum(path["rewards"],
                                                   discount_gamma)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in env_infos])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos)
                     * valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    return samples_data

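
# --- Added sketch (not in the original source): a plausible
# variable_discount_cumsum consistent with its use above, where the discount
# varies per step as exp(-gamma * t_sojourn[t]) in a semi-MDP. The exact
# indexing convention of the real implementation may differ.

def _variable_discount_cumsum_sketch(x, discounts):
    """out[t] = x[t] + discounts[t] * out[t + 1]."""
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discounts[t] * running
        out[t] = running
    return out


# Example: per-step sojourn times of 0.1, 0.5, 0.2 with gamma = 1.0.
# _variable_discount_cumsum_sketch(
#     np.ones(3), np.exp(-1.0 * np.array([0.1, 0.5, 0.2])))
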
def process_samples(self, itr, paths, update_baseline=True):
    baselines = []
    returns = []
    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.algo.baseline.predict(path)
                              for path in paths]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    if hasattr(self.algo, 'epopt_epsilon'):
        if (self.algo.epopt_epsilon < 1.0
                and self.algo.epopt_after_iter <= itr):
            # prune the paths
            # NOTE: this loop compares sorted path *indices* against a path
            # count and pops from the list while iterating; the logic looks
            # fragile and is kept as in the original.
            target_path_size = len(paths) * self.algo.epopt_epsilon
            sorted_indices = np.argsort(
                [path["returns"][0] for path in paths])
            idx = 0
            si_idx = 0
            while True:
                if sorted_indices[si_idx] > target_path_size:
                    paths.pop(idx)
                    idx -= 1
                idx += 1
                si_idx += 1
                if idx >= len(paths):
                    break

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = []
        ct = 0
        for path in paths:
            if path['env_infos']['dyn_model_id'][-1] == 0:
                undiscounted_returns.append(sum(path["rewards"]))
            if path['env_infos']['dyn_model_id'][-1] == 1:
                ct += 1
        print('path count with fake dynamics: ', ct,
              len(undiscounted_returns), len(paths))
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in agent_infos])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in env_infos])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos)
                     * valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    if update_baseline:
        logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    return samples_data

def process_samples(self, itr, paths, prefix='', log=True):
    baselines = []
    returns = []

    for idx, path in enumerate(paths):
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.algo.baseline.predict(path)
                              for path in paths]

    for idx, path in enumerate(paths):
        # GAE: one-step TD residuals, then a discounted cumulative sum
        # with factor discount * gae_lambda.
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(
        np.concatenate(baselines),
        np.concatenate(returns)
    )

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)

        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path["advantages"] for path in paths]

        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    # Fit the baseline only once samples_data has been assembled, since
    # fit_with_samples consumes it; the advantages above are therefore
    # computed with the baseline from the previous iteration.
    if log:
        logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths, log=log)
    if log:
        logger.log("fitted")

    if log:
        # logger.record_tabular('Iteration', itr)
        # logger.record_tabular('AverageDiscountedReturn',
        #                       average_discounted_return)
        logger.record_tabular(prefix + 'AverageReturn',
                              np.mean(undiscounted_returns))
        logger.record_tabular(prefix + 'ExplainedVariance', ev)
        logger.record_tabular(prefix + 'NumTrajs', len(paths))
        logger.record_tabular(prefix + 'Entropy', ent)
        logger.record_tabular(prefix + 'Perplexity', np.exp(ent))
        logger.record_tabular(prefix + 'StdReturn',
                              np.std(undiscounted_returns))
        logger.record_tabular(prefix + 'MaxReturn',
                              np.max(undiscounted_returns))
        logger.record_tabular(prefix + 'MinReturn',
                              np.min(undiscounted_returns))

    return samples_data
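# The recurrent branch above pads every trajectory to max_path_length and
# carries a "valids" mask so that statistics ignore the padding. Below is a
# toy numpy sketch of that masked averaging; pad_to is a hypothetical
# stand-in for tensor_utils.pad_tensor, and the entropy values are made up.
import numpy as np


def pad_to(x, max_len):
    # Right-pad a 1-D array with zeros to length max_len.
    return np.concatenate([x, np.zeros(max_len - len(x))])


entropies = [np.array([0.7, 0.6]), np.array([0.9, 0.8, 0.5])]
max_path_length = max(len(e) for e in entropies)

padded = np.stack([pad_to(e, max_path_length) for e in entropies])
valids = np.stack([pad_to(np.ones(len(e)), max_path_length)
                   for e in entropies])

# Matches np.sum(entropy * valids) / np.sum(valids) above: padded steps
# contribute nothing, so this is the mean over the five real timesteps (0.7).
ent = np.sum(padded * valids) / np.sum(valids)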
def process_samples(self, itr, paths):
    baselines = []
    returns = []

    for idx, path in enumerate(paths):
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        # No baseline in this variant: the discounted returns themselves
        # serve as the advantages (plain REINFORCE-style estimator).
        path["advantages"] = path["returns"]

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)

        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path["advantages"] for path in paths]

        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
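# All of the variants above optionally post-process advantages through
# util.center_advantages and util.shift_advantages_to_positive. The
# one-liners below are a plausible sketch of those helpers, inferred from
# their names and typical rllab usage; they are not the verified
# implementations.
import numpy as np


def center_advantages(advantages):
    # Standardize to zero mean and (approximately) unit variance; the
    # epsilon guards against division by zero for constant advantages.
    return (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)


def shift_advantages_to_positive(advantages):
    # Shift so every advantage is strictly positive, for algorithms that
    # treat advantages as nonnegative weights.
    return (advantages - np.min(advantages)) + 1e-8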