def _fit_baseline(self, samples_data):
    """Update baselines from samples.

    Args:
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    """
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Augment reward from baselines
    rewards_tensor = self._f_rewards(*policy_opt_input_values)
    returns_tensor = self._f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor, -1)

    paths = samples_data['paths']
    valids = samples_data['valids']
    baselines = [path['baselines'] for path in paths]

    # Recompute parts of samples_data
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                   paths):
        path['rewards'] = rew[val.astype(bool)]
        path['returns'] = ret[val.astype(bool)]
        aug_rewards.append(path['rewards'])
        aug_returns.append(path['returns'])
    aug_rewards = concat_tensor_list(aug_rewards)
    aug_returns = concat_tensor_list(aug_returns)
    samples_data['rewards'] = aug_rewards
    samples_data['returns'] = aug_returns

    # Calculate explained variance
    ev = np_tensor_utils.explained_variance_1d(np.concatenate(baselines),
                                               aug_returns)
    tabular.record('{}/ExplainedVariance'.format(self._baseline.name), ev)

    # Fit baseline
    logger.log('Fitting baseline...')
    if hasattr(self._baseline, 'fit_with_samples'):
        self._baseline.fit_with_samples(paths, samples_data)
    else:
        self._baseline.fit(paths)
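
# `explained_variance_1d` above comes from an external helper module. For
# reference, here is a minimal sketch of the conventional rllab/garage-style
# definition, 1 - Var(y - y_pred) / Var(y); the handling of the
# constant-target corner case is an assumption of this sketch, and the name
# is illustrative only.
def _explained_variance_1d_sketch(ypred, y):
    """Fraction of the variance of y explained by predictions ypred.

    1.0 is a perfect fit, 0.0 is no better than predicting the mean,
    and negative values are worse than predicting the mean.
    """
    vary = np.var(y)
    if np.isclose(vary, 0):
        # Constant targets: perfect only if predictions are constant too.
        return 1.0 if np.isclose(np.var(ypred), 0) else 0.0
    return 1 - np.var(y - ypred) / vary
# e.g. _explained_variance_1d_sketch(np.concatenate(baselines), aug_returns)
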
def evaluate(self, policy_opt_input_values, samples_data):
    """Evaluate rewards and everything else.

    Args:
        policy_opt_input_values (list[np.ndarray]): Flattened
            policy optimization input values.
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    Returns:
        dict: Processed sample data.

    """
    # pylint: disable=too-many-statements
    # Augment reward from baselines
    rewards_tensor = self._f_rewards(*policy_opt_input_values)
    returns_tensor = self._f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor, -1)

    paths = samples_data['paths']
    valids = samples_data['valids']
    baselines = [path['baselines'] for path in paths]
    env_rewards = [path['rewards'] for path in paths]
    env_rewards = concat_tensor_list(env_rewards.copy())
    env_returns = [path['returns'] for path in paths]
    env_returns = concat_tensor_list(env_returns.copy())
    env_average_discounted_return = np.mean(
        [path['returns'][0] for path in paths])

    # Recompute parts of samples_data
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                   paths):
        path['rewards'] = rew[val.astype(bool)]
        path['returns'] = ret[val.astype(bool)]
        aug_rewards.append(path['rewards'])
        aug_returns.append(path['returns'])
    aug_rewards = concat_tensor_list(aug_rewards)
    aug_returns = concat_tensor_list(aug_returns)
    samples_data['rewards'] = aug_rewards
    samples_data['returns'] = aug_returns

    # Calculate effect of the entropy terms
    d_rewards = np.mean(aug_rewards - env_rewards)
    tabular.record('{}/EntRewards'.format(self.policy.name), d_rewards)

    aug_average_discounted_return = np.mean(
        [path['returns'][0] for path in paths])
    d_returns = np.mean(aug_average_discounted_return -
                        env_average_discounted_return)
    tabular.record('{}/EntReturns'.format(self.policy.name), d_returns)

    # Calculate explained variance
    ev = np_tensor_utils.explained_variance_1d(np.concatenate(baselines),
                                               aug_returns)
    tabular.record('{}/ExplainedVariance'.format(self._baseline.name), ev)

    inference_rmse = (samples_data['trajectory_infos']['mean'] -
                      samples_data['latents'])**2.
    inference_rmse = np.sqrt(inference_rmse.mean())
    tabular.record('Inference/RMSE', inference_rmse)

    inference_rrse = np_tensor_utils.rrse(
        samples_data['latents'], samples_data['trajectory_infos']['mean'])
    tabular.record('Inference/RRSE', inference_rrse)

    embed_ent = self._f_encoder_entropy(*policy_opt_input_values)
    tabular.record('{}/Encoder/Entropy'.format(self.policy.name),
                   embed_ent)

    infer_ce = self._f_inference_ce(*policy_opt_input_values)
    tabular.record('Inference/CrossEntropy', infer_ce)

    pol_ent = self._f_policy_entropy(*policy_opt_input_values)
    pol_ent = np.sum(pol_ent) / np.sum(samples_data['valids'])
    tabular.record('{}/Entropy'.format(self.policy.name), pol_ent)

    task_ents = self._f_task_entropies(*policy_opt_input_values)
    tasks = samples_data['tasks'][:, 0, :]
    _, task_indices = np.nonzero(tasks)
    path_lengths = np.sum(samples_data['valids'], axis=1)
    for t in range(self.policy.task_space.flat_dim):
        lengths = path_lengths[task_indices == t]
        completed = lengths < self.max_path_length
        pct_completed = np.mean(completed)
        tabular.record('Tasks/EpisodeLength/t={}'.format(t),
                       np.mean(lengths))
        tabular.record('Tasks/CompletionRate/t={}'.format(t),
                       pct_completed)
        tabular.record('Tasks/Entropy/t={}'.format(t), task_ents[t])

    return samples_data
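
# `np_tensor_utils.rrse` above records the root relative squared error of
# the inference network's latent predictions. A minimal sketch of the
# standard definition (prediction RMSE normalized by the RMSE of a
# predict-the-mean baseline); the exact library implementation may differ,
# and the name here is illustrative only.
def _rrse_sketch(actual, predicted):
    """Root relative squared error.

    0.0 is a perfect fit; 1.0 matches a constant mean predictor;
    larger values are worse than predicting the mean.
    """
    return np.sqrt(
        np.sum((actual - predicted)**2) /
        np.sum((actual - np.mean(actual))**2))
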
def process_samples(self, itr, paths):
    """Compute baselines, advantages, returns, and padded trajectory
    tensors from sampled paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): Sampled paths.

    Returns:
        dict: Processed sample data.

    """
    baselines = []
    returns = []

    max_path_length = self.algo.max_path_length
    action_space = self.algo.env.action_space
    observation_space = self.algo.env.observation_space

    if hasattr(self.algo.baseline, 'predict_n'):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = (path['rewards'] +
                  self.algo.discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path['deltas'] = deltas

    # Calculate trajectory tensors (TODO: probably can do this in TF)
    for idx, path in enumerate(paths):
        # Baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # Returns
        path['returns'] = special.discount_cumsum(path['rewards'],
                                                  self.algo.discount)
        returns.append(path['returns'])

        # Calculate trajectory samples
        #
        # Pad and flatten action and observation traces
        act = tensor_utils.pad_tensor(path['actions'], max_path_length)
        obs = tensor_utils.pad_tensor(path['observations'],
                                      max_path_length)
        act_flat = action_space.flatten_n(act)
        obs_flat = observation_space.flatten_n(obs)

        # Create a time series of stacked [act, obs] vectors
        # XXX: the inference network currently only looks at obs vectors
        # act_obs = np.concatenate([act_flat, obs_flat], axis=1)
        # TODO: reactivate [act, obs] stacking for harder envs?
        act_obs = obs_flat
        # act_obs = act_flat

        # Calculate a forward-looking sliding window of the stacked
        # vectors.
        #
        # If act_obs has shape (n, d), then trajs will have shape
        # (n, window, d).
        #
        # The length of the sliding window is determined by the
        # trajectory inference spec. We smear the last few elements to
        # preserve the time dimension.
        window = self.algo.inference.input_space.shape[0]
        trajs = sliding_window(act_obs, window, 1, smear=True)
        trajs_flat = self.algo.inference.input_space.flatten_n(trajs)
        path['trajectories'] = trajs_flat

        # Trajectory infos
        _, traj_infos = self.algo.inference.get_latents(trajs)
        path['trajectory_infos'] = traj_infos

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    # DEBUG: CPU vars ####################################################
    cpu_adv = tensor_utils.concat_tensor_list(
        [path['advantages'] for path in paths])
    cpu_deltas = tensor_utils.concat_tensor_list(
        [path['deltas'] for path in paths])
    cpu_act = tensor_utils.concat_tensor_list(
        [path['actions'] for path in paths])
    cpu_obs = tensor_utils.concat_tensor_list(
        [path['observations'] for path in paths])
    cpu_agent_infos = tensor_utils.concat_tensor_dict_list(
        [path['agent_infos'] for path in paths])

    if self.algo.center_adv:
        cpu_adv = utils.center_advantages(cpu_adv)

    if self.algo.positive_adv:
        cpu_adv = utils.shift_advantages_to_positive(cpu_adv)
    ######################################################################

    # Make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    tasks = [path['tasks'] for path in paths]
    tasks = tensor_utils.pad_tensor_n(tasks, max_path_length)

    tasks_gt = [path['tasks_gt'] for path in paths]
    tasks_gt = tensor_utils.pad_tensor_n(tasks_gt, max_path_length)

    latents = [path['latents'] for path in paths]
    latents = tensor_utils.pad_tensor_n(latents, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    trajectories = tensor_utils.stack_tensor_list(
        [path['trajectories'] for path in paths])

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    latent_infos = [path['latent_infos'] for path in paths]
    latent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in latent_infos
    ])

    trajectory_infos = [path['trajectory_infos'] for path in paths]
    trajectory_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in trajectory_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    average_discounted_return = np.mean(
        [path['returns'][0] for path in paths])
    undiscounted_returns = [sum(path['rewards']) for path in paths]

    ent = np.sum(
        self.algo.policy.distribution.entropy(agent_infos) *
        valids) / np.sum(valids)

    samples_data = dict(
        observations=obs,
        actions=actions,
        tasks=tasks,
        latents=latents,
        trajectories=trajectories,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        agent_infos=agent_infos,
        latent_infos=latent_infos,
        trajectory_infos=trajectory_infos,
        env_infos=env_infos,
        paths=paths,
        cpu_adv=cpu_adv,  # DEBUG
        cpu_deltas=cpu_deltas,  # DEBUG
        cpu_obs=cpu_obs,  # DEBUG
        cpu_act=cpu_act,  # DEBUG
        cpu_agent_infos=cpu_agent_infos,  # DEBUG
    )

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
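
# process_samples() leans on two helpers defined elsewhere. The sketches
# below show their assumed semantics; the names and signatures here are
# illustrative, not the canonical implementations.
#
# `special.discount_cumsum(x, discount)` is the standard rllab-style
# discounted cumulative sum, y[t] = sum_k discount**k * x[t + k],
# conventionally implemented with scipy's lfilter:
def _discount_cumsum_sketch(x, discount):
    """Discounted cumulative sum along the time axis of x."""
    from scipy import signal
    # Filtering the reversed signal evaluates the backward recursion
    # y[t] = x[t] + discount * y[t + 1] in one vectorized pass.
    return signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

# `sliding_window(act_obs, window, 1, smear=True)` builds the
# forward-looking windows described in the comment above: an (n, d) series
# becomes an (n, window, d) stack. Treating the third argument as a step
# size and smear=True as repeat-the-last-row padding is an assumption of
# this sketch.
def _sliding_window_sketch(t, window, step=1, smear=False):
    """Stack forward-looking windows over the first axis of t."""
    n = t.shape[0]
    if smear:
        # Repeat the final row so trailing windows keep full length,
        # preserving the time dimension of the output.
        t = np.concatenate([t, np.repeat(t[-1:], window - 1, axis=0)])
    last = n if smear else n - window + 1
    return np.stack([t[i:i + window] for i in range(0, last, step)])
# With window taken from inference.input_space.shape[0], every timestep
# then maps to a fixed-size trajectory snippet for the inference network.
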