def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
    init_policy_params = cur_policy_params = self.algo.policy.get_param_values()
    if hasattr(self.algo.env, "get_param_values"):
        try:
            cur_env_params = self.algo.env.get_param_values()
        except Exception:
            cur_env_params = None
    else:
        cur_env_params = None
    import time
    start = time.time()
    # Broadcast a single reset_arg (task specification) to all environments.
    if not isinstance(reset_args, (list, np.ndarray)):
        reset_args = [reset_args] * self.n_envs
    if self.algo.policy.all_param_vals:
        cur_policy_params = [
            flatten_tensors(x.values()) for x in self.algo.policy.all_param_vals
        ]
    else:
        cur_policy_params = [cur_policy_params] * self.n_envs
    # Do tasks sequentially and parallelize within rollouts per task.
    paths = {}
    for i in range(self.n_envs):
        paths[i] = parallel_sampler.sample_paths(
            policy_params=cur_policy_params[i],
            env_params=cur_env_params,
            max_samples=self.algo.batch_size // self.n_envs,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
            reset_arg=reset_args[i],
            show_prog_bar=False,
        )
    total_time = time.time() - start
    logger.record_tabular(log_prefix + "TotalExecTime", total_time)
    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())
    # Restore the pre-sampling policy parameters.
    self.algo.policy.set_param_values(init_policy_params)
    # Only whole paths are currently supported (add truncation code here if desired).
    assert self.algo.whole_paths
    return paths
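# Hedged usage sketch (not part of the original code): the two small idioms above,
# broadcasting `reset_args` across environments and flattening the per-task dict of
# path lists, shown with illustrative stand-in values.
import numpy as np

n_envs = 4
reset_args = None
if not isinstance(reset_args, (list, np.ndarray)):
    reset_args = [reset_args] * n_envs  # one (possibly None) task argument per env
assert len(reset_args) == n_envs

paths = {0: ['path_a', 'path_b'], 1: ['path_c'], 2: [], 3: ['path_d']}
flatten_list = lambda l: [item for sublist in l for item in sublist]
print(flatten_list(paths.values()))  # ['path_a', 'path_b', 'path_c', 'path_d']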
def eval_loss_grad(params):
    self.policy.set_param_values(params, trainable=True)
    grad = f_loss_grad(*input)
    # Flatten the per-parameter gradients into a single double-precision vector.
    flattened_grad = tensor_utils.flatten_tensors(list(map(np.asarray, grad)))
    return flattened_grad.astype(np.float64)
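# Hedged sketch (an assumption, not the original optimizer): a callable of this shape,
# flat parameter vector in, float64 loss/gradient out, is what scipy's L-BFGS-B
# interface expects. The quadratic below is a toy stand-in for the real policy loss.
import numpy as np
import scipy.optimize

target = np.array([1.0, -2.0, 0.5])

def toy_loss(params):
    return float(np.sum((params - target) ** 2))

def toy_loss_grad(params):
    return (2.0 * (params - target)).astype(np.float64)

x_opt, f_opt, info = scipy.optimize.fmin_l_bfgs_b(
    toy_loss, np.zeros(3), fprime=toy_loss_grad)
print(x_opt)  # approximately [1.0, -2.0, 0.5]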
def get_param_values(self, all_params=False, **tags):
    params = self.get_params(all_params, **tags)
    # Evaluate the TF variables in the default session and return one flat vector.
    param_values = tf.get_default_session().run(params)
    return flatten_tensors(param_values)
def get_param_values(self, **tags):
    return flatten_tensors([
        param.get_value(borrow=True) for param in self.get_params(**tags)
    ])
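# Hedged sketch of flatten_tensors (assumed behavior, in the style of rllab's
# tensor_utils): both get_param_values variants above return a single flat numpy
# vector formed by concatenating every parameter array.
import numpy as np

def flatten_tensors(tensors):
    # illustrative implementation only
    return np.concatenate([np.asarray(t).flatten() for t in tensors])

W = np.ones((2, 3))
b = np.zeros(3)
print(flatten_tensors([W, b]).shape)  # (9,)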