Example #1
 def obtain_samples(self, itr):
     # print("obtain samples in batch_polopt")
     cur_params = self.algo.policy.get_param_values()  # a list of numbers
     try:
         cur_low_params = self.algo.low_policy.get_param_values()
         # env_params = cur_low_params if self.algo.train_low else None # need to reset low policy only when training low!
         paths = parallel_sampler.sample_paths(
             policy_params=cur_params,
             low_policy_params=
             cur_low_params,  # low policy params as env params!
             env_params=[self.algo.env.time_steps_agg,
                         self.algo],  # the parameters to recover for env!
             max_samples=self.algo.batch_size,
             max_path_length=self.algo.max_path_length,
             scope=self.algo.scope,
         )
     except AttributeError:
         paths = parallel_sampler.sample_paths(
             policy_params=cur_params,
             max_samples=self.algo.batch_size,
             max_path_length=self.algo.max_path_length,
             scope=self.algo.scope,
         )
     if self.algo.whole_paths:  # this line is run (whole path)
         return paths
     else:
         paths_truncated = parallel_sampler.truncate_paths(
             paths, self.algo.batch_size)
         return paths_truncated
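All of the obtain_samples variants on this page end with the same contract: if algo.whole_paths is set, the rollouts are returned untouched, otherwise parallel_sampler.truncate_paths trims them so the total number of timesteps does not exceed batch_size. The helper's source is not shown here; the following is only a sketch of the truncation idea, assuming nested dicts such as agent_infos/env_infos are sliced along with the flat per-step arrays.

    def _cut(value, keep):
        # Slice arrays, recursing into nested dicts (agent_infos, env_infos).
        if isinstance(value, dict):
            return {k: _cut(v, keep) for k, v in value.items()}
        return value[:keep]

    def truncate_paths_sketch(paths, max_samples):
        # Keep whole paths while they fit, then cut the last one so the
        # total number of timesteps is exactly max_samples.
        truncated, total = [], 0
        for path in paths:
            n = len(path["rewards"])
            if total + n < max_samples:
                truncated.append(path)
                total += n
            else:
                keep = max_samples - total
                truncated.append({k: _cut(v, keep) for k, v in path.items()})
                break
        return truncated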
Example #2
    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
        init_policy_params = cur_policy_params = self.algo.policy.get_param_values()
        if hasattr(self.algo.env,"get_param_values"):
            try:
                cur_env_params = self.algo.env.get_param_values()
            except:
                cur_env_params = None
        else:
            cur_env_params = None
        import time
        start = time.time()
        # first, a naive implementation.
        if type(reset_args) != list and type(reset_args)!=np.ndarray:
            reset_args = [reset_args]*self.n_envs
        if self.algo.policy.all_param_vals:
            cur_policy_params = [flatten_tensors(x.values()) for x in self.algo.policy.all_param_vals]
        else:
            cur_policy_params = [cur_policy_params]*self.n_envs
        # assume that n_envs = num parallel
        if self.n_envs == parallel_sampler.singleton_pool.n_parallel:
            raise NotImplementedError('this implementation is buggy.')
            # 1 thread per env
            paths = parallel_sampler.sample_paths(
                policy_params=cur_policy_params,
                env_params=cur_env_params,
                max_samples=self.algo.batch_size,
                max_path_length=self.algo.max_path_length,
                scope=self.algo.scope,
                reset_arg=reset_args,
                show_prog_bar=True,
                multi_task=True,
            )
        else:
            # do tasks sequentially and parallelize within rollouts per task.
            paths = {}
            for i in range(self.n_envs):
                paths[i] = parallel_sampler.sample_paths(
                    policy_params=cur_policy_params[i],
                    env_params=cur_env_params,
                    max_samples=self.algo.batch_size / self.n_envs,
                    max_path_length=self.algo.max_path_length,
                    scope=self.algo.scope,
                    reset_arg=reset_args[i],
                    show_prog_bar=False,
                )
        total_time = time.time() - start
        logger.record_tabular(log_prefix+"TotalExecTime", total_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())

        self.algo.policy.set_param_values(init_policy_params)

        # currently don't support not whole paths (if desired, truncate paths)
        assert self.algo.whole_paths

        return paths
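Example #2 converts each task's post-update parameter dictionary into a flat vector with flatten_tensors before handing it to the workers, because the workers restore parameters via set_param_values, which expects the same flat layout that get_param_values returns. A minimal sketch of that flattening step (an assumption about flatten_tensors' behavior, not its actual source):

    import numpy as np

    def flatten_tensors_sketch(tensors):
        # Concatenate every parameter array into one flat vector, matching the
        # format exchanged by get_param_values()/set_param_values().
        return np.concatenate([np.asarray(t).reshape(-1) for t in tensors])

    # e.g. flat = flatten_tensors_sketch(task_param_vals.values()), where
    # task_param_vals stands for one entry of policy.all_param_vals (illustrative name).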
Example #3
    def obtain_samples(self, itr):
        cur_params = self.algo.policy.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_params,
            max_samples=self.algo.batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )

        for path in paths:
            logli = self.algo.policy.distribution.log_likelihood(path["actions"],path["agent_infos"])
            path["log_likelihood"] = logli


        if not(self.algo.all_paths):
            paths = local_truncate_paths(paths, self.algo.batch_size)

        self.env_interacts = sum([len(path["rewards"]) for path in paths])
        self.total_env_interacts += self.env_interacts
        self.mean_path_len = float(self.env_interacts)/len(paths)

        self.experience_replay.append(paths)
        self.env_interacts_memory.append(self.env_interacts)
        if len(self.experience_replay) > self.algo.batch_aggregate_n:
            self.experience_replay.pop(0)
            self.env_interacts_memory.pop(0)

        return paths
Example #4
 def obtain_samples(self,
                    dyn_model=None,
                    itr=None,
                    policy=None,
                    rau=None,
                    delta=0,
                    constraint_fn=None,
                    constraint_cost_fn=None,
                    HCMPC_Activation=False,
                    Constrained=False):
     cur_params = self.algo.policy.get_param_values()
     paths = parallel_sampler.sample_paths(
         policy_params=cur_params,
         max_samples=self.algo.batch_size,
         dyn_model=dyn_model,
         max_path_length=self.algo.max_path_length,
         scope=self.algo.scope,
         policy=policy,
         rau=rau,
         delta=delta,
         constraint_fn=constraint_fn,
         constraint_cost_fn=constraint_cost_fn,
         HCMPC_Activation=HCMPC_Activation,
         Constrained=Constrained,
     )
     if self.algo.whole_paths:
         return paths
     else:
         paths_truncated = parallel_sampler.truncate_paths(
             paths, self.algo.batch_size)
         return paths_truncated
Example #5
    def obtain_samples(self, itr):
        cur_params = self.algo.policy.get_param_values()
        paths = parallel_sampler.sample_paths(
            policy_params=cur_params,
            max_samples=self.algo.batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )

        """log_likelihoods for importance sampling"""
        for path in paths:
            logli = self.algo.policy.distribution.log_likelihood(path["actions"],path["agent_infos"])
            path["log_likelihood"] = logli


        """keep data use per iteration approximately fixed"""
        if not(self.algo.all_paths):
            paths = local_truncate_paths(paths, self.algo.batch_size)

        """keep track of path length"""
        self.env_interacts = sum([len(path["rewards"]) for path in paths])
        self.total_env_interacts += self.env_interacts
        self.mean_path_len = float(self.env_interacts)/len(paths)

        """manage experience replay for old batch reuse"""
        self.experience_replay.append(paths)
        self.env_interacts_memory.append(self.env_interacts)
        if len(self.experience_replay) > self.algo.batch_aggregate_n:
            self.experience_replay.pop(0)
            self.env_interacts_memory.pop(0)

        return paths
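Examples #3 and #5 store a per-path log_likelihood precisely so that old batches kept in experience_replay can be reweighted when the policy changes. The reuse code is not part of these snippets; a minimal sketch of the importance-sampling weight this enables, where the new log-likelihood is assumed to come from evaluating the current policy on the stored path:

    import numpy as np

    def importance_weights(path, new_log_likelihood):
        # Per-timestep likelihood ratios pi_new(a|s) / pi_old(a|s); the old
        # log-likelihood is the value recorded at collection time.
        return np.exp(new_log_likelihood - path["log_likelihood"])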
Example #6
 def obtain_samples(self, itr):
     paths = parallel_sampler.sample_paths(
         policy_params=None,
         max_samples=self.algo.batch_size,
         max_path_length=self.algo.max_path_length,
         scope=self.algo.scope,
     )
     return paths
Example #7
    def obtain_samples(self,
                       itr,
                       reset_args=None,
                       return_dict=False,
                       log_prefix=''):
        init_policy_params_list = cur_policy_params_list = [
            policy.get_param_values() for policy in self.algo.policy_list
        ]
        if hasattr(self.algo.env, "get_param_values"):
            try:
                cur_env_params = self.algo.env.get_param_values()
            except:
                cur_env_params = None
        else:
            cur_env_params = None
        import time
        start = time.time()
        if type(reset_args) != list and type(reset_args) != np.ndarray:
            reset_args = [reset_args] * self.n_envs
        if self.algo.policy_list[0].all_param_vals is not None:
            cur_policy_params_list = [[
                flatten_tensors(x.values()) for x in policy.all_param_vals
            ] for policy in self.algo.policy_list]
        else:
            cur_policy_params_list = [
                [cur_policy_params] * self.n_envs
                for cur_policy_params in cur_policy_params_list
            ]
        # do tasks sequentially and parallelize within rollouts per task.
        paths = {}
        for n in range(len(self.algo.policy_list)):
            for i in range(self.n_envs):
                paths[str(n) + "_" + str(i)] = parallel_sampler.sample_paths(
                    policy_params=cur_policy_params_list[n][i],
                    env_params=cur_env_params,
                    max_samples=self.algo.batch_size / self.n_envs,
                    max_path_length=self.algo.max_path_length,
                    scope=self.algo.scope,
                    reset_arg=reset_args[i],
                    show_prog_bar=False,
                )
        total_time = time.time() - start
        logger.record_tabular(log_prefix + "TotalExecTime", total_time)

        if not return_dict:
            flatten_list = lambda l: [
                item for sublist in l for item in sublist
            ]
            paths = flatten_list(paths.values())

        for n in range(len(self.algo.policy_list)):
            self.algo.policy_list[n].set_param_values(
                init_policy_params_list[n])

        # currently don't support not whole paths (if desired, add code to truncate paths)
        assert self.algo.whole_paths

        return paths
Example #8
    def evaluate(self, steps, pool):
        logger.log("Collecting samples for evaluation")
        paths = parallel_sampler.sample_paths(
            policy_params=self.policy.get_param_values(),
            max_samples=self.eval_samples,
            max_path_length=self.max_path_length,
        )

        average_discounted_return = np.mean([
            special.discount_return(path["rewards"], self.discount)
            for path in paths
        ])

        returns = [sum(path["rewards"]) for path in paths]

        all_qs = np.concatenate(self.q_averages)
        all_ys = np.concatenate(self.y_averages)

        average_q_loss = np.mean(self.qf_loss_averages)
        average_policy_surr = np.mean(self.policy_surr_averages)
        average_action = np.mean(
            np.square(np.concatenate([path["actions"] for path in paths])))

        policy_reg_param_norm = np.linalg.norm(
            self.policy.get_param_values(regularizable=True))
        qfun_reg_param_norm = np.linalg.norm(
            self.qf.get_param_values(regularizable=True))

        logger.record_tabular('steps', steps)
        logger.record_tabular('AverageReturn', np.mean(returns))
        logger.record_tabular('StdReturn', np.std(returns))
        logger.record_tabular('MaxReturn', np.max(returns))
        logger.record_tabular('MinReturn', np.min(returns))
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageQLoss', average_q_loss)
        logger.record_tabular('AveragePolicySurr', average_policy_surr)
        logger.record_tabular('AverageQ', np.mean(all_qs))
        logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
        logger.record_tabular('AverageY', np.mean(all_ys))
        logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
        logger.record_tabular('AverageAbsQYDiff',
                              np.mean(np.abs(all_qs - all_ys)))
        logger.record_tabular('AverageAction', average_action)

        logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm)
        logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm)

        self.env.log_diagnostics(paths)
        self.policy.log_diagnostics(paths)

        self.qf_loss_averages = []
        self.policy_surr_averages = []

        self.q_averages = []
        self.y_averages = []
        self.es_path_returns = []
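Example #8 and the other evaluate methods further down (e.g. #13, #28) summarize evaluation rollouts with special.discount_return. A short sketch of the quantity being computed, assuming the standard definition sum_t discount^t * r_t:

    import numpy as np

    def discount_return_sketch(rewards, discount):
        # Discounted return of one path: r_0 + discount*r_1 + discount^2*r_2 + ...
        rewards = np.asarray(rewards, dtype=float)
        return float(np.sum(rewards * discount ** np.arange(len(rewards))))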
Example #9
 def obtain_samples(self, itr):
     cur_params = self.policy.get_param_values()
     paths = parallel_sampler.sample_paths(
         policy_params=cur_params,
         max_samples=self.batch_size,
         max_path_length=self.max_path_length,
     )
     if self.whole_paths:
         return paths
     else:
         paths_truncated = parallel_sampler.truncate_paths(paths, self.batch_size)
         return paths_truncated
Example #10
 def obtain_samples(self, itr, env_params=None):
     all_paths = []
     for action_sequence in self.action_sequences:
         paths = parallel_sampler.sample_paths(
             policy_params=action_sequence,
             max_samples=self.n_traj,
             max_path_length=action_sequence.shape[0],
             scope=self.scope,
             count_traj=True,
             terminate_only_max_path=True,
             env_params=env_params)
         # truncate the paths if we collected more than self.n_traj
         all_paths += paths[:self.n_traj]
     return all_paths
Example #11
    def obtain_samples(self,
                       itr,
                       reset_args=None,
                       policy_contexts=None,
                       return_dict=False):
        init_policy_params = cur_policy_params = self.algo.policy.get_param_values(
        )
        if hasattr(self.algo.env, "get_param_values"):
            try:
                cur_env_params = self.algo.env.get_param_values()
            except:
                cur_env_params = None
        else:
            cur_env_params = None
        import time
        start = time.time()
        if type(reset_args) != list and type(reset_args) != np.ndarray:
            reset_args = [reset_args] * self.n_envs
        if type(policy_contexts) != list and type(
                policy_contexts) != np.ndarray:
            policy_contexts = [policy_contexts] * self.n_envs
        cur_policy_params = [cur_policy_params] * self.n_envs
        # do tasks sequentially and parallelize within rollouts per task.
        paths = {}
        for i in range(self.n_envs):
            paths[i] = parallel_sampler.sample_paths(
                policy_params=cur_policy_params[i],
                env_params=cur_env_params,
                max_samples=self.algo.batch_size / self.n_envs,
                max_path_length=self.algo.max_path_length,
                scope=self.algo.scope,
                reset_arg=reset_args[i],
                policy_context=policy_contexts[i],
                show_prog_bar=False,
            )
        total_time = time.time() - start
        logger.record_tabular("TotalExecTime", total_time)

        if not return_dict:
            flatten_list = lambda l: [
                item for sublist in l for item in sublist
            ]
            paths = flatten_list(paths.values())

        self.algo.policy.set_param_values(init_policy_params)

        # currently don't support not whole paths (if desired, add code to truncate paths)
        assert self.algo.whole_paths

        return paths
Example #12
    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix='',extra_input=None,extra_input_dim=None,  save_img_obs=False, preupdate=True):
        if extra_input is not None:
            assert False, "not implemented"
        if not preupdate:
            assert False, "not implemented"
        init_policy_params = cur_policy_params = self.algo.policy.get_param_values()
        if hasattr(self.algo.env,"get_param_values"):
            try:
                cur_env_params = self.algo.env.get_param_values()
            except:
                cur_env_params = None
        else:
            cur_env_params = None
        import time
        start = time.time()
        if type(reset_args) != list and type(reset_args)!=np.ndarray:
            reset_args = [reset_args]*self.n_envs
        if hasattr(self.algo.policy, 'all_param_vals'): #TODO: RK, need to make this less hacky and still work with non-maml policies
            if self.algo.policy.all_param_vals:
                cur_policy_params = [flatten_tensors(x.values()) for x in self.algo.policy.all_param_vals]
            else:
                cur_policy_params = [cur_policy_params]*self.n_envs
        else:
            cur_policy_params = [cur_policy_params]*self.n_envs
        # do tasks sequentially and parallelize within rollouts per task.
        paths = {}
        for i in range(self.n_envs):
            paths[i] = parallel_sampler.sample_paths(
                policy_params=cur_policy_params[i],
                env_params=cur_env_params,
                max_samples=self.algo.batch_size / self.n_envs,
                max_path_length=self.algo.max_path_length,
                scope=self.algo.scope,
                reset_arg=reset_args[i],
                show_prog_bar=False,
            )
        total_time = time.time() - start
        logger.record_tabular(log_prefix+"TotalExecTime", total_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())

        self.algo.policy.set_param_values(init_policy_params)

        # currently don't support not whole paths (if desired, add code to truncate paths)
        assert self.algo.whole_paths

        return paths
Example #13
File: dqn.py  Project: leduckhc/rllab
    def evaluate(self, epoch, pool):
        logger.log('Collecting samples for evaluation')

        paths = parallel_sampler.sample_paths(
            policy_params=self.policy.get_param_values(),
            max_samples=self.eval_max_samples,
            max_path_length=self.eval_max_path_length,
        )

        average_discounted_return = np.mean(
            [special.discount_return(path['rewards'], self.discount) for path in paths]
        )
        returns = [sum(path['rewards']) for path in paths]

        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('AverageReturn', np.mean(returns))
        logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
        logger.record_tabular('StdReturn', np.std(returns))
        logger.record_tabular('MaxReturn', np.max(returns))
        logger.record_tabular('MinReturn', np.min(returns))

        if len(self.es_path_returns) > 0:
            logger.record_tabular('AverageEsReturn',
                                  np.mean(self.es_path_returns))
            logger.record_tabular('StdEsReturn',
                                  np.std(self.es_path_returns))
            logger.record_tabular('MaxEsReturn',
                                  np.max(self.es_path_returns))
            logger.record_tabular('MinEsReturn',
                                  np.min(self.es_path_returns))
            logger.record_tabular('AverageEsPathLength',
                                  np.mean(self.es_path_length))

        logger.record_tabular('AverageQLoss', np.mean(self.qf_loss_averages))

        all_qs = np.concatenate(self.qs_averages)
        all_ys = np.concatenate(self.ys_averages)

        logger.record_tabular('AverageQ', np.mean(all_qs))
        logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
        logger.record_tabular('AverageY', np.mean(all_ys))
        logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))

        self.qf_loss_averages = []
        self.qs_averages = []
        self.ys_averages = []

        self.es_path_length = []
        self.es_path_returns = []
Example #14
File: batch_polopt.py  Project: zizai/EMI
 def obtain_samples(self, itr):
     cur_params = self.algo.policy.get_param_values()
     paths = parallel_sampler.sample_paths(
         policy_params=cur_params,
         max_samples=self.algo.batch_size,
         include_original_frames=True,
         max_path_length=self.algo.max_path_length,
         scope=self.algo.scope,
     )
     if self.algo.whole_paths:
         return paths
     else:
         paths_truncated = parallel_sampler.truncate_paths(
             paths, self.algo.batch_size)
         return paths_truncated
Example #15
 def obtain_samples(self, itr):
     cur_params = self.algo.policy.get_param_values()
     paths = parallel_sampler.sample_paths(
         policy_params=cur_params,  # TODO - can I just pass in new parameters here? (the updated ones?)
         max_samples=self.algo.batch_size,
         max_path_length=self.algo.max_path_length,
         scope=self.algo.scope,
     )
     # TODO - does the optimizer assume that the paths came from a policy with params cur_params?
     # Or can I just pass in cur_params - alpha*grads?
     if self.algo.whole_paths:
         return paths
     else:
         paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
         return paths_truncated
Example #16
File: BP.py  Project: hl00/maml_rl
 def obtain_samples(self, itr):
     cur_params = self.algo.policy.get_param_values()
     paths = parallel_sampler.sample_paths(
         policy_params=cur_params,  # TODO - can I just pass in new parameters here? (the updated ones?)
         max_samples=self.algo.batch_size,
         max_path_length=self.algo.max_path_length,
         scope=self.algo.scope,
     )
     # TODO - does the optimizer assume that the paths came from a policy with params cur_params?
     # Or can I just pass in cur_params - alpha*grads?
     if self.algo.whole_paths:
         return paths
     else:
         paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
         return paths_truncated
Example #17
 def obtain_samples(self, itr):
     cur_pro_params = self.algo.pro_policy.get_param_values()
     cur_adv_params = self.algo.adv_policy.get_param_values()
     paths = parallel_sampler.sample_paths(
         pro_policy_params=cur_pro_params,
         max_samples=self.algo.batch_size,
         max_path_length=self.algo.max_path_length,
         scope=self.algo.scope,
         adv_policy_params=cur_adv_params
     )
     if self.algo.whole_paths:
         return paths
     else:
         paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
         return paths_truncated
Example #18
 def obtain_samples(self, itr, determ=False):
     cur_policy_params = self.algo.policy.get_param_values()
     cur_env_params = self.algo.env.get_param_values()
     paths = parallel_sampler.sample_paths(
         policy_params=cur_policy_params,
         env_params=cur_env_params,
         max_samples=self.algo.batch_size,
         max_path_length=self.algo.max_path_length,
         scope=self.algo.scope,
     )
     if self.algo.whole_paths:
         return paths
     else:
         paths_truncated = parallel_sampler.truncate_paths(
             paths, self.algo.batch_size)
         return paths_truncated
Example #19
 def train(self):
     parallel_sampler.populate_task(self.env, self.policy, self.scope)
     obs = self.env.reset()
     for i in range(10):
         logger.log("Epoch % d" % i)
         for _ in range(100):
             action, _ = self.policy.get_action(obs)
             next_obs, rew, done, info = self.env.step(action)
             obs = next_obs if not done else self.env.reset()
         logger.log("Evaluating...")
         paths = parallel_sampler.sample_paths(
             policy_params=self.policy.get_param_values(),
             max_samples=20,
             max_path_length=100,
         )
     parallel_sampler.terminate_task()
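Example #19 also shows the surrounding lifecycle the other snippets rely on: populate_task registers the environment and policy with the worker pool before any sample_paths call, and terminate_task tears the workers down afterwards. The scope argument that recurs throughout is what keeps several algorithms apart when they share the pool; a hedged sketch of that usage (env_a, policy_a, env_b, policy_b are placeholders for already-constructed rllab environments and policies, and the import path is assumed from rllab's layout):

    from rllab.sampler import parallel_sampler

    # Two samplers sharing one worker pool, distinguished by scope.
    parallel_sampler.populate_task(env_a, policy_a, scope="algo_a")
    parallel_sampler.populate_task(env_b, policy_b, scope="algo_b")

    paths_a = parallel_sampler.sample_paths(
        policy_params=policy_a.get_param_values(),
        max_samples=4000,
        max_path_length=100,
        scope="algo_a",
    )
    paths_b = parallel_sampler.sample_paths(
        policy_params=policy_b.get_param_values(),
        max_samples=4000,
        max_path_length=100,
        scope="algo_b",
    )

    parallel_sampler.terminate_task(scope="algo_a")
    parallel_sampler.terminate_task(scope="algo_b")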
Example #20
 def obtain_samples(self, itr):
     if config.TF_NN_SETTRACE:
         ipdb.set_trace()
     cur_policy_params = self.algo.policy.get_param_values()
     paths = parallel_sampler.sample_paths(
         policy_params=cur_policy_params,
         env_params=None,
         max_samples=self.algo.batch_size,
         max_path_length=self.algo.max_path_length,
         scope=self.algo.scope,
     )
     if self.algo.whole_paths:
         return paths
     else:
         paths_truncated = parallel_sampler.truncate_paths(
             paths, self.algo.batch_size)
         return paths_truncated
Example #21
 def obtain_samples(self, itr):
     if hasattr(self.algo.policy, 'get_param_values_with_baseline'):
         cur_params = self.algo.policy.get_param_values_with_baseline()
     else:
         cur_params = self.algo.policy.get_param_values()
     paths = parallel_sampler.sample_paths(
         policy_params=cur_params,
         max_samples=self.algo.batch_size,
         max_path_length=self.algo.max_path_length,
         scope=self.algo.scope,
     )
     if self.algo.whole_paths:
         return paths
     else:
         paths_truncated = parallel_sampler.truncate_paths(
             paths, self.algo.batch_size)
         return paths_truncated
Example #22
 def obtain_samples(self, itr, env_params=None):
     try:
         cur_params = self.policy.get_param_values()
     except AttributeError:
         cur_params = None
     paths = parallel_sampler.sample_paths(
         policy_params=cur_params,
         max_samples=self.n_traj,
         max_path_length=self.max_path_length,
         scope=self.scope,
         useImitationEnv=self.useImitationEnv,
         useImitationPolicy=self.useImitationPolicy,
         count_traj=True,
         terminate_only_max_path=self.terminate_only_max_path,
         env_params=env_params)
     # truncate the paths if we collected more than self.n_traj
     return paths[:self.n_traj]
Example #23
 def obtain_samples(self, itr, include_joint_coords=False):
     # TODO: include_joint_coords not supported for BatchSampler yet.
     cur_policy_params = self.algo.policy.get_param_values()
     cur_env_params = self.algo.env.get_param_values()
     paths = parallel_sampler.sample_paths(
         policy_params=cur_policy_params,
         env_params=cur_env_params,
         max_samples=self.algo.batch_size,
         max_path_length=self.algo.max_path_length,
         scope=self.algo.scope,
     )
     if self.algo.whole_paths:
         return paths
     else:
         paths_truncated = parallel_sampler.truncate_paths(
             paths, self.algo.batch_size)
         return paths_truncated
Example #24
 def obtain_samples(self, itr):
     cur_policy_params = self.algo.policy.get_param_values()
     if hasattr(self.algo.env,"get_param_values"):
         cur_env_params = self.algo.env.get_param_values()
     else:
         cur_env_params = None
     paths = parallel_sampler.sample_paths(
         policy_params=cur_policy_params,
         env_params=cur_env_params,
         max_samples=self.algo.batch_size,
         max_path_length=self.algo.max_path_length,
         scope=self.algo.scope,
     )
     if self.algo.whole_paths:
         return paths
     else:
         paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
         return paths_truncated
Example #25
 def obtain_samples(self, itr):
     cur_policy_params = self.algo.policy.get_param_values()
     #if hasattr(self.algo.env,"get_param_values"):
         #cur_env_params = self.algo.env.get_param_values()
     #else:
         #cur_env_params = None
     paths = parallel_sampler.sample_paths(
         policy_params=cur_policy_params,
         env_params=None,
         max_samples=self.algo.batch_size,
         max_path_length=self.algo.max_path_length,
         scope=self.algo.scope,
     )
     if self.algo.whole_paths:
         return paths
     else:
         paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
         return paths_truncated
Example #26
    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
        init_policy_params = cur_policy_params = self.algo.policy.get_param_values()
        if hasattr(self.algo.env,"get_param_values"):
            try:
                cur_env_params = self.algo.env.get_param_values()
            except:
                cur_env_params = None
        else:
            cur_env_params = None
        import time
        start = time.time()
        if type(reset_args) != list and type(reset_args)!=np.ndarray:
            reset_args = [reset_args]*self.n_envs
        if self.algo.policy.all_param_vals:
            cur_policy_params = [flatten_tensors(x.values()) for x in self.algo.policy.all_param_vals]
        else:
            cur_policy_params = [cur_policy_params]*self.n_envs
        # do tasks sequentially and parallelize within rollouts per task.
        paths = {}
        for i in range(self.n_envs):
            paths[i] = parallel_sampler.sample_paths(
                policy_params=cur_policy_params[i],
                env_params=cur_env_params,
                max_samples=self.algo.batch_size / self.n_envs,
                max_path_length=self.algo.max_path_length,
                scope=self.algo.scope,
                reset_arg=reset_args[i],
                show_prog_bar=False,
            )
        total_time = time.time() - start
        logger.record_tabular(log_prefix+"TotalExecTime", total_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())

        self.algo.policy.set_param_values(init_policy_params)

        # currently don't support not whole paths (if desired, add code to truncate paths)
        assert self.algo.whole_paths

        return paths
Example #27
    def obtain_samples(self, itr, target_task=None):
        cur_params = self.algo.policy.get_param_values()

        paths = parallel_sampler.sample_paths(
            policy_params=cur_params,
            max_samples=self.algo.batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
            iter=itr,
            policy=self.algo.policy,
            env=self.algo.env,
            baseline=self.algo.baseline,
            target_task=target_task,
        )

        if self.algo.whole_paths:
            return paths
        else:
            paths_truncated = parallel_sampler.truncate_paths(
                paths, self.algo.batch_size)
            return paths_truncated
Example #28
File: a3c.py  Project: leduckhc/rllab
    def evaluate(self, epoch, opt_info):
        logger.log('Collecting samples for evaluation')

        paths = parallel_sampler.sample_paths(
            policy_params=opt_info['target_policy'],
            max_samples=self.eval_max_samples,
            max_path_length=self.eval_max_path_length,
        )

        average_discounted_return = np.mean([
            special.discount_return(path['rewards'], self.discount)
            for path in paths
        ])
        returns = [sum(path['rewards']) for path in paths]

        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('AverageReturn', np.mean(returns))
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('StdReturn', np.std(returns))
        logger.record_tabular('MaxReturn', np.max(returns))
        logger.record_tabular('MinReturn', np.min(returns))
Example #29
    def obtain_samples(self, itr):
        cur_params = self.algo.policy.get_param_values()
        raw_paths = parallel_sampler.sample_paths(
            policy_params=cur_params,
            max_samples=self.algo.batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )
        if self.period is None:  # hippo random p
            paths = raw_paths
        else:
            #todo: this will break for environments where the rollout terminates after goal is reached
            paths = []
            for path in raw_paths:
                new_length = (len(path['rewards']) //
                              self.period) * self.period
                for key in path.keys():
                    if isinstance(path[key], dict):
                        for key2 in path[key].keys():
                            path[key][key2] = path[key][key2][:new_length]
                    else:
                        path[key] = path[key][:new_length]
                if len(path['rewards']) > 0:
                    paths.append(path)

                # num_padding = self.period - (len(path['rewards']) % self.period)
                # for key in path.keys():
                #     if isinstance(path[key], dict):
                #         for key2 in path[key].keys():
                #             path[key][key2].
            # paths = raw_paths

        if self.algo.whole_paths:
            return paths
        else:
            paths_truncated = parallel_sampler.truncate_paths(
                paths, self.algo.batch_size)
            return paths_truncated
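Example #29 trims each rollout to a multiple of self.period so it can be split into fixed-length segments; a small worked instance of that arithmetic (the period value is chosen only for illustration):

    period = 5
    path_length = 23
    new_length = (path_length // period) * period  # 20: the last 3 steps are dropped
    # A path shorter than `period` truncates to length 0 and is skipped by the
    # `if len(path['rewards']) > 0` check above.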
Example #30
    def obtain_samples(self,
                       itr,
                       reset_args=None,
                       return_dict=False,
                       log_prefix='',
                       extra_input=None,
                       extra_input_dim=None,
                       save_img_obs=False,
                       preupdate=True,
                       numTrajs_perTask=None):

        # if not preupdate:
        #     assert False, "not implemented"
        init_policy_params = cur_policy_params = self.algo.policy.get_param_values(
        )
        if hasattr(self.algo.env, "get_param_values"):
            try:
                cur_env_params = self.algo.env.get_param_values()
            except:
                cur_env_params = None
        else:
            cur_env_params = None
        import time
        start = time.time()
        if type(reset_args) != list and type(reset_args) != np.ndarray:
            reset_args = [reset_args] * self.n_envs

        cur_policy_params = [cur_policy_params] * self.n_envs
        # do tasks sequentially and parallelize within rollouts per task.
        paths = {}
        all_param_vals_list = self.algo.policy.all_param_vals

        if extra_input == None:
            extra_infos = None
        else:
            assert extra_input in [
                "onehot_exploration", 'gaussian_exploration', 'onehot_hacked'
            ]
            extra_infos = [extra_input, extra_input_dim, preupdate]

        for i in range(self.n_envs):

            if self.algo.policy.all_param_vals is None:
                policy_params = cur_policy_params[i]

            else:
                policy_params = flatten_tensors(
                    all_param_vals_list[i].values())

            paths_i = parallel_sampler.sample_paths(
                policy_params=policy_params,
                env_params=cur_env_params,
                max_samples=self.algo.batch_size / self.n_envs,
                max_path_length=self.algo.max_path_length,
                scope=self.algo.scope,
                reset_arg=reset_args[i],
                taskIdx=i,
                show_prog_bar=False,
                extra_infos=extra_infos)
            if numTrajs_perTask != None:
                paths[i] = paths_i[:numTrajs_perTask]
            else:
                paths[i] = paths_i

        total_time = time.time() - start
        logger.record_tabular(log_prefix + "TotalExecTime", total_time)

        if not return_dict:
            flatten_list = lambda l: [
                item for sublist in l for item in sublist
            ]
            paths = flatten_list(paths.values())

        #self.algo.policy.set_param_values(init_policy_params)

        # currently don't support not whole paths (if desired, add code to truncate paths)
        assert self.algo.whole_paths

        return paths
Example #31
File: PDO_DDPG.py  Project: victor856/cpo
    def evaluate(self, epoch, pool):
        logger.log("Collecting samples for evaluation")
        paths = parallel_sampler.sample_paths(
            policy_params=self.policy.get_param_values(),
            max_samples=self.eval_samples,
            max_path_length=self.max_path_length,
        )

        average_discounted_return = np.mean(
            [special.discount_return(path["rewards"], self.discount) for path in paths]
        )


        returns = [sum(path["rewards"]) for path in paths]
        for path in paths:
            path["safety_rewards"] = self.safety_constraint.evaluate(path) * self.env.bomb_cost
        costs = [sum(path["safety_rewards"]) for path in paths]

        all_qs = np.concatenate(self.q_averages)
        all_qs_cost = np.concatenate(self.q_cost_averages)
        all_ys = np.concatenate(self.y_averages)
        all_zs = np.concatenate(self.z_averages)

        average_q_loss = np.mean(self.qf_loss_averages)
        average_q_cost_loss = np.mean(self.qf_cost_loss_averages)
        average_policy_surr = np.mean(self.policy_surr_averages)
        average_action = np.mean(np.square(np.concatenate(
            [path["actions"] for path in paths]
        )))

        policy_reg_param_norm = np.linalg.norm(
            self.policy.get_param_values(regularizable=True)
        )
        qfun_reg_param_norm = np.linalg.norm(
            self.qf.get_param_values(regularizable=True)
        )

        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('AverageReturn',
                              np.mean(returns))
        logger.record_tabular('StdReturn',
                              np.std(returns))
        logger.record_tabular('MaxReturn',
                              np.max(returns))
        logger.record_tabular('MinReturn',
                              np.min(returns))
        if len(self.es_path_returns) > 0:
            logger.record_tabular('AverageEsReturn',
                                  np.mean(self.es_path_returns))
            logger.record_tabular('StdEsReturn',
                                  np.std(self.es_path_returns))
            logger.record_tabular('MaxEsReturn',
                                  np.max(self.es_path_returns))
            logger.record_tabular('MinEsReturn',
                                  np.min(self.es_path_returns))
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageQLoss', average_q_loss)
        logger.record_tabular('AveragePolicySurr', average_policy_surr)
        logger.record_tabular('EstimatedQ', np.mean(all_qs))
        #logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
        #logger.record_tabular('AverageY', np.mean(all_ys))
        #logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
        #logger.record_tabular('AverageAbsQYDiff',
        #                      np.mean(np.abs(all_qs - all_ys)))
        #logger.record_tabular('AverageAction', average_action)

        #logger.record_tabular('PolicyRegParamNorm',
        #                      policy_reg_param_norm)
        #logger.record_tabular('QFunRegParamNorm',
        #                      qfun_reg_param_norm)
        logger.record_tabular('EstimatedQcost', np.mean(all_qs_cost))
        #logger.record_tabular('AverageZ', np.mean(all_zs))
        logger.record_tabular('AverageQcostLoss', average_q_cost_loss)
        logger.record_tabular('AverageCosts', np.mean(costs))
        logger.record_tabular('DualVariable', self.dual_var)
        logger.record_tabular('AvgDual', np.mean(self.dual_history[::200]))
        self.env.log_diagnostics(paths)
        self.policy.log_diagnostics(paths)
        print(self.dual_history[::200])

        f = open("/home/qingkai/ddpg_performance.csv", 'a')
        writer = csv.writer(f, delimiter=',')
        writer.writerow((epoch, np.mean(returns), np.mean(costs), self.dual_var, np.mean(all_qs), np.mean(all_qs_cost), self.avg_dual))
        f.close()


        self.qf_loss_averages = []
        self.qf_cost_loss_averages = []
        self.policy_surr_averages = []
        self.q_averages = []
        self.q_cost_averages = []
        self.y_averages = []
        self.z_averages = []
        self.es_path_returns = []
Example #32
File: ddpg.py  Project: syllogismos/rllabpp
    def evaluate(self, epoch, pool):
        logger.log("Collecting samples for evaluation")
        paths = parallel_sampler.sample_paths(
            policy_params=self.exec_policy.get_param_values(),
            max_samples=self.eval_samples,
            max_path_length=self.max_path_length,
        )

        average_discounted_return = np.mean(
            [special.discount_return(path["rewards"], self.discount) for path in paths]
        )

        returns = [sum(path["rewards"]) for path in paths]

        average_action = np.mean(np.square(np.concatenate(
            [path["actions"] for path in paths]
        )))

        qfun_reg_param_norm = np.linalg.norm(
            self.qf.get_param_values(regularizable=True)
        )

        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Iteration', epoch)
        logger.record_tabular('AverageReturn', np.mean(returns))
        logger.record_tabular('StdReturn',
                              np.std(returns))
        logger.record_tabular('MaxReturn',
                              np.max(returns))
        logger.record_tabular('MinReturn',
                              np.min(returns))
        if len(self.es_path_returns) > 0:
            logger.record_tabular('AverageEsReturn',
                                  np.mean(self.es_path_returns))
            logger.record_tabular('StdEsReturn',
                                  np.std(self.es_path_returns))
            logger.record_tabular('MaxEsReturn',
                                  np.max(self.es_path_returns))
            logger.record_tabular('MinEsReturn',
                                  np.min(self.es_path_returns))
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageAction', average_action)

        logger.record_tabular('QFunRegParamNorm',
                              qfun_reg_param_norm)
        self.env.log_diagnostics(paths)
        self.log_critic_training()

        self.es_path_returns = []

        if not self.qf_dqn:
            average_policy_surr = np.mean(self.policy_surr_averages)
            policy_reg_param_norm = np.linalg.norm(
                self.policy.get_param_values(regularizable=True)
            )
            logger.record_tabular('AveragePolicySurr', average_policy_surr)
            logger.record_tabular('PolicyRegParamNorm',
                              policy_reg_param_norm)
            self.policy.log_diagnostics(paths)
            self.policy_surr_averages = []
Example #33
    def evaluate(self, epoch, pool):
        logger.log("Collecting samples for evaluation")
        paths = parallel_sampler.sample_paths(
            policy_params=self.policy.get_param_values(),
            max_samples=self.eval_samples,
            max_path_length=self.max_path_length,
        )

        average_discounted_return = np.mean(
            [special.discount_return(path["rewards"], self.discount) for path in paths]
        )

        returns = [sum(path["rewards"]) for path in paths]

        all_qs = np.concatenate(self.q_averages)
        all_ys = np.concatenate(self.y_averages)

        average_q_loss = np.mean(self.qf_loss_averages)
        average_policy_surr = np.mean(self.policy_surr_averages)
        average_action = np.mean(np.square(np.concatenate(
            [path["actions"] for path in paths]
        )))

        policy_reg_param_norm = np.linalg.norm(
            self.policy.get_param_values(regularizable=True)
        )
        qfun_reg_param_norm = np.linalg.norm(
            self.qf.get_param_values(regularizable=True)
        )

        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('AverageReturn',
                              np.mean(returns))
        logger.record_tabular('StdReturn',
                              np.std(returns))
        logger.record_tabular('MaxReturn',
                              np.max(returns))
        logger.record_tabular('MinReturn',
                              np.min(returns))
        if len(self.es_path_returns) > 0:
            logger.record_tabular('AverageEsReturn',
                                  np.mean(self.es_path_returns))
            logger.record_tabular('StdEsReturn',
                                  np.std(self.es_path_returns))
            logger.record_tabular('MaxEsReturn',
                                  np.max(self.es_path_returns))
            logger.record_tabular('MinEsReturn',
                                  np.min(self.es_path_returns))
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageQLoss', average_q_loss)
        logger.record_tabular('AveragePolicySurr', average_policy_surr)
        logger.record_tabular('AverageQ', np.mean(all_qs))
        logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
        logger.record_tabular('AverageY', np.mean(all_ys))
        logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
        logger.record_tabular('AverageAbsQYDiff',
                              np.mean(np.abs(all_qs - all_ys)))
        logger.record_tabular('AverageAction', average_action)

        logger.record_tabular('PolicyRegParamNorm',
                              policy_reg_param_norm)
        logger.record_tabular('QFunRegParamNorm',
                              qfun_reg_param_norm)

        self.env.log_diagnostics(paths)
        self.policy.log_diagnostics(paths)

        self.qf_loss_averages = []
        self.policy_surr_averages = []

        self.q_averages = []
        self.y_averages = []
        self.es_path_returns = []