Example #1
    def run_alg(self, n_itrs, save_policy=True, save_policy_fun=None, save_freq=3,
                save_value_fun=None, save_sim_fun=None,
                pretrain=True, rollout_kwargs=None, **other_pretrain_kwargs):
        start_time = time.time()
        if pretrain:  # algorithm-specific
            if rollout_kwargs is None:
                gr = self._gen_ro_raw
            elif (rollout_kwargs['max_n_rollouts'] is None and
                  rollout_kwargs['min_n_samples'] is None):
                gr = self._gen_ro_raw
            else:
                gr = functools.partial(generate_rollout, env=self._env, **rollout_kwargs)
            self._alg.pretrain(gr, **other_pretrain_kwargs)

        # Main loop
        for itr in range(n_itrs):
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)
            with timed('Generate env rollouts'):
                ro = self.gen_ro(to_log=True)
            
            # algorithm-specific
            if save_policy and isinstance(save_freq, int) and itr % save_freq == 0:
                mean_val = logz.get_val_from_LOG('MeanSumOfRewards')
                prefix = 'iter_{}_eval_'.format(itr) + '%.0f' % mean_val
                save_policy_fun(prefix + '_pi')
                save_value_fun(prefix + '_vfn')
                save_sim_fun(prefix + '_sim')
            self._alg.update(ro, gen_env_ro=self._gen_ro) 
            logz.dump_tabular()  # dump log

        # Save the final policy.
        if save_policy:
            save_policy_fun('final')
            cprint('Final policy has been saved.')
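
The pretraining branch above only binds `rollout_kwargs` into a new generator when it actually constrains the rollout budget; otherwise it falls back to the raw generator. Below is a minimal, self-contained sketch of that dispatch with hypothetical stand-ins for `generate_rollout` and the raw generator (these stand-in names are illustrative, not the project's API):

import functools

def generate_rollout(env=None, max_n_rollouts=None, min_n_samples=None):
    # Hypothetical stand-in for the project's rollout sampler.
    return dict(env=env, max_n_rollouts=max_n_rollouts, min_n_samples=min_n_samples)

def pick_generator(gen_ro_raw, env, rollout_kwargs):
    # No kwargs, or a kwargs dict that sets no budget: use the raw generator.
    if rollout_kwargs is None:
        return gen_ro_raw
    if (rollout_kwargs['max_n_rollouts'] is None and
            rollout_kwargs['min_n_samples'] is None):
        return gen_ro_raw
    # Otherwise bind the budget and the env into a partial, as run_alg does.
    return functools.partial(generate_rollout, env=env, **rollout_kwargs)

raw = lambda: 'raw rollouts'
print(pick_generator(raw, 'CartPole-v1', None)())
print(pick_generator(raw, 'CartPole-v1',
                     {'max_n_rollouts': 8, 'min_n_samples': None})())
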
Example #2
    def run_alg(self,
                n_itrs,
                pretrain=True,
                save_policy=False,
                save_freq=100,
                final_eval=False):
        start_time = time.time()
        if pretrain:  # algorithm-specific
            self._alg.pretrain(functools.partial(self.gen_ro, to_log=False))

        # Main loop
        for itr in range(n_itrs):
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)
            with timed('Generate env rollouts'):
                ro = self.gen_ro(self._alg.pi_ro,
                                 logp=self._alg.logp,
                                 to_log=True)
            self._alg.update(ro)  # algorithm-specific
            logz.dump_tabular()  # dump log
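
Both `run_alg` variants lean on two small utilities: a `timed` context manager for wall-clock reporting and `logz`-style tabular logging (`log_tabular` / `dump_tabular`). Their real implementations live elsewhere in the repo; the sketch below is a hypothetical, compatible stand-in that shows the logging pattern used in the main loop:

import time
from contextlib import contextmanager

@contextmanager
def timed(msg):
    # Report how long the wrapped block took.
    t0 = time.time()
    yield
    print('{} took {:.3f}s'.format(msg, time.time() - t0))

class TabularLogger:
    # Accumulate key/value pairs per iteration, then flush them as one row.
    def __init__(self):
        self._row = {}

    def log_tabular(self, key, val):
        self._row[key] = val

    def dump_tabular(self):
        print(' | '.join('{}: {}'.format(k, v) for k, v in self._row.items()))
        self._row = {}

logz = TabularLogger()
start_time = time.time()
for itr in range(2):
    logz.log_tabular("Time", time.time() - start_time)
    logz.log_tabular("Iteration", itr)
    with timed('Generate env rollouts'):
        time.sleep(0.01)  # stands in for self.gen_ro(...)
    logz.dump_tabular()
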
Example #3
    def cal_variance(self, n_itrs, save_policy=True, save_value_fun=None, 
                save_policy_fun=None, save_freq=3,
                save_sim_fun=None,
                ro_file=None,
                save_np_file_path=None,
                prefix=None,
                save_gradient=None,
                pretrain=True, rollout_kwargs=None, **other_pretrain_kwargs):
        start_time = time.time()
        
        assert prefix is not None
        assert ro_file is not None
        assert save_np_file_path is not None

        save_grad_frequency = 1

        ro_path = ro_file

        with open(ro_path, 'rb') as f:
            ros = pickle.load(f)

        # Main loop        
        itr = 0
        self._alg.reset_grads()
        while True:
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)
            
            self._alg.compute_grad(ros.pop(0), gen_env_ro=self._gen_ro)

            logz.dump_tabular()  # dump log
            
            if itr % save_grad_frequency == 0:
                np.save(save_np_file_path, self._alg.gradients)

            if not ros:
                break
            
            itr += 1
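
`cal_variance` expects `ro_file` to be a pickle holding a list of pre-collected rollouts, which the loop drains with `pop(0)` while re-saving the accumulated gradients after every iteration. A small sketch of producing and consuming such a file, with `_alg.compute_grad` replaced by a hypothetical per-rollout gradient stub:

import pickle
import numpy as np

# Write a pickle containing a list of (fake) rollouts, as cal_variance expects.
ros = [{'obs': np.random.randn(5, 3)} for _ in range(4)]
with open('ros.pkl', 'wb') as f:
    pickle.dump(ros, f)

# Drain the list, accumulating one "gradient" per rollout (stub for compute_grad).
with open('ros.pkl', 'rb') as f:
    ros = pickle.load(f)

grads = []
while True:
    ro = ros.pop(0)
    grads.append(ro['obs'].mean(axis=0))    # hypothetical per-rollout gradient
    np.save('grads.npy', np.stack(grads))   # overwritten each iteration, like the loop above
    if not ros:
        break

print(np.load('grads.npy').shape)  # (4, 3)
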
Example #4
    def run_alg(self,
                n_itrs,
                save_policy=None,
                save_policy_fun=None,
                save_freq=None,
                pretrain=True,
                rollout_kwargs=None,
                **other_pretrain_kwargs):
        start_time = time.time()
        if pretrain:  # algorithm-specific
            if rollout_kwargs is None:
                gr = self._gen_ro_raw
            elif (rollout_kwargs['max_n_rollouts'] is None
                  and rollout_kwargs['min_n_samples'] is None):
                gr = self._gen_ro_raw
            else:
                gr = functools.partial(generate_rollout,
                                       env=self._env,
                                       **rollout_kwargs)
            self._alg.pretrain(gr, **other_pretrain_kwargs)

        # Main loop
        for itr in range(n_itrs):
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)
            with timed('Generate env rollouts'):
                ro = self.gen_ro(to_log=True)
            # algorithm-specific
            self._alg.update(ro, gen_env_ro=self._gen_ro)
            logz.dump_tabular()  # dump log
            if save_policy and isinstance(save_freq,
                                          int) and itr % save_freq == 0:
                save_policy_fun('{}'.format(itr))

        # Save the final policy.
        if save_policy:
            save_policy_fun('final')
            cprint('Final policy has been saved.')
Example #5
    def est_mean(self, n_itrs, save_policy=True, save_value_fun=None, 
                save_policy_fun=None, save_freq=3,
                save_sim_fun=None,
                save_gradient=None,
                save_np_file_path=None,
                pretrain=True, rollout_kwargs=None, **other_pretrain_kwargs):
        start_time = time.time()        

        # Main loop
        # collect a fixed number of gradient samples for the mean estimate (2000 here)
        for itr in range(2000):
            ro = self._gen_ro()       

            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", itr)

            # algorithm-specific
            self._alg.compute_grad(ro, gen_env_ro=self._gen_ro)
            logz.dump_tabular()  # dump log

        mean_st = self._alg.gradients       
        est_mean = np.mean(mean_st, axis=0, keepdims=True)     
        np.save(save_np_file_path, est_mean)
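
The last three lines reduce the collected per-rollout gradients to a single mean estimate. Assuming `self._alg.gradients` stacks the samples along the first axis with shape `(n_samples, dim)` (an assumption; that attribute is defined by the algorithm class), the reduction and the saved shape behave as in this sketch:

import numpy as np

# Hypothetical stand-in for self._alg.gradients: 2000 gradient samples of dimension 6.
gradients = np.random.randn(2000, 6)

# Average over samples; keepdims keeps a leading axis of size 1,
# so the saved estimate has shape (1, 6) rather than (6,).
est_mean = np.mean(gradients, axis=0, keepdims=True)
print(est_mean.shape)  # (1, 6)

np.save('est_mean.npy', est_mean)
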