def run_alg(self, n_itrs, save_policy=True, save_policy_fun=None, save_freq=3,
            save_value_fun=None, save_sim_fun=None, pretrain=True,
            rollout_kwargs=None, **other_pretrain_kwargs):
    """Run the training loop for `n_itrs` iterations, periodically saving
    the policy, value function, and simulator.

    Args:
        n_itrs: number of training iterations.
        save_policy: when truthy, save snapshots every `save_freq` iterations
            and a 'final' policy at the end.
        save_policy_fun / save_value_fun / save_sim_fun: callables taking a
            filename prefix; presumably provided by the experiment script —
            TODO confirm. They are only invoked when not None.
        save_freq: snapshot period in iterations (must be int for saving to
            trigger).
        pretrain: when truthy, run the algorithm-specific pretraining phase.
        rollout_kwargs: optional dict with at least 'max_n_rollouts' and
            'min_n_samples' keys controlling the pretraining rollout
            generator; when absent (or both keys are None) the raw generator
            is used.
        **other_pretrain_kwargs: forwarded to `self._alg.pretrain`.
    """
    start_time = time.time()
    if pretrain:  # algorithm-specific
        if rollout_kwargs is None:
            gr = self._gen_ro_raw
        elif (rollout_kwargs['max_n_rollouts'] is None and
              rollout_kwargs['min_n_samples'] is None):
            gr = self._gen_ro_raw
        else:
            gr = functools.partial(generate_rollout, env=self._env,
                                   **rollout_kwargs)
        self._alg.pretrain(gr, **other_pretrain_kwargs)

    # Main loop
    for itr in range(n_itrs):
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr)
        with timed('Generate env rollouts'):
            ro = self.gen_ro(to_log=True)
        # algorithm-specific: snapshot BEFORE the update, so the saved
        # policy is the one that generated `ro`.
        if save_policy and isinstance(save_freq, int) and itr % save_freq == 0:
            mean_val = logz.get_val_from_LOG('MeanSumOfRewards')
            prefix = 'iter_{}_eval_'.format(itr) + '%.0f' % mean_val
            # Guard against None savers: the defaults are None while
            # save_policy defaults to True, so unguarded calls would crash.
            if save_policy_fun is not None:
                save_policy_fun(prefix + '_pi')
            if save_value_fun is not None:
                save_value_fun(prefix + '_vfn')
            if save_sim_fun is not None:
                # BUG FIX: was `prefix + 'sim'` — missing the '_' separator
                # used by the '_pi' and '_vfn' suffixes above.
                save_sim_fun(prefix + '_sim')
        self._alg.update(ro, gen_env_ro=self._gen_ro)
        logz.dump_tabular()  # dump log

    # Save the final policy.
    if save_policy:
        save_policy_fun('final')
        cprint('Final policy has been saved.')
def run_alg(self, n_itrs, pretrain=True, save_policy=False, save_freq=100, final_eval=False):
    """Train for `n_itrs` iterations.

    Optionally runs the algorithm-specific pretraining phase first, then
    alternates between generating environment rollouts and updating the
    algorithm, logging timing and iteration count each step.

    NOTE(review): `save_policy`, `save_freq`, and `final_eval` are accepted
    for interface compatibility but unused in this variant.
    """
    t_start = time.time()

    # Algorithm-specific warm-up using silent (non-logging) rollouts.
    if pretrain:
        silent_gen = functools.partial(self.gen_ro, to_log=False)
        self._alg.pretrain(silent_gen)

    # Main loop
    for iteration in range(n_itrs):
        elapsed = time.time() - t_start
        logz.log_tabular("Time", elapsed)
        logz.log_tabular("Iteration", iteration)
        with timed('Generate env rollouts'):
            rollouts = self.gen_ro(self._alg.pi_ro, logp=self._alg.logp, to_log=True)
        self._alg.update(rollouts)  # algorithm-specific
        logz.dump_tabular()  # dump log
def cal_variance(self, n_itrs, save_policy=True, save_value_fun=None,
                 save_policy_fun=None, save_freq=3, save_sim_fun=None,
                 ro_file=None, save_np_file_path=None, prefix=None,
                 save_gradient=None, pretrain=True, rollout_kwargs=None,
                 **other_pretrain_kwargs):
    """Replay pre-recorded rollouts and accumulate per-rollout gradients so
    their variance can be analyzed offline.

    Loads a pickled list of rollouts from `ro_file`, feeds them one at a
    time to `self._alg.compute_grad`, and saves the accumulated
    `self._alg.gradients` array to `save_np_file_path` after each rollout.

    Args:
        n_itrs: unused here; the loop length is the number of saved
            rollouts in `ro_file`.
        ro_file: path to a pickle file containing a list of rollouts
            (required).
        save_np_file_path: .npy output path for the gradient samples
            (required).
        prefix: required by the original contract (required, but otherwise
            unused in this body — TODO confirm intended use).
        Other parameters are accepted for interface compatibility and unused.

    Raises:
        ValueError: if any of the required keyword arguments is missing.
    """
    start_time = time.time()
    # BUG FIX: these were `assert` statements, which are stripped under
    # `python -O`; required inputs must fail loudly with a real exception.
    if prefix is None:
        raise ValueError('prefix must be provided')
    if ro_file is None:
        raise ValueError('ro_file must be provided')
    if save_np_file_path is None:
        raise ValueError('save_np_file_path must be provided')

    save_grad_frequency = 1  # save after every rollout
    # SECURITY NOTE(review): pickle.load executes arbitrary code if the
    # file is untrusted — only load rollout files you produced yourself.
    with open(ro_file, 'rb') as f:
        ros = pickle.load(f)

    # Main loop: one gradient sample per saved rollout.
    # (Was a `while True` with `ros.pop(0)` — O(n) per pop — and a manual
    # `if ros == []: break`; a plain enumerate is equivalent and linear.)
    self._alg.reset_grads()
    for itr, ro in enumerate(ros):
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr)
        self._alg.compute_grad(ro, gen_env_ro=self._gen_ro)
        logz.dump_tabular()  # dump log
        if itr % save_grad_frequency == 0:
            np.save(save_np_file_path, self._alg.gradients)
def run_alg(self, n_itrs, save_policy=None, save_policy_fun=None, save_freq=None,
            pretrain=True, rollout_kwargs=None, **other_pretrain_kwargs):
    """Train for `n_itrs` iterations with optional periodic policy saving.

    When `pretrain` is truthy, the algorithm is pretrained with either the
    raw rollout generator (when `rollout_kwargs` is absent or both its
    'max_n_rollouts' and 'min_n_samples' entries are None) or a
    `generate_rollout` partial bound to `self._env` and `rollout_kwargs`.
    Each iteration logs timing, generates rollouts, updates the algorithm,
    and — when `save_policy` is truthy and `save_freq` is an int — saves
    the policy every `save_freq` iterations and a 'final' copy at the end.
    """
    clock_start = time.time()

    if pretrain:  # algorithm-specific
        # Fall back to the raw generator unless rollout_kwargs actually
        # constrains the rollout count or sample count.
        use_raw = (rollout_kwargs is None
                   or (rollout_kwargs['max_n_rollouts'] is None
                       and rollout_kwargs['min_n_samples'] is None))
        if use_raw:
            rollout_gen = self._gen_ro_raw
        else:
            rollout_gen = functools.partial(generate_rollout,
                                            env=self._env,
                                            **rollout_kwargs)
        self._alg.pretrain(rollout_gen, **other_pretrain_kwargs)

    # Main loop
    for itr in range(n_itrs):
        logz.log_tabular("Time", time.time() - clock_start)
        logz.log_tabular("Iteration", itr)
        with timed('Generate env rollouts'):
            rollouts = self.gen_ro(to_log=True)
        # algorithm-specific
        self._alg.update(rollouts, gen_env_ro=self._gen_ro)
        logz.dump_tabular()  # dump log
        periodic_save = (save_policy
                         and isinstance(save_freq, int)
                         and itr % save_freq == 0)
        if periodic_save:
            save_policy_fun('{}'.format(itr))

    # Save the final policy.
    if save_policy:
        save_policy_fun('final')
        cprint('Final policy has been saved.')
def est_mean(self, n_itrs, save_policy=True, save_value_fun=None,
             save_policy_fun=None, save_freq=3, save_sim_fun=None,
             save_gradient=None, save_np_file_path=None, pretrain=True,
             rollout_kwargs=None, **other_pretrain_kwargs):
    """Estimate the mean gradient over `n_itrs` freshly generated rollouts
    and save it as a .npy file.

    Generates one rollout per iteration, accumulates gradient samples in
    `self._alg.gradients` via `compute_grad`, then saves the sample mean
    (axis 0, dims kept) to `save_np_file_path`.

    Args:
        n_itrs: number of rollouts / gradient samples to draw.
        save_np_file_path: output path for `np.save` (required in practice;
            `np.save(None, ...)` would fail).
        Other parameters are accepted for interface compatibility and are
        unused in this body.
    """
    start_time = time.time()
    # Main loop: one gradient sample per generated rollout.
    # BUG FIX: the loop was hard-coded to `range(2000)`, silently ignoring
    # the `n_itrs` argument (the old comment even claimed a default of
    # 10000, contradicting both); honor the caller's request.
    for itr in range(n_itrs):
        ro = self._gen_ro()
        logz.log_tabular("Time", time.time() - start_time)
        logz.log_tabular("Iteration", itr)
        # algorithm-specific
        self._alg.compute_grad(ro, gen_env_ro=self._gen_ro)
        logz.dump_tabular()  # dump log

    grad_samples = self._alg.gradients
    est_mean = np.mean(grad_samples, axis=0, keepdims=True)
    np.save(save_np_file_path, est_mean)