import time as timer

import numpy as np

# Assumed mjrl module layout, matching the calls below.
import mjrl.samplers.trajectory_sampler as trajectory_sampler


def sample_paths(N, policy, T=1e6, env=None, env_name=None, pegasus_seed=None,
                 num_cpu='max', paths_per_call=5, mode='sample'):
    """Collect at least N samples (timesteps) with the given policy.

    params:
    N              : number of sample points (timesteps) to collect
    policy         : policy to be used to sample the data
    T              : maximum length of a trajectory
    env            : env object to sample from
    env_name       : name of env to be sampled from
                     (one of env or env_name must be specified)
    pegasus_seed   : seed for environment (numpy seed must be set externally)
    num_cpu        : number of CPUs to sample on ('max' uses all available)
    paths_per_call : number of trajectories gathered per parallel call
    mode           : sampling mode forwarded to the trajectory sampler
    """
    if num_cpu == 1:
        # single-core fallback, defined elsewhere in this module
        return sample_paths_one_core(N, policy, T, env, env_name, pegasus_seed, mode)

    start_time = timer.time()
    print("####### Gathering Samples #######")
    sampled_so_far = 0
    paths_so_far = 0
    paths = []
    while sampled_so_far <= N:
        if pegasus_seed is None:
            new_paths = trajectory_sampler.sample_paths_parallel(
                paths_per_call, policy, T, env_name, pegasus_seed, num_cpu,
                suppress_print=True, mode=mode)
        else:
            # advance the seed so each batch of trajectories is seeded differently
            pegasus_seed += paths_so_far
            new_paths = trajectory_sampler.sample_paths_parallel(
                paths_per_call, policy, T, env_name, pegasus_seed, num_cpu,
                suppress_print=True, mode=mode)
        paths.extend(new_paths)
        paths_so_far += paths_per_call
        new_samples = np.sum([len(p['rewards']) for p in new_paths])
        sampled_so_far += new_samples
    print("======= Samples Gathered ======= | >>>> Time taken = %f " % (timer.time() - start_time))
    print("................................. | >>>> # samples = %i # trajectories = %i " % (sampled_so_far, paths_so_far))
    return paths
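# Usage sketch (hypothetical setup, not part of the function above): gather
# roughly 5000 timesteps from a registered mjrl gym environment with 4 workers.
# The env id and the MLP policy constructor follow mjrl's usual conventions,
# but treat them as assumptions.
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP

e = GymEnv('mjrl_swimmer-v0')
pi = MLP(e.spec, hidden_sizes=(32, 32), seed=123)
paths = sample_paths(N=5000, policy=pi, env_name=e.env_id,
                     pegasus_seed=123, num_cpu=4)
print(len(paths), sum(len(p['rewards']) for p in paths))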
def train_step(self, N,
               sample_mode='trajectories',
               env_name=None,
               T=1e6,
               gamma=0.995,
               gae_lambda=0.98,
               num_cpu='max'):
    # Clean up input arguments
    if env_name is None:
        env_name = self.env.env_id
    if sample_mode != 'trajectories' and sample_mode != 'samples':
        print("sample_mode in NPG must be either 'trajectories' or 'samples'")
        quit()

    ts = timer.time()
    if sample_mode == 'trajectories':
        # N is interpreted as the number of trajectories
        paths = trajectory_sampler.sample_paths_parallel(N, self.policy, T,
                                                         env_name, self.seed, num_cpu)
    elif sample_mode == 'samples':
        # N is interpreted as the number of timesteps
        paths = batch_sampler.sample_paths(N, self.policy, T, env_name=env_name,
                                           pegasus_seed=self.seed, num_cpu=num_cpu)
    if self.save_logs:
        self.logger.log_kv('time_sampling', timer.time() - ts)

    self.seed = self.seed + N if self.seed is not None else self.seed

    # compute returns
    process_samples.compute_returns(paths, gamma)
    # compute advantages
    process_samples.compute_advantages(paths, self.baseline, gamma, gae_lambda)
    # train from paths
    eval_statistics = self.train_from_paths(paths)
    eval_statistics.append(N)
    # fit baseline
    if self.save_logs:
        ts = timer.time()
        error_before, error_after = self.baseline.fit(paths, return_errors=True)
        self.logger.log_kv('time_VF', timer.time() - ts)
        self.logger.log_kv('VF_error_before', error_before)
        self.logger.log_kv('VF_error_after', error_after)
    else:
        self.baseline.fit(paths)
    return eval_statistics
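# Hedged usage sketch: a single NPG update via train_step. The constructors
# below follow mjrl's usual signatures (and the NPG call mirrors the one in
# the fine-tuning snippet further down); treat exact keyword names as
# assumptions.
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG

e = GymEnv('mjrl_swimmer-v0')
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=123)
baseline = MLPBaseline(e.spec)
agent = NPG(e, policy, baseline, normalized_step_size=0.01,
            seed=123, save_logs=True)

# N=10 trajectories per update; stats[0] is the value train_agent below
# records as the training curve.
stats = agent.train_step(N=10, sample_mode='trajectories',
                         gamma=0.995, gae_lambda=0.97)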
def _evaluation_rollout(agent, num_rollouts, num_cpu):
    """Performs evaluation rollouts.

    Args:
        agent: The MJRL agent to perform the rollouts with.
        num_rollouts: The number of rollouts.
        num_cpu: The number of CPUs to perform the rollouts on.

    Returns:
        The mean return over the rollouts.
    """
    eval_paths = sample_paths_parallel(
        N=num_rollouts,
        policy=agent.policy,
        num_cpu=num_cpu,
        env_name=agent.env.env_id,
        mode='evaluation',
    )
    # sum rewards within each path, then average across paths
    return np.mean([np.sum(path['rewards']) for path in eval_paths])
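# Usage sketch: mean return of the agent over 10 evaluation rollouts.
# `agent` is assumed to expose .policy and .env.env_id, as the helper requires.
mean_return = _evaluation_rollout(agent, num_rollouts=10, num_cpu=4)
print('mean evaluation return: %.2f' % mean_return)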
import copy
import os
import pickle
import time as timer

import numpy as np
from tabulate import tabulate

# Assumed mjrl module layout, matching the calls below.
from mjrl.samplers.trajectory_sampler import sample_paths_parallel
from mjrl.utils.gym_env import GymEnv
from mjrl.utils.make_train_plots import make_train_plots


def train_agent(job_name, agent,
                seed=0,
                niter=101,
                gamma=0.995,
                gae_lambda=None,
                num_cpu=1,
                sample_mode='trajectories',
                num_traj=50,
                num_samples=50000,  # has precedence, used with sample_mode = 'samples'
                save_freq=10,
                evaluation_rollouts=None,
                plot_keys=['stoc_pol_mean'],
                ):
    np.random.seed(seed)
    if not os.path.isdir(job_name):
        os.mkdir(job_name)
    previous_dir = os.getcwd()
    os.chdir(job_name)  # important! we are now in the directory to save data
    if not os.path.isdir('iterations'):
        os.mkdir('iterations')
    if not os.path.isdir('logs') and agent.save_logs:
        os.mkdir('logs')

    best_policy = copy.deepcopy(agent.policy)
    best_perf = -1e8
    train_curve = best_perf * np.ones(niter)
    mean_pol_perf = 0.0
    e = GymEnv(agent.env.env_id)

    for i in range(niter):
        print("......................................................................................")
        print("ITERATION : %i " % i)
        # at i == 0 this reads train_curve[-1], which is initialized to
        # best_perf, so the condition is False on the first iteration
        if train_curve[i - 1] > best_perf:
            best_policy = copy.deepcopy(agent.policy)
            best_perf = train_curve[i - 1]

        N = num_traj if sample_mode == 'trajectories' else num_samples
        args = dict(N=N, sample_mode=sample_mode, gamma=gamma,
                    gae_lambda=gae_lambda, num_cpu=num_cpu)
        stats = agent.train_step(**args)
        train_curve[i] = stats[0]

        if evaluation_rollouts is not None and evaluation_rollouts > 0:
            print("Performing evaluation rollouts ........")
            eval_paths = sample_paths_parallel(N=evaluation_rollouts,
                                               policy=agent.policy,
                                               num_cpu=num_cpu,
                                               env_name=e.env_id,
                                               mode='evaluation',
                                               pegasus_seed=seed)
            mean_pol_perf = np.mean([np.sum(path['rewards']) for path in eval_paths])
            if agent.save_logs:
                agent.logger.log_kv('eval_score', mean_pol_perf)

        if i % save_freq == 0 and i > 0:
            if agent.save_logs:
                agent.logger.save_log('logs/')
                make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
            policy_file = 'policy_%i.pickle' % i
            baseline_file = 'baseline_%i.pickle' % i
            with open('iterations/' + policy_file, 'wb') as f:
                pickle.dump(agent.policy, f)
            with open('iterations/' + baseline_file, 'wb') as f:
                pickle.dump(agent.baseline, f)
            with open('iterations/best_policy.pickle', 'wb') as f:
                pickle.dump(best_policy, f)

        # print results to console
        if i == 0:
            with open('results.txt', 'w') as result_file:
                result_file.write("Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n")
            print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n")
        print("[ %s ] %4i %5.2f %5.2f %5.2f " %
              (timer.asctime(timer.localtime(timer.time())),
               i, train_curve[i], mean_pol_perf, best_perf))
        with open('results.txt', 'a') as result_file:
            result_file.write("%4i %5.2f %5.2f %5.2f \n" %
                              (i, train_curve[i], mean_pol_perf, best_perf))
        if agent.save_logs:
            print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1,
                                       agent.logger.get_current_log().items()))
            print(tabulate(print_data))

    # final save
    with open('iterations/best_policy.pickle', 'wb') as f:
        pickle.dump(best_policy, f)
    if agent.save_logs:
        agent.logger.save_log('logs/')
        make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
    os.chdir(previous_dir)
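# Hedged end-to-end sketch, reusing the `agent` built in the earlier NPG
# example. 'swimmer_npg_demo' is an arbitrary job directory; per-iteration
# policies, the results table, and logs are written inside it.
train_agent(job_name='swimmer_npg_demo',
            agent=agent,
            seed=123,
            niter=11,
            gamma=0.995,
            gae_lambda=0.97,
            num_cpu=4,
            sample_mode='trajectories',
            num_traj=10,
            save_freq=5,
            evaluation_rollouts=5)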
with open(iterdir + 'policy_199.pickle', 'rb') as f:
    policy_stl = pickle.load(f)
with open(iterdir + 'baseline_199.pickle', 'rb') as f:
    baseline_stl[task_id] = pickle.load(f)

agent_stl = NPG(e[task_id], policy_stl, baseline_stl[task_id],
                normalized_step_size=0.01, seed=SEED, save_logs=False)
agent_stl.set_task(task_id)

eval_paths = trajectory_sampler.sample_paths_parallel(
    N=10,
    policy=policy_stl,
    num_cpu=num_cpu,
    env_name=e[task_id].env_id,
    mode='evaluation',
    pegasus_seed=SEED)
forward_transfer_results[task_id] = np.mean(
    [np.sum(path['rewards']) for path in eval_paths])

with open(job_name_stl_seed + '/finetune_results.txt', 'w') as result_file:
    result_file.write(str(forward_transfer_results))
SEED += 10
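# finetune_results.txt stores the repr() of the results container, so it can
# be parsed back with ast.literal_eval. A hedged sketch, assuming
# forward_transfer_results is a plain dict whose values print as bare numbers:
import ast

with open(job_name_stl_seed + '/finetune_results.txt', 'r') as f:
    results = ast.literal_eval(f.read())
print('mean forward-transfer return: %.2f' % np.mean(list(results.values())))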