def launch_job(tag, variant):
    seed, env, algo, optim, curv_type, lr, batch_size, cg_iters, cg_residual_tol, cg_prev_init_coef, \
        cg_precondition_empirical, cg_precondition_regu_coef, cg_precondition_exp, \
        shrinkage_method, lanczos_amortization, lanczos_iters, approx_adaptive, betas, use_nn_policy, total_samples = variant
    beta1, beta2 = betas
    iters = int(total_samples / batch_size)

    save_dir = build_log_dir(tag, variant)
    print("Save: ", save_dir)

    # NOTE: the hard-coded absolute path below overrides the save_dir computed above.
    save_dir = (
        "/Users/trevorbarron/Documents/dev.nosync/thesis/adacurv/experiments/mjrl/results/final/best_runs/"
        "results/pybullet_sample_mode_bball_random_hoop/BasketballEnvRandomHoop-v0/trpo/natural_adam/optim_adaptive/"
        "curv_type_fisher/cg_iters_10/cg_residual_tol_1e-10/cg_prev_init_coef_0.5/cg_precondition_empirical_true/"
        "cg_precondition_regu_coef_0.001/cg_precondition_exp_0.75/shrunk_true/cg/nn_policy/adam_vfn_opt/"
        "total_samples_2000000/batch_size_2000/lr_0.01/betas0.9_0.9/1/iterations/")
    # /Users/trevorbarron/Documents/dev.nosync/thesis/adacurv/experiments/mjrl/results/results_serv_tmp/"

    policy_path = os.path.join(save_dir, 'best_policy.pickle')
    # policy_path = os.path.join(save_dir, 'iterations/best_policy.pickle')
    with open(policy_path, 'rb') as f:
        policy = pickle.load(f)
    print(policy)

    e = GymEnv('BasketballEnvRandomHoopRendered-v0')
    e.reset()

    input("Continue?")

    N = 100
    T = 250
    paths = base_sampler.do_rollout(N, policy, T, e, None)
    for p in paths:
        print(p['rewards'].sum())
def policy_rollout(
    num_traj,
    env,
    policy,
    fitted_model,
    init_state=None,
    eval_mode=False,
    horizon=1e6,
    env_kwargs=None,
    seed=None,
):
    # get the correct env behavior
    if type(env) == str:
        env = GymEnv(env)
    elif isinstance(env, GymEnv):
        env = env
    elif callable(env):
        env = env(**env_kwargs)
    else:
        print("Unsupported environment format")
        raise AttributeError

    if seed is not None:
        env.set_seed(seed)
        torch.manual_seed(seed)

    # get initial states
    if init_state is None:
        st = np.array([env.reset() for _ in range(num_traj)])
        st = torch.from_numpy(st).float()
    elif type(init_state) == np.ndarray:
        st = torch.from_numpy(init_state).float()
    elif type(init_state) == list:
        st = torch.from_numpy(np.array(init_state)).float()
    else:
        print("Unsupported format for init state")
        quit()

    # perform batched rollouts
    horizon = int(min(horizon, env.horizon))  # range() below requires an int
    obs = []
    act = []
    for t in range(horizon):
        at = policy.model.forward(st)
        if eval_mode is not True:
            at = at + torch.randn(at.shape) * torch.exp(policy.log_std)
        stp1 = fitted_model.forward(st, at)
        obs.append(st.to('cpu').data.numpy())
        act.append(at.to('cpu').data.numpy())
        st = stp1

    obs = np.array(obs)
    obs = np.swapaxes(obs, 0, 1)  # (num_traj, horizon, state_dim)
    act = np.array(act)
    act = np.swapaxes(act, 0, 1)  # (num_traj, horizon, action_dim)
    paths = dict(observations=obs, actions=act)
    return paths
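# Minimal usage sketch for policy_rollout above (not part of the original file).
# Assumptions: a MuJoCo-backed Gym env such as 'Hopper-v2' is installed, and the
# hypothetical _ToyPolicy / _ToyModel classes merely stand in for a real policy
# (needs .model and .log_std) and a fitted dynamics model (needs .forward(s, a)).
import torch
import torch.nn as nn


class _ToyPolicy:
    def __init__(self, obs_dim, act_dim):
        self.model = nn.Linear(obs_dim, act_dim)   # obs -> action
        self.log_std = torch.zeros(act_dim)        # exploration noise scale


class _ToyModel:
    def forward(self, s, a):
        return s  # placeholder dynamics: next state == current state


paths = policy_rollout(num_traj=4, env='Hopper-v2',
                       policy=_ToyPolicy(11, 3), fitted_model=_ToyModel(),
                       horizon=50, eval_mode=True, seed=0)
print(paths['observations'].shape)  # expected: (4, 50, 11)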
def __init__(
    self,
    env_name,
    policy,
    expert_paths=None,  # for the initial seeding
    epochs=5,
    batch_size=64,
    lr=1e-3,
    optimizer=None,
    loss_type='MSE',  # can be 'MLE' or 'MSE'
    seed=123,
    buffer_size=50,  # measured in number of trajectories
    mpc_params=None,
    save_logs=True,
):
    super().__init__(
        expert_paths=expert_paths,
        policy=policy,
        epochs=epochs,
        batch_size=batch_size,
        lr=lr,
        optimizer=optimizer,
        loss_type=loss_type,
        save_logs=save_logs,
    )
    self.expert_paths = [] if self.expert_paths is None else self.expert_paths
    self.buffer_size = buffer_size

    # For the MPC policy
    self.env = GymEnv(env_name)
    self.env.reset(seed=seed)
    if mpc_params is None:
        mean = np.zeros(self.env.action_dim)
        sigma = 1.0 * np.ones(self.env.action_dim)
        filter_coefs = [sigma, 0.05, 0.0, 0.0]
        mpc_params = dict(env=GymEnv(env_name),
                          H=10,
                          paths_per_cpu=25,
                          num_cpu=1,
                          kappa=10.0,
                          gamma=1.0,
                          mean=mean,
                          filter_coefs=filter_coefs,
                          seed=seed)
    else:
        mpc_params['env'] = GymEnv(env_name)
        mpc_params['seed'] = seed
    self.mpc_params = mpc_params
    self.mpc_policy = MPCActor(**mpc_params)
def sample_paths(
    num_traj,
    env,
    policy,  # mpc policy on fitted model
    horizon=1e6,
    eval_mode=True,
    base_seed=None,
    noise_level=0.1,
):
    # get the correct env behavior
    if type(env) == str:
        env = GymEnv(env)
    elif isinstance(env, GymEnv):
        env = env
    elif callable(env):
        env = env()
    else:
        print("Unsupported environment format")
        raise AttributeError

    if base_seed is not None:
        env.set_seed(base_seed)
    horizon = min(horizon, env.horizon)
    paths = []

    for ep in range(num_traj):
        env.reset()
        observations = []
        actions = []
        rewards = []
        env_infos = []

        t = 0
        done = False
        while t < horizon and done is False:
            obs = env.get_obs()
            ifo = env.get_env_infos()
            act = policy.get_action(obs)
            if eval_mode is False and type(act) != list:
                act = act + np.random.uniform(low=-noise_level,
                                              high=noise_level,
                                              size=act.shape[0])
            if type(act) == list:
                act = act[0] if eval_mode is False else act[1]['evaluation']
            next_obs, reward, done, _ = env.step(act)
            t = t + 1
            observations.append(obs)
            actions.append(act)
            rewards.append(reward)
            env_infos.append(ifo)

        path = dict(observations=np.array(observations),
                    actions=np.array(actions),
                    rewards=np.array(rewards),
                    terminated=done,
                    env_infos=tensor_utils.stack_tensor_dict_list(env_infos))
        paths.append(path)

    return paths
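# Minimal usage sketch for sample_paths above (not part of the original file).
# Assumptions: the wrapped env exposes get_obs()/get_env_infos() as used by the
# sampler, and _RandomPolicy is a hypothetical stand-in for the get_action(obs)
# interface; any real MPC or fitted-model policy can be substituted.
class _RandomPolicy:
    def __init__(self, env):
        self.action_space = env.action_space

    def get_action(self, obs):
        return self.action_space.sample()


env = GymEnv('Hopper-v2')
paths = sample_paths(num_traj=3, env=env, policy=_RandomPolicy(env),
                     horizon=100, eval_mode=True, base_seed=0)
print([p['rewards'].sum() for p in paths])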
def main():
    # get args
    args = get_args()

    # load env
    if args.include != "":
        exec("import " + args.include)
    e = GymEnv(args.env_name)

    # load policy
    policy = args.policy
    if args.policy == "":
        pol = MLP(e.spec, init_log_std=-1)
        mode = "exploration"
    else:
        pol = pickle.load(open(policy, 'rb'))
        mode = "evaluation"

    # Visualize policy
    if args.render == "onscreen":
        # On screen
        e.env.env.visualize_policy(pol,
                                   horizon=e.horizon,
                                   num_episodes=args.num_episodes,
                                   mode=mode)
    else:
        # Offscreen buffer
        e.env.env.visualize_policy_offscreen(pol,
                                             horizon=100,
                                             num_episodes=args.num_episodes,
                                             mode=mode,
                                             filename=args.filename)

    # Close envs
    e.env.env.close_env()
def demo_playback(env_name, demo_paths):
    e = GymEnv(env_name)
    e.reset()
    for path in demo_paths:
        e.set_env_state(path['init_state_dict'])
        actions = path['actions']
        for t in range(actions.shape[0]):
            e.step(actions[t])
            e.env.mj_render()
def pol_playback(env_name, num_trajs=100):
    e = GymEnv(env_name)
    e.reset()

    obs_ = []
    act_ = []
    rew_ = []
    term_ = []

    info_qpos_ = []
    info_qvel_ = []

    ravg = []

    for n in range(num_trajs):
        e.reset()
        returns = 0
        for t in range(e._horizon):
            obs = e.get_obs()
            obs_.append(obs)
            info_qpos_.append(e.env.data.qpos.ravel().copy())
            info_qvel_.append(e.env.data.qvel.ravel().copy())
            action = e.action_space.sample()
            act_.append(action)

            _, rew, _, info = e.step(action)
            returns += rew
            rew_.append(rew)

            done = False
            if t == (e._horizon - 1):
                done = True
            term_.append(done)

            # e.env.mj_render()  # this is much faster
            # e.render()
        ravg.append(returns)

    # write out hdf5 file
    obs_ = np.array(obs_).astype(np.float32)
    act_ = np.array(act_).astype(np.float32)
    rew_ = np.array(rew_).astype(np.float32)
    term_ = np.array(term_).astype(np.bool_)
    info_qpos_ = np.array(info_qpos_).astype(np.float32)
    info_qvel_ = np.array(info_qvel_).astype(np.float32)

    dataset = h5py.File("%s_random.hdf5" % env_name, "w")
    # dataset.create_dataset('observations', obs_.shape, dtype='f4')
    dataset.create_dataset("observations", data=obs_, compression="gzip")
    dataset.create_dataset("actions", data=act_, compression="gzip")
    dataset.create_dataset("rewards", data=rew_, compression="gzip")
    dataset.create_dataset("terminals", data=term_, compression="gzip")
    dataset.create_dataset("infos/qpos", data=info_qpos_, compression="gzip")
    dataset.create_dataset("infos/qvel", data=info_qvel_, compression="gzip")
def main(env_name, mode, path, iteration, job_name, horizon, run_no):
    env_kwargs = {}
    if path and ('.pickle' in path or 'pkl' in path):
        policy_path = path
    else:
        if job_name:
            path = os.path.join('../inverse_rl_dexterous_hand/training/Runs/',
                                job_name, 'run_' + str(run_no), 'iterations')
        if iteration:
            if iteration == 'last':
                checkpoint_file = get_last_iteration_checkpoint(path)
            else:
                checkpoint_file = "checkpoint_{}.pickle".format(iteration)
            policy_path = os.path.join(path, checkpoint_file)
        else:
            policy_path = os.path.join(path, "best_policy.pickle")

    if env_name is None:
        cfg_path = os.path.join(os.path.dirname(policy_path), "../..", "..",
                                "config.yaml")
        if not os.path.exists(cfg_path):
            cfg_path = os.path.join(os.path.dirname(cfg_path), "../..",
                                    "config.yaml")
        if not os.path.exists(cfg_path):
            cfg_path = None
        if cfg_path is not None:
            cfg = yamlreader.yaml_load(cfg_path)
            env_name = cfg['env']
            env_kwargs = cfg['env_kwargs']
        else:
            print("Config file not found, cannot infer environment name. "
                  "Please provide env_name parameter.")
            exit(1)

    e = GymEnv(env_name, **env_kwargs)
    print("Checkpoint path:", policy_path)
    policy = pickle.load(open(policy_path, 'rb'))
    if isinstance(policy, list):
        policy = policy[0]

    # render policy
    if horizon is None:
        horizon = e.horizon
    e.visualize_policy(policy, num_episodes=100, horizon=horizon, mode=mode)
def main(env_name, policy, mode, seed, episodes):
    e = GymEnv(env_name)
    e.set_seed(seed)
    if policy is not None:
        pi = pickle.load(open(policy, 'rb'))
    else:
        pi = MLP(e.spec, hidden_sizes=(32, 32), seed=seed, init_log_std=-1.0)
    # render policy
    e.visualize_policy(pi, num_episodes=episodes, horizon=e.horizon, mode=mode)
def main(file, seed, noise_level, num_episodes, config, device_path):
    exp_data = pickle.load(open(file, 'rb'))
    policy = exp_data['policy']
    model = exp_data['fitted_model']
    model = model[-1] if type(model) == list else model
    env_id = policy.env.env_id
    render = True

    # TODO(Aravind): Map to hardware if device_path is specified
    env = GymEnv(env_id)
    policy.env = env

    env.set_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if config is not None:
        try:
            with open(config, 'r') as f:
                config = eval(f.read())
        except:
            with open(config, 'r') as f:
                config = json.load(f)
        policy.plan_horizon = config['plan_horizon']
        policy.num_traj = config['plan_paths']
        policy.kappa = config['kappa']
        policy.filter_coefs = [
            config['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']
        ]
        policy.omega = config['omega'] if 'omega' in config.keys() else 0.0

    # TODO(Aravind): Implement capability to set predicted state for rendering purposes
    # evaluate_policy(env, policy, model, noise_level, real_step=False,
    #                 num_episodes=num_episodes, visualize=render)
    evaluate_policy(env, policy, model, noise_level, real_step=True,
                    num_episodes=num_episodes, visualize=render)

    # final close out
    env.reset()
def main(env_name, num_trajs):
    e = GymEnv(env_name)
    # render policy
    pol_playback(env_name, num_trajs)
def single_process(job):
    job_start_time = timer.time()

    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    os.chdir(cwd)
    dirpath = os.path.join(job['save_dir'], job['job_name'])
    os.makedirs(dirpath, exist_ok=True)

    # start job
    os.chdir(cwd)
    job_start_time = timer.time()
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    e = GymEnv(job['env_name'])

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip
    job_data_file = open(dirpath + '/job_data.txt', 'w')
    pprint.pprint(job, stream=job_data_file)
    job_data_file.close()

    # Make policy (???vik: sizes are hard coded)
    if 'init_policy' in job:
        policy = MLP(e.spec,
                     init_log_std=job['init_std'],
                     hidden_sizes=(32, 32),
                     seed=job['seed'])
        loaded_policy = pickle.load(open(job['init_policy'], 'rb'))
        loaded_params = loaded_policy.get_param_values()
        print('log std values in loaded policy = ')
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small
        # (say <-2.0, it is problem dependent and intuition should be used)
        # then we need to bump it up so that it explores
        # loaded_params[-policy.m:] += 1.0
        policy.set_param_values(loaded_params)
        del job['init_policy']
    else:
        policy = MLP(e.spec,
                     init_log_std=job['init_std'],
                     hidden_sizes=(32, 32),
                     seed=job['seed'])

    # Agent
    agent = NPG(e, policy, baseline, seed=job['seed'],
                normalized_step_size=job['normalized_step_size'],
                save_logs=job['save_logs'],
                FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=dirpath,
        agent=agent,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job['num_traj'],
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
def get_environment(env_name=None, **kwargs):
    if env_name is None:
        print("Need to specify environment name")
    e = GymEnv(env_name)
    # can make procedural modifications here if needed using kwargs
    return e
def get_environment(env_name=None):
    if env_name is None:
        print("Need to specify environment name")
    return GymEnv(env_name)
def train_agent(
    job_name,
    agent,
    seed=0,
    niter=101,
    gamma=0.995,
    gae_lambda=None,
    num_cpu=1,
    sample_mode='trajectories',
    num_traj=50,
    num_samples=50000,  # has precedence, used with sample_mode = 'samples'
    save_freq=10,
    evaluation_rollouts=None,
    plot_keys=['stoc_pol_mean'],
):
    np.random.seed(seed)
    if os.path.isdir(job_name) == False:
        os.mkdir(job_name)
    previous_dir = os.getcwd()
    os.chdir(job_name)  # important! we are now in the directory to save data
    if os.path.isdir('iterations') == False:
        os.mkdir('iterations')
    if os.path.isdir('logs') == False and agent.save_logs == True:
        os.mkdir('logs')
    best_policy = copy.deepcopy(agent.policy)
    best_perf = -1e8
    train_curve = best_perf * np.ones(niter)
    mean_pol_perf = 0.0
    e = GymEnv(agent.env.env_id)

    for i in range(niter):
        print("......................................................................................")
        print("ITERATION : %i " % i)

        if train_curve[i - 1] > best_perf:
            best_policy = copy.deepcopy(agent.policy)
            best_perf = train_curve[i - 1]

        N = num_traj if sample_mode == 'trajectories' else num_samples
        args = dict(N=N,
                    sample_mode=sample_mode,
                    gamma=gamma,
                    gae_lambda=gae_lambda,
                    num_cpu=num_cpu)
        stats = agent.train_step(**args)
        train_curve[i] = stats[0]

        if evaluation_rollouts is not None and evaluation_rollouts > 0:
            print("Performing evaluation rollouts ........")
            eval_paths = sample_paths_parallel(N=evaluation_rollouts,
                                               policy=agent.policy,
                                               num_cpu=num_cpu,
                                               env_name=e.env_id,
                                               mode='evaluation',
                                               pegasus_seed=seed)
            mean_pol_perf = np.mean([np.sum(path['rewards']) for path in eval_paths])
            if agent.save_logs:
                agent.logger.log_kv('eval_score', mean_pol_perf)

        if i % save_freq == 0 and i > 0:
            if agent.save_logs:
                agent.logger.save_log('logs/')
                make_train_plots(log=agent.logger.log,
                                 keys=plot_keys,
                                 save_loc='logs/')
            policy_file = 'policy_%i.pickle' % i
            baseline_file = 'baseline_%i.pickle' % i
            pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb'))
            pickle.dump(agent.baseline, open('iterations/' + baseline_file, 'wb'))
            pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))

        # print results to console
        if i == 0:
            result_file = open('results.txt', 'w')
            print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n")
            result_file.write("Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n")
            result_file.close()
        print("[ %s ] %4i %5.2f %5.2f %5.2f " %
              (timer.asctime(timer.localtime(timer.time())), i,
               train_curve[i], mean_pol_perf, best_perf))
        result_file = open('results.txt', 'a')
        result_file.write("%4i %5.2f %5.2f %5.2f \n" %
                          (i, train_curve[i], mean_pol_perf, best_perf))
        result_file.close()
        if agent.save_logs:
            print_data = sorted(
                filter(lambda v: np.asarray(v[1]).size == 1,
                       agent.logger.get_current_log().items()))
            print(tabulate(print_data))

    # final save
    pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
    if agent.save_logs:
        agent.logger.save_log('logs/')
        make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
    os.chdir(previous_dir)
def main(env_name, mode, num_trajs, clip=True):
    e = GymEnv(env_name)
    policy = "./policies/" + env_name + ".pickle"
    pi = pickle.load(open(policy, "rb"))
    # render policy
    pol_playback(env_name, pi, num_trajs, clip=clip)
env_ids = pickle.load(f)
f.close()

e_unshuffled = {}
for task_id in range(num_tasks):
    size_factors = size_factors_list[task_id]
    env_id = env_ids[task_id]
    gym.envs.register(
        id=env_id,
        entry_point='gym_extensions.continuous.mujoco.modified_half_cheetah:HalfCheetahModifiedBodyPartSizeEnv',
        max_episode_steps=1000,
        reward_threshold=3800.0,
        kwargs=dict(body_parts=['torso', 'fthigh', 'fshin', 'ffoot'],
                    size_scales=size_factors))
    # only do the environment here, so different files can create the same tasks
    e_unshuffled[task_id] = GymEnv(env_id)

for i in range(num_seeds):
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    job_name_lpgftw_seed = job_name_lpgftw + '/seed_{}'.format(i)

    f = open(job_name_lpgftw_seed + '/task_order.pickle', 'rb')
    task_order = pickle.load(f)
    f.close()
    e = {}
    for task_id in range(num_tasks):
        e[task_id] = e_unshuffled[task_order[task_id]]
def main(env_name, snapshot_file, mode, num_trajs, clip=True):
    e = GymEnv(env_name)
    pi = pickle.load(gzip.open(snapshot_file, 'rb'))
    import pdb; pdb.set_trace()
    pass
if job_data['sample_mode'] == 'trajectories':
    assert 'rl_num_traj' in job_data.keys()
    job_data['rl_num_samples'] = 0  # will be ignored
elif job_data['sample_mode'] == 'samples':
    assert 'rl_num_samples' in job_data.keys()
    job_data['rl_num_traj'] = 0  # will be ignored
else:
    print("Unknown sampling mode. Choose either trajectories or samples")
    exit()

# ===============================================================================
# Train Loop
# ===============================================================================

e = GymEnv(job_data['env'])
policy = MLP(e.spec,
             hidden_sizes=job_data['policy_size'],
             seed=job_data['seed'],
             init_log_std=job_data['init_log_std'])
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=job_data['vf_batch_size'],
                       hidden_sizes=job_data['vf_hidden_size'],
                       epochs=job_data['vf_epochs'],
                       learn_rate=job_data['vf_learn_rate'])

# Construct the algorithm
if job_data['algorithm'] == 'NPG':
    # Other hyperparameters (like number of CG steps) can be specified in config for pass through
    # or default hyperparameters will be used
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_linear import LinearPolicy
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import time as timer

SEED = 500

e = GymEnv('Hopper-v2')
policy = LinearPolicy(e.spec, seed=SEED)
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=64,
                       epochs=10,
                       learn_rate=1e-4)
agent = NPG(e,
            policy,
            baseline,
            normalized_step_size=0.1,
            seed=SEED,
            save_logs=True)

ts = timer.time()
train_agent(job_name='hopper_nominal',
            agent=agent,
            seed=SEED,
            niter=500,
            gamma=0.995,
            gae_lambda=0.97,
            num_cpu=4,
def policy_rollout(
    num_traj,
    env,
    policy,
    learned_model,
    init_state=None,
    eval_mode=False,
    horizon=1e6,
    env_kwargs=None,
    seed=None,
    s_min=None,
    s_max=None,
    a_min=None,
    a_max=None,
    large_value=float(1e2),
):
    # Only CPU rollouts are currently supported.
    # TODO(Aravind) : Extend GPU support

    # get the correct env behavior
    if type(env) == str:
        env = GymEnv(env)
    elif isinstance(env, GymEnv):
        env = env
    elif callable(env):
        env = env(**env_kwargs)
    else:
        print("Unsupported environment format")
        raise AttributeError

    if seed is not None:
        env.set_seed(seed)
        torch.manual_seed(seed)

    # get initial states
    if init_state is None:
        st = np.array([env.reset() for _ in range(num_traj)])
        st = torch.from_numpy(st).float()
    elif type(init_state) == np.ndarray:
        st = torch.from_numpy(init_state).float()
    elif type(init_state) == list:
        st = torch.from_numpy(np.array(init_state)).float()
    elif type(init_state) == torch.Tensor:
        assert init_state.device == 'cpu'
        st = init_state.float()  # use the provided tensor directly
    else:
        print("Unsupported format for init state")
        quit()

    # perform batched rollouts
    horizon = int(min(horizon, env.horizon))  # range() below requires an int
    obs = []
    act = []
    for t in range(horizon):
        at = policy.model.forward(st)
        if eval_mode is not True:
            at = at + torch.randn(at.shape) * torch.exp(policy.log_std)
        # clamp states and actions to avoid blowup
        at = enforce_tensor_bounds(at, a_min, a_max, large_value)
        stp1 = learned_model.forward(st, at)
        stp1 = enforce_tensor_bounds(stp1, s_min, s_max, large_value)
        obs.append(st.to('cpu').data.numpy())
        act.append(at.to('cpu').data.numpy())
        st = stp1

    obs = np.array(obs)
    obs = np.swapaxes(obs, 0, 1)  # (num_traj, horizon, state_dim)
    act = np.array(act)
    act = np.swapaxes(act, 0, 1)  # (num_traj, horizon, action_dim)
    paths = dict(observations=obs, actions=act)
    return paths
import time as timer
import pickle

SEED = 100

# ------------------------------
# Get demonstrations
print("========================================")
print("Collecting expert demonstrations")
print("========================================")
demo_paths = pickle.load(open('../demonstrations/relocate-v0_demos.pickle', 'rb'))

# ------------------------------
# Train BC
e = GymEnv('relocate-v0')
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED)
bc_agent = BC(demo_paths, policy=policy, epochs=5, batch_size=32, lr=1e-3)

ts = timer.time()
print("========================================")
print("Running BC with expert demonstrations")
print("========================================")
bc_agent.train()
print("========================================")
print("BC training complete !!!")
print("time taken = %f" % (timer.time() - ts))
print("========================================")

score = e.evaluate_policy(policy, num_episodes=10, mean_action=True)
print("Score with behavior cloning = %f" % score[0][0])
def single_process(job):
    job_start_time = timer.time()

    # Allow process to parallelize things internally
    curr_proc = mp.current_process()
    curr_proc.daemon = False

    # Create a directory for the job results.
    job_dir = os.path.join(job['output_dir'])
    if not os.path.isdir(job_dir):
        os.mkdir(job_dir)

    # start job
    job_start_time = timer.time()
    print('Started New Job : ', job['job_name'], '=======================')
    print('Job specifications : \n', job)

    # Make Env
    env_name = job['env_name']
    # adept_envs.global_config.set_config(env_name, {
    #     'robot_params': job['robot'],
    #     **job.get('env_params', {}),
    # })
    e = GymEnv(env_name)

    # Make baseline
    baseline = MLPBaseline(e.spec)

    # save job details
    job['horizon'] = e.horizon
    job['ctrl_timestep'] = e.env.env.dt
    job['sim_timestep'] = e.env.env.model.opt.timestep
    # job['sim_skip'] = e.env.env.skip

    with open(os.path.join(job_dir, 'job_data.txt'), 'w') as job_data_file:
        pprint.pprint(job, stream=job_data_file)

    if 'init_policy' in job:
        policy = MLP(e.spec,
                     init_log_std=job['init_std'],
                     hidden_sizes=(32, 32),
                     seed=job['seed'])
        loaded_policy = pickle.load(open(job['init_policy'], 'rb'))
        loaded_params = loaded_policy.get_param_values()
        print("log std values in loaded policy = ")
        print(loaded_params[-policy.m:])
        # NOTE: if the log std is too small
        # (say <-2.0, it is problem dependent and intuition should be used)
        # then we need to bump it up so that it explores
        loaded_params[-policy.m:] += job['init_std']
        policy.set_param_values(loaded_params)
        del job['init_policy']
    else:
        policy = MLP(
            e.spec,
            init_log_std=job['init_std'],
            hidden_sizes=job['hidden_sizes'],
            # hidden_sizes=(32, 32),
            seed=job['seed'])

    # Agent
    agent = NPG(
        e,
        policy,
        baseline,
        seed=job['seed'],
        normalized_step_size=job['normalized_step_size'],
        save_logs=job['save_logs'],
        FIM_invert_args=job['FIM_invert_args'])

    # Train Agent
    train_agent(
        job_name=job['job_name'],
        agent=agent,
        # save_dir=job_dir,
        seed=job['seed'],
        niter=job['niter'],
        gamma=job['gamma'],
        gae_lambda=job['gae_lambda'],
        num_cpu=job['num_cpu'],
        sample_mode=job['sample_mode'],
        num_traj=job.get('num_traj'),
        num_samples=job.get('num_samples'),
        evaluation_rollouts=job['evaluation_rollouts'],
        save_freq=job['save_freq'],
        plot_keys={'stoc_pol_mean', 'stoc_pol_std'},
    )

    total_job_time = timer.time() - job_start_time
    print('Job', job['job_name'],
          'took %f seconds ==============' % total_job_time)
    return total_job_time
def pol_playback(env_name, num_trajs=100):
    e = GymEnv(env_name)
    e.reset()

    obs_ = []
    act_ = []
    rew_ = []
    term_ = []
    timeout_ = []

    info_qpos_ = []
    info_qvel_ = []
    info_env_state_ = []

    ravg = []

    for n in range(num_trajs):
        e.reset()
        returns = 0
        for t in range(e._horizon):
            obs = e.get_obs()
            obs_.append(obs)
            info_qpos_.append(e.env.data.qpos.ravel().copy())
            info_qvel_.append(e.env.data.qvel.ravel().copy())
            info_env_state_.append(e.get_env_state())
            action = e.action_space.sample()
            act_.append(action)

            _, rew, done, info = e.step(action)
            returns += rew
            rew_.append(rew)

            if t == (e._horizon - 1):
                timeout = True
                done = False
            else:
                timeout = False

            term_.append(done)
            timeout_.append(timeout)

            if done or timeout:
                e.reset()

            # e.env.mj_render()  # this is much faster
            # e.render()
        ravg.append(returns)

    # write out hdf5 file
    obs_ = np.array(obs_).astype(np.float32)
    act_ = np.array(act_).astype(np.float32)
    rew_ = np.array(rew_).astype(np.float32)
    term_ = np.array(term_).astype(np.bool_)
    timeout_ = np.array(timeout_).astype(np.bool_)
    info_qpos_ = np.array(info_qpos_).astype(np.float32)
    info_qvel_ = np.array(info_qvel_).astype(np.float32)

    dataset = h5py.File('%s_random.hdf5' % env_name, 'w')
    # dataset.create_dataset('observations', obs_.shape, dtype='f4')
    dataset.create_dataset('observations', data=obs_, compression='gzip')
    dataset.create_dataset('actions', data=act_, compression='gzip')
    dataset.create_dataset('rewards', data=rew_, compression='gzip')
    dataset.create_dataset('terminals', data=term_, compression='gzip')
    dataset.create_dataset('timeouts', data=timeout_, compression='gzip')
    dataset.create_dataset('infos/qpos', data=info_qpos_, compression='gzip')
    dataset.create_dataset('infos/qvel', data=info_qvel_, compression='gzip')
    dataset.create_dataset('infos/env_state',
                           data=np.array(info_env_state_, dtype=np.float32),
                           compression='gzip')
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_linear import LinearPolicy
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import time as timer

SEED = 500

e = GymEnv('Walker2d-v2')
policy = LinearPolicy(e.spec, seed=SEED)
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=64,
                       epochs=2,
                       learn_rate=1e-3)
agent = NPG(e,
            policy,
            baseline,
            normalized_step_size=0.1,
            seed=SEED,
            save_logs=True)

ts = timer.time()
train_agent(job_name='walker_nominal',
            agent=agent,
            seed=SEED,
            niter=500,
            gamma=0.995,
            gae_lambda=0.97,
            num_cpu=4,
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP
from mjrl.baselines.quadratic_baseline import QuadraticBaseline
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import mjrl.envs
import time as timer

SEED = 500

e = GymEnv('mjrl_point_mass-v0')
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED)
baseline = QuadraticBaseline(e.spec)
agent = NPG(e,
            policy,
            baseline,
            normalized_step_size=0.2,
            seed=SEED,
            save_logs=True)

ts = timer.time()
train_agent(job_name='vis_exp',
            agent=agent,
            seed=SEED,
            niter=30,
            gamma=0.95,
            gae_lambda=0.97,
            num_cpu=1,
            sample_mode='trajectories',
            num_traj=100,
            save_freq=5,
class MBAC(BC):
    def __init__(
        self,
        env_name,
        policy,
        expert_paths=None,  # for the initial seeding
        epochs=5,
        batch_size=64,
        lr=1e-3,
        optimizer=None,
        loss_type='MSE',  # can be 'MLE' or 'MSE'
        seed=123,
        buffer_size=50,  # measured in number of trajectories
        mpc_params=None,
        save_logs=True,
    ):
        super().__init__(
            expert_paths=expert_paths,
            policy=policy,
            epochs=epochs,
            batch_size=batch_size,
            lr=lr,
            optimizer=optimizer,
            loss_type=loss_type,
            save_logs=save_logs,
        )
        self.expert_paths = [] if self.expert_paths is None else self.expert_paths
        self.buffer_size = buffer_size

        # For the MPC policy
        self.env = GymEnv(env_name)
        self.env.reset(seed=seed)
        if mpc_params is None:
            mean = np.zeros(self.env.action_dim)
            sigma = 1.0 * np.ones(self.env.action_dim)
            filter_coefs = [sigma, 0.05, 0.0, 0.0]
            mpc_params = dict(env=GymEnv(env_name),
                              H=10,
                              paths_per_cpu=25,
                              num_cpu=1,
                              kappa=10.0,
                              gamma=1.0,
                              mean=mean,
                              filter_coefs=filter_coefs,
                              seed=seed)
        else:
            mpc_params['env'] = GymEnv(env_name)
            mpc_params['seed'] = seed
        self.mpc_params = mpc_params
        self.mpc_policy = MPCActor(**mpc_params)

    def collect_paths(self, num_traj=10, mode='policy', horizon=None, render=False):
        horizon = self.env.horizon if horizon is None else horizon
        paths = []
        for i in tqdm(range(num_traj)):
            self.env.reset()
            obs, act_pi, act_mpc, rew, states = [], [], [], [], []
            for t in range(horizon):
                o = self.env.get_obs()
                s = self.env.get_env_state()
                a_pi = self.policy.get_action(o)[0]
                a_mpc = self.mpc_policy.get_action(s)
                a = a_pi if mode == 'policy' else a_mpc
                next_o, r, done, _ = self.env.step(a)
                if render:
                    self.env.render()
                # store data
                obs.append(o)
                rew.append(r)
                states.append(s)
                act_pi.append(a_pi)
                act_mpc.append(a_mpc)
                # kill if done
                if done:
                    break
            path = dict(
                observations=np.array(obs),
                actions=np.array(act_pi),
                expert_actions=np.array(act_mpc),
                rewards=np.array(rew),
                states=states,
            )
            paths.append(path)
        return paths

    def add_paths_to_buffer(self, paths):
        for path in paths:
            self.expert_paths.append(path)
        if len(self.expert_paths) > self.buffer_size:
            # keep recent trajectories
            # TODO: Also consider keeping best performing trajectories
            self.expert_paths = self.expert_paths[-self.buffer_size:]
        if self.save_logs:
            self.logger.log_kv('buffer_size', len(self.expert_paths))

    def get_data_from_buffer(self):
        observations = np.concatenate(
            [path["observations"] for path in self.expert_paths])
        expert_actions = np.concatenate(
            [path["expert_actions"] for path in self.expert_paths])
        observations = torch.Tensor(observations).float()
        expert_actions = torch.Tensor(expert_actions).float()
        data = dict(observations=observations, expert_actions=expert_actions)
        return data

    def train_step(self, num_traj=10, **kwargs):
        # collect data using policy actions
        # fit policy to expert actions on these states
        new_paths = self.collect_paths(num_traj, mode='policy')
        self.add_paths_to_buffer(new_paths)
        data = self.get_data_from_buffer()
        self.fit(data, **kwargs)
        stoc_pol_perf = np.mean([np.sum(path['rewards']) for path in new_paths])
        return stoc_pol_perf
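# Hedged usage sketch for MBAC above (not part of the original file). The env
# name, policy construction, and hyperparameters below are assumptions; any
# registered env and mjrl policy with get_action() should work, provided the
# MPCActor defaults built in __init__ are compatible with that env.
import mjrl.envs  # registers mjrl_point_mass-v0
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP

env_name = 'mjrl_point_mass-v0'
policy = MLP(GymEnv(env_name).spec, hidden_sizes=(32, 32), seed=123)
agent = MBAC(env_name, policy, epochs=2, batch_size=64,
             buffer_size=20, seed=123)
for it in range(5):
    score = agent.train_step(num_traj=5)
    print("iteration %d, mean return %.2f" % (it, score))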
def do_rollout(
        num_traj,
        env,
        policy,
        eval_mode=False,
        horizon=1e6,
        base_seed=None,
        env_kwargs=None,
        init_states_per_cpu=None):
    """
    :param num_traj:            number of trajectories (int)
    :param env:                 environment (env class, str with env_name, or factory function)
    :param policy:              policy to use for action selection
    :param eval_mode:           use evaluation mode for action computation (bool)
    :param horizon:             max horizon length for rollout (<= env.horizon)
    :param base_seed:           base seed for rollouts (int)
    :param env_kwargs:          dictionary with parameters, will be passed to env generator
    :param init_states_per_cpu: list of init states to initialize from for fixed evaluation
    :return:
    """

    # get the correct env behavior
    if type(env) == str:
        if isinstance(env_kwargs, dict):
            env = GymEnv(env, **env_kwargs)
        else:
            env = GymEnv(env)
    elif isinstance(env, GymEnv):
        env = env
    elif callable(env):
        env = env(**env_kwargs)
    else:
        print("Unsupported environment format")
        raise AttributeError

    if base_seed is not None:
        env.set_seed(base_seed)
        np.random.seed(base_seed)
    else:
        np.random.seed()
    horizon = min(horizon, env.horizon)
    paths = []

    for ep in range(num_traj):
        # seeding
        if base_seed is not None:
            seed = base_seed + ep
            env.set_seed(seed)
            np.random.seed(seed)

        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []

        o = env.reset()
        if init_states_per_cpu is not None:
            o = env.set_env_state(init_states_per_cpu[ep])
            assert o is not None, ('set_env_state of env ' + env.env_id +
                                   ' returns None, should return observation')
        done = False
        t = 0

        while t < horizon and done != True:
            a, agent_info = policy.get_action(o)
            if eval_mode:
                a = agent_info['evaluation']
            env_info_base = env.get_env_infos()
            next_o, r, done, env_info_step = env.step(a)
            # below is important to ensure correct env_infos for the timestep
            env_info = env_info_step if env_info_base == {} else env_info_base
            observations.append(o)
            actions.append(a)
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            o = next_o
            t += 1

        path = dict(
            observations=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            terminated=done)
        paths.append(path)

    del (env)
    return paths
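# Hedged usage sketch for do_rollout above (not part of the original file),
# assuming a MuJoCo-backed Gym env is installed; the MLP policy construction
# mirrors how policies are built elsewhere in these snippets.
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP

env = GymEnv('Hopper-v2')
policy = MLP(env.spec, hidden_sizes=(32, 32), seed=123)
paths = do_rollout(num_traj=5, env=env, policy=policy,
                   eval_mode=True, horizon=200, base_seed=123)
print(len(paths), paths[0]['observations'].shape)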
e_unshuffled = {}
for task_id, (env_id, entry_point) in enumerate(env_dict.items()):
    kwargs = {'obs_type': 'plain'}
    if env_id == 'reach-v1':
        kwargs['task_type'] = 'reach'
    elif env_id == 'push-v1':
        kwargs['task_type'] = 'push'
    elif env_id == 'pick-place-v1':
        kwargs['task_type'] = 'pick_place'
    gym.envs.register(id=env_id,
                      entry_point='metaworld.envs.mujoco.sawyer_xyz.' + entry_point,
                      max_episode_steps=150,
                      kwargs=kwargs)
    e_unshuffled[task_id] = GymEnv(env_id)

for i in range(num_seeds):
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    job_name_er_seed = job_name_er + '/seed_{}'.format(i)

    e = {}
    task_order = np.random.permutation(num_tasks)
    for task_id in range(num_tasks):
        e[task_id] = e_unshuffled[task_order[task_id]]

    baseline_mtl = {}
    forward_transfer_results = {}
    for task_id in range(num_tasks):
        iterdir = job_name_er_seed + '/iterations/task_{}/'.format(task_id)
        f = open(iterdir + 'policy_199.pickle', 'rb')
job_data = eval(f.read())
assert 'algorithm' in job_data.keys()
assert any([job_data['algorithm'] == a for a in ['NPG', 'BCRL', 'DAPG']])
job_data['lam_0'] = 0.0 if 'lam_0' not in job_data.keys() else job_data['lam_0']
job_data['lam_1'] = 0.0 if 'lam_1' not in job_data.keys() else job_data['lam_1']
EXP_FILE = JOB_DIR + '/job_config.json'
with open(EXP_FILE, 'w') as f:
    json.dump(job_data, f, indent=4)

# ===============================================================================
# Train Loop
# ===============================================================================

e = GymEnv(job_data['env'])
policy = MLP(e.spec,
             hidden_sizes=job_data['policy_size'],
             seed=job_data['seed'])
baseline = MLPBaseline(e.spec,
                       reg_coef=1e-3,
                       batch_size=job_data['vf_batch_size'],
                       epochs=job_data['vf_epochs'],
                       learn_rate=job_data['vf_learn_rate'])

# Get demonstration data if necessary and behavior clone
if job_data['algorithm'] != 'NPG':
    print("========================================")
    print("Collecting expert demonstrations")
    print("========================================")
    demo_paths = pickle.load(open(job_data['demo_file'], 'rb'))