def main(config):
    baseline = LinearFeatureBaseline()
    # env = normalize(HalfCheetahRandDirecEnv())
    env = HopperRandParamsEnv(3.5)

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        # rollouts_per_meta_task acts as the per-task batch size
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )

    algo = PPOMAML(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_ppo_steps'],
        num_minibatches=config['num_minibatches'],
        clip_eps=config['clip_eps'],
        clip_outer=config['clip_outer'],
        target_outer_step=config['target_outer_step'],
        target_inner_step=config['target_inner_step'],
        init_outer_kl_penalty=config['init_outer_kl_penalty'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=config['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
        anneal_factor=config['anneal_factor'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        # num_inner_grad_steps is also passed to the algo above; the two must agree
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )
    trainer.train()
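# A minimal sketch of a config that could drive main() above. The keys are
# exactly the ones main() reads; the values are illustrative assumptions,
# not tuned hyperparameters.
config = {
    'meta_batch_size': 40,           # tasks sampled per meta-iteration
    'hidden_sizes': (64, 64),
    'rollouts_per_meta_task': 20,    # per-task batch size
    'max_path_length': 200,
    'parallel': True,
    'discount': 0.99,
    'gae_lambda': 1.0,
    'normalize_adv': True,
    'positive_adv': False,
    'inner_lr': 0.1,
    'num_inner_grad_steps': 1,
    'learning_rate': 1e-3,
    'num_ppo_steps': 5,
    'num_minibatches': 1,
    'clip_eps': 0.3,
    'clip_outer': True,
    'target_outer_step': 0.0,
    'target_inner_step': 1e-2,
    'init_outer_kl_penalty': 0.0,
    'init_inner_kl_penalty': 1e-3,
    'adaptive_outer_kl_penalty': False,
    'adaptive_inner_kl_penalty': True,
    'anneal_factor': 1.0,
    'n_itr': 1000,
}

if __name__ == '__main__':
    main(config)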
def main(config):
    baseline = LinearFeatureBaseline()
    env = normalize(HalfCheetahRandDirecEnv())

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        # rollouts_per_meta_task acts as the per-task batch size
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )

    algo = TRPOMAML(
        policy=policy,
        step_size=config['step_size'],
        inner_type=config['inner_type'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        inner_lr=config['inner_lr'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        # num_inner_grad_steps is also passed to the algo above; the two must agree
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )
    trainer.train()
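# The TRPO variant reads a slightly different config: 'step_size' and
# 'inner_type' replace the PPO-specific keys. Again, illustrative values only
# ('likelihood_ratio' mirrors the VPGMAML snippet further below).
trpo_config = {
    'meta_batch_size': 40,
    'hidden_sizes': (64, 64),
    'rollouts_per_meta_task': 20,
    'max_path_length': 100,
    'parallel': True,
    'discount': 0.99,
    'gae_lambda': 1.0,
    'normalize_adv': True,
    'positive_adv': False,
    'step_size': 0.01,
    'inner_type': 'likelihood_ratio',
    'inner_lr': 0.1,
    'num_inner_grad_steps': 1,
    'n_itr': 1000,
}

if __name__ == '__main__':
    main(trpo_config)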
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir,
                     format_strs=['csv', 'stdout', 'log'],
                     snapshot_mode='last')
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default():
        # Instantiate classes
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()
        env = kwargs['env']()  # TODO: decide whether to wrap the env (e.g. normalize)

        policy = MetaGaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=kwargs['meta_batch_size'],
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )

        dynamics_model = MLPDynamicsEnsemble(
            'dynamics-ensemble',
            env=env,
            num_models=kwargs['num_models'],
            # NOTE: the misspelled keys ('dyanmics_*', 'rolling_average_persitency')
            # match the spelling used in the experiment config.
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
            rolling_average_persitency=kwargs['rolling_average_persitency'],
        )

        env_sampler = MetaSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=kwargs['real_env_rollouts_per_meta_task'],
            meta_batch_size=kwargs['meta_batch_size'],
            max_path_length=kwargs['max_path_length'],
            parallel=kwargs['parallel'],
        )

        model_sampler = MBMPOSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
            meta_batch_size=kwargs['meta_batch_size'],
            max_path_length=kwargs['max_path_length'],
            dynamics_model=dynamics_model,
            deterministic=kwargs['deterministic'],
        )

        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        model_sample_processor = MAMLSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = TRPOMAML(
            policy=policy,
            step_size=kwargs['step_size'],
            inner_type=kwargs['inner_type'],
            inner_lr=kwargs['inner_lr'],
            meta_batch_size=kwargs['meta_batch_size'],
            num_inner_grad_steps=kwargs['num_inner_grad_steps'],
            exploration=kwargs['exploration'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            model_sample_processor=model_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            num_rollouts_per_iter=int(kwargs['meta_batch_size'] * kwargs['fraction_meta_batch_size']),
            n_itr=kwargs['n_itr'],
            num_inner_grad_steps=kwargs['num_inner_grad_steps'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            meta_steps_per_iter=kwargs['meta_steps_per_iter'],
            sample_from_buffer=kwargs['sample_from_buffer'],
            sess=sess,
        )
        trainer.train()
env = normalize(AntRandGoalEnv())

sampler = MAMLSampler(
    env=env,
    policy=policy,
    rollouts_per_meta_task=BATCH_SIZE,
    meta_batch_size=META_BATCH_SIZE,
    max_path_length=PATH_LENGTH,
    parallel=True,
    envs_per_task=20,
)

sample_processor = MAMLSampleProcessor(
    baseline=baseline,
    discount=0.99,
    gae_lambda=1,
    normalize_adv=True,
    positive_adv=False,
)

# Any MAML algorithm works for this purpose; VPGMAML is simply the lightest one.
algo = VPGMAML(
    policy=policy,
    inner_lr=0.1,
    meta_batch_size=META_BATCH_SIZE,
    inner_type='likelihood_ratio',
    num_inner_grad_steps=NUM_INNER_GRAD_STEPS,
)

# Collect variables that have not been initialized yet (`sess` is created
# earlier in this script).
uninit_vars = [var for var in tf.global_variables()
               if not sess.run(tf.is_variable_initialized(var))]
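# For reference, a self-contained sketch of the TF1 idiom the fragment above
# builds toward: initialize only the still-uninitialized globals, so any
# variables already restored (e.g. a loaded policy) keep their values.
# `demo_sess` and `w` are hypothetical stand-ins for illustration.
import tensorflow as tf

with tf.Session() as demo_sess:
    w = tf.get_variable('w', shape=(2,))  # stand-in for policy variables
    uninit = [v for v in tf.global_variables()
              if not demo_sess.run(tf.is_variable_initialized(v))]
    demo_sess.run(tf.variables_initializer(uninit))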
class WorkerPolicy(Worker):
    def __init__(self, num_inner_grad_steps, sampler_str='mbmpo'):
        super().__init__()
        self.num_inner_grad_steps = num_inner_grad_steps
        self.policy = None
        self.baseline = None
        self.model_sampler = None
        self.model_sample_processor = None
        self.algo = None
        self.sampler_str = sampler_str

    def construct_from_feed_dict(
            self,
            policy_pickle,
            env_pickle,
            baseline_pickle,
            dynamics_model_pickle,
            feed_dict,
    ):
        from meta_mb.samplers.mbmpo_samplers.mbmpo_sampler import MBMPOSampler
        from meta_mb.samplers.bptt_samplers.meta_bptt_sampler import MetaBPTTSampler
        from meta_mb.samplers.meta_samplers.maml_sample_processor import MAMLSampleProcessor
        from meta_mb.meta_algos.trpo_maml import TRPOMAML

        env = pickle.loads(env_pickle)
        policy = pickle.loads(policy_pickle)
        baseline = pickle.loads(baseline_pickle)
        dynamics_model = pickle.loads(dynamics_model_pickle)

        self.policy = policy
        self.baseline = baseline

        if self.sampler_str == 'mbmpo':
            self.model_sampler = MBMPOSampler(env=env,
                                              policy=policy,
                                              dynamics_model=dynamics_model,
                                              **feed_dict['model_sampler'])
        elif self.sampler_str == 'bptt':
            self.model_sampler = MetaBPTTSampler(env=env,
                                                 policy=policy,
                                                 dynamics_model=dynamics_model,
                                                 **feed_dict['model_sampler'])
        else:
            raise NotImplementedError

        self.model_sample_processor = MAMLSampleProcessor(
            baseline=baseline, **feed_dict['model_sample_processor'])
        self.algo = TRPOMAML(policy=policy, **feed_dict['algo'])

    def prepare_start(self):
        dynamics_model = pickle.loads(self.queue.get())
        self.model_sampler.dynamics_model = dynamics_model
        if hasattr(self.model_sampler, 'vec_env'):
            self.model_sampler.vec_env.dynamics_model = dynamics_model
        self.step()
        self.push()

    def step(self):
        time_step = time.time()

        # --------------- MAML steps ---------------
        self.policy.switch_to_pre_update()  # switch to the pre-update policy

        all_samples_data = []
        for step in range(self.num_inner_grad_steps + 1):
            if self.verbose:
                logger.log("Policy Adaptation-Step %d" % step)

            # -------------------- Sampling --------------------
            paths = self.model_sampler.obtain_samples(log=True,
                                                      log_prefix='Policy-',
                                                      buffer=None)

            # ----------------- Processing Samples -----------------
            samples_data = self.model_sample_processor.process_samples(
                paths, log='all', log_prefix='Policy-')
            all_samples_data.append(samples_data)

            self.log_diagnostics(sum(list(paths.values()), []), prefix='Policy-')

            # ------------------- Inner Policy Update -------------------
            # The final iteration only collects post-update samples;
            # no further adaptation step is taken.
            if step < self.num_inner_grad_steps:
                self.algo._adapt(samples_data)

        # ------------------ Outer Policy Update ------------------
        if self.verbose:
            logger.log("Policy is optimizing...")
        # optimize_policy needs the samples from every adaptation step so it
        # can construct the graph for meta-optimization.
        self.algo.optimize_policy(all_samples_data, prefix='Policy-')

        time_step = time.time() - time_step
        self.policy = self.model_sampler.policy
        logger.logkv('Policy-TimeStep', time_step)

    def _synch(self, dynamics_model_state_pickle):
        time_synch = time.time()
        if self.verbose:
            logger.log('Policy is synchronizing...')
        dynamics_model_state = pickle.loads(dynamics_model_state_pickle)
        assert isinstance(dynamics_model_state, dict)
        self.model_sampler.dynamics_model.set_shared_params(dynamics_model_state)
        if hasattr(self.model_sampler, 'vec_env'):
            self.model_sampler.vec_env.dynamics_model.set_shared_params(
                dynamics_model_state)
        time_synch = time.time() - time_synch
        logger.logkv('Policy-TimeSynch', time_synch)

    def push(self):
        time_push = time.time()
        policy_state_pickle = pickle.dumps(self.policy.get_shared_param_values())
        assert policy_state_pickle is not None
        # Keep the downstream queue short so the consumer always sees a
        # reasonably fresh policy.
        while self.queue_next.qsize() > 5:
            try:
                logger.log('Policy is off-loading data from queue_next...')
                _ = self.queue_next.get_nowait()
            except Empty:
                # Very rarely reached: another consumer drained the queue first.
                break
        self.queue_next.put(policy_state_pickle)
        time_push = time.time() - time_push
        logger.logkv('Policy-TimePush', time_push)

    def log_diagnostics(self, paths, prefix):
        self.policy.log_diagnostics(paths, prefix)
        self.baseline.log_diagnostics(paths, prefix)
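# The control flow of WorkerPolicy.step() above is the usual MAML loop: adapt
# a copy of the meta-parameters per task with inner gradient steps, then
# meta-optimize using the post-adaptation data. Below is a self-contained
# first-order toy version on quadratic losses; every name is a hypothetical
# stand-in for the TRPO/sampler machinery, not meta_mb's actual implementation.
import numpy as np

def toy_task_grad(theta, task_target):
    # Gradient of the per-task loss 0.5 * ||theta - task_target||^2.
    return theta - task_target

def first_order_maml_step(theta, task_targets, inner_lr=0.1, outer_lr=0.05,
                          num_inner_grad_steps=1):
    post_update_grads = []
    for target in task_targets:
        adapted = theta.copy()
        for _ in range(num_inner_grad_steps):  # inner adaptation steps
            adapted = adapted - inner_lr * toy_task_grad(adapted, target)
        # Gradient evaluated at the adapted parameters (post-update data).
        post_update_grads.append(toy_task_grad(adapted, target))
    # Outer (meta) update: average the post-update gradients across tasks.
    return theta - outer_lr * np.mean(post_update_grads, axis=0)

theta = np.zeros(2)
tasks = [np.array([1.0, 0.0]), np.array([0.0, 1.0])]
for _ in range(200):
    theta = first_order_maml_step(theta, tasks)
print(theta)  # approaches the mean of the task optima, [0.5, 0.5]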
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap',
                     snapshot_gap=50)
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default():
        # Instantiate classes
        set_seed(kwargs['seed'])

        baseline = kwargs['baseline']()
        env = normalize(kwargs['env']())  # TODO: decide on additional env wrappers

        policy = MetaGaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=kwargs['meta_batch_size'],
            hidden_sizes=kwargs['hidden_sizes'],
            learn_std=kwargs['learn_std'],
            hidden_nonlinearity=kwargs['hidden_nonlinearity'],
            output_nonlinearity=kwargs['output_nonlinearity'],
        )
        # TODO: optionally load a pre-trained policy here

        sampler = MetaSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
            meta_batch_size=kwargs['meta_batch_size'],
            max_path_length=kwargs['max_path_length'],
            parallel=kwargs['parallel'],
        )

        sample_processor = MAMLSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = TRPOMAML(
            policy=policy,
            step_size=kwargs['step_size'],
            inner_type=kwargs['inner_type'],
            inner_lr=kwargs['inner_lr'],
            meta_batch_size=kwargs['meta_batch_size'],
            num_inner_grad_steps=kwargs['num_inner_grad_steps'],
            exploration=kwargs['exploration'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=kwargs['n_itr'],
            num_inner_grad_steps=kwargs['num_inner_grad_steps'],
            sess=sess,
        )
        trainer.train()
def run_experiment(**config):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap',
                     snapshot_gap=50)
    json.dump(config,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    # Instantiate classes
    set_seed(config['seed'])

    baseline = config['baseline']()
    env = normalize(config['env']())  # TODO: decide on additional env wrappers

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
        learn_std=config['learn_std'],
        hidden_nonlinearity=config['hidden_nonlinearity'],
        output_nonlinearity=config['output_nonlinearity'],
    )
    # TODO: optionally load a pre-trained policy here

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )

    algo = PPOMAML(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_ppo_steps'],
        num_minibatches=config['num_minibatches'],
        clip_eps=config['clip_eps'],
        clip_outer=config['clip_outer'],
        target_outer_step=config['target_outer_step'],
        target_inner_step=config['target_inner_step'],
        init_outer_kl_penalty=config['init_outer_kl_penalty'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=config['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
        anneal_factor=config['anneal_factor'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )
    trainer.train()
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir,
                     format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap',
                     snapshot_gap=50)
    json.dump(kwargs,
              open(exp_dir + '/params.json', 'w'),
              indent=2,
              sort_keys=True,
              cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])

    baseline = kwargs['baseline']()
    env = normalize(kwargs['env']())  # TODO: decide on additional env wrappers

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['policy_hidden_sizes'],
        learn_std=kwargs['policy_learn_std'],
        hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
        output_nonlinearity=kwargs['policy_output_nonlinearity'],
    )

    dynamics_model = MLPDynamicsEnsemble(
        'dynamics-ensemble',
        env=env,
        num_models=kwargs['num_models'],
        # NOTE: the misspelled 'dyanmics_*' keys match the spelling used in
        # the experiment config.
        hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
        hidden_sizes=kwargs['dynamics_hidden_sizes'],
        output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
        learning_rate=kwargs['dynamics_learning_rate'],
        batch_size=kwargs['dynamics_batch_size'],
        buffer_size=kwargs['dynamics_buffer_size'],
    )

    env_sampler = SingleMetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['real_env_rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
    )

    model_sampler = MBMPOSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        dynamics_model=dynamics_model,
    )

    dynamics_sample_processor = ModelSampleProcessor(
        baseline=baseline,
        discount=kwargs['discount'],
        gae_lambda=kwargs['gae_lambda'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )

    model_sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=kwargs['discount'],
        gae_lambda=kwargs['gae_lambda'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )

    algo = TRPOMAML(
        policy=policy,
        step_size=kwargs['step_size'],
        inner_type=kwargs['inner_type'],
        inner_lr=kwargs['inner_lr'],
        meta_batch_size=kwargs['meta_batch_size'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        exploration=kwargs['exploration'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        model_sampler=model_sampler,
        env_sampler=env_sampler,
        model_sample_processor=model_sample_processor,
        dynamics_sample_processor=dynamics_sample_processor,
        dynamics_model=dynamics_model,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
        log_real_performance=kwargs['log_real_performance'],
        meta_steps_per_iter=kwargs['meta_steps_per_iter'],
        initial_random_samples=True,
        sample_from_buffer=True,
    )
    trainer.train()