def run_experiment(**kwargs):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:
        exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
        logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                         snapshot_mode='last')
        json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2,
                  sort_keys=True, cls=ClassEncoder)

        # Instantiate classes
        set_seed(kwargs['seed'])
        env = normalize(kwargs['env']())  # Wrappers?

        baseline = NNValueFun(
            'value-function',
            env,
            hidden_nonlinearity=kwargs['vfun_hidden_nonlinearity'],
            hidden_sizes=kwargs['vfun_hidden_sizes'],
            output_nonlinearity=kwargs['vfun_output_nonlinearity'],
            learning_rate=kwargs['vfun_learning_rate'],
            batch_size=kwargs['vfun_batch_size'],
            buffer_size=kwargs['vfun_buffer_size'],
            normalize_input=False,
        )

        policy = GaussianMLPPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )

        dynamics_model = MLPDynamicsModel(
            'prob-dynamics',
            env=env,
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],  # (sic) key spelling matches the launcher configs
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
            normalize_input=False,
        )

        assert kwargs['num_rollouts'] % kwargs['n_parallel'] == 0

        sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = SVG1(
            policy=policy,
            dynamics_model=dynamics_model,
            value_function=baseline,
            tf_reward=env.tf_reward,
            learning_rate=kwargs['svg_learning_rate'],
            num_grad_steps=kwargs['num_rollouts'] * kwargs['max_path_length'] // kwargs['svg_batch_size'],
            batch_size=kwargs['svg_batch_size'],
            discount=kwargs['discount'],
            kl_penalty=kwargs['kl_penalty'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            dynamics_model=dynamics_model,
            value_function=baseline,
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            vfun_max_epochs=kwargs['vfun_max_epochs'],
            sess=sess,
        )

        trainer.train()
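# Quick sanity check of the num_grad_steps arithmetic passed to SVG1 above.
# The helper and the numbers are illustrative (not values from this repo's
# configs): one epoch of mini-batch updates over the transitions collected
# in the current iteration.
def _expected_svg_grad_steps(num_rollouts, max_path_length, svg_batch_size):
    # mirrors the expression above: (transitions per iter) // (batch size)
    return num_rollouts * max_path_length // svg_batch_size

# e.g. _expected_svg_grad_steps(20, 200, 64) == 62, i.e. 62 mini-batch
# updates covering at most the 4000 freshly collected transitions.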
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/parallel_mb_ppo/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2,
              sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])
        baseline = kwargs['baseline']()
        env = normalize(kwargs['env']())  # Wrappers?

        policy = GaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )

        dynamics_model = MLPDynamicsEnsemble(
            'dynamics-ensemble',
            env=env,
            num_models=kwargs['num_models'],
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
        )

        env_sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        model_sampler = METRPOSampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['imagined_num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            dynamics_model=dynamics_model,
            deterministic=kwargs['deterministic'],
        )

        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        model_sample_processor = SampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = PPO(
            policy=policy,
            learning_rate=kwargs['learning_rate'],
            clip_eps=kwargs['clip_eps'],
            max_epochs=kwargs['num_ppo_steps'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            model_sample_processor=model_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            steps_per_iter=kwargs['steps_per_iter'],
            sample_from_buffer=True,
            sess=sess,
        )

        trainer.train()
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2,
              sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])
        baseline = kwargs['baseline']()

        if not kwargs['use_images']:
            env = normalize(kwargs['env'](policytask=kwargs['task']))
            vae = None
        else:
            vae = VAE(latent_dim=kwargs['latent_dim'],
                      channels=3 * kwargs['time_steps'])
            env = image_wrapper(normalize(kwargs['env']()),
                                latent_dim=kwargs['latent_dim'],
                                time_steps=kwargs['time_steps'])

        policy = NNPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['hidden_sizes'],
            normalization=kwargs['normalization'],
        )

        env_sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            vae=vae,
        )

        model_sampler = ARSSampler(
            env=env,
            policy=policy,
            rollouts_per_policy=kwargs['rollouts_per_policy'],
            max_path_length=kwargs['max_path_length'],
            num_deltas=kwargs['num_deltas'],
            n_parallel=kwargs['num_deltas'],
            vae=vae,
        )

        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        ars_sample_processor = ARSSamplerProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = RandomSearchOptimizer(
            policy=policy,
            learning_rate=kwargs['learning_rate'],
            num_deltas=kwargs['num_deltas'],
            percentile=kwargs['percentile'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            ars_sample_processor=ars_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            num_deltas=kwargs['num_deltas'],
            n_itr=kwargs['n_itr'],
            log_real_performance=kwargs['log_real_performance'],
            steps_per_iter=kwargs['steps_per_iter'],
            delta_std=kwargs['delta_std'],
            sess=sess,
        )

        trainer.train()
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2,
              sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])
        baseline = kwargs['baseline']()
        env = normalize(kwargs['env']())

        # Twin Q-functions and their target copies
        Qs = [ValueFunction(name="q_fun_%d" % i,
                            obs_dim=int(np.prod(env.observation_space.shape)),
                            action_dim=int(np.prod(env.action_space.shape)))
              for i in range(2)]

        Q_targets = [ValueFunction(name="q_fun_target_%d" % i,
                                   obs_dim=int(np.prod(env.observation_space.shape)),
                                   action_dim=int(np.prod(env.action_space.shape)))
                     for i in range(2)]

        policy = TanhGaussianMLPPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )

        sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )

        sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = SAC(
            policy=policy,
            discount=kwargs['discount'],
            learning_rate=kwargs['learning_rate'],
            env=env,
            Qs=Qs,
            Q_targets=Q_targets,
            reward_scale=kwargs['reward_scale'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=kwargs['n_itr'],
            sess=sess,
        )

        trainer.train()

    sess.close()  # release the session once training is done
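# The paired Qs / Q_targets lists above follow SAC's twin-Q pattern. Below is
# a minimal numpy sketch of the soft (Polyak) target update that target
# networks are conventionally refreshed with; this is standard SAC practice,
# not code taken from this repo's SAC class, and tau=0.005 is illustrative.
import numpy as np

def soft_update(q_params, q_target_params, tau=0.005):
    # theta_target <- tau * theta + (1 - tau) * theta_target, per parameter array
    return [tau * p + (1.0 - tau) * tp
            for p, tp in zip(q_params, q_target_params)]

# e.g. after each gradient step on the Q-functions:
#   q_target_params = soft_update(q_params, q_target_params)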
def run_experiment(**config):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + config.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(config, open(exp_dir + '/params.json', 'w'), indent=2,
              sort_keys=True, cls=ClassEncoder)
    config_sess = tf.ConfigProto()
    config_sess.gpu_options.allow_growth = True
    config_sess.gpu_options.per_process_gpu_memory_fraction = config.get('gpu_frac', 0.95)
    sess = tf.Session(config=config_sess)

    with sess.as_default() as sess:
        env = config['env']()

        if config['recurrent']:
            dynamics_model = RNNDynamicsEnsemble(
                name="dyn_model",
                env=env,
                hidden_sizes=config['hidden_sizes_model'],
                learning_rate=config['learning_rate'],
                backprop_steps=config['backprop_steps'],
                cell_type=config['cell_type'],
                num_models=config['num_models'],
                batch_size=config['batch_size_model'],
                normalize_input=True,
            )

            policy = RNNMPCController(
                name="policy",
                env=env,
                dynamics_model=dynamics_model,
                discount=config['discount'],
                n_candidates=config['n_candidates'],
                horizon=config['horizon'],
                use_cem=config['use_cem'],
                num_cem_iters=config['num_cem_iters'],
                use_reward_model=config['use_reward_model'],
            )

        else:
            dynamics_model = MLPDynamicsEnsemble(
                name="dyn_model",
                env=env,
                learning_rate=config['learning_rate'],
                hidden_sizes=config['hidden_sizes_model'],
                weight_normalization=config['weight_normalization_model'],
                num_models=config['num_models'],
                valid_split_ratio=config['valid_split_ratio'],
                rolling_average_persitency=config['rolling_average_persitency'],
                hidden_nonlinearity=config['hidden_nonlinearity_model'],
                batch_size=config['batch_size_model'],
            )

            policy = MPCController(
                name="policy",
                env=env,
                dynamics_model=dynamics_model,
                discount=config['discount'],
                n_candidates=config['n_candidates'],
                horizon=config['horizon'],
                use_cem=config['use_cem'],
                num_cem_iters=config['num_cem_iters'],
            )

        sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=config['num_rollouts'],
            max_path_length=config['max_path_length'],
            n_parallel=config['n_parallel'],
        )

        sample_processor = ModelSampleProcessor()

        algo = Trainer(
            env=env,
            policy=policy,
            dynamics_model=dynamics_model,
            sampler=sampler,
            dynamics_sample_processor=sample_processor,
            n_itr=config['n_itr'],
            initial_random_samples=config['initial_random_samples'],
            dynamics_model_max_epochs=config['dynamic_model_epochs'],
            initial_sinusoid_samples=config['initial_sinusoid_samples'],
            sess=sess,
        )

        algo.train()
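# Hypothetical launch sketch for the MPC runner above, exercising the
# feedforward (recurrent=False) branch. Every key below is one that
# run_experiment actually reads; the concrete values and the HalfCheetahEnv
# class are illustrative assumptions, not settings taken from this repo.
if __name__ == '__main__':
    run_experiment(
        exp_name='mpc-cheetah',             # illustrative name
        gpu_frac=0.5,
        env=HalfCheetahEnv,                 # assumes a gym-style env class is imported
        recurrent=False,
        # dynamics ensemble (MLP branch)
        learning_rate=1e-3,
        hidden_sizes_model=(512, 512),
        weight_normalization_model=False,
        num_models=5,
        valid_split_ratio=0.1,
        rolling_average_persitency=0.99,    # (sic) key spelling matches the code above
        hidden_nonlinearity_model='relu',
        batch_size_model=64,
        dynamic_model_epochs=50,
        # MPC controller
        discount=1.0,
        n_candidates=1000,
        horizon=20,
        use_cem=False,
        num_cem_iters=5,
        # sampling / training loop
        num_rollouts=10,
        max_path_length=200,
        n_parallel=5,
        n_itr=50,
        initial_random_samples=True,
        initial_sinusoid_samples=False,
    )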
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2,
              sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)

    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])
        baseline = kwargs['baseline']()

        if not kwargs['use_images']:
            env = normalize(kwargs['env']())
        else:
            vae = VAE(latent_dim=8)
            env = image_wrapper(normalize(kwargs['env']()), vae=vae, latent_dim=32)

        policy = NNPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['hidden_sizes'],
            normalization=None,
        )

        dynamics_model = MLPDynamicsEnsemble(
            'dynamics-ensemble',
            env=env,
            num_models=kwargs['num_models'],
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
        )
        # dynamics_model = None

        assert kwargs['rollouts_per_policy'] % kwargs['num_models'] == 0

        env_sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['num_rollouts'],
        )

        # TODO: I'm not sure if it works with more than one rollout per model
        model_sampler = ARSSampler(
            env=env,
            policy=policy,
            dynamics_model=dynamics_model,
            rollouts_per_policy=kwargs['rollouts_per_policy'],
            max_path_length=kwargs['horizon'],
            num_deltas=kwargs['num_deltas'],
            n_parallel=1,
        )

        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        ars_sample_processor = ARSSamplerProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
            uncertainty_coeff=kwargs['uncertainty_coeff'],
        )

        algo = RandomSearchOptimizer(
            policy=policy,
            learning_rate=kwargs['learning_rate'],
            num_deltas=kwargs['num_deltas'],
            percentile=kwargs['percentile'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            ars_sample_processor=ars_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            num_deltas=kwargs['num_deltas'],
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            steps_per_iter=kwargs['steps_per_iter'],
            delta_std=kwargs['delta_std'],
            sess=sess,
            initial_random_samples=True,
            sample_from_buffer=kwargs['sample_from_buffer'],
        )

        trainer.train()
class WorkerData(Worker):
    def __init__(self, simulation_sleep):
        super().__init__()
        self.simulation_sleep = simulation_sleep
        self.env = None
        self.env_sampler = None
        self.dynamics_sample_processor = None
        self.samples_data_arr = []

    def construct_from_feed_dict(
            self,
            policy_pickle,
            env_pickle,
            baseline_pickle,  # UNUSED
            dynamics_model_pickle,
            feed_dict,
    ):
        from meta_mb.samplers.sampler import Sampler
        from meta_mb.samplers.mb_sample_processor import ModelSampleProcessor

        env = pickle.loads(env_pickle)
        policy = pickle.loads(policy_pickle)

        self.env = env
        self.env_sampler = Sampler(env=env, policy=policy, **feed_dict['sampler'])
        self.dynamics_sample_processor = ModelSampleProcessor(**feed_dict['sample_processor'])

    def prepare_start(self):
        random_sinusoid = self.queue.get()
        self.step(random_sinusoid)
        self.push()

    def step(self, random_sinusoid=(False, False)):
        time_step = time.time()

        if self.itr_counter == 1 and self.env_sampler.policy.dynamics_model.normalization is None:
            if self.verbose:
                logger.log('Data starts first step...')
            self.env_sampler.policy.dynamics_model = pickle.loads(self.queue.get())
            if self.verbose:
                logger.log('Data first step done...')

        '''------------- Obtaining samples from the environment -----------'''

        if self.verbose:
            logger.log("Data is obtaining samples...")
        env_paths = self.env_sampler.obtain_samples(
            log=True,
            random=random_sinusoid[0],
            sinusoid=random_sinusoid[1],
            log_prefix='Data-EnvSampler-',
        )

        '''-------------- Processing environment samples -------------------'''

        if self.verbose:
            logger.log("Data is processing samples...")
        samples_data = self.dynamics_sample_processor.process_samples(
            env_paths,
            log=True,
            log_prefix='Data-EnvTrajs-',
        )
        self.samples_data_arr.append(samples_data)

        time_step = time.time() - time_step
        time_sleep = max(self.simulation_sleep - time_step, 0)
        time.sleep(time_sleep)

        logger.logkv('Data-TimeStep', time_step)
        logger.logkv('Data-TimeSleep', time_sleep)

    def _synch(self, dynamics_model_state_pickle):
        time_synch = time.time()
        dynamics_model_state = pickle.loads(dynamics_model_state_pickle)
        assert isinstance(dynamics_model_state, dict)
        self.env_sampler.policy.dynamics_model.set_shared_params(dynamics_model_state)
        time_synch = time.time() - time_synch

        logger.logkv('Data-TimeSynch', time_synch)

    def push(self):
        time_push = time.time()
        self.queue_next.put(pickle.dumps(self.samples_data_arr))
        self.samples_data_arr = []
        time_push = time.time() - time_push

        logger.logkv('Data-TimePush', time_push)

    def set_stop_cond(self):
        if self.itr_counter >= self.n_itr:
            self.stop_cond.set()
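# Hedged sketch of the feed_dict shape that construct_from_feed_dict above
# unpacks. The sub-dict keys ('sampler', 'sample_processor') come from the
# method itself; the kwarg names and values inside each sub-dict are
# plausible guesses based on the Sampler and ModelSampleProcessor
# constructor calls elsewhere in this codebase, not a documented contract.
example_feed_dict = dict(
    sampler=dict(
        num_rollouts=10,
        max_path_length=200,
        n_parallel=1,
    ),
    sample_processor=dict(
        discount=0.99,
        gae_lambda=1.0,
        normalize_adv=True,
        positive_adv=False,
    ),
)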
class WorkerData(Worker):
    def __init__(self, num_rollouts_per_iter, simulation_sleep):
        super().__init__()
        self.num_rollouts_per_iter = num_rollouts_per_iter
        self.simulation_sleep = simulation_sleep
        self.env = None
        self.env_sampler = None
        self.dynamics_sample_processor = None
        self.samples_data_arr = []

    def construct_from_feed_dict(self,
                                 policy_pickle,
                                 env_pickle,
                                 baseline_pickle,
                                 dynamics_model_pickle,
                                 feed_dict):
        from meta_mb.samplers.meta_samplers.meta_sampler import MetaSampler
        from meta_mb.samplers.mb_sample_processor import ModelSampleProcessor

        env = pickle.loads(env_pickle)
        policy = pickle.loads(policy_pickle)
        baseline = pickle.loads(baseline_pickle)

        self.env = env
        self.env_sampler = MetaSampler(env=env, policy=policy, **feed_dict['env_sampler'])
        self.dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline, **feed_dict['dynamics_sample_processor'])

    def prepare_start(self):
        initial_random_samples = self.queue.get()
        self.step(initial_random_samples)
        self.push()

    def step(self, random=False):
        time_step = time.time()

        '''------------- Obtaining samples from the environment -----------'''

        if self.verbose:
            logger.log("Data is obtaining samples...")
        env_paths = self.env_sampler.obtain_samples(
            log=True,
            random=random,
            log_prefix='Data-EnvSampler-',
        )

        '''-------------- Processing environment samples -------------------'''

        if self.verbose:
            logger.log("Data is processing samples...")
        # Subsample num_rollouts_per_iter rollouts; meta-samplers return a
        # dict of paths keyed by task, plain samplers return a list of paths.
        if type(env_paths) is dict or type(env_paths) is OrderedDict:
            env_paths = list(env_paths.values())
            idxs = np.random.choice(range(len(env_paths)),
                                    size=self.num_rollouts_per_iter,
                                    replace=False)
            env_paths = sum([env_paths[idx] for idx in idxs], [])
        elif type(env_paths) is list:
            idxs = np.random.choice(range(len(env_paths)),
                                    size=self.num_rollouts_per_iter,
                                    replace=False)
            env_paths = [env_paths[idx] for idx in idxs]
        else:
            raise TypeError("unexpected type of env_paths: %s" % type(env_paths))

        samples_data = self.dynamics_sample_processor.process_samples(
            env_paths,
            log=True,
            log_prefix='Data-EnvTrajs-',
        )
        self.samples_data_arr.append(samples_data)

        time_step = time.time() - time_step
        time_sleep = max(self.simulation_sleep - time_step, 0)
        time.sleep(time_sleep)

        logger.logkv('Data-TimeStep', time_step)
        logger.logkv('Data-TimeSleep', time_sleep)

    def _synch(self, policy_state_pickle):
        time_synch = time.time()
        policy_state = pickle.loads(policy_state_pickle)
        assert isinstance(policy_state, dict)
        self.env_sampler.policy.set_shared_params(policy_state)
        time_synch = time.time() - time_synch

        logger.logkv('Data-TimeSynch', time_synch)

    def push(self):
        time_push = time.time()
        self.queue_next.put(pickle.dumps(self.samples_data_arr))
        self.samples_data_arr = []
        time_push = time.time() - time_push

        logger.logkv('Data-TimePush', time_push)

    def set_stop_cond(self):
        if self.itr_counter >= self.n_itr:
            self.stop_cond.set()
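# Standalone sketch of the rollout-subsampling rule in WorkerData.step above,
# with illustrative toy paths: dict-valued env_paths (one list of paths per
# meta-task) are flattened after choosing tasks, list-valued ones are
# subsampled directly.
import numpy as np

def subsample_paths(env_paths, num_rollouts_per_iter):
    if isinstance(env_paths, dict):  # incl. OrderedDict
        per_task = list(env_paths.values())
        idxs = np.random.choice(len(per_task), size=num_rollouts_per_iter, replace=False)
        return sum([per_task[i] for i in idxs], [])  # concatenate chosen task lists
    if isinstance(env_paths, list):
        idxs = np.random.choice(len(env_paths), size=num_rollouts_per_iter, replace=False)
        return [env_paths[i] for i in idxs]
    raise TypeError("unexpected type of env_paths: %s" % type(env_paths))

# e.g. subsample_paths({0: ['path_a'], 1: ['path_b'], 2: ['path_c']}, 2)
# returns the paths of two randomly chosen tasks, such as ['path_a', 'path_c'].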
def run_experiment(**kwargs):
    num = Num()
    exp_name = EXP_NAME + str(num.EXP_NUM)
    exp_dir = os.getcwd() + '/data/video_peg/' + EXP_NAME + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['csv', 'stdout', 'log'],
                     snapshot_mode='all')  # save a snapshot every iteration
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2,
              sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)
    Num.EXP_NUM += 1

    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])
        baseline = kwargs['baseline']()
        env = normalize(kwargs['env']())  # Wrappers?

        policy = MetaGaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=kwargs['meta_batch_size'],
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )

        dynamics_model = MLPDynamicsEnsemble(
            'dynamics-ensemble',
            env=env,
            num_models=kwargs['num_models'],
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
        )

        env_sampler = BaseSampler(
            env=env,
            policy=policy,
            # rollouts_per_meta_task=kwargs['real_env_rollouts_per_meta_task'],
            num_rollouts=kwargs['meta_batch_size'],
            max_path_length=kwargs['max_path_length'],
            sleep_reset=2.5,
            # parallel=kwargs['parallel'],
            # parallel=False,
        )

        model_sampler = MBMPOSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
            meta_batch_size=kwargs['meta_batch_size'],
            max_path_length=kwargs['max_path_length'],
            dynamics_model=dynamics_model,
            deterministic=kwargs['deterministic'],
        )

        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        model_sample_processor = MAMLSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )

        algo = TRPOMAML(
            policy=policy,
            step_size=kwargs['step_size'],
            inner_type=kwargs['inner_type'],
            inner_lr=kwargs['inner_lr'],
            meta_batch_size=kwargs['meta_batch_size'],
            num_inner_grad_steps=kwargs['num_inner_grad_steps'],
            exploration=kwargs['exploration'],
        )

        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            model_sample_processor=model_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            n_itr=kwargs['n_itr'],
            num_inner_grad_steps=kwargs['num_inner_grad_steps'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            meta_steps_per_iter=kwargs['meta_steps_per_iter'],
            sample_from_buffer=True,
            sess=sess,
        )

        trainer.train()
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap', snapshot_gap=50)
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2,
              sort_keys=True, cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])
    baseline = kwargs['baseline']()
    env = normalize(kwargs['env']())  # Wrappers?

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['policy_hidden_sizes'],
        learn_std=kwargs['policy_learn_std'],
        hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
        output_nonlinearity=kwargs['policy_output_nonlinearity'],
    )

    dynamics_model = MLPDynamicsEnsemble(
        'dynamics-ensemble',
        env=env,
        num_models=kwargs['num_models'],
        hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
        hidden_sizes=kwargs['dynamics_hidden_sizes'],
        output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
        learning_rate=kwargs['dynamics_learning_rate'],
        batch_size=kwargs['dynamics_batch_size'],
        buffer_size=kwargs['dynamics_buffer_size'],
    )

    env_sampler = SingleMetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['real_env_rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
    )

    model_sampler = MBMPOSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        dynamics_model=dynamics_model,
    )

    dynamics_sample_processor = ModelSampleProcessor(
        baseline=baseline,
        discount=kwargs['discount'],
        gae_lambda=kwargs['gae_lambda'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )

    model_sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=kwargs['discount'],
        gae_lambda=kwargs['gae_lambda'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )

    algo = TRPOMAML(
        policy=policy,
        step_size=kwargs['step_size'],
        inner_type=kwargs['inner_type'],
        inner_lr=kwargs['inner_lr'],
        meta_batch_size=kwargs['meta_batch_size'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        exploration=kwargs['exploration'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        model_sampler=model_sampler,
        env_sampler=env_sampler,
        model_sample_processor=model_sample_processor,
        dynamics_sample_processor=dynamics_sample_processor,
        dynamics_model=dynamics_model,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
        log_real_performance=kwargs['log_real_performance'],
        meta_steps_per_iter=kwargs['meta_steps_per_iter'],
        initial_random_samples=True,
        sample_from_buffer=True,
    )

    trainer.train()