import json
import os
import time

# Repo-level helpers (EXP_NAME, ClassEncoder, set_seed, run_1d_experiment)
# are assumed to be importable from elsewhere in the project.


def run_experiment(**kwargs):
    # Create a unique output directory for this run.
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    exp_dir += "/" + kwargs['opt_type'] + "_" + kwargs['exp_name']
    os.makedirs(exp_dir)

    # Persist the full hyperparameter dict for reproducibility.
    with open(exp_dir + '/params.json', 'w') as f:
        json.dump(kwargs, f, indent=2, sort_keys=True, cls=ClassEncoder)

    set_seed(kwargs['seed'])

    # Instantiate classes
    t0 = time.time()
    run_1d_experiment(
        opt_type=kwargs['opt_type'],
        num_meta_tasks=kwargs['num_meta_tasks'],
        horizon=kwargs['horizon'],
        num_samples=kwargs['num_samples'],
        num_itr=kwargs['num_itr'],
        lr=kwargs['lr'],
        inner_lr=kwargs['inner_lr'],
        init_state_std=kwargs['init_state_std'],
        exp_dir=exp_dir,
    )
    print("time taken: ", time.time() - t0)
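# A minimal usage sketch for the launcher above. Every value here is an
# illustrative placeholder (not a default from this repo); EXP_NAME and
# run_1d_experiment are assumed to be defined as in the surrounding code.
if __name__ == '__main__':
    run_experiment(
        opt_type='maml',      # hypothetical optimizer tag
        exp_name='debug',     # appended to the output directory name
        seed=1,
        num_meta_tasks=20,
        horizon=100,
        num_samples=20,
        num_itr=500,
        lr=1e-3,
        inner_lr=0.1,
        init_state_std=0.5,
    )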
import numpy as np
import tensorflow as tf


def main(config):
    set_seed(config['seed'])
    tf.compat.v1.disable_eager_execution()

    # Enable memory growth on every visible GPU so TF does not reserve
    # all VRAM up front.
    physical_devices = tf.config.list_physical_devices('GPU')
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)

    baseline = LinearFeatureBaseline()
    env = normalize(ENV_DICT[config['env']]())

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        # Number of rollouts collected per task in each meta-batch.
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )

    # Select the meta-RL variant; the algorithm name is read from config
    # since no `args` object is in scope inside this function.
    Algo = VPGSGMRL if config['algo'] == 'sgmrl' else VPGMAML
    algo = Algo(
        policy=policy,
        inner_type=config['inner_type'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        inner_lr=config['inner_lr'],
        learning_rate=config['learning_rate'],
        exploration=(config['algo'] == 'emaml'),
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )
    tester = Tester(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=50,
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    best_itr = trainer.train(tester)
    print(best_itr)
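# A usage sketch showing one way to drive main(). The keys mirror the
# config reads above; every value is an assumed placeholder, and the env
# name must match a key of the (repo-defined) ENV_DICT.
if __name__ == '__main__':
    example_config = {
        'seed': 1,
        'env': 'HalfCheetahRandDirecEnv',  # hypothetical ENV_DICT key
        'algo': 'maml',                    # 'maml', 'sgmrl', or 'emaml'
        'meta_batch_size': 40,
        'hidden_sizes': (64, 64),
        'rollouts_per_meta_task': 20,
        'max_path_length': 100,
        'parallel': True,
        'discount': 0.99,
        'gae_lambda': 1.0,
        'normalize_adv': True,
        'positive_adv': False,
        'inner_type': 'likelihood_ratio',
        'num_inner_grad_steps': 1,
        'inner_lr': 0.1,
        'learning_rate': 1e-3,
        'n_itr': 1000,
    }
    main(example_config)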
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap', snapshot_gap=50)

    # Persist the full hyperparameter dict for reproducibility.
    with open(exp_dir + '/params.json', 'w') as f:
        json.dump(kwargs, f, indent=2, sort_keys=True, cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])
    baseline = kwargs['baseline']()
    env = normalize(kwargs['env']())  # Wrappers?

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),  # Todo...?
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['hidden_sizes'],
        learn_std=kwargs['learn_std'],
        hidden_nonlinearity=kwargs['hidden_nonlinearity'],
        output_nonlinearity=kwargs['output_nonlinearity'],
    )

    # Load policy here

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
        envs_per_task=1,
    )

    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=kwargs['discount'],
        gae_lambda=kwargs['gae_lambda'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )

    algo = ProMP(
        policy=policy,
        inner_lr=kwargs['inner_lr'],
        meta_batch_size=kwargs['meta_batch_size'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        learning_rate=kwargs['learning_rate'],
        num_ppo_steps=kwargs['num_ppo_steps'],
        num_minibatches=kwargs['num_minibatches'],
        clip_eps=kwargs['clip_eps'],
        clip_outer=kwargs['clip_outer'],
        target_outer_step=kwargs['target_outer_step'],
        target_inner_step=kwargs['target_inner_step'],
        init_outer_kl_penalty=kwargs['init_outer_kl_penalty'],
        init_inner_kl_penalty=kwargs['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=kwargs['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=kwargs['adaptive_inner_kl_penalty'],
        anneal_factor=kwargs['anneal_factor'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
    )
    trainer.train()
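# A usage sketch with an illustrative kwargs dict for the ProMP launcher.
# The classes referenced (LinearFeatureBaseline, HalfCheetahRandDirecEnv)
# and all numeric values are assumptions for demonstration, not this
# repo's tuned defaults; note that baseline and env are passed as classes
# and instantiated inside run_experiment.
if __name__ == '__main__':
    run_experiment(
        seed=1,
        baseline=LinearFeatureBaseline,
        env=HalfCheetahRandDirecEnv,
        meta_batch_size=40,
        hidden_sizes=(64, 64),
        learn_std=True,
        hidden_nonlinearity=tf.tanh,
        output_nonlinearity=None,
        rollouts_per_meta_task=20,
        max_path_length=100,
        parallel=True,
        discount=0.99,
        gae_lambda=1.0,
        normalize_adv=True,
        positive_adv=False,
        inner_lr=0.1,
        num_inner_grad_steps=1,
        learning_rate=1e-3,
        num_ppo_steps=5,
        num_minibatches=1,
        clip_eps=0.3,
        clip_outer=True,
        target_outer_step=0.0,
        target_inner_step=0.01,
        init_outer_kl_penalty=0.0,
        init_inner_kl_penalty=5e-4,
        adaptive_outer_kl_penalty=False,
        adaptive_inner_kl_penalty=True,
        anneal_factor=1.0,
        n_itr=1000,
    )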