def setUp(self):
    self.test_env = TestEnv()
    self.random_env = RandomEnv()
    self.test_policy = TestPolicy(obs_dim=3, action_dim=4)
    self.return_policy = ReturnPolicy(obs_dim=3, action_dim=4)
    self.random_policy = RandomPolicy(obs_dim=3, action_dim=4)
    self.meta_batch_size = 3
    self.batch_size = 4
    self.path_length = 5
    self.it_sampler = MetaSampler(self.test_env, self.test_policy, self.batch_size,
                                  self.meta_batch_size, self.path_length, parallel=False)
    self.par_sampler = MetaSampler(self.test_env, self.test_policy, self.batch_size,
                                   self.meta_batch_size, self.path_length, parallel=True)
    self.sample_processor = SampleProcessor(baseline=LinearFeatureBaseline())
    self.Meta_sample_processor = MetaSampleProcessor(baseline=LinearFeatureBaseline())
class TestLinearFeatureBaseline(unittest.TestCase):
    def setUp(self):
        self.random_env = RandomEnv()
        self.random_policy = RandomPolicy(1, 1)
        self.meta_batch_size = 2
        self.batch_size = 10
        self.path_length = 100
        self.linear = LinearFeatureBaseline()
        self.sampler = MetaSampler(self.random_env, self.random_policy, self.batch_size,
                                   self.meta_batch_size, self.path_length, parallel=True)

    def testFit(self):
        paths = self.sampler.obtain_samples()
        for task in paths.values():
            # Squared prediction error of the baseline before fitting
            unfit_error = 0
            for path in task:
                path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
                unfit_pred = self.linear.predict(path)
                unfit_error += sum([np.square(pred - actual)
                                    for pred, actual in zip(unfit_pred, path['returns'])])
            self.linear.fit(task)
            # Error after fitting; fitting should reduce it
            fit_error = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error += sum([np.square(pred - actual)
                                  for pred, actual in zip(fit_pred, path['returns'])])
            self.assertTrue(fit_error < unfit_error)

    def testSerialize(self):
        paths = self.sampler.obtain_samples()
        for task in paths.values():
            for path in task:
                path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
            self.linear.fit(task)
            fit_error_pre = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error_pre += sum([np.square(pred - actual)
                                      for pred, actual in zip(fit_pred, path['returns'])])
            # Round-trip the fitted baseline through pickle; predictions must be unchanged
            pkl = pickle.dumps(self.linear)
            self.linear = pickle.loads(pkl)
            fit_error_post = 0
            for path in task:
                fit_pred = self.linear.predict(path)
                fit_error_post += sum([np.square(pred - actual)
                                       for pred, actual in zip(fit_pred, path['returns'])])
            self.assertEqual(fit_error_pre, fit_error_post)
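# For reference, the discounted cumulative sum the tests rely on:
# returns[t] = sum_k gamma^k * rewards[t+k]. A minimal standalone sketch of what
# utils.discount_cumsum is assumed to compute (the helper name below is local to
# this illustration, not part of the codebase):
def discount_cumsum_sketch(rewards, gamma):
    out = np.zeros_like(rewards, dtype=float)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running  # accumulate from the end of the path
        out[t] = running
    return out

# e.g. discount_cumsum_sketch(np.array([1., 1., 1.]), 0.99) -> [2.9701, 1.99, 1.]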
def main(config):
    set_seed(config['seed'])

    reward_baseline = LinearTimeBaseline()      # the usual baseline
    return_baseline = LinearFeatureBaseline()   # the additional baseline for DICE

    env = globals()[config['env']]()  # instantiate env
    env = normalize(env)              # apply normalize wrapper to env

    meta_baseline = MetaNNBaseline(
        input_size=env.observation_space.shape[0])  # the meta baseline

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )

    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )

    sample_processor = TMAMLMetaSampleProcessor(
        baseline=reward_baseline,
        max_path_length=config['max_path_length'],
        discount=config['discount'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
        return_baseline=return_baseline,
        metabaseline=meta_baseline,
    )

    algo = TMAML(
        policy=policy,
        max_path_length=config['max_path_length'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        inner_lr=config['inner_lr'],
        learning_rate=config['learning_rate'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )

    trainer.train()
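# A minimal sketch of a config for main() above. The keys are exactly those the
# function reads; every value is an illustrative assumption, not a recommended
# setting from the source.
example_config = {
    'seed': 1,
    'env': 'HalfCheetahRandDirecEnv',  # must name an env class visible in globals()
    'meta_batch_size': 40,
    'hidden_sizes': (64, 64),
    'rollouts_per_meta_task': 20,
    'max_path_length': 100,
    'parallel': False,
    'discount': 0.99,
    'normalize_adv': True,
    'positive_adv': False,
    'num_inner_grad_steps': 1,
    'inner_lr': 0.1,
    'learning_rate': 1e-3,
    'n_itr': 500,
}
# main(example_config)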
def main(config):
    baseline = LinearFeatureBaseline()

    # env = rl2env(HalfCheetahRandDirecEnv())
    env = rl2env(globals()[config['env']]())  # instantiate env
    # RL^2 policy input: observation + previous action + reward (+1) + done flag (+1)
    obs_dim = np.prod(env.observation_space.shape) + np.prod(env.action_space.shape) + 1 + 1

    policy = GaussianRNNPolicy(
        name="meta-policy",
        obs_dim=obs_dim,
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
        cell_type=config['cell_type'],
    )

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
        envs_per_task=1,
    )

    sample_processor = RL2SampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )

    algo = PPO(
        policy=policy,
        learning_rate=config['learning_rate'],
        max_epochs=config['max_epochs'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
    )

    trainer.train()
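# Why obs_dim adds np.prod(action_space.shape) + 1 + 1 above: in RL^2 the recurrent
# policy conditions on the previous transition, so each step's input concatenates the
# raw observation with the previous action, reward, and done flag. A sketch of that
# concatenation (the rl2env wrapper is assumed to do the equivalent; this helper is
# illustrative, not from the codebase):
def augment_observation(obs, prev_action, prev_reward, done):
    # obs and prev_action are 1-D arrays; reward and done each contribute one scalar
    return np.concatenate([obs, prev_action, [prev_reward], [float(done)]])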
def setUp(self):
    self.env = env = MetaPointEnv()
    self.baseline = baseline = LinearFeatureBaseline()

    self.policy = policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=10,
        hidden_sizes=(16, 16),
        learn_std=True,
        hidden_nonlinearity=tf.tanh,
        output_nonlinearity=None,
    )

    self.sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=2,
        meta_batch_size=10,
        max_path_length=50,
        parallel=False,
    )

    self.sample_processor = MetaSampleProcessor(
        baseline=baseline,
        discount=0.99,
        gae_lambda=1.0,
        normalize_adv=True,
        positive_adv=False,
    )

    self.algo = ProMP(
        policy=policy,
        inner_lr=0.1,
        meta_batch_size=10,
        num_inner_grad_steps=2,
        learning_rate=1e-3,
        num_ppo_steps=5,
        num_minibatches=1,
        clip_eps=0.5,
        target_inner_step=2e-2,
        init_inner_kl_penalty=1e-3,
    )
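# A hypothetical smoke test (not from the source) sketching how these fixtures chain:
# the sampler gathers rollouts per meta-task and the processor turns them into the
# arrays ProMP consumes. Method name and assertion are assumptions.
def test_pipeline_smoke(self):
    paths = self.sampler.obtain_samples()  # dict: meta-task id -> list of paths
    self.assertEqual(len(paths), 10)       # one entry per meta-task (meta_batch_size=10)
    samples_data = self.sample_processor.process_samples(paths)  # data for the algorithm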
def test_process_samples_advantages2(self):
    for normalize_adv in [True, False]:
        for paths in [self.paths, self.paths_rand]:
            return_baseline = LinearFeatureBaseline()
            dice_sample_processor = DiceSampleProcessor(self.baseline,
                                                        max_path_length=6,
                                                        gae_lambda=1.0,
                                                        discount=0.97,
                                                        normalize_adv=normalize_adv,
                                                        return_baseline=return_baseline)
            dice_samples_data = dice_sample_processor.process_samples(paths[0])
            mask = dice_samples_data['mask']

            # reshape data and filter out masked items:
            sample_processor = SampleProcessor(return_baseline,
                                               gae_lambda=1.0,
                                               discount=0.97,
                                               normalize_adv=normalize_adv)
            samples_data = sample_processor.process_samples(paths[0])

            self.assertAlmostEqual(np.sum(mask[:, :, None] * dice_samples_data['observations']),
                                   np.sum(samples_data['observations']))
            self.assertAlmostEqual(np.sum(mask[:, :, None] * dice_samples_data['actions']),
                                   np.sum(samples_data['actions']))
            self.assertAlmostEqual(np.sum(mask * dice_samples_data['advantages']),
                                   np.sum(samples_data['advantages']), places=2)
            self.assertAlmostEqual(np.sum(mask * dice_samples_data['rewards']),
                                   np.sum(samples_data['rewards']))
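# Intuition for the mask comparison above: DiceSampleProcessor zero-pads each path to
# max_path_length and returns a binary mask over the real (non-padding) steps, so a
# masked sum over the padded arrays must equal the sum over the unpadded data. A toy
# illustration (numbers are illustrative, not from the source):
#
#     rewards_padded = np.array([[1., 2., 0.], [3., 0., 0.]])  # two paths padded to length 3
#     mask           = np.array([[1., 1., 0.], [1., 0., 0.]])  # 1 = real step, 0 = padding
#     assert np.sum(mask * rewards_padded) == 1. + 2. + 3.     # masked sum == unpadded sum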
def test_process_samples_advantages1(self):
    return_baseline = LinearFeatureBaseline()
    sample_processor = DiceSampleProcessor(self.baseline,
                                           max_path_length=6,
                                           return_baseline=return_baseline)
    samples_data = sample_processor.process_samples(self.paths[0])

    # Shape and ndim are exact integer comparisons, so assertEqual, not assertAlmostEqual
    self.assertEqual(samples_data['advantages'].shape, (self.batch_size, 6))
    self.assertEqual(samples_data['advantages'].ndim, 2)
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap', snapshot_gap=50)
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'),
              indent=2, sort_keys=True, cls=ClassEncoder)

    # Instantiate classes
    set_seed(kwargs['seed'])

    reward_baseline = LinearTimeBaseline()
    return_baseline = LinearFeatureBaseline()

    env = normalize(kwargs['env']())  # Wrappers?

    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),  # Todo...?
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['hidden_sizes'],
        learn_std=kwargs['learn_std'],
        hidden_nonlinearity=kwargs['hidden_nonlinearity'],
        output_nonlinearity=kwargs['output_nonlinearity'],
    )

    # Load policy here

    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
        envs_per_task=int(kwargs['rollouts_per_meta_task'] / 2),
    )

    sample_processor = DiceMAMLSampleProcessor(
        baseline=reward_baseline,
        max_path_length=kwargs['max_path_length'],
        discount=kwargs['discount'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
        return_baseline=return_baseline,
    )

    algo = VPG_DICEMAML(
        policy=policy,
        max_path_length=kwargs['max_path_length'],
        meta_batch_size=kwargs['meta_batch_size'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        inner_lr=kwargs['inner_lr'],
        learning_rate=kwargs['learning_rate'],
    )

    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
    )

    trainer.train()
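# Unlike main(config) above, run_experiment takes its hyperparameters as keyword
# arguments and expects kwargs['env'] to be an env class (it is called directly),
# not a string name. A hedged sketch of a call; every value is an illustrative
# assumption, and EXP_NAME must be defined at module level:
#
#     run_experiment(seed=1, env=HalfCheetahRandDirecEnv, meta_batch_size=40,
#                    hidden_sizes=(64, 64), learn_std=True,
#                    hidden_nonlinearity=tf.tanh, output_nonlinearity=None,
#                    rollouts_per_meta_task=20, max_path_length=100, parallel=False,
#                    discount=0.99, normalize_adv=True, positive_adv=False,
#                    num_inner_grad_steps=1, inner_lr=0.1, learning_rate=1e-3,
#                    n_itr=500)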
    type=json.loads,
    default={},
    help='accepts json for overriding training parameters')
parser.add_argument('--video_filename', default=None)
parser.add_argument('--num_trajs', type=int, default=10)
args = parser.parse_args(sys.argv[1:])

params_path = os.path.join(os.path.split(args.restore_path)[0], 'params.json')
with open(params_path, 'r') as f:
    params = json.load(f)
params.update(args.overrides)

baseline = LinearFeatureBaseline()
env = globals()[params['env']]()  # instantiate env
env = normalize(env)              # apply normalize wrapper to env

gpu_config = tf.ConfigProto()
gpu_config.gpu_options.allow_growth = True  # pylint: disable=E1101
sess = tf.Session(config=gpu_config)

policy = MetaGaussianMLPPolicy(
    name="meta-policy",
    obs_dim=np.prod(env.observation_space.shape),
    action_dim=np.prod(env.action_space.shape),
    meta_batch_size=params['meta_batch_size'],
    hidden_sizes=params['hidden_sizes'],
    cell_size=params['cell_size'],