def build_critic(args, data, env, writer=None):
    """Construct the Wasserstein critic used for adversarial imitation.

    Args:
        args: parsed experiment arguments carrying the critic hyperparameters.
        data: expert demonstration data backing the critic dataset.
        env: environment whose observation/action spaces size the critic inputs.
        writer: optional TF summary writer for critic training summaries.

    Returns:
        A configured WassersteinCritic instance.
    """
    # Optionally keep a bounded replay memory so the critic also trains on
    # samples gathered during earlier iterations.
    replay_memory = (
        hgail.misc.utils.KeyValueReplayMemory(maxsize=3 * args.batch_size)
        if args.use_critic_replay_memory
        else None
    )
    dataset = CriticDataset(
        data,
        replay_memory=replay_memory,
        batch_size=args.critic_batch_size,
        flat_recurrent=args.policy_recurrent,
    )
    network = ObservationActionMLP(
        name='critic',
        hidden_layer_dims=args.critic_hidden_layer_dims,
        dropout_keep_prob=args.critic_dropout_keep_prob,
    )
    return WassersteinCritic(
        obs_dim=env.observation_space.flat_dim,
        act_dim=env.action_space.flat_dim,
        dataset=dataset,
        network=network,
        gradient_penalty=args.gradient_penalty,
        optimizer=tf.train.RMSPropOptimizer(args.critic_learning_rate),
        n_train_epochs=args.n_critic_train_epochs,
        summary_writer=writer,
        grad_norm_rescale=args.critic_grad_rescale,
        verbose=2,
        debug_nan=True,
    )
def build_recognition_model(args, env, writer=None):
    """Build the InfoGAIL recognition model, or None when InfoGAIL is off.

    Args:
        args: parsed experiment arguments with recognition hyperparameters.
        env: environment whose spaces size the recognition network inputs.
        writer: optional TF summary writer for recognition training summaries.

    Returns:
        A RecognitionModel when ``args.use_infogail`` is set, otherwise None.
    """
    # Without InfoGAIL there is nothing to recognize.
    if not args.use_infogail:
        return None

    dataset = RecognitionDataset(
        args.batch_size,
        flat_recurrent=args.policy_recurrent,
    )
    network = ObservationActionMLP(
        name='recog',
        hidden_layer_dims=args.recognition_hidden_layer_dims,
        output_dim=args.latent_dim,
    )
    return RecognitionModel(
        obs_dim=env.observation_space.flat_dim,
        act_dim=env.action_space.flat_dim,
        dataset=dataset,
        network=network,
        variable_type='categorical',
        latent_dim=args.latent_dim,
        optimizer=tf.train.AdamOptimizer(args.recognition_learning_rate),
        n_train_epochs=args.n_recognition_train_epochs,
        summary_writer=writer,
        verbose=2,
    )
def test_train_simple(self):
    """Domain-adversarial recognition model recovers c = sign(x + a) labels.

    Trains DomainAdvRecognitionModel on a synthetic dataset and asserts the
    final cross-entropy loss on the training data is below 0.1.
    """
    dataset = RecognitionDataset(batch_size=200, domain=True)
    # create dataset where x and a from unit gaussian and
    # c = [1,0] if x+a < 0 else [0,1]
    n_samples = 2000
    x = np.random.randn(n_samples).reshape(-1, 1)
    a = np.random.randn(n_samples).reshape(-1, 1)
    c = np.zeros((n_samples, 2), dtype=np.int32)
    zero_idxs = np.where(x + a < 0)[0]
    one_idxs = np.where(x + a >= 0)[0]
    c[zero_idxs, 0] = 1
    c[one_idxs, 1] = 1
    # NOTE: c is already int32 (created with dtype=np.int32 above), so the
    # previous redundant `c = np.int32(c)` cast has been removed.
    # Domain labels: first half of samples are "source", second half "target".
    d = np.zeros((n_samples, 2), dtype=np.int32)
    d[:(n_samples // 2), 0] = 1
    d[(n_samples // 2):, 1] = 1
    data = dict(observations=x,
                actions=a,
                agent_infos=dict(latent=c),
                env_infos=dict(domain=d))
    with tf.Session() as session:
        latent_classifier = ObservationActionMLP(name='encoder',
                                                 hidden_layer_dims=[32],
                                                 output_dim=2,
                                                 return_features=True)
        domain_classifier = Classifier(name='domain_classifier',
                                       hidden_layer_dims=[32],
                                       output_dim=2)
        recog = DomainAdvRecognitionModel(
            latent_classifier=latent_classifier,
            domain_classifier=domain_classifier,
            obs_dim=1,
            act_dim=1,
            dataset=dataset,
            variable_type='categorical',
            latent_dim=2)
        session.run(tf.global_variables_initializer())

        n_epochs = 100
        for epoch in range(n_epochs):
            recog.train(epoch, data)

        # Mean negative log-likelihood of the true latent under the model.
        probs = recog._probs(data['observations'], data['actions'])
        idxs = np.argmax(c, axis=1)
        loss = -np.log(probs[np.arange(n_samples), idxs]).mean()
        self.assertTrue(loss < .1)
def test_train_gaussian(self):
    """Recognition model with gaussian latents can overfit a small dataset."""
    dataset = RecognitionDataset(batch_size=200)
    n_samples = 100
    latent_dim = 2

    # Inputs are unit gaussians; the latent for each row is sampled from a
    # gaussian whose mean and scale are simple functions of (x, a).
    x = np.random.randn(n_samples, latent_dim)
    a = np.random.randn(n_samples, latent_dim)
    mean = x + a
    sigma = np.exp(x * a / 2)
    c = np.array([
        np.random.multivariate_normal(m, np.eye(latent_dim) * s)
        for m, s in zip(mean, sigma)
    ])
    data = dict(observations=x, actions=a, agent_infos=dict(latent=c))

    with tf.Session() as session:
        # Network outputs both mean and (log) scale, hence latent_dim * 2.
        network = ObservationActionMLP(name='recognition',
                                       hidden_layer_dims=[64, 64],
                                       output_dim=latent_dim * 2)
        recog = RecognitionModel(obs_dim=2,
                                 act_dim=2,
                                 dataset=dataset,
                                 network=network,
                                 variable_type='gaussian',
                                 latent_dim=latent_dim,
                                 optimizer=tf.train.AdamOptimizer(
                                     .001, beta1=.5, beta2=.9),
                                 verbose=0)
        session.run(tf.global_variables_initializer())

        for epoch in range(200):
            recog.train(epoch, data)

        probs = recog._probs(data['observations'],
                             data['actions'],
                             data['agent_infos']['latent'])
        loss = np.mean(-np.log(probs))
        # Only checks that the model can overfit the training data.
        self.assertTrue(loss < 1.)
def test_train_categorical(self):
    """Categorical recognition model learns c = sign(x + a) from data.

    Trains RecognitionModel on a synthetic dataset and asserts the final
    cross-entropy loss on the training data is below 0.1.
    """
    dataset = RecognitionDataset(batch_size=200)
    # create dataset where x and a from unit gaussian and
    # c = [1,0] if x+a < 0 else [0,1]
    n_samples = 2000
    x = np.random.randn(n_samples).reshape(-1, 1)
    a = np.random.randn(n_samples).reshape(-1, 1)
    c = np.zeros((n_samples, 2), dtype=np.int32)
    zero_idxs = np.where(x + a < 0)[0]
    one_idxs = np.where(x + a >= 0)[0]
    c[zero_idxs, 0] = 1
    c[one_idxs, 1] = 1
    # NOTE: c is already int32 (created with dtype=np.int32 above), so the
    # previous redundant `c = np.int32(c)` cast has been removed.
    data = dict(observations=x, actions=a, agent_infos=dict(latent=c))
    with tf.Session() as session:
        network = ObservationActionMLP(name='recognition',
                                       hidden_layer_dims=[16],
                                       output_dim=2)
        recog = RecognitionModel(obs_dim=1,
                                 act_dim=1,
                                 dataset=dataset,
                                 network=network,
                                 variable_type='categorical',
                                 latent_dim=2)
        session.run(tf.global_variables_initializer())

        n_epochs = 100
        for epoch in range(n_epochs):
            recog.train(epoch, data)

        # Mean negative log-likelihood of the true latent under the model.
        probs = recog._probs(data['observations'], data['actions'])
        idxs = np.argmax(c, axis=1)
        loss = -np.log(probs[np.arange(n_samples), idxs]).mean()
        self.assertTrue(loss < .1)
def test_recognize_gaussian(self):
    """recognize() returns one reward per timestep for each input path."""
    # Gaussian latents of dim 2 need mean + scale outputs, hence 2 * 2.
    network = ObservationActionMLP(name='recognition',
                                   hidden_layer_dims=[16],
                                   output_dim=2 * 2)
    dataset = RecognitionDataset(batch_size=10)
    with tf.Session() as session:
        recog = RecognitionModel(obs_dim=1,
                                 act_dim=1,
                                 dataset=dataset,
                                 network=network,
                                 variable_type='gaussian',
                                 latent_dim=2)
        session.run(tf.global_variables_initializer())

        # Three paths with lengths 2, 3, and 1 respectively.
        paths = [
            dict(observations=[[1], [2]],
                 actions=[[1], [2]],
                 rewards=[[1], [2]],
                 agent_infos=dict(latent_info=dict(
                     latent=[[.5, .1], [.5, .1]]))),
            dict(observations=[[1], [2], [3]],
                 actions=[[1], [2], [3]],
                 rewards=[[1], [2], [3]],
                 agent_infos=dict(latent_info=dict(
                     latent=[[-.1, .1], [.4, .4], [.6, .7]]))),
            dict(observations=[[1]],
                 actions=[[1]],
                 rewards=[[1]],
                 agent_infos=dict(latent_info=dict(latent=[[-1., 1.]]))),
        ]
        rewards = recog.recognize(1, paths)
        # One recognition reward per timestep of each path.
        for path_idx, expected_len in enumerate([2, 3, 1]):
            self.assertTrue(len(rewards[path_idx]) == expected_len)
else: critic_replay_memory = None critic_dataset = CriticDataset(data, replay_memory=critic_replay_memory, batch_size=1000) # session for actual training with tf.Session() as session: # summary writer summary_writer = tf.summary.FileWriter( os.path.join(exp_dir, 'imitate', 'summaries')) # build the critic critic_network = ObservationActionMLP( name='critic', hidden_layer_dims=[64, 64], dropout_keep_prob=critic_dropout_keep_prob) critic = WassersteinCritic( obs_dim=env.observation_space.flat_dim, act_dim=env.action_space.n, dataset=critic_dataset, network=critic_network, gradient_penalty=1., optimizer=tf.train.RMSPropOptimizer(critic_learning_rate), n_train_epochs=n_critic_train_epochs, summary_writer=summary_writer, grad_norm_rescale=50., verbose=2, ) if use_infogail:
else: critic_replay_memory = None critic_dataset = CriticDataset(data, batch_size=4000, replay_memory=critic_replay_memory) # session for actual training with tf.Session() as session: # summary writer summary_writer = tf.summary.FileWriter( os.path.join(exp_dir, phase, 'summaries')) # build the critic with tf.variable_scope('critic'): critic_network = ObservationActionMLP( name='critic', hidden_layer_dims=[64,64] ) critic = WassersteinCritic( obs_dim=env.observation_space.flat_dim, act_dim=env.action_space.n, dataset=critic_dataset, network=critic_network, gradient_penalty=10., optimizer=tf.train.RMSPropOptimizer(critc_lr), n_train_epochs=n_critic_train_epochs, summary_writer=summary_writer, verbose=2, ) # level 2 base_latent_sampler = UniformlyRandomLatentSampler(
def build_hgail(env, critic_dataset, batch_size):
    """Assemble a two-level hierarchical GAIL (HGAIL) algorithm.

    Builds a shared Wasserstein critic, then two ``Level``s: level 1 runs a
    categorical latent sampler (itself fed by a uniformly random base
    sampler) as its "policy", and level 0 runs a latent-conditioned MLP
    policy. Each level gets its own recognition model, linear baseline, and
    TRPO optimizer.

    Args:
        env: the environment (with ``env.spec``) to imitate in.
        critic_dataset: CriticDataset of expert (observation, action) pairs.
        batch_size: batch size for both recognition-model datasets.

    Returns:
        An HGAIL instance wrapping the critic and [level_0, level_1].
    """
    # critic
    with tf.variable_scope('critic'):
        critic_network = ObservationActionMLP(name='critic',
                                              hidden_layer_dims=[32, 32])
        critic = WassersteinCritic(obs_dim=3,
                                   act_dim=2,
                                   dataset=critic_dataset,
                                   network=critic_network,
                                   gradient_penalty=.01,
                                   optimizer=tf.train.AdamOptimizer(
                                       .001, beta1=.5, beta2=.9),
                                   n_train_epochs=50)

    # base latent variable sampler
    base_latent_sampler = UniformlyRandomLatentSampler(
        scheduler=ConstantIntervalScheduler(),
        name='base_latent_sampler',
        dim=3)

    # level 1: produces the latent codes consumed by the level-0 policy
    with tf.variable_scope('level_1'):
        recog_dataset_1 = RecognitionDataset(batch_size=batch_size)
        recog_network_1 = ObservationActionMLP(name='recog',
                                               hidden_layer_dims=[32, 32],
                                               output_dim=3)
        recog_1 = RecognitionModel(obs_dim=3,
                                   act_dim=2,
                                   dataset=recog_dataset_1,
                                   network=recog_network_1,
                                   variable_type='categorical',
                                   latent_dim=3,
                                   name='recognition_1')
        # the categorical latent sampler acts as the policy at this level,
        # itself driven by the uniformly random base sampler
        latent_sampler = CategoricalLatentSampler(
            scheduler=ConstantIntervalScheduler(k=1),
            name='latent_sampler',
            policy_name='latent_sampler_policy',
            dim=2,
            env_spec=env.spec,
            latent_sampler=base_latent_sampler,
            max_n_envs=20)
        baseline_1 = LinearFeatureBaseline(env_spec=env.spec)
        algo_1 = TRPO(
            env=env,
            policy=latent_sampler,
            baseline=baseline_1,
            batch_size=4000,
            max_path_length=100,
            n_itr=15,
            discount=0.99,
            step_size=0.01,
            sampler_cls=HierarchySampler,
        )
        # imitation rewards come solely from the critic (no env rewards)
        reward_handler_1 = RewardHandler(use_env_rewards=False,
                                         critic_final_scale=1.)
        level_1 = Level(depth=1,
                        algo=algo_1,
                        reward_handler=reward_handler_1,
                        recognition_model=recog_1)

    with tf.variable_scope('level_0'):
        # recognition model
        recog_dataset_0 = RecognitionDataset(batch_size=batch_size)
        recog_network_0 = ObservationActionMLP(name='recog',
                                               hidden_layer_dims=[32, 32],
                                               output_dim=2)
        recog_0 = RecognitionModel(obs_dim=3,
                                   act_dim=2,
                                   dataset=recog_dataset_0,
                                   network=recog_network_0,
                                   variable_type='categorical',
                                   latent_dim=2,
                                   name='recognition_0')
        # level-0 policy conditions on the latent code sampled at level 1
        policy = CategoricalLatentVarMLPPolicy(policy_name="policy",
                                               latent_sampler=latent_sampler,
                                               env_spec=env.spec)
        baseline_0 = LinearFeatureBaseline(env_spec=env.spec)
        algo_0 = TRPO(env=env,
                      policy=policy,
                      baseline=baseline_0,
                      batch_size=4000,
                      max_path_length=100,
                      n_itr=5,
                      discount=0.99,
                      step_size=0.1,
                      sampler_args=dict(n_envs=1))
        reward_handler_0 = RewardHandler(use_env_rewards=False,
                                        critic_final_scale=1.)
        level_0 = Level(depth=0,
                        algo=algo_0,
                        reward_handler=reward_handler_0,
                        recognition_model=recog_0)

    # lowest level first in the hierarchy list
    hierarchy = [level_0, level_1]
    algo = HGAIL(
        critic=critic,
        hierarchy=hierarchy,
    )
    return algo
def test_infogail_two_round_stochastic_env(self): env = TfEnv(TwoRoundNondeterministicRewardEnv()) # dataset of one-hot obs and acts # optimal actions: 0, 1 # first state n_expert_samples = 1000 batch_size = 1000 half = int(n_expert_samples / 2) rx = np.zeros((n_expert_samples, 3)) rx[:half, 2] = 1 rx[half:, 0] = 1 ra = np.zeros((n_expert_samples, 2)) ra[:half, 0] = 1 ra[half:, 1] = 1 with tf.Session() as session: # critic critic_dataset = CriticDataset(dict(observations=rx, actions=ra), batch_size=batch_size) critic_network = ObservationActionMLP(name='critic', hidden_layer_dims=[32, 32]) critic = WassersteinCritic(obs_dim=3, act_dim=2, dataset=critic_dataset, network=critic_network, gradient_penalty=.01, optimizer=tf.train.AdamOptimizer( .001, beta1=.5, beta2=.9), n_train_epochs=50) # recognition model recog_dataset = RecognitionDataset(batch_size=batch_size) recog_network = ObservationActionMLP(name='recog', hidden_layer_dims=[32, 32], output_dim=2) recog = RecognitionModel(obs_dim=3, act_dim=2, dataset=recog_dataset, network=recog_network, variable_type='categorical', latent_dim=2) # policy env.spec.num_envs = 10 latent_sampler = UniformlyRandomLatentSampler( scheduler=ConstantIntervalScheduler(), name='latent_sampler', dim=2) policy = CategoricalLatentVarMLPPolicy( policy_name="policy", latent_sampler=latent_sampler, env_spec=env.spec) # gail reward_handler = RewardHandler(use_env_rewards=False, critic_final_scale=1.) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = GAIL(critic=critic, recognition=recog, reward_handler=reward_handler, env=env, policy=policy, baseline=baseline, batch_size=4000, max_path_length=200, n_itr=15, discount=.99, step_size=.01, sampler_args=dict(n_envs=env.spec.num_envs)) session.run(tf.global_variables_initializer()) # run it! 
algo.train(sess=session) # evaluate l0_state_infos = dict(latent=[[1, 0]]) l0_dist_2 = policy.dist_info([[0., 0., 1.]], l0_state_infos)['prob'] l0_dist_0 = policy.dist_info([[1., 0., 0.]], l0_state_infos)['prob'] l1_state_infos = dict(latent=[[0, 1]]) l1_dist_2 = policy.dist_info([[0., 0., 1.]], l1_state_infos)['prob'] l1_dist_0 = policy.dist_info([[1., 0., 0.]], l1_state_infos)['prob'] np.testing.assert_array_almost_equal(l0_dist_2, [[1, 0]], 1) np.testing.assert_array_almost_equal(l0_dist_0, [[0, 1]], 1) np.testing.assert_array_almost_equal(l1_dist_2, [[1, 0]], 1) np.testing.assert_array_almost_equal(l1_dist_0, [[0, 1]], 1)
def test_train_domain_matters(self):
    """Domain-adversarial training should drive the domain classifier to chance.

    Builds a dataset where the class label and the domain are perfectly
    confounded (source domain = all-ones inputs / class 0, target domain =
    all-minus-ones inputs / class 1) and trains with a huge adversarial
    weight; the domain classifier's output probabilities should then be
    near 0.5 on samples from both domains.
    """
    dataset = RecognitionDataset(batch_size=1000, domain=True)
    # need a case where if you have domain adversarial training
    # the classifier isn't able to work, but if you don't have it
    # then it does work
    n_samples = 100
    obs_dim = act_dim = 2
    # source domain: all-ones inputs, class [1, 0]
    xs = np.ones((n_samples, obs_dim))
    ys = np.zeros((n_samples, 2))
    ys[:, 0] = 1
    # target domain: all-minus-ones inputs, class [0, 1]
    xt = -np.ones((n_samples, obs_dim))
    yt = np.zeros((n_samples, 2))
    yt[:, 1] = 1
    x = np.concatenate((xs, xt), 0)
    y = np.concatenate((ys, yt), 0)

    # random permute beforehand because otherwise it seems to have some
    # unusual behavior because each batch contains only one of the domains
    # that is, the loss just keeps increasing with feature values also
    # increasing
    # shouldn't the feature values just match over time?
    # instead seemingly arbitrary values just grow increasingly large
    # maybe it requires an l2 penalty to work? or dropout?
    idxs = np.random.permutation(n_samples * 2)
    # note: observations and actions are intentionally the same array here
    data = dict(observations=x[idxs],
                actions=x[idxs],
                agent_infos=dict(latent=y[idxs]),
                env_infos=dict(domain=y[idxs]))

    with tf.Session() as session:
        latent_classifier = ObservationActionMLP(name='encoder',
                                                 hidden_layer_dims=[16, 4],
                                                 output_dim=2,
                                                 return_features=True,
                                                 dropout_keep_prob=1.,
                                                 l2_reg=0.)
        domain_classifier = Classifier(name='domain_classifier',
                                       hidden_layer_dims=[16, 16],
                                       output_dim=2,
                                       dropout_keep_prob=1.)
        # lambda_* = 1e10 makes the adversarial term dominate from the start;
        # grad_clip / grad_scale keep the resulting gradients bounded
        recog = DomainAdvRecognitionModel(
            latent_classifier=latent_classifier,
            domain_classifier=domain_classifier,
            obs_dim=obs_dim,
            act_dim=act_dim,
            dataset=dataset,
            variable_type='categorical',
            latent_dim=2,
            lambda_final=1e10,
            lambda_initial=1e10,
            grad_clip=1000.0,
            grad_scale=50.0,
            verbose=0)
        session.run(tf.global_variables_initializer())

        n_epochs = 500
        for epoch in range(n_epochs):
            recog.train(epoch, data)

        # evaluate on the unpermuted data so the first n_samples rows are
        # source-domain and the rest are target-domain
        feed = {recog.x: x, recog.a: x, recog.c: y, recog.d: y}
        outputs_list = [
            recog.features, recog.acc, recog.domain_acc,
            recog.domain_probs, recog.probs, recog.gradients
        ]
        features, acc, domain_acc, domain_probs, probs, grads = session.run(
            outputs_list, feed_dict=feed)
        src_features = features[:n_samples]
        tgt_features = features[n_samples:]
        # domain classifier should be near chance (.5) on one sample from
        # each domain
        self.assertTrue(np.abs(domain_probs[0][0] - .5) < .1)
        self.assertTrue(np.abs(domain_probs[n_samples][0] - .5) < .1)