Example #1
def build_critic(args, data, env, writer=None):
    # optionally keep a replay memory of past samples for training the critic
    if args.use_critic_replay_memory:
        critic_replay_memory = hgail.misc.utils.KeyValueReplayMemory(maxsize=3 * args.batch_size)
    else:
        critic_replay_memory = None

    # dataset that serves expert (and any replayed) samples in batches
    critic_dataset = CriticDataset(
        data, 
        replay_memory=critic_replay_memory,
        batch_size=args.critic_batch_size,
        flat_recurrent=args.policy_recurrent
    )

    # MLP over concatenated (observation, action) inputs
    critic_network = ObservationActionMLP(
        name='critic', 
        hidden_layer_dims=args.critic_hidden_layer_dims,
        dropout_keep_prob=args.critic_dropout_keep_prob
    )
    # WGAN-style critic with gradient penalty, optimized with RMSProp
    critic = WassersteinCritic(
        obs_dim=env.observation_space.flat_dim,
        act_dim=env.action_space.flat_dim,
        dataset=critic_dataset, 
        network=critic_network,
        gradient_penalty=args.gradient_penalty,
        optimizer=tf.train.RMSPropOptimizer(args.critic_learning_rate),
        n_train_epochs=args.n_critic_train_epochs,
        summary_writer=writer,
        grad_norm_rescale=args.critic_grad_rescale,
        verbose=2,
        debug_nan=True
    )
    return critic
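A minimal usage sketch, not part of the original snippets: `args` (an argparse-style namespace with the fields read above) and an rllab-style `env` are assumed to exist already, and the expert `data` dict follows the same observations/actions layout as the CriticDataset call in Example #10.

# Hypothetical wiring of build_critic; `args` and `env` are assumptions here.
import numpy as np
import tensorflow as tf

expert_obs = np.random.randn(1000, env.observation_space.flat_dim)  # stand-in expert observations
expert_acts = np.random.randn(1000, env.action_space.flat_dim)      # stand-in expert actions
data = dict(observations=expert_obs, actions=expert_acts)           # layout as in Example #10

with tf.Session() as session:
    writer = tf.summary.FileWriter('/tmp/critic_summaries')         # assumed log directory
    critic = build_critic(args, data, env, writer=writer)
    session.run(tf.global_variables_initializer())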
Example #2
def build_recognition_model(args, env, writer=None):
    # the recognition model (latent-variable classifier) is only used with InfoGAIL
    if args.use_infogail:
        recognition_dataset = RecognitionDataset(
            args.batch_size,
            flat_recurrent=args.policy_recurrent
        )
        recognition_network = ObservationActionMLP(
            name='recog', 
            hidden_layer_dims=args.recognition_hidden_layer_dims,
            output_dim=args.latent_dim
        )
        recognition_model = RecognitionModel(
            obs_dim=env.observation_space.flat_dim,
            act_dim=env.action_space.flat_dim,
            dataset=recognition_dataset, 
            network=recognition_network,
            variable_type='categorical',
            latent_dim=args.latent_dim,
            optimizer=tf.train.AdamOptimizer(args.recognition_learning_rate),
            n_train_epochs=args.n_recognition_train_epochs,
            summary_writer=writer,
            verbose=2
        )
    else:
        recognition_model = None
    return recognition_model
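For orientation, a compressed sketch, assumed rather than taken from the source, of where these two builders plug in; the GAIL keyword arguments mirror Example #10 below, and `policy`, `baseline`, `reward_handler`, `args`, `data`, and `env` are presumed to be constructed elsewhere.

# Assumed glue code; hyperparameter values copied from Example #10.
with tf.Session() as session:
    critic = build_critic(args, data, env)
    recognition_model = build_recognition_model(args, env)
    algo = GAIL(critic=critic,
                recognition=recognition_model,
                reward_handler=reward_handler,
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=4000,
                max_path_length=200,
                n_itr=15,
                discount=.99,
                step_size=.01)
    session.run(tf.global_variables_initializer())
    algo.train(sess=session)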
Example #3
    def test_train_simple(self):
        dataset = RecognitionDataset(batch_size=200, domain=True)

        # create a dataset where x and a are drawn from a unit gaussian and
        # c = [1,0] if x+a < 0 else [0,1]
        n_samples = 2000
        x = np.random.randn(n_samples).reshape(-1, 1)
        a = np.random.randn(n_samples).reshape(-1, 1)
        c = np.zeros((n_samples, 2), dtype=np.int32)
        zero_idxs = np.where(x + a < 0)[0]
        one_idxs = np.where(x + a >= 0)[0]
        c[zero_idxs, 0] = 1
        c[one_idxs, 1] = 1
        c = np.int32(c)
        d = np.zeros((n_samples, 2), dtype=np.int32)
        d[:(n_samples // 2), 0] = 1
        d[(n_samples // 2):, 1] = 1

        data = dict(observations=x,
                    actions=a,
                    agent_infos=dict(latent=c),
                    env_infos=dict(domain=d))

        with tf.Session() as session:
            latent_classifier = ObservationActionMLP(name='encoder',
                                                     hidden_layer_dims=[32],
                                                     output_dim=2,
                                                     return_features=True)

            domain_classifier = Classifier(name='domain_classifier',
                                           hidden_layer_dims=[32],
                                           output_dim=2)

            recog = DomainAdvRecognitionModel(
                latent_classifier=latent_classifier,
                domain_classifier=domain_classifier,
                obs_dim=1,
                act_dim=1,
                dataset=dataset,
                variable_type='categorical',
                latent_dim=2)
            session.run(tf.global_variables_initializer())

            n_epochs = 100
            for epoch in range(n_epochs):
                recog.train(epoch, data)

            probs = recog._probs(data['observations'], data['actions'])
            idxs = np.argmax(c, axis=1)
            loss = -np.log(probs[np.arange(n_samples), idxs]).mean()
            self.assertTrue(loss < .1)
Example #4
    def test_train_gaussian(self):
        dataset = RecognitionDataset(batch_size=200)

        n_samples = 100
        latent_dim = 2
        x = np.random.randn(n_samples, latent_dim)
        a = np.random.randn(n_samples, latent_dim)
        mean = x + a
        sigma = np.exp(x * a / 2)
        c = []
        for (mean_i, sigma_i) in zip(mean, sigma):
            c.append(
                np.random.multivariate_normal(mean_i,
                                              np.eye(latent_dim) * sigma_i))
        c = np.array(c)

        data = dict(observations=x, actions=a, agent_infos=dict(latent=c))

        with tf.Session() as session:
            network = ObservationActionMLP(name='recognition',
                                           hidden_layer_dims=[64, 64],
                                           output_dim=latent_dim * 2)
            recog = RecognitionModel(obs_dim=2,
                                     act_dim=2,
                                     dataset=dataset,
                                     network=network,
                                     variable_type='gaussian',
                                     latent_dim=latent_dim,
                                     optimizer=tf.train.AdamOptimizer(
                                         .001, beta1=.5, beta2=.9),
                                     verbose=0)
            session.run(tf.global_variables_initializer())

            n_epochs = 200
            for epoch in range(n_epochs):
                recog.train(epoch, data)

            probs = recog._probs(data['observations'], data['actions'],
                                 data['agent_infos']['latent'])
            loss = np.mean(-np.log(probs))
            self.assertTrue(loss < 1.)  # just whether it can overfit
Example #5
    def test_train_categorical(self):
        dataset = RecognitionDataset(batch_size=200)

        # create a dataset where x and a are drawn from a unit gaussian and
        # c = [1,0] if x+a < 0 else [0,1]
        n_samples = 2000
        x = np.random.randn(n_samples).reshape(-1, 1)
        a = np.random.randn(n_samples).reshape(-1, 1)
        c = np.zeros((n_samples, 2), dtype=np.int32)
        zero_idxs = np.where(x + a < 0)[0]
        one_idxs = np.where(x + a >= 0)[0]
        c[zero_idxs, 0] = 1
        c[one_idxs, 1] = 1
        c = np.int32(c)

        data = dict(observations=x, actions=a, agent_infos=dict(latent=c))

        with tf.Session() as session:
            network = ObservationActionMLP(name='recognition',
                                           hidden_layer_dims=[16],
                                           output_dim=2)
            recog = RecognitionModel(obs_dim=1,
                                     act_dim=1,
                                     dataset=dataset,
                                     network=network,
                                     variable_type='categorical',
                                     latent_dim=2)
            session.run(tf.global_variables_initializer())

            n_epochs = 100
            for epoch in range(n_epochs):
                recog.train(epoch, data)

            probs = recog._probs(data['observations'], data['actions'])
            idxs = np.argmax(c, axis=1)
            loss = -np.log(probs[np.arange(n_samples), idxs]).mean()
            self.assertTrue(loss < .1)
Example #6
    def test_recognize_gaussian(self):

        network = ObservationActionMLP(name='recognition',
                                       hidden_layer_dims=[16],
                                       output_dim=2 * 2)
        dataset = RecognitionDataset(batch_size=10)

        with tf.Session() as session:

            recog = RecognitionModel(obs_dim=1,
                                     act_dim=1,
                                     dataset=dataset,
                                     network=network,
                                     variable_type='gaussian',
                                     latent_dim=2)
            session.run(tf.global_variables_initializer())
            paths = [
                dict(observations=[[1], [2]],
                     actions=[[1], [2]],
                     rewards=[[1], [2]],
                     agent_infos=dict(latent_info=dict(
                         latent=[[.5, .1], [.5, .1]]))),
                dict(observations=[[1], [2], [3]],
                     actions=[[1], [2], [3]],
                     rewards=[[1], [2], [3]],
                     agent_infos=dict(latent_info=dict(
                         latent=[[-.1, .1], [.4, .4], [.6, .7]]))),
                dict(observations=[[1]],
                     actions=[[1]],
                     rewards=[[1]],
                     agent_infos=dict(latent_info=dict(latent=[[-1., 1.]]))),
            ]
            rewards = recog.recognize(1, paths)
            self.assertTrue(len(rewards[0]) == 2)
            self.assertTrue(len(rewards[1]) == 3)
            self.assertTrue(len(rewards[2]) == 1)
Example #7
else:
    critic_replay_memory = None
critic_dataset = CriticDataset(data,
                               replay_memory=critic_replay_memory,
                               batch_size=1000)

# session for actual training
with tf.Session() as session:

    # summary writer
    summary_writer = tf.summary.FileWriter(
        os.path.join(exp_dir, 'imitate', 'summaries'))

    # build the critic
    critic_network = ObservationActionMLP(
        name='critic',
        hidden_layer_dims=[64, 64],
        dropout_keep_prob=critic_dropout_keep_prob)
    critic = WassersteinCritic(
        obs_dim=env.observation_space.flat_dim,
        act_dim=env.action_space.n,
        dataset=critic_dataset,
        network=critic_network,
        gradient_penalty=1.,
        optimizer=tf.train.RMSPropOptimizer(critic_learning_rate),
        n_train_epochs=n_critic_train_epochs,
        summary_writer=summary_writer,
        grad_norm_rescale=50.,
        verbose=2,
    )

    if use_infogail:
Example #8
else:
    critic_replay_memory = None

critic_dataset = CriticDataset(data, batch_size=4000, replay_memory=critic_replay_memory)

# session for actual training
with tf.Session() as session:
 
    # summary writer 
    summary_writer = tf.summary.FileWriter(
        os.path.join(exp_dir, phase, 'summaries'))

    # build the critic
    with tf.variable_scope('critic'):
        critic_network = ObservationActionMLP(
            name='critic', 
            hidden_layer_dims=[64,64]
        )
        critic = WassersteinCritic(
            obs_dim=env.observation_space.flat_dim,
            act_dim=env.action_space.n,
            dataset=critic_dataset, 
            network=critic_network,
            gradient_penalty=10.,
            optimizer=tf.train.RMSPropOptimizer(critc_lr),
            n_train_epochs=n_critic_train_epochs,
            summary_writer=summary_writer,
            verbose=2,
        )

    # level 2
    base_latent_sampler = UniformlyRandomLatentSampler(
Example #9
def build_hgail(env, critic_dataset, batch_size):

    # critic
    with tf.variable_scope('critic'):
        critic_network = ObservationActionMLP(name='critic',
                                              hidden_layer_dims=[32, 32])
        critic = WassersteinCritic(obs_dim=3,
                                   act_dim=2,
                                   dataset=critic_dataset,
                                   network=critic_network,
                                   gradient_penalty=.01,
                                   optimizer=tf.train.AdamOptimizer(.001,
                                                                    beta1=.5,
                                                                    beta2=.9),
                                   n_train_epochs=50)

    # base latent variable sampler
    base_latent_sampler = UniformlyRandomLatentSampler(
        scheduler=ConstantIntervalScheduler(),
        name='base_latent_sampler',
        dim=3)

    with tf.variable_scope('level_1'):
        recog_dataset_1 = RecognitionDataset(batch_size=batch_size)
        recog_network_1 = ObservationActionMLP(name='recog',
                                               hidden_layer_dims=[32, 32],
                                               output_dim=3)
        recog_1 = RecognitionModel(obs_dim=3,
                                   act_dim=2,
                                   dataset=recog_dataset_1,
                                   network=recog_network_1,
                                   variable_type='categorical',
                                   latent_dim=3,
                                   name='recognition_1')

        latent_sampler = CategoricalLatentSampler(
            scheduler=ConstantIntervalScheduler(k=1),
            name='latent_sampler',
            policy_name='latent_sampler_policy',
            dim=2,
            env_spec=env.spec,
            latent_sampler=base_latent_sampler,
            max_n_envs=20)
        baseline_1 = LinearFeatureBaseline(env_spec=env.spec)

        algo_1 = TRPO(
            env=env,
            policy=latent_sampler,
            baseline=baseline_1,
            batch_size=4000,
            max_path_length=100,
            n_itr=15,
            discount=0.99,
            step_size=0.01,
            sampler_cls=HierarchySampler,
        )
        reward_handler_1 = RewardHandler(use_env_rewards=False,
                                         critic_final_scale=1.)
        level_1 = Level(depth=1,
                        algo=algo_1,
                        reward_handler=reward_handler_1,
                        recognition_model=recog_1)

    with tf.variable_scope('level_0'):

        # recognition model
        recog_dataset_0 = RecognitionDataset(batch_size=batch_size)
        recog_network_0 = ObservationActionMLP(name='recog',
                                               hidden_layer_dims=[32, 32],
                                               output_dim=2)
        recog_0 = RecognitionModel(obs_dim=3,
                                   act_dim=2,
                                   dataset=recog_dataset_0,
                                   network=recog_network_0,
                                   variable_type='categorical',
                                   latent_dim=2,
                                   name='recognition_0')

        policy = CategoricalLatentVarMLPPolicy(policy_name="policy",
                                               latent_sampler=latent_sampler,
                                               env_spec=env.spec)
        baseline_0 = LinearFeatureBaseline(env_spec=env.spec)

        algo_0 = TRPO(env=env,
                      policy=policy,
                      baseline=baseline_0,
                      batch_size=4000,
                      max_path_length=100,
                      n_itr=5,
                      discount=0.99,
                      step_size=0.1,
                      sampler_args=dict(n_envs=1))

        reward_handler_0 = RewardHandler(use_env_rewards=False,
                                         critic_final_scale=1.)
        level_0 = Level(depth=0,
                        algo=algo_0,
                        reward_handler=reward_handler_0,
                        recognition_model=recog_0)

    hierarchy = [level_0, level_1]
    algo = HGAIL(
        critic=critic,
        hierarchy=hierarchy,
    )
    return algo
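A hedged driver sketch for build_hgail, assuming HGAIL exposes the same train(sess=...) entry point shown for GAIL in Example #10; rx and ra stand in for expert observation/action arrays matching the obs_dim=3, act_dim=2 used above.

# Assumed usage, not from the source.
rx = np.zeros((1000, 3))  # stand-in expert observations (obs_dim=3)
ra = np.zeros((1000, 2))  # stand-in expert actions (act_dim=2)
with tf.Session() as session:
    critic_dataset = CriticDataset(dict(observations=rx, actions=ra),
                                   batch_size=1000)
    algo = build_hgail(env, critic_dataset, batch_size=1000)
    session.run(tf.global_variables_initializer())
    algo.train(sess=session)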
Example #10
    def test_infogail_two_round_stochastic_env(self):

        env = TfEnv(TwoRoundNondeterministicRewardEnv())

        # dataset of one-hot obs and acts
        # optimal actions: 0, 1
        # first state
        n_expert_samples = 1000
        batch_size = 1000
        half = int(n_expert_samples / 2)
        rx = np.zeros((n_expert_samples, 3))
        rx[:half, 2] = 1
        rx[half:, 0] = 1
        ra = np.zeros((n_expert_samples, 2))
        ra[:half, 0] = 1
        ra[half:, 1] = 1

        with tf.Session() as session:
            # critic
            critic_dataset = CriticDataset(dict(observations=rx, actions=ra),
                                           batch_size=batch_size)
            critic_network = ObservationActionMLP(name='critic',
                                                  hidden_layer_dims=[32, 32])
            critic = WassersteinCritic(obs_dim=3,
                                       act_dim=2,
                                       dataset=critic_dataset,
                                       network=critic_network,
                                       gradient_penalty=.01,
                                       optimizer=tf.train.AdamOptimizer(
                                           .001, beta1=.5, beta2=.9),
                                       n_train_epochs=50)

            # recognition model
            recog_dataset = RecognitionDataset(batch_size=batch_size)
            recog_network = ObservationActionMLP(name='recog',
                                                 hidden_layer_dims=[32, 32],
                                                 output_dim=2)
            recog = RecognitionModel(obs_dim=3,
                                     act_dim=2,
                                     dataset=recog_dataset,
                                     network=recog_network,
                                     variable_type='categorical',
                                     latent_dim=2)

            # policy
            env.spec.num_envs = 10
            latent_sampler = UniformlyRandomLatentSampler(
                scheduler=ConstantIntervalScheduler(),
                name='latent_sampler',
                dim=2)
            policy = CategoricalLatentVarMLPPolicy(
                policy_name="policy",
                latent_sampler=latent_sampler,
                env_spec=env.spec)

            # gail
            reward_handler = RewardHandler(use_env_rewards=False,
                                           critic_final_scale=1.)
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = GAIL(critic=critic,
                        recognition=recog,
                        reward_handler=reward_handler,
                        env=env,
                        policy=policy,
                        baseline=baseline,
                        batch_size=4000,
                        max_path_length=200,
                        n_itr=15,
                        discount=.99,
                        step_size=.01,
                        sampler_args=dict(n_envs=env.spec.num_envs))

            session.run(tf.global_variables_initializer())

            # run it!
            algo.train(sess=session)

            # evaluate
            l0_state_infos = dict(latent=[[1, 0]])
            l0_dist_2 = policy.dist_info([[0., 0., 1.]],
                                         l0_state_infos)['prob']
            l0_dist_0 = policy.dist_info([[1., 0., 0.]],
                                         l0_state_infos)['prob']

            l1_state_infos = dict(latent=[[0, 1]])
            l1_dist_2 = policy.dist_info([[0., 0., 1.]],
                                         l1_state_infos)['prob']
            l1_dist_0 = policy.dist_info([[1., 0., 0.]],
                                         l1_state_infos)['prob']

            np.testing.assert_array_almost_equal(l0_dist_2, [[1, 0]], 1)
            np.testing.assert_array_almost_equal(l0_dist_0, [[0, 1]], 1)
            np.testing.assert_array_almost_equal(l1_dist_2, [[1, 0]], 1)
            np.testing.assert_array_almost_equal(l1_dist_0, [[0, 1]], 1)
    def test_train_domain_matters(self):
        dataset = RecognitionDataset(batch_size=1000, domain=True)

        # need a case where the classifier cannot succeed with domain-adversarial
        # training, but does succeed without it
        n_samples = 100
        obs_dim = act_dim = 2
        xs = np.ones((n_samples, obs_dim))
        ys = np.zeros((n_samples, 2))
        ys[:, 0] = 1
        xt = -np.ones((n_samples, obs_dim))
        yt = np.zeros((n_samples, 2))
        yt[:, 1] = 1
        x = np.concatenate((xs, xt), 0)
        y = np.concatenate((ys, yt), 0)

        # randomly permute beforehand, because otherwise each batch contains
        # only one of the domains and training behaves strangely: the loss
        # keeps increasing and the feature values grow arbitrarily large.
        # shouldn't the feature values just match over time instead?
        # maybe an l2 penalty or dropout is required for that to happen?
        idxs = np.random.permutation(n_samples * 2)
        data = dict(observations=x[idxs],
                    actions=x[idxs],
                    agent_infos=dict(latent=y[idxs]),
                    env_infos=dict(domain=y[idxs]))

        with tf.Session() as session:
            latent_classifier = ObservationActionMLP(name='encoder',
                                                     hidden_layer_dims=[16, 4],
                                                     output_dim=2,
                                                     return_features=True,
                                                     dropout_keep_prob=1.,
                                                     l2_reg=0.)

            domain_classifier = Classifier(name='domain_classifier',
                                           hidden_layer_dims=[16, 16],
                                           output_dim=2,
                                           dropout_keep_prob=1.)

            recog = DomainAdvRecognitionModel(
                latent_classifier=latent_classifier,
                domain_classifier=domain_classifier,
                obs_dim=obs_dim,
                act_dim=act_dim,
                dataset=dataset,
                variable_type='categorical',
                latent_dim=2,
                lambda_final=1e10,
                lambda_initial=1e10,
                grad_clip=1000.0,
                grad_scale=50.0,
                verbose=0)
            session.run(tf.global_variables_initializer())

            n_epochs = 500
            for epoch in range(n_epochs):
                recog.train(epoch, data)

            feed = {recog.x: x, recog.a: x, recog.c: y, recog.d: y}
            outputs_list = [
                recog.features, recog.acc, recog.domain_acc,
                recog.domain_probs, recog.probs, recog.gradients
            ]
            features, acc, domain_acc, domain_probs, probs, grads = session.run(
                outputs_list, feed_dict=feed)
            src_features = features[:n_samples]
            tgt_features = features[n_samples:]

            self.assertTrue(np.abs(domain_probs[0][0] - .5) < .1)
            self.assertTrue(np.abs(domain_probs[n_samples][0] - .5) < .1)