Example No. 1
    def __init__(self, policy, rate, train=True):
        self.rate = rate
        self.policy = policy

        with tf.variable_scope('policy_estimator'):
            self.policy.setup()

            self.X = policy.X
            self.a = policy.a
            self.target = tf.placeholder(dtype='float',
                                         shape=[None, 1],
                                         name='target')

            self.a_pred = policy.a_pred
            self.var = policy.var

            dist = Normal(self.a_pred, self.var)
            self.log_probs = dist.log_pdf(self.a)

            self.losses = self.log_probs * self.target
            self.loss = tf.reduce_sum(self.losses, name='loss')

            if train:
                self.opt = tf.train.RMSPropOptimizer(rate, 0.99, 0.0, 1e-6)
                self.grads_and_vars = self.opt.compute_gradients(self.loss)
                self.grads_and_vars = [(g, v) for g, v in self.grads_and_vars
                                       if g is not None]
                self.update = self.opt.apply_gradients(self.grads_and_vars)
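
The estimator above implements a score-function (REINFORCE-style) objective: the log-density of the taken actions under a Normal policy, weighted by a target such as a return or advantage. Below is a minimal standalone sketch of the same pattern, assuming Normal comes from the old tf.contrib.distributions API (which exposed log_pdf); the placeholder shapes and the toy linear policy are hypothetical.

import tensorflow as tf
from tensorflow.contrib.distributions import Normal

X = tf.placeholder(tf.float32, [None, 4], name='X')            # observations (toy dimension)
a = tf.placeholder(tf.float32, [None, 1], name='action')       # actions actually taken
target = tf.placeholder(tf.float32, [None, 1], name='target')  # returns / advantages

w = tf.Variable(tf.zeros([4, 1]), name='w')                    # toy linear policy mean
a_pred = tf.matmul(X, w)
sigma = tf.ones_like(a_pred)                                   # fixed unit std for the sketch

log_probs = Normal(a_pred, sigma).log_pdf(a)                   # log pi(a | s)
# REINFORCE surrogate: minimize the negative target-weighted log-likelihood.
# (The example above does not negate; its sign convention depends on how the
# caller fills in `target`.)
loss = -tf.reduce_sum(log_probs * target, name='loss')
train_op = tf.train.RMSPropOptimizer(1e-3, 0.99, 0.0, 1e-6).minimize(loss)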
Example No. 2
    def _create_network(self):
        # Initialize autoencoder network weights and biases
        network_weights = self._initialize_weights(**self.network_architecture)

        # Use recognition network to determine mean and
        # (log) variance of Gaussian distribution in latent
        # space
        self.z_mean, self.z_log_sigma_sq = \
            self._recognition_network(network_weights["weights_recog"],
                                      network_weights["biases_recog"],
                                      self.x)

        # Draw one sample z from Gaussian distribution
        n_z = self.network_architecture["n_z"]
        eps = tf.random_normal((self.batch_size, n_z), 0, 1,
                               dtype=tf.float32)
        # z = mu + sigma*epsilon
        self.z = tf.add(self.z_mean,
                        tf.mul(tf.sqrt(tf.exp(self.z_log_sigma_sq)), eps),
                        name='z')

        # Use generator to determine mean of
        # Bernoulli distribution of reconstructed input
        self.x_reconstr_mean = \
            self._generator_network(network_weights["weights_gener"],
                                    network_weights["biases_gener"],
                                    z=self.z)

        ####
        ####
        ####
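        # Draw z_theta directly from the standard normal prior N(0, I)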
        eps = tf.random_normal((self.batch_size, n_z), 0, 1,
                               dtype=tf.float32)

        self.z_theta = tf.add(0.0, tf.mul(1.0, eps), name='z_theta')

        self.x_prime = self._generator_network(network_weights["weights_gener"],
                                               network_weights["biases_gener"],
                                               z=self.z_theta)

        self.z_prime_mean, self.z_prime_log_sigma_sq = self._recognition_network(
            network_weights["weights_recog"],
            network_weights["biases_recog"],
            self.x_prime)

        dist = Normal(mu=self.z_prime_mean, sigma=tf.sqrt(tf.exp(self.z_prime_log_sigma_sq)))
        logli = tf.reduce_sum(dist.log_pdf(self.z_theta, name='x_entropy'), reduction_indices=1)

        self.cross_entropy = tf.reduce_mean(- logli)
        #self.cross_entropy = tf.reduce_mean(- dist.log_pdf(self.z_theta, name='x_entropy'))
        self.entropy = tf.constant(28.37)
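
The hard-coded self.entropy = tf.constant(28.37) is consistent with the differential entropy of the standard-normal prior p(z) = N(0, I): each latent dimension contributes 0.5*ln(2*pi*e) ≈ 1.419 nats, so 28.37 corresponds to roughly 20 latent dimensions (an assumption about this particular n_z, which is not shown in the snippet). A quick check:

import numpy as np

per_dim = 0.5 * np.log(2.0 * np.pi * np.e)   # entropy of N(0, 1) in nats
print(per_dim)         # ~1.4189
print(20 * per_dim)    # ~28.379, matching the hard-coded 28.37 when n_z == 20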
Example No. 3
def main_pendulum(logdir,
                  seed,
                  n_iter,
                  gamma,
                  min_timesteps_per_batch,
                  initial_stepsize,
                  desired_kl,
                  vf_type,
                  vf_params,
                  animate=False):
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env = gym.make("Pendulum-v0")
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    logz.configure_output_dir(logdir)
    if vf_type == 'linear':
        vf = LinearValueFunction(**vf_params)
    elif vf_type == 'nn':
        vf = NnValueFunction(ob_dim=ob_dim, **vf_params)

    ####
    # YOUR_CODE_HERE

    # batch of observations
    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    # batch of actions
    sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.float32)
    # batch of advantage function estimates
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

    # 2-layer network to learn state from observation
    sy_h1 = lrelu(dense(sy_ob_no, 32, "h1",
                        weight_init=normc_initializer(1.0)))
    sy_h2 = lrelu(dense(sy_h1, 32, "h2", weight_init=normc_initializer(1.0)))
    # Mean control output
    sy_mean_na = dense(sy_h2,
                       ac_dim,
                       "mean",
                       weight_init=normc_initializer(0.1))
    # Log standard deviation of the action distribution
    logstd_a = tf.get_variable("logstdev", [ac_dim])

    # define action distribution
    sy_ac_distr = Normal(mu=tf.squeeze(sy_mean_na),
                         sigma=tf.exp(logstd_a),
                         validate_args=True)
    # sampled actions, used for defining the policy
    # (NOT computing the policy gradient)
    sy_sampled_ac = tf.squeeze(sy_ac_distr.sample(sample_shape=[ac_dim]))

    sy_n = tf.shape(sy_ob_no)[0]
    sy_logprob_n = sy_ac_distr.log_pdf(sy_ac_n)

    # used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES
    sy_oldmean_na = tf.placeholder(shape=[None, ac_dim],
                                   name='oldmean',
                                   dtype=tf.float32)
    sy_oldlogstd_a = tf.placeholder(shape=[ac_dim],
                                    name="oldlogstdev",
                                    dtype=tf.float32)
    sy_ac_olddistr = Normal(mu=tf.squeeze(sy_oldmean_na),
                            sigma=tf.exp(sy_oldlogstd_a),
                            validate_args=True)

    sy_kl = tf.reduce_mean(
        tf.contrib.distributions.kl(sy_ac_distr, sy_ac_olddistr))
    sy_ent = tf.reduce_mean(sy_ac_distr.entropy())

    ####

    sy_surr = -tf.reduce_mean(
        sy_adv_n * sy_logprob_n
    )  # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(
        shape=[], dtype=tf.float32
    )  # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    total_timesteps = 0
    stepsize = initial_stepsize

    for i in range(n_iter):
        print("********** Iteration %i ************" % i)

        ####
        # YOUR_CODE_HERE

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (i % 10 == 0)
                                    and animate)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                acs.append(ac)
                ob, rew, done, _ = env.step([ac])
                rewards.append(rew)
                if done:
                    break

            path = {
                "observation": np.array(obs),
                "terminated": terminated,
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break

        total_timesteps += timesteps_this_batch

        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        vf.fit(ob_no, vtarg_n)

        # Policy update
        _, oldmean_na, oldlogstdev = sess.run(
            [update_op, sy_mean_na, logstd_a],
            feed_dict={
                sy_ob_no: ob_no,
                sy_ac_n: ac_n,
                sy_adv_n: standardized_adv_n,
                sy_stepsize: stepsize
            })
        kl, ent = sess.run(
            [sy_kl, sy_ent],
            feed_dict={
                sy_ob_no: ob_no,
                sy_oldmean_na: oldmean_na,
                sy_oldlogstd_a: oldlogstdev
            })

        ####

        if kl > desired_kl * 2:
            stepsize /= 1.5
            print('stepsize -> %s' % stepsize)
        elif kl < desired_kl / 2:
            stepsize *= 1.5
            print('stepsize -> %s' % stepsize)
        else:
            print('stepsize OK')

        # Log diagnostics
        logz.log_tabular("EpRewMean",
                         np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean",
                         np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter",
                         explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit the value function AFTER using it to compute the advantage function, to avoid introducing bias
        logz.dump_tabular()
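
The discount helper used above to compute return_t is defined elsewhere in the assignment code; a minimal sketch of what it presumably computes (discounted reward-to-go), written here as a hypothetical NumPy implementation:

import numpy as np

def discount(rewards, gamma):
    """out[t] = sum_k gamma**k * rewards[t + k] (discounted reward-to-go)."""
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out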
Example No. 4
    def _create_network(self):
        # Initialize autoencoder network weights and biases
        network_weights = self._initialize_weights(**self.network_architecture)

        # Use recognition network to determine mean and
        # (log) variance of Gaussian distribution in latent
        # space
        self.z_mean, self.c_mean, self.z_log_sigma_sq, self.c_log_sigma_sq = \
            self._recognition_network(network_weights["weights_recog"],
                                      network_weights["biases_recog"],
                                      self.x)

        self.z_mean_concat = tf.concat(1, [self.z_mean, self.c_mean])
        self.z_log_sigma_sq_concat = tf.concat(1, [self.z_log_sigma_sq, self.c_log_sigma_sq])


        # Point estimate related to I(Z,X): conditional entropy H(Z|X) of the diagonal Gaussian posterior

        self.cond_ent_lat_given_x = tf.reduce_mean(tf.reduce_sum(
            tf.mul(tf.constant(0.5),
                   tf.add(self.z_log_sigma_sq_concat, tf.constant(2.838))),
            reduction_indices=1))
        self.cond_ent_z_given_x = tf.reduce_mean(tf.reduce_sum(
            tf.mul(tf.constant(0.5),
                   tf.add(self.z_log_sigma_sq, tf.constant(2.838))),
            reduction_indices=1))
        self.cond_ent_c_given_x = tf.reduce_mean(tf.reduce_sum(
            tf.mul(tf.constant(0.5),
                   tf.add(self.c_log_sigma_sq, tf.constant(2.838))),
            reduction_indices=1))

        # Draw one sample z from Gaussian distribution
        n_z = self.network_architecture["n_z"]
        n_c = self.network_architecture["n_c"]

        eps = tf.random_normal((self.batch_size, n_z + n_c), 0, 1,
                               dtype=tf.float32)

        # z = mu + sigma*epsilon
        self.z = tf.add(self.z_mean_concat,
                        tf.mul(tf.sqrt(tf.exp(self.z_log_sigma_sq_concat)), eps),
                        name='z')

        # Use generator to determine mean of
        # Bernoulli distribution of reconstructed input
        self.x_reconstr_mean = \
            self._generator_network(network_weights["weights_gener"],
                                    network_weights["biases_gener"],
                                    z=self.z)

        ####
        ####
        ####
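        # Draw the concatenated latent sample (z_theta, c_theta) directly from the standard normal prior N(0, I)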
        eps = tf.random_normal((self.batch_size, n_z + n_c), 0, 1,
                               dtype=tf.float32)

        self.z_theta_concat = tf.add(0.0, tf.mul(1.0, eps), name='z_theta')
        self.z_theta = self.z_theta_concat[:, :n_z]
        self.c_theta = self.z_theta_concat[:, n_z:]

        self.x_prime = self._generator_network(network_weights["weights_gener"],
                                               network_weights["biases_gener"],
                                               z=self.z_theta_concat)

        self.z_prime_mean, self.c_prime_mean, self.z_prime_log_sigma_sq, self.c_prime_log_sigma_sq = \
            self._recognition_network(network_weights["weights_recog"],
                                      network_weights["biases_recog"],
                                      self.x_prime)

        self.z_prime_mean_concat = tf.concat(1, [self.z_prime_mean, self.c_prime_mean])
        self.z_prime_log_sigma_sq_concat = tf.concat(1, [self.z_prime_log_sigma_sq, self.c_prime_log_sigma_sq])

        # XEntropy for the code C
        dist = Normal(mu=self.c_prime_mean,
                      sigma=tf.sqrt(tf.exp(self.c_prime_log_sigma_sq)))
        logli = tf.reduce_sum(dist.log_pdf(self.c_theta, name='xc_entropy'),
                              reduction_indices=1)
        self.cross_entropy = tf.reduce_mean(- logli)
        self.entropy = tf.constant(1.4185 * n_c)

        # XEntropy for the entire latent code
        dist_all = Normal(mu=self.z_prime_mean_concat,
                          sigma=tf.sqrt(tf.exp(self.z_prime_log_sigma_sq_concat)))
        logli_all = tf.reduce_sum(dist_all.log_pdf(self.z_theta_concat, name='x_entropy_concat'),
                                  reduction_indices=1)
        self.cross_entropy_concat = tf.reduce_mean(- logli_all)
        self.entropy_concat = tf.constant(1.4185 * (n_z + n_c))

        # Entropy for the code Z
        dist_z = Normal(mu=self.z_prime_mean,
                        sigma=tf.sqrt(tf.exp(self.z_prime_log_sigma_sq)))
        logli_z = tf.reduce_sum(dist_z.log_pdf(self.z_theta, name='xz_entropy'),
                                reduction_indices=1)
        self.cross_entropy_z = tf.reduce_mean(- logli_z)
        self.entropy_z = tf.constant(1.4185 * n_z)
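
The numeric constants in the last two examples are closed-form Gaussian entropy terms: 2.838 ≈ ln(2*pi*e), so 0.5 * (log_sigma_sq + 2.838) is the per-dimension differential entropy of the diagonal Gaussian posterior, and 1.4185 is (up to rounding) 0.5*ln(2*pi*e) ≈ 1.4189, the per-dimension entropy of the standard-normal prior. Presumably the surrounding training code combines these as entropy - cross_entropy, the usual variational lower bound on the mutual information between the sampled code and the generated sample, although that step is not shown here. A quick check of the constants:

import numpy as np

ln_2pi_e = np.log(2.0 * np.pi * np.e)
print(ln_2pi_e)         # ~2.8379 -> the hard-coded 2.838
print(0.5 * ln_2pi_e)   # ~1.4189 -> the hard-coded 1.4185 (rounded)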