def __init__(self, policy, rate, train=True):
    self.rate = rate
    self.policy = policy
    with tf.variable_scope('policy_estimator'):
        self.policy.setup()
        self.X = policy.X
        self.a = policy.a
        self.target = tf.placeholder(dtype='float', shape=[None, 1], name='target')
        self.a_pred = policy.a_pred
        self.var = policy.var
        # Gaussian policy: score the taken actions under N(a_pred, var)
        dist = Normal(self.a_pred, self.var)
        self.log_probs = dist.log_pdf(self.a)
        # Score-function objective: target-weighted log-probabilities
        self.losses = self.log_probs * self.target
        self.loss = tf.reduce_sum(self.losses, name='loss')
        if train:
            self.opt = tf.train.RMSPropOptimizer(rate, 0.99, 0.0, 1e-6)
            self.grads_and_vars = self.opt.compute_gradients(self.loss)
            # Drop variables whose gradient is None (not reachable from the loss)
            self.grads_and_vars = [(g, v) for g, v in self.grads_and_vars
                                   if g is not None]
            self.update = self.opt.apply_gradients(self.grads_and_vars)
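# A self-contained NumPy sketch of the score-function (REINFORCE) update that
# the constructor above builds symbolically: for a Gaussian policy,
# d/d_mu log N(a; mu, var) = (a - mu) / var, and each step weights that score
# by the target. Note that the constructor *minimizes* log_prob * target, so
# its `target` input is presumably signed accordingly (compare the negated
# surrogate `sy_surr` in `main_pendulum` below). Everything here is a toy
# illustration, not part of the original class.
import numpy as np

mu, var, lr = 0.0, 1.0, 0.05
for _ in range(200):
    a = np.random.normal(mu, np.sqrt(var), size=64)  # batch of sampled actions
    target = -(a - 2.0) ** 2                         # toy reward, peaked at a = 2
    score = (a - mu) / var                           # d log N(a; mu, var) / d mu
    mu += lr * np.mean(target * score)               # ascend target-weighted log-prob
print(mu)  # should drift toward 2.0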
def _create_network(self):
    # Initialize autoencoder network weights and biases
    network_weights = self._initialize_weights(**self.network_architecture)

    # Use recognition network to determine mean and (log) variance
    # of the Gaussian distribution in latent space
    self.z_mean, self.z_log_sigma_sq = \
        self._recognition_network(network_weights["weights_recog"],
                                  network_weights["biases_recog"],
                                  self.x)

    # Draw one sample z from the Gaussian distribution
    n_z = self.network_architecture["n_z"]
    eps = tf.random_normal((self.batch_size, n_z), 0, 1, dtype=tf.float32)
    # z = mu + sigma * epsilon (reparameterization trick)
    self.z = tf.add(self.z_mean,
                    tf.mul(tf.sqrt(tf.exp(self.z_log_sigma_sq)), eps),
                    name='z')

    # Use generator to determine mean of the
    # Bernoulli distribution of the reconstructed input
    self.x_reconstr_mean = \
        self._generator_network(network_weights["weights_gener"],
                                network_weights["biases_gener"],
                                z=self.z)

    ####
    # Cross-entropy term: sample a latent code from the standard-normal prior,
    # decode it, re-encode the decoded sample, and score the original code
    # under the re-encoded distribution.
    eps = tf.random_normal((self.batch_size, n_z), 0, 1, dtype=tf.float32)
    self.z_theta = tf.add(0.0, tf.mul(1.0, eps), name='z_theta')
    self.x_prime = self._generator_network(network_weights["weights_gener"],
                                           network_weights["biases_gener"],
                                           z=self.z_theta)
    self.z_prime_mean, self.z_prime_log_sigma_sq = self._recognition_network(
        network_weights["weights_recog"],
        network_weights["biases_recog"],
        self.x_prime)
    dist = Normal(mu=self.z_prime_mean,
                  sigma=tf.sqrt(tf.exp(self.z_prime_log_sigma_sq)))
    logli = tf.reduce_sum(dist.log_pdf(self.z_theta, name='x_entropy'),
                          reduction_indices=1)
    self.cross_entropy = tf.reduce_mean(- logli)
    #self.cross_entropy = tf.reduce_mean(- dist.log_pdf(self.z_theta, name='x_entropy'))
    # Entropy of the standard-normal prior: 0.5 * log(2*pi*e) ≈ 1.419 per
    # dimension; 28.37 corresponds to a 20-dimensional latent
    self.entropy = tf.constant(28.37)
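# For reference, the sampling step above is the standard reparameterization
# trick. A self-contained NumPy sketch of the same computation (illustrative
# only; the function name is ours, not part of the class):
import numpy as np

def reparameterize(mean, log_sigma_sq, rng=np.random):
    """Sample z = mu + sigma * eps with eps ~ N(0, I), sigma = sqrt(exp(log_sigma_sq))."""
    eps = rng.normal(size=np.shape(mean))
    return mean + np.sqrt(np.exp(log_sigma_sq)) * eps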
def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch,
                  initial_stepsize, desired_kl, vf_type, vf_params,
                  animate=False):
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env = gym.make("Pendulum-v0")
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    logz.configure_output_dir(logdir)
    if vf_type == 'linear':
        vf = LinearValueFunction(**vf_params)
    elif vf_type == 'nn':
        vf = NnValueFunction(ob_dim=ob_dim, **vf_params)

    ####
    # YOUR_CODE_HERE

    # batch of observations
    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    # batch of actions
    sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.float32)
    # batch of advantage function estimates
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

    # 2-layer network to learn state from observation
    sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0)))
    sy_h2 = lrelu(dense(sy_h1, 32, "h2", weight_init=normc_initializer(1.0)))
    # Mean control output
    sy_mean_na = dense(sy_h2, ac_dim, "mean", weight_init=normc_initializer(0.1))
    # Variance
    logstd_a = tf.get_variable("logstdev", [ac_dim])

    # define action distribution
    sy_ac_distr = Normal(mu=tf.squeeze(sy_mean_na),
                         sigma=tf.exp(logstd_a),
                         validate_args=True)
    # sampled actions, used for defining the policy
    # (NOT for computing the policy gradient)
    sy_sampled_ac = tf.squeeze(sy_ac_distr.sample(sample_shape=[ac_dim]))
    sy_n = tf.shape(sy_ob_no)[0]  # batch size (unused below)
    sy_logprob_n = sy_ac_distr.log_pdf(sy_ac_n)

    # used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES
    sy_oldmean_na = tf.placeholder(shape=[None, ac_dim], name='oldmean',
                                   dtype=tf.float32)
    sy_oldlogstd_a = tf.placeholder(shape=[ac_dim], name="oldlogstdev",
                                    dtype=tf.float32)
    sy_ac_olddistr = Normal(mu=tf.squeeze(sy_oldmean_na),
                            sigma=tf.exp(sy_oldlogstd_a),
                            validate_args=True)
    sy_kl = tf.reduce_mean(tf.contrib.distributions.kl(sy_ac_distr, sy_ac_olddistr))
    sy_ent = tf.reduce_mean(sy_ac_distr.entropy())
    ####

    # Loss function that we'll differentiate to get the policy gradient
    # ("surr" is for "surrogate loss")
    sy_surr = -tf.reduce_mean(sy_adv_n * sy_logprob_n)
    # Symbolic stepsize, in case you want to change it during optimization
    # (we're not doing that currently)
    sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    total_timesteps = 0
    stepsize = initial_stepsize

    for i in range(n_iter):
        print("********** Iteration %i ************" % i)

        ####
        # YOUR_CODE_HERE

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (i % 10 == 0) and animate)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                acs.append(ac)
                ob, rew, done, _ = env.step([ac])
                rewards.append(rew)
                if done:
                    break
            path = {"observation": np.array(obs),
                    "terminated": terminated,
                    "reward": np.array(rewards),
                    "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        vf.fit(ob_no, vtarg_n)

        # Policy update
        _, oldmean_na, oldlogstdev = sess.run(
            [update_op, sy_mean_na, logstd_a],
            feed_dict={sy_ob_no: ob_no,
                       sy_ac_n: ac_n,
                       sy_adv_n: standardized_adv_n,
                       sy_stepsize: stepsize})
        kl, ent = sess.run(
            [sy_kl, sy_ent],
            feed_dict={sy_ob_no: ob_no,
                       sy_oldmean_na: oldmean_na,
                       sy_oldlogstd_a: oldlogstdev})
        ####

        # Adapt the stepsize to keep the KL divergence near the target
        if kl > desired_kl * 2:
            stepsize /= 1.5
            print('stepsize -> %s' % stepsize)
        elif kl < desired_kl / 2:
            stepsize *= 1.5
            print('stepsize -> %s' % stepsize)
        else:
            print('stepsize OK')

        # Log diagnostics
        logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit the value function AFTER using it to compute the
        # advantage function, to avoid introducing bias
        logz.dump_tabular()
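# The `discount` helper called above isn't defined in this excerpt. Assuming
# it computes the usual discounted return-to-go (as its use with `gamma`
# suggests), a minimal NumPy implementation would look like this:
import numpy as np

def discount(rewards, gamma):
    """out[t] = rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ..."""
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out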
def _create_network(self):
    # Initialize autoencoder network weights and biases
    network_weights = self._initialize_weights(**self.network_architecture)

    # Use recognition network to determine mean and (log) variance of the
    # Gaussian distribution in latent space, split into noise z and code c
    self.z_mean, self.c_mean, self.z_log_sigma_sq, self.c_log_sigma_sq = \
        self._recognition_network(network_weights["weights_recog"],
                                  network_weights["biases_recog"],
                                  self.x)
    self.z_mean_concat = tf.concat(1, [self.z_mean, self.c_mean])
    self.z_log_sigma_sq_concat = tf.concat(1, [self.z_log_sigma_sq,
                                               self.c_log_sigma_sq])

    # Point estimates of H(latent|X), used to estimate I(latent; X); per
    # dimension, the entropy of a diagonal Gaussian is
    # 0.5 * (log_sigma_sq + log(2*pi*e)), with log(2*pi*e) ≈ 2.838
    self.cond_ent_lat_given_x = tf.reduce_mean(tf.reduce_sum(
        tf.mul(tf.constant(0.5),
               tf.add(self.z_log_sigma_sq_concat, tf.constant(2.838))),
        reduction_indices=1))
    self.cond_ent_z_given_x = tf.reduce_mean(tf.reduce_sum(
        tf.mul(tf.constant(0.5),
               tf.add(self.z_log_sigma_sq, tf.constant(2.838))),
        reduction_indices=1))
    self.cond_ent_c_given_x = tf.reduce_mean(tf.reduce_sum(
        tf.mul(tf.constant(0.5),
               tf.add(self.c_log_sigma_sq, tf.constant(2.838))),
        reduction_indices=1))

    # Draw one sample from the Gaussian distribution
    n_z = self.network_architecture["n_z"]
    n_c = self.network_architecture["n_c"]
    eps = tf.random_normal((self.batch_size, n_z + n_c), 0, 1, dtype=tf.float32)
    # z = mu + sigma * epsilon (reparameterization trick)
    self.z = tf.add(self.z_mean_concat,
                    tf.mul(tf.sqrt(tf.exp(self.z_log_sigma_sq_concat)), eps),
                    name='z')

    # Use generator to determine mean of the
    # Bernoulli distribution of the reconstructed input
    self.x_reconstr_mean = \
        self._generator_network(network_weights["weights_gener"],
                                network_weights["biases_gener"],
                                z=self.z)

    ####
    # Sample a latent code from the standard-normal prior, decode it, and
    # re-encode the decoded sample to score the original code
    eps = tf.random_normal((self.batch_size, n_z + n_c), 0, 1, dtype=tf.float32)
    self.z_theta_concat = tf.add(0.0, tf.mul(1.0, eps), name='z_theta')
    self.z_theta = self.z_theta_concat[:, :n_z]
    self.c_theta = self.z_theta_concat[:, n_z:]
    self.x_prime = self._generator_network(network_weights["weights_gener"],
                                           network_weights["biases_gener"],
                                           z=self.z_theta_concat)
    self.z_prime_mean, self.c_prime_mean, \
        self.z_prime_log_sigma_sq, self.c_prime_log_sigma_sq = \
        self._recognition_network(network_weights["weights_recog"],
                                  network_weights["biases_recog"],
                                  self.x_prime)
    self.z_prime_mean_concat = tf.concat(1, [self.z_prime_mean,
                                             self.c_prime_mean])
    self.z_prime_log_sigma_sq_concat = tf.concat(1, [self.z_prime_log_sigma_sq,
                                                     self.c_prime_log_sigma_sq])

    # Cross-entropy for the code c
    dist = Normal(mu=self.c_prime_mean,
                  sigma=tf.sqrt(tf.exp(self.c_prime_log_sigma_sq)))
    logli = tf.reduce_sum(dist.log_pdf(self.c_theta, name='xc_entropy'),
                          reduction_indices=1)
    self.cross_entropy = tf.reduce_mean(- logli)
    # Entropy of the standard-normal prior: 0.5 * log(2*pi*e) ≈ 1.4185 per dim
    self.entropy = tf.constant(1.4185 * n_c)

    # Cross-entropy for the entire latent code
    dist_all = Normal(mu=self.z_prime_mean_concat,
                      sigma=tf.sqrt(tf.exp(self.z_prime_log_sigma_sq_concat)))
    logli_all = tf.reduce_sum(dist_all.log_pdf(self.z_theta_concat,
                                               name='x_entropy_concat'),
                              reduction_indices=1)
    self.cross_entropy_concat = tf.reduce_mean(- logli_all)
    self.entropy_concat = tf.constant(1.4185 * (n_z + n_c))

    # Cross-entropy for the noise z
    dist_z = Normal(mu=self.z_prime_mean,
                    sigma=tf.sqrt(tf.exp(self.z_prime_log_sigma_sq)))
    logli_z = tf.reduce_sum(dist_z.log_pdf(self.z_theta, name='xz_entropy'),
                            reduction_indices=1)
    self.cross_entropy_z = tf.reduce_mean(- logli_z)
    self.entropy_z = tf.constant(1.4185 * n_z)
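# The hard-coded constants above (1.4185 per dimension here, and the 2.838 /
# 28.37 in the earlier functions) all derive from the differential entropy of
# a unit Gaussian, 0.5 * log(2*pi*e) per dimension. A quick numeric check:
import numpy as np

per_dim = 0.5 * np.log(2 * np.pi * np.e)
print(per_dim)       # 1.4189... (rounded to 1.4185 above)
print(2 * per_dim)   # 2.8379... (the 2.838 in the H(.|x) terms)
print(20 * per_dim)  # 28.378... (matching the earlier 28.37, apparently n_z = 20 there)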