def log_progress(self):
    episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()

    if len(episode_rewards) > 0:
        self.mean_episode_reward = np.mean(episode_rewards[-100:])

    if len(episode_rewards) > 100:
        self.best_mean_episode_reward = max(self.best_mean_episode_reward, self.mean_episode_reward)

    if self.t % self.log_every_n_steps == 0 and self.model_initialized:
        print("Timestep %d" % (self.t,))
        print("mean reward (100 episodes) %f" % self.mean_episode_reward)
        print("best mean reward %f" % self.best_mean_episode_reward)
        print("episodes %d" % len(episode_rewards))
        print("exploration %f" % self.exploration.value(self.t))
        print("learning_rate %f" % self.optimizer_spec.lr_schedule.value(self.t))
        if self.start_time is not None:
            print("running time %f" % ((time.time() - self.start_time) / 60.))
        self.start_time = time.time()

        sys.stdout.flush()

        with open(self.rew_file, 'wb') as f:
            pickle.dump(episode_rewards, f, pickle.HIGHEST_PROTOCOL)

        # Log diagnostics
        logz.log_tabular("Iteration", self.t)
        logz.log_tabular("mean_reward_(100_episodes)", self.mean_episode_reward)
        logz.log_tabular("best_mean_reward", self.best_mean_episode_reward)
        logz.log_tabular("episodes", len(episode_rewards))
        logz.log_tabular("exploration", self.exploration.value(self.t))
        logz.log_tabular("learning_rate", self.optimizer_spec.lr_schedule.value(self.t))
        logz.dump_tabular()
        logz.pickle_tf_vars(self.session)
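# `get_wrapper_by_name` above comes from the course's dqn_utils helpers and is not
# shown in this snippet. A minimal sketch of one plausible implementation, assuming
# a standard gym wrapper chain (the body is an assumption, not the verbatim helper):
import gym

def get_wrapper_by_name(env, classname):
    # Walk the wrapper chain until a wrapper whose class name contains
    # `classname` (e.g. "Monitor") is found.
    currentenv = env
    while True:
        if classname in currentenv.__class__.__name__:
            return currentenv
        elif isinstance(currentenv, gym.Wrapper):
            currentenv = currentenv.env
        else:
            raise ValueError("Couldn't find wrapper named %s" % classname)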
def train_PG(exp_name, env_name, n_iter,
             gamma, min_timesteps_per_batch, max_path_length, learning_rate,
             reward_to_go, animate, logdir, normalize_advantages, nn_baseline,
             seed, n_layers, size):
    start = time.time()

    ## Set up Logger
    setup_logger(logdir, locals())

    env = gym.make(env_name)
    tf.set_random_seed(seed)
    env.seed(seed)

    max_path_length = max_path_length or env.spec.max_episode_steps
    discrete = isinstance(env.action_space, gym.spaces.Discrete)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n if discrete else env.action_space.shape[0]

    ## Initialize Agent
    computation_graph_args = {'n_layers': n_layers, 'obs_dim': obs_dim, 'act_dim': act_dim,
                              'discrete': discrete, 'size': size, 'learning_rate': learning_rate}
    sample_trajectory_args = {'animate': animate, 'max_path_length': max_path_length,
                              'min_timesteps_per_batch': min_timesteps_per_batch}
    estimate_return_args = {'gamma': gamma, 'reward_to_go': reward_to_go,
                            'nn_baseline': nn_baseline, 'normalize_advantages': normalize_advantages}
    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args)
    agent.build_computation_graph()
    agent.init_tf_sess()

    ## Training Loop
    total_time_steps = 0
    for itr in range(n_iter):
        print("********* Iteration %i *********" % itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_time_steps += timesteps_this_batch

        obs_no = np.concatenate([path['observation'] for path in paths])
        act_na = np.concatenate([path['action'] for path in paths])
        ret_n = [path['reward'] for path in paths]

        q_n, adv_n = agent.estimate_return(obs_no, ret_n)
        agent.update_parameters(obs_no, act_na, q_n, adv_n)

        # Log diagnostics
        returns = [path['reward'].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_time_steps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
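# `setup_logger` is not defined in this snippet. A minimal sketch consistent with the
# inline logging code used by the other variants in this collection (assuming it wraps
# logz the same way they do):
import inspect

def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)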
def run_model(session, predict, loss, train_step, saver, images, labels, X, y,
              epochs=1, batch_size=64, print_every=100, is_test=False):
    if not is_test:
        # Configure output directory for logging
        logz.configure_output_dir('logs')
        # Log experimental parameters
        args = inspect.getargspec(run_model)[0]  # names of this function's parameters
        locals_ = locals()  # dictionary of the current scope's local variables
        params = {k: locals_[k] if k in locals_ else None for k in args}
        logz.save_params(params)

    # Have TensorFlow compute accuracy
    correct_prediction = tf.equal(tf.argmax(predict, axis=1), tf.argmax(y, axis=1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Counter
    iter_cnt = 0
    iters_each_epoch = len(images) // batch_size
    for e in range(epochs):
        # Keep track of losses and accuracy
        correct = 0
        losses = []
        # Make sure we iterate over the dataset once
        images, labels = shuffle_dataset(images, labels)
        for i in range(iters_each_epoch):
            # Slice out the i-th minibatch
            batch_X = images[i * batch_size:(i + 1) * batch_size]
            batch_y = labels[i * batch_size:(i + 1) * batch_size]
            feed_dict = {X: batch_X, y: batch_y}
            # Have TensorFlow compute loss and correct predictions,
            # and (if given) perform a training step
            l, corr, _ = session.run([loss, correct_prediction, train_step], feed_dict=feed_dict)
            # Aggregate performance stats
            losses.append(l * batch_size)
            correct += np.sum(corr)
            # Print every now and then
            if (iter_cnt % print_every) == 0 and not is_test:
                logz.log_tabular("Iteration", iter_cnt)
                logz.log_tabular("minibatch_loss", l)
                logz.log_tabular("minibatch_accuracy", np.sum(corr) / batch_size)
                logz.dump_tabular()
                logz.pickle_tf_vars()
            iter_cnt += 1
    if is_test:
        total_correct = correct / len(images)
        total_loss = np.sum(losses) / len(images)
        print('acc:', total_correct)
        print('los:', total_loss)
    else:
        saver.save(session, 'checkpoints/mnist_plus', iter_cnt)
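# `shuffle_dataset` is assumed to permute images and labels in unison; the name and
# signature come from the call above, the body is an assumption. A minimal numpy sketch:
import numpy as np

def shuffle_dataset(images, labels):
    # Permute examples and labels with the same random order
    perm = np.random.permutation(len(images))
    return images[perm], labels[perm]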
def log_progress(self):
    episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()

    if len(episode_rewards) > 0:
        self.mean_episode_reward = np.mean(episode_rewards[-100:])

    if len(episode_rewards) > 100:
        self.best_mean_episode_reward = max(self.best_mean_episode_reward, self.mean_episode_reward)

    if self.t % self.log_every_n_steps == 0 and self.model_initialized:
        logz.log_tabular("Time", (time.time() - self.start_time) / 60.)
        logz.log_tabular("Timestep", self.t)
        logz.log_tabular("Episodes", len(episode_rewards))
        logz.log_tabular("AverageReturn", self.mean_episode_reward)
        logz.log_tabular("MaxReturn", self.best_mean_episode_reward)
        logz.log_tabular("Exploration", self.exploration.value(self.t))
        logz.dump_tabular()
        logz.pickle_tf_vars()
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100,
             gamma=1.0,
             min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=True,
             animate=True,
             logdir=None,
             normalize_advantages=True,
             nn_baseline=False,
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):
    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # TODO: create Agent
    # TODO: initialize Agent

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = actor.run(ob)
                # TODO: type-check the action here
                print(ac)
                print(ac.size())
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break

            # One episode finishes; perform update here
            finish_episode(actor, actor_optimizer, critic=None, critic_optimizer=None)

            path = {"observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100,
             gamma=1.0,
             min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=True,
             animate=True,
             logdir=None,
             normalize_advantages=True,
             nn_baseline=False,
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):
    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    #========================================================================================#

    if discrete:
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, "scope", n_layers, size)
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1, seed=seed), [-1])
        # Negative log-probability of the taken actions; shape [None]
        sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na,
                                                                      logits=sy_logits_na)
    else:
        sy_mean = build_mlp(sy_ob_no, ac_dim, "scope", n_layers, size)  # [None, ac_dim]
        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.get_variable("logstd", shape=[ac_dim, 1], trainable=True,
                                    initializer=tf.contrib.layers.xavier_initializer())
        z = tf.random_normal(tf.shape(sy_mean), mean=0.0, stddev=1.0, seed=seed)  # [None, ac_dim]
        sigma = tf.reshape(tf.exp(sy_logstd), [1, ac_dim])  # [1, ac_dim], standard deviation
        sy_sampled_ac = (sigma * z) + sy_mean  # [None, ac_dim]

        # Hint: Use the log probability under a multivariate gaussian.
        # A by-hand implementation (it assumed sigma was a covariance matrix, though
        # sigma is now a standard deviation instead):
        #   diff = sy_ac_na - sy_mean
        #   first_term = -0.5 * tf.diag_part(tf.matmul(diff, tf.matmul(tf.matrix_inverse(sigma), tf.transpose(diff))))
        #   second_term = -0.5 * ac_dim * tf.log(tf.norm(sigma))
        #   third_term = -0.5 * ac_dim * tf.log(2 * math.pi)
        #   sy_logprob_n = first_term + second_term + third_term
        # Negative log-probability under a diagonal Gaussian; shape [None]
        sy_logprob_n = -tf.contrib.distributions.MultivariateNormalDiag(
            loc=sy_mean, scale_diag=sigma).log_prob(sy_ac_na)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    # Loss function that we'll differentiate to get the policy gradient
    # (sy_logprob_n is a negative log-probability, so minimizing this maximizes
    # expected advantage-weighted log-likelihood).
    loss = tf.reduce_mean(sy_logprob_n * sy_adv_n)
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(sy_ob_no, 1, "nn_baseline",
                                                   n_layers=n_layers, size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        sy_value_n = tf.placeholder(shape=[None], name="V", dtype=tf.float32)
        baseline_loss = tf.losses.mean_squared_error(sy_value_n, baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        #====================================================================================#

        rewards_n = [path["reward"] for path in paths]
        if not reward_to_go:
            weighted_rewards = np.array([[(gamma ** i) * r for i, r in enumerate(row)]
                                         for row in rewards_n])
            q_sums = [sum(row) for row in weighted_rewards]
            # Each timestep gets its trajectory's full discounted return; shape [None]
            q_n = np.hstack(np.array([[q_sums[i]] * len(weighted_rewards[i])
                                      for i in range(len(weighted_rewards))]))
        else:
            # Discounted reward-to-go from each timestep; shape [None]
            q_n = np.hstack(np.array([[sum(map_gamma(row[i:], gamma)) for i in range(len(row))]
                                      for row in rewards_n]))

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            # Standardize the baseline output, then rescale to the Q-value statistics
            b_n = (b_n - np.mean(b_n)) / (np.std(b_n) + 1e-8)
            b_n = b_n * np.std(q_n) + np.mean(q_n)
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#

        if nn_baseline:
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)
            q_n = (q_n - np.mean(q_n)) / (np.std(q_n) + 1e-8)
            sess.run(baseline_update_op, feed_dict={sy_ob_no: ob_no, sy_value_n: q_n})

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.
        _, l_next = sess.run([update_op, loss],
                             feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.log_tabular("Loss After Update", l_next)
        logz.dump_tabular()
        logz.pickle_tf_vars()
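# `map_gamma`, used in the reward-to-go branch above, is not defined in this snippet.
# From the call site it evidently maps a reward slice to its discounted terms, so that
# sum(map_gamma(row[i:], gamma)) is the discounted reward-to-go from timestep i.
# A minimal sketch (name and signature from the call above, body assumed):
def map_gamma(rewards, gamma):
    # Discount each reward by gamma**t relative to the start of the slice
    return [(gamma ** t) * r for t, r in enumerate(rewards)]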
def train_PG(
        exp_name='',
        env_name='ProstheticsEnv',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        test=False):
    start = time.time()

    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    params['env_name'] = 'Prosthetic_3D'
    print('params: ', params)
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Here env_name is already an environment instance, not a gym id
    env = env_name

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.timestep_limit

    # ========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    # ========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    print('observation dim: ', ob_dim)
    print('action dim: ', ac_dim)
    print('action space is discrete: ', discrete)

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    # ========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(dtype=tf.float32, shape=[None], name="adv")

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    # ========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(env.action_space.high, sy_ob_no, ac_dim, scope="build_nn",
                                 n_layers=n_layers, size=size, activation=tf.nn.relu)
        # Hint: Use the tf.multinomial op
        sy_sampled_ac = tf.one_hot(tf.squeeze(tf.multinomial(sy_logits_na, 1)), ac_dim)  # batch_size x ac_dim
        # Negative log probability for each action; shape [batch_size]
        sy_logprob_n = tf.nn.softmax_cross_entropy_with_logits(labels=sy_ac_na,
                                                               logits=sy_logits_na)
        # Learned from https://github.com/InnerPeace-Wu/
        # Another way to do it:
        #   N = tf.shape(sy_ob_no)[0]
        #   sy_prob_na = tf.nn.softmax(sy_logits_na)
        #   sy_logprob_n = tf.log(tf.gather_nd(sy_prob_na, tf.stack((tf.range(N), sy_ac_na), axis=1)))
    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(env.action_space.high, sy_ob_no, ac_dim, scope="build_nn",
                            n_layers=n_layers, size=size, activation=tf.nn.relu)
        sy_logstd = tf.Variable(tf.zeros(ac_dim), name='logstd', dtype=tf.float32)
        sy_std = tf.exp(sy_logstd)
        sy_sampled_ac = sy_mean + tf.multiply(sy_std, tf.random_normal(tf.shape(sy_mean)))
        # Hint: Use the log probability under a multivariate gaussian.
        sy_z = (sy_ac_na - sy_mean) / sy_std
        # Negative log probability under the Gaussian, up to additive terms; shape [batch_size]
        sy_logprob_n = 0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)
        # Equivalent alternative:
        #   sy_logprob_n = 0.5 * tf.reduce_sum(tf.squared_difference(tf.div(sy_mean, sy_std),
        #                                                            tf.div(sy_ac_na, sy_std)))

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    # ========================================================================================#

    # Loss function that we'll differentiate to get the policy gradient.
    # An earlier variant computed it directly from the logits:
    #   loss = tf.reduce_sum(tf.multiply(tf.nn.softmax_cross_entropy_with_logits_v2(
    #       labels=sy_ac_na, logits=sy_logits_na), sy_adv_n))
    loss = tf.reduce_sum(tf.multiply(sy_logprob_n, sy_adv_n))
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    # ========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline - Defining Second Graph
    # ========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(1, sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        sy_rew_n = tf.placeholder(shape=[None], name="rew", dtype=tf.float32)
        loss2 = tf.losses.mean_squared_error(labels=sy_rew_n, predictions=baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss2)

    # ========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    # ========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`  # pylint: disable=E1101

    network_params = tf.trainable_variables()
    saver = tf.train.Saver(network_params, max_to_keep=1)

    checkpoint_actor_dir = os.path.join(os.curdir, 'PG_MODEL_CONT_TANH')
    if not os.path.exists(checkpoint_actor_dir):
        os.makedirs(checkpoint_actor_dir)
    model_prefix = os.path.join(checkpoint_actor_dir, "model.ckpt")

    ckpt_1 = tf.train.get_checkpoint_state(checkpoint_actor_dir)
    if ckpt_1 and tf.train.checkpoint_exists(ckpt_1.model_checkpoint_path):
        print("Reading actor parameters from %s" % ckpt_1.model_checkpoint_path)
        saver.restore(sess, ckpt_1.model_checkpoint_path)

    # Initialize any variables the checkpoint did not cover
    uninitialized_vars = []
    for var in tf.global_variables():
        try:
            sess.run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)
    if len(uninitialized_vars) > 0:
        init_new_vars_op = tf.variables_initializer(uninitialized_vars)
        sess.run(init_new_vars_op)

    # ========================================================================================#
    # Training Loop
    # ========================================================================================#

    total_timesteps = 0
    t = 0

    def testing():
        print('testing the model..')
        ob = env.reset()
        steps = 0
        done = False
        total_r = 0
        one_hot_ac = env.action_space.sample()
        while not done:
            k = np.reshape(np.array(ob), newshape=(-1, len(ob)))
            # Sample an action from the current policy
            if steps % 1 == 0:
                one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k})
            ac = np.reshape(one_hot_ac, newshape=(one_hot_ac.shape[1]))
            # Step the environment
            # ac = np.clip(ac, -1.0, 1.0)
            ob, rew, done, _ = env.step(ac)
            total_r += rew
            env.render()
            steps += 1
            if steps > max_path_length:
                break
        print('steps, rew', steps, total_r)
        return steps, total_r

    test = False  # overrides the function's `test` argument
    if test:
        steps, rew = testing()
        return

    exp = False
    if exp:
        print('generating exp data..')
        import pickle as pkl
        paths = []
        timesteps_this_batch = 0
        while True:
            ob = env.reset()
            obs, acs = [], []
            total_r = 0
            while True:
                obs.append(ob)
                k = np.reshape(np.array(ob), newshape=(-1, len(ob)))
                one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k})
                ac = np.reshape(one_hot_ac, newshape=(one_hot_ac.shape[1]))
                ac = np.clip(ac, 0.0, 1.0)
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                total_r += rew
                if done:
                    done = False
                    break
            # Drop the last 15 steps of each rollout
            path = {
                "observation": np.array(obs[:-15]),
                "action": np.array(acs[:-15])
            }
            if total_r > 50:
                timesteps_this_batch += len(path['action'])
                timesteps_this_batch -= 15
                paths.append(path)
                print(timesteps_this_batch, total_r)
            if timesteps_this_batch > 1000:
                break
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        pkl.dump(ob_no, open('./simulation_0_1/obs_pg.p', 'wb'))
        pkl.dump(ac_na, open('./simulation_0_1/acts_pg.p', 'wb'))
        return

    _, best_rew = testing()

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 30 == 0) and animate)
            steps = 0
            total_r = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                k = np.reshape(np.array(ob), newshape=(-1, len(ob)))
                # Sample an action from the current policy
                one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k})
                if discrete:
                    ac = int(np.argmax(one_hot_ac))
                else:
                    ac = one_hot_ac
                acs.append(one_hot_ac)
                max_action = env.action_space.high
                ac = np.reshape(ac, newshape=(ac.shape[1]))
                # Transition dynamics P(s_{t+1} | s_t, a_t), r(s_{t+1} | s_t, a_t)
                ob, rew, done, _ = env.step(ac)
                total_r += rew
                rew = rew * 4  # reward scaling
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            # Only keep rollouts with positive return
            if total_r > 0:
                paths.append(path)
                timesteps_this_batch += pathlength(path)
                print(total_r)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        ac_na = ac_na.reshape([-1, ac_dim])
        # import pickle as pkl
        # pkl.dump(ob_no, open('./simulation_data/obs_' + str(itr) + '.p', 'wb'))
        # pkl.dump(ac_na, open('./simulation_data/act_' + str(itr) + '.p', 'wb'))
        print("batch action shape:", ac_na.shape)

        # ====================================================================================#
        #                           ----------..----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        # ====================================================================================#

        # DYNAMIC PROGRAMMING: compute discounted returns backwards over each trajectory
        if reward_to_go:
            q_n = list()
            for path in paths:
                pLen = pathlength(path)
                q_p = np.zeros(pLen)
                q_p[pLen - 1] = path['reward'][pLen - 1]
                for t in reversed(range(pLen - 1)):
                    q_p[t] = path['reward'][t] + gamma * q_p[t + 1]
                q_p = np.array(q_p)
                q_n.append(q_p)
        else:
            q_n = list()
            for path in paths:
                pLen = pathlength(path)
                q_p = 0
                for t in range(pLen):
                    q_p = q_p + (gamma ** t) * (path['reward'][t])
                q_n.append(q_p * np.ones(pLen))
        q_n = np.concatenate(q_n)
        # print(q_n.shape)

        # ====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        # ====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = normalize(b_n, np.mean(q_n), np.std(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # ====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        # ====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_n = normalize(adv_n)

        # ====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        # ====================================================================================#

        if nn_baseline:
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)
            # YOUR_CODE_HERE
            sess.run(baseline_update_op, feed_dict={
                sy_ob_no: ob_no,
                sy_rew_n: q_n
            })

        # ====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        # ====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.
        t += 1
        for i in range(1):
            print('updating model params..')
            sess.run(update_op, feed_dict={
                sy_ac_na: ac_na,
                sy_ob_no: ob_no,
                sy_adv_n: adv_n
            })

        _, new_r = testing()
        if new_r > best_rew:
            print('saving model params to, ', model_prefix)
            best_rew = new_r
            saver.save(sess, model_prefix)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
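# The `normalize` helper used in the baseline and advantage-normalization steps above
# is not defined in this snippet. A minimal sketch consistent with both call sites,
# normalize(adv_n) and normalize(b_n, mean, std) (the body is an assumption):
def normalize(x, mean=0.0, std=1.0):
    # Standardize x to mean zero / std one, then rescale to the requested statistics
    x = (x - np.mean(x)) / (np.std(x) + 1e-8)
    return x * std + mean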
def train_PG(exp_name='',
             env_name='HalfCheetah',
             n_iter=100,
             gamma=1.0,
             min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=False,
             animate=True,
             logdir=None,
             normalize_advantages=False,
             nn_baseline=False,
             seed=0,
             # network arguments
             n_layers=1,
             size=32,
             ):
    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = HalfCheetahEnvNew()
    # env = gym.make("RoboschoolHalfCheetah-v1")

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("Environment name: ", "HalfCheetah")
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=4)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`

    data_buffer_ppo = DataBuffer_general(10000, 4)

    # PPO hyperparameters
    timesteps_per_actorbatch = 1000
    max_timesteps = 10000000
    clip_param = 0.2
    entcoeff = 0.0
    optim_epochs = 10
    optim_stepsize = 3e-4
    optim_batchsize = 64
    gamma = 0.99
    lam = 0.95
    schedule = 'linear'
    callback = None  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon = 1e-5

    policy_nn = MlpPolicy_bc(sess=sess, env=env, hid_size=128, num_hid_layers=2,
                             clip_param=clip_param, entcoeff=entcoeff)
    # policy_nn = MlpPolicy(sess=sess, env=env, hid_size=64, num_hid_layers=2,
    #                       clip_param=clip_param, entcoeff=entcoeff, adam_epsilon=adam_epsilon)

    tf.global_variables_initializer().run()  # pylint: disable=E1101

    # Prepare for rollouts
    # ----------------------------------------
    # seg_gen = traj_segment_generator_old(policy_nn, env, timesteps_per_actorbatch)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        data_buffer_ppo.clear()
        seg = traj_segment_generator(policy_nn, env, timesteps_per_actorbatch)
        # seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        # d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not policy_nn.recurrent)
        for n in range(len(ob)):
            data_buffer_ppo.add([ob[n], ac[n], atarg[n], tdlamret[n]])
        print("data_buffer_ppo", data_buffer_ppo.size)

        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy

        policy_nn.assign_old_eq_new()  # set old parameter values to new parameter values

        # logger.log("Optimizing...")
        # logger.log(fmt_row(13, policy_nn.loss_names))

        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for i in range(int(timesteps_per_actorbatch / optim_batchsize)):
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = \
                    data_buffer_ppo.sample(optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(
                    sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target,
                    cur_lrmult, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            # logger.log(fmt_row(13, np.mean(losses, axis=0)))

        # logger.log("Evaluating losses...")
        # losses = []
        # for batch in d.iterate_once(optim_batchsize):
        #     sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = data_buffer_ppo.sample(optim_batchsize)
        #     newlosses = policy_nn.compute_losses(sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult)
        #     losses.append(newlosses)
        # meanlosses, _, _ = mpi_moments(losses, axis=0)
        # logger.log(fmt_row(13, meanlosses))
        # for (lossval, name) in zipsame(meanlosses, policy_nn.loss_names):
        #     logger.record_tabular("loss_" + name, lossval)
        # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        # logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        # logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        # logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        # logger.record_tabular("EpisodesSoFar", episodes_so_far)
        # logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        # logger.record_tabular("TimeElapsed", time.time() - tstart)
        # if MPI.COMM_WORLD.Get_rank() == 0:
        #     logger.dump_tabular()

        # Log diagnostics
        # returns = [path["reward"].sum() for path in paths]
        # ep_lengths = [pathlength(path) for path in paths]
        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
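# `add_vtarg_and_adv` follows the GAE(lambda) helper from OpenAI Baselines' PPO.
# A sketch of that computation, assuming `seg` carries the Baselines-style arrays
# `new` (episode-start flags), `vpred`, `nextvpred`, and `rew`:
def add_vtarg_and_adv(seg, gamma, lam):
    # GAE(lambda) advantage estimation over a trajectory segment
    new = np.append(seg["new"], 0)  # last element only guards the terminal check
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, 'float32')
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]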
def train(
        self,
        n_iter=100,
        seed=0,
        animate=True,
        min_timesteps_per_batch=1000,
        batch_epochs=1,
        reward_to_go=True,
):
    start = time.time()

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    total_timesteps = 0
    merged_summary = tf.summary.merge_all()
    self.summary_writer = tf.summary.FileWriter(self.log_dir, sess.graph)

    for itr in range(n_iter):
        # Collect paths until we have enough timesteps.
        # A path ends when an episode finishes or max_path_length is exceeded.
        # Each finished path is appended to `paths`; once the total number of steps in
        # the batch exceeds the required batch size, collection stops and training starts,
        # so every update uses only complete trajectories.
        # PG always samples actions from the current policy distribution; there is no
        # separate exploration mechanism.
        # TODO: split rollout collection and training into two processes so that
        # neither has to wait for the other.
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = self.env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    self.env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(self.sy_sampled_ac, feed_dict={self.sy_ob_no: ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = self.env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > self.max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        # YOUR_CODE_HERE
        q_n = []
        reward_n = []
        for path in paths:
            reward = path['reward']
            max_step = len(reward)
            reward_n.extend(reward)
            if reward_to_go:
                # Value estimate starting from the current timestep t
                q = [
                    np.sum(np.power(self.gamma, np.arange(max_step - t)) * reward[t:])
                    for t in range(max_step)
                ]
            else:
                # Q estimate from the whole trajectory
                q = [
                    np.sum(np.power(self.gamma, np.arange(max_step)) * reward)
                    for t in range(max_step)
                ]
            q_n.extend(q)

        epoch_step = 1
        for epoch in range(batch_epochs):
            # ====================================================================================#
            #                           ----------SECTION 5----------
            # Computing Baselines
            # ====================================================================================#
            # print('run %d epoch' % epoch)
            if self.nn_baseline:
                # If nn_baseline is True, use your neural network to predict reward-to-go
                # at each timestep for each trajectory, and save the result in a variable 'b_n'
                # like 'ob_no', 'ac_na', and 'q_n'.
                #
                # Hint #bl1: rescale the output from the nn_baseline to match the statistics
                # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
                # #bl2 below.)
                b_n = sess.run(self.baseline_prediction, feed_dict={self.sy_ob_no: ob_no})
                # b_n_norm = b_n - np.mean(b_n, axis=0) / (np.std(b_n, axis=0) + 1e-7)
                # Rescale b_n back with q_n's statistics, because b_n is fit against
                # standardized targets in the optimization step below
                b_n = b_n * np.std(q_n, axis=0) + np.mean(q_n, axis=0)
                if self.gae_lambda > 0:
                    adv_n = lambda_advantage(reward_n, b_n, len(reward_n),
                                             self.gae_lambda * self.gamma)
                else:
                    adv_n = q_n - b_n
            else:
                adv_n = q_n.copy()

            # ====================================================================================#
            #                           ----------SECTION 4----------
            # Advantage Normalization
            # ====================================================================================#

            if self.normalize_advantages:
                # On the next line, implement a trick which is known empirically to reduce variance
                # in policy gradient methods: normalize adv_n to have mean zero and std=1.
                # YOUR_CODE_HERE
                adv_mean = np.mean(adv_n, axis=0)
                adv_std = np.std(adv_n, axis=0)
                adv_n = (adv_n - adv_mean) / (adv_std + 1e-7)

            # ====================================================================================#
            #                           ----------SECTION 5----------
            # Optimizing Neural Network Baseline
            # ====================================================================================#
            if self.nn_baseline:
                # If a neural network baseline is used, set up the targets and the inputs for the
                # baseline.
                #
                # Fit it to the current batch in order to use for the next iteration. Use the
                # baseline_update_op you defined earlier.
                #
                # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
                # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)
                # Standardized q_n serves as the optimization target for the baseline
                q_n_mean = np.mean(q_n, axis=0)
                q_n_std = np.std(q_n, axis=0)
                q_n = (q_n - q_n_mean) / (q_n_std + 1e-7)
                sess.run(self.baseline_update_op, feed_dict={
                    self.sy_ob_no: ob_no,
                    self.baseline_targets: q_n
                })

            # ====================================================================================#
            #                           ----------SECTION 4----------
            # Performing the Policy Update
            # ====================================================================================#

            # Call the update operation necessary to perform the policy gradient update based on
            # the current batch of rollouts.
            #
            # For debug purposes, you may wish to save the value of the loss function before
            # and after an update (loss_1 / loss_2), and then log them below.
            feed_dict = {
                self.sy_ob_no: ob_no,
                self.sy_ac_na: ac_na,
                self.sy_adv_n: adv_n
            }
            sess.run(self.param_assign_op, feed_dict)
            # loss_1 = sess.run(self.loss, feed_dict)
            _, summary_val = sess.run([self.update_op, merged_summary], feed_dict)
            # loss_2 = sess.run(self.loss, feed_dict)
            global_step = itr * batch_epochs + epoch_step
            epoch_step = epoch_step + 1
            self.summary_writer.add_summary(summary_val, global_step)
            # self.summary_writer.flush()

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        # logz.log_tabular("LossDelta", loss_1 - loss_2)
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
        self.summary_writer.flush()
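# `lambda_advantage` is not defined in this snippet. The call site passes a single
# combined decay (self.gae_lambda * self.gamma), so this minimal sketch assumes the
# GAE-style recursion A_t = delta_t + decay * A_{t+1} with the discount folded into
# `decay`; the real helper may differ.
def lambda_advantage(rewards, values, length, decay):
    # Backward pass accumulating exponentially-weighted TD residuals
    adv = np.zeros(length)
    last = 0.0
    for t in reversed(range(length)):
        next_v = values[t + 1] if t + 1 < length else 0.0
        delta = rewards[t] + next_v - values[t]  # TD residual (discount assumed in decay)
        last = delta + decay * last
        adv[t] = last
    return adv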
def train_AC(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length,
             learning_rate, num_target_updates, num_grad_steps_per_target_update, animate,
             logdir, normalize_advantages, seed, n_layers, size):
    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
        'num_target_updates': num_target_updates,
        'num_grad_steps_per_target_update': num_grad_steps_per_target_update,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_advantage_args = {
        'gamma': gamma,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_advantage_args)

    # Build computation graph
    agent.build_computation_graph()

    # TensorFlow: config, session, variable initialization
    agent.init_tf_sess()

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate([path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])
        print(ob_no.shape)
        print("terminal shape" + str(terminal_n.shape))

        # Call tensorflow operations to:
        # (1) update the critic, by calling agent.update_critic
        # (2) use the updated critic to compute the advantage, by calling agent.estimate_advantage
        # (3) use the estimated advantage values to update the actor, by calling agent.update_actor
        # YOUR CODE HERE
        agent.update_critic(ob_no=ob_no, next_ob_no=next_ob_no, re_n=re_n, terminal_n=terminal_n)
        adv = agent.estimate_advantage(ob_no=ob_no, next_ob_no=next_ob_no, re_n=re_n,
                                       terminal_n=terminal_n)
        agent.update_actor(ob_no=ob_no, ac_na=ac_na, adv_n=adv)

        # Log diagnostics
returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
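# train_AC defers the advantage computation to agent.estimate_advantage; with the
# bootstrapped critic targets used here, that presumably reduces to the one-step
# TD advantage A(s, a) = r + gamma * V(s') * (1 - done) - V(s). A hedged sketch of
# that arithmetic, assuming the value arrays were already predicted by the critic:
import numpy as np

def one_step_advantage(re_n, v_n, next_v_n, terminal_n, gamma):
    # A(s, a) = r + gamma * V(s') * (1 - done) - V(s); terminal_n is 1 at
    # episode ends, which masks out the bootstrap term.
    return re_n + gamma * next_v_n * (1.0 - terminal_n) - v_n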
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, test=False, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, seed=0, # network arguments n_layers=1, size=32): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # ========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None # ========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] print('observation dim: ', ob_dim) print('action dim: ', ac_dim) print('action space: ', discrete) # print("hellooooooo",ac_dim,env.action_space.shape) # ========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. # ========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(dtype=tf.float32, shape=[None], name="adv") # ========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) 
# # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # # ========================================================================================# if discrete: # YOUR_CODE_HERE sy_logits_na = build_mlp(sy_ob_no, ac_dim, scope="build_nn", n_layers=n_layers, size=size, activation=tf.nn.relu) sy_sampled_ac = tf.one_hot(tf.squeeze(tf.multinomial(sy_logits_na, 1)), ac_dim) # Hint: Use the tf.multinomial op # batch_size x ac_dim sy_logprob_n = tf.nn.softmax_cross_entropy_with_logits_v2( labels=sy_ac_na, logits=sy_logits_na) # batch_size ---> log probability for each action # Learned from https://github.com/InnerPeace-Wu/ # # Another way to do it # N = tf.shape(sy_ob_no)[0] # sy_prob_na = tf.nn.softmax(sy_logits_na) # sy_logprob_n = tf.log(tf.gather_nd(sy_prob_na, tf.stack((tf.range(N), sy_ac_na), axis=1))) else: # YOUR_CODE_HERE sy_mean = build_mlp(sy_ob_no, ac_dim, scope="build_nn", n_layers=n_layers, size=size, activation=tf.nn.relu) sy_logstd = tf.Variable(tf.zeros(ac_dim), name='logstd', dtype=tf.float32) sy_std = tf.exp(sy_logstd) sy_sampled_ac = sy_mean + tf.multiply( sy_std, tf.random_normal(tf.shape(sy_mean))) sy_z = (sy_ac_na - sy_mean) / sy_std sy_logprob_n = 0.5 * tf.reduce_sum(tf.square(sy_z), axis=1) # sy_logprob_n = 0.5*tf.reduce_sum(tf.squared_difference(tf.div(sy_mean,sy_std), # tf.div(sy_ac_na,sy_std))) # Hint: Use the log probability under a multivariate gaussian. # ========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation # ========================================================================================# # loss = tf.reduce_sum(tf.multiply(tf.nn.softmax_cross_entropy_with_logits_v2(labels=sy_ac_na,logits=sy_logits_na),sy_adv_n)) # Loss function that we'll differentiate to get the policy gradient. 
loss = tf.reduce_sum(tf.multiply(sy_logprob_n, sy_adv_n)) actor_update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) actor_params = tf.trainable_variables() # ========================================================================================# # critic graph # Loss and training operations # ========================================================================================# predict_value = critic(sy_ob_no) sy_target_value = tf.placeholder(dtype=tf.float32, shape=[None], name="target_value") predict_value = tf.squeeze(predict_value) rms_loss = tf.reduce_mean( tf.squared_difference(predict_value, sy_target_value)) critic_update_op = tf.train.AdamOptimizer(learning_rate).minimize(rms_loss) critic_params = tf.trainable_variables()[len(actor_params):] # ========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization # ========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` actor_saver = tf.train.Saver(actor_params, max_to_keep=1) critic_saver = tf.train.Saver(critic_params, max_to_keep=1) checkpoint_actor_dir = os.path.join(os.curdir, 'Actor_GAE_0.7' + str(env_name)) if not os.path.exists(checkpoint_actor_dir): os.makedirs(checkpoint_actor_dir) actor_prefix = os.path.join(checkpoint_actor_dir, "model.ckpt") ckpt_1 = tf.train.get_checkpoint_state(checkpoint_actor_dir) checkpoint_critic_dir = os.path.join(os.curdir, 'Critic_GAE_0.7' + str(env_name)) if not os.path.exists(checkpoint_critic_dir): os.makedirs(checkpoint_critic_dir) critic_prefix = os.path.join(checkpoint_critic_dir, "model.ckpt") ckpt_2 = tf.train.get_checkpoint_state(checkpoint_critic_dir) if ckpt_1 and tf.train.checkpoint_exists(ckpt_1.model_checkpoint_path): print("Reading actor parameters from %s" % ckpt_1.model_checkpoint_path) actor_saver.restore(sess, ckpt_1.model_checkpoint_path) if ckpt_2 and tf.train.checkpoint_exists(ckpt_2.model_checkpoint_path): print("Reading critic parameters from %s" % ckpt_2.model_checkpoint_path) critic_saver.restore(sess, ckpt_2.model_checkpoint_path) uninitialized_vars = [] for var in tf.global_variables(): try: sess.run(var) except tf.errors.FailedPreconditionError: uninitialized_vars.append(var) if len(uninitialized_vars) > 0: init_new_vars_op = tf.variables_initializer(uninitialized_vars) sess.run(init_new_vars_op) def testing(): print('testing..') ob = env.reset() steps = 0 total_r = 0 while True: one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) if discrete: ac = int(np.argmax(one_hot_ac)) else: ac = one_hot_ac ob, rew, done, _ = env.step(ac) env.render() total_r += rew steps += 1 if steps > max_path_length: break print(steps, total_r) return steps, total_r # ========================================================================================# # Training Loop # ========================================================================================# if test: testing() return total_timesteps = 0 best_steps, best_rew = testing() # best_rew = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] next_obs = [] animate_this_episode = (len(paths) == 0 and (itr % 30 == 0) and animate) steps = 0 
while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) if discrete: ac = int(np.argmax(one_hot_ac)) else: ac = one_hot_ac acs.append(one_hot_ac) next_ob, rew, done, _ = env.step( ac ) # transition dynamics P(s_{t+1}|s_t, a_t) with reward r(s_t, a_t) next_obs.append(next_ob) ob = next_ob rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = { "observation": np.array(obs), "reward": np.array(rewards), "action": np.array(acs), "next_observation": np.array(next_obs) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) next_ob_no = np.concatenate( [path["next_observation"] for path in paths]) rew_no = np.concatenate([path["reward"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) ac_na = ac_na.reshape([-1, ac_dim]) print("ac_na shape:", ac_na.shape) # ======================== Finding target values ===================================# # critic target = r(s,a) + gamma * V(s'); the advantage below subtracts V(s). # This estimate has less variance but is biased. Alternatively # we can go for n-step returns or GAE (Generalized Advantage Estimation) # ==================================================================================# next_values = sess.run(predict_value, feed_dict={sy_ob_no: next_ob_no}) target_values = rew_no + gamma * next_values # fit critic with target r(s,a) + gamma*V(s') print('updating the critic params..') sess.run(critic_update_op, feed_dict={ sy_ob_no: ob_no, sy_target_value: target_values }) current_values = sess.run(predict_value, feed_dict={sy_ob_no: ob_no}) next_values = sess.run(predict_value, feed_dict={sy_ob_no: next_ob_no}) adv_n = rew_no + gamma * next_values - current_values # ====================== Generalized Advantage Estimation =========================== # # A(s_t, a_t) = sum_{t'=t}^{t'=inf} (gamma*lambda)^{t'-t} delta_{t'}, where # delta_{t} = r(s_t, a_t) + gamma*V(s_{t+1}) - V(s_t) # ================================================================================== # q_n = list() GAE = True if GAE: ind = 0 lam = 0.7 for path in paths: pLen = pathlength(path) q_p = np.zeros(pLen) q_p[pLen - 1] = adv_n[ind + pLen - 1] for t in reversed(range(pLen - 1)): q_p[t] = adv_n[ind + t] + (gamma * lam) * q_p[t + 1] q_p = np.array(q_p) q_n.append(q_p) ind += pLen # =========================== n-step returns =========================================# # Consider only the n-step returns instead of until the end of episode.
# Variance reduction technique # adv(s_t) = sum_{t'=t}^(t+n) gamma^{t'-t}*r(t') + gamma^{n} V(s_{t+n}) - V(s_t) # ====================================================================================# n_step_returns = False if n_step_returns: n = 100 value_paths = [] for path in paths: ob = path['observation'] pLen = pathlength(path) values = sess.run(predict_value, feed_dict={sy_ob_no: ob}) x = {} x['value'] = values value_paths.append(x) for ind, path in enumerate(paths): pLen = pathlength(path) q_p = np.zeros(pLen) rew = path['reward'] values = value_paths[ind]['value'] for i in range(pLen): start = i end = min(start + n, pLen - 1) for j, r in enumerate(rew[start:end]): q_p[i] += pow(gamma, j) * r q_p[i] += pow(gamma, n) * values[end] q_p[i] -= values[start] q_p = np.array(q_p) q_n.append(q_p) q_n = np.concatenate(q_n) adv_n = q_n.copy() # ====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update # ====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] if np.mean(returns) > best_rew: best_rew = np.mean(returns) print('saving actor to ', actor_prefix) actor_saver.save(sess, actor_prefix) print('saving critic to ', critic_prefix) critic_saver.save(sess, critic_prefix) sess.run(actor_update_op, feed_dict={ sy_ac_na: ac_na, sy_ob_no: ob_no, sy_adv_n: adv_n }) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
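# The GAE loop above indexes a flat adv_n array via a running offset; the same
# backward recursion is easier to audit one path at a time. A minimal sketch,
# assuming deltas holds the one-step TD errors r_t + gamma*V(s_{t+1}) - V(s_t)
# for a single path:
import numpy as np

def gae_advantages(deltas, gamma, lam):
    # Backward recursion: A_t = delta_t + (gamma * lam) * A_{t+1}, with A_T = delta_T.
    adv = np.zeros(len(deltas))
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv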
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, to_animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, video_dir=None): start = time.time() nn_params = {"n_layers": n_layers, "size": size, "lr": learning_rate} # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) #env._max_episode_steps = 4000 to_animate = ToAnimate(False) to_animate.animate = False if video_dir is not None: env = gym.wrappers.Monitor(env, video_dir, force=True, video_callable=to_animate) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] (sy_sampled_ac, sy_ob_no, sy_ac_na, sy_adv_n), (update_op, loss) = get_policy_gradient_NN(ob_dim, ac_dim, discrete, nn_params) if nn_baseline: baseline_predictor = BaselinePredictor(sy_ob_no, epoch_num=500, nn_params=nn_params) tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() # pylint: disable=E1101 # Training Loop total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps for one batch paths, num_collected_timesteps = collect_paths( sess, sy_sampled_ac, sy_ob_no, env, min_timesteps, max_path_length, to_animate, itr, discrete) total_timesteps += num_collected_timesteps # Build arrays for observation, action for the policy gradient update # by concatenating across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) q_n = get_reward(paths, gamma, reward_to_go) if nn_baseline: # Getting baselines for each timesteps b_n = baseline_predictor.predict(ob_no)[0] # Rescaling the output to mach statistics of Q-values b_n = (b_n - np.mean(b_n)) / np.std(b_n) b_n = np.mean(q_n) + (b_n * np.std(q_n)) adv_n = q_n - b_n else: adv_n = q_n.copy() if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
adv_n = (adv_n - np.mean(adv_n)) / np.std(adv_n) if nn_baseline: baseline_predictor.fit(inputs=ob_no, labels=(q_n - np.mean(q_n)) / np.std(q_n), n_iter=1) if discrete: ac_na = ac_na.flatten() # FIXME loss_before = sess.run( loss, feed_dict={ sy_ob_no: ob_no, # observations sy_ac_na: ac_na, # taken actions sy_adv_n: adv_n # advantages }) sess.run( update_op, feed_dict={ sy_ob_no: ob_no, # observations sy_ac_na: ac_na, # taken actions sy_adv_n: adv_n # advantages }) loss_after = sess.run( loss, feed_dict={ sy_ob_no: ob_no, # observations sy_ac_na: ac_na, # taken actions sy_adv_n: adv_n # advantages }) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] #logz.log_tabular("Loss_before", loss_before) logz.log_tabular("Loss_after", loss_after) logz.log_tabular("delta_loss", loss_after - loss_before) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", len(ac_na)) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
def train_MAPG( exp_name='', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, learning_rate=5e-3, logdir=None, normalize_advantages=True, seed=101, # network arguments n_layers=1, size=32): #========================================================================================# # Logfile setup #========================================================================================# start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_MAPG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) #========================================================================================# # Env setup #========================================================================================# nAgent = 2 # hard coded! env1 = Simulator(seed=101, N_agent=nAgent, N_prod=3, Tstamp=10, costQ=np.array([[0.3, 0.3, 0.3]]), costInv=np.array([[0.2, 0.2, 0.2]]), costLastInv=np.array([[2, 2, 2]]), costBack=np.array([[0.75, 0.75, 0.75]])) env2 = Simulator(seed=202, N_agent=nAgent, N_prod=3, Tstamp=10, costQ=np.array([[0.3, 0.3, 0.3]]), costInv=np.array([[0.2, 0.2, 0.2]]), costLastInv=np.array([[2, 2, 2]]), costBack=np.array([[0.75, 0.75, 0.75]])) # Observation and action sizes ob_dim = env1.obs_dim() ac_dim = env1.act_dim() print('observation dimension is: ', ob_dim) print('action dimension is: ', ac_dim) print('critic network input dimension is:', ob_dim[0] + ac_dim[0] * ac_dim[1] * nAgent) #========================================================================================# # PG Networks #========================================================================================# def PGNet(sy_ob_no, sy_ac_na, sy_adv_n, agent_id): sy_mean = build_mlp(input_placeholder=sy_ob_no, output_size=ac_dim[0] * ac_dim[1], scope=str(seed) + 'MA_' + str(agent_id), n_layers=n_layers, output_activation=tf.sigmoid, size=size, scale=10.) sy_logstd = tf.Variable(tf.truncated_normal( shape=[1, ac_dim[0] * ac_dim[1]], stddev=0.1), name='var_std' + str(agent_id)) sy_sampled_ac = sy_mean + tf.multiply( tf.random_normal(shape=tf.shape(sy_mean)), tf.exp(sy_logstd)) MVN_dist = tf.contrib.distributions.MultivariateNormalDiag( sy_mean, tf.exp(sy_logstd)) sy_logprob_n = MVN_dist.log_prob(sy_ac_na) # Loss function for PG network loss = -tf.reduce_mean( tf.multiply(sy_logprob_n, sy_adv_n) ) # Loss function that we'll differentiate to get the policy gradient. 
update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) return sy_sampled_ac, loss, update_op #========================================================================================# # Critic network #========================================================================================# def CriticNet(sy_ob_critic, baseline_target, agent_id): baseline_prediction = tf.squeeze( build_mlp(sy_ob_critic, output_size=1, scope=str(seed) + "critic_" + str(agent_id), n_layers=n_layers, size=size)) baseline_loss = tf.nn.l2_loss(baseline_target - baseline_prediction) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize( baseline_loss) return baseline_prediction, baseline_loss, baseline_update_op #========================================================================================# # Add networks in a loop #========================================================================================# sy_ob_no_1 = tf.placeholder(shape=[None, ob_dim[0]], name='ob' + str(1), dtype=tf.float32) sy_ac_na_1 = tf.placeholder(shape=[None, ac_dim[0] * ac_dim[1]], name='ac' + str(1), dtype=tf.float32) sy_adv_n_1 = tf.placeholder(shape=[None], name='adv' + str(1), dtype=tf.float32) sy_ob_critic_1 = tf.placeholder( shape=[None, ob_dim[0] + ac_dim[0] * ac_dim[1] * nAgent], name='critic_ob' + str(1), dtype=tf.float32) baseline_target_1 = tf.placeholder(shape=[None], name='baseline_target_qn' + str(1), dtype=tf.float32) sy_sampled_ac_1, loss_1, update_op_1 = PGNet(sy_ob_no_1, sy_ac_na_1, sy_adv_n_1, 1) baseline_prediction_1, baseline_loss_1, baseline_update_op_1 = CriticNet( sy_ob_critic_1, baseline_target_1, 1) sy_ob_no_2 = tf.placeholder(shape=[None, ob_dim[0]], name='ob' + str(2), dtype=tf.float32) sy_ac_na_2 = tf.placeholder(shape=[None, ac_dim[0] * ac_dim[1]], name='ac' + str(2), dtype=tf.float32) sy_adv_n_2 = tf.placeholder(shape=[None], name='adv' + str(2), dtype=tf.float32) sy_ob_critic_2 = tf.placeholder( shape=[None, ob_dim[0] + ac_dim[0] * ac_dim[1] * nAgent], name='critic_ob' + str(2), dtype=tf.float32) baseline_target_2 = tf.placeholder(shape=[None], name='baseline_target_qn' + str(2), dtype=tf.float32) sy_sampled_ac_2, loss_2, update_op_2 = PGNet(sy_ob_no_2, sy_ac_na_2, sy_adv_n_2, 2) baseline_prediction_2, baseline_loss_2, baseline_update_op_2 = CriticNet( sy_ob_critic_2, baseline_target_2, 2) # exec("sy_sampled_ac_%s, loss_%s, update_op_%s = PGNet(sy_ob_no_%s, sy_ac_na_%s, sy_adv_n_%s, agent)"%(agent, agent, agent, agent, agent, agent)) # exec("baseline_prediction_%s, baseline_loss_%s, baseline_update_op_%s = CriticNet(sy_ob_critic_%s, baseline_target_%s, agent)"%(agent, agent, agent, agent, agent)) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# num_gpu = 0 tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, device_count={'GPU': num_gpu}) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 total_numpaths = 0 demand_cov = np.array([[0.1, -0.5 * 0.3, -0.5 * 0.3], [-0.5 * 0.3, 0.1, 0.5 * 0.3], [-0.5 * 0.3, 0.5 * 0.3, 0.1]]) for itr in 
range(n_iter): #========================# # Sampling #========================# randk1 = 0 + itr * seed randk2 = 12306 + itr * seed print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 num_path = 0 paths1 = [] paths2 = [] while True: steps = 0 last = False ob1 = env1.randomInitialStateGenerator() obs1, acs1, rewards1, criticObs1 = [], [], [], [] ob2 = env2.randomInitialStateGenerator() obs2, acs2, rewards2, criticObs2 = [], [], [], [] while steps < env1.Tstamp: if steps == env1.Tstamp - 1: last = True obs1.append(ob1.flatten()) obs2.append(ob2.flatten()) ac1 = sess.run(sy_sampled_ac_1, feed_dict={sy_ob_no_1: ob1}) ac2 = sess.run(sy_sampled_ac_2, feed_dict={sy_ob_no_2: ob2}) acs1.append(ac1.flatten()) acs2.append(ac2.flatten()) criticObs1.append( np.append(np.append(ob1.flatten(), ac1.flatten()), ac2.flatten()).flatten()) criticObs2.append( np.append(np.append(ob2.flatten(), ac2.flatten()), ac1.flatten()).flatten()) actList = [ac1.reshape(-1, 2), ac2.reshape(-1, 2)] demand = env1.demandGenerator_p( actList, M=np.array([10, 10, 10]).reshape(-1, 1), V=np.array([5, 5, 5]).reshape(-1, 1), sens=np.array([1.5, 1.5, 1.5]).reshape(-1, 1), cov=demand_cov, seed=randk1) demand1 = demand[:, 0] demand2 = demand[:, 1] # demand2 = env2.demandGenerator_p(actList, # M = np.array([3, 3, 3]).reshape(-1,1), # V = np.array([5,5,5]).reshape(-1,1), # sens = np.array([1, 1, 1]).reshape(-1,1), # cov = np.diag(np.array([0.25, 0.25, 0.25])), # seed = randk2) ob1, rew1 = env1.step(actList[0], ob1.flatten(), demand1, last) ob2, rew2 = env2.step(actList[1], ob2.flatten(), demand2, last) randk1 += 1 randk2 += 1 rewards1.append(rew1) rewards2.append(rew2) steps += 1 path1 = { "observation": np.array(obs1), "reward": np.array(rewards1), "action": np.array(acs1), "criticObservation": np.array(criticObs1) } path2 = { "observation": np.array(obs2), "reward": np.array(rewards2), "action": np.array(acs2), "criticObservation": np.array(criticObs2) } paths1.append(path1) paths2.append(path2) num_path += 1 timesteps_this_batch += pathlength(path1) if timesteps_this_batch > min_timesteps_per_batch: break total_numpaths += num_path total_timesteps += timesteps_this_batch if last and itr == n_iter - 1: pickle.dump(path1, open(logdir + '/trained_path1_sample.pkl', 'wb'), protocol=2) pickle.dump(path2, open(logdir + '/trained_path2_sample.pkl', 'wb'), protocol=2) # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no1 = np.concatenate([path["observation"] for path in paths1]) ac_na1 = np.concatenate([path["action"] for path in paths1]) critic_ob_no1 = np.concatenate( [path["criticObservation"] for path in paths1]) ob_no2 = np.concatenate([path["observation"] for path in paths2]) ac_na2 = np.concatenate([path["action"] for path in paths2]) critic_ob_no2 = np.concatenate( [path["criticObservation"] for path in paths2]) # print(ob_no.shape) # print(ac_na.shape) # print(path['reward'].shape) #========================# # Compute Q value #========================# q_n1 = np.concatenate([[ np.npv((1 / gamma - 1), path["reward"][i:]) for i in range(len(path["reward"])) ] for path in paths1]) q_n2 = np.concatenate([[ np.npv((1 / gamma - 1), path["reward"][i:]) for i in range(len(path["reward"])) ] for path in paths2]) #========================# # Compute Baselines #========================# q_n_mean1 = q_n1.mean() q_n_std1 = q_n1.std() q_n1 = (q_n1 - q_n_mean1) / q_n_std1 b_n1 = baseline_prediction_1 adv_n_baseline1 = q_n1 
- b_n1 q_n_mean2 = q_n2.mean() q_n_std2 = q_n2.std() q_n2 = (q_n2 - q_n_mean2) / q_n_std2 b_n2 = baseline_prediction_2 adv_n_baseline2 = q_n2 - b_n2 # if bootstrap: # last_critic_ob_no1 = np.concatenate([path["criticObservation"] for path in paths1]) # lastFit1 = sess.run(baseline_prediction_1, # feed_dict = {sy_ob_critic_1: critic_ob_no1[]}) #====================================# # Optimizing Neural Network Baseline #====================================# _, adv_n1 = sess.run([baseline_update_op_1, adv_n_baseline1], feed_dict={ baseline_target_1: q_n1, sy_ob_critic_1: critic_ob_no1 }) adv_n1 = adv_n1 * q_n_std1 + q_n_mean1 _, adv_n2 = sess.run([baseline_update_op_2, adv_n_baseline2], feed_dict={ baseline_target_2: q_n2, sy_ob_critic_2: critic_ob_no2 }) adv_n2 = adv_n2 * q_n_std2 + q_n_mean2 #====================================================================================# # Advantage Normalization #====================================================================================# if normalize_advantages: adv_n1 = (adv_n1 - adv_n1.mean()) / adv_n1.std() adv_n2 = (adv_n2 - adv_n2.mean()) / adv_n2.std() #====================================================================================# # Performing the Policy Update #====================================================================================# _, train_loss1 = sess.run([update_op_1, loss_1], feed_dict={ sy_adv_n_1: adv_n1, sy_ac_na_1: ac_na1, sy_ob_no_1: ob_no1 }) _, train_loss2 = sess.run([update_op_2, loss_2], feed_dict={ sy_adv_n_2: adv_n2, sy_ac_na_2: ac_na2, sy_ob_no_2: ob_no2 }) print("PG Network 1 training loss: %.5f" % train_loss1) print("PG Network 2 training loss: %.5f" % train_loss2) # Log diagnostics returns1 = np.array([path["reward"].sum() for path in paths1]) returns2 = np.array([path["reward"].sum() for path in paths2]) totalReturn = returns1 + returns2 ep_lengths = [pathlength(path) for path in paths1] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn1", np.mean(returns1)) logz.log_tabular("StdReturn1", np.std(returns1)) logz.log_tabular("MaxReturn1", np.max(returns1)) logz.log_tabular("MinReturn1", np.min(returns1)) logz.log_tabular("AverageReturn2", np.mean(returns2)) logz.log_tabular("StdReturn2", np.std(returns2)) logz.log_tabular("MaxReturn2", np.max(returns2)) logz.log_tabular("MinReturn2", np.min(returns2)) logz.log_tabular("AverageTotalReturn", np.mean(totalReturn)) logz.log_tabular("StdReturn", np.std(totalReturn)) logz.log_tabular("MaxReturn", np.max(totalReturn)) logz.log_tabular("MinReturn", np.min(totalReturn)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("NumPathsThisBatch", num_path) logz.log_tabular("NumPathsSoFar", total_numpaths) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
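# The multi-agent variant computes discounted reward-to-go through np.npv: with
# rate = 1/gamma - 1, the per-step discount 1/(1 + rate) equals gamma exactly, so
# np.npv((1/gamma - 1), r[i:]) is sum_k gamma^k * r_{i+k}. A quick equivalence
# check (np.npv was removed in NumPy 1.20; it now lives in numpy-financial):
import numpy as np

rewards = np.array([1.0, 2.0, 3.0])
gamma = 0.9
npv_value = np.npv(1 / gamma - 1, rewards)                 # discounts by 1/(1+rate) = gamma
direct = sum(gamma**k * r for k, r in enumerate(rewards))  # 1 + 0.9*2 + 0.81*3
assert np.isclose(npv_value, direct)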
def train_PG( exp_name='', env_name='', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=False, animate=True, logdir=None, normalize_advantages=False, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, # mb mpc arguments model_learning_rate=1e-3, onpol_iters=10, dynamics_iters=260, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=1000, env_horizon=1000, mpc_horizon=10, m_n_layers=2, m_size=500, ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment # env = gym.make(env_name) env = HalfCheetahEnvNew() cost_fn = cheetah_cost_fn activation=tf.nn.relu output_activation=None # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes # max_path_length = max_path_length or env.spec.max_episode_steps max_path_length = max_path_length # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] # Print environment infomation print("-------- env info --------") print("Environment name: ", env_name) print("Action space is discrete: ", discrete) print("Action space dim: ", ac_dim) print("Observation space dim: ", ob_dim) print("Max_path_length ", max_path_length) #========================================================================================# # Random data collection #========================================================================================# random_controller = RandomController(env) data_buffer_model = DataBuffer() data_buffer_ppo = DataBuffer_general(10000, 4) # sample path print("collecting random data ..... 
") paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) # add into buffer for path in paths: for n in range(len(path['observations'])): data_buffer_model.add(path['observations'][n], path['actions'][n], path['next_observations'][n]) print("data buffer size: ", data_buffer_model.size) normalization = compute_normalization(data_buffer_model) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto() tf_config.allow_soft_placement = True tf_config.intra_op_parallelism_threads =4 tf_config.inter_op_parallelism_threads = 1 sess = tf.Session(config=tf_config) dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate) if nn_baseline: value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) if MPC: dyn_model.fit(data_buffer_model) returns = [] costs = [] # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: # print("data buffer size: ", data_buffer_model.size) current_path = {'observations': [], 'actions': [], 'reward': [], 'next_observations':[]} ob = env.reset() obs, acs, mpc_acs, rewards = [], [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 return_ = 0 while True: # print("steps ", steps) if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) if MPC: mpc_ac = mpc_controller.get_action(ob) else: mpc_ac = random_controller.get_action(ob) ac = policy_nn.predict(ob, mpc_ac) ac = ac[0] if not PG: ac = mpc_ac acs.append(ac) mpc_acs.append(mpc_ac) current_path['observations'].append(ob) ob, rew, done, _ = env.step(ac) current_path['reward'].append(rew) current_path['actions'].append(ac) current_path['next_observations'].append(ob) return_ += rew rewards.append(rew) steps += 1 if done or steps > max_path_length: break if MPC: # cost & return cost = path_cost(cost_fn, current_path) costs.append(cost) returns.append(return_) print("total return: ", return_) print("costs: ", cost) # add into buffers for n in range(len(current_path['observations'])): data_buffer_model.add(current_path['observations'][n], current_path['actions'][n], current_path['next_observations'][n]) for n in range(len(current_path['observations'])): data_buffer_ppo.add(current_path['observations'][n], current_path['actions'][n], current_path['reward'][n], current_path['next_observations'][n]) path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs), "mpc_action" : np.array(mpc_acs)} paths.append(path) timesteps_this_batch += 
pathlength(path) # print("timesteps_this_batch", timesteps_this_batch) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch print("data_buffer_ppo.size:", data_buffer_ppo.size) # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths]) # Computing Q-values if reward_to_go: q_n = [] for path in paths: for t in range(len(path["reward"])): t_ = 0 q = 0 while t_ < len(path["reward"]): if t_ >= t: q += gamma**(t_-t) * path["reward"][t_] t_ += 1 q_n.append(q) q_n = np.asarray(q_n) else: q_n = [] for path in paths: for t in range(len(path["reward"])): t_ = 0 q = 0 while t_ < len(path["reward"]): q += gamma**t_ * path["reward"][t_] t_ += 1 q_n.append(q) q_n = np.asarray(q_n) # Computing Baselines if nn_baseline: # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no :ob_no}) b_n = value_nn.predict(ob_no) b_n = normalize(b_n) b_n = denormalize(b_n, np.std(q_n), np.mean(q_n)) adv_n = q_n - b_n else: adv_n = q_n.copy() # Advantage Normalization if normalize_advantages: adv_n = normalize(adv_n) # Optimizing Neural Network Baseline if nn_baseline: b_n_target = normalize(q_n) value_nn.fit(ob_no, b_n_target) # sess.run(baseline_update_op, feed_dict={sy_ob_no :ob_no, sy_baseline_target_n:b_n_target}) # Performing the Policy Update # policy_nn.fit(ob_no, ac_na, adv_n) policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na) # sess.run(update_op, feed_dict={sy_ob_no :ob_no, sy_ac_na:ac_na, sy_adv_n:adv_n}) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
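# The reward-to-go loops above rescan the remaining rewards for every timestep,
# which is O(T^2) per path. The same q_n follows from one O(T) backward sweep;
# a sketch:
import numpy as np

def discounted_rewards_to_go(rewards, gamma):
    # q[t] = r_t + gamma * q[t+1], computed right to left.
    q = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q

# q_n = np.concatenate([discounted_rewards_to_go(p["reward"], gamma) for p in paths])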
def reinforce(sess, exp, pg_model, value_model, env, gamma, isRTG=True, n_iterations=100, n_batch=100, isRenderding=True, isRecordingVideo=True, recordingVideo_dir="video", isNNBaseLine=True, isNormalizeAdvantage=True, isAdaptiveStd=False, test_name="test", logging_dir="log", seed=0): # Get environment name env_name = env.spec.id # Configure output directory for logging logz.configure_output_dir(os.path.join(logging_dir, '%d' % exp)) recordingVideo_dir = os.path.join(recordingVideo_dir, '%d' % exp) if not os.path.exists(recordingVideo_dir): os.makedirs(recordingVideo_dir) # Log experimental parameters args = inspect.getargspec(reinforce)[0] locals_ = locals() params = { k: locals_[k] if k in locals_ and isinstance(locals_[k], (int, str, float)) else None for k in args } logz.save_params(params) print("Policy Gradient for {} Environment".format(env_name)) for iter in range(n_iterations): print("==========================================") print("Iteration: ", iter) steps_in_batch = 0 trajectories = [] tic = time.clock() episode = 1 video_recorder = None # Outer loop for collecting a trajectory batch while True: episode_states, episode_actions, episode_rewards, episode_returns, episode_advantages = [], [], [], [], [] episode_steps = 0 state = env.reset() if isRecordingVideo and episode == 1 and ( iter % 10 == 0 or iter == n_iterations - 1 or iter == 0): video_recorder = VideoRecorder( env, os.path.join( recordingVideo_dir, "vid_{}_{}_{}_{}.mp4".format(env_name, exp, test_name, iter)), enabled=True) print("Recording a video of this episode {} in iteration {}". format(episode, iter)) # Roll-out trajectory to collect a batch while True: if isRenderding: env.render() if video_recorder: video_recorder.capture_frame() # Choose an action based on observation action = pg_model.predict(state, sess=sess) action = action[0] # Simulate one time step from action nex_state, reward, done, info = env.step(action=action) # Collect data for a trajectory episode_states.append(state) episode_actions.append(action) episode_rewards.append(reward) state = nex_state episode_steps += 1 if done: break # Compute returns (Reward-To-Go or Full trajectory-centric) if isRTG: episode_returns = get_discounted_rewards_to_go(episode_rewards, gamma=gamma) else: episode_returns = [ get_sum_of_reward(episode_rewards, gamma=gamma) ] * len(episode_rewards) # Compute Value function per trajectory if isNNBaseLine: episode_baseline = value_model.predict(state=episode_states, sess=sess) # Normalize baseline estimation w.r.t returns # episode_baseline = normalize(episode_baseline, np.mean(episode_returns), np.std(episode_returns)) # Get advantage episode_advantages = np.squeeze(episode_returns) - np.squeeze( episode_baseline) else: episode_advantages = episode_returns.copy() # Normalize advantage if isNormalizeAdvantage: # episode_advantages = normalize(episode_advantages) episode_advantages = (episode_advantages - np.mean(episode_advantages)) \ / (np.std(episode_advantages) + 1e-8) # # Normalize Target (Q) # episode_returns = normalize(episode_returns) # Append to trajectory batch trajectory = { "state": np.array(episode_states), "action": np.array(episode_actions), "reward": np.array(episode_rewards), "return": np.array(episode_returns), "advantage": np.array(episode_advantages) } trajectories.append(trajectory) # Increase episode step steps_in_batch += len(trajectory["reward"]) episode += 1 # Close video recording if video_recorder: video_recorder.close() video_recorder = None # Break loop when enough episode batch is collected if 
episode > n_batch: # steps_in_batch > min_steps_in_batch: break # Batching sample trajectories # Generate 'ready-to-use' batch arrays for state, action, and reward # pg_model.sample_trajectories(trajectories) batch_states = np.concatenate([traj["state"] for traj in trajectories]) batch_actions = np.concatenate( [traj["action"] for traj in trajectories]) batch_returns = np.concatenate( [traj["return"] for traj in trajectories]) batch_advantages = np.concatenate( [traj["advantage"] for traj in trajectories]) # # Compute trajectory-centric reward sum # if isRTG: # batch_rewards = np.concatenate([ # get_discounted_rewards_to_go(traj["reward"], gamma) for traj in trajectories]) # else: # batch_rewards = np.concatenate([ # [get_sum_of_reward(traj["reward"], gamma=gamma)] * len(traj["reward"]) # for traj in trajectories # ]) # Compute estimated V(s) and A(s) (= Sum(rewards) - V(s)) # if isNNBaseLine: # # Compute NN baseline estimation # value_estimates = value_model.predict(state=batch_states) # # value_estimates = normalize(value_estimates, np.mean(value_estimates), np.std(value_estimates)) # # value_estimates = value_estimates * np.std(value_estimates, axis=0) + np.mean(value_estimates, axis=0) # # # Compute advantages and normalize it per trajectory # advantages = np.squeeze(batch_rewards) - np.squeeze(value_estimates) # # advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8) # else: # advantages = batch_rewards.copy() # if isNormalizeAdvantage: # # advantages = normalize(advantages) # advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8) # if isNNBaseLine: # # Normalize rewards (targets) and update value estimator # # batch_rewards = (batch_rewards - np.mean(batch_rewards)) / (np.std(batch_rewards) + 1e-8) # batch_rewards = normalize(batch_rewards) # # # Update value estimator # value_model.update(states=batch_states, targets=batch_rewards) # Update value estimator if isNNBaseLine: value_model.update(states=batch_states, targets=batch_returns, sess=sess) # Update policy estimator pg_model.update(states=batch_states, actions=batch_actions, advantages=batch_advantages, sess=sess) toc = time.clock() elapsed_sec = toc - tic rewards = [traj["reward"].sum() for traj in trajectories] advantages = [traj["advantage"].sum() for traj in trajectories] episode_lengths = [len(traj["reward"]) for traj in trajectories] # # Print progress # print("------------Return--------------") # print("Average_Return", np.mean(rewards)) # print("Std_Return", np.std(rewards)) # print("Max_Return", np.max(rewards)) # print("Min_Return", np.min(rewards)) # print("------------Advs----------------") # print("Average_Advs", np.mean(advantages)) # print("Std_Advs", np.std(advantages)) # print("Max_Advs", np.max(advantages)) # print("Min_Advs", np.min(advantages)) # print("------------Ep------------------") # print("Num_Total_Ep", len(episode_lengths)) # print("Mean_Ep_Len", np.mean(episode_lengths)) # print("Std_Ep_Len", np.std(episode_lengths)) # print("Sec_per_interaction: ", elapsed_sec) # Log progress logz.log_tabular("Time", elapsed_sec) logz.log_tabular("Iteration", iter) logz.log_tabular("Average_Return", np.mean(rewards)) logz.log_tabular("Std_Return", np.std(rewards)) logz.log_tabular("Max_Return", np.max(rewards)) logz.log_tabular("Min_Return", np.min(rewards)) logz.log_tabular("Average_Advs", np.mean(advantages)) logz.log_tabular("Std_Advs", np.std(advantages)) logz.log_tabular("Max_Advs", np.max(advantages)) logz.log_tabular("Min_Advs", np.min(advantages)) 
logz.log_tabular("Num_Total_Ep", len(episode_lengths)) logz.log_tabular("Mean_Ep_Len", np.mean(episode_lengths)) logz.log_tabular("Std_Ep_Len", np.std(episode_lengths)) logz.log_tabular("Sec_per_iteration: ", elapsed_sec) logz.dump_tabular() logz.pickle_tf_vars()
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32 ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # i need here to give a directory # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds #seed: it makes sure that you will not have the same random number twice/ ref:https://en.wikipedia.org/wiki/Random_seed tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) 
# # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: sy_logits_na = build_mlp(sy_ob_no, ac_dim, "discrete", n_layers, size, activation=tf.nn.relu, output_activation=None) # logits must stay unconstrained; a ReLU output activation here would zero out every negative logit sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1, seed), [-1]) sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na) # note: cross entropy is -log pi(a|s), so minimizing the loss below ascends the PG objective else: # YOUR_CODE_HERE #sy_mean =-tf.reduce_mean(build_mlp(sy_ob_no,ac_dim,"cont",n_layers,size,activation=tf.tanh)) #sy_logstd = tf.Variable(tf.random_uniform([None, ac_dim])) # logstd should just be a trainable variable, not a network output. #sy_sampled_ac = tf.random_normal([None, ac_dim],sy_mean,sy_logstd,dtype=tf.float32,seed=seed) #sy_logprob_n = -0.5*(sy_sampled_ac-sy_ac_na)^2 # Hint: Use the log probability under a multivariate gaussian. print("Continuous case not implemented") #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# loss = tf.reduce_mean(tf.multiply(sy_logprob_n, sy_adv_n)) # Loss function that we'll differentiate to get the policy gradient. update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze(build_mlp( sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline.
# YOUR_CODE_HERE baseline_target = tf.placeholder(shape=[None], name="tr", dtype=tf.float32) b_loss = tf.losses.mean_squared_error(labels=baseline_target, predictions=baseline_prediction) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(b_loss) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:`; opening a bare `with tf.Session() as sess:` here would discard tf_config sess.run(tf.global_variables_initializer()) #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode=( (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: [ob]}) ac = ac[0] acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) print("ac_na shape:", ac_na.shape) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
# # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# # YOUR_CODE_HERE Q_t = [] if reward_to_go: # Case 2: reward-to-go PG -- Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} for path in paths: rew = path["reward"] T = len(rew) for t in range(T): Q_t.append(sum(gamma**(t_ - t) * rew[t_] for t_ in range(t, T))) else: # Case 1: trajectory-based PG -- every timestep of a path gets the full return Ret(tau) for path in paths: rew = path["reward"] T = len(rew) ret = sum(gamma**t_ * rew[t_] for t_ in range(T)) Q_t.extend([ret] * T) q_n = np.asarray(Q_t) #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}) b_n = preprocessing.scale(b_n) * np.std(q_n) + np.mean(q_n) # Hint #bl1: match the Q-value statistics adv_n = q_n - b_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. # YOUR_CODE_HERE adv_n = preprocessing.scale(adv_n) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)
# YOUR_CODE_HERE target_tmp=1+gamma*b_n target_tmp=preprocessing.scale(target_tmp) sess.run(b_loss,feed_dict={sy_ob_no:ob_no,baseline_target:target_tmp}) sess.run(baseline_update_op,feed_dict={sy_ob_no:ob_no,baseline_target:target_tmp}) #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE #print(sess.run(sy_logits_na,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n})) #print(sess.run(sy_sampled_ac,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n})) loss_=sess.run(loss,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n}) sess.run(update_op,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n}) loss_=sess.run(loss,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n}) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.log_tabular("loss_",loss_) logz.dump_tabular() logz.pickle_tf_vars()
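# The two Q-value conventions above differ only in whether credit is spread over the
# whole trajectory or restricted to rewards collected after the action. A minimal
# standalone sketch (plain numpy, independent of the training code above) makes the
# difference easy to check by hand on a three-step trajectory:

import numpy as np

gamma = 0.9
rewards = np.array([1.0, 2.0, 3.0])                 # a three-step trajectory

# Case 1: every timestep gets the full discounted return Ret(tau)
ret_tau = np.sum(gamma ** np.arange(3) * rewards)   # 1 + 0.9*2 + 0.81*3 = 5.23
q_traj = np.full(3, ret_tau)                        # [5.23, 5.23, 5.23]

# Case 2: reward-to-go, computed with a single backward pass
q_rtg = np.zeros(3)
running = 0.0
for t in reversed(range(3)):
    running = rewards[t] + gamma * running
    q_rtg[t] = running                              # [5.23, 4.7, 3.0]

assert np.isclose(q_rtg[0], ret_tau)                # the two agree only at t = 0

# Reward-to-go typically reduces variance because an action is no longer credited for
# rewards that were collected before it was taken.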
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gae_lambda=1.0,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name='adv', dtype=tf.float32)

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(input_placeholder=sy_ob_no, output_size=ac_dim,
                                 scope='discrete', n_layers=n_layers, size=size)
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), [-1])  # Hint: Use the tf.multinomial op
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=sy_logits_na)
    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(input_placeholder=sy_ob_no, output_size=ac_dim,
                            scope='continuous', n_layers=n_layers, size=size)
        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.get_variable(name='logstd', shape=[ac_dim], dtype=tf.float32)
        sy_sampled_ac = tf.random_normal(shape=tf.shape(sy_mean), mean=sy_mean,
                                         stddev=tf.exp(sy_logstd))
        # Hint: Use the log probability under a multivariate gaussian.
        # Note: MultivariateNormalDiag takes the per-dimension stddev as `scale_diag`.
        dist = tf.contrib.distributions.MultivariateNormalDiag(
            loc=sy_mean, scale_diag=tf.exp(sy_logstd))
        sy_logprob_n = dist.log_prob(sy_ac_na)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    # Loss function that we'll differentiate to get the policy gradient.
    loss = tf.reduce_mean(-sy_logprob_n * sy_adv_n)
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        baseline_targets = tf.placeholder(shape=[None], name='targets', dtype=tf.float32)
        baseline_loss = tf.nn.l2_loss(baseline_prediction - baseline_targets)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               allow_soft_placement=True,
                               log_device_placement=False)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        # YOUR_CODE_HERE
        q_n = []
        for path in paths:
            r = path['reward']
            max_step = len(r)
            q = np.zeros(max_step)
            q[-1] = r[-1]
            for t in reversed(range(max_step - 1)):
                q[t] = r[t] + gamma * q[t + 1]
            if reward_to_go:
                q_n.extend(q)
            else:
                # Trajectory-based PG: every timestep gets the full return Ret(tau).
                q_n.extend([q[0]] * max_step)
        q_n = np.array(q_n)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            # The baseline is fit to standardized targets below, so rescale its output
            # to match the statistics of the current batch of Q-values (Hint #bl1).
            b_n = b_n * np.std(q_n) + np.mean(q_n)
            adv_n = []
            idx = 0
            for path in paths:
                r = path['reward']
                max_step = len(r)
                adv = np.zeros(max_step)
                # Terminal step: no bootstrap from a successor state.
                adv[-1] = r[-1] - b_n[idx + max_step - 1]
                for t in reversed(range(max_step - 1)):
                    # GAE delta: r_t + gamma * V(s_{t+1}) - V(s_t)
                    delta = r[t] + gamma * b_n[idx + t + 1] - b_n[idx + t]
                    adv[t] = delta + gae_lambda * gamma * adv[t + 1]
                adv_n.extend(adv)
                idx += max_step
            adv_n = np.array(adv_n)
            q_n = b_n + adv_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            mean_adv = np.mean(adv_n, axis=0)
            std_adv = np.std(adv_n, axis=0)
            adv_n = (adv_n - mean_adv) / (std_adv + 1e-7)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # Fit the baseline to the standardized Q-values of the current batch.
            q_n_mean = np.mean(q_n)
            q_n_std = np.std(q_n)
            q_n = (q_n - q_n_mean) / (q_n_std + 1e-7)
            sess.run(baseline_update_op,
                     feed_dict={sy_ob_no: ob_no, baseline_targets: q_n})

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # YOUR_CODE_HERE
        sess.run(update_op,
                 feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
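# The advantage recursion in the baseline branch above is generalized advantage
# estimation, GAE(lambda). A self-contained sketch of the standard form follows (plain
# numpy; the `values` array with one extra bootstrap entry is an assumption for the
# sketch, not this file's data layout): lam=0 reduces to the one-step TD error, lam=1
# to the discounted Monte-Carlo return minus the baseline.

import numpy as np

def gae_advantages(rewards, values, gamma=0.99, lam=0.95):
    """GAE(lambda) for one trajectory.

    rewards: shape [T]
    values:  shape [T + 1]; values[T] is the bootstrap value of the final state
             (0 for a true terminal state).
    """
    T = len(rewards)
    adv = np.zeros(T)
    running = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        running = delta + gamma * lam * running
        adv[t] = running
    return adv

# With lam=1 and a zero baseline this recovers the discounted returns:
print(gae_advantages(np.ones(3), np.zeros(4), gamma=0.9, lam=1.0))  # [2.71, 1.9, 1.0]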
def train_PG( exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, mini_batch_size, max_path_length, learning_rate, num_ppo_updates, num_value_iters, animate, logdir, normalize_advantages, nn_critic, seed, n_layers, size, gru_size, history, num_tasks, l2reg, recurrent, generalized, granularity ): start = time.time() #========================================================================================# # Set Up Logger #========================================================================================# setup_logger(logdir, locals()) #========================================================================================# # Set Up Env #========================================================================================# # Make the gym environment envs = {'pm': PointEnv, 'pm-obs': ObservedPointEnv, } env = envs[env_name](num_tasks) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) random.seed(seed) env.seed(seed) # Maximum length for episodes max_path_length = max_path_length # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] task_dim = len(env._goal) # rude, sorry #========================================================================================# # Initialize Agent #========================================================================================# computation_graph_args = { 'n_layers': n_layers, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'task_dim': task_dim, 'size': size, 'gru_size': gru_size, 'learning_rate': learning_rate, 'history': history, 'num_value_iters': num_value_iters, 'l2reg': l2reg, 'recurrent': recurrent, } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, 'generalized': generalized, 'granularity': granularity, } estimate_return_args = { 'gamma': gamma, 'nn_critic': nn_critic, 'normalize_advantages': normalize_advantages, } agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args) # build computation graph agent.build_computation_graph() # tensorflow: config, session, variable initialization agent.init_tf_sess() #========================================================================================# # Training Loop #========================================================================================# def unpack_sample(data): ''' unpack a sample from the replay buffer ''' ob = data["observations"] ac = data["actions"] re = data["rewards"] hi = data["hiddens"] ma = 1 - data["terminals"] return ob, ac, re, hi, ma # construct PPO replay buffer, perhaps rude to do outside the agent ppo_buffer = PPOReplayBuffer(agent.replay_buffer) total_timesteps = 0 for itr in range(n_iter): # for PPO: flush the replay buffer! 
ppo_buffer.flush() # sample trajectories to fill agent's replay buffer print("********** Iteration %i ************"%itr) stats = [] for _ in range(num_tasks): s, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch) total_timesteps += timesteps_this_batch stats += s # compute the log probs, advantages, and returns for all data in agent's buffer # store in ppo buffer for use in multiple ppo updates # TODO: should move inside the agent probably data = agent.replay_buffer.all_batch() ob_no, ac_na, re_n, hidden, masks = unpack_sample(data) fixed_log_probs = agent.sess.run(agent.sy_lp_n, feed_dict={agent.sy_ob_no: ob_no, agent.sy_hidden: hidden, agent.sy_ac_na: ac_na}) q_n, adv_n = agent.estimate_return(ob_no, re_n, hidden, masks) ppo_buffer.add_samples(fixed_log_probs, adv_n, q_n) # update with mini-batches sampled from ppo buffer for _ in range(num_ppo_updates): data = ppo_buffer.random_batch(mini_batch_size) ob_no, ac_na, re_n, hidden, masks = unpack_sample(data) fixed_log_probs = data["log_probs"] adv_n = data["advantages"] q_n = data["returns"] log_probs = agent.sess.run(agent.sy_lp_n, feed_dict={agent.sy_ob_no: ob_no, agent.sy_hidden: hidden, agent.sy_ac_na: ac_na}) agent.update_parameters(ob_no, hidden, ac_na, fixed_log_probs, q_n, adv_n) # compute validation statistics print('Validating...') val_stats = [] for _ in range(num_tasks): vs, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch // 10, is_evaluation=True) val_stats += vs # save trajectories for viz with open("output/{}-epoch{}.pkl".format(exp_name, itr), 'wb') as f: pickle.dump(agent.val_replay_buffer.all_batch(), f, pickle.HIGHEST_PROTOCOL) agent.val_replay_buffer.flush() # Log TRAIN diagnostics returns = [sum(s["rewards"]) for s in stats] final_rewards = [s["rewards"][-1] for s in stats] ep_lengths = [s['ep_len'] for s in stats] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("FinalReward", np.mean(final_rewards)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) # Log VAL diagnostics val_returns = [sum(s["rewards"]) for s in val_stats] val_final_rewards = [s["rewards"][-1] for s in val_stats] logz.log_tabular("ValAverageReturn", np.mean(val_returns)) logz.log_tabular("ValFinalReward", np.mean(val_final_rewards)) logz.dump_tabular() logz.pickle_tf_vars()
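# The PPO-specific bookkeeping above (`fixed_log_probs` cached at sampling time, then
# several mini-batch updates against the same data) only pays off with a clipped
# surrogate objective. `Agent.update_parameters` is not shown in this excerpt, so the
# TF1-style sketch below is an assumption about its usual form, not this repo's code;
# all placeholder names here are illustrative.

import tensorflow as tf

sy_lp_n = tf.placeholder(tf.float32, [None])        # log pi(a|s) under the current policy
sy_fixed_lp_n = tf.placeholder(tf.float32, [None])  # log pi(a|s) frozen at sampling time
sy_adv_n = tf.placeholder(tf.float32, [None])       # advantage estimates
clip_eps = 0.2

ratio = tf.exp(sy_lp_n - sy_fixed_lp_n)             # importance ratio pi / pi_old
surr1 = ratio * sy_adv_n
surr2 = tf.clip_by_value(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * sy_adv_n
ppo_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))  # maximize the clipped surrogate

# The cached log-probs keep the ratio honest across the `num_ppo_updates` passes: it is
# the ratio, not the raw log-prob, that carries the off-policy correction.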
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None, clip_param=0.2 , entcoeff=0.0, gamma=0.99, lam=0.95, optim_epochs=10, optim_batchsize=64, schedule='linear', bc_lr=1e-3, ppo_lr=3e-4, timesteps_per_actorbatch=1000, MPC = True, BEHAVIORAL_CLONING = True, PPO = True, ): start = time.time() logz.configure_output_dir(logdir) print("-------- env info --------") print("observation_space: ", env.observation_space.shape) print("action_space: ", env.action_space.shape) print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING) print("PPO: ", PPO) print("MPC-AUG: ", MPC) print(" ") # initialize buffers model_data_buffer = DataBufferGeneral(1000000, 5) ppo_data_buffer = DataBufferGeneral(10000, 4) bc_data_buffer = DataBufferGeneral(BC_BUFFER_SIZE, 2) # random sample path print("collecting random data ..... ") random_controller = RandomController(env) paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) # add into buffer for path in paths: for n in range(len(path['observations'])): model_data_buffer.add([path['observations'][n], path['actions'][n], path['rewards'][n], path['next_observations'][n], path['next_observations'][n] - path['observations'][n]]) print("model data buffer size: ", model_data_buffer.size) normalization = compute_normalization(model_data_buffer) #======================================================== # # Build dynamics model and MPC controllers and Behavioral cloning network. # # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True sess = tf.Session(config=tf_config) dyn_model = NNDynamicsRewardModel(env=env, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) policy_nn = MlpPolicy(sess=sess, env=env, hid_size=256, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff) mpc_ppo_controller = MPCcontrollerPolicyNetReward(env=env, dyn_model=dyn_model, policy_net=policy_nn, self_exp=False, horizon=mpc_horizon, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. 
# sess.__enter__() tf.global_variables_initializer().run() # init or load checkpoint with saver saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR) if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint") if not os.path.exists(CHECKPOINT_DIR): os.mkdir(CHECKPOINT_DIR) #======================================================== # # Prepare for rollouts # episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards max_timesteps = num_paths_onpol * env_horizon bc = False ppo_mpc = False mpc_returns = 0 for itr in range(onpol_iters): print(" ") print("onpol_iters: ", itr) if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) print("bc learning_rate: ", bc_lr) print("ppo learning_rate: ", ppo_lr) ################## fit mpc model if MPC: dyn_model.fit(model_data_buffer) ################## ppo seg data if PPO: ppo_data_buffer.clear() # ppo_seg = traj_segment_generator_ppo(policy_nn, env, env_horizon) mpc = False ppo_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller, bc_data_buffer, env, mpc, ppo_mpc, env_horizon) add_vtarg_and_adv(ppo_seg, gamma, lam) ob, ac, rew, nxt_ob, atarg, tdlamret = \ ppo_seg["ob"], ppo_seg["ac"], ppo_seg["rew"], ppo_seg["nxt_ob"], ppo_seg["adv"], ppo_seg["tdlamret"] atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate # add into buffer for n in range(len(ob)): ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]]) if MPC: model_data_buffer.add([ob[n], ac[n], rew[n], nxt_ob[n], nxt_ob[n]-ob[n]]) ################## mpc augmented seg data if itr % MPC_AUG_GAP == 0 and MPC: print("MPC AUG PPO") ppo_mpc = True mpc = True mpc_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller, bc_data_buffer, env, mpc, ppo_mpc, env_horizon) add_vtarg_and_adv(mpc_seg, gamma, lam) ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = mpc_seg["ob"], mpc_seg["ac"], mpc_seg["mpcac"], mpc_seg["rew"], mpc_seg["nxt_ob"], mpc_seg["adv"], mpc_seg["tdlamret"] atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate # add into buffer for n in range(len(ob)): # if PPO: # ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]]) if BEHAVIORAL_CLONING and bc: bc_data_buffer.add([ob[n], mpcac[n]]) if MPC: model_data_buffer.add([ob[n], mpcac[n], rew[n], nxt_ob[n], nxt_ob[n]-ob[n]]) mpc_returns = mpc_seg["ep_rets"] seg = ppo_seg # check if seg is good ep_lengths = seg["ep_lens"] returns = seg["ep_rets"] # saver.save(sess, CHECKPOINT_DIR) if BEHAVIORAL_CLONING: if np.mean(returns) > 100: bc = True else: bc = False print("BEHAVIORAL_CLONING: ", bc) bc_return = behavioral_cloning_eval(sess, env, policy_nn, env_horizon) if bc_return > 100: ppo_mpc = True else: ppo_mpc = False ################## optimization print("ppo_data_buffer size", ppo_data_buffer.size) print("bc_data_buffer size", bc_data_buffer.size) print("model data buffer size: ", model_data_buffer.size) # optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(policy_nn, "ob_rms"): policy_nn.ob_rms.update(ob) # update running mean/std for policy policy_nn.assign_old_eq_new() # set old parameter values to new 
        for op_ep in range(optim_epochs):
            # losses = []  # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):

            if PPO:
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample(optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(sample_ob_no, sample_ac_na, sample_adv_n,
                                                        sample_b_n_target, cur_lrmult, ppo_lr * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING and bc:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(optim_batchsize)
                policy_nn.update_bc(sample_ob_no, sample_ac_na, bc_lr * cur_lrmult)

            if op_ep % 100 == 0 and BEHAVIORAL_CLONING and bc:
                print('epoch: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        ################## print and save data

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        # if np.mean(returns) > 1000:
        #     filename = "seg_data.pkl"
        #     pickle.dump(seg, open(filename, 'wb'))
        #     print("saved", filename)

        logz.log_tabular("TimeSoFar", time.time() - start)
        logz.log_tabular("TimeEp", time.time() - tstart)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("MpcReturn", np.mean(mpc_returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()

        tstart = time.time()
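# `add_vtarg_and_adv` is called above but not defined in this excerpt. In OpenAI
# Baselines' PPOSGD, which this loop's segment format appears to follow, it computes
# GAE(lambda) advantages and TD(lambda) value targets in place; the sketch below
# follows that convention, and the segment field names are an assumption about this
# repo's rollout format.

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    """Add 'adv' (GAE advantages) and 'tdlamret' (value targets) to a rollout segment."""
    new = np.append(seg["new"], 0)                     # episode-start flags, shape [T + 1]
    vpred = np.append(seg["vpred"], seg["nextvpred"])  # values plus bootstrap, [T + 1]
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]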
def train_PG( exp_name='', batch_size=250, n_episodes=25000, learning_rate=1e-3, logdir=None, seed=0, # network arguments n_layers=2, size=64): env = Environment() agent1 = Agent(env, n_layers, size, learning_rate, "agent1") agent2 = Agent(env, n_layers, size, learning_rate, "agent2") agent1_Nash = Agent(env, 3, 32, 1e-2, "agent1_Nash") agent2_Nash = Agent(env, 3, 32, 1e-2, "agent2_Nash") start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) n_iter = n_episodes // batch_size #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# for itr in range(n_iter): print("********** Iteration %i ************" % itr) #simulate a batch of temperature-gas price states s = env.samplestatess(batch_size) ag1_prices, _ = agent1.sample_actions(sess, s) ag2_prices, _ = agent2.sample_actions(sess, s) #====================================================================================# # Feed agents' actions into the market simulator and obtain corresponding rewards #====================================================================================# #Convert agent RTM actions to corresponding prices ag1_rewards, ag2_rewards = get_rewards(env, ag1_prices, ag2_prices) #====================================================================================# # # Advantage Normalization #====================================================================================# ag1_adv = normalize(ag1_rewards) ag2_adv = normalize(ag2_rewards) #====================================================================================# # # Performing the Policy Update #====================================================================================# #update policy parameters for agent1 #if (itr % 20 < 10): loss1 = agent1.improve_policy(sess, s, ag1_adv, ag1_prices) #update policy parameters for agent2 #else: loss2 = agent2.improve_policy(sess, s, ag2_adv, ag2_prices) # Log diagnostics logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageProfit_agt1", np.mean(ag1_rewards)) logz.log_tabular("AverageProfit_agt2", np.mean(ag2_rewards)) logz.log_tabular("Agt1_StdReturn", np.std(ag1_rewards)) logz.log_tabular("Agt2_StdReturn", np.std(ag2_rewards)) logz.log_tabular("Agt1_MaxReturn", np.max(ag1_rewards)) logz.log_tabular("Agt2_MaxReturn", np.max(ag2_rewards)) logz.log_tabular("Agt1_MinReturn", np.min(ag1_rewards)) logz.log_tabular("Agt2_MinReturn", np.min(ag2_rewards)) logz.dump_tabular() logz.pickle_tf_vars() m1, m2, m1_m, m2_m, ag1_p, ag2_p = get_smart_rewards( sess, agent1, agent2, env) print("Agent1 Stochastic Profit: " + repr(m1)) print("Agent2 Stochastic Profit: " + repr(m2)) 
print("Agent1 Deterministic Profit: " + repr(m1_m)) print("Agent2 Deterministic Profit: " + repr(m2_m)) print("Agent1 Mean Price") print(ag1_p) print("Agent2 Prices") print(ag2_p) print("Assessing degree of deviation from Nash Eq") ag1_imp, ag2_imp = assess_policy_accuracy(sess, agent1, agent1_Nash, agent2, agent2_Nash, env) print("Agent1 Accuracy: " + repr(ag1_imp)) print("Agent2 Accuracy: " + repr(ag2_imp))
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, gae=True, lambd=1.0, threads=1, max_threads_pool=16, thread_timeout=None, offpol=False, n_it_pol=1, n_it_pol_fn=None, wis=True, record=None, # network arguments n_layers=1, size=32, ): def n_threads_to_run(timesteps_this_batch): tsteps_left = min_timesteps_per_batch - timesteps_this_batch max_threads = int(np.ceil((tsteps_left) / max_path_length)) if threads < 1 or threads > max_threads: return max_threads else: return threads start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] # args = inspect.signature(train_PG).parameters locals_ = locals() params = { k: locals_[k] if (k in locals_ and not callable(locals_[k])) else None for k in args } logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Maximum length for episodes max_path_length = max_path_length or gym.make( env_name).spec.max_episode_steps # Make the gym environment env = EnvList(env_name, n_threads_to_run(0), logdir, record if threads == 1 else None) # Is this env continuous, or discrete? discrete = env.discrete() #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space ac_dim = env.action_space #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) sy_prob_old = tf.placeholder(shape=[None], name='pol_old', dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. 
For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if n_it_pol < 1 or not offpol: n_it_pol = 1 if discrete: sy_logits_na = build_mlp(sy_ob_no, ac_dim, 'disc_policy', n_layers, size) sy_sampled_ac = tf.squeeze(tf.multinomial( tf.log(tf.nn.softmax(sy_logits_na)), 1), axis=1) sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits( labels=sy_ac_na, logits=sy_logits_na) sy_prob_n = tf.exp(sy_logprob_n) if offpol else sy_logprob_n else: sy_mean = build_mlp(sy_ob_no, ac_dim, 'cont_policy', n_layers=n_layers, size=size) sy_logstd = tf.get_variable('logstd', shape=[ac_dim], dtype=np.float32) sy_std = tf.exp(sy_logstd) sy_sampled_ac = sy_mean + tf.multiply( tf.random_normal(shape=tf.shape(sy_mean)), sy_std) mvn = tf.contrib.distributions.MultivariateNormalDiag( loc=sy_mean, scale_diag=sy_std) sy_prob_n = mvn.prob(sy_ac_na) if offpol else mvn.log_prob(sy_ac_na) #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# if offpol: sy_policy_n = sy_prob_n / (sy_prob_old + CONST) loss = -tf.multiply(sy_policy_n, sy_adv_n) loss = tf.reduce_sum(loss) / tf.reduce_sum( sy_policy_n) if wis else tf.reduce_mean(loss) else: loss = tf.reduce_mean(-tf.multiply(sy_prob_n, sy_adv_n)) update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if gae: nn_baseline = True if nn_baseline: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. 
sy_bl_target_n = tf.placeholder(shape=[None], name="bl_target", dtype=tf.float32) baseline_loss = tf.losses.mean_squared_error(sy_bl_target_n, baseline_prediction) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize( baseline_loss) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 col = PathCollector(sess, sy_sampled_ac, sy_ob_no, max_path_length) total_n_it_pol = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: n_threads = n_threads_to_run(timesteps_this_batch) if threads == 1: path = col.__call__(env, animate=(animate and len(paths) == 0 and itr % 10)) paths.append(path) else: with ThreadPoolExecutor(max_threads_pool) as exec: futures = [ exec.submit(col.__call__, e) for e in env.envs[:n_threads] ] for future in as_completed(futures, timeout=thread_timeout): paths.append(future.result()) col_paths = paths[-n_threads:] timesteps_this_batch += sum( [pathlength(path) for path in col_paths]) if timesteps_this_batch >= min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate( [path["observation"] for path in paths if pathlength(path) > 0]) ac_na = np.concatenate( [path["action"] for path in paths if pathlength(path) > 0]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#

        q_ns = []
        for path in paths:
            path_len = pathlength(path)
            rews = path['reward']
            discs = np.power(gamma, np.arange(path_len))
            if reward_to_go:
                qn = [np.sum(discs[:path_len - t] * rews[t:]) for t in range(path_len)]
            else:
                qn = np.sum(discs * rews) * np.ones(path_len)
            q_ns.append(qn)
        q_n = np.concatenate(q_ns)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n = np.array(sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}))
            b_n = b_n * np.std(q_n) + np.mean(q_n)

            if gae:
                adv_ns = []
                idx = 0  # running offset of each path's first timestep within b_n
                for path in paths:
                    path_len = pathlength(path)
                    rews = path['reward']
                    gamma_lambda_discs = np.power(gamma * lambd, np.arange(path_len))
                    deltas = (rews[:-1] + gamma * b_n[idx + 1:idx + path_len]
                              - b_n[idx:idx + path_len - 1])
                    adv_n = [np.sum(gamma_lambda_discs[:path_len - 1 - t] * deltas[t:])
                             for t in range(path_len - 1)] + [0]
                    adv_ns.append(adv_n)
                    idx += path_len
                adv_n = np.concatenate(adv_ns)
                q_gae = np.array(adv_n + b_n)
            else:
                adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + CONST)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # If a neural network baseline is used, set up the targets and the inputs for
            # the baseline, and fit it to the current batch for use at the next iteration,
            # via the baseline_update_op defined earlier.
            # Hint #bl2: instead of targeting raw Q-values directly, rescale the targets
            # to have mean zero and std=1. (Goes with Hint #bl1 above.)
            # experiment with different targets
            # q_n = (q_n - np.mean(q_n)) / (np.std(q_n) + CONST)
            # q_n = (q_gae - np.mean(q_gae)) / (np.std(q_gae) + CONST)
            # q_n = (q_gae - np.mean(q_n)) / (np.std(q_n) + CONST)
            # q_n = (b_n - np.mean(q_n)) / (np.std(q_n) + CONST)
            # q_n = (b_n - np.mean(q_gae)) / (np.std(q_gae) + CONST)
            # q_gae only exists when GAE is on; fall back to q_n's own statistics otherwise.
            ref = q_gae if gae else q_n
            q_n = (q_n - np.mean(ref)) / (np.std(ref) + CONST)
            _ = sess.run([baseline_update_op],
                         feed_dict={sy_ob_no: ob_no, sy_bl_target_n: q_n})

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        print('pg n_it_pol', n_it_pol)
        curr_n_it_pol = n_it_pol_fn(itr) if n_it_pol_fn else n_it_pol
        total_n_it_pol += curr_n_it_pol
        print('pg curr_n_it_pol', curr_n_it_pol)

        policy_feed_dict = {sy_ob_no: ob_no, sy_ac_na: ac_na}
        loss_feed_dict = {**policy_feed_dict, sy_adv_n: adv_n}
        if offpol:
            policy_old = sess.run(sy_prob_n, feed_dict=policy_feed_dict)
            loss_feed_dict = {**loss_feed_dict, sy_prob_old: policy_old}
        l = sess.run(loss, feed_dict=loss_feed_dict)
        for off_it in range(curr_n_it_pol):
            _ = sess.run(update_op, feed_dict=loss_feed_dict)
        l_upd = sess.run(loss, feed_dict=loss_feed_dict)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("PolicyIter", total_n_it_pol - 1)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.log_tabular("Loss", l)
        logz.log_tabular("LossUpdated", l_upd)
        logz.dump_tabular(prec=8)
        logz.pickle_tf_vars()
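# The `wis` flag in this variant switches the off-policy loss between ordinary and
# weighted (self-normalized) importance sampling. A standalone numpy sketch of the two
# estimators, with made-up probabilities, shows the trade-off the flag controls: the
# weighted form is biased but usually far lower-variance when the ratios are spread out.

import numpy as np

def is_estimates(p_new, p_old, adv, eps=1e-8):
    """Ordinary vs. weighted importance-sampling estimates of the surrogate objective."""
    w = p_new / (p_old + eps)                  # importance ratios, as in sy_policy_n
    ordinary = np.mean(w * adv)                # unbiased; variance grows with the ratios
    weighted = np.sum(w * adv) / np.sum(w)     # self-normalized; biased, lower variance
    return ordinary, weighted

p_old = np.array([0.5, 0.2, 0.3])
p_new = np.array([0.4, 0.4, 0.2])
adv = np.array([1.0, -0.5, 2.0])
print(is_estimates(p_new, p_old, adv))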
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, network_activation='tanh' ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds torch.manual_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #activation function for the network if network_activation=='relu': activation=torch.nn.functional.relu elif network_activation=='leaky_relu': activation=torch.nn.functional.leaky_relu else: activation=torch.nn.functional.tanh #todo: create policy actor=build_mlp(ob_dim, ac_dim, "actor",\ n_layers=n_layers, size=size, activation=activation, discrete=discrete) actor_loss=reinforce_loss actor_optimizer=torch.optim.Adam(actor.parameters(), lr=learning_rate) #todo: initilize Agent: #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: critic=build_mlp(ob_dim,1,"nn_baseline",\ n_layers=n_layers,size=size, discrete=discrete) critic_loss=nn.MSELoss() critic_optimizer=torch.optim.Adam(critic.parameters(), lr=learning_rate) #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards, log_probs = [], [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) ob = torch.from_numpy(ob).float().unsqueeze(0) obs.append(ob) ac, log_prob = actor.run(ob) acs.append(ac) log_probs.append(log_prob) #format the action from policy if discrete: ac = int(ac) else: ac = ac.squeeze(0).numpy() ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > 
max_path_length: break path = {"observation" : torch.cat(obs, 0), "reward" : torch.Tensor(rewards), "action" : torch.cat(acs, 0), "log_prob" : torch.cat(log_probs, 0)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch ob_no = torch.cat([path["observation"] for path in paths], 0) ac_na = torch.cat([path["action"] for path in paths], 0) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# q_n = [] for path in paths: rewards = path['reward'] num_steps = pathlength(path) R=[] if reward_to_go: for t in range(num_steps): R.append((torch.pow(gamma, torch.arange(num_steps-t))*rewards[t:]).sum().view(-1,1)) q_n.append(torch.cat(R)) else: q_n.append((torch.pow(gamma, torch.arange(num_steps)) * rewards).sum() * torch.ones(num_steps, 1)) q_n = torch.cat(q_n, 0) #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = critic(ob_no) q_n_std = q_n.std() q_n_mean = q_n.mean() b_n_scaled = b_n * q_n_std + q_n_mean adv_n = (q_n - b_n_scaled).detach() else: adv_n = q_n #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
# YOUR_CODE_HERE adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + np.finfo(np.float32).eps.item()) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE target = (q_n - q_n_mean) / (q_n_std + np.finfo(np.float32).eps.item()) critic_optimizer.zero_grad() c_loss = critic_loss(b_n, target) c_loss.backward() critic_optimizer.step() #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE log_probs = torch.cat([path["log_prob"] for path in paths], 0) actor_optimizer.zero_grad() loss = actor_loss(log_probs, adv_n, len(paths)) print(loss) loss.backward() actor_optimizer.step() # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
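# `reinforce_loss` is referenced above but not defined in this excerpt. Given that it
# receives the concatenated log-probs, the detached advantages, and the number of paths,
# a plausible implementation is the per-trajectory-averaged REINFORCE surrogate below;
# this is an assumption, not the repo's actual code.

import torch

def reinforce_loss(log_probs, adv_n, num_paths):
    """Negative advantage-weighted log-likelihood, averaged over trajectories.

    log_probs: log pi(a_t|s_t) for every timestep in the batch
    adv_n:     advantage estimates, already detached from the critic
    num_paths: number of trajectories, so the sum becomes a per-path average
    """
    return -(log_probs * adv_n).sum() / num_paths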
def train_PG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length, learning_rate, baseline_lr, reward_to_go, animate, logdir, normalize_advantages, nn_baseline, seed, n_layers, output_activation, size, save_models, save_best_model, resume_string, run_model_only, script_optimizing_dir, parallel, relative_positions, death_penalty, reward_circle, num_enemies, gb_discrete, gb_max_speed): start = time.time() if script_optimizing_dir is not None: logdir = logdir[:5] + script_optimizing_dir + '/' + logdir[5:] #========================================================================================# # Set Up Logger #========================================================================================# setup_logger(logdir, locals()) #========================================================================================# # Set Up Env #========================================================================================# # Make the gym environment if env_name == 'GB_game': env = GB_game(num_char=num_enemies, reward_circle=reward_circle, death_penalty=death_penalty, relative_positions=relative_positions, discrete=gb_discrete, max_speed=gb_max_speed) discrete = env.discrete if parallel == True: ray.register_custom_serializer( GB_game, use_pickle=True) # amazing. I needed to use this to get it to put_env = ray.put(env) else: env = gym.make(env_name) # Is this env continuous, or self.discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # pdb.set_trace() env.seed(seed) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # Initialize Agent #========================================================================================# computation_graph_args = { 'n_layers': n_layers, 'output_activation': output_activation, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'discrete': discrete, 'size': size, 'learning_rate': learning_rate, 'baseline_lr': baseline_lr, } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, } estimate_return_args = { 'gamma': gamma, 'reward_to_go': reward_to_go, 'nn_baseline': nn_baseline, 'normalize_advantages': normalize_advantages, } if parallel is True: num_cpus = psutil.cpu_count(logical=True) num_cpus = num_cpus - 1 print('the number of cpus is now' + str(num_cpus)) ray.init(num_cpus=num_cpus, ignore_reinit_error=True) pathlen_counter = Counter.remote() parallel_actors = [ Parallel_Actor.remote(computation_graph_args, sample_trajectory_args, estimate_return_args) for _ in range(num_cpus) ] agent = Parallel_Actor.remote(computation_graph_args, sample_trajectory_args, estimate_return_args) # This is the one used for updating the weights else: agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args) # build computation graph agent.build_computation_graph() # tensorflow: config, session, variable initialization agent.init_tf_sess() # Now we'll try to load if we are only running a model or if we are resuming training. 
if run_model_only is not None: agent.load_models_action(run_model_only) agent.running_only = True elif resume_string is not None: agent.load_models_action(resume_string) #setup for a parallel training loader. #========================================================================================# # Training Loop #========================================================================================# best_avg_return = -(5e10) total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) if parallel is True: pathlen_counter.reset_counter.remote() weights_copy = agent.get_weights.remote() ray.get([ p_agent.set_weights.remote(weights_copy) for p_agent in parallel_actors ]) weights = ray.get( [p_agent.get_weights.remote() for p_agent in parallel_actors]) for i in range(len(weights)): np.testing.assert_equal(weights[i], weights[0]) print('\n \n the weights have successfully been reset!! \n \n') paths = [] agent_outputs = [] for p_agent in parallel_actors: # Note this is not parallel! yet. agent_outputs.append( p_agent.sample_trajectories.remote(itr, put_env, pathlen_counter)) for output in agent_outputs: path_set, timesteps_this_batch = ray.get( output) #Gotta use pathset #Question: Would it be faster to do a self.env structure for parallel agents? [paths.append(path) for path in path_set] total_timesteps += timesteps_this_batch # wow so it's really helpful the paths come in contiguous segments. else: paths, timesteps_this_batch = agent.sample_trajectories(itr, env) total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths # Note that estimate_return could also be parallelized. if run_model_only is not None: continue ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) re_n = [path["reward"] for path in paths] if parallel: q_n, adv_n = ray.get(agent.estimate_return.remote(ob_no, re_n)) agent.update_parameters.remote(ob_no, ac_na, q_n, adv_n) else: q_n, adv_n = agent.estimate_return(ob_no, re_n) agent.update_parameters(ob_no, ac_na, q_n, adv_n) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) mean_return = np.mean(returns) if mean_return > best_avg_return: best_avg_return = mean_return if save_best_model == True: save_string = logdir[5:-2] if parallel: agent.save_models_action.remote(save_string) else: agent.save_models_action(save_string) logz.log_tabular("AverageReturn", mean_return) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) # My own if parallel is False: if hasattr(agent, 'batch_baseline_loss'): logz.log_tabular("BaselineLoss", agent.batch_baseline_loss) logz.log_tabular("UnscaledLoss", agent.batch_unscaled_loss) logz.log_tabular("Loss", agent.batch_loss) logz.dump_tabular() logz.pickle_tf_vars() # if script_optimizing == True: # print(np.max(returns)) # One potential issue here is that there won't be a local for the first iteration. we must make it # so. 
if save_models and not save_best_model: save_string = logdir[5:-2] if parallel: agent.save_models_action.remote(save_string) else: agent.save_models_action(save_string)
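The broadcast/gather pattern above relies on only a handful of Ray primitives. Below is a minimal, self-contained sketch of that pattern; the `RolloutWorker` actor is a hypothetical stand-in for `Parallel_Actor`, not the class used above, and the fake path data is for illustration only.

import numpy as np
import ray

ray.init(num_cpus=4, ignore_reinit_error=True)

@ray.remote
class RolloutWorker:
    """Hypothetical stand-in for Parallel_Actor: holds weights, samples rollouts."""
    def __init__(self):
        self.weights = None
    def set_weights(self, weights):
        self.weights = weights
    def sample(self, n_steps):
        # stand-in for sample_trajectories: return one fake "path"
        return [np.random.randn(n_steps)]

workers = [RolloutWorker.remote() for _ in range(4)]
weights_ref = ray.put(np.zeros(10))   # upload once; workers receive it by reference
ray.get([w.set_weights.remote(weights_ref) for w in workers])  # broadcast, wait for completion
futures = [w.sample.remote(100) for w in workers]              # dispatch rollouts concurrently
paths = []
for fut in futures:
    paths.extend(ray.get(fut))        # gather results in dispatch order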
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, num_threads_gen=1, multi_steps_gd=1, reuse_nn_bl=False): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# tf.reset_default_graph() sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. 
Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: # YOUR_CODE_HERE sy_logits_na = build_mlp(sy_ob_no, ac_dim, "nn", n_layers=n_layers, size=size) # Hint: Use the tf.multinomial op # reshape with shape [-1] flattens the samples into a batch-sized vector sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), shape=[-1]) # negated because sparse_softmax_cross_entropy_with_logits returns the negative log-probability sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits( logits=sy_logits_na, labels=sy_ac_na) else: # YOUR_CODE_HERE sy_mean = build_mlp(sy_ob_no, ac_dim, "nn", n_layers=n_layers, size=size) # logstd should just be a trainable variable, not a network output. sy_logstd = tf.get_variable('logstd', shape=[1, ac_dim], dtype=tf.float32, initializer=tf.zeros_initializer) sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal( tf.shape(sy_mean)) # Hint: Use the log probability under a multivariate gaussian. sy_z = (sy_ac_na - sy_mean) / tf.exp(sy_logstd) sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_z), axis=1) #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# # Loss function that we'll differentiate to get the policy gradient. # The negative sign makes minimizing this loss equivalent to maximizing expected return. loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n) update_op = tf.train.AdamOptimizer(learning_rate, name='AdamPolicy').minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: if not reuse_nn_bl: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) else: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size, reuse_hidden_layers=True, reuse_scope_name="nn")) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline.
# YOUR_CODE_HERE sy_target_bn = tf.placeholder(tf.float32, shape=[None], name='target_bn') loss_bn = tf.nn.l2_loss(sy_target_bn - baseline_prediction) baseline_update_op = tf.train.AdamOptimizer( learning_rate, name='AdamBL').minimize(loss_bn) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() # pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) paths = [] gen_start_time = time.time() if num_threads_gen == 1: # Collect paths until we have enough timesteps timesteps_this_batch = 0 while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) ac = ac[0] acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = { "observation": np.array(obs), "reward": np.array(rewards), "action": np.array(acs) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch else: # Multithreaded approach using a tf coordinator coord = tf.train.Coordinator() workers = [ TrajectionRunner(sess, sy_sampled_ac, sy_ob_no, env_name, max_path_length, min_timesteps_per_batch // num_threads_gen) for _ in range(num_threads_gen) ] for wrk in workers: wrk.start() coord.join(workers) # After the join, all workers have finished; collect their data timesteps_this_batch = 0 for wrk in workers: paths.extend(wrk.paths) timesteps_this_batch += wrk.total_timesteps # accumulate across all workers total_timesteps += wrk.total_timesteps gen_total_time = time.time() - gen_start_time # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for).
# # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # # ====================================================================================# # YOUR_CODE_HERE # wrong, every path leads to different rewards! def discount_rewards(rwds, rtg): q = np.zeros_like(rwds) s = 0 for t in reversed(range(rwds.shape[0])): s = s * gamma + rwds[t] q[t] = s if not rtg: q[:] = q[0] return q q_n = np.concatenate( [discount_rewards(path["reward"], reward_to_go) for path in paths]) # ====================================================================================# # ----------SECTION 5---------- # Computing Baselines # ====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}) b_n = rescale(normalize(b_n), q_n.mean(axis=0, keepdims=True), q_n.std(axis=0, keepdims=True)) adv_n = q_n - b_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. # YOUR_CODE_HERE adv_n = normalize(adv_n) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE norm_q_n = normalize(q_n) total_bn_loss = 0 for _ in range(multi_steps_gd): _, bn_loss = sess.run([baseline_update_op, loss_bn], feed_dict={ sy_ob_no: ob_no, sy_target_bn: norm_q_n }) total_bn_loss += bn_loss total_bn_loss /= multi_steps_gd #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. 
# # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE total_loss = 0 for _ in range(multi_steps_gd): _, current_loss = sess.run([update_op, loss], feed_dict={ sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n }) total_loss += current_loss total_loss /= multi_steps_gd # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("GenTime", gen_total_time) logz.log_tabular("Iteration", itr) logz.log_tabular("Loss", total_loss) if nn_baseline: logz.log_tabular("BNLoss", total_bn_loss) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
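The `discount_rewards` helper above computes both estimators in a single backward pass over the episode. A quick standalone NumPy check of the two cases (gamma is passed explicitly here, rather than captured from the enclosing scope as above; the expected outputs for gamma = 0.5 are worked out by hand):

import numpy as np

def discount_rewards(rwds, gamma, rtg):
    q = np.zeros_like(rwds, dtype=np.float64)
    s = 0.0
    for t in reversed(range(len(rwds))):
        s = s * gamma + rwds[t]   # running discounted sum from the end of the episode
        q[t] = s
    if not rtg:
        q[:] = q[0]               # trajectory-based PG: every step gets the full return
    return q

r = np.array([1.0, 1.0, 1.0])
print(discount_rewards(r, 0.5, rtg=True))   # [1.75, 1.5, 1.0]
print(discount_rewards(r, 0.5, rtg=False))  # [1.75, 1.75, 1.75]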
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, gae_lambda=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # _nac - this tensor should have shape _n (discrete action) or _na (continuous action) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# # Observations are input for everything: sampling actions, baselines, policy gradients sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # Actions are input when computing policy gradient updates if discrete: sy_nac = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_nac = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Advantages are input when computing policy gradient updates sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. 
For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: # YOUR_CODE_HERE # Compute stochastic policy over discrete actions sy_logits_na = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size) # Sample an action from the stochastic policy sy_sampled_nac = tf.multinomial(sy_logits_na, 1) sy_sampled_nac = tf.reshape(sy_sampled_nac, [-1]) # Likelihood of chosen action sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits( labels=sy_nac, logits=sy_logits_na) else: # YOUR_CODE_HERE # Compute Gaussian stochastic policy over continuous actions. # The mean is a function of observations, while the variance is not. sy_mean_na = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size) sy_logstd = tf.Variable(tf.zeros([1, ac_dim]), name="policy/logstd", dtype=tf.float32) sy_std = tf.exp(sy_logstd) # Sample an action from the stochastic policy sy_sampled_z = tf.random_normal(tf.shape(sy_mean_na)) sy_sampled_nac = sy_mean_na + sy_std * sy_sampled_z # Likelihood of chosen action sy_z = (sy_nac - sy_mean_na) / sy_std sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_z), axis=1) #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# # Loss function that we'll differentiate to get the policy gradient. # Note: no gradient will flow through sy_adv_n, because it's a placeholder. loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n) update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. 
# YOUR_CODE_HERE sy_target_n = tf.placeholder(shape=[None], name="target", dtype=tf.float32) baseline_loss = tf.nn.l2_loss(baseline_prediction - sy_target_n) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize( baseline_loss) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: # Simulate one episode and get a path ob = env.reset() obs, acs, rews = [], [], [] animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) # Feed a batch of one observatioin to get a batch of one action ac = sess.run(sy_sampled_nac, feed_dict={sy_ob_no: [ob]}) ac = ac[0] acs.append(ac) # Simulate one time step ob, rew, done, _ = env.step(ac) rews.append(rew) steps += 1 if done or steps > max_path_length: break path = { "observation": np.array(obs), "action": np.array(acs), "reward": np.array(rews) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_nac = np.concatenate([path["action"] for path in paths]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t)] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. 
Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# # YOUR_CODE_HERE q_n = [] for path in paths: q = 0 q_path = [] # Dynamic programming over reversed path for rew in reversed(path["reward"]): q = rew + gamma * q q_path.append(q) q_path.reverse() # Append these q values if not reward_to_go: q_path = [q_path[0]] * len(q_path) q_n.extend(q_path) #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}) b_n = normalize(b_n, np.mean(q_n), np.std(q_n)) # Generalized advantage estimation adv_n = [] idx = 0 for path in paths: adv = 0 adv_path = [] V_next = 0 idx += len(path["reward"]) # Dynamic programming over reversed path for rew, V in zip(reversed(path["reward"]), b_n[idx - 1:None:-1]): bellman_error = rew + gamma * V_next - V adv = bellman_error + gae_lambda * gamma * adv adv_path.append(adv) V_next = V adv_path.reverse() # Append these advantage values if not reward_to_go: adv_path = [adv_path[0]] * len(adv_path) adv_n.extend(adv_path) # Compute a GAE version of q_n to use when fitting the baseline q_n = b_n + adv_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. # YOUR_CODE_HERE adv_n = normalize(adv_n) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE q_normalized_n = normalize(q_n) sess.run(baseline_update_op, feed_dict={ sy_ob_no: ob_no, sy_target_n: q_normalized_n }) #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. 
# # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE sess.run(update_op, feed_dict={ sy_ob_no: ob_no, sy_nac: ac_nac, sy_adv_n: adv_n }) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
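The advantage loop in the implementation above is the standard generalized advantage estimation (GAE) recursion. A standalone NumPy version of the same backward pass, for reference; it assumes `values` holds the baseline predictions V(s_t) for each step of one episode, with the value of the state after the final step taken as zero:

import numpy as np

def gae(rewards, values, gamma, lam):
    adv = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    v_next = 0.0  # V of the state after the last step, taken as 0 here
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * v_next - values[t]  # one-step TD error
        running = delta + gamma * lam * running          # discounted sum of TD errors
        adv[t] = running
        v_next = values[t]
    return adv

# lam = 1 recovers discounted-return-minus-baseline; lam = 0 gives pure one-step TD errors
r = np.array([1.0, 1.0, 1.0]); v = np.array([0.5, 0.5, 0.5])
print(gae(r, v, gamma=0.99, lam=0.95))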
def train_AC( exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length, learning_rate, num_target_updates, num_grad_steps_per_target_update, animate, logdir, normalize_advantages, seed, n_layers, size, ######################################################################## # Exploration args bonus_coeff, kl_weight, density_lr, density_train_iters, density_batch_size, density_hiddim, dm, replay_size, sigma, ######################################################################## ): start = time.time() #========================================================================================# # Set Up Logger #========================================================================================# setup_logger(logdir, locals()) #========================================================================================# # Set Up Env #========================================================================================# # Make the gym environment ######################################################################## # Exploration if env_name == 'PointMass-v0': from pointmass import PointMass env = PointMass() else: env = gym.make(env_name) dirname = logz.G.output_dir ######################################################################## # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) env.seed(seed) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # Is this env continuous or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # Initialize Agent #========================================================================================# computation_graph_args = { 'n_layers': n_layers, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'discrete': discrete, 'size': size, 'learning_rate': learning_rate, 'num_target_updates': num_target_updates, 'num_grad_steps_per_target_update': num_grad_steps_per_target_update, } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, } estimate_advantage_args = { 'gamma': gamma, 'normalize_advantages': normalize_advantages, } agent = Agent(computation_graph_args, sample_trajectory_args, estimate_advantage_args) #estimate_return_args # build computation graph agent.build_computation_graph() ######################################################################## # Initalize exploration density model if dm != 'none': if env_name == 'PointMass-v0' and dm == 'hist': density_model = Histogram( nbins=env.grid_size, preprocessor=env.preprocess) exploration = DiscreteExploration( density_model=density_model, bonus_coeff=bonus_coeff) elif dm == 'rbf': density_model = RBF(sigma=sigma) exploration = RBFExploration( density_model=density_model, bonus_coeff=bonus_coeff, replay_size=int(replay_size)) elif dm == 'ex2': density_model = Exemplar( ob_dim=ob_dim, hid_dim=density_hiddim, learning_rate=density_lr, kl_weight=kl_weight) exploration = ExemplarExploration( density_model=density_model, bonus_coeff=bonus_coeff, train_iters=density_train_iters, bsize=density_batch_size, replay_size=int(replay_size)) exploration.density_model.build_computation_graph() else: raise NotImplementedError ######################################################################## # tensorflow: config, 
session, variable initialization agent.init_tf_sess() ######################################################################## if dm != 'none': exploration.receive_tf_sess(agent.sess) ######################################################################## #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) paths, timesteps_this_batch = agent.sample_trajectories(itr, env) total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) re_n = np.concatenate([path["reward"] for path in paths]) next_ob_no = np.concatenate([path["next_observation"] for path in paths]) terminal_n = np.concatenate([path["terminal"] for path in paths]) ######################################################################## # Modify the reward to include exploration bonus """ 1. Fit density model if dm == 'ex2': the call to exploration.fit_density_model should return ll, kl, elbo else: the call to exploration.fit_density_model should return nothing 2. Modify the re_n with the reward bonus by calling exploration.modify_reward """ old_re_n = re_n if dm == 'none': pass else: # 1. Fit density model if dm == 'ex2': ### PROBLEM 3 ### YOUR CODE HERE ll, kl, elbo = exploration.fit_density_model(ob_no) # raise NotImplementedError elif dm == 'hist' or dm == 'rbf': ### PROBLEM 1 ### YOUR CODE HERE exploration.fit_density_model(ob_no) # raise NotImplementedError else: assert False # 2. Modify the reward ### PROBLEM 1 ### YOUR CODE HERE # raise NotImplementedError re_n = exploration.modify_reward(re_n, ob_no) print('average state', np.mean(ob_no, axis=0)) print('average action', np.mean(ac_na, axis=0)) # Logging stuff. # Only works for point mass. 
if env_name == 'PointMass-v0': np.save(os.path.join(dirname, '{}'.format(itr)), ob_no) ######################################################################## agent.update_critic(ob_no, next_ob_no, re_n, terminal_n) adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) agent.update_actor(ob_no, ac_na, adv_n) if n_iter - itr < 10: max_reward_path_idx = np.argmax(np.array([path["reward"].sum() for path in paths])) print(paths[max_reward_path_idx]['reward']) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) ######################################################################## logz.log_tabular("Unmodified Rewards Mean", np.mean(old_re_n)) logz.log_tabular("Unmodified Rewards Std", np.std(old_re_n)) logz.log_tabular("Modified Rewards Mean", np.mean(re_n)) logz.log_tabular("Modified Rewards Std", np.std(re_n)) if dm == 'ex2': logz.log_tabular("Log Likelihood Mean", np.mean(ll)) logz.log_tabular("Log Likelihood Std", np.std(ll)) logz.log_tabular("KL Divergence Mean", np.mean(kl)) logz.log_tabular("KL Divergence Std", np.std(kl)) logz.log_tabular("Negative ELBO", -elbo) ######################################################################## logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
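The `fit_density_model` / `modify_reward` interface used above can be illustrated with a toy count-based model. This is a sketch of the general idea only: the actual `Histogram` / `DiscreteExploration` classes referenced above may use a different bonus form; here the bonus is `bonus_coeff * (-log p(s))` over discretized states, and the class name is hypothetical.

import numpy as np

class ToyHistogramExploration:
    """Toy count-based density model over discretized states (illustrative only)."""
    def __init__(self, nbins, bonus_coeff):
        self.counts = np.ones(nbins)  # add-one smoothing keeps log p finite
        self.bonus_coeff = bonus_coeff
    def fit_density_model(self, states):
        # states: integer bin indices, shape (N,)
        np.add.at(self.counts, states, 1)
    def modify_reward(self, rewards, states):
        p = self.counts[states] / self.counts.sum()
        return rewards + self.bonus_coeff * (-np.log(p))  # rarer states earn a larger bonus

explorer = ToyHistogramExploration(nbins=100, bonus_coeff=0.1)
s = np.random.randint(0, 100, size=512)
explorer.fit_density_model(s)
r_modified = explorer.modify_reward(np.zeros(512), s)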
def actor_critic(sess, exp, pg_model, value_model, env, gamma, isRTG=True, n_iterations=100, n_batch=100, isRenderding=True, isRecordingVideo=True, recordingVideo_dir="video", isNNBaseLine=True, isNormalizeAdvantage=True, isAdaptiveStd=False, test_name="test", logging_dir="log", seed=0): # Get environment name env_name = env.spec.id # Configure output directory for logging logz.configure_output_dir(os.path.join(logging_dir, '%d' % exp)) recordingVideo_dir = os.path.join(recordingVideo_dir, '%d' % exp) if not os.path.exists(recordingVideo_dir): os.makedirs(recordingVideo_dir) # Log experimental parameters args = inspect.getargspec(actor_critic)[0] locals_ = locals() params = { k: locals_[k] if k in locals_ and isinstance(locals_[k], (int, str, float)) else None for k in args } logz.save_params(params) print("Policy Gradient for {} Environment".format(env_name)) for itr in range(n_iterations): print("==========================================") print("Iteration: ", itr) steps_in_batch = 0 trajectories = [] tic = time.time() episode = 1 video_recorder = None # Outer loop for collecting a trajectory batch while True: episode_states, episode_next_states, episode_actions, episode_rewards, episode_targets, episode_advantages \ = [], [], [], [], [], [] episode_steps = 0 state = env.reset() if isRecordingVideo and episode == 1 and ( itr % 10 == 0 or itr == n_iterations - 1 or itr == 0): video_recorder = VideoRecorder( env, os.path.join( recordingVideo_dir, "vid_{}_{}_{}_{}.mp4".format(env_name, exp, test_name, itr)), enabled=True) print("Recording a video of this episode {} in iteration {}". format(episode, itr)) # Roll-out trajectory to collect a batch while True: if isRenderding: env.render() if video_recorder: video_recorder.capture_frame() # Choose an action based on observation action = pg_model.predict(state, sess=sess) action = action[0] # Simulate one time step from action next_state, reward, done, info = env.step(action=action) # Collect data for a trajectory episode_states.append(state) episode_next_states.append(next_state) episode_actions.append(action) episode_rewards.append(reward) state = next_state episode_steps += 1 if done: break # Compute one-step bootstrapped targets and advantages for step in range(len(episode_states)): target = episode_rewards[step] + gamma * value_model.predict( episode_next_states[step], sess=sess) advantage = target - value_model.predict( episode_states[step], sess=sess) episode_targets.append(target) episode_advantages.append(advantage) if isNormalizeAdvantage: episode_advantages = (np.array(episode_advantages) - np.mean(episode_advantages)) \ / (np.std(episode_advantages) + 1e-8) # Append to trajectory batch trajectory = { "state": np.array(episode_states), "action": np.array(episode_actions), "reward": np.array(episode_rewards), "target": np.array(episode_targets), "advantage": np.array(episode_advantages) } trajectories.append(trajectory) # Increase episode step steps_in_batch += len(trajectory["reward"]) episode += 1 # Close video recording if video_recorder: video_recorder.close() video_recorder = None # Break the loop once enough episodes have been collected if episode > n_batch: # steps_in_batch > min_steps_in_batch: break batch_states = np.concatenate([traj["state"] for traj in trajectories]) batch_actions = np.concatenate( [traj["action"] for traj in trajectories]) batch_targets = np.concatenate( [traj["target"] for traj in trajectories]) batch_advantages = np.concatenate( [traj["advantage"] for traj in trajectories]) # Update value estimator value_model.update(states=batch_states, targets=batch_targets, sess=sess) # Update policy estimator pg_model.update(states=batch_states, actions=batch_actions, advantages=batch_advantages, sess=sess) toc = time.time() elapsed_sec = toc - tic rewards = [traj["reward"].sum() for traj in trajectories] advantages = [traj["advantage"].sum() for traj in trajectories ] # NOTE: summed advantages are of limited diagnostic value episode_lengths = [len(traj["reward"]) for traj in trajectories] # Log progress logz.log_tabular("Time", elapsed_sec) logz.log_tabular("Iteration", itr) logz.log_tabular("Average_Return", np.mean(rewards)) logz.log_tabular("Std_Return", np.std(rewards)) logz.log_tabular("Max_Return", np.max(rewards)) logz.log_tabular("Min_Return", np.min(rewards)) logz.log_tabular("Average_Advs", np.mean(advantages)) logz.log_tabular("Std_Advs", np.std(advantages)) logz.log_tabular("Max_Advs", np.max(advantages)) logz.log_tabular("Min_Advs", np.min(advantages)) logz.log_tabular("Num_Total_Ep", len(episode_lengths)) logz.log_tabular("Mean_Ep_Len", np.mean(episode_lengths)) logz.log_tabular("Std_Ep_Len", np.std(episode_lengths)) logz.log_tabular("Sec_per_iteration", elapsed_sec) logz.dump_tabular() logz.pickle_tf_vars()
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, bootstrap=False): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] print ob_dim, ac_dim #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(shape=[None], name="advantage", dtype=tf.float32) #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. 
Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: # YOUR_CODE_HERE sy_logits_na = build_mlp(sy_ob_no, ac_dim, "discrete_mlp", n_layers=n_layers, size=size, activation=tf.nn.relu, output_activation=None) sy_logprob_na = tf.nn.log_softmax(sy_logits_na) # Hint: Use the tf.multinomial op (log-probabilities are valid logits here) sy_sampled_ac = tf.multinomial(sy_logprob_na, 1) batch_n = tf.shape(sy_ob_no)[0] act_index = tf.stack([tf.range(0, batch_n), sy_ac_na], axis=1) sy_logprob_n = tf.gather_nd(sy_logprob_na, act_index) else: # YOUR_CODE_HERE sy_mean = build_mlp(sy_ob_no, ac_dim, "continuous_mlp", n_layers=n_layers, size=size, activation=tf.nn.relu, output_activation=None) # logstd should just be a trainable variable, not a network output. sy_logstd = tf.Variable(tf.zeros([1, ac_dim]), name="logstd") sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal( tf.shape(sy_mean)) # Hint: Use the log probability under a multivariate gaussian. sy_logprob_n = normal_log_prob(sy_ac_na, sy_mean, sy_logstd, ac_dim) #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# loss = -tf.reduce_mean( sy_logprob_n * sy_adv_n ) # Loss function that we'll differentiate to get the policy gradient. update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=1, size=32, activation=tf.nn.relu, output_activation=None)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline.
# YOUR_CODE_HERE v_t = tf.placeholder("float", [None]) l_2 = 0.5 * tf.nn.l2_loss(v_t - baseline_prediction) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize( l_2) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards, obs_2 = [], [], [], [] animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) pi = sess.run(sy_logits_na, feed_dict={sy_ob_no: ob[None]}) # print pi ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) # print ac ac = ac[0][0] # print ac acs.append(ac) # print ac ob, rew, done, _ = env.step(ac) obs_2.append(ob) rewards.append(rew) steps += 1 if done or steps > max_path_length: terminated = done break path = { "observation": np.array(obs), "reward": np.array(rewards), "action": np.array(acs), "obs_next": np.array(obs_2) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) ob_next_no = np.concatenate([path["obs_next"] for path in paths]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. 
Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# # YOUR_CODE_HERE if reward_to_go: q_n = [] if not bootstrap: for path in paths: rew_t = path["reward"] return_t = discount(rew_t, gamma) q_n.append(return_t) else: for path in paths: v_nxt = sess.run(baseline_prediction, feed_dict={sy_ob_no: path["obs_next"]}) # bootstrapped target: r_t + gamma * V(s_{t+1}); note terminal next-state values are not masked here q_target = path["reward"] + gamma * v_nxt q_n.append(q_target) q_n = np.concatenate(q_n) else: i = 0 q_n = np.concatenate([path["reward"] for path in paths]).astype(float) for path in paths: ret_tau = 0 for idx, r in enumerate(path["reward"]): ret_tau += gamma**idx * r # every timestep of the trajectory receives the full discounted return Ret(tau) q_n[i:i + pathlength(path)] = ret_tau i += pathlength(path) #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}) adv_n = q_n - b_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. # YOUR_CODE_HERE adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)
# YOUR_CODE_HERE v_target = [] for path in paths: rew_t = path["reward"] return_t = discount(rew_t, gamma) v_target.append(return_t) v_target = np.concatenate(v_target) print v_target.shape for _ in range(40): sess.run(baseline_update_op, feed_dict={ sy_ob_no: ob_no, v_t: v_target }) #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE sess.run(update_op, feed_dict={ sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n }) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
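The `discount` helper used in the Q-value and baseline-target computations above is not defined anywhere in this excerpt. One common implementation (the linear-filter trick used in the rllab codebase, shown here as a plausible stand-in) computes all discounted cumulative sums in one vectorized call:

import numpy as np
import scipy.signal

def discount(x, gamma):
    # returns [sum_k gamma^k * x_{t+k}] for each t, i.e. discounted reward-to-go
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

print(discount(np.array([1.0, 1.0, 1.0]), 0.5))  # [1.75, 1.5, 1.0]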
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32 ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps ################################################## # Notes on notation: # # sy_: symbolic variables, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob: observation # ac: action # _no: observations (X); shape: (batch size /n/, observation dim) # _na: actions (y); shape: (batch size /n/, action dim) # _n: this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for # that axis is None ################################################## # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] ################################################## # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy # gradient loss function. ################################################## # input to the policy network (X) sy_ob_no = tf.placeholder( shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder( shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder( shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder( shape=[None], name='adv', dtype=tf.float32) ################################################## # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian # distribution over actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # # a. For the discrete case, an op that takes in logits and produces # actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # # The output from a Gaussian distribution with mean 'mu' and std # 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use # tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output # ops. # # 3. Computing the log probability of a set of actions that were actually # taken, according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', # and the policy network output ops. 
    #
    ##################################################

    sy_output_layer = build_mlp(
        input_placeholder=sy_ob_no,
        output_size=ac_dim,
        scope='policy_nn',
        n_layers=n_layers,
        size=size,
    )

    if discrete:
        sy_logits_na = sy_output_layer
        # Based on the multinomial distribution defined by the logits, sample
        # one action for each observation.
        # [:, 0]: to be compatible with the later usage of sy_sampled_ac
        sy_sampled_ac = tf.multinomial(sy_logits_na, 1)[:, 0]
        # Negative log-likelihood of the actions that were actually taken.
        sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_ac_na,
            logits=sy_logits_na
        )
    else:
        sy_mean = sy_output_layer
        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.Variable(tf.zeros([1, ac_dim]))
        sy_std = tf.exp(sy_logstd)
        sy_sampled_ac = tf.random_normal(
            # note off-diagonal elements are 0, meaning no correlation among
            # different dimensions of the gaussian
            shape=tf.shape(sy_mean),
            mean=sy_mean,
            stddev=sy_std)
        # Hint: Use the log probability under a multivariate gaussian.
        # Evaluate it at the actions that were actually taken (sy_ac_na), not
        # at the mean, and negate it so the sign convention matches the
        # discrete branch (a negative log-likelihood).
        mvn = tf.contrib.distributions.MultivariateNormalDiag(
            sy_mean, sy_std)
        sy_logprob_n = -mvn.log_prob(sy_ac_na)

        # code equivalent to the implementation at
        # https://github.com/EbTech/CS294/blob/58766d6d22d997c9c97e860b38ab95faf376162c/hw2/train_pg.py#L196
        # (note that version computes the *positive* log probability, so its
        # loss must carry the opposite sign)
        # sy_mean = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size)
        # sy_logstd = tf.Variable(tf.zeros([1, ac_dim]), name="policy/logstd", dtype=tf.float32)
        # sy_std = tf.exp(sy_logstd)
        # # Sample an action from the stochastic policy
        # sy_sampled_z = tf.random_normal(tf.shape(sy_mean))
        # sy_sampled_ac = sy_mean + sy_std * sy_sampled_z
        # # Likelihood of chosen action
        # sy_z = (sy_ac_na - sy_mean) / sy_std
        # sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)

    ##################################################
    #                  ----------SECTION 4----------
    # Loss Function and Training Operation
    ##################################################

    # Construct a pseudo-loss such that the gradient of its computation graph
    # is the policy gradient: the term inside tf.reduce_mean is the
    # advantage-weighted negative log-likelihood, so minimizing it performs
    # gradient ascent on the expected return.
    loss = tf.reduce_mean(tf.multiply(sy_logprob_n, sy_adv_n))
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    ##################################################
    #                  ----------SECTION 5----------
    # Optional Baseline
    ##################################################

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
            sy_ob_no,
            1,
            "nn_baseline",
            n_layers=n_layers,
            size=size))
        # Define a placeholder for targets, a loss function and an update op
        # for fitting the neural network baseline.
        # YOUR_CODE_HERE
        # One possible implementation (the placeholder name sy_target_n is a
        # choice made here, not part of the original skeleton): regress the
        # baseline toward normalized return targets with an MSE loss.
        sy_target_n = tf.placeholder(shape=[None], name="baseline_target",
                                     dtype=tf.float32)
        baseline_loss = tf.losses.mean_squared_error(
            labels=sy_target_n, predictions=baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(
            learning_rate).minimize(baseline_loss)

    ##################################################
    # Tensorflow Engineering: Config, Session, Variable initialization
    ##################################################

    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1
    )
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    ##################################################
    # Training Loop
    ##################################################

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (
                len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                # ob[None] is equivalent to ob.reshape(1, -1) in this case,
                # i.e. it turns ob into a sequence of observations with a
                # length of 1 so that it can be fed to the nn
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update
        # by concatenating across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        ##################################################
        #                  ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be
        # used to compute advantages (which will in turn be fed to the
        # placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t)]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag
        # 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted
        #       reward summed over the entire trajectory (regardless of which
        #       time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of
        #       rewards starting from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        # Store the Q-values for all timesteps and all trajectories in a
        # variable 'q_n', like the 'ob_no' and 'ac_na' above.
        #
        ##################################################

        # YOUR_CODE_HERE
        q_n = []
        if not reward_to_go:
            # Case 1: every timestep in a trajectory gets the same Q-value,
            # the full discounted return Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
            for path in paths:
                rew = path['reward']
                ret_tau = sum(gamma ** t_prime * r
                              for t_prime, r in enumerate(rew))
                q_n.append([ret_tau] * len(rew))
        else:
            # Case 2: reward-to-go, Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}.
            for path in paths:
                rew = path['reward']
                q_t = []
                for t in range(len(rew)):
                    q_t.append(sum(gamma ** (t_prime - t) * rew[t_prime]
                                   for t_prime in range(t, len(rew))))
                q_n.append(q_t)
            # (a vectorized version of both cases is sketched after this
            # function)
        q_n = np.concatenate(q_n)

        ##################################################
        #                  ----------SECTION 5----------
        # Computing Baselines
        ##################################################

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict
            # reward-to-go at each timestep for each trajectory, and save the
            # result in a variable 'b_n' like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the
            # statistics (mean and std) of the current or previous batch of
            # Q-values. (Goes with Hint #bl2 below.)
            # The network was fit to normalized targets (see below), so
            # rescale its raw output to the statistics of the current batch
            # of Q-values.
            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = (b_n - np.mean(b_n)) / (np.std(b_n) + 1e-8)
            b_n = b_n * np.std(q_n) + np.mean(q_n)
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        ##################################################
        #                  ----------SECTION 4----------
        # Advantage Normalization
        ##################################################

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically
            # to reduce variance in policy gradient methods: normalize adv_n
            # to have mean zero and std=1.
            # YOUR_CODE_HERE
            # (note the epsilon belongs outside np.std; adding it to adv_n
            # before taking the std would be a bug)
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)

        ##################################################
        #                  ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        ##################################################

        if nn_baseline:
            # If a neural network baseline is used, set up the targets and
            # the inputs for the baseline.
            #
            # Fit it to the current batch in order to use for the next
            # iteration. Use the baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly,
            # rescale the targets to have mean zero and std=1. (Goes with
            # Hint #bl1 above.)
            # YOUR_CODE_HERE
            target_n = (q_n - np.mean(q_n)) / (np.std(q_n) + 1e-8)
            sess.run(baseline_update_op,
                     feed_dict={sy_ob_no: ob_no, sy_target_n: target_n})

        ##################################################
        #                  ----------SECTION 4----------
        # Performing the Policy Update
        ##################################################

        # Call the update operation necessary to perform the policy gradient
        # update based on the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss
        # function before and after an update, and then log them below.

        # YOUR_CODE_HERE
        feed_dict = {
            sy_ob_no: ob_no,
            sy_ac_na: ac_na,
            # feed the (normalized) advantages, not the raw Q-values
            sy_adv_n: adv_n
        }
        logz.log_tabular("loss before update", loss.eval(feed_dict=feed_dict))
        # Multiple updates per batch of samples would be wrong: the
        # trajectories were sampled from the one specific policy that existed
        # before a single update, so after one update they no longer
        # correspond to the new policy.
        # for i in range(100):
        #     sess.run(update_op, feed_dict=feed_dict)
        # Perform exactly one policy gradient step per batch of samples.
        sess.run(update_op, feed_dict=feed_dict)
        logz.log_tabular("loss after update", loss.eval(feed_dict=feed_dict))

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
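# The Q-value loops above are O(T^2) per trajectory. A vectorized sketch of
# the same two cases (the helper name `compute_q_values` is a choice made
# here; dividing by the discount vector can underflow for very long horizons
# with small gamma, so treat this as a sketch rather than a drop-in
# replacement):
def compute_q_values(paths, gamma, reward_to_go):
    q_n = []
    for path in paths:
        rew = path["reward"]
        discounts = gamma ** np.arange(len(rew))
        if reward_to_go:
            # reversed cumulative sum of the discounted rewards, re-based so
            # that each entry is discounted relative to its own time step
            q = np.cumsum((rew * discounts)[::-1])[::-1] / discounts
        else:
            # the full discounted return of the trajectory, for every t
            q = np.full(len(rew), (rew * discounts).sum())
        q_n.append(q)
    return np.concatenate(q_n)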
def train_PG(
        exp_name,
        env_name,
        n_iter,
        gamma,
        min_timesteps_per_batch,
        max_path_length,
        learning_rate,
        reward_to_go,
        animate,
        logdir,
        normalize_advantages,
        nn_baseline,
        seed,
        n_layers,
        size):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline': nn_baseline,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args)

    # build computation graph
    agent.build_computation_graph()

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = [path["reward"] for path in paths]

        q_n, adv_n = agent.estimate_return(ob_no, re_n)
        agent.update_parameters(ob_no, ac_na, q_n, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()

    agent.close_tf_sess()
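# `pathlength` and `setup_logger` are used by the variants above but defined
# elsewhere. Minimal sketches consistent with how they are called here; the
# setup_logger body simply mirrors the inline logz calls of the earlier
# variant and is an assumption, not the original helper:
def pathlength(path):
    return len(path["reward"])

def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)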
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32 ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = TODO #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. 
# # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: # YOUR_CODE_HERE sy_logits_na = TODO sy_sampled_ac = TODO # Hint: Use the tf.multinomial op sy_logprob_n = TODO else: # YOUR_CODE_HERE sy_mean = TODO sy_logstd = TODO # logstd should just be a trainable variable, not a network output. sy_sampled_ac = TODO sy_logprob_n = TODO # Hint: Use the log probability under a multivariate gaussian. #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# loss = TODO # Loss function that we'll differentiate to get the policy gradient. update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze(build_mlp( sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. # YOUR_CODE_HERE baseline_update_op = TODO #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) ac = ac[0] acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you 
defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# # YOUR_CODE_HERE q_n = TODO #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = TODO adv_n = q_n - b_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. # YOUR_CODE_HERE pass #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE pass #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. 
# YOUR_CODE_HERE # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
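# `build_mlp` is referenced throughout but defined in a separate utilities
# module. A minimal sketch matching the call sites above; the tanh hidden
# activation and the tf.layers API are assumptions:
def build_mlp(input_placeholder, output_size, scope,
              n_layers=2, size=64, activation=tf.tanh,
              output_activation=None):
    with tf.variable_scope(scope):
        out = input_placeholder
        for _ in range(n_layers):
            out = tf.layers.dense(out, size, activation=activation)
        return tf.layers.dense(out, output_size,
                               activation=output_activation)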