def train_AC( exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length, learning_rate, num_target_updates, num_grad_steps_per_target_update, animate, logdir, normalize_advantages, seed, n_layers, size): start = time.time() #========================================================================================# # Set Up Logger #========================================================================================# setup_logger(logdir, locals()) #========================================================================================# # Set Up Env #========================================================================================# # Make the gym environment env = gym.make(env_name) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) env.seed(seed) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # Is this env continuous, or self.discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # Initialize Agent #========================================================================================# computation_graph_args = { 'n_layers': n_layers, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'discrete': discrete, 'size': size, 'learning_rate': learning_rate, 'num_target_updates': num_target_updates, 'num_grad_steps_per_target_update': num_grad_steps_per_target_update, } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, } estimate_advantage_args = { 'gamma': gamma, 'normalize_advantages': normalize_advantages, } agent = Agent(computation_graph_args, sample_trajectory_args, estimate_advantage_args) #estimate_return_args # build computation graph agent.build_computation_graph() # tensorflow: config, session, variable initialization agent.init_tf_sess() #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) paths, timesteps_this_batch = agent.sample_trajectories(itr, env) total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) re_n = np.concatenate([path["reward"] for path in paths]) next_ob_no = np.concatenate([path["next_observation"] for path in paths]) terminal_n = np.concatenate([path["terminal"] for path in paths]) # Call tensorflow operations to: # (1) update the critic, by calling agent.update_critic # (2) use the updated critic to compute the advantage by, calling agent.estimate_advantage # (3) use the estimated advantage values to update the actor, by calling agent.update_actor # YOUR CODE HERE raise NotImplementedError # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", 
np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
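# A minimal sketch of the block that the "YOUR CODE HERE / raise NotImplementedError"
# step in train_AC asks for, wrapped in a hypothetical helper so it can be read in
# isolation. The three method names come from the comments above; the argument lists
# are an assumption based on the arrays assembled just before the placeholder
# (ob_no, ac_na, re_n, next_ob_no, terminal_n) and may not match the real Agent signatures.
def _actor_critic_update_sketch(agent, ob_no, ac_na, re_n, next_ob_no, terminal_n):
    agent.update_critic(ob_no, next_ob_no, re_n, terminal_n)               # (1) fit the critic
    adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)  # (2) advantages
    agent.update_actor(ob_no, ac_na, adv_n)                                # (3) policy step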
def log_progress(self):
    episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()
    if len(episode_rewards) > 0:
        self.mean_episode_reward = np.mean(episode_rewards[-100:])
        self.std_episode_reward = np.std(episode_rewards[-100:])
    if len(episode_rewards) > 100:
        self.best_mean_episode_reward = \
            max(self.best_mean_episode_reward, self.mean_episode_reward)
    # See the `log.txt` file for where these statistics are stored.
    if self.t % self.log_every_n_steps == 0:
        lr = self.optimizer_spec.lr_schedule.value(self.t)
        hours = (time.time() - self.start_time) / (60. * 60.)
        logz.log_tabular("Steps", self.t)
        logz.log_tabular("Avg_Last_100_Episodes", self.mean_episode_reward)
        logz.log_tabular("Std_Last_100_Episodes", self.std_episode_reward)
        logz.log_tabular("Best_Avg_100_Episodes", self.best_mean_episode_reward)
        logz.log_tabular("Num_Episodes", len(episode_rewards))
        logz.log_tabular("Exploration_Epsilon", self.exploration.value(self.t))
        logz.log_tabular("Adam_Learning_Rate", lr)
        logz.log_tabular("Elapsed_Time_Hours", hours)
        logz.dump_tabular()
def train_MAPG( exp_name='', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, learning_rate=5e-3, logdir=None, normalize_advantages=True, seed=101, # network arguments n_layers=1, size=32): #========================================================================================# # Logfile setup #========================================================================================# start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_MAPG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) #========================================================================================# # Env setup #========================================================================================# nAgent = 2 # hard coded! env1 = Simulator(seed=101, N_agent=nAgent, N_prod=3, Tstamp=10, costQ=np.array([[0.3, 0.3, 0.3]]), costInv=np.array([[0.2, 0.2, 0.2]]), costLastInv=np.array([[2, 2, 2]]), costBack=np.array([[0.75, 0.75, 0.75]])) env2 = Simulator(seed=202, N_agent=nAgent, N_prod=3, Tstamp=10, costQ=np.array([[0.3, 0.3, 0.3]]), costInv=np.array([[0.2, 0.2, 0.2]]), costLastInv=np.array([[2, 2, 2]]), costBack=np.array([[0.75, 0.75, 0.75]])) # Observation and action sizes ob_dim = env1.obs_dim() ac_dim = env1.act_dim() print('observation dimension is: ', ob_dim) print('action dimension is: ', ac_dim) print('critic network input dimension is:', ob_dim[0] + ac_dim[0] * ac_dim[1] * nAgent) #========================================================================================# # PG Networks #========================================================================================# def PGNet(sy_ob_no, sy_ac_na, sy_adv_n, agent_id): sy_mean = build_mlp(input_placeholder=sy_ob_no, output_size=ac_dim[0] * ac_dim[1], scope=str(seed) + 'MA_' + str(agent_id), n_layers=n_layers, output_activation=tf.sigmoid, size=size, scale=10.) sy_logstd = tf.Variable(tf.truncated_normal( shape=[1, ac_dim[0] * ac_dim[1]], stddev=0.1), name='var_std' + str(agent_id)) sy_sampled_ac = sy_mean + tf.multiply( tf.random_normal(shape=tf.shape(sy_mean)), tf.exp(sy_logstd)) MVN_dist = tf.contrib.distributions.MultivariateNormalDiag( sy_mean, tf.exp(sy_logstd)) sy_logprob_n = MVN_dist.log_prob(sy_ac_na) # Loss function for PG network loss = -tf.reduce_mean( tf.multiply(sy_logprob_n, sy_adv_n) ) # Loss function that we'll differentiate to get the policy gradient. 
update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) return sy_sampled_ac, loss, update_op #========================================================================================# # Critic network #========================================================================================# def CriticNet(sy_ob_critic, baseline_target, agent_id): baseline_prediction = tf.squeeze( build_mlp(sy_ob_critic, output_size=1, scope=str(seed) + "critic_" + str(agent_id), n_layers=n_layers, size=size)) baseline_loss = tf.nn.l2_loss(baseline_target - baseline_prediction) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize( baseline_loss) return baseline_prediction, baseline_loss, baseline_update_op #========================================================================================# # Add networks in a loop #========================================================================================# sy_ob_no_1 = tf.placeholder(shape=[None, ob_dim[0]], name='ob' + str(1), dtype=tf.float32) sy_ac_na_1 = tf.placeholder(shape=[None, ac_dim[0] * ac_dim[1]], name='ac' + str(1), dtype=tf.float32) sy_adv_n_1 = tf.placeholder(shape=[None], name='adv' + str(1), dtype=tf.float32) sy_ob_critic_1 = tf.placeholder( shape=[None, ob_dim[0] + ac_dim[0] * ac_dim[1] * nAgent], name='critic_ob' + str(1), dtype=tf.float32) baseline_target_1 = tf.placeholder(shape=[None], name='baseline_target_qn' + str(1), dtype=tf.float32) sy_sampled_ac_1, loss_1, update_op_1 = PGNet(sy_ob_no_1, sy_ac_na_1, sy_adv_n_1, 1) baseline_prediction_1, baseline_loss_1, baseline_update_op_1 = CriticNet( sy_ob_critic_1, baseline_target_1, 1) sy_ob_no_2 = tf.placeholder(shape=[None, ob_dim[0]], name='ob' + str(2), dtype=tf.float32) sy_ac_na_2 = tf.placeholder(shape=[None, ac_dim[0] * ac_dim[1]], name='ac' + str(2), dtype=tf.float32) sy_adv_n_2 = tf.placeholder(shape=[None], name='adv' + str(2), dtype=tf.float32) sy_ob_critic_2 = tf.placeholder( shape=[None, ob_dim[0] + ac_dim[0] * ac_dim[1] * nAgent], name='critic_ob' + str(2), dtype=tf.float32) baseline_target_2 = tf.placeholder(shape=[None], name='baseline_target_qn' + str(2), dtype=tf.float32) sy_sampled_ac_2, loss_2, update_op_2 = PGNet(sy_ob_no_2, sy_ac_na_2, sy_adv_n_2, 2) baseline_prediction_2, baseline_loss_2, baseline_update_op_2 = CriticNet( sy_ob_critic_2, baseline_target_2, 2) # exec("sy_sampled_ac_%s, loss_%s, update_op_%s = PGNet(sy_ob_no_%s, sy_ac_na_%s, sy_adv_n_%s, agent)"%(agent, agent, agent, agent, agent, agent)) # exec("baseline_prediction_%s, baseline_loss_%s, baseline_update_op_%s = CriticNet(sy_ob_critic_%s, baseline_target_%s, agent)"%(agent, agent, agent, agent, agent)) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# num_gpu = 0 tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, device_count={'GPU': num_gpu}) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 total_numpaths = 0 demand_cov = np.array([[0.1, -0.5 * 0.3, -0.5 * 0.3], [-0.5 * 0.3, 0.1, 0.5 * 0.3], [-0.5 * 0.3, 0.5 * 0.3, 0.1]]) for itr in 
range(n_iter): #========================# # Sampling #========================# randk1 = 0 + itr * seed randk2 = 12306 + itr * seed print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 num_path = 0 paths1 = [] paths2 = [] while True: steps = 0 last = False ob1 = env1.randomInitialStateGenerator() obs1, acs1, rewards1, criticObs1 = [], [], [], [] ob2 = env2.randomInitialStateGenerator() obs2, acs2, rewards2, criticObs2 = [], [], [], [] while steps < env1.Tstamp: if steps == env1.Tstamp - 1: last = True obs1.append(ob1.flatten()) obs2.append(ob2.flatten()) ac1 = sess.run(sy_sampled_ac_1, feed_dict={sy_ob_no_1: ob1}) ac2 = sess.run(sy_sampled_ac_2, feed_dict={sy_ob_no_2: ob2}) acs1.append(ac1.flatten()) acs2.append(ac2.flatten()) criticObs1.append( np.append(np.append(ob1.flatten(), ac1.flatten()), ac2.flatten()).flatten()) criticObs2.append( np.append(np.append(ob2.flatten(), ac2.flatten()), ac1.flatten()).flatten()) actList = [ac1.reshape(-1, 2), ac2.reshape(-1, 2)] demand = env1.demandGenerator_p( actList, M=np.array([10, 10, 10]).reshape(-1, 1), V=np.array([5, 5, 5]).reshape(-1, 1), sens=np.array([1.5, 1.5, 1.5]).reshape(-1, 1), cov=demand_cov, seed=randk1) demand1 = demand[:, 0] demand2 = demand[:, 1] # demand2 = env2.demandGenerator_p(actList, # M = np.array([3, 3, 3]).reshape(-1,1), # V = np.array([5,5,5]).reshape(-1,1), # sens = np.array([1, 1, 1]).reshape(-1,1), # cov = np.diag(np.array([0.25, 0.25, 0.25])), # seed = randk2) ob1, rew1 = env1.step(actList[0], ob1.flatten(), demand1, last) ob2, rew2 = env2.step(actList[1], ob2.flatten(), demand2, last) randk1 += 1 randk2 += 1 rewards1.append(rew1) rewards2.append(rew2) steps += 1 path1 = { "observation": np.array(obs1), "reward": np.array(rewards1), "action": np.array(acs1), "criticObservation": np.array(criticObs1) } path2 = { "observation": np.array(obs2), "reward": np.array(rewards2), "action": np.array(acs2), "criticObservation": np.array(criticObs2) } paths1.append(path1) paths2.append(path2) num_path += 1 timesteps_this_batch += pathlength(path1) if timesteps_this_batch > min_timesteps_per_batch: break total_numpaths += num_path total_timesteps += timesteps_this_batch if last and itr == n_iter - 1: pickle.dump(path1, open(logdir + '/trained_path1_sample.pkl', 'wb'), protocol=2) pickle.dump(path2, open(logdir + '/trained_path2_sample.pkl', 'wb'), protocol=2) # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no1 = np.concatenate([path["observation"] for path in paths1]) ac_na1 = np.concatenate([path["action"] for path in paths1]) critic_ob_no1 = np.concatenate( [path["criticObservation"] for path in paths1]) ob_no2 = np.concatenate([path["observation"] for path in paths2]) ac_na2 = np.concatenate([path["action"] for path in paths2]) critic_ob_no2 = np.concatenate( [path["criticObservation"] for path in paths2]) # print(ob_no.shape) # print(ac_na.shape) # print(path['reward'].shape) #========================# # Compute Q value #========================# q_n1 = np.concatenate([[ np.npv((1 / gamma - 1), path["reward"][i:]) for i in range(len(path["reward"])) ] for path in paths1]) q_n2 = np.concatenate([[ np.npv((1 / gamma - 1), path["reward"][i:]) for i in range(len(path["reward"])) ] for path in paths2]) #========================# # Compute Baselines #========================# q_n_mean1 = q_n1.mean() q_n_std1 = q_n1.std() q_n1 = (q_n1 - q_n_mean1) / q_n_std1 b_n1 = baseline_prediction_1 adv_n_baseline1 = q_n1 
- b_n1 q_n_mean2 = q_n2.mean() q_n_std2 = q_n2.std() q_n2 = (q_n2 - q_n_mean2) / q_n_std2 b_n2 = baseline_prediction_2 adv_n_baseline2 = q_n2 - b_n2 # if bootstrap: # last_critic_ob_no1 = np.concatenate([path["criticObservation"] for path in paths1]) # lastFit1 = sess.run(baseline_prediction_1, # feed_dict = {sy_ob_critic_1: critic_ob_no1[]}) #====================================# # Optimizing Neural Network Baseline #====================================# _, adv_n1 = sess.run([baseline_update_op_1, adv_n_baseline1], feed_dict={ baseline_target_1: q_n1, sy_ob_critic_1: critic_ob_no1 }) adv_n1 = adv_n1 * q_n_std1 + q_n_mean1 _, adv_n2 = sess.run([baseline_update_op_2, adv_n_baseline2], feed_dict={ baseline_target_2: q_n2, sy_ob_critic_2: critic_ob_no2 }) adv_n2 = adv_n2 * q_n_std2 + q_n_mean2 #====================================================================================# # Advantage Normalization #====================================================================================# if normalize_advantages: adv_n1 = (adv_n1 - adv_n1.mean()) / adv_n1.std() adv_n2 = (adv_n2 - adv_n2.mean()) / adv_n2.std() #====================================================================================# # Performing the Policy Update #====================================================================================# _, train_loss1 = sess.run([update_op_1, loss_1], feed_dict={ sy_adv_n_1: adv_n1, sy_ac_na_1: ac_na1, sy_ob_no_1: ob_no1 }) _, train_loss2 = sess.run([update_op_2, loss_2], feed_dict={ sy_adv_n_2: adv_n2, sy_ac_na_2: ac_na2, sy_ob_no_2: ob_no2 }) print("PG Network 1 training loss: %.5f" % train_loss1) print("PG Network 2 training loss: %.5f" % train_loss2) # Log diagnostics returns1 = np.array([path["reward"].sum() for path in paths1]) returns2 = np.array([path["reward"].sum() for path in paths2]) totalReturn = returns1 + returns2 ep_lengths = [pathlength(path) for path in paths1] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn1", np.mean(returns1)) logz.log_tabular("StdReturn1", np.std(returns1)) logz.log_tabular("MaxReturn1", np.max(returns1)) logz.log_tabular("MinReturn1", np.min(returns1)) logz.log_tabular("AverageReturn2", np.mean(returns2)) logz.log_tabular("StdReturn2", np.std(returns2)) logz.log_tabular("MaxReturn2", np.max(returns2)) logz.log_tabular("MinReturn2", np.min(returns2)) logz.log_tabular("AverageTotalReturn", np.mean(totalReturn)) logz.log_tabular("StdReturn", np.std(totalReturn)) logz.log_tabular("MaxReturn", np.max(totalReturn)) logz.log_tabular("MinReturn", np.min(totalReturn)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("NumPathsThisBatch", num_path) logz.log_tabular("NumPathsSoFar", total_numpaths) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
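# The two per-agent placeholder/network blocks in train_MAPG are copies of each other,
# and the commented-out exec() calls hint at an intent to generate them programmatically.
# Below is a sketch of doing that with a plain loop and a dict; it reuses the
# PGNet/CriticNet closures and the ob_dim/ac_dim tuples defined in train_MAPG, while the
# helper name and dict layout are assumptions rather than part of the original code.
def build_agent_graphs_sketch(nAgent, ob_dim, ac_dim, PGNet, CriticNet):
    graphs = {}
    for agent_id in range(1, nAgent + 1):
        ob = tf.placeholder(shape=[None, ob_dim[0]], name='ob%d' % agent_id, dtype=tf.float32)
        ac = tf.placeholder(shape=[None, ac_dim[0] * ac_dim[1]], name='ac%d' % agent_id,
                            dtype=tf.float32)
        adv = tf.placeholder(shape=[None], name='adv%d' % agent_id, dtype=tf.float32)
        critic_ob = tf.placeholder(shape=[None, ob_dim[0] + ac_dim[0] * ac_dim[1] * nAgent],
                                   name='critic_ob%d' % agent_id, dtype=tf.float32)
        baseline_target = tf.placeholder(shape=[None], name='baseline_target_qn%d' % agent_id,
                                         dtype=tf.float32)
        sampled_ac, loss, update_op = PGNet(ob, ac, adv, agent_id)
        baseline_pred, baseline_loss, baseline_update_op = CriticNet(critic_ob,
                                                                     baseline_target, agent_id)
        graphs[agent_id] = {
            'ob': ob, 'ac': ac, 'adv': adv, 'critic_ob': critic_ob,
            'baseline_target': baseline_target, 'sampled_ac': sampled_ac,
            'loss': loss, 'update_op': update_op, 'baseline_pred': baseline_pred,
            'baseline_loss': baseline_loss, 'baseline_update_op': baseline_update_op,
        }
    return graphs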
def train_PG( exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length, learning_rate, reward_to_go, animate, logdir, normalize_advantages, nn_baseline, seed, n_layers, size): start = time.time() #========================================================================================# # Set Up Logger #========================================================================================# setup_logger(logdir, locals()) #========================================================================================# # Set Up Env #========================================================================================# # Make the environment env = get_random_env() # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Maximum length for episodes max_path_length = 24 # Is this env continuous, or self.discrete? discrete = True # Observation and action sizes ob_dim = env.get_obs_shape() ac_dim = 1 #========================================================================================# # Initialize Agent #========================================================================================# computation_graph_args = { 'n_layers': n_layers, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'discrete': discrete, 'size': size, 'learning_rate': learning_rate, } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, } estimate_return_args = { 'gamma': gamma, 'reward_to_go': reward_to_go, 'nn_baseline': nn_baseline, 'normalize_advantages': normalize_advantages, } agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args) # build computation graph agent.build_computation_graph() # tensorflow: config, session, variable initialization agent.init_tf_sess() #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) paths, timesteps_this_batch = agent.sample_trajectories(itr, env) total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) re_n = [path["reward"] for path in paths] q_n, adv_n = agent.estimate_return(ob_no, re_n) agent.update_parameters(ob_no, ac_na, q_n, adv_n) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
def train_PG( exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, mini_batch_size, max_path_length, learning_rate, num_ppo_updates, num_value_iters, animate, logdir, normalize_advantages, nn_critic, seed, n_layers, size, gru_size, history, num_tasks, l2reg, recurrent, ): start = time.time() #========================================================================================# # Set Up Logger #========================================================================================# setup_logger(logdir, locals()) #========================================================================================# # Set Up Env #========================================================================================# # Make the gym environment envs = { 'pm': PointEnv, 'pm-obs': ObservedPointEnv, } env = envs[env_name](num_tasks) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) random.seed(seed) env.seed(seed) # Maximum length for episodes max_path_length = max_path_length # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] task_dim = len(env._goal) # rude, sorry #========================================================================================# # Initialize Agent #========================================================================================# computation_graph_args = { 'n_layers': n_layers, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'task_dim': task_dim, 'size': size, 'gru_size': gru_size, 'learning_rate': learning_rate, 'history': history, 'num_value_iters': num_value_iters, 'l2reg': l2reg, 'recurrent': recurrent, } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, } estimate_return_args = { 'gamma': gamma, 'nn_critic': nn_critic, 'normalize_advantages': normalize_advantages, } agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args) # build computation graph agent.build_computation_graph() # tensorflow: config, session, variable initialization agent.init_tf_sess() #========================================================================================# # Training Loop #========================================================================================# def unpack_sample(data): ''' unpack a sample from the replay buffer ''' ob = data["observations"] ac = data["actions"] re = data["rewards"] hi = data["hiddens"] ma = 1 - data["terminals"] return ob, ac, re, hi, ma # construct PPO replay buffer, perhaps rude to do outside the agent ppo_buffer = PPOReplayBuffer(agent.replay_buffer) total_timesteps = 0 for itr in range(n_iter): # for PPO: flush the replay buffer! 
ppo_buffer.flush() # sample trajectories to fill agent's replay buffer print("********** Iteration %i ************" % itr) stats = [] for _ in range(num_tasks): s, timesteps_this_batch = agent.sample_trajectories( itr, env, min_timesteps_per_batch) total_timesteps += timesteps_this_batch stats += s # compute the log probs, advantages, and returns for all data in agent's buffer # store in ppo buffer for use in multiple ppo updates # TODO: should move inside the agent probably data = agent.replay_buffer.all_batch() ob_no, ac_na, re_n, hidden, masks = unpack_sample(data) fixed_log_probs = agent.sess.run(agent.sy_lp_n, feed_dict={ agent.sy_ob_no: ob_no, agent.sy_hidden: hidden, agent.sy_ac_na: ac_na }) q_n, adv_n = agent.estimate_return(ob_no, re_n, hidden, masks) ppo_buffer.add_samples(fixed_log_probs, adv_n, q_n) # update with mini-batches sampled from ppo buffer for _ in range(num_ppo_updates): data = ppo_buffer.random_batch(mini_batch_size) ob_no, ac_na, re_n, hidden, masks = unpack_sample(data) fixed_log_probs = data["log_probs"] adv_n = data["advantages"] q_n = data["returns"] log_probs = agent.sess.run(agent.sy_lp_n, feed_dict={ agent.sy_ob_no: ob_no, agent.sy_hidden: hidden, agent.sy_ac_na: ac_na }) agent.update_parameters(ob_no, hidden, ac_na, fixed_log_probs, q_n, adv_n) # compute validation statistics print('Validating...') val_stats = [] for _ in range(num_tasks): vs, timesteps_this_batch = agent.sample_trajectories( itr, env, min_timesteps_per_batch // 10, is_evaluation=True) val_stats += vs # save trajectories for viz with open("output/{}-epoch{}.pkl".format(exp_name, itr), 'wb') as f: pickle.dump(agent.val_replay_buffer.all_batch(), f, pickle.HIGHEST_PROTOCOL) agent.val_replay_buffer.flush() # Log TRAIN diagnostics returns = [sum(s["rewards"]) for s in stats] final_rewards = [s["rewards"][-1] for s in stats] ep_lengths = [s['ep_len'] for s in stats] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("FinalReward", np.mean(final_rewards)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) # Log VAL diagnostics val_returns = [sum(s["rewards"]) for s in val_stats] val_final_rewards = [s["rewards"][-1] for s in val_stats] logz.log_tabular("ValAverageReturn", np.mean(val_returns)) logz.log_tabular("ValFinalReward", np.mean(val_final_rewards)) logz.dump_tabular() logz.pickle_tf_vars()
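# The mini-batch loop above recomputes log-probabilities under the current policy while
# carrying the `fixed_log_probs` recorded when the data was collected -- the ingredients
# of a PPO-style clipped surrogate. The actual objective lives inside
# agent.update_parameters and is not shown in this section; the sketch below is an
# assumed standard clipped loss, with `clip_eps` as a hypothetical hyperparameter.
def ppo_clipped_surrogate_sketch(log_probs, fixed_log_probs, adv_n, clip_eps=0.2):
    ratio = tf.exp(log_probs - fixed_log_probs)            # pi_new / pi_old per sample
    clipped_ratio = tf.clip_by_value(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    return -tf.reduce_mean(tf.minimum(ratio * adv_n, clipped_ratio * adv_n))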
def train_PG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length, learning_rate, reward_to_go, animate, logdir, normalize_advantages, nn_baseline, seed, n_layers, size): start = time.time() # ======================================================================================= # # Set Up Logger # ======================================================================================= # setup_logger(logdir, locals()) # ======================================================================================= # # Set Up Env # ======================================================================================= # # Make the gym environment env = gym.make(env_name) # Set random seeds tf.random.set_seed(seed) np.random.seed(seed) env.seed(seed) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # Is this env continuous, or self.discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] # ======================================================================================== # # Initialize Agent # ======================================================================================== # neural_net_args = { 'n_layers': n_layers, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'discrete': discrete, 'size': size, 'learning_rate': learning_rate, } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, } estimate_return_args = { 'gamma': gamma, 'reward_to_go': reward_to_go, 'nn_baseline': nn_baseline, 'normalize_advantages': normalize_advantages, } agent = Agent(neural_net_args, sample_trajectory_args, estimate_return_args) # ========================================================================================# # Training Loop # ========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) paths, timesteps_this_batch = agent.sample_trajectories(itr, env) # Fixed total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) re_n = [path["reward"] for path in paths] # ob_no = (sum_length_paths, ob_dim) # ac_na = (sum_length_paths, ac_dim) # re_n = (num_paths,) where re_n[i] = (path_len_i, 1) q_n, adv_n = agent.estimate_return(ob_no, re_n) # Fixed agent.update_parameters(ob_no, ac_na, q_n, adv_n) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
def on_epoch_end(self, epoch, logs={}):
    # Save training and validation losses at the end of each epoch
    logz.log_tabular('train_loss', logs.get('loss'))
    logz.log_tabular('val_loss', logs.get('val_loss'))
    logz.dump_tabular()
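# Usage sketch: this hook is presumably defined on a keras.callbacks.Callback subclass
# (here hypothetically named LossLogger), and logz.configure_output_dir(...) would need
# to be called before training so the tabular output has somewhere to go, e.g.
#
#     model.fit(x_train, y_train, validation_data=(x_val, y_val),
#               epochs=20, callbacks=[LossLogger()])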
def train(self, num_iter): start = time.time() for i in range(num_iter): t1 = time.time() self.train_step() t2 = time.time() print('total time of one step', t2 - t1) print('iter ', i,' done') # record statistics every 10 iterations if ((i + 1) % 10 == 0): rewards = self.aggregate_rollouts(num_rollouts = 100, evaluate = True) w = ray.get(self.workers[0].get_weights_plus_stats.remote()) np.savez(self.logdir + "/lin_policy_plus", w) print(sorted(self.params.items())) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", i + 1) logz.log_tabular("AverageReward", np.mean(rewards)) logz.log_tabular("StdRewards", np.std(rewards)) logz.log_tabular("MaxRewardRollout", np.max(rewards)) logz.log_tabular("MinRewardRollout", np.min(rewards)) logz.log_tabular("timesteps", self.timesteps) logz.dump_tabular() t1 = time.time() # get statistics from all workers for j in range(self.num_workers): self.policy.observation_filter.update(ray.get(self.workers[j].get_filter.remote())) self.policy.observation_filter.stats_increment() # make sure master filter buffer is clear self.policy.observation_filter.clear_buffer() # sync all workers filter_id = ray.put(self.policy.observation_filter) setting_filters_ids = [worker.sync_filter.remote(filter_id) for worker in self.workers] # waiting for sync of all workers ray.get(setting_filters_ids) increment_filters_ids = [worker.stats_increment.remote() for worker in self.workers] # waiting for increment of all workers ray.get(increment_filters_ids) t2 = time.time() print('Time to sync statistics:', t2 - t1) return
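# The statistics-sync block above assumes each remote worker exposes get_filter(),
# sync_filter(new_filter), and stats_increment(). The worker class is not shown in this
# section, so the sketch below is a hypothetical reconstruction of that interface from
# the master-side calls; the method bodies (including the .sync() call) are assumptions.
class WorkerFilterInterfaceSketch:
    def get_filter(self):
        # hand the master this worker's local observation filter (with buffered stats)
        return self.policy.observation_filter

    def sync_filter(self, new_filter):
        # adopt the merged statistics broadcast by the master
        self.policy.observation_filter.sync(new_filter)

    def stats_increment(self):
        # fold the buffered statistics into the running mean/std
        self.policy.observation_filter.stats_increment()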
def train(self, train_db, val_db, test_db): ################################################################## ## LOG ################################################################## logz.configure_output_dir(self.cfg.model_dir) logz.save_config(self.cfg) ################################################################## ## Main loop ################################################################## start = time() min_val_loss = 1000.0 max_val_recall = -1.0 train_loaddb = region_loader(train_db) val_loaddb = region_loader(val_db) #TODO train_loader = DataLoader(train_loaddb, batch_size=self.cfg.batch_size, shuffle=True, num_workers=self.cfg.num_workers, collate_fn=region_collate_fn) val_loader = DataLoader(val_loaddb, batch_size=self.cfg.batch_size, shuffle=False, num_workers=self.cfg.num_workers, collate_fn=region_collate_fn) for epoch in range(self.epoch, self.cfg.n_epochs): ################################################################## ## Training ################################################################## if self.cfg.coco_mode >= 0: self.cfg.coco_mode = np.random.randint(0, self.cfg.max_turns) torch.cuda.empty_cache() train_losses = self.train_epoch(train_loaddb, train_loader, epoch) ################################################################## ## Validation ################################################################## if self.cfg.coco_mode >= 0: self.cfg.coco_mode = 0 torch.cuda.empty_cache() val_losses, val_metrics, caches_results = self.validate_epoch( val_loaddb, val_loader, epoch) ################################################################# # Logging ################################################################# # update optim scheduler current_val_loss = np.mean(val_losses) self.optimizer.update(current_val_loss, epoch) logz.log_tabular("Time", time() - start) logz.log_tabular("Iteration", epoch) logz.log_tabular("TrainAverageLoss", np.mean(train_losses)) logz.log_tabular("ValAverageLoss", current_val_loss) mmm = np.zeros((5, ), dtype=np.float64) for k, v in val_metrics.items(): mmm = mmm + np.array(v) mmm /= len(val_metrics) logz.log_tabular("t2i_R1", mmm[0]) logz.log_tabular("t2i_R5", mmm[1]) logz.log_tabular("t2i_R10", mmm[2]) logz.log_tabular("t2i_medr", mmm[3]) logz.log_tabular("t2i_meanr", mmm[4]) logz.dump_tabular() current_val_recall = np.mean(mmm[:3]) ################################################################## ## Checkpoint ################################################################## if self.cfg.rl_finetune == 0 and self.cfg.coco_mode < 0: if min_val_loss > current_val_loss: min_val_loss = current_val_loss self.save_checkpoint(epoch) with open( osp.join(self.cfg.model_dir, 'val_metrics_%d.json' % epoch), 'w') as fp: json.dump(val_metrics, fp, indent=4, sort_keys=True) with open( osp.join(self.cfg.model_dir, 'val_top5_inds_%d.pkl' % epoch), 'wb') as fid: pickle.dump(caches_results, fid, pickle.HIGHEST_PROTOCOL) else: if max_val_recall < current_val_recall: max_val_recall = current_val_recall self.save_checkpoint(epoch) with open( osp.join(self.cfg.model_dir, 'val_metrics_%d.json' % epoch), 'w') as fp: json.dump(val_metrics, fp, indent=4, sort_keys=True) with open( osp.join(self.cfg.model_dir, 'val_top5_inds_%d.pkl' % epoch), 'wb') as fid: pickle.dump(caches_results, fid, pickle.HIGHEST_PROTOCOL)
def trainPG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length,
            learning_rate, reward_to_go, animate, logdir, normalize_advantages, nn_baseline,
            seed, n_layers, size):
    tic = time.time()
    setup_logger(logdir, locals())
    env = gym.make(env_name)
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    max_path_length = max_path_length or env.spec.max_episode_steps
    discrete = isinstance(env.action_space, gym.spaces.Discrete)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n if discrete else env.action_space.shape[0]

    ## Define Placeholders
    obs_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32, name='obs')
    if discrete:
        act_ph = tf.placeholder(shape=[None], dtype=tf.int32, name='act')
    else:
        act_ph = tf.placeholder(shape=[None, act_dim], dtype=tf.float32, name='act')
    adv_ph = tf.placeholder(shape=[None], dtype=tf.float32, name='adv')

    ## Build computation graph, define forward pass
    nn_out = build_mlp(input_ph=obs_ph,
                       output_size=act_dim,
                       scope='policy_model',
                       n_layers=n_layers,
                       size=size)
    if discrete:
        logits_ph = nn_out
        sampled_action_ph = tf.multinomial(logits=logits_ph, num_samples=1)[0]
        # -cross_entropy is exactly log pi(a|s) for the chosen actions
        logprob_ph = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=act_ph, logits=logits_ph)
    else:
        mu_ph = nn_out
        logstd_ph = tf.get_variable('logstd', [act_dim], dtype=tf.float32)
        # Sample with the dynamic batch shape and take log-probs from the distribution
        sampled_action_ph = mu_ph + tf.exp(logstd_ph) * tf.random_normal(tf.shape(mu_ph))
        mvn_dist = tf.contrib.distributions.MultivariateNormalDiag(
            loc=mu_ph, scale_diag=tf.exp(logstd_ph))
        logprob_ph = mvn_dist.log_prob(act_ph)

    ## Define Loss Function and Training Operation
    loss = tf.reduce_mean(-logprob_ph * adv_ph)
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    if nn_baseline:
        baseline_pred_ph = tf.squeeze(
            build_mlp(input_ph=obs_ph,
                      output_size=1,
                      scope='nn_baseline',
                      n_layers=n_layers,
                      size=size))
        baseline_target_ph = tf.placeholder(shape=[None], dtype=tf.float32, name='baseline')
        baseline_loss = tf.nn.l2_loss(baseline_pred_ph - baseline_target_ph)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss)

    ## Initialize Tensorflow Configs
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to "with self.sess:"
    tf.global_variables_initializer().run()

    ## Training Loop
    total_time_steps = 0
    for itr in range(n_iter):
        print("********* Iteration %i *********" % itr)

        ### Sample_Trajectories
        timesteps_this_batch = 0
        paths = []
        while True:
            #### Sample a trajectory
            observations, actions, rewards = [], [], []
            animate_this_episode = (animate and len(paths) == 0 and itr % 10 == 0)
            s = env.reset()
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.1)
                a = sess.run(sampled_action_ph, feed_dict={obs_ph: s[None]})
                a = a[0]
                sp, r, done, _ = env.step(a)
                observations.append(s)
                actions.append(a)
                rewards.append(r)
                steps += 1
                if done or steps > max_path_length:
                    break
                s = sp
            #### End of Sample a trajectory
            path = {
                'observation': np.array(observations, dtype=np.float32),
                'action': np.array(actions, dtype=np.int32 if discrete else np.float32),
                'reward': np.array(rewards, dtype=np.float32)
            }
            paths.append(path)
            timesteps_this_batch += steps
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_time_steps += timesteps_this_batch

        ## Build arrays for observation, action for the policy gradient update by
        ## concatenating across paths
        obs = np.concatenate([path['observation'] for path in paths])
        act = np.concatenate([path['action'] for path in paths])
        rew = [path['reward'] for path in paths]

        ## Estimate Return
        ### Compute Q-values
        qvals = []
        for path_rewards in rew:
            q_path = []
            q = 0
            for r in reversed(path_rewards):
                q = r + gamma * q
                q_path.append(q)
            if reward_to_go:
                q_path.reverse()
            else:
                q_path = [q for _ in range(len(path_rewards))]
            qvals.extend(q_path)
        qvals = np.array(qvals, dtype=np.float32)

        ### Compute Advantages
        if nn_baseline:
            bl = sess.run(baseline_pred_ph, feed_dict={obs_ph: obs})
            # The baseline is fit to normalized targets, so rescale its predictions
            bl = bl * np.std(qvals) + np.mean(qvals)
            adv = qvals - bl
            #### TODO: GAE implementation
        else:
            adv = qvals.copy()
        if normalize_advantages:
            adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-8)

        ## Policy Network Parameters Update
        if nn_baseline:
            # Fit the baseline to normalized Q-values, matching the rescaling above
            baseline_target = (qvals - np.mean(qvals)) / (np.std(qvals) + 1e-8)
            sess.run([baseline_update_op],
                     feed_dict={baseline_target_ph: baseline_target, obs_ph: obs})
        _, loss_policy = sess.run([update_op, loss],
                                  feed_dict={obs_ph: obs, act_ph: act, adv_ph: adv})

        # Log diagnostics
        returns = [path['reward'].sum() for path in paths]
        ep_lengths = [len(path['reward']) for path in paths]
        logz.log_tabular('Time', time.time() - tic)
        logz.log_tabular('Iteration', itr)
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_time_steps)
        logz.log_tabular("PolicyLoss", loss_policy)
        logz.dump_tabular()
        logz.pickle_tf_vars()
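# The nn_baseline branch above leaves "TODO: GAE implementation" open. Below is a
# self-contained sketch of generalized advantage estimation for a single trajectory,
# assuming per-step baseline predictions `values` (same length as `rewards`) and no
# bootstrap value after the final step; the function name and lambda default are
# assumptions, not part of the original code.
def gae_advantages_sketch(rewards, values, gamma, lam=0.95):
    adv = np.zeros(len(rewards), dtype=np.float32)
    last = 0.0
    for t in reversed(range(len(rewards))):
        next_value = values[t + 1] if t + 1 < len(rewards) else 0.0
        delta = rewards[t] + gamma * next_value - values[t]   # TD residual
        last = delta + gamma * lam * last                     # discounted sum of residuals
        adv[t] = last
    return adv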
def log_progress(self):
    episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()
    if len(episode_rewards) > 0:
        self.mean_episode_reward = np.mean(episode_rewards[-100:])
    if len(episode_rewards) > 100:
        self.best_mean_episode_reward = max(self.best_mean_episode_reward,
                                            self.mean_episode_reward)
    if self.t % self.log_every_n_steps == 0 and self.model_initialized:
        logz.log_tabular("TimeStep", self.t)
        logz.log_tabular("MeanReturn", self.mean_episode_reward)
        logz.log_tabular("BestMeanReturn",
                         max(self.best_mean_episode_reward, self.mean_episode_reward))
        logz.log_tabular("Episodes", len(episode_rewards))
        logz.log_tabular("Exploration", self.exploration.value(self.t))
        logz.log_tabular("LearningRate", self.optimizer_spec.lr_lambda(self.t))
        logz.log_tabular("Time", (time.time() - self.start_time) / 60.)
        logz.dump_tabular()
        logz.save_pytorch_model(self.q_net)
def learn(env, q_func, optimizer_spec, session, exploration=LinearSchedule(1000000, 0.1), stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000, grad_norm_clipping=10): assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete # Log the progress during the trainining start = time.time() logdir = 'pacman_hra_' + time.strftime("%d-%m-%Y_%H-%M-%S") logdir = os.path.join('hra_result', logdir) logz.configure_output_dir(logdir) args = inspect.getargspec(q_func)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) time_name = path.join(logdir, "rha_t.dat") mean_name = path.join(logdir, "rha_mean.dat") best_name = path.join(logdir, "rha_best.dat") if not os.path.exists(logdir): os.makedirs(logdir) times, mean_ep_rewards, best_ep_rewards = [], [], [] img_h, img_w, img_c = env.observation_space.shape input_shape = (img_h, img_w, frame_history_len * img_c) num_actions = env.action_space.n # set up placeholders # placeholder for current observation (or state) obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) # placeholder for current action act_t_ph = tf.placeholder(tf.int32, [None]) # placeholder for current reward rew_food_t_ph = tf.placeholder(tf.float32, [None]) rew_fruit_t_ph = tf.placeholder(tf.float32, [None]) rew_avoid_t_ph = tf.placeholder(tf.float32, [None]) rew_eat_t_ph = tf.placeholder(tf.float32, [None]) # placeholder for next observation (or state) obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) # placeholder for end of episode mask # this value is 1 if the next state corresponds to the end of an episode, # in which case there is no Q-value at the next state; at the end of an # episode, only the current state reward contributes to the target, not the # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) done_mask_ph = tf.placeholder(tf.float32, [None]) # casting to float on GPU ensures lower data transfer times. obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 # Here, you should fill in your own code to compute the Bellman error. This requires # evaluating the current and next Q-values and constructing the corresponding error. # TensorFlow will differentiate this error for you, you just need to pass it to the # optimizer. See assignment text for details. # Your code should produce one scalar-valued tensor: total_error # This will be passed to the optimizer in the provided code below. # Your code should also produce two collections of variables: # q_func_vars # target_q_func_vars # These should hold all of the variables of the Q-function network and target network, # respectively. A convenient way to get these is to make use of TF's "scope" feature. 
# For example, you can create your Q-function network with the scope "q_func" like this: # <something> = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) # And then you can obtain the variables like this: # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" ###### q_val = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) q_food, q_avoid, q_fruit, q_eat = q_val target_val = q_func(obs_tp1_float, num_actions, scope="target_q_func", reuse=False) target_food, target_avoid, target_fruit, target_eat = target_val q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_func') q_all = 1 / 4 * (q_food + q_avoid + q_fruit + q_eat) action_selected = tf.argmax(q_all, 1) q_act_food_t_val = tf.reduce_sum(q_food * tf.one_hot(act_t_ph, num_actions), axis=1) q_act_avoid_t_val = tf.reduce_sum(q_avoid * tf.one_hot(act_t_ph, num_actions), axis=1) q_act_fruit_t_val = tf.reduce_sum(q_fruit * tf.one_hot(act_t_ph, num_actions), axis=1) q_act_eat_t_val = tf.reduce_sum(q_eat * tf.one_hot(act_t_ph, num_actions), axis=1) y_food_t_val = rew_food_t_ph + (1 - done_mask_ph) * gamma * tf.reduce_max( target_food, axis=1) y_avoid_t_val = rew_avoid_t_ph + ( 1 - done_mask_ph) * gamma * tf.reduce_max(target_avoid, axis=1) y_fruit_t_val = rew_fruit_t_ph + ( 1 - done_mask_ph) * gamma * tf.reduce_max(target_fruit, axis=1) y_eat_t_val = rew_eat_t_ph + (1 - done_mask_ph) * gamma * tf.reduce_max( target_eat, axis=1) food_error = tf.reduce_mean( tf.losses.huber_loss(y_food_t_val, q_act_food_t_val)) avoid_error = tf.reduce_mean( tf.losses.huber_loss(y_avoid_t_val, q_act_avoid_t_val)) fruit_error = tf.reduce_mean( tf.losses.huber_loss(y_fruit_t_val, q_act_fruit_t_val)) eat_error = tf.reduce_mean( tf.losses.huber_loss(y_eat_t_val, q_act_eat_t_val)) ###### # construct optimization op (with gradient clipping) learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) train_food_fn = minimize_and_clip(optimizer, food_error, var_list=q_func_vars, clip_val=grad_norm_clipping) train_avoid_fn = minimize_and_clip(optimizer, avoid_error, var_list=q_func_vars, clip_val=grad_norm_clipping) train_fruit_fn = minimize_and_clip(optimizer, fruit_error, var_list=q_func_vars, clip_val=grad_norm_clipping) train_eat_fn = minimize_and_clip(optimizer, eat_error, var_list=q_func_vars, clip_val=grad_norm_clipping) # update_target_fn will be called periodically to copy Q network to target Q network update_target_fn = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_fn.append(var_target.assign(var)) update_target_fn = tf.group(*update_target_fn) # construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### model_initialized = False num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 for t in itertools.count(): ### 1. Check stopping criterion if stopping_criterion is not None and stopping_criterion(env, t): break ### 2. 
Step the env and store the transition # At this point, "last_obs" contains the latest observation that was # recorded from the simulator. Here, your code needs to store this # observation and its outcome (reward, next observation, etc.) into # the replay buffer while stepping the simulator forward one step. # At the end of this block of code, the simulator should have been # advanced one step, and the replay buffer should contain one more # transition. # Specifically, last_obs must point to the new latest observation. # Useful functions you'll need to call: # obs, reward, done, info = env.step(action) # this steps the environment forward one step # obs = env.reset() # this resets the environment if you reached an episode boundary. # Don't forget to call env.reset() to get a new observation if done # is true!! # Note that you cannot use "last_obs" directly as input # into your network, since it needs to be processed to include context # from previous frames. You should check out the replay buffer # implementation in dqn_utils.py to see what functionality the replay # buffer exposes. The replay buffer has a function called # encode_recent_observation that will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Don't forget to include epsilon greedy exploration! # And remember that the first time you enter this loop, the model # may not yet have been initialized (but of course, the first step # might as well be random, since you haven't trained your net...) ##### idx = replay_buffer.store_frame(last_obs, rha_shape=4) epsilon = exploration.value(t) if not model_initialized or np.random.rand(1) < epsilon: action = env.action_space.sample() else: obs_input = replay_buffer.encode_recent_observation()[None, :] action = session.run(action_selected, feed_dict={obs_t_ph: obs_input}) obs, reward, done, info = env.step(action) replay_buffer.store_effect(idx, action, reward, done) if done: obs = env.reset() last_obs = obs ##### # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). # 3.b: initialize the model if it has not been initialized yet; to do # that, call # initialize_interdependent_variables(session, tf.global_variables(), { # obs_t_ph: obs_t_batch, # obs_tp1_ph: obs_tp1_batch, # }) # where obs_t_batch and obs_tp1_batch are the batches of observations at # the current and next time step. The boolean variable model_initialized # indicates whether or not the model has been initialized. # Remember that you have to update the target network too (see 3.d)! # 3.c: train the model. 
To do this, you'll need to use the train_fn and # total_error ops that were created earlier: total_error is what you # created to compute the total Bellman error in a batch, and train_fn # will actually perform a gradient step and update the network parameters # to reduce total_error. When calling session.run on these you'll need to # populate the following placeholders: # obs_t_ph # act_t_ph # rew_t_ph # obs_tp1_ph # done_mask_ph # (this is needed for computing total_error) # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) # (this is needed by the optimizer to choose the learning rate) # 3.d: periodically update the target network by calling # session.run(update_target_fn) # you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) ##### obs_t_batch, act_t_batch, rew_t_batch, obs_tp1_batch, done_mask_batch = replay_buffer.sample( batch_size) rew_food_t_batch = rew_t_batch[:, 0] rew_fruit_t_batch = rew_t_batch[:, 1] rew_avoid_t_batch = rew_t_batch[:, 2] rew_eat_t_batch = rew_t_batch[:, 3] if not model_initialized: initialize_interdependent_variables( session, tf.global_variables(), { obs_t_ph: obs_t_batch, obs_tp1_ph: obs_tp1_batch }) session.run(update_target_fn) model_initialized = True session.run(train_food_fn, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_food_t_ph: rew_food_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch, learning_rate: optimizer_spec.lr_schedule.value(t) }) session.run(train_avoid_fn, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_avoid_t_ph: rew_avoid_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch, learning_rate: optimizer_spec.lr_schedule.value(t) }) session.run(train_fruit_fn, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_fruit_t_ph: rew_fruit_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch, learning_rate: optimizer_spec.lr_schedule.value(t) }) session.run(train_eat_fn, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_eat_t_ph: rew_eat_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch, learning_rate: optimizer_spec.lr_schedule.value(t) }) if num_param_updates % target_update_freq == 0: session.run(update_target_fn) train_food_loss = session.run(food_error, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_food_t_ph: rew_food_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch }) train_avoid_loss = session.run(avoid_error, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_avoid_t_ph: rew_avoid_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch }) train_fruit_loss = session.run(fruit_error, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_fruit_t_ph: rew_fruit_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch }) train_eat_loss = session.run(eat_error, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_eat_t_ph: rew_eat_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch }) train_loss = (train_food_loss + train_avoid_loss + train_fruit_loss + train_eat_loss) / 4. print("Loss at iteration {} is: {}".format(t, train_loss)) num_param_updates += 1 ##### ### 4. 
Log progress episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and model_initialized: times.append(t) mean_ep_rewards.append(mean_episode_reward) best_ep_rewards.append(best_mean_episode_reward) joblib.dump(value=times, filename=time_name, compress=3) joblib.dump(value=mean_ep_rewards, filename=mean_name, compress=3) joblib.dump(value=best_ep_rewards, filename=best_name, compress=3) logz.log_tabular("Training Time", time.time() - start) logz.log_tabular("Loss", train_loss) logz.log_tabular("Iteration", t) logz.log_tabular("Mean Reward (/100ep)", mean_episode_reward) logz.log_tabular("Best Mean Reward", best_mean_episode_reward) logz.log_tabular("Episodes", len(episode_rewards)) logz.log_tabular("Exploration", exploration.value(t)) logz.log_tabular("Learning Rate", optimizer_spec.lr_schedule.value(t)) logz.dump_tabular() sys.stdout.flush() return times, mean_ep_rewards, best_ep_rewards
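# learn() above uses a hybrid-reward setup: q_func returns four heads (food, avoid,
# fruit, eat), their Q-values are averaged for action selection, and each head gets its
# own Huber loss. For reference, the single-head Bellman error described by the scaffold
# comments would look roughly like the sketch below; it assumes a q_func variant that
# returns a single [batch, num_actions] tensor, which is not the q_func used above.
def single_head_bellman_error_sketch(q_func, obs_t_float, obs_tp1_float, act_t_ph,
                                     rew_t_ph, done_mask_ph, num_actions, gamma):
    q_t = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)
    q_tp1 = q_func(obs_tp1_float, num_actions, scope="target_q_func", reuse=False)
    q_act_t = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), axis=1)
    y_t = rew_t_ph + (1.0 - done_mask_ph) * gamma * tf.reduce_max(q_tp1, axis=1)
    return tf.reduce_mean(tf.losses.huber_loss(y_t, q_act_t))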
def train_PG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length, learning_rate, reward_to_go, animate, logdir, normalize_advantages, nn_baseline, seed, n_layers, size, epoch, evaluate=False): start = time.time() # ========================================================================================# # Set Up Logger # ========================================================================================# setup_logger(logdir, locals()) # ========================================================================================# # Set Up Env # ========================================================================================# # Make the gym environment env = gym.make(env_name) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) env.seed(seed) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # Is this env continuous, or self.discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] # ========================================================================================# # Initialize Agent # ========================================================================================# computation_graph_args = { 'n_layers': n_layers, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'discrete': discrete, 'size': size, 'learning_rate': learning_rate, } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, } estimate_return_args = { 'gamma': gamma, 'reward_to_go': reward_to_go, 'nn_baseline': nn_baseline, 'normalize_advantages': normalize_advantages, } agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args) # build computation graph agent.build_computation_graph( '/tmp/hw2/%s/seed_%d_lr_%f_batch_%d_epoch_%d' % (env_name, seed, learning_rate, min_timesteps_per_batch, epoch)) # tensorflow: config, session, variable initialization agent.init_tf_sess() # ========================================================================================# # Training Loop # ========================================================================================# if evaluate: reward = 0 agent.load_model(799) for _ in range(10): path = agent.sample_trajectory(env, True) reward += path['reward'] print("Mean Reward:", sum(reward) / 10) else: total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) paths, timesteps_this_batch = agent.sample_trajectories(itr, env) total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) re_n = [path["reward"] for path in paths] q_n, adv_n = agent.estimate_return(ob_no, re_n) agent.update_parameters(ob_no, ac_na, q_n, adv_n, itr, epoch) agent.copy_new_to_old() # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) 
logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars() agent.add_to_tensorboard(returns, ep_lengths, itr) if (itr + 1) % 100 == 0: agent.save_model(itr)
def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False): tf.set_random_seed(seed) np.random.seed(seed) env = gym.make("Pendulum-v0") ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] logz.configure_output_dir(logdir) if vf_type == 'linear': vf = LinearValueFunction(**vf_params) elif vf_type == 'nn': vf = NnValueFunction(ob_dim=ob_dim, **vf_params) YOUR_CODE_HERE sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) sess = tf.Session() sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 total_timesteps = 0 stepsize = initial_stepsize for i in range(n_iter): print("********** Iteration %i ************"%i) YOUR_CODE_HERE if kl > desired_kl * 2: stepsize /= 1.5 print('stepsize -> %s'%stepsize) elif kl < desired_kl / 2: stepsize *= 1.5 print('stepsize -> %s'%stepsize) else: print('stepsize OK') # Log diagnostics logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logz.log_tabular("KLOldNew", kl) logz.log_tabular("Entropy", ent) logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) logz.log_tabular("TimestepsSoFar", total_timesteps) # If you're overfitting, EVAfter will be way larger than EVBefore. # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias logz.dump_tabular()
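# The EVBefore/EVAfter diagnostics above come from an explained_variance_1d helper.
# A minimal NumPy sketch of that diagnostic, assuming the usual definition
# 1 - Var(y - ypred) / Var(y): 1.0 is a perfect fit, 0.0 is no better than predicting
# the mean, and negative values are worse than the mean.
import numpy as np

def explained_variance_1d_sketch(ypred, y):
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary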
def fit(self, dataset): """ """ self.graph = self.build_computation_graph() config = tf.ConfigProto() config.gpu_options.allow_growth = True self.sess = tf.Session(graph=self.graph, config = config) self.sess.run(self.initializer) num_steps_in_epoch = dataset.num_train // self.n_batch n_step = num_steps_in_epoch * self.n_epoch start = time.time() np.random.seed(self.seed) try: self.learning_curve['train'].clear() self.learning_curve['val'].clear() loss_train = 0. for step in range(n_step): local_batch = dataset.next_batch(self.n_batch) loss_train += self.compute_loss(self.sess, batch = local_batch, optimize=True) if (step + 1) % num_steps_in_epoch == 0: train_error = self.n_batch / dataset.num_train * loss_train val_error = self.compute_loss(self.sess, batch = dataset.testdata(), optimize = False) # Return the negative error to allow monitoring for the ELBO. self.learning_curve['train'] += [train_error] self.learning_curve['val'] += [val_error] loss_train = 0. logz.log_tabular("Time", time.time() - start) logz.log_tabular("Fold", dataset.test_fold) logz.log_tabular("Epoch", dataset.epochs_completed) logz.log_tabular("BatchStep", step) logz.log_tabular("TrainError", train_error) logz.log_tabular("ValError", val_error) logz.dump_tabular() # print('epoch: {:2d}, step: {:5d}, training error: {:03.4f}, ' # 'validation error: {:03.4f}, time elapsed: {:4.0f} s' # .format(dataset.epochs_completed, step, train_error, val_error, time.time() - start)) except KeyboardInterrupt: print('ending training') finally: # If interrupted or stopped, store the progress of the model. self.saver.save(self.sess, self.checkpoint_path) self.sess.close() #coord.request_stop() #coord.join(threads) print('finished training') return self
def main_pendulum(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=False, logfile=None): env = gym.make("Pendulum-v0") ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] logz.configure_output_file(logfile) #vf = LinearValueFunction() vf = NeuralValueFunction(ob_dim) # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in these functions sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.float32) # batch of actions taken by the policy, used for policy gradient computation sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate sy_h1 = tf.nn.relu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer sy_mean_n = dense(sy_h1, ac_dim, "final", weight_init=normc_initializer(0.05)) # Mean control output sy_logstd_n = tf.Variable(tf.zeros([ac_dim])) sy_std_n = tf.exp(sy_logstd_n) # Get probabilities from normal distribution and sample from distribution dist = tf.contrib.distributions.Normal(mu=tf.reshape(sy_mean_n,[-1]), sigma=sy_std_n) sy_logprob_n = tf.reshape(tf.log(dist.pdf(sy_ac_n)),[-1]) sy_n = tf.shape(sy_ob_no)[0] sy_sampled_ac = dist.sample(sy_n) # sampled actions, used for defining the policy (NOT computing the policy gradient) # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> sy_mean_n_old = tf.placeholder(shape=[None, ac_dim], name='old_mean', dtype=tf.float32) sy_std_n_old = tf.placeholder(shape=[ac_dim], name='old_std', dtype=tf.float32) sy_kl = tf.reduce_sum(tf.log(sy_std_n/sy_std_n_old)+(sy_std_n_old**2+(sy_mean_n_old-sy_mean_n)**2)/(2*sy_std_n**2)-0.5)/tf.to_float(sy_n) # KL between diagonal Gaussians uses squared std and mean-difference terms sy_ent = tf.reduce_sum((1+tf.log(2*math.pi*sy_std_n**2))*0.5) # Gaussian entropy is positive: 0.5*(1 + log(2*pi*sigma^2)) per dimension # <<<<<<<<<<<<< sy_surr = -tf.reduce_mean(sy_adv_n*sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization.
(We're not doing that currently) update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) sess = tf.Session() sess.__enter__() sess.run(tf.global_variables_initializer()) total_timesteps = 0 obs_mean = np.zeros(ob_dim) obs_std = np.zeros(ob_dim) for i in range(n_iter): print("********** Iteration %i ************"%i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() terminated = False obs, acs, rewards = [], [], [] animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate) while True: if animate_this_episode: env.render() obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) acs.append(ac.flatten()) ob, rew, done, _ = env.step(ac) rewards.append(rew.flatten()) ob = ob.flatten() if done: break path = {"observation" : np.array(obs), "terminated" : terminated, "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Estimate advantage function vtargs, vpreds, advs = [], [], [] for path in paths: rew_t = path["reward"] return_t = discount(rew_t, gamma) vpred_t = vf.predict((path["observation"]-obs_mean)/(obs_std+1e-8)) adv_t = return_t.flatten() - vpred_t advs.append(adv_t) vtargs.append(return_t) vpreds.append(vpred_t) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) ac_n = np.concatenate([path["action"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n-adv_n.mean())/(adv_n.std()+1e-8) vtarg_n = np.concatenate(vtargs).flatten() vpred_n = np.concatenate(vpreds) obs_mean = np.average(ob_no,axis=0) obs_std = np.std(ob_no,axis=0) vf.fit((ob_no-obs_mean)/(obs_std+1e-8), vtarg_n) # Policy update _, mean_n, std_n = sess.run([update_op, sy_mean_n, sy_std_n], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n.flatten(), sy_adv_n:standardized_adv_n, sy_stepsize:stepsize}) kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_mean_n_old: mean_n, sy_std_n_old: std_n}) desired_kl = 2e-3 if kl > desired_kl * 2: stepsize /= 1.5 print('stepsize -> %s'%stepsize) elif kl < desired_kl / 2: stepsize *= 1.5 print('stepsize -> %s'%stepsize) else: print('stepsize OK') # Log diagnostics logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logz.log_tabular("KLOldNew", kl) logz.log_tabular("Entropy", ent) logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) logz.log_tabular("TimestepsSoFar", total_timesteps) # If you're overfitting, EVAfter will be way larger than EVBefore. # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias logz.dump_tabular()
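# Reference closed forms behind the sy_kl / sy_ent diagnostics above, written out in
# NumPy as a sketch (not the script's API). For diagonal Gaussians,
#   KL(old || new) = log(std_new/std_old) + (std_old^2 + (mean_old - mean_new)^2) / (2 * std_new^2) - 1/2
#   H(new)         = 1/2 * (1 + log(2 * pi * std_new^2))
# summed over action dimensions, with the KL averaged over the batch.
import numpy as np

def gaussian_kl_and_entropy(mean_old, std_old, mean_new, std_new):
    kl = (np.log(std_new / std_old)
          + (std_old ** 2 + (mean_old - mean_new) ** 2) / (2.0 * std_new ** 2)
          - 0.5)
    ent = 0.5 * (1.0 + np.log(2.0 * np.pi * std_new ** 2))
    return kl.sum(axis=-1).mean(), ent.sum()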
def train(pub_cmd, pub_act, rate, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. rand_controller = RandomController() paths = sample(pub_cmd, pub_act, rate, rand_controller, num_paths_random, env_horizon, render) data = paths_to_array(paths) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # normalization = compute_normalization(data) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): # Fit dynamics model print('Training dynamics model...') dyn_model.fit(data) mpc_controller.dyn_model = dyn_model costs = [] returns = [] # Do MPC for i in range(num_paths_onpol): print('On policy path: %i' % i) obs_t, obs_tp1, acs_t, rews_t = [], [], [], [] s_t = reset(pub_cmd, rate) total_return = 0 for j in range(env_horizon): # print('Timestep: %i, Return: %g' % (j,total_return)) a_t = mpc_controller.get_action(s_t) s_tp1, _ = step(a_t, pub_act, pub_cmd, rate) r_t = 0 for i in range(9): r_t += s_tp1[i * 12] - s_t[i * 12] total_return += r_t if render: env.render() time.sleep(0.05) obs_t.append(s_t) obs_tp1.append(s_tp1) acs_t.append(a_t) rews_t.append(r_t) s_t = s_tp1 path = { "observations": np.array(obs_t), "next_observations": np.array(obs_tp1), "actions": np.array(acs_t), "rewards": np.array(rews_t) } total_cost = path_cost(cost_fn, path) paths.append(path) returns.append(total_return) costs.append(total_cost) print('Total cost: %g, Total reward: %g' % (total_cost, total_return)) data = paths_to_array(paths) normalization = compute_normalization(data) # Set new normalization statistics for dynamics model dyn_model.normalization = normalization # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} del params['cost_fn'] del params['activation'] del params['output_activation'] del params['env'] logz.save_params(params) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) """ YOUR CODE HERE """ # Sample from random controller paths = sample(env, random_controller, num_paths_random, env_horizon, render, True) # Build data set data = dict() data['observations'] = np.concatenate( [path['observations'] for path in paths]) data['actions'] = np.concatenate([path['actions'] for path in paths]) next_observations = np.concatenate( [path['next_observations'] for path in paths]) data['deltas'] = next_observations - data['observations'] #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ normalization = compute_normalization(data) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): """ YOUR CODE HERE """ # Refit dynamic model dyn_model.fit(data) # Sample on-policy trajectories paths = sample(env, mpc_controller, num_paths_onpol, env_horizon, render, True) # Summarize trajectories costs = [path_cost(cost_fn, path) for path in paths] returns = [np.sum(path['rewards']) for path in paths] # Aggregate data onpol_observations = np.concatenate( [path['observations'] for path in paths]) onpol_actions = np.concatenate([path['actions'] for path in paths]) onpol_next_observations = np.concatenate( [path['next_observations'] for path in paths]) onpol_deltas = onpol_next_observations - onpol_observations data['observations'] = np.append(data['observations'], onpol_observations, 0) data['actions'] = np.append(data['actions'], onpol_actions, 0) data['deltas'] = np.append(data['deltas'], onpol_deltas, 0) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
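# The model-based train() variants above lean on a compute_normalization helper. A
# minimal stand-in sketch (the real helper may return a different structure): per-component
# mean/std for observations, actions and deltas o_{t+1} - o_t, so the dynamics network
# can consume normalized inputs and emit normalized delta predictions that are
# denormalized before being added to o_t.
import numpy as np

def compute_normalization_sketch(data, eps=1e-8):
    return {key: (data[key].mean(axis=0), data[key].std(axis=0) + eps)
            for key in ('observations', 'actions', 'deltas')}

def normalize(x, mean_std):
    mean, std = mean_std
    return (x - mean) / std

def denormalize(x, mean_std):
    mean, std = mean_std
    return x * std + mean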
def train_mf(self): self.start_worker() self.init_opt() logz.configure_output_dir( "/home/hendawy/Desktop/2DOF_Robotic_Arm_withSphereObstacle/Rr", 1807) for itr in range(self.current_itr, self.n_itr): with logger.prefix('itr #%d | ' % itr): paths = self.sampler.obtain_samples(itr, Constrained=True) samples_data, analysis_data = self.sampler.process_samples( itr, paths) self.log_diagnostics(paths) optimization_data = self.optimize_policy(itr, samples_data) logz.log_tabular('Iteration', analysis_data["Iteration"]) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageDiscountedReturn', analysis_data["AverageDiscountedReturn"]) logz.log_tabular('AverageReturns', analysis_data["AverageReturn"]) logz.log_tabular('violation_cost', np.mean(samples_data["violation_cost"])) logz.log_tabular( 'boundary_violation_cost', np.mean(samples_data["boundary_violation_cost"])) logz.log_tabular('success_rate', samples_data["success_rate"]) logz.log_tabular( 'successful_AverageReturn', np.mean(samples_data["successful_AverageReturn"])) logz.log_tabular('ExplainedVariance', analysis_data["ExplainedVariance"]) logz.log_tabular('NumTrajs', analysis_data["NumTrajs"]) logz.log_tabular('Entropy', analysis_data["Entropy"]) logz.log_tabular('Perplexity', analysis_data["Perplexity"]) logz.log_tabular('StdReturn', analysis_data["StdReturn"]) logz.log_tabular('MaxReturn', analysis_data["MaxReturn"]) logz.log_tabular('MinReturn', analysis_data["MinReturn"]) logz.log_tabular('LossBefore', optimization_data["LossBefore"]) logz.log_tabular('LossAfter', optimization_data["LossAfter"]) logz.log_tabular('MeanKLBefore', optimization_data["MeanKLBefore"]) logz.log_tabular('MeanKL', optimization_data["MeanKL"]) logz.log_tabular('dLoss', optimization_data["dLoss"]) logz.dump_tabular() logger.log("saving snapshot...") params = self.get_itr_snapshot(itr, samples_data) self.current_itr = itr + 1 params["algo"] = self if self.store_paths: params["paths"] = samples_data["paths"] logger.save_itr_params(itr, params) logger.log("saved") logger.dump_tabular(with_prefix=False) if self.plot: self.update_plot() if self.pause_for_plot: input("Plotting evaluation run: Press Enter to " "continue...") self.shutdown_worker()
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) """ YOUR CODE HERE """ paths = sample(env=env, controller=random_controller, num_paths=num_paths_random, horizon=env_horizon, verbose=False) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ normalization = { "observations": compute_normalization(paths["observations"]), "actions": compute_normalization(paths["actions"]), "deltas": compute_normalization(paths["next_observations"] - paths["observations"]) } #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): """ YOUR CODE HERE """ shuffle_indexes = np.random.permutation(paths["observations"].shape[0]) for key in ['observations', 'actions', 'next_observations', 'rewards']: paths[key] = paths[key][shuffle_indexes] dyn_model.fit(paths) newpaths = sample(env=env, controller=mpc_controller, num_paths=num_paths_onpol, horizon=env_horizon, verbose=False) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
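# path_cost(cost_fn, path) appears in all of the MPC loops above. An illustrative
# stand-in, assuming a cost function that scores individual transitions (s_t, a_t, s_{t+1});
# the repo's helper may instead vectorize over whole trajectories.
def path_cost_sketch(cost_fn, path):
    return sum(cost_fn(s, a, s_next)
               for s, a, s_next in zip(path['observations'],
                                       path['actions'],
                                       path['next_observations']))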
def train(self): args = self.args mnist = self.data t_start = time.time() for ee in range(args.epochs): # Resample the hyperparameters if we're doing HMC. if args.algo == 'hmc': hparams = self.hmc_updater.update_hparams() hmc_info = defaultdict(list) for ii in range(self.num_train_mbs): xs = self.data_mb_list['X_train'][ii] ys = self.data_mb_list['y_train'][ii] if args.algo == 'hmc': info = self.hmc_updater.hmc(xs, ys, hparams) for key in info: hmc_info[key].append(info[key]) else: feed = {self.x_BO: xs, self.y_targ_B: ys} _, grads, loss = self.sess.run( [self.train_op, self.grads, self.loss], feed) # Log after each epoch, if desired and test on validation. if (ee % args.log_every_t_epochs == 0): acc_valid, loss_valid = self._check_validation() print("\n ************ Epoch %i ************" % (ee + 1)) elapsed_time_hours = (time.time() - t_start) / (60.0**2) if args.algo == 'hmc': for ww, hp in zip(self.weights, hparams): print("{:10} -- plambda={:.3f}".format( str(ww.get_shape().as_list()), hp)) logz.log_tabular("HMCAcceptRateEpoch", np.mean(hmc_info['accept'])) logz.log_tabular("KineticOldMean", np.mean(hmc_info['K_old'])) logz.log_tabular("KineticNewMean", np.mean(hmc_info['K_new'])) logz.log_tabular("PotentialOldMean", np.mean(hmc_info['U_old'])) logz.log_tabular("PotentialNewMean", np.mean(hmc_info['U_new'])) logz.log_tabular("HamiltonianOldMean", np.mean(hmc_info['H_old'])) logz.log_tabular("HamiltonianNewMean", np.mean(hmc_info['H_new'])) logz.log_tabular("ValidAcc", acc_valid) logz.log_tabular("ValidLoss", loss_valid) logz.log_tabular("Temperature", args.temperature) logz.log_tabular("TimeHours", elapsed_time_hours) logz.log_tabular("Epochs", ee) logz.dump_tabular()
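# The HMC branch above logs kinetic/potential/Hamiltonian terms and an accept rate.
# A sketch of the Metropolis correction those quantities imply (illustrative only;
# the actual hmc_updater may differ): accept the proposal with probability
# min(1, exp(H_old - H_new)), where H = U + K.
import numpy as np

def hmc_accept(U_old, K_old, U_new, K_new, rng=np.random):
    H_old, H_new = U_old + K_old, U_new + K_new
    return rng.uniform() < np.exp(min(0.0, H_old - H_new))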
def train(self, num_iter): log_name = "seed_{0}".format(self.seed) logger = Logger(logname=self.env_name, now=log_name) start = time.time() for i in range(num_iter): t1 = time.time() reward_avg_loss = self.train_step() t2 = time.time() print('total time of one step', t2 - t1) print('iter ', i, ' done') # record statistics every 10 iterations if ((i + 1) % 10 == 0): rewards = self.aggregate_rollouts(num_rollouts=100, evaluate=True) w = ray.get(self.workers[0].get_weights_plus_stats.remote()) np.savez(self.logdir + "/lin_policy_plus", w) # # output reward function loss # test_loss_list = [] # test_size = len(test_dataset_x) # assert len(test_dataset_x) == len(test_dataset_y) # test_dataset_x = np.array(test_dataset_x) # test_dataset_y = np.array(test_dataset_y).reshape(-1,1) # num_batch = int(test_size / self.batch_size) # for idx in range(num_batch): # test_loss_list.append(self.reward_func.sess.run(self.reward_func.loss, feed_dict={self.reward_func.input_ph: test_dataset_x[idx*self.batch_size: (idx+1)*self.batch_size], # self.reward_func.reward_ph: test_dataset_y[idx*self.batch_size: (idx+1)*self.batch_size]})) # test_avg_loss = np.mean(test_loss_list) print(sorted(self.params.items())) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", i + 1) logz.log_tabular("AverageReward", np.mean(rewards)) logz.log_tabular("StdRewards", np.std(rewards)) logz.log_tabular("MaxRewardRollout", np.max(rewards)) logz.log_tabular("MinRewardRollout", np.min(rewards)) logz.log_tabular("timesteps", self.timesteps) logz.log_tabular("AvgRewardFunctionLoss", reward_avg_loss) # logz.log_tabular("AvgRewardTestLoss", test_avg_loss) logz.dump_tabular() logger.log({ "Time": time.time() - start, "Iteration": i + 1, "AverageReward": np.mean(rewards), "StdRewards": np.std(rewards), "MaxRewardRollout": np.max(rewards), "MinRewardRollout": np.min(rewards), "timesteps": self.timesteps }) logger.write(display=False) t1 = time.time() # get statistics from all workers for j in range(self.num_workers): self.policy.observation_filter.update( ray.get(self.workers[j].get_filter.remote())) self.policy.observation_filter.stats_increment() # make sure master filter buffer is clear self.policy.observation_filter.clear_buffer() # sync all workers filter_id = ray.put(self.policy.observation_filter) setting_filters_ids = [ worker.sync_filter.remote(filter_id) for worker in self.workers ] # waiting for sync of all workers ray.get(setting_filters_ids) increment_filters_ids = [ worker.stats_increment.remote() for worker in self.workers ] # waiting for increment of all workers ray.get(increment_filters_ids) t2 = time.time() print('Time to sync statistics:', t2 - t1) np.savetxt("dataset_x.txt", self.dataset_x) np.savetxt("dataset_y.txt", self.dataset_y) logger.close() return
def train_SAC(env_name, exp_name, seed, logdir, extra_params=None): alpha = { 'Ant-v2': 0.1, 'HalfCheetah-v2': 0.2, 'Hopper-v2': 0.2, 'Humanoid-v2': 0.05, 'Walker2d-v2': 0.2, }.get(env_name, 0.2) algorithm_params = { 'alpha': alpha, 'batch_size': 256, 'discount': 0.99, 'learning_rate': 1e-3, 'reparameterize': get_extra_param(extra_params, 'reparameterize', False), 'tau': 0.01, 'epoch_length': 1000, 'n_epochs': 500, 'two_qf': get_extra_param(extra_params, 'two_qf', False), } sampler_params = { 'max_episode_length': 1000, 'prefill_steps': 1000, } replay_pool_params = { 'max_size': 1e6, } value_function_params = { 'hidden_layer_sizes': (128, 128), } q_function_params = { 'hidden_layer_sizes': (128, 128), } policy_params = { 'hidden_layer_sizes': (128, 128), } logz.configure_output_dir(logdir) params = { 'exp_name': exp_name, 'env_name': env_name, 'algorithm_params': algorithm_params, 'sampler_params': sampler_params, 'replay_pool_params': replay_pool_params, 'value_function_params': value_function_params, 'q_function_params': q_function_params, 'policy_params': policy_params } logz.save_params(params) env = gym.envs.make(env_name) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) env.seed(seed) sampler = utils.SimpleSampler(**sampler_params) replay_pool = utils.SimpleReplayPool( observation_shape=env.observation_space.shape, action_shape=env.action_space.shape, **replay_pool_params) q_function = nn.QFunction(name='q_function', **q_function_params) if algorithm_params.get('two_qf', False): q_function2 = nn.QFunction(name='q_function2', **q_function_params) else: q_function2 = None value_function = nn.ValueFunction(name='value_function', **value_function_params) target_value_function = nn.ValueFunction(name='target_value_function', **value_function_params) policy = nn.GaussianPolicy( action_dim=env.action_space.shape[0], reparameterize=algorithm_params['reparameterize'], **policy_params) sampler.initialize(env, policy, replay_pool) algorithm = SAC(**algorithm_params) tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) tf_config.gpu_options.allow_growth = True # may need if using GPU with tf.Session(config=tf_config): algorithm.build(env=env, policy=policy, q_function=q_function, q_function2=q_function2, value_function=value_function, target_value_function=target_value_function) for epoch in algorithm.train(sampler, n_epochs=algorithm_params.get( 'n_epochs', 1000)): logz.log_tabular('Iteration', epoch) for k, v in algorithm.get_statistics().items(): logz.log_tabular(k, v) for k, v in replay_pool.get_statistics().items(): logz.log_tabular(k, v) for k, v in sampler.get_statistics().items(): logz.log_tabular(k, v) logz.dump_tabular()
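# train_SAC above wires up a policy, one or two Q-functions, a value function and a
# target value function. A sketch of the regression targets that setup typically uses
# (illustrative names, not this repo's API); alpha is the per-environment entropy
# coefficient chosen at the top of train_SAC.
import numpy as np

def soft_targets(rewards, dones, v_target_next, q_min, log_pi, alpha, discount=0.99):
    # Q target bootstraps through the *target* value network, except at terminals.
    q_target = rewards + discount * (1.0 - dones) * v_target_next
    # V target is the soft value under the current policy, using min(Q1, Q2)
    # when two_qf is enabled.
    v_target = q_min - alpha * log_pi
    return q_target, v_target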
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, to_animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, video_dir=None): start = time.time() nn_params = {"n_layers": n_layers, "size": size, "lr": learning_rate} # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) #env._max_episode_steps = 4000 to_animate = ToAnimate(False) to_animate.animate = False if video_dir is not None: env = gym.wrappers.Monitor(env, video_dir, force=True, video_callable=to_animate) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] (sy_sampled_ac, sy_ob_no, sy_ac_na, sy_adv_n), (update_op, loss) = get_policy_gradient_NN(ob_dim, ac_dim, discrete, nn_params) if nn_baseline: baseline_predictor = BaselinePredictor(sy_ob_no, epoch_num=500, nn_params=nn_params) tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() # pylint: disable=E1101 # Training Loop total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps for one batch paths, num_collected_timesteps = collect_paths( sess, sy_sampled_ac, sy_ob_no, env, min_timesteps, max_path_length, to_animate, itr, discrete) total_timesteps += num_collected_timesteps # Build arrays for observation, action for the policy gradient update # by concatenating across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) q_n = get_reward(paths, gamma, reward_to_go) if nn_baseline: # Getting baselines for each timesteps b_n = baseline_predictor.predict(ob_no)[0] # Rescaling the output to mach statistics of Q-values b_n = (b_n - np.mean(b_n)) / np.std(b_n) b_n = np.mean(q_n) + (b_n * np.std(q_n)) adv_n = q_n - b_n else: adv_n = q_n.copy() if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
adv_n = (adv_n - np.mean(adv_n)) / np.std(adv_n) if nn_baseline: baseline_predictor.fit(inputs=ob_no, labels=(q_n - np.mean(q_n)) / np.std(q_n), n_iter=1) if discrete: ac_na = ac_na.flatten() # FIXME loss_before = sess.run( loss, feed_dict={ sy_ob_no: ob_no, # observation sy_ac_na: ac_na, # taken actions sy_adv_n: adv_n # advantages }) sess.run( update_op, feed_dict={ sy_ob_no: ob_no, # observation sy_ac_na: ac_na, # taken actions sy_adv_n: adv_n # advantages }) loss_after = sess.run( loss, feed_dict={ sy_ob_no: ob_no, # observation sy_ac_na: ac_na, # taken actions sy_adv_n: adv_n # advantages }) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] #logz.log_tabular("Loss_before", loss_before) logz.log_tabular("Loss_after", loss_after) logz.log_tabular("delta_loss", loss_after - loss_before) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", len(ac_na)) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
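# The baseline handling above rescales the raw baseline predictions to the statistics of
# the current batch of Q-values and fits the baseline to standardized Q-values. The trick
# in isolation, as a small NumPy sketch with illustrative names:
import numpy as np

def rescale_baseline(b_raw, q_n, eps=1e-8):
    # Standardize the raw predictions, then map them onto the Q-value mean/std.
    b_std = (b_raw - b_raw.mean()) / (b_raw.std() + eps)
    return q_n.mean() + q_n.std() * b_std

def baseline_fit_targets(q_n, eps=1e-8):
    # Regress the baseline network onto standardized Q-values rather than raw ones.
    return (q_n - q_n.mean()) / (q_n.std() + eps)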
def train_PG( exp_name='', env_name='', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=False, animate=True, logdir=None, normalize_advantages=False, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, # mb mpc arguments model_learning_rate=1e-3, onpol_iters=10, dynamics_iters=260, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=1000, env_horizon=1000, mpc_horizon=10, m_n_layers=2, m_size=500, ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment # env = gym.make(env_name) env = HalfCheetahEnvNew() cost_fn = cheetah_cost_fn activation=tf.nn.relu output_activation=None # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes # max_path_length = max_path_length or env.spec.max_episode_steps max_path_length = max_path_length # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] # Print environment infomation print("-------- env info --------") print("Environment name: ", env_name) print("Action space is discrete: ", discrete) print("Action space dim: ", ac_dim) print("Observation space dim: ", ob_dim) print("Max_path_length ", max_path_length) #========================================================================================# # Random data collection #========================================================================================# random_controller = RandomController(env) data_buffer_model = DataBuffer() data_buffer_ppo = DataBuffer_general(10000, 4) # sample path print("collecting random data ..... 
") paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) # add into buffer for path in paths: for n in range(len(path['observations'])): data_buffer_model.add(path['observations'][n], path['actions'][n], path['next_observations'][n]) print("data buffer size: ", data_buffer_model.size) normalization = compute_normalization(data_buffer_model) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto() tf_config.allow_soft_placement = True tf_config.intra_op_parallelism_threads =4 tf_config.inter_op_parallelism_threads = 1 sess = tf.Session(config=tf_config) dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate) if nn_baseline: value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) if MPC: dyn_model.fit(data_buffer_model) returns = [] costs = [] # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: # print("data buffer size: ", data_buffer_model.size) current_path = {'observations': [], 'actions': [], 'reward': [], 'next_observations':[]} ob = env.reset() obs, acs, mpc_acs, rewards = [], [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 return_ = 0 while True: # print("steps ", steps) if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) if MPC: mpc_ac = mpc_controller.get_action(ob) else: mpc_ac = random_controller.get_action(ob) ac = policy_nn.predict(ob, mpc_ac) ac = ac[0] if not PG: ac = mpc_ac acs.append(ac) mpc_acs.append(mpc_ac) current_path['observations'].append(ob) ob, rew, done, _ = env.step(ac) current_path['reward'].append(rew) current_path['actions'].append(ac) current_path['next_observations'].append(ob) return_ += rew rewards.append(rew) steps += 1 if done or steps > max_path_length: break if MPC: # cost & return cost = path_cost(cost_fn, current_path) costs.append(cost) returns.append(return_) print("total return: ", return_) print("costs: ", cost) # add into buffers for n in range(len(current_path['observations'])): data_buffer_model.add(current_path['observations'][n], current_path['actions'][n], current_path['next_observations'][n]) for n in range(len(current_path['observations'])): data_buffer_ppo.add(current_path['observations'][n], current_path['actions'][n], current_path['reward'][n], current_path['next_observations'][n]) path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs), "mpc_action" : np.array(mpc_acs)} paths.append(path) timesteps_this_batch += 
pathlength(path) # print("timesteps_this_batch", timesteps_this_batch) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch print("data_buffer_ppo.size:", data_buffer_ppo.size) # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths]) # Computing Q-values if reward_to_go: q_n = [] for path in paths: for t in range(len(path["reward"])): t_ = 0 q = 0 while t_ < len(path["reward"]): if t_ >= t: q += gamma**(t_-t) * path["reward"][t_] t_ += 1 q_n.append(q) q_n = np.asarray(q_n) else: q_n = [] for path in paths: for t in range(len(path["reward"])): t_ = 0 q = 0 while t_ < len(path["reward"]): q += gamma**t_ * path["reward"][t_] t_ += 1 q_n.append(q) q_n = np.asarray(q_n) # Computing Baselines if nn_baseline: # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no :ob_no}) b_n = value_nn.predict(ob_no) b_n = normalize(b_n) b_n = denormalize(b_n, np.std(q_n), np.mean(q_n)) adv_n = q_n - b_n else: adv_n = q_n.copy() # Advantage Normalization if normalize_advantages: adv_n = normalize(adv_n) # Optimizing Neural Network Baseline if nn_baseline: b_n_target = normalize(q_n) value_nn.fit(ob_no, b_n_target) # sess.run(baseline_update_op, feed_dict={sy_ob_no :ob_no, sy_baseline_target_n:b_n_target}) # Performing the Policy Update # policy_nn.fit(ob_no, ac_na, adv_n) policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na) # sess.run(update_op, feed_dict={sy_ob_no :ob_no, sy_ac_na:ac_na, sy_adv_n:adv_n}) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
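# The Q-value computation above walks each reward list with nested loops, which is
# O(T^2) per path. An equivalent O(T) reverse-cumulative sketch for both the
# reward-to-go and the full-trajectory cases:
import numpy as np

def rewards_to_go(rewards, gamma):
    q = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q

def full_trajectory_returns(rewards, gamma):
    # Every timestep gets the same total discounted return of the trajectory.
    return np.full(len(rewards), rewards_to_go(rewards, gamma)[0])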
def train(sess, env, args, actor, critic, actor_noise, logdir): logz.configure_output_dir(logdir) locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} print('params: ', params) params['env'] = 'InvertedPendulum' params['exp_name'] = '3layer' logz.save_params(params) # Set up summary Ops summary_ops, summary_vars = build_summaries() checkpoint_actor_dir = os.path.join(os.curdir, 'Actor_InvertedPendulum') if not os.path.exists(checkpoint_actor_dir): os.makedirs(checkpoint_actor_dir) actor_prefix = os.path.join(checkpoint_actor_dir, "model.ckpt") ckpt_1 = tf.train.get_checkpoint_state(checkpoint_actor_dir) checkpoint_critic_dir = os.path.join(os.curdir, 'Critic_InvertedPendulum') if not os.path.exists(checkpoint_critic_dir): os.makedirs(checkpoint_critic_dir) critic_prefix = os.path.join(checkpoint_critic_dir, "model.ckpt") ckpt_2 = tf.train.get_checkpoint_state(checkpoint_critic_dir) if ckpt_1 and tf.train.checkpoint_exists(ckpt_1.model_checkpoint_path): print("Reading actor parameters from %s" % ckpt_1.model_checkpoint_path) actor.saver.restore(sess, ckpt_1.model_checkpoint_path) if ckpt_2 and tf.train.checkpoint_exists(ckpt_2.model_checkpoint_path): print("Reading critic parameters from %s" % ckpt_2.model_checkpoint_path) critic.saver.restore(sess, ckpt_2.model_checkpoint_path) uninitialized_vars = [] for var in tf.all_variables(): try: sess.run(var) except tf.errors.FailedPreconditionError: uninitialized_vars.append(var) if len(uninitialized_vars) > 0: init_new_vars_op = tf.variables_initializer(uninitialized_vars) sess.run(init_new_vars_op) writer = tf.summary.FileWriter(args['summary_dir'], sess.graph) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed'])) # Needed to enable BatchNorm. # This hurts the performance on Pendulum but could be useful # in other environments. # tflearn.is_training(True) def testing(): env1 = gym.make(args['env']) s = env1.reset() done = False total_reward = 0 max_steps = env1.spec.timestep_limit step = 0 while not done: a = actor.predict(np.reshape(s, (1, actor.s_dim))) s2, r, done, _ = env1.step(a[0]) total_reward += r step += 1 s = s2 # env.render() if step > max_steps: break print('total steps: ', step) print('total reward: ', total_reward) return step, total_reward iter = 0 start = time.time() best_step, best_rew = testing() for i in range(int(args['max_episodes'])): s = env.reset() ep_reward = 0 ep_ave_max_q = 0 for j in range(int(args['max_episode_len'])): if args['render_env']: env.render() # Added exploration noise # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. 
+ i)) num = np.random.uniform() a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise() s2, r, terminal, info = env.step(a[0]) replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, terminal, np.reshape(s2, (actor.s_dim,))) # Keep adding experience to the memory until # there are at least minibatch size samples batch_size = int(args['minibatch_size']) if replay_buffer.size() > 100000: iter += 1 s_batch, a_batch, r_batch, t_batch, s2_batch = \ replay_buffer.sample_batch(batch_size) # Calculate targets target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in range(int(args['minibatch_size'])): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + critic.gamma * target_q[k]) # Update the critic given the targets # critic will be trained to minimise the mean square error of the predicted Q value # and the target value. predicted_q_value, _ = critic.train( s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1))) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient # gradients of the critic Q value according to the action valu --> action gradients a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) # del_a Q(s,a) actor.train(s_batch, grads[0]) # del_a Q(s,a) * del_theta Mu_theta(s) ---> actor gradients # directly apply these gradients on actor params. No special loss to minimize if iter%20 == 0: new_steps, new_rew = testing() if new_rew > best_rew: best_rew = new_rew actor.saver.save(sess, actor_prefix) critic.saver.save(sess, critic_prefix) print('model saved to disk.') actor.saver.restore(sess, ckpt_1.model_checkpoint_path) critic.saver.restore(sess, ckpt_2.model_checkpoint_path) best_step, best_rew = testing() # print('actor model saved to: ', actor_prefix) # print('critic model saved to: ', critic_prefix) if iter%10 == 0: new_steps, new_rew = testing() logz.log_tabular("Time", time.time() - start) logz.log_tabular('Iteration', iter/10) logz.log_tabular('Reward', new_rew) logz.log_tabular('Steps', new_steps) logz.dump_tabular() # Update target networks if iter%50 == 0: replay_buffer.update() print('updating buffer') print('updating target networks..') actor.update_target_network() critic.update_target_network() s = s2 ep_reward += r if terminal: summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: ep_reward, summary_vars[1]: ep_ave_max_q / float(j) }) writer.add_summary(summary_str, i) writer.flush() print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \ i, (ep_ave_max_q / float(j)))) break
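# The y_i loop above builds the critic targets one sample at a time. A vectorized sketch
# of the same computation: bootstrap through the target networks everywhere except at
# terminal transitions.
import numpy as np

def ddpg_critic_targets(r_batch, t_batch, target_q, gamma):
    target_q = np.asarray(target_q).reshape(-1)
    done = np.asarray(t_batch, dtype=np.float32)
    return np.asarray(r_batch) + gamma * (1.0 - done) * target_q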
def train_PG(exp_name, env_name, n_iter, \ gamma, min_timesteps_per_batch, max_path_length, learning_rate, \ reward_to_go, animate, logdir, normalize_advantages, nn_baseline, \ seed, n_layers, size): start = time.time() setup_logger(logdir, locals()) ## Set up Logger env = gym.make(env_name) tf.set_random_seed(seed) env.seed(seed) max_path_length = max_path_length or env.spec.max_episode_steps discrete = isinstance(env.action_space, gym.spaces.Discrete) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.n if discrete else env.action_space.shape[0] ## Initialize Agent computation_graph_args = {'n_layers': n_layers, 'obs_dim': obs_dim, 'act_dim': act_dim, \ 'discrete': discrete, 'size': size, 'learning_rate': learning_rate} sample_trajectory_args = {'animate': animate, 'max_path_length': max_path_length, \ 'min_timesteps_per_batch': min_timesteps_per_batch} estimate_return_args = {'gamma': gamma, 'reward_to_go': reward_to_go, \ 'nn_baseline': nn_baseline, 'normalize_advantages': normalize_advantages} agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args) agent.build_computation_graph() agent.init_tf_sess() ## Training Loop total_time_steps = 0 for itr in range(n_iter): print("********* Iteration %i *********" % itr) paths, timesteps_this_batch = agent.sample_trajectories(itr, env) total_time_steps += timesteps_this_batch obs_no = np.concatenate([path['observation'] for path in paths]) act_na = np.concatenate([path['action'] for path in paths]) ret_n = [path['reward'] for path in paths] q_n, adv_n = agent.estimate_return(obs_no, ret_n) agent.update_parameters(obs_no, act_na, q_n, adv_n) # Log diagnostics returns = [path['reward'].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_time_steps) logz.dump_tabular() logz.pickle_tf_vars()
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(shape=[None], name='adv', dtype=tf.float32) #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. 
# # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: # YOUR_CODE_HERE sy_logits_na = build_mlp(sy_ob_no, ac_dim, 'network', n_layers=n_layers) # Hint: Use the tf.multinomial sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), [-1]) sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=sy_ac_na, logits=sy_logits_na) else: # YOUR_CODE_HERE sy_mean = build_mlp(sy_ob_no, ac_dim, 'network', n_layers=n_layers) # logstd should just be a trainable variable, not a network output. sy_logstd = tf.Variable(tf.zeros([1, ac_dim]), name='sy_logstd', dtype=tf.float32) sy_std = tf.exp(sy_logstd) sy_sampled_ac = tf.random_normal(tf.shape(sy_mean), mean=sy_mean, stddev=sy_std) sy_z = (sy_ac_na - sy_mean) / sy_std sy_logprob_n = 0.5 * tf.reduce_sum(tf.square(sy_z), axis=1) #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# # Loss function that we'll differentiate to get the policy gradient. loss = tf.reduce_mean(sy_logprob_n * sy_adv_n) update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. 
# YOUR_CODE_HERE baseline_target = tf.placeholder(shape=[None], name='baseline_target', dtype=tf.float32) baseline_loss = tf.reduce_sum( tf.losses.mean_squared_error(labels=baseline_target, predictions=baseline_prediction)) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize( baseline_loss) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) ac = ac[0] acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = { "observation": np.array(obs), "reward": np.array(rewards), "action": np.array(acs) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. 
# #====================================================================================# # YOUR_CODE_HERE discounted_rewards = [] for path in paths: r = 0 path_rewards = [0.0] * pathlength(path) for t in reversed(range(pathlength(path))): r = path['reward'][t] + gamma * r path_rewards[t] = r if reward_to_go: discounted_rewards.append(path_rewards) else: discounted_rewards.append([path_rewards[0]] * pathlength(path)) q_n = np.concatenate(discounted_rewards) #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}) b_n = (b_n - b_n.mean(axis=0)) / (b_n.std(axis=0) + 1e-8) q_mean = q_n.mean(axis=0) q_std = q_n.std(axis=0) b_n = q_mean + q_std * b_n adv_n = q_n - b_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. # YOUR_CODE_HERE adv_n = (adv_n - adv_n.mean(axis=0)) / (adv_n.std(axis=0) + 1e-8) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE q_n = (q_n - q_n.mean(axis=0)) / (q_n.std(axis=0) + 1e-8) sess.run([baseline_update_op], feed_dict={ sy_ob_no: ob_no, baseline_target: q_n }) pass #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. 
# YOUR_CODE_HERE sess.run(update_op, feed_dict={ sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n }) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
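# The two Q-value cases described in the comment block above can be collected into one small
# standalone helper. This is a minimal sketch, independent of the homework scaffolding; the
# name `compute_q_values` and the rewards-per-path input format are illustrative assumptions.
import numpy as np

def compute_q_values(rewards_by_path, gamma, reward_to_go):
    """Return one concatenated array of Q_t estimates for all paths.

    reward_to_go=False: every timestep of a path gets the full discounted return Ret(tau).
    reward_to_go=True:  Q_t = sum_{t'>=t} gamma^(t'-t) * r_{t'}, computed by a backward scan.
    """
    q_paths = []
    for rewards in rewards_by_path:
        q = np.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            q[t] = running
        if not reward_to_go:
            q[:] = q[0]  # broadcast the full-trajectory return to every timestep
        q_paths.append(q)
    return np.concatenate(q_paths)

# Example: gamma=0.5, rewards [1, 1, 1] -> reward-to-go [1.75, 1.5, 1.0]; full return repeats 1.75.
print(compute_q_values([np.array([1.0, 1.0, 1.0])], 0.5, True))
print(compute_q_values([np.array([1.0, 1.0, 1.0])], 0.5, False))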
def train(self, num_iter): start = time.time() for i in range(num_iter): t1 = time.time() self.train_step() t2 = time.time() print("total time of one step", t2 - t1) print("iter ", i, " done") # record statistics every 10 iterations if (i + 1) % 10 == 0: rewards = self.aggregate_rollouts(num_rollouts=100, evaluate=True) w = ray.get(self.workers[0].get_weights_plus_stats.remote()) np.savez(self.logdir + "/lin_policy_plus", w) print(sorted(self.params.items())) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", i + 1) logz.log_tabular("AverageReward", np.mean(rewards)) logz.log_tabular("StdRewards", np.std(rewards)) logz.log_tabular("MaxRewardRollout", np.max(rewards)) logz.log_tabular("MinRewardRollout", np.min(rewards)) logz.log_tabular("timesteps", self.timesteps) logz.dump_tabular() t1 = time.time() # get statistics from all workers for j in range(self.num_workers): self.policy.observation_filter.update( ray.get(self.workers[j].get_filter.remote())) self.policy.observation_filter.stats_increment() # make sure master filter buffer is clear self.policy.observation_filter.clear_buffer() # sync all workers filter_id = ray.put(self.policy.observation_filter) setting_filters_ids = [ worker.sync_filter.remote(filter_id) for worker in self.workers ] # waiting for sync of all workers ray.get(setting_filters_ids) increment_filters_ids = [ worker.stats_increment.remote() for worker in self.workers ] # waiting for increment of all workers ray.get(increment_filters_ids) t2 = time.time() print("Time to sync statistics:", t2 - t1) return
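# The synchronization loop above assumes an observation filter object exposing `update`,
# `stats_increment`, and `clear_buffer`. Below is a minimal sketch of such a running mean/std
# filter; it illustrates the interface only and is not the project's actual filter (which
# likely merges summary statistics rather than raw observations, as ARS-style code does).
import numpy as np

class MeanStdFilter:
    """Running observation normalizer with the interface used by the sync loop above."""

    def __init__(self, shape):
        self.count = 0
        self.mean = np.zeros(shape)
        self.sq_dev_sum = np.zeros(shape)  # running sum of squared deviations (committed)
        self.buffer = []                   # observations seen since the last commit

    def __call__(self, x):
        # Normalize with the committed statistics and remember x for the next commit.
        self.buffer.append(np.asarray(x, dtype=np.float64))
        if self.count == 0:
            return x
        std = np.sqrt(self.sq_dev_sum / self.count) + 1e-8
        return (x - self.mean) / std

    def update(self, other):
        # Pull another worker's buffered observations into this (master) filter's buffer.
        self.buffer.extend(other.buffer)

    def stats_increment(self):
        # Fold the buffered observations into the committed mean/variance (Welford updates).
        for x in self.buffer:
            self.count += 1
            delta = x - self.mean
            self.mean += delta / self.count
            self.sq_dev_sum += delta * (x - self.mean)

    def clear_buffer(self):
        self.buffer = []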
def train_AC( exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length, learning_rate, num_target_updates, num_grad_steps_per_target_update, animate, logdir, normalize_advantages, seed, n_layers, size, ######################################################################## # Exploration args bonus_coeff, kl_weight, density_lr, density_train_iters, density_batch_size, density_hiddim, dm, replay_size, sigma, ######################################################################## ): start = time.time() #========================================================================================# # Set Up Logger #========================================================================================# setup_logger(logdir, locals()) #========================================================================================# # Set Up Env #========================================================================================# # Make the gym environment ######################################################################## # Exploration if env_name == 'PointMass-v0': from pointmass import PointMass env = PointMass() else: env = gym.make(env_name) dirname = logz.G.output_dir ######################################################################## # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) env.seed(seed) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # Is this env continuous or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # Initialize Agent #========================================================================================# computation_graph_args = { 'n_layers': n_layers, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'discrete': discrete, 'size': size, 'learning_rate': learning_rate, 'num_target_updates': num_target_updates, 'num_grad_steps_per_target_update': num_grad_steps_per_target_update, } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, } estimate_advantage_args = { 'gamma': gamma, 'normalize_advantages': normalize_advantages, } agent = Agent(computation_graph_args, sample_trajectory_args, estimate_advantage_args) #estimate_return_args # build computation graph agent.build_computation_graph() ######################################################################## # Initalize exploration density model if dm != 'none': if env_name == 'PointMass-v0' and dm == 'hist': density_model = Histogram( nbins=env.grid_size, preprocessor=env.preprocess) exploration = DiscreteExploration( density_model=density_model, bonus_coeff=bonus_coeff) elif dm == 'rbf': density_model = RBF(sigma=sigma) exploration = RBFExploration( density_model=density_model, bonus_coeff=bonus_coeff, replay_size=int(replay_size)) elif dm == 'ex2': density_model = Exemplar( ob_dim=ob_dim, hid_dim=density_hiddim, learning_rate=density_lr, kl_weight=kl_weight) exploration = ExemplarExploration( density_model=density_model, bonus_coeff=bonus_coeff, train_iters=density_train_iters, bsize=density_batch_size, replay_size=int(replay_size)) exploration.density_model.build_computation_graph() else: raise NotImplementedError ######################################################################## # tensorflow: config, 
session, variable initialization agent.init_tf_sess() ######################################################################## if dm != 'none': exploration.receive_tf_sess(agent.sess) ######################################################################## #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) paths, timesteps_this_batch = agent.sample_trajectories(itr, env) total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) re_n = np.concatenate([path["reward"] for path in paths]) next_ob_no = np.concatenate([path["next_observation"] for path in paths]) terminal_n = np.concatenate([path["terminal"] for path in paths]) ######################################################################## # Modify the reward to include exploration bonus """ 1. Fit density model if dm == 'ex2': the call to exploration.fit_density_model should return ll, kl, elbo else: the call to exploration.fit_density_model should return nothing 2. Modify the re_n with the reward bonus by calling exploration.modify_reward """ old_re_n = re_n if dm == 'none': pass else: # 1. Fit density model if dm == 'ex2': ### PROBLEM 3 ### YOUR CODE HERE ll, kl, elbo = exploration.fit_density_model(ob_no) elif dm == 'hist' or dm == 'rbf': ### PROBLEM 1 ### YOUR CODE HERE exploration.fit_density_model(ob_no) else: assert False # 2. Modify the reward ### PROBLEM 1 ### YOUR CODE HERE # raise NotImplementedError re_n = exploration.modify_reward(old_re_n,ob_no) print('average state', np.mean(ob_no, axis=0)) print('average action', np.mean(ac_na, axis=0)) # Logging stuff. # Only works for point mass. 
if env_name == 'PointMass-v0': np.save(os.path.join(dirname, '{}'.format(itr)), ob_no) ######################################################################## agent.update_critic(ob_no, next_ob_no, re_n, terminal_n) adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) agent.update_actor(ob_no, ac_na, adv_n) if n_iter - itr < 10: max_reward_path_idx = np.argmax(np.array([path["reward"].sum() for path in paths])) print(paths[max_reward_path_idx]['reward']) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) ######################################################################## logz.log_tabular("Unmodified Rewards Mean", np.mean(old_re_n)) logz.log_tabular("Unmodified Rewards Std", np.std(old_re_n)) logz.log_tabular("Modified Rewards Mean", np.mean(re_n)) logz.log_tabular("Modified Rewards Std", np.std(re_n)) if dm == 'ex2': logz.log_tabular("Log Likelihood Mean", np.mean(ll)) logz.log_tabular("Log Likelihood Std", np.std(ll)) logz.log_tabular("KL Divergence Mean", np.mean(kl)) logz.log_tabular("KL Divergence Std", np.std(kl)) logz.log_tabular("Negative ELBO", -elbo) ######################################################################## logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
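# The fit_density_model / modify_reward pattern above can be illustrated with a toy count-based
# model. This is a sketch only: it assumes the bonus takes the familiar form
# bonus_coeff * (-log p(s)); the assignment's actual Histogram, RBF, and EX2 models estimate
# p(s) differently.
import numpy as np

class ToyCountExploration:
    """Count-based exploration bonus over a discretized state space."""

    def __init__(self, nbins, preprocessor, bonus_coeff):
        self.counts = np.ones(nbins)      # start at 1 so log(p) is always defined
        self.preprocessor = preprocessor  # maps a raw state to a bin index
        self.bonus_coeff = bonus_coeff

    def fit_density_model(self, states):
        # "Fitting" a histogram is just incrementing visit counts.
        for s in states:
            self.counts[self.preprocessor(s)] += 1

    def modify_reward(self, rewards, states):
        p = np.array([self.counts[self.preprocessor(s)] for s in states]) / self.counts.sum()
        return rewards + self.bonus_coeff * (-np.log(p))  # rarely visited states get a larger bonus

# Example on a 10-cell grid: the unvisited cell 7 receives the larger bonus.
explore = ToyCountExploration(10, preprocessor=lambda s: int(s), bonus_coeff=0.1)
explore.fit_density_model([0, 0, 0, 3])
print(explore.modify_reward(np.zeros(2), [0, 7]))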
def train(self, num_iter, state_filter=False): start = time.time() for i in range(num_iter): t1 = time.time() rewards = self.train_step(state_filter=state_filter) t2 = time.time() print('total time of one step', t2 - t1) print('Iteration', i + 1, 'done') print('AverageReward:', np.mean(rewards)) print('StdRewards:', np.std(rewards)) print('MaxRewardRollout:', np.max(rewards)) print('MinRewardRollout:', np.min(rewards)) # record weights and stats every n iterations if ((i + 1) % self.log_every == 0): rewards = self.aggregate_rollouts( num_rollouts=self.eval_rollouts, evaluate=True) #w = ray.get(self.workers[0].get_weights.remote()) if state_filter: w = ray.get( self.workers[0].get_weights_plus_stats.remote()) else: w = ray.get(self.workers[0].get_weights.remote()) np.savez(self.logdir + "/lin_policy_plus", w) #print(sorted(self.params.items())) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", i + 1) logz.log_tabular("AverageReward", np.mean(rewards)) logz.log_tabular("StdRewards", np.std(rewards)) logz.log_tabular("MaxRewardRollout", np.max(rewards)) logz.log_tabular("MinRewardRollout", np.min(rewards)) logz.log_tabular("Timesteps", self.timesteps) logz.log_tabular("LearningRate", self.optimizer.learning_rate) logz.log_tabular("DeltaStd", self.delta_std) logz.dump_tabular() if state_filter: t1 = time.time() # get statistics from all workers for j in range(self.num_workers): self.policy.observation_filter.update( ray.get(self.workers[j].get_filter.remote())) self.policy.observation_filter.stats_increment() # make sure master filter buffer is clear self.policy.observation_filter.clear_buffer() # sync all workers filter_id = ray.put(self.policy.observation_filter) setting_filters_ids = [ worker.sync_filter.remote(filter_id) for worker in self.workers ] # waiting for sync of all workers ray.get(setting_filters_ids) increment_filters_ids = [ worker.stats_increment.remote() for worker in self.workers ] # waiting for increment of all workers ray.get(increment_filters_ids) t2 = time.time() print('Time to sync statistics:', t2 - t1) return
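# The master loops in both train() methods above expect each Ray worker to expose get_filter,
# sync_filter, stats_increment, and get_weights / get_weights_plus_stats as remote methods.
# The actor below is a skeletal illustration of that interface only; the class name,
# constructor arguments, and method bodies are assumptions, not the project's actual worker.
import ray

@ray.remote
class RolloutWorker:
    def __init__(self, policy_weights, obs_filter):
        self.weights = policy_weights
        self.filter = obs_filter  # e.g. a MeanStdFilter-style object as sketched earlier

    def get_filter(self):
        return self.filter

    def sync_filter(self, new_filter):
        # Adopt the master's merged filter (its buffer has already been cleared).
        self.filter = new_filter
        return True

    def stats_increment(self):
        self.filter.stats_increment()
        return True

    def get_weights(self):
        return self.weights

    def get_weights_plus_stats(self):
        return self.weights, self.filter.mean

# Usage mirrors the loops above:
#   filter_id = ray.put(master_filter)
#   ray.get([w.sync_filter.remote(filter_id) for w in workers])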
def main_cartpole(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=False, logfile=None): env = gym.make("CartPole-v0") ob_dim = env.observation_space.shape[0] num_actions = env.action_space.n logz.configure_output_file(logfile) #vf = LinearValueFunction() vf = NeuralValueFunction(ob_dim) # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in these function sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) # batch of actions taken by the policy, used for policy gradient computation sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate sy_h1 = tf.nn.relu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer sy_logits_na = dense(sy_h1, num_actions, "final", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer # we use a small initialization for the last layer, so the initial policy has maximal entropy sy_oldlogits_na = tf.placeholder(shape=[None, num_actions], name='oldlogits', dtype=tf.float32) # logits BEFORE update (just used for KL diagnostic) sy_logp_na = tf.nn.log_softmax(sy_logits_na) # logprobability of actions sy_sampled_ac = categorical_sample_logits(sy_logits_na)[0] # sampled actions, used for defining the policy (NOT computing the policy gradient) sy_n = tf.shape(sy_ob_no)[0] sy_logprob_n = fancy_slice_2d(sy_logp_na, tf.range(sy_n), sy_ac_n) # log-prob of actions taken -- used for policy gradient calculation # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na) sy_oldp_na = tf.exp(sy_oldlogp_na) sy_kl = tf.reduce_sum(sy_oldp_na * (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n) sy_p_na = tf.exp(sy_logp_na) sy_ent = tf.reduce_sum( - sy_p_na * sy_logp_na) / tf.to_float(sy_n) # <<<<<<<<<<<<< sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. 
(We're not doing that currently) update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) sess = tf.Session() sess.__enter__() sess.run(tf.global_variables_initializer()) total_timesteps = 0 obs_mean = np.zeros(ob_dim) obs_std = np.zeros(ob_dim) for i in range(n_iter): print("********** Iteration %i ************"%i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() terminated = False obs, acs, rewards = [], [], [] animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate) while True: if animate_this_episode: env.render() obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) if done: break path = {"observation" : np.array(obs), "terminated" : terminated, "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Estimate advantage function vtargs, vpreds, advs = [], [], [] for path in paths: rew_t = path["reward"] return_t = discount(rew_t, gamma) vpred_t = vf.predict((path["observation"]-obs_mean)/(obs_std+1e-8)) adv_t = return_t - vpred_t advs.append(adv_t) vtargs.append(return_t) vpreds.append(vpred_t) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) ac_n = np.concatenate([path["action"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n-adv_n.mean())/(adv_n.std()+1e-8) vtarg_n = np.concatenate(vtargs) vpred_n = np.concatenate(vpreds) obs_mean = np.average(ob_no,axis=0) obs_std = np.std(ob_no,axis=0) vf.fit((ob_no-obs_mean)/(obs_std+1e-8), vtarg_n) # Policy update _, oldlogits_na = sess.run([update_op, sy_logits_na], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n, sy_adv_n:standardized_adv_n, sy_stepsize:stepsize}) kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_oldlogits_na:oldlogits_na}) # Log diagnostics logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logz.log_tabular("KLOldNew", kl) logz.log_tabular("Entropy", ent) logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) logz.log_tabular("TimestepsSoFar", total_timesteps) # If you're overfitting, EVAfter will be way larger than EVBefore. # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias logz.dump_tabular()
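# The diagnostics logged above (KLOldNew, Entropy, EVBefore/EVAfter) have simple numpy reference
# implementations, sketched here for clarity. These are standalone checks that assume
# oldlogits_na / newlogits_na are the pre- and post-update logits for the batch; they are not
# the assignment's own utility functions.
import numpy as np

def log_softmax(logits):
    z = logits - logits.max(axis=1, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=1, keepdims=True))

def categorical_kl_and_entropy(oldlogits_na, newlogits_na):
    """Mean KL(old || new) over the batch, and mean entropy of the updated policy."""
    old_logp, new_logp = log_softmax(oldlogits_na), log_softmax(newlogits_na)
    kl = (np.exp(old_logp) * (old_logp - new_logp)).sum(axis=1).mean()
    ent = (-np.exp(new_logp) * new_logp).sum(axis=1).mean()
    return kl, ent

def explained_variance_1d(ypred, y):
    """1 - Var[y - ypred] / Var[y]; close to 1 when the value function predicts returns well."""
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary

# Identical logits give zero KL; a uniform two-action policy has entropy log(2) ~ 0.693.
print(categorical_kl_and_entropy(np.zeros((4, 2)), np.zeros((4, 2))))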
def run_experiment(exp_params, learner_params, discriminator_params): # Experiment parameters file_location = exp_params.get('expert_samples_location', 'expert_data') prior_file_location = exp_params.get('prior_samples_location', 'prior_data') env_name = exp_params.get('env_name', 'InvertedPendulum-v2') env_type = exp_params.get('env_type', 'expert') exp_name = exp_params.get('exp_name', '{}_{}'.format(env_name, env_type)) exp_num = exp_params.get('exp_num', 0) epochs = exp_params.get('epochs', 100) test_runs_per_epoch = exp_params.get('test_runs_per_epoch', 10) steps_per_epoch = exp_params.get('steps_per_epoch', 1000) init_random_samples = exp_params.get('init_random_samples', 5000) training_starts = exp_params.get('training_starts', 0) episode_limit = exp_params.get('episode_limit', 200) return_threshold = exp_params.get('return_threshold', 1e4) return_agent_buffer = exp_params.get('return_agent_buffer', False) visualize_collected_observations = exp_params.get( 'visualize_collected_observations', False) save_weights_checkpoints = exp_params.get('save_weights_checkpoints', False) # Learner parameters l_type = learner_params.get('l_type', 'TD3') l_buffer_size = learner_params.get('l_buffer_size', 10000) l_exploration_noise = learner_params.get('l_exploration_noise', 0.2) l_learning_rate = learner_params.get('l_learning_rate', 1e-3) l_batch_size = learner_params.get('l_batch_size', 128) l_updates_per_step = learner_params.get('l_updates_per_step', 1) l_act_delay = learner_params.get('l_act_delay', 2) l_gamma = learner_params.get('l_gamma', 0.99) l_polyak = learner_params.get('l_polyak', 0.995) l_train_actor_noise = learner_params.get('l_train_actor_noise', 0.1) l_entropy_coefficient = learner_params.get('l_entropy_coefficient', 0.2) l_tune_entropy_coefficient = learner_params.get( 'l_tune_entropy_coefficient', True) l_target_entropy = learner_params.get('l_target_entropy', None) l_clip_actor_gradients = learner_params.get('l_clip_actor_gradients', False) # Discriminator parameters d_type = discriminator_params.get('d_type', 'latent') d_loss = discriminator_params.get('d_loss', 'ce') d_rew = discriminator_params.get('d_rew', 'mixed') d_rew_noise = discriminator_params.get('d_rew_noise', True) d_learning_rate = discriminator_params.get('d_learning_rate', 1e-3) d_mi_learning_rate = discriminator_params.get('d_mi_learning_rate', 1e-3) d_updates_per_step = discriminator_params.get('d_updates_per_step', 1) d_mi_updates_per_step = discriminator_params.get('d_mi_updates_per_step', 1) d_e_batch_size = discriminator_params.get('d_e_batch_size', 64) d_l_batch_size = discriminator_params.get('d_l_batch_size', 64) d_label_smoothing = discriminator_params.get('d_label_smoothing', 0.0) d_stability_constant = discriminator_params.get('d_stability_constant', 0.0) d_sn_discriminator = discriminator_params.get('d_sn_discriminator', False) d_mi_constant = discriminator_params.get('d_mi_constant', 0.0) d_adaptive_mi = discriminator_params.get('d_adaptive_mi', False) d_double_mi = discriminator_params.get('d_double_mi', False) d_use_min_double_mi = discriminator_params.get('d_use_min_double_mi', False) d_max_mi = discriminator_params.get('d_max_mi', 1) d_min_mi = discriminator_params.get('d_min_mi', d_max_mi / 2) d_use_dual_mi = discriminator_params.get('d_use_dual_mi', False) d_mi_lagrangian_lr = discriminator_params.get('d_mi_lagrangian_lr', 1e-3) d_max_mi_constant = discriminator_params.get('d_max_mi_constant', 10) d_min_mi_constant = discriminator_params.get('d_min_mi_constant', 1e-4) d_unbiased_mi = 
discriminator_params.get('d_unbiased_mi', False) d_unbiased_mi_decay = discriminator_params.get('d_unbiased_mi_decay', 0.99) d_prior_mi_constant = discriminator_params.get('d_prior_mi_constant', 0.0) d_negative_priors = discriminator_params.get('d_negative_priors', False) d_max_mi_prior = discriminator_params.get('d_max_mi_prior', 0.05) d_min_mi_prior_constant = discriminator_params.get( 'd_min_mi_prior_constant', 1e-4) d_clip_mi_predictions = discriminator_params.get('d_clip_mi_predictions', False) d_pre_filters = discriminator_params.get('d_pre_filters', [32, 32, 1]) d_hidden_units = discriminator_params.get('d_hidden_units', [32]) d_mi_hidden_units = discriminator_params.get('d_mi_hidden_units', [32, 32]) d_mi2_hidden_units = discriminator_params.get('d_mi2_hidden_units', d_mi_hidden_units) d_pre_scale_stddev = discriminator_params.get('d_pre_scale_stddev', 1.0) n_expert_demos = discriminator_params.get('n_expert_demos', None) n_expert_prior_demos = discriminator_params.get('n_expert_prior_demos', None) n_agent_prior_demos = discriminator_params.get('n_agent_prior_demos', n_expert_prior_demos) if env_name == 'InvertedPendulum-v2': im_side = 32 im_shape = [im_side, im_side] expert_prior_location = 'Expert' + env_name if env_type == 'expert': env = ExpertInvertedPendulumEnv() agent_prior_location = 'Expert' + env_name elif env_type == 'agent' or env_type == 'colored' or env_type == 'to_colored': env = AgentInvertedPendulumEnv() agent_prior_location = 'Agent' + env_name elif env_type == 'to_two': env = ExpertInvertedDoublePendulumEnv() agent_prior_location = 'ExpertInvertedDoublePendulum-v2' elif env_type == 'to_colored_two': env = AgentInvertedDoublePendulumEnv() agent_prior_location = 'AgentInvertedDoublePendulum-v2' else: raise NotImplementedError elif env_name == 'InvertedDoublePendulum-v2': im_side = 32 im_shape = [im_side, im_side] expert_prior_location = 'ExpertInvertedDoublePendulum-v2' if env_type == 'expert': agent_prior_location = 'ExpertInvertedDoublePendulum-v2' env = ExpertInvertedDoublePendulumEnv() elif env_type == 'colored' or env_type == 'to_colored': env = AgentInvertedDoublePendulumEnv() agent_prior_location = 'AgentInvertedDoublePendulum-v2' elif env_type == 'to_one': agent_prior_location = 'ExpertInvertedPendulum-v2' env = ExpertInvertedPendulumEnv() elif env_type == 'agent' or env_type == 'to_colored_one': agent_prior_location = 'AgentInvertedPendulum-v2' env = AgentInvertedPendulumEnv() else: raise NotImplementedError elif env_name == 'ThreeReacherEasy-v2': im_side = 48 im_shape = [im_side, im_side] expert_prior_location = 'Expert' + env_name if env_type == 'expert': env = ThreeReacherEasyEnv() agent_prior_location = 'Expert' + env_name elif env_type == 'agent' or env_type == 'to_two': agent_prior_location = 'ExpertReacherEasy-v2' env = ReacherEasyEnv() elif env_type == 'tilted' or env_type == 'to_tilted': agent_prior_location = 'AgentThreeReacherEasy-v2' env = Tilted3ReacherEasyEnv() elif env_type == 'to_tilted_two': env = TiltedReacherEasyEnv() agent_prior_location = 'AgentReacherEasy-v2' else: raise NotImplementedError elif env_name == 'ReacherEasy-v2': im_side = 48 im_shape = [im_side, im_side] expert_prior_location = 'ExpertReacherEasy-v2' if env_type == 'expert': env = ReacherEasyEnv() agent_prior_location = 'ExpertReacherEasy-v2' elif env_type == 'agent' or env_type == 'tilted' or env_type == 'to_tilted': env = TiltedReacherEasyEnv() agent_prior_location = 'AgentReacherEasy-v2' elif env_type == 'to_three': env = ThreeReacherEasyEnv() agent_prior_location = 
'ExpertThreeReacherEasy-v2' elif env_type == 'to_tilted_three': agent_prior_location = 'AgentThreeReacherEasy-v2' env = Tilted3ReacherEasyEnv() else: raise NotImplementedError elif env_name == 'Hopper-v2': im_side = 64 im_shape = [im_side, im_side] expert_prior_location = 'Hopper-v2' if env_type == 'expert': env = HopperEnv() agent_prior_location = 'Hopper-v2' elif env_type == 'flexible': env = HopperFlexibleEnv() agent_prior_location = 'HopperFlexible-v2' else: raise NotImplementedError elif env_name == 'HalfCheetah-v2': im_side = 64 im_shape = [im_side, im_side] expert_prior_location = 'HalfCheetah-v2' if env_type == 'expert': env = ExpertHalfCheetahEnv() agent_prior_location = 'HalfCheetah-v2' elif env_type == 'locked_legs': env = LockedLegsHalfCheetahEnv() agent_prior_location = 'LockedLegsHalfCheetah-v2' else: raise NotImplementedError elif env_name == 'Striker-v2': im_side = 48 im_shape = [im_side, im_side] expert_prior_location = 'Striker-v2' if env_type == 'expert': env = StrikerEnv() agent_prior_location = 'Striker-v2' elif env_type == 'to_human': env = StrikerHumanSimEnv() agent_prior_location = 'StrikerHuman-v2' else: raise NotImplementedError elif env_name == 'StrikerHumanSim-v2': im_side = 48 im_shape = [im_side, im_side] expert_prior_location = 'StrikerHumanSim-v2' if env_type == 'expert': env = StrikerHumanSimEnv() agent_prior_location = 'StrikerHumanSim-v2' elif env_type == 'to_robot': env = StrikerEnv() agent_prior_location = 'Striker-v2' else: raise NotImplementedError elif env_name == 'Pusher-v2': im_side = 48 im_shape = [im_side, im_side] expert_prior_location = 'Pusher-v2' if env_type == 'expert': env = PusherEnv() agent_prior_location = 'Pusher-v2' elif env_type == 'to_human': env = PusherHumanSimEnv() agent_prior_location = 'PusherHuman-v2' else: raise NotImplementedError elif env_name == 'PusherHumanSim-v2': im_side = 48 im_shape = [im_side, im_side] expert_prior_location = 'PusherHumanSim-v2' if env_type == 'expert': env = PusherHumanSimEnv() agent_prior_location = 'PusherHumanSim-v2' elif env_type == 'to_robot': env = PusherEnv() agent_prior_location = 'Pusher-v2' else: raise NotImplementedError else: raise NotImplementedError expert_buffer = DemonstrationsReplayBuffer( load_expert_trajectories(env_name, file_location, visual_data=True, load_ids=True, max_demos=n_expert_demos)) expert_visual_data_shape = expert_buffer.get_random_batch( 1)['ims'][0].shape print('Visual data shape: {}'.format(expert_visual_data_shape)) past_frames = expert_visual_data_shape[0] print('Past frames: {}'.format(past_frames)) if d_prior_mi_constant > 0.0 or d_negative_priors: prior_expert_buffer = DemonstrationsReplayBuffer( load_expert_trajectories(agent_prior_location, prior_file_location, visual_data=True, load_ids=True, max_demos=n_expert_prior_demos)) prior_agent_buffer = DemonstrationsReplayBuffer( load_expert_trajectories(expert_prior_location, prior_file_location, visual_data=True, load_ids=True, max_demos=n_agent_prior_demos)) else: prior_expert_buffer, prior_agent_buffer = None, None if d_type == 'latent': im_shape += [3] else: im_shape += [3 * past_frames] action_size = env.action_space.shape[0] if exp_num == -1: logz.configure_output_dir(None, True) else: log_dir = osp.join('experiments_data/', '{}/{}'.format(exp_name, exp_num)) logz.configure_output_dir(log_dir, True) params = { 'exp': exp_params, 'learner': learner_params, 'discriminator': discriminator_params, } print(params) logz.save_params(params) if l_type == 'TD3': def make_actor(): actor = Actor([ 
tf.keras.layers.Dense(400, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense(300, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense( action_size, 'tanh', kernel_initializer=tf.keras.initializers.Orthogonal(0.01)) ]) return actor def make_critic(): critic = Critic([ tf.keras.layers.Dense(400, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense(300, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense( 1, kernel_initializer=tf.keras.initializers.Orthogonal(0.01)) ]) return critic elif l_type == 'SAC': def make_actor(): actor = StochasticActor([ tf.keras.layers.Dense(256, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense(256, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense( action_size * 2, kernel_initializer=tf.keras.initializers.Orthogonal(0.01)) ]) return actor def make_critic(): critic = Critic([ tf.keras.layers.Dense(256, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense(256, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense( 1, kernel_initializer=tf.keras.initializers.Orthogonal(0.01)) ]) return critic if l_target_entropy is None: l_target_entropy = -1 * (np.prod(env.action_space.shape)) else: raise NotImplementedError d_optimizer = tf.keras.optimizers.Adam(learning_rate=d_learning_rate) d_mi_optimizer = tf.keras.optimizers.Adam(learning_rate=d_mi_learning_rate) d_mi_lagrangian_optimizer = tf.keras.optimizers.Adam( learning_rate=d_mi_lagrangian_lr) tfl = tf.keras.layers if d_type == 'latent': pre_layers = [tfl.Reshape(im_shape)] else: pre_layers = [tfl.Permute((2, 3, 1, 4)), tfl.Reshape(im_shape)] if (d_type == 'latent') or (not d_sn_discriminator): for filters in d_pre_filters[:-1]: pre_layers += [ tfl.Conv2D(filters, 3, activation='tanh', padding='same'), tfl.MaxPooling2D(2, padding='same') ] pre_layers += [ tfl.Conv2D(d_pre_filters[-1], 3, padding='same'), tfl.MaxPooling2D(2, padding='same'), tfl.Reshape([-1]) ] else: for filters in d_pre_filters[:-1]: pre_layers += [ SpectralNormalization(tfl.Conv2D(filters, 3, padding='same')), tfl.LeakyReLU(), tfl.MaxPooling2D(2, padding='same') ] pre_layers += [ SpectralNormalization( tfl.Conv2D(d_pre_filters[-1], 3, padding='same')), tfl.MaxPooling2D(2, padding='same'), tfl.Reshape([-1]) ] if d_sn_discriminator: disc_layers = [ SpectralNormalization(tfl.Dense(units, activation='relu')) for units in d_hidden_units ] disc_layers.append(SpectralNormalization(tfl.Dense(1))) else: disc_layers = [ tfl.Dense(units, activation='tanh') for units in d_hidden_units ] disc_layers.append(tfl.Dense(1)) if d_type == 'latent': def make_pre(): pre = GaussianPreprocessor(pre_layers, d_pre_scale_stddev) return pre def make_disc(): disc = InvariantDiscriminator(disc_layers, d_stability_constant, d_rew) return disc else: def make_pre(): pre = DeterministicPreprocessor(pre_layers) return pre def make_disc(): disc = InvariantDiscriminator(disc_layers, d_stability_constant, d_rew) return disc mi_layers = [ tfl.Dense(units, activation='tanh') for units in d_mi_hidden_units ] mi_layers.append(tfl.Dense(1)) def make_mi_est(): mi_est = MIEstimator(mi_layers) return mi_est if d_double_mi: mi2_layers = [ tfl.Dense(units, activation='tanh') for units in d_mi2_hidden_units ] mi2_layers.append(tfl.Dense(1)) def make_mi2_est(): mi2_est = MIEstimator(mi2_layers) return mi2_est else: make_mi2_est = None l_optimizer = tf.keras.optimizers.Adam(l_learning_rate) if l_type == 'TD3': l_agent = DDPG( make_actor=make_actor, make_critic=make_critic, make_critic2=make_critic, 
actor_optimizer=l_optimizer, critic_optimizer=l_optimizer, gamma=l_gamma, polyak=l_polyak, train_actor_noise=l_train_actor_noise, clip_actor_gradients=l_clip_actor_gradients, ) elif l_type == 'SAC': l_agent = SAC( make_actor=make_actor, make_critic=make_critic, make_critic2=make_critic, actor_optimizer=l_optimizer, critic_optimizer=l_optimizer, gamma=l_gamma, polyak=l_polyak, entropy_coefficient=l_entropy_coefficient, tune_entropy_coefficient=l_tune_entropy_coefficient, target_entropy=l_target_entropy, clip_actor_gradients=l_clip_actor_gradients, ) else: raise NotImplementedError sampler = Sampler(env, episode_limit, init_random_samples, visual_env=True) gail = DisentanGAIL( agent=l_agent, make_discriminator=make_disc, make_preprocessing=make_pre, expert_buffer=expert_buffer, prior_expert_buffer=prior_expert_buffer, prior_agent_buffer=prior_agent_buffer, make_mi_estimator=make_mi_est, make_mi2_estimator=make_mi2_est, use_min_double_mi=d_use_min_double_mi, d_loss=d_loss, d_optimizer=d_optimizer, mi_optimizer=d_mi_optimizer, label_smoothing=d_label_smoothing, stab_const=d_stability_constant, mi_constant=d_mi_constant, adaptive_mi=d_adaptive_mi, max_mi=d_max_mi, min_mi=d_min_mi, prior_mi_constant=d_prior_mi_constant, negative_priors=d_negative_priors, max_mi_prior=d_max_mi_prior, use_dual_mi=d_use_dual_mi, mi_lagrangian_optimizer=d_mi_lagrangian_optimizer, max_mi_constant=d_max_mi_constant, min_mi_constant=d_min_mi_constant, min_mi_prior_constant=d_min_mi_prior_constant, unbiased_mi=d_unbiased_mi, clip_mi_predictions=d_clip_mi_predictions, unbiased_mi_decay=d_unbiased_mi_decay, im_side=im_side, past_frames=past_frames, ) agent_buffer = LearnerAgentReplayBuffer(gail, l_buffer_size, reward_noise=d_rew_noise) test_input = expert_buffer.get_random_batch(1) test_input['obs'] = np.expand_dims((env.reset()['obs']).astype('float32'), axis=0) gail(test_input) gail.summary() mean_test_returns = [] mean_test_std = [] steps = [] step_counter = 0 logz.log_tabular('Iteration', 0) logz.log_tabular('Steps', step_counter) print('Epoch {}/{} - total steps {}'.format(0, epochs, step_counter)) out = sampler.evaluate(l_agent, test_runs_per_epoch, False) mean_test_returns.append(out['mean']) mean_test_std.append(out['std']) steps.append(step_counter) for k, v in out.items(): logz.log_tabular(k, v) logz.dump_tabular() for e in range(epochs): while step_counter < (e + 1) * steps_per_epoch: traj_data = sampler.sample_trajectory(l_agent, l_exploration_noise) agent_buffer.add(traj_data) n = traj_data['n'] step_counter += traj_data['n'] if step_counter > training_starts: gail.train( agent_buffer=agent_buffer, l_batch_size=l_batch_size, l_updates=l_updates_per_step * n, l_act_delay=l_act_delay, d_updates=d_updates_per_step * n, mi_updates=d_mi_updates_per_step * n, d_e_batch_size=d_e_batch_size, d_l_batch_size=d_l_batch_size, ) logz.log_tabular('Iteration', e + 1) logz.log_tabular('Steps', step_counter) print('Epoch {}/{} - total steps {}'.format(e + 1, epochs, step_counter)) traj_test = sampler.sample_test_trajectories(l_agent, 0.0, test_runs_per_epoch) out = log_trajectory_statistics(traj_test['ret'], False) mean_test_returns.append(out['mean']) mean_test_std.append(out['std']) steps.append(step_counter) for k, v in out.items(): logz.log_tabular(k, v) logz.dump_tabular() if save_weights_checkpoints: weights_log_dir = 'experiments_data/{}/{}/{}/{}.h5'.format( exp_name, exp_num, 'weights', e) l_agent.save_weights(weights_log_dir) if visualize_collected_observations: training_sample = traj_data['ims'][-1, 0] 
print('Visualization of latest training sample') plt.imshow(training_sample) plt.show() test_sample = traj_test['ims'][-1, 0] print('Visualization of latest test sample') plt.imshow(test_sample) plt.show() if out['mean'] >= return_threshold: print('Early termination due to reaching return threshold') break if return_agent_buffer: return gail, sampler, agent_buffer else: return gail, sampler
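# run_experiment is driven entirely by the three parameter dictionaries; any key that is omitted
# falls back to the .get(...) default shown in the function body. Below is a minimal invocation
# sketch. The values are illustrative, and it assumes the expert demonstration files referenced
# via expert_samples_location / prior_samples_location are present on disk.
exp_params = {
    'env_name': 'InvertedPendulum-v2',
    'env_type': 'expert',
    'epochs': 50,
    'steps_per_epoch': 1000,
    'episode_limit': 200,
}
learner_params = {
    'l_type': 'SAC',        # or 'TD3'
    'l_buffer_size': 10000,
    'l_batch_size': 128,
}
discriminator_params = {
    'd_type': 'latent',
    'd_loss': 'ce',
    'd_mi_constant': 0.1,
}

# With return_agent_buffer left at its default (False), two objects are returned.
gail, sampler = run_experiment(exp_params, learner_params, discriminator_params)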
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, network_activation='tanh' ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds torch.manual_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #activation function for the network if network_activation=='relu': activation=torch.nn.functional.relu elif network_activation=='leaky_relu': activation=torch.nn.functional.leaky_relu else: activation=torch.nn.functional.tanh #todo: create policy actor=build_mlp(ob_dim, ac_dim, "actor",\ n_layers=n_layers, size=size, activation=activation, discrete=discrete) actor_loss=reinforce_loss actor_optimizer=torch.optim.Adam(actor.parameters(), lr=learning_rate) #todo: initilize Agent: #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: critic=build_mlp(ob_dim,1,"nn_baseline",\ n_layers=n_layers,size=size, discrete=discrete) critic_loss=nn.MSELoss() critic_optimizer=torch.optim.Adam(critic.parameters(), lr=learning_rate) #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards, log_probs = [], [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) ob = torch.from_numpy(ob).float().unsqueeze(0) obs.append(ob) ac, log_prob = actor.run(ob) acs.append(ac) log_probs.append(log_prob) #format the action from policy if discrete: ac = int(ac) else: ac = ac.squeeze(0).numpy() ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > 
max_path_length: break path = {"observation" : torch.cat(obs, 0), "reward" : torch.Tensor(rewards), "action" : torch.cat(acs, 0), "log_prob" : torch.cat(log_probs, 0)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch ob_no = torch.cat([path["observation"] for path in paths], 0) ac_na = torch.cat([path["action"] for path in paths], 0) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# q_n = [] for path in paths: rewards = path['reward'] num_steps = pathlength(path) R=[] if reward_to_go: for t in range(num_steps): R.append((torch.pow(gamma, torch.arange(num_steps-t))*rewards[t:]).sum().view(-1,1)) q_n.append(torch.cat(R)) else: q_n.append((torch.pow(gamma, torch.arange(num_steps)) * rewards).sum() * torch.ones(num_steps, 1)) q_n = torch.cat(q_n, 0) #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = critic(ob_no) q_n_std = q_n.std() q_n_mean = q_n.mean() b_n_scaled = b_n * q_n_std + q_n_mean adv_n = (q_n - b_n_scaled).detach() else: adv_n = q_n #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
# YOUR_CODE_HERE adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + np.finfo(np.float32).eps.item()) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE target = (q_n - q_n_mean) / (q_n_std + np.finfo(np.float32).eps.item()) critic_optimizer.zero_grad() c_loss = critic_loss(b_n, target) c_loss.backward() critic_optimizer.step() #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE log_probs = torch.cat([path["log_prob"] for path in paths], 0) actor_optimizer.zero_grad() loss = actor_loss(log_probs, adv_n, len(paths)) print(loss) loss.backward() actor_optimizer.step() # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
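# The actor update above calls reinforce_loss(log_probs, adv_n, len(paths)), which is defined
# elsewhere in the assignment. The sketch below is one plausible implementation consistent with
# that call signature (advantage-weighted negative log-probabilities, averaged over
# trajectories); it is an assumption, not the assignment's actual code.
import torch

def reinforce_loss(log_probs, adv_n, num_paths):
    """Policy-gradient surrogate: -(1/num_paths) * sum_t log pi(a_t|s_t) * A_t."""
    return -(log_probs.view(-1) * adv_n.view(-1)).sum() / num_paths

# Example with dummy tensors:
lp = torch.log(torch.tensor([0.5, 0.25, 0.8]))
adv = torch.tensor([1.0, -0.5, 0.2])
print(reinforce_loss(lp, adv, num_paths=1))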
def run_vanilla_policy_gradient_experiment(args, vf_params, logdir, env, sess, continuous_control): """ General purpose method to run vanilla policy gradients. Works for both continuous and discrete environments. Roughly inspired by starter code for this homework and https://github.com/DanielTakeshi/rl_algorithms/blob/master/vpg/main.py Thanks! Params ------ args: arguments for vanilla policy gradient. vf_params: dict of params for value function logdir: where to store outputs or None if you don't want to store anything env: openai gym env sess: TF session continuous_control: boolean, if true then we do gaussian continuous control """ ob_dim = env.observation_space.shape[0] if args.vf_type == 'linear': value_function = LinearValueFunction(**vf_params) elif args.vf_type == 'nn': value_function = NnValueFunction(session=sess, ob_dim=ob_dim) #value_function = LinearValueFunction() if continuous_control: ac_dim = env.action_space.shape[0] policy_fn = policies.GaussianPolicy(sess, ob_dim, ac_dim) else: ac_dim = env.action_space.n policy_fn = policies.DisceretePolicy(sess, ob_dim, ac_dim) sess.__enter__() # equivalent to with sess, to reduce indentation tf.global_variables_initializer().run() total_timesteps = 0 stepsize = args.initial_stepsize filterAction = 0.1 stepMax = 100 for i in range(args.n_iter): print("\n********** Iteration %i ************" % i) # Collect paths until we have enough timesteps. timesteps_this_batch = 0 paths = [] step = 0 #if(filterAction > 1.0): # filterAction = 1.0 #else: # filterAction = filterAction*1.1 while True: ob = env.reset() terminated = False obs, acs, rewards = [], [], [] animate_this_episode = ( len(paths) == 0 and (i % 10 == 0) and args.render) while True: if animate_this_episode: env.render() obs.append(ob) ac = policy_fn.sample_action(ob) acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) step = step + 1 if done: step = 0 #print "done " break #if done or step > stepMax: # print "max steps: {}".format(stepMax) # step = 0 # stepMax = stepMax + 2 # break path = {"observation": np.array(obs), "terminated": terminated, "reward": np.array(rewards), "action": np.array(acs)} paths.append(path) timesteps_this_batch += utils.pathlength(path) if timesteps_this_batch > args.min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Estimate advantage function using baseline vf (these are lists!). # return_t: list of sum of discounted rewards (to end of # episode), one per time # vpred_t: list of value function's predictions of components of # return_t vtargs, vpreds, advs = [], [], [] for path in paths: rew_t = path["reward"] return_t = utils.discount(rew_t, args.gamma) vpred_t = value_function.predict(path["observation"]) adv_t = return_t - vpred_t advs.append(adv_t) vtargs.append(return_t) vpreds.append(vpred_t) # Build arrays for policy update and **re-fit the baseline**. ob_no = np.concatenate([path["observation"] for path in paths]) ac_n = np.concatenate([path["action"] for path in paths]) adv_n = np.concatenate(advs) std_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) vtarg_n = np.concatenate(vtargs) vpred_n = np.concatenate(vpreds) value_function.fit(ob_no, vtarg_n) # Policy update, plus diagnostics stuff. Is there a better way to # handle # the continuous vs discrete control cases? 
if continuous_control: surr_loss, oldmean_na, oldlogstd_a = policy_fn.update_policy( ob_no, ac_n, std_adv_n, stepsize) kl, ent = policy_fn.kldiv_and_entropy( ob_no, oldmean_na, oldlogstd_a ) else: surr_loss, oldlogits_na = policy_fn.update_policy( ob_no, ac_n, std_adv_n, stepsize) kl, ent = policy_fn.kldiv_and_entropy(ob_no, oldlogits_na) # Step size heuristic to ensure that we don't take too large steps. if args.use_kl_heuristic: if kl > args.desired_kl * 2: stepsize /= 1.5 print('PG stepsize -> %s' % stepsize) elif kl < args.desired_kl / 2: stepsize *= 1.5 print('PG stepsize -> %s' % stepsize) else: print('PG stepsize OK') # Log diagnostics if i % args.log_every_t_iter == 0: logz.log_tabular("EpRewMean", np.mean( [path["reward"].sum() for path in paths])) logz.log_tabular("EpLenMean", np.mean( [utils.pathlength(path) for path in paths])) logz.log_tabular("KLOldNew", kl) logz.log_tabular("Entropy", ent) logz.log_tabular("EVBefore", utils.explained_variance_1d(vpred_n, vtarg_n)) logz.log_tabular("EVAfter", utils.explained_variance_1d(value_function.predict(ob_no), vtarg_n)) logz.log_tabular("SurrogateLoss", surr_loss) logz.log_tabular("TimestepsSoFar", total_timesteps) # If you're overfitting, EVAfter will be way larger than # EVBefore. # Note that we fit the value function AFTER using it to # compute the # advantage function to avoid introducing bias logz.dump_tabular()
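# Both baselines above are used only through fit(X, y) and predict(X). The linear variant is
# sketched below, assuming (as in similar starter code) a ridge-regularized least-squares fit
# on simple quadratic features of the observation; the feature choice and regularization
# constant are illustrative assumptions, not the project's actual value function.
import numpy as np

class LinearValueFunction:
    def __init__(self, **kwargs):
        self.coef = None  # extra vf_params are accepted but unused in this sketch

    def _features(self, X):
        X = np.asarray(X)
        # observation, its elementwise square, and a bias column
        return np.concatenate([X, np.square(X) / 2.0, np.ones((len(X), 1))], axis=1)

    def fit(self, X, y):
        A = self._features(X)
        reg = 1e-3 * np.eye(A.shape[1])  # small ridge term keeps the solve well conditioned
        self.coef = np.linalg.solve(A.T @ A + reg, A.T @ np.asarray(y))

    def predict(self, X):
        if self.coef is None:
            return np.zeros(len(X))      # before the first fit, predict zero value
        return self._features(X) @ self.coef

# Usage mirrors the loop above: predict -> compute advantages -> refit on the new returns.
vf = LinearValueFunction()
obs, returns = np.random.randn(32, 4), np.random.randn(32)
vf.fit(obs, returns)
print(vf.predict(obs).shape)  # (32,)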
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100,
             gamma=1.0,
             min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=True,
             animate=True,
             logdir=None,
             normalize_advantages=True,
             nn_baseline=False,
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    #   ob - observation
    #   ac - action
    #   _no - this tensor should have shape (batch size /n/, observation dim)
    #   _na - this tensor should have shape (batch size /n/, action dim)
    #   _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # TODO: create the Agent
    # TODO: initialize the Agent

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    # (retained from the TensorFlow version of this script)
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = actor.run(ob)
                # TODO: type-check / convert the sampled action here (e.g. tensor -> int
                # or numpy array) before passing it to env.step.
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            # One episode finishes; perform the policy update here
            finish_episode(actor, actor_optimizer, critic=None, critic_optimizer=None)
            path = {"observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100,
             gamma=1.0,
             min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=True,
             animate=True,
             logdir=None,
             normalize_advantages=True,
             nn_baseline=False,
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    #   ob - observation
    #   ac - action
    #   _no - this tensor should have shape (batch size /n/, observation dim)
    #   _na - this tensor should have shape (batch size /n/, action dim)
    #   _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = TODO

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,       z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = TODO
        sy_sampled_ac = TODO  # Hint: Use the tf.multinomial op
        sy_logprob_n = TODO
    else:
        # YOUR_CODE_HERE
        sy_mean = TODO
        sy_logstd = TODO  # logstd should just be a trainable variable, not a network output.
        sy_sampled_ac = TODO
        sy_logprob_n = TODO  # Hint: Use the log probability under a multivariate gaussian.
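
    # --- Sketch: one possible implementation of the TODOs above (not the official
    # solution). It assumes build_mlp(input_placeholder, output_size, scope, n_layers, size)
    # returns the output tensor of an MLP, per the hint in the comment block; the scope
    # name "policy" and the placeholder name "adv" are illustrative choices.
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
    if discrete:
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size)
        # Draw one action per row of logits; tf.multinomial returns shape [None, 1].
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), [-1])
        # log pi(a|s) for the actions actually taken.
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=sy_logits_na)
    else:
        sy_mean = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size)
        sy_logstd = tf.get_variable("logstd", shape=[ac_dim], dtype=tf.float32)
        # Reparameterization trick: mu + sigma * z with z ~ N(0, I).
        sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(tf.shape(sy_mean))
        # Log density of a diagonal Gaussian evaluated at the taken actions.
        sy_logprob_n = (-0.5 * tf.reduce_sum(
                            tf.square((sy_ac_na - sy_mean) / tf.exp(sy_logstd)), axis=1)
                        - tf.reduce_sum(sy_logstd)
                        - 0.5 * ac_dim * np.log(2 * np.pi))
    # ---------------------------------------------------------------------------------------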

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss = TODO  # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
            sy_ob_no,
            1,
            "nn_baseline",
            n_layers=n_layers,
            size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        baseline_update_op = TODO
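
    # --- Sketch: one possible loss / baseline-fitting setup for the TODOs above (not the
    # official solution). The target placeholder name `sy_target_n` is illustrative.
    # The policy gradient surrogate maximizes mean[ log pi(a|s) * advantage ], so Adam is
    # asked to minimize its negative.
    loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    if nn_baseline:
        # Fit the baseline to (rescaled) reward-to-go targets fed in at update time.
        sy_target_n = tf.placeholder(shape=[None], name="baseline_target", dtype=tf.float32)
        baseline_loss = tf.losses.mean_squared_error(labels=sy_target_n,
                                                     predictions=baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss)
    # ---------------------------------------------------------------------------------------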

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t)]
        #
        # where
        #
        #       tau = (s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG   (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       the entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG   (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #====================================================================================#

        # YOUR_CODE_HERE
        q_n = TODO
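
        # --- Sketch: one possible Q-value computation for the TODO above (not the official
        # solution); it fills `q_n` from the `paths`, `gamma`, and `reward_to_go` already
        # in scope. The temporary names (`q_parts`, `path_rewards`) are illustrative.
        q_parts = []
        for path in paths:
            path_rewards = path["reward"]
            if reward_to_go:
                # Case 2: Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
                q, running = np.zeros(len(path_rewards)), 0.0
                for t in reversed(range(len(path_rewards))):
                    running = path_rewards[t] + gamma * running
                    q[t] = running
            else:
                # Case 1: Q_t = Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'} at every timestep
                ret = sum((gamma ** t) * r for t, r in enumerate(path_rewards))
                q = np.full(len(path_rewards), ret)
            q_parts.append(q)
        q_n = np.concatenate(q_parts)
        # ------------------------------------------------------------------------------------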

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n = TODO
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            pass

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#

        if nn_baseline:
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)
            # YOUR_CODE_HERE
            pass

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.
        # YOUR_CODE_HERE

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
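
# --- Sketch (not part of the assignment skeleton): one way to carry out the
# "Advantage Normalization" and "Performing the Policy Update" steps left as TODOs in the
# training loop above. Wrapping them in a helper, and the helper's name and signature, are
# purely illustrative; inside train_PG the same lines would run directly against the
# placeholders and ops built there.
def run_policy_update(sess, update_op, loss, sy_ob_no, sy_ac_na, sy_adv_n,
                      ob_no, ac_na, adv_n, normalize_advantages=True):
    if normalize_advantages:
        # Normalize advantages to mean zero / std one to reduce gradient variance.
        adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
    feed = {sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n}
    loss_before = sess.run(loss, feed_dict=feed)   # optional: log the pre-update loss
    sess.run(update_op, feed_dict=feed)
    loss_after = sess.run(loss, feed_dict=feed)    # optional: log the post-update loss
    return loss_before, loss_after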