def __init__(self, env_name='HalfCheetah-v1',
             policy_params=None,
             num_workers=32,
             num_deltas=320,
             deltas_used=320,
             delta_std=0.02,
             logdir=None,
             rollout_length=1000,
             step_size=0.01,
             shift='constant zero',
             params=None,
             seed=123):

    logz.configure_output_dir(logdir)
    logz.save_params(params)

    env = gym.make(env_name)

    self.timesteps = 0
    self.action_size = env.action_space.shape[0]
    self.ob_size = env.observation_space.shape[0]
    self.num_deltas = num_deltas
    self.deltas_used = deltas_used
    self.rollout_length = rollout_length
    self.step_size = step_size
    self.delta_std = delta_std
    self.logdir = logdir
    self.shift = shift
    self.params = params
    self.max_past_avg_reward = float('-inf')
    self.num_episodes_used = float('inf')

    # create shared table for storing noise
    print("Creating deltas table.")
    deltas_id = create_shared_noise.remote()
    self.deltas = SharedNoiseTable(ray.get(deltas_id), seed=seed + 3)
    print('Created deltas table.')

    # initialize workers with different random seeds
    print('Initializing workers.')
    self.num_workers = num_workers
    self.workers = [Worker.remote(seed + 7 * i,
                                  env_name=env_name,
                                  policy_params=policy_params,
                                  deltas=deltas_id,
                                  rollout_length=rollout_length,
                                  delta_std=delta_std)
                    for i in range(num_workers)]

    # initialize policy
    if policy_params['type'] == 'linear':
        self.policy = LinearPolicy(policy_params)
        self.w_policy = self.policy.get_weights()
    else:
        raise NotImplementedError

    # initialize optimization algorithm
    self.optimizer = optimizers.SGD(self.w_policy, self.step_size)
    print("Initialization of ARS complete.")
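# --- Hedged illustration (not the ARS repository's implementation) ---
# The constructor above expects a LinearPolicy exposing get_weights(). As a minimal
# sketch of what such a policy could look like, the class below uses assumed
# policy_params keys 'ob_dim' and 'ac_dim' and an assumed weight shape; the real ARS
# policy additionally maintains online observation normalization.
import numpy as np

class LinearPolicySketch:
    """Minimal stand-in for LinearPolicy: a deterministic map a = W @ s."""

    def __init__(self, policy_params):
        self.ob_dim = policy_params['ob_dim']    # assumed key
        self.ac_dim = policy_params['ac_dim']    # assumed key
        # Start from the zero policy, as ARS does.
        self.weights = np.zeros((self.ac_dim, self.ob_dim), dtype=np.float64)

    def get_weights(self):
        return self.weights

    def act(self, ob):
        # Deterministic action: a linear map of the observation.
        return np.dot(self.weights, ob)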
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(Supervisor.__init__)[0]
    params = {k: locals_[k]
              if k in locals_
              and not isinstance(locals_[k], types.FunctionType)
              and k != "self"
              else None
              for k in args}
    logz.save_params(params)
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    # args = inspect.getargspec(learn)[0]
    # params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(locals_.get("kwargs"))
def __init__(self, env_name='HalfCheetah-v1',
             policy_params=None,
             num_workers=32,
             num_deltas=320,
             deltas_used=320,
             delta_std=0.02,
             logdir=None,
             rollout_length=1000,
             step_size=0.01,
             shift='constant zero',
             params=None,
             seed=123):

    logz.configure_output_dir(logdir)
    logz.save_params(params)

    env = minitaur_gym_env.MinitaurBulletEnv()  # gym.make(env_name)

    self.timesteps = 0
    self.action_size = env.action_space.shape[0]
    self.ob_size = env.observation_space.shape[0]
    self.num_deltas = num_deltas
    self.deltas_used = deltas_used
    self.rollout_length = rollout_length
    self.step_size = step_size
    self.delta_std = delta_std
    self.logdir = logdir
    self.shift = shift
    self.params = params
    self.max_past_avg_reward = float('-inf')
    self.num_episodes_used = float('inf')

    # create shared table for storing noise
    print("Creating deltas table.")
    deltas_id = create_shared_noise.remote()
    self.deltas = SharedNoiseTable(ray.get(deltas_id), seed=seed + 3)
    print('Created deltas table.')

    # initialize workers with different random seeds
    print('Initializing workers.')
    self.num_workers = num_workers
    self.workers = [Worker.remote(seed + 7 * i,
                                  env_name=env_name,
                                  policy_params=policy_params,
                                  deltas=deltas_id,
                                  rollout_length=rollout_length,
                                  delta_std=delta_std)
                    for i in range(num_workers)]

    # initialize policy
    if policy_params['type'] == 'linear':
        self.policy = LinearPolicy(policy_params)
        self.w_policy = self.policy.get_weights()
    else:
        raise NotImplementedError

    # initialize optimization algorithm
    self.optimizer = optimizers.SGD(self.w_policy, self.step_size)
    print("Initialization of ARS complete.")
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(QLearner)[0]
    params = {k: str(locals_[k]) if k in locals_ else None for k in args}
    params['exp_name'] = locals_['q_func'].__name__ + locals_['double_q'] * '_doubleQ'
    logz.save_params(params)
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    seed = np.random.get_state()[1][0]
    logz.configure_output_dir(logdir + '/%s/' % seed)

    # Log experimental parameters
    params = {k: str(locals_[k]) for k in locals_ if '__' not in k}
    params['seed'] = str(seed)
    logz.save_params(params)
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    params = {k: locals_[k] if k in locals_ else None for k in args}
    # print(params.items())
    # print(json.dumps(list(params.values())))
    logz.save_params(params)
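# --- Hedged sketch: the same parameter-logging pattern without inspect.getargspec ---
# The setup_logger variants above all rely on inspect.getargspec, which is deprecated
# and removed in Python 3.11+. Assuming the same local logz module, inspect.signature
# recovers the argument names; the extra `fn` parameter and the function name are
# illustrative generalizations, not part of the original code.
import inspect
import types

import logz  # assumed: the same local logging module used throughout these snippets

def setup_logger_v2(logdir, locals_, fn):
    """Log the named arguments of `fn` that appear in `locals_`."""
    logz.configure_output_dir(logdir)
    args = list(inspect.signature(fn).parameters)  # replaces inspect.getargspec(fn)[0]
    params = {k: locals_[k]
              if k in locals_
              and not isinstance(locals_[k], types.FunctionType)
              and k != "self"
              else None
              for k in args}
    logz.save_params(params)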
def __init__(
        self,
        organism_builder=None,
        logdir=None,
        params=None,
        master_organism=None,
        sampler_builder=None,
):
    logz.configure_output_dir(logdir)
    logz.save_params(params)

    # env = env_registry.get_env_constructor(params['env_name'])()
    self.logdir = logdir
    self.params = params
    self.max_past_avg_reward = float('-inf')
    self.num_episodes_used = float('inf')

    # create shared table for storing noise
    print("Creating deltas table.")
    deltas_id = create_shared_noise_serial()
    self.deltas = SharedNoiseTable(deltas_id, seed=params['seed'] + 3)
    print('Created deltas table.')

    ########################################################
    self.master_organism = master_organism
    self.sampler = sampler_builder(
        num_deltas=params['n_directions'],
        shift=params['shift'],
        num_workers=params['n_workers'],
        seed=params['seed'],
        env_name=params['env_name'],
        organism_builder=organism_builder,  # e.g. lambda: ARS_LinearAgent(agent_args)
        deltas_id=deltas_id,
        rollout_length=params['rollout_length'],
        delta_std=params['delta_std'],
    )
    # Design note: the agent only holds the parameters, while the sampler takes an
    # agent builder and performs the parallel rollouts. The workers therefore belong
    # to the sampler, not to the agent; the sampler instantiates one agent copy per
    # worker from the builder.

    self.rl_alg = ARS_RL_Alg(
        deltas=self.deltas,                 # noise table
        num_deltas=params['n_directions'],  # N
        deltas_used=params['deltas_used']   # b
    )
def run_model(session, predict, loss, train_step, saver, images, labels, X, y,
              epochs=1, batch_size=64, print_every=100, is_test=False):
    if not is_test:
        # Configure output directory for logging
        logz.configure_output_dir('logs')
        # Log experimental parameters
        args = inspect.getargspec(main)[0]  # names and default values of main()'s parameters
        locals_ = locals()                  # dictionary containing the current scope's local variables
        params = {k: locals_[k] if k in locals_ else None for k in args}
        logz.save_params(params)

    # have tensorflow compute accuracy
    correct_prediction = tf.equal(tf.argmax(predict, axis=1), tf.argmax(y, axis=1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # counter
    iter_cnt = 0
    iters_each_epoch = len(images) // batch_size

    for e in range(epochs):
        # keep track of losses and accuracy
        correct = 0
        losses = []
        # make sure we iterate over the dataset once
        images, labels = shuffle_dataset(images, labels)
        for i in range(iters_each_epoch):
            # slice out the i-th minibatch (starting from batch 0)
            batch_X = images[i * batch_size:(i + 1) * batch_size]
            batch_y = labels[i * batch_size:(i + 1) * batch_size]
            feed_dict = {X: batch_X, y: batch_y}

            # have tensorflow compute loss and correct predictions
            # and (if given) perform a training step
            l, corr, _ = session.run([loss, correct_prediction, train_step],
                                     feed_dict=feed_dict)

            # aggregate performance stats
            losses.append(l * batch_size)
            correct += np.sum(corr)

            # print every now and then
            if (iter_cnt % print_every) == 0 and not is_test:
                logz.log_tabular("Iteration", iter_cnt)
                logz.log_tabular("minibatch_loss", l)
                logz.log_tabular("minibatch_accuracy", np.sum(corr) / batch_size)
                logz.dump_tabular()
                logz.pickle_tf_vars()
            iter_cnt += 1

    if is_test:
        total_correct = correct / len(images)
        total_loss = np.sum(losses) / len(images)
        print('acc:', total_correct)
        print('los:', total_loss)
    else:
        saver.save(session, 'checkpoints/mnist_plus', iter_cnt)
def setup_logger(logdir, params):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters, dropping anything that is not JSON-serializable
    # args = inspect.getargspec(learn)[0]
    check_params = params.copy()
    log_params = params.copy()
    for param in check_params.keys():
        try:
            json.dumps(check_params[param])
        except (TypeError, ValueError):
            del log_params[param]
    logz.save_params(log_params)
def main():
    # Get the LunarLander task.
    task = gym.make('LunarLander-v2')

    file_dir = osp.dirname(osp.abspath(__file__))
    unique_name = datetime.datetime.now(dateutil.tz.tzlocal()).strftime(
        '%Y_%m_%d_%H_%M_%S_%f_%Z') + '__' + str(uuid.uuid4())
    result_dir = osp.join(file_dir, unique_name)
    logz.configure_output_dir(result_dir)
    logz.save_params(dict(exp_name=unique_name))

    # Run training
    seed = 1
    print('random seed = %d' % seed)
    env = get_env(task, seed, result_dir)
    session = get_session()
    atari_learn(env, session, num_timesteps=5e5, result_dir=result_dir)
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32 ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # i need here to give a directory # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds #seed: it makes sure that you will not have the same random number twice/ ref:https://en.wikipedia.org/wiki/Random_seed tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) 
# # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: sy_logits_na =build_mlp(sy_ob_no,ac_dim,"discrete",n_layers,size,activation=tf.nn.relu,output_activation=tf.nn.relu) #print(sy_logits_na.shape) #env_actions=tf.concat(axis=1,values=[sy_logits_na,1-sy_logits_na]) sy_sampled_ac =tf.reshape(tf.multinomial(sy_logits_na,1,seed),[-1]) sy_logprob_n =tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na) else: # YOUR_CODE_HERE #sy_mean =-tf.reduce_mean(build_mlp(sy_ob_no,ac_dim,"cont",n_layers,size,activation=tf.tanh)) #sy_logstd = tf.Variable(tf.random_uniform([None, ac_dim])) # logstd should just be a trainable variable, not a network output. #sy_sampled_ac = tf.random_normal([None, ac_dim],sy_mean,sy_logstd,dtype=tf.float32,seed=seed) #sy_logprob_n = -0.5*(sy_sampled_ac-sy_ac_na)^2 # Hint: Use the log probability under a multivariate gaussian. print("Continous System") #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# loss = tf.reduce_mean(tf.multiply(sy_logprob_n,sy_adv_n)) # Loss function that we'll differentiate to get the policy gradient. update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze(build_mlp( sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. 
# YOUR_CODE_HERE baseline_target=tf.placeholder(shape=[None], name="tr", dtype=tf.float32) baseline_loss=tf.placeholder(shape=[None], name="lo", dtype=tf.float32) #baseline_update_op=tf.placeholder(shape=[None], name="up", dtype=tf.float32) b_loss=tf.losses.mean_squared_error(labels=baseline_target,predictions=baseline_prediction) baseline_update_op=tf.train.AdamOptimizer(learning_rate).minimize(b_loss) #baseline_loss=(baseline_prediction-baseline_target)**2 #baseline_update_op=tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) with tf.Session() as sess: # equivalent to `with sess:` sess.run(tf.global_variables_initializer()) #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode=( (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no :[ob]}) ac = ac[0] acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) print(ac_na.shape, "action sizeeeee") #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) #0 # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. 
# # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# # YOUR_CODE_HERE print(total_timesteps) Q_t=[] if(reward_to_go): #Case 2: reward-to-go PG for no_traj in range(len(paths)): for _ in range(np.size((paths[no_traj])["reward"])): temp_rew=0 t_=np.size((paths[no_traj])["reward"])-1 for no_rew in range(t_+1): temp_rew+=(math.pow(gamma,t_-no_rew)*(((paths[no_traj])["reward"])[no_rew,])) Q_t.append(temp_rew) else:# Case 1: trajectory-based PG count =0 index=len(paths) i=0 t_=0 while(count<=total_timesteps and i <index): for _ in range (np.size((paths[i])["reward"])): Q_t.append((math.pow(gamma,total_timesteps-t_)*((paths[i])["reward"])[_,])) t_+=1 count+=np.size((paths[i])["reward"]) i+=1 q_n=Q_t print(len(q_n)) #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = sess.run(baseline_prediction,feed_dict={sy_ob_no:ob_no}) b_n = preprocessing.scale(b_n) adv_n = q_n - b_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. # YOUR_CODE_HERE adv_n = preprocessing.scale(adv_n) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) 
# YOUR_CODE_HERE target_tmp=1+gamma*b_n target_tmp=preprocessing.scale(target_tmp) sess.run(b_loss,feed_dict={sy_ob_no:ob_no,baseline_target:target_tmp}) sess.run(baseline_update_op,feed_dict={sy_ob_no:ob_no,baseline_target:target_tmp}) #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE #print(sess.run(sy_logits_na,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n})) #print(sess.run(sy_sampled_ac,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n})) loss_=sess.run(loss,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n}) sess.run(update_op,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n}) loss_=sess.run(loss,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n}) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.log_tabular("loss_",loss_) logz.dump_tabular() logz.pickle_tf_vars()
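# --- Hedged alternative to the nested-loop Q-value computation above ---
# Under the same definitions (paths, gamma, reward_to_go), the discounted sums can be
# computed with a single backward pass per path instead of the O(T^2) loops. The helper
# name is illustrative, not part of the original code.
import numpy as np

def compute_q_values(paths, gamma, reward_to_go):
    """Return the concatenated Q-value targets for a batch of paths.

    reward_to_go=True : Q_t = sum_{t'>=t} gamma^(t'-t) * r_{t'}
    reward_to_go=False: every timestep gets the full discounted return of its path.
    """
    q_n = []
    for path in paths:
        rewards = path["reward"]
        q = np.zeros(len(rewards), dtype=np.float64)
        running = 0.0
        # Single backward pass accumulates the discounted tail sum.
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            q[t] = running
        if not reward_to_go:
            # Trajectory-based PG: broadcast Ret(tau) to every timestep.
            q[:] = q[0]
        q_n.append(q)
    return np.concatenate(q_n)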
def train_SAC(env_name, exp_name, n_iter, ep_len, seed, logdir,
              alpha, prefill_steps, discount, batch_size,
              learning_rate, tau, two_qf):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, alpha)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': batch_size,
        'discount': discount,
        'learning_rate': learning_rate,
        'reparameterize': True,
        'tau': tau,
        'epoch_length': ep_len,
        'n_epochs': n_iter,
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': prefill_steps,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }
    value_function_params = {
        'hidden_layer_sizes': (64, 64),
    }
    q_function_params = {
        'hidden_layer_sizes': (64, 64),
    }
    policy_params = {
        'hidden_layer_sizes': (64, 64),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params,
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(
        name='value_function', **value_function_params)
    target_value_function = nn.ValueFunction(
        name='target_value_function', **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU

    with tf.Session(config=tf_config):
        algorithm.build(
            env=env,
            policy=policy,
            q_function=q_function,
            q_function2=q_function2,
            value_function=value_function,
            target_value_function=target_value_function)

        for epoch in algorithm.train(sampler, n_epochs=algorithm_params.get('n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
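# --- Hedged sketch of the role of `tau` in train_SAC above ---
# The standard reading of tau is a Polyak (soft) update of the target value network;
# the function below is an illustrative NumPy version of that rule, not the
# repository's actual API.
import numpy as np

def soft_update(target_weights, source_weights, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    return [tau * w_src + (1.0 - tau) * w_tgt
            for w_src, w_tgt in zip(source_weights, target_weights)]

# Example: with a small tau the target network tracks the online network slowly.
target = [np.zeros((4, 4))]
online = [np.ones((4, 4))]
target = soft_update(target, online, tau=0.01)   # every entry becomes 0.01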
def train_PG( exp_name='', batch_size=250, n_episodes=25000, learning_rate=1e-3, logdir=None, seed=0, # network arguments n_layers=2, size=64): env = Environment() agent1 = Agent(env, n_layers, size, learning_rate, "agent1") agent2 = Agent(env, n_layers, size, learning_rate, "agent2") agent1_Nash = Agent(env, 3, 32, 1e-2, "agent1_Nash") agent2_Nash = Agent(env, 3, 32, 1e-2, "agent2_Nash") start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) n_iter = n_episodes // batch_size #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# for itr in range(n_iter): print("********** Iteration %i ************" % itr) #simulate a batch of temperature-gas price states s = env.samplestatess(batch_size) ag1_prices, _ = agent1.sample_actions(sess, s) ag2_prices, _ = agent2.sample_actions(sess, s) #====================================================================================# # Feed agents' actions into the market simulator and obtain corresponding rewards #====================================================================================# #Convert agent RTM actions to corresponding prices ag1_rewards, ag2_rewards = get_rewards(env, ag1_prices, ag2_prices) #====================================================================================# # # Advantage Normalization #====================================================================================# ag1_adv = normalize(ag1_rewards) ag2_adv = normalize(ag2_rewards) #====================================================================================# # # Performing the Policy Update #====================================================================================# #update policy parameters for agent1 #if (itr % 20 < 10): loss1 = agent1.improve_policy(sess, s, ag1_adv, ag1_prices) #update policy parameters for agent2 #else: loss2 = agent2.improve_policy(sess, s, ag2_adv, ag2_prices) # Log diagnostics logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageProfit_agt1", np.mean(ag1_rewards)) logz.log_tabular("AverageProfit_agt2", np.mean(ag2_rewards)) logz.log_tabular("Agt1_StdReturn", np.std(ag1_rewards)) logz.log_tabular("Agt2_StdReturn", np.std(ag2_rewards)) logz.log_tabular("Agt1_MaxReturn", np.max(ag1_rewards)) logz.log_tabular("Agt2_MaxReturn", np.max(ag2_rewards)) logz.log_tabular("Agt1_MinReturn", np.min(ag1_rewards)) logz.log_tabular("Agt2_MinReturn", np.min(ag2_rewards)) logz.dump_tabular() logz.pickle_tf_vars() m1, m2, m1_m, m2_m, ag1_p, ag2_p = get_smart_rewards( sess, agent1, agent2, env) print("Agent1 Stochastic Profit: " + repr(m1)) print("Agent2 Stochastic Profit: " + repr(m2)) 
print("Agent1 Deterministic Profit: " + repr(m1_m)) print("Agent2 Deterministic Profit: " + repr(m2_m)) print("Agent1 Mean Price") print(ag1_p) print("Agent2 Prices") print(ag2_p) print("Assessing degree of deviation from Nash Eq") ag1_imp, ag2_imp = assess_policy_accuracy(sess, agent1, agent1_Nash, agent2, agent2_Nash, env) print("Agent1 Accuracy: " + repr(ag1_imp)) print("Agent2 Accuracy: " + repr(ag2_imp))
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32 ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(shape=[None], name = "adv", dtype=tf.float32) #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. 
# # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: sy_logits_na = build_mlp(sy_ob_no, ac_dim, "scope", n_layers, size) sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1, seed = seed), [-1]) sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na) #[None] else: sy_mean = build_mlp(sy_ob_no, ac_dim, "scope", n_layers, size) # [None, ac_dim] # logstd should just be a trainable variable, not a network output. sy_logstd = tf.get_variable("logstd", shape = [ac_dim, 1], trainable = True, initializer = tf.contrib.layers.xavier_initializer()) z = tf.random_normal(tf.shape(sy_mean), mean = 0.0, stddev = 1.0, seed = seed) # [None, ac_dim] sigma = tf.reshape(tf.exp(sy_logstd), [1, ac_dim]) # [1, ac_dim] STANDARD DEVIATION sy_sampled_ac = (sigma * z) + sy_mean # [None, ac_dim] # Hint: Use the log probability under a multivariate gaussian. # diff = sy_ac_na - sy_mean # # the implementation below is by hand and assumes that sigma is covariance, though i've changed it to be SD instead. # first_term = -0.5 * tf.diag_part(tf.matmul(diff, tf.matmul(tf.matrix_inverse(sigma), tf.transpose(diff)))) # second_term = -0.5 * ac_dim * tf.log(tf.norm(sigma)) # third_term = -0.5 * ac_dim * tf.log(2*math.pi) # sy_logprob_n = first_term + second_term + third_term # [None, 1] sy_logprob_n = -tf.contrib.distributions.MultivariateNormalDiag(loc=sy_mean, scale_diag=sigma).log_prob(sy_ac_na) # [None] #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# loss = tf.reduce_mean(sy_logprob_n * sy_adv_n) # Loss function that we'll differentiate to get the policy gradient. update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze(build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. 
sy_value_n = tf.placeholder(shape=[None], name = "V", dtype=tf.float32) baseline_loss = tf.losses.mean_squared_error(sy_value_n, baseline_prediction) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) ac = ac[0] acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. 
# #====================================================================================# rewards_n = [path["reward"] for path in paths] if not reward_to_go: weighted_rewards = np.array([[(gamma**i)*r for i, r in enumerate(row)] for row in rewards_n]) q_sums = [sum(row) for row in weighted_rewards] q_n = np.hstack(np.array([[q_sums[i]]*len(weighted_rewards[i]) for i in range(len(weighted_rewards))])) # [None] else: q_n = np.hstack(np.array([[sum(map_gamma(row[i:], gamma)) for i in range(len(row))] for row in rewards_n])) # [None] #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no : ob_no}) b_n = tf.nn.l2_normalize(b_n, 0) q_mean = np.mean(q_n) q_std = np.std(q_n) b_n = b_n * q_std b_n = b_n + q_mean adv_n = q_n - b_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. adv_n = tf.nn.l2_normalize(adv_n, 0) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) q_n = tf.nn.l2_normalize(q_n, 0) sess.run(baseline_update_op, feed_dict={sy_ob_no : ob_no, sy_value_n : q_n.eval()}) #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. 
_, l_next = sess.run([update_op, loss], feed_dict = {sy_ob_no : ob_no, sy_ac_na : ac_na, sy_adv_n : adv_n.eval()}) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.log_tabular("Loss After Update", l_next) logz.dump_tabular() logz.pickle_tf_vars()
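# --- Hedged NumPy version of the normalization used in the loop above ---
# The snippet builds tf.nn.l2_normalize ops inside the training loop (adding nodes to
# the graph every iteration) and normalizes to unit L2 norm, whereas the hints describe
# zero mean and unit standard deviation. A plain-NumPy sketch of the intended
# standardization and the #bl1 rescaling; the epsilon guard is an assumption.
import numpy as np

def standardize(x, eps=1e-8):
    """Shift and scale x to zero mean and unit standard deviation."""
    return (x - np.mean(x)) / (np.std(x) + eps)

def rescale_to(x, target, eps=1e-8):
    """Hint #bl1: match x's statistics to those of `target` (e.g. the current Q batch)."""
    return standardize(x, eps) * np.std(target) + np.mean(target)

# Usage inside the loop (plain NumPy, so nothing is added to the TF graph):
# b_n   = rescale_to(b_n, q_n)
# adv_n = q_n - b_n
# adv_n = standardize(adv_n)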
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, num_threads_gen=1, multi_steps_gd=1, reuse_nn_bl=False): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# tf.reset_default_graph() sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. 
Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: # YOUR_CODE_HERE sy_logits_na = build_mlp(sy_ob_no, ac_dim, "nn", n_layers=n_layers, size=size) # Hint: Use the tf.multinomial op # the shape -1 automatically infers that the reshape will be done in the None axis sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), shape=[-1]) # negative in front is to remove the negative nature of cross entropy sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits( logits=sy_logits_na, labels=sy_ac_na) else: # YOUR_CODE_HERE sy_mean = build_mlp(sy_ob_no, ac_dim, "nn", n_layers=n_layers, size=size) # logstd should just be a trainable variable, not a network output. sy_logstd = tf.get_variable('logstd', shape=[1, ac_dim], dtype=tf.float32, initializer=tf.zeros_initializer) sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal( tf.shape(sy_mean)) # Hint: Use the log probability under a multivariate gaussian. sy_z = (sy_ac_na - sy_mean) / tf.exp(sy_logstd) sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_z), axis=1) # sy_logprob_n = - 1/2 * tf.nn.l2_loss(sy_mean - sy_ac_na) #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# # Loss function that we'll differentiate to get the policy gradient. # Negative is to maximize the loss, instead of minimizing loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n) update_op = tf.train.AdamOptimizer(learning_rate, name='AdamPolicy').minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: if not reuse_nn_bl: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) else: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size, reuse_hidden_layers=True, reuse_scope_name="nn")) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. 
# YOUR_CODE_HERE sy_target_bn = tf.placeholder(tf.float32, shape=[None], name='target_bn') loss_bn = tf.nn.l2_loss(sy_target_bn - baseline_prediction) baseline_update_op = tf.train.AdamOptimizer( learning_rate, name='AdamBL').minimize(loss_bn) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() # pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) paths = [] gen_start_time = time.time() if num_threads_gen == 1: # Collect paths until we have enough timesteps timesteps_this_batch = 0 while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) ac = ac[0] acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = { "observation": np.array(obs), "reward": np.array(rewards), "action": np.array(acs) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch else: # Multithread approach using tf coordinator coord = tf.train.Coordinator() workers = [ TrajectionRunner(sess, sy_sampled_ac, sy_ob_no, env_name, max_path_length, min_timesteps_per_batch // num_threads_gen) for _ in range(num_threads_gen) ] for wrk in workers: wrk.start() coord.join(workers) # After here, all workers should be ready, let's collect their data timesteps_this_batch = 0 for wrk in workers: paths.extend(wrk.paths) timesteps_this_batch = wrk.total_timesteps total_timesteps += wrk.total_timesteps gen_total_time = time.time() - gen_start_time # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). 
# # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # # ====================================================================================# # YOUR_CODE_HERE # wrong, every path leads to different rewards! def discount_rewards(rwds, rtg): q = np.zeros_like(rwds) s = 0 for t in reversed(range(rwds.shape[0])): s = s * gamma + rwds[t] q[t] = s if not rtg: q[:] = q[0] return q q_n = np.concatenate( [discount_rewards(path["reward"], reward_to_go) for path in paths]) # ====================================================================================# # ----------SECTION 5---------- # Computing Baselines # ====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}) b_n = rescale(normalize(b_n), q_n.mean(axis=0, keepdims=True), q_n.std(axis=0, keepdims=True)) adv_n = q_n - b_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. # YOUR_CODE_HERE adv_n = normalize(adv_n) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE norm_q_n = normalize(q_n) total_bn_loss = 0 for _ in range(multi_steps_gd): _, bn_loss = sess.run([baseline_update_op, loss_bn], feed_dict={ sy_ob_no: ob_no, sy_target_bn: norm_q_n }) total_bn_loss += bn_loss total_bn_loss /= multi_steps_gd #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. 
# # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE total_loss = 0 for _ in range(multi_steps_gd): _, current_loss = sess.run([update_op, loss], feed_dict={ sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n }) total_loss += current_loss total_loss /= multi_steps_gd # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("GenTime", gen_total_time) logz.log_tabular("Iteration", itr) logz.log_tabular("Loss", total_loss) if nn_baseline: logz.log_tabular("BNLoss", total_bn_loss) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
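# --- Worked example for the discount_rewards helper defined in the loop above ---
# The helper is a closure over gamma, so this standalone copy makes gamma explicit;
# with gamma = 0.5 the reward-to-go of [1, 1, 1] is [1.75, 1.5, 1.0].
import numpy as np

def discount_rewards_standalone(rwds, rtg, gamma):
    """Standalone copy of the nested discount_rewards helper, gamma made explicit."""
    q = np.zeros_like(rwds)
    s = 0.0
    for t in reversed(range(rwds.shape[0])):
        s = s * gamma + rwds[t]
        q[t] = s
    if not rtg:
        q[:] = q[0]
    return q

rewards = np.array([1.0, 1.0, 1.0])
print(discount_rewards_standalone(rewards, rtg=True, gamma=0.5))   # [1.75, 1.5, 1.0]
print(discount_rewards_standalone(rewards, rtg=False, gamma=0.5))  # [1.75, 1.75, 1.75]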
def learn(env, q_func, optimizer_spec, session, exploration=LinearSchedule(1000000, 0.1), stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000, grad_norm_clipping=10): assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete # Log the progress during the trainining start = time.time() logdir = 'pacman_hra_' + time.strftime("%d-%m-%Y_%H-%M-%S") logdir = os.path.join('hra_result', logdir) logz.configure_output_dir(logdir) args = inspect.getargspec(q_func)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) time_name = path.join(logdir, "rha_t.dat") mean_name = path.join(logdir, "rha_mean.dat") best_name = path.join(logdir, "rha_best.dat") if not os.path.exists(logdir): os.makedirs(logdir) times, mean_ep_rewards, best_ep_rewards = [], [], [] img_h, img_w, img_c = env.observation_space.shape input_shape = (img_h, img_w, frame_history_len * img_c) num_actions = env.action_space.n # set up placeholders # placeholder for current observation (or state) obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) # placeholder for current action act_t_ph = tf.placeholder(tf.int32, [None]) # placeholder for current reward rew_food_t_ph = tf.placeholder(tf.float32, [None]) rew_fruit_t_ph = tf.placeholder(tf.float32, [None]) rew_avoid_t_ph = tf.placeholder(tf.float32, [None]) rew_eat_t_ph = tf.placeholder(tf.float32, [None]) # placeholder for next observation (or state) obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) # placeholder for end of episode mask done_mask_ph = tf.placeholder(tf.float32, [None]) # casting to float on GPU ensures lower data transfer times. 
obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 q_val, img_val = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) q_food, q_avoid, q_fruit, q_eat = q_val target_val, target_img_val = q_func(obs_tp1_float, num_actions, scope="target_q_func", reuse=False) target_food, target_avoid, target_fruit, target_eat = target_val q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_func') # q_all = 1/4 * (q_food + q_avoid + q_fruit + q_eat) # action_select = tf.argmax(q_all, 1) q_all = tf.concat([food, avoid, fruit, eat], 1) q_total = aggregator(img_val, q_all, num_actions, scope="q_agg", reuse=False) action_selected = tf.argmax(q_total, 1) # potential problem agg_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_agg') q_act_food_t_val = tf.reduce_sum(q_food * tf.one_hot(act_t_ph, num_actions), axis=1) q_act_avoid_t_val = tf.reduce_sum(q_avoid * tf.one_hot(act_t_ph, num_actions), axis=1) q_act_fruit_t_val = tf.reduce_sum(q_fruit * tf.one_hot(act_t_ph, num_actions), axis=1) q_act_eat_t_val = tf.reduce_sum(q_eat * tf.one_hot(act_t_ph, num_actions), axis=1) y_food_t_val = rew_food_t_ph + (1 - done_mask_ph) * gamma * tf.reduce_max( target_food, axis=1) y_avoid_t_val = rew_avoid_t_ph + ( 1 - done_mask_ph) * gamma * tf.reduce_max(target_avoid, axis=1) y_fruit_t_val = rew_fruit_t_ph + ( 1 - done_mask_ph) * gamma * tf.reduce_max(target_fruit, axis=1) y_eat_t_val = rew_eat_t_ph + (1 - done_mask_ph) * gamma * tf.reduce_max( target_eat, axis=1) food_error = tf.reduce_mean( tf.losses.huber_loss(y_food_t_val, q_act_food_t_val)) avoid_error = tf.reduce_mean( tf.losses.huber_loss(y_avoid_t_val, q_act_avoid_t_val)) fruit_error = tf.reduce_mean( tf.losses.huber_loss(y_fruit_t_val, q_act_fruit_t_val)) eat_error = tf.reduce_mean( tf.losses.huber_loss(y_eat_t_val, q_act_eat_t_val)) q_weight_val = tf.reduce_sum(target_q_total * tf.one_hot(act_t_ph, num_actions), axis=1) q_weight_y = rew_food_t_ph + rew_avoid_t_ph + rew_fruit_t_ph + rew_eat_t_ph q_weight_y += (1 - done_mask_ph) * gamma * tf.reduce_max( target_q_total, axis=1) # - q_weight_val weight_error = tf.reduce_mean( tf.losses.huber_loss(q_weight_y, q_weight_val)) ###### # construct optimization op (with gradient clipping) learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) train_food_fn = minimize_and_clip(optimizer, food_error, var_list=q_func_vars, clip_val=grad_norm_clipping) train_avoid_fn = minimize_and_clip(optimizer, avoid_error, var_list=q_func_vars, clip_val=grad_norm_clipping) train_fruit_fn = minimize_and_clip(optimizer, fruit_error, var_list=q_func_vars, clip_val=grad_norm_clipping) train_eat_fn = minimize_and_clip(optimizer, eat_error, var_list=q_func_vars, clip_val=grad_norm_clipping) train_weight = minimize_and_clip(optimizer, weight_error, var_list=agg_vars, clip_val=grad_norm_clipping) # update_target_fn will be called periodically to copy Q network to target Q network update_target_fn = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_fn.append(var_target.assign(var)) update_target_fn = tf.group(*update_target_fn) # construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### 
model_initialized = False num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 10000 for t in itertools.count(): ### 1. Check stopping criterion if stopping_criterion is not None and stopping_criterion(env, t): break ### 2. Step the env and store the transition idx = replay_buffer.store_frame(last_obs, rha_shape=4) epsilon = exploration.value(t) if not model_initialized or np.random.rand(1) < epsilon: action = env.action_space.sample() else: obs_input = replay_buffer.encode_recent_observation()[None, :] action = session.run(action_selected, feed_dict={obs_tp1_ph: obs_input}) # potential problem obs, reward, done, info = env.step(action) replay_buffer.store_effect(idx, action, reward, done) if done: obs = env.reset() last_obs = obs ### 3. Perform experience replay and train the network. if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): obs_t_batch, act_t_batch, rew_t_batch, obs_tp1_batch, done_mask_batch = replay_buffer.sample( batch_size) rew_food_t_batch = rew_t_batch[:, 0] rew_fruit_t_batch = rew_t_batch[:, 1] rew_avoid_t_batch = rew_t_batch[:, 2] rew_eat_t_batch = rew_t_batch[:, 3] if not model_initialized: initialize_interdependent_variables( session, tf.global_variables(), { obs_t_ph: obs_t_batch, obs_tp1_ph: obs_tp1_batch }) session.run(update_target_fn) model_initialized = True session.run(train_food_fn, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_food_t_ph: rew_food_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch, learning_rate: optimizer_spec.lr_schedule.value(t) }) session.run(train_avoid_fn, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_avoid_t_ph: rew_avoid_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch, learning_rate: optimizer_spec.lr_schedule.value(t) }) session.run(train_fruit_fn, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_fruit_t_ph: rew_fruit_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch, learning_rate: optimizer_spec.lr_schedule.value(t) }) session.run(train_eat_fn, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_eat_t_ph: rew_eat_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch, learning_rate: optimizer_spec.lr_schedule.value(t) }) session.run(train_weight, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_food_t_ph: rew_food_t_batch, rew_avoid_t_ph: rew_avoid_t_batch, rew_fruit_t_ph: rew_fruit_t_batch, rew_eat_t_ph: rew_eat_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch, learning_rate: optimizer_spec.lr_schedule.value(t) }) if num_param_updates % target_update_freq == 0: session.run(update_target_fn) train_food_loss = session.run(food_error, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_food_t_ph: rew_food_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch }) train_avoid_loss = session.run(avoid_error, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_avoid_t_ph: rew_avoid_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch }) train_fruit_loss = session.run(fruit_error, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_fruit_t_ph: rew_fruit_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch }) train_eat_loss = session.run(eat_error, feed_dict={ obs_t_ph: obs_t_batch, act_t_ph: act_t_batch, rew_eat_t_ph: rew_eat_t_batch, obs_tp1_ph: obs_tp1_batch, done_mask_ph: done_mask_batch }) 
train_loss = 0.25 * (train_food_loss + train_avoid_loss + train_fruit_loss + train_eat_loss) # print("Loss at iteration {} is: {}".format(t, train_loss)) print("\n \ Food loss: {}\n \ Avoid loss: {}\n \ Fruit loss: {}\n \ Eat loss: {}".format(train_food_loss, train_avoid_loss, train_fruit_loss, train_eat_loss)) print("Average loss at iteration {} is: {}".format( t, train_loss)) num_param_updates += 1 ##### ### 4. Log progress episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and model_initialized: times.append(t) mean_ep_rewards.append(mean_episode_reward) best_ep_rewards.append(best_mean_episode_reward) joblib.dump(value=times, filename=time_name, compress=3) joblib.dump(value=mean_ep_rewards, filename=mean_name, compress=3) joblib.dump(value=best_ep_rewards, filename=best_name, compress=3) logz.log_tabular("Training Time", time.time() - start) logz.log_tabular("Loss", train_loss) logz.log_tabular("Iteration", t) logz.log_tabular("Mean Reward (/100ep)", mean_episode_reward) logz.log_tabular("Best Mean Reward", best_mean_episode_reward) logz.log_tabular("Episodes", len(episode_rewards)) logz.log_tabular("Exploration", exploration.value(t)) logz.log_tabular("Learning Rate", optimizer_spec.lr_schedule.value(t)) logz.dump_tabular() sys.stdout.flush() return times, mean_ep_rewards, best_ep_rewards
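# The learned `aggregator` network above replaces the simpler fixed combination that the
# commented-out q_all / argmax lines hint at: average the per-head Q-values (food, avoid, fruit,
# eat) and act greedily on the result. A small NumPy sketch of that baseline rule, purely for
# reference (shapes and names here are illustrative, not the trained networks):
import numpy as np

def hra_greedy_action(q_heads):
    """q_heads: list of arrays, each of shape (batch, num_actions), one per reward channel.
    Returns the greedy action under the uniform average of the head values."""
    q_all = np.mean(np.stack(q_heads, axis=0), axis=0)   # (batch, num_actions)
    return np.argmax(q_all, axis=1)

# Example with 4 heads and 5 actions:
rng = np.random.RandomState(0)
heads = [rng.randn(2, 5) for _ in range(4)]
print(hra_greedy_action(heads))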
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, network_activation='tanh' ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds torch.manual_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #activation function for the network if network_activation=='relu': activation=torch.nn.functional.relu elif network_activation=='leaky_relu': activation=torch.nn.functional.leaky_relu else: activation=torch.nn.functional.tanh #todo: create policy actor=build_mlp(ob_dim, ac_dim, "actor",\ n_layers=n_layers, size=size, activation=activation, discrete=discrete) actor_loss=reinforce_loss actor_optimizer=torch.optim.Adam(actor.parameters(), lr=learning_rate) #todo: initilize Agent: #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: critic=build_mlp(ob_dim,1,"nn_baseline",\ n_layers=n_layers,size=size, discrete=discrete) critic_loss=nn.MSELoss() critic_optimizer=torch.optim.Adam(critic.parameters(), lr=learning_rate) #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards, log_probs = [], [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) ob = torch.from_numpy(ob).float().unsqueeze(0) obs.append(ob) ac, log_prob = actor.run(ob) acs.append(ac) log_probs.append(log_prob) #format the action from policy if discrete: ac = int(ac) else: ac = ac.squeeze(0).numpy() ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > 
max_path_length: break path = {"observation" : torch.cat(obs, 0), "reward" : torch.Tensor(rewards), "action" : torch.cat(acs, 0), "log_prob" : torch.cat(log_probs, 0)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch ob_no = torch.cat([path["observation"] for path in paths], 0) ac_na = torch.cat([path["action"] for path in paths], 0) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# q_n = [] for path in paths: rewards = path['reward'] num_steps = pathlength(path) R=[] if reward_to_go: for t in range(num_steps): R.append((torch.pow(gamma, torch.arange(num_steps-t))*rewards[t:]).sum().view(-1,1)) q_n.append(torch.cat(R)) else: q_n.append((torch.pow(gamma, torch.arange(num_steps)) * rewards).sum() * torch.ones(num_steps, 1)) q_n = torch.cat(q_n, 0) #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = critic(ob_no) q_n_std = q_n.std() q_n_mean = q_n.mean() b_n_scaled = b_n * q_n_std + q_n_mean adv_n = (q_n - b_n_scaled).detach() else: adv_n = q_n #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
# YOUR_CODE_HERE adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + np.finfo(np.float32).eps.item()) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE target = (q_n - q_n_mean) / (q_n_std + np.finfo(np.float32).eps.item()) critic_optimizer.zero_grad() c_loss = critic_loss(b_n, target) c_loss.backward() critic_optimizer.step() #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE log_probs = torch.cat([path["log_prob"] for path in paths], 0) actor_optimizer.zero_grad() loss = actor_loss(log_probs, adv_n, len(paths)) print(loss) loss.backward() actor_optimizer.step() # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
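# `reinforce_loss` is referenced above (actor_loss = reinforce_loss) but defined outside this
# excerpt. A minimal PyTorch sketch consistent with the call actor_loss(log_probs, adv_n,
# len(paths)) -- the negative score-function surrogate averaged over trajectories -- could look
# like this; the exact reduction used originally is an assumption.
import torch

def reinforce_loss(log_probs, adv_n, num_paths):
    # Maximize E[ log pi(a|s) * A ], i.e. minimize its negative, averaged per trajectory.
    return -(log_probs.squeeze() * adv_n.squeeze()).sum() / num_paths

# Quick check with fake data:
log_probs = torch.randn(10, requires_grad=True)
adv_n = torch.randn(10)
loss = reinforce_loss(log_probs, adv_n, num_paths=2)
loss.backward()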
def main(): import argparse parser = argparse.ArgumentParser() parser.add_argument('--env_name', type=str, default='HalfCheetah-v1') # Experiment meta-params parser.add_argument('--exp_name', type=str, default='mb_mpc') parser.add_argument('--seed', type=int, default=3) parser.add_argument('--render', action='store_true') # Training args parser.add_argument('--learning_rate', '-lr', type=float, default=1e-3) parser.add_argument('--onpol_iters', '-n', type=int, default=1) parser.add_argument('--dyn_iters', '-nd', type=int, default=60) parser.add_argument('--batch_size', '-b', type=int, default=512) # Data collection parser.add_argument('--random_paths', '-r', type=int, default=10) parser.add_argument('--onpol_paths', '-d', type=int, default=10) parser.add_argument('--ep_len', '-ep', type=int, default=1000) # Neural network architecture args parser.add_argument('--n_layers', '-l', type=int, default=2) parser.add_argument('--size', '-s', type=int, default=500) # MPC Controller parser.add_argument('--simulated_paths', '-sp', type=int, default=1000) parser.add_argument('--mpc_horizon', '-m', type=int, default=15) # Debug parser.add_argument('--quiet', '-q', action='count', default=0) args = parser.parse_args() logging.basicConfig(level=args.quiet * 10) # Set seed np.random.seed(args.seed) tf.set_random_seed(args.seed) # Make data directory if it does not already exist if not (os.path.exists('data')): os.makedirs('data') logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime( "%d-%m-%Y_%H-%M-%S") logdir = os.path.join('data', logdir) if not (os.path.exists(logdir)): os.makedirs(logdir) logz.configure_output_dir(logdir) logz.save_params(vars(args)) # Make env if args.env_name == "HalfCheetah-v1": env = HalfCheetahEnvNew() cost_fn = cheetah_cost_fn train( env=env, cost_fn=cost_fn, render=args.render, learning_rate=args.learning_rate, onpol_iters=args.onpol_iters, dynamics_iters=args.dyn_iters, batch_size=args.batch_size, num_paths_random=args.random_paths, num_paths_onpol=args.onpol_paths, num_simulated_paths=args.simulated_paths, env_horizon=args.ep_len, mpc_horizon=args.mpc_horizon, n_layers=args.n_layers, size=args.size, activation=tf.nn.relu, output_activation=None, )
def train_PG(exp_name='', env_name=' HalfCheetah', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=False, animate=True, logdir=None, normalize_advantages=False, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = HalfCheetahEnvNew() # env = gym.make("RoboschoolHalfCheetah-v1") # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] # Print environment infomation print("Environment name: ", "HalfCheetah") print("Action space is discrete: ", discrete) print("Action space dim: ", ac_dim) print("Observation space dim: ", ob_dim) print("Max_path_length ", max_path_length) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=4) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` data_buffer_ppo = DataBuffer_general(10000, 4) timesteps_per_actorbatch=1000 max_timesteps = 10000000 clip_param=0.2 entcoeff=0.0 optim_epochs=10 optim_stepsize=3e-4 optim_batchsize=64 gamma=0.99 lam=0.95 schedule='linear' callback=None # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5 policy_nn = MlpPolicy_bc(sess=sess, env=env, hid_size=128, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff) # policy_nn = MlpPolicy(sess=sess, env=env, hid_size=64, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff, adam_epsilon=adam_epsilon) tf.global_variables_initializer().run() #pylint: disable=E1101 # Prepare for rollouts # ---------------------------------------- # seg_gen = traj_segment_generator_old(policy_nn, env, timesteps_per_actorbatch) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************"%iters_so_far) data_buffer_ppo.clear() seg = traj_segment_generator(policy_nn, env, timesteps_per_actorbatch) # seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate # d = 
Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not policy_nn.recurrent) for n in range(len(ob)): data_buffer_ppo.add([ob[n], ac[n], atarg[n], tdlamret[n]]) print("data_buffer_ppo", data_buffer_ppo.size) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(policy_nn, "ob_rms"): policy_nn.ob_rms.update(ob) # update running mean/std for policy policy_nn.assign_old_eq_new() # set old parameter values to new parameter values # logger.log("Optimizing...") # logger.log(fmt_row(13, policy_nn.loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for i in range(int(timesteps_per_actorbatch/optim_batchsize)): sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = data_buffer_ppo.sample(optim_batchsize) newlosses = policy_nn.lossandupdate_ppo(sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult, optim_stepsize*cur_lrmult) losses.append(newlosses) # logger.log(fmt_row(13, np.mean(losses, axis=0))) # logger.log("Evaluating losses...") # losses = [] # # for batch in d.iterate_once(optim_batchsize): # sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = data_buffer_ppo.sample(optim_batchsize) # newlosses = policy_nn.compute_losses(sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult) # losses.append(newlosses) # meanlosses,_,_ = mpi_moments(losses, axis=0) # logger.log(fmt_row(13, meanlosses)) # for (lossval, name) in zipsame(meanlosses, policy_nn.loss_names): # logger.record_tabular("loss_"+name, lossval) # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) # logger.record_tabular("EpLenMean", np.mean(lenbuffer)) # logger.record_tabular("EpRewMean", np.mean(rewbuffer)) # logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 # logger.record_tabular("EpisodesSoFar", episodes_so_far) # logger.record_tabular("TimestepsSoFar", timesteps_so_far) # logger.record_tabular("TimeElapsed", time.time() - tstart) # if MPI.COMM_WORLD.Get_rank()==0: # logger.dump_tabular() # Log diagnostics # returns = [path["reward"].sum() for path in paths] # ep_lengths = [pathlength(path) for path in paths] ep_lengths = seg["ep_lens"] returns = seg["ep_rets"] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", iters_so_far) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", timesteps_so_far) logz.dump_tabular() logz.pickle_tf_vars()
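# `add_vtarg_and_adv(seg, gamma, lam)` is called above but not shown in this excerpt. A minimal
# sketch of GAE(lambda) in the style of the OpenAI Baselines implementation, assuming `seg` also
# carries per-step arrays 'rew', 'vpred', 'new' (episode-start flags) and a bootstrap value
# 'nextvpred' (those key names are assumptions):
import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    new = np.append(seg["new"], 0)                    # 1 marks the first step of a new episode
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        adv[t] = lastgaelam
    seg["adv"] = adv
    seg["tdlamret"] = adv + seg["vpred"]              # lambda-return targets for the value function

# Tiny usage example:
seg = dict(rew=np.ones(5, np.float32), vpred=np.zeros(5, np.float32),
           new=np.array([1, 0, 0, 0, 0]), nextvpred=0.0)
add_vtarg_and_adv(seg, gamma=0.99, lam=0.95)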
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, test=False, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, seed=0, # network arguments n_layers=1, size=32): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # ========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None # ========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] print('observation dim: ', ob_dim) print('action dim: ', ac_dim) print('action space: ', discrete) # print("hellooooooo",ac_dim,env.action_space.shape) # ========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. # ========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(dtype=tf.float32, shape=[None], name="adv") # ========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) 
# # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # # ========================================================================================# if discrete: # YOUR_CODE_HERE sy_logits_na = build_mlp(sy_ob_no, ac_dim, scope="build_nn", n_layers=n_layers, size=size, activation=tf.nn.relu) sy_sampled_ac = tf.one_hot(tf.squeeze(tf.multinomial(sy_logits_na, 1)), ac_dim) # Hint: Use the tf.multinomial op # batch_size x ac_dim sy_logprob_n = tf.nn.softmax_cross_entropy_with_logits_v2( labels=sy_ac_na, logits=sy_logits_na) # batch_size ---> log probability for each action # Learned from https://github.com/InnerPeace-Wu/ # # Another way to do it # N = tf.shape(sy_ob_no)[0] # sy_prob_na = tf.nn.softmax(sy_logits_na) # sy_logprob_n = tf.log(tf.gather_nd(sy_prob_na, tf.stack((tf.range(N), sy_ac_na), axis=1))) else: # YOUR_CODE_HERE sy_mean = build_mlp(sy_ob_no, ac_dim, scope="build_nn", n_layers=n_layers, size=size, activation=tf.nn.relu) sy_logstd = tf.Variable(tf.zeros(ac_dim), name='logstd', dtype=tf.float32) sy_std = tf.exp(sy_logstd) sy_sampled_ac = sy_mean + tf.multiply( sy_std, tf.random_normal(tf.shape(sy_mean))) sy_z = (sy_ac_na - sy_mean) / sy_std sy_logprob_n = 0.5 * tf.reduce_sum(tf.square(sy_z), axis=1) # sy_logprob_n = 0.5*tf.reduce_sum(tf.squared_difference(tf.div(sy_mean,sy_std), # tf.div(sy_ac_na,sy_std))) # Hint: Use the log probability under a multivariate gaussian. # ========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation # ========================================================================================# # loss = tf.reduce_sum(tf.multiply(tf.nn.softmax_cross_entropy_with_logits_v2(labels=sy_ac_na,logits=sy_logits_na),sy_adv_n)) # Loss function that we'll differentiate to get the policy gradient. 
loss = tf.reduce_sum(tf.multiply(sy_logprob_n, sy_adv_n)) actor_update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) actor_params = tf.trainable_variables() # ========================================================================================# # critic graph # Loss and training operations # ========================================================================================# predict_value = critic(sy_ob_no) sy_target_value = tf.placeholder(dtype=tf.float32, shape=[None], name="target_value") predict_value = tf.squeeze(predict_value) rms_loss = tf.reduce_mean( tf.squared_difference(predict_value, sy_target_value)) critic_update_op = tf.train.AdamOptimizer(learning_rate).minimize(rms_loss) critic_params = tf.trainable_variables()[len(actor_params):] # ========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization # ========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` actor_saver = tf.train.Saver(actor_params, max_to_keep=1) critic_saver = tf.train.Saver(critic_params, max_to_keep=1) checkpoint_actor_dir = os.path.join(os.curdir, 'Actor_GAE_0.7' + str(env_name)) if not os.path.exists(checkpoint_actor_dir): os.makedirs(checkpoint_actor_dir) actor_prefix = os.path.join(checkpoint_actor_dir, "model.ckpt") ckpt_1 = tf.train.get_checkpoint_state(checkpoint_actor_dir) checkpoint_critic_dir = os.path.join(os.curdir, 'Critic_GAE_0.7' + str(env_name)) if not os.path.exists(checkpoint_critic_dir): os.makedirs(checkpoint_critic_dir) critic_prefix = os.path.join(checkpoint_critic_dir, "model.ckpt") ckpt_2 = tf.train.get_checkpoint_state(checkpoint_critic_dir) if ckpt_1 and tf.train.checkpoint_exists(ckpt_1.model_checkpoint_path): print("Reading actor parameters from %s" % ckpt_1.model_checkpoint_path) actor_saver.restore(sess, ckpt_1.model_checkpoint_path) if ckpt_2 and tf.train.checkpoint_exists(ckpt_2.model_checkpoint_path): print("Reading critic parameters from %s" % ckpt_2.model_checkpoint_path) critic_saver.restore(sess, ckpt_2.model_checkpoint_path) uninitialized_vars = [] for var in tf.global_variables(): try: sess.run(var) except tf.errors.FailedPreconditionError: uninitialized_vars.append(var) if len(uninitialized_vars) > 0: init_new_vars_op = tf.variables_initializer(uninitialized_vars) sess.run(init_new_vars_op) def testing(): print('testing..') ob = env.reset() steps = 0 total_r = 0 while True: one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) if discrete: ac = int(np.argmax(one_hot_ac)) else: ac = one_hot_ac ob, rew, done, _ = env.step(ac) env.render() total_r += rew steps += 1 if steps > max_path_length: break print(steps, total_r) return steps, total_r # ========================================================================================# # Training Loop # ========================================================================================# if test: testing() return total_timesteps = 0 best_steps, best_rew = testing() # best_rew = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] next_obs = [] animate_this_episode = (len(paths) == 0 and (itr % 30 == 0) and animate) steps = 0 
while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) if discrete: ac = int(np.argmax(one_hot_ac)) else: ac = one_hot_ac # print("helloooo",ac) acs.append(one_hot_ac) next_ob, rew, done, _ = env.step( ac ) # transition dynamics P(s_t+1/s_t,a_t), r(s_t+1/s_t,a_t) next_obs.append(next_ob) ob = next_ob rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = { "observation": np.array(obs), "reward": np.array(rewards), "action": np.array(acs), "next_observation": np.array(next_obs) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) next_ob_no = np.concatenate( [path["next_observation"] for path in paths]) rew_no = np.concatenate([path["reward"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) ac_na = ac_na.reshape([-1, ac_dim]) print("helloooo", ac_na.shape) # ======================== Finding target values ===================================# # target = r(s,a) + gamma* V(s') - V(s) # This estimate has less variance but is biased. Alternatively # we can go for n-step returns or GAE(Generalised Advantage Estimation) # ==================================================================================# next_values = sess.run(predict_value, feed_dict={sy_ob_no: next_ob_no}) target_values = rew_no + gamma * next_values # fit critic with target r(s,a) + gamma*V(s') print('updating the critic params..') sess.run(critic_update_op, feed_dict={ sy_ob_no: ob_no, sy_target_value: target_values }) current_values = sess.run(predict_value, feed_dict={sy_ob_no: ob_no}) next_values = sess.run(predict_value, feed_dict={sy_ob_no: next_ob_no}) adv_n = rew_no + gamma * next_values - current_values # ====================== Generalized Advatage Estimation =========================== # # A(s_t, a_t) = sum_{t'=t}^{t'=inf} (gamma*lambda)^{t'-t} delta_{t'}, where # delta_{t} = r(s_t, a_t) + gamma*V(s_{t+1}) - V(s_t) # ================================================================================== # q_n = list() GAE = True if GAE: ind = 0 lam = 0.7 for path in paths: pLen = pathlength(path) q_p = np.zeros(pLen) q_p[pLen - 1] = adv_n[ind + pLen - 1] for t in reversed(range(pLen - 1)): q_p[t] = adv_n[ind + t] + (gamma * lam) * q_p[t + 1] q_p = np.array(q_p) q_n.append(q_p) ind += pLen # =========================== n-step returns =========================================# # Consider only the n-step returns instead of until the end of episode. 
# Variance reduction technique # adv(s_t) = sum_{t'=t}^(t+n) gamma^{t'-t}*r(t') + gamma^{n} V(s_{t+n}) - V(s_t) # ====================================================================================# n_step_returns = False if n_step_returns: n = 100 value_paths = [] for path in paths: ob = path['observation'] pLen = pathlength(path) values = sess.run(predict_value, feed_dict={sy_ob_no: ob}) x = {} x['value'] = values value_paths.append(x) for ind, path in enumerate(paths): pLen = pathlength(path) q_p = np.zeros(pLen) rew = path['reward'] values = value_paths[ind]['value'] for i in range(pLen): start = i end = min(start + n, pLen - 1) for j, r in enumerate(rew[start:end]): q_p[i] += pow(gamma, j) * r q_p[i] += pow(gamma, n) * values[end] q_p[i] -= values[start] q_p = np.array(q_p) q_n.append(q_p) q_n = np.concatenate(q_n) adv_n = q_n.copy() # ====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update # ====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] if np.mean(returns) > best_rew: best_rew = np.mean(returns) print('saving actor to ', actor_prefix) actor_saver.save(sess, actor_prefix) print('saving critic to ', critic_prefix) critic_saver.save(sess, critic_prefix) sess.run(actor_update_op, feed_dict={ sy_ac_na: ac_na, sy_ob_no: ob_no, sy_adv_n: adv_n }) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
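# The `critic` used in the graph above (predict_value = critic(sy_ob_no)) is defined outside this
# excerpt; it is the state-value network of the actor-critic train_PG above, not the DDPG critic
# class that follows. A plausible stand-in, assuming a small MLP value function analogous to the
# build_mlp policy networks (architecture and scope name are assumptions):
import tensorflow as tf

def critic(sy_ob_no, n_layers=2, size=64, scope="critic"):
    # Hypothetical value network: maps a batch of observations to scalar V(s) estimates.
    out = sy_ob_no
    with tf.variable_scope(scope):
        for _ in range(n_layers):
            out = tf.layers.dense(out, size, activation=tf.nn.relu)
        return tf.layers.dense(out, 1, activation=None)   # shape [None, 1]; the caller squeezes it

# e.g. ob_ph = tf.placeholder(tf.float32, [None, ob_dim]); v = tf.squeeze(critic(ob_ph), axis=1)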
def train(sess, env, args, actor, critic, actor_noise, logdir): logz.configure_output_dir(logdir) locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} print('params: ', params) params['env'] = 'InvertedPendulum' params['exp_name'] = '3layer' logz.save_params(params) # Set up summary Ops summary_ops, summary_vars = build_summaries() checkpoint_actor_dir = os.path.join(os.curdir, 'Actor_InvertedPendulum') if not os.path.exists(checkpoint_actor_dir): os.makedirs(checkpoint_actor_dir) actor_prefix = os.path.join(checkpoint_actor_dir, "model.ckpt") ckpt_1 = tf.train.get_checkpoint_state(checkpoint_actor_dir) checkpoint_critic_dir = os.path.join(os.curdir, 'Critic_InvertedPendulum') if not os.path.exists(checkpoint_critic_dir): os.makedirs(checkpoint_critic_dir) critic_prefix = os.path.join(checkpoint_critic_dir, "model.ckpt") ckpt_2 = tf.train.get_checkpoint_state(checkpoint_critic_dir) if ckpt_1 and tf.train.checkpoint_exists(ckpt_1.model_checkpoint_path): print("Reading actor parameters from %s" % ckpt_1.model_checkpoint_path) actor.saver.restore(sess, ckpt_1.model_checkpoint_path) if ckpt_2 and tf.train.checkpoint_exists(ckpt_2.model_checkpoint_path): print("Reading critic parameters from %s" % ckpt_2.model_checkpoint_path) critic.saver.restore(sess, ckpt_2.model_checkpoint_path) uninitialized_vars = [] for var in tf.all_variables(): try: sess.run(var) except tf.errors.FailedPreconditionError: uninitialized_vars.append(var) if len(uninitialized_vars) > 0: init_new_vars_op = tf.variables_initializer(uninitialized_vars) sess.run(init_new_vars_op) writer = tf.summary.FileWriter(args['summary_dir'], sess.graph) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed'])) # Needed to enable BatchNorm. # This hurts the performance on Pendulum but could be useful # in other environments. # tflearn.is_training(True) def testing(): env1 = gym.make(args['env']) s = env1.reset() done = False total_reward = 0 max_steps = env1.spec.timestep_limit step = 0 while not done: a = actor.predict(np.reshape(s, (1, actor.s_dim))) s2, r, done, _ = env1.step(a[0]) total_reward += r step += 1 s = s2 # env.render() if step > max_steps: break print('total steps: ', step) print('total reward: ', total_reward) return step, total_reward iter = 0 start = time.time() best_step, best_rew = testing() for i in range(int(args['max_episodes'])): s = env.reset() ep_reward = 0 ep_ave_max_q = 0 for j in range(int(args['max_episode_len'])): if args['render_env']: env.render() # Added exploration noise # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. 
+ i)) num = np.random.uniform() a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise() s2, r, terminal, info = env.step(a[0]) replay_buffer.add(np.reshape(s, (actor.s_dim, )), np.reshape(a, (actor.a_dim, )), r, terminal, np.reshape(s2, (actor.s_dim, ))) # Keep adding experience to the memory until # there are at least minibatch size samples batch_size = int(args['minibatch_size']) if replay_buffer.size() > 100000: iter += 1 s_batch, a_batch, r_batch, t_batch, s2_batch = \ replay_buffer.sample_batch(batch_size) # Calculate targets target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in range(int(args['minibatch_size'])): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + critic.gamma * target_q[k]) # Update the critic given the targets # critic will be trained to minimise the mean square error of the predicted Q value # and the target value. predicted_q_value, _ = critic.train( s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1))) ep_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient # gradients of the critic Q value according to the action valu --> action gradients a_outs = actor.predict(s_batch) grads = critic.action_gradients(s_batch, a_outs) # del_a Q(s,a) actor.train( s_batch, grads[0] ) # del_a Q(s,a) * del_theta Mu_theta(s) ---> actor gradients # directly apply these gradients on actor params. No special loss to minimize if iter % 20 == 0: new_steps, new_rew = testing() if new_rew > best_rew: best_rew = new_rew actor.saver.save(sess, actor_prefix) critic.saver.save(sess, critic_prefix) print('model saved to disk.') actor.saver.restore(sess, ckpt_1.model_checkpoint_path) critic.saver.restore(sess, ckpt_2.model_checkpoint_path) best_step, best_rew = testing() # print('actor model saved to: ', actor_prefix) # print('critic model saved to: ', critic_prefix) if iter % 10 == 0: new_steps, new_rew = testing() logz.log_tabular("Time", time.time() - start) logz.log_tabular('Iteration', iter / 10) logz.log_tabular('Reward', new_rew) logz.log_tabular('Steps', new_steps) logz.dump_tabular() # Update target networks if iter % 50 == 0: replay_buffer.update() print('updating buffer') print('updating target networks..') actor.update_target_network() critic.update_target_network() s = s2 ep_reward += r if terminal: summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: ep_reward, summary_vars[1]: ep_ave_max_q / float(j) }) writer.add_summary(summary_str, i) writer.flush() print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \ i, (ep_ave_max_q / float(j)))) break
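# The actor update in the loop above feeds the critic's action gradients (del_a Q(s,a)) into
# actor.train, which applies the deterministic policy gradient del_a Q(s,a) * del_theta mu_theta(s).
# A self-contained TF1 sketch of that wiring, with illustrative shapes and names (the real
# ActorNetwork/CriticNetwork classes are not shown in this excerpt):
import numpy as np
import tensorflow as tf

s_dim, a_dim, batch_size, actor_lr = 3, 1, 64, 1e-4

obs_ph = tf.placeholder(tf.float32, [None, s_dim])
with tf.variable_scope("actor"):
    hidden = tf.layers.dense(obs_ph, 32, activation=tf.nn.relu)
    actor_out = tf.layers.dense(hidden, a_dim, activation=tf.nn.tanh)

# dQ/da comes from the critic (critic.action_gradients in the code above) and is fed in here.
action_grad_ph = tf.placeholder(tf.float32, [None, a_dim])
actor_params = tf.trainable_variables(scope="actor")
# Chain dQ/da through the actor: dQ/dtheta = dQ/da * da/dtheta; negate so Adam ascends Q.
grads = tf.gradients(actor_out, actor_params, grad_ys=-action_grad_ph)
grads = [g / float(batch_size) for g in grads]
actor_train_op = tf.train.AdamOptimizer(actor_lr).apply_gradients(zip(grads, actor_params))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(actor_train_op, feed_dict={obs_ph: np.random.randn(batch_size, s_dim),
                                        action_grad_ph: np.random.randn(batch_size, a_dim)})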
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} print(params) # the three lines below are to override the functions passed in, which aren't serializable params["activation"] = "relu" params["cost_fn"] = "cheetah_cost_fn" params["env"] = "HalfCheetahEnvNew" logz.save_params(params) returns_file = "returns.csv" returns_array = [] #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) """ YOUR CODE HERE """ data = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ normalization = compute_normalization(data) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then # taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in # https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): """ YOUR CODE HERE """ print(itr) # learn/fit dynamics model using the Adam optimization algorithm l = dyn_model.fit(data) print(l) # sample a set of on-policy trajectories from the environment new_data = sample(env, mpc_controller, num_paths=num_paths_onpol, horizon=env_horizon, render=render, verbose=False) # append transition to dataset data += new_data # compute costs costs = np.array([path_cost(cost_fn, path) for path in new_data]) print(costs) # compute returns returns = np.array( [new_data[i]["returns"] for i in range(len(new_data))]) print(returns) returns_array.append(returns) np.array(returns_array).dump(returns_file) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
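# `compute_normalization(data)` above returns the statistics used to normalize dynamics-model
# inputs and denormalize its outputs, as described in the comments. A minimal sketch, assuming
# each path in `data` is a dict with 'observations' and 'actions' arrays (the key names and the
# returned dict layout are assumptions):
import numpy as np

def compute_normalization(data, eps=1e-8):
    obs = np.concatenate([path["observations"][:-1] for path in data])
    next_obs = np.concatenate([path["observations"][1:] for path in data])
    acs = np.concatenate([path["actions"][:-1] for path in data])
    deltas = next_obs - obs                          # the dynamics model predicts s_{t+1} - s_t
    return dict(
        mean_obs=obs.mean(axis=0),       std_obs=obs.std(axis=0) + eps,
        mean_acs=acs.mean(axis=0),       std_acs=acs.std(axis=0) + eps,
        mean_deltas=deltas.mean(axis=0), std_deltas=deltas.std(axis=0) + eps,
    )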
def train_PG( exp_name='', env_name='ProstheticsEnv', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, test=False): start = time.time() logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} params['env_name'] = 'Prosthetic_3D' print('params: ', params) logz.save_params(params) args = inspect.getargspec(train_PG)[0] # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = env_name # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.timestep_limit # ========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None # ========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] print('observation dim: ', ob_dim) print('action dim: ', ac_dim) print('action space: ', discrete) # print("hellooooooo",ac_dim,env.action_space.shape) # ========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. # ========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(dtype=tf.float32, shape=[None], name="adv") # ========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) 
# # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # # ========================================================================================# if discrete: # YOUR_CODE_HERE sy_logits_na = build_mlp(env.action_space.high, sy_ob_no, ac_dim, scope="build_nn", n_layers=n_layers, size=size, activation=tf.nn.relu) sy_sampled_ac = tf.one_hot(tf.squeeze(tf.multinomial(sy_logits_na, 1)), ac_dim) # Hint: Use the tf.multinomial op # batch_size x ac_dim sy_logprob_n = tf.nn.softmax_cross_entropy_with_logits( labels=sy_ac_na, logits=sy_logits_na) # batch_size ---> log probability for each action # Learned from https://github.com/InnerPeace-Wu/ # # Another way to do it # N = tf.shape(sy_ob_no)[0] # sy_prob_na = tf.nn.softmax(sy_logits_na) # sy_logprob_n = tf.log(tf.gather_nd(sy_prob_na, tf.stack((tf.range(N), sy_ac_na), axis=1))) else: # YOUR_CODE_HERE sy_mean = build_mlp(env.action_space.high, sy_ob_no, ac_dim, scope="build_nn", n_layers=n_layers, size=size, activation=tf.nn.relu) sy_logstd = tf.Variable(tf.zeros(ac_dim), name='logstd', dtype=tf.float32) sy_std = tf.exp(sy_logstd) sy_sampled_ac = sy_mean + tf.multiply( sy_std, tf.random_normal(tf.shape(sy_mean))) sy_z = (sy_ac_na - sy_mean) / sy_std sy_logprob_n = 0.5 * tf.reduce_sum(tf.square(sy_z), axis=1) # sy_logprob_n = 0.5*tf.reduce_sum(tf.squared_difference(tf.div(sy_mean,sy_std), # tf.div(sy_ac_na,sy_std))) # Hint: Use the log probability under a multivariate gaussian. # ========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation # ========================================================================================# # loss = tf.reduce_sum(tf.multiply(tf.nn.softmax_cross_entropy_with_logits_v2(labels=sy_ac_na,logits=sy_logits_na),sy_adv_n)) # Loss function that we'll differentiate to get the policy gradient. loss = tf.reduce_sum(tf.multiply(sy_logprob_n, sy_adv_n)) update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) # ========================================================================================# # ----------SECTION 5---------- # Optional Baseline - Defining Second Graph # ========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze( build_mlp(1, sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. 
# YOUR_CODE_HERE sy_rew_n = tf.placeholder(shape=[None], name="rew", dtype=tf.int32) loss2 = tf.losses.mean_squared_error(labels=sy_rew_n, predictions=baseline_prediction) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize( loss2) # ========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization # ========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` # pylint: disable=E1101 network_params = tf.trainable_variables() saver = tf.train.Saver(network_params, max_to_keep=1) checkpoint_actor_dir = os.path.join(os.curdir, 'PG_MODEL_CONT_TANH') if not os.path.exists(checkpoint_actor_dir): os.makedirs(checkpoint_actor_dir) model_prefix = os.path.join(checkpoint_actor_dir, "model.ckpt") ckpt_1 = tf.train.get_checkpoint_state(checkpoint_actor_dir) if ckpt_1 and tf.train.checkpoint_exists(ckpt_1.model_checkpoint_path): print("Reading actor parameters from %s" % ckpt_1.model_checkpoint_path) saver.restore(sess, ckpt_1.model_checkpoint_path) uninitialized_vars = [] for var in tf.global_variables(): try: sess.run(var) except tf.errors.FailedPreconditionError: uninitialized_vars.append(var) if len(uninitialized_vars) > 0: init_new_vars_op = tf.variables_initializer(uninitialized_vars) sess.run(init_new_vars_op) # ========================================================================================# # Training Loop # ========================================================================================# total_timesteps = 0 t = 0 def testing(): print('testing the model..') ob = env.reset() steps = 0 done = False total_r = 0 one_hot_ac = env.action_space.sample() while not done: k = np.reshape(np.array(ob), newshape=(-1, len(ob))) # print('sampling an action...') if steps % 1 == 0: one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k}) ac = np.reshape(one_hot_ac, newshape=(one_hot_ac.shape[1])) # print('getting observations from env ..') # ac = np.clip(ac, -1.0, 1.0) ob, rew, done, _ = env.step(ac) total_r += rew env.render() steps += 1 if steps > max_path_length: break print('steps, rew', steps, total_r) return steps, total_r test = False if test: steps, rew = testing() return exp = False if exp: print('generating exp data..') import pickle as pkl paths = [] timesteps_this_batch = 0 while True: ob = env.reset() obs, acs = [], [] total_r = 0 while True: obs.append(ob) k = np.reshape(np.array(ob), newshape=(-1, len(ob))) one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k}) ac = np.reshape(one_hot_ac, newshape=(one_hot_ac.shape[1])) ac = np.clip(ac, 0.0, 1.0) acs.append(ac) ob, rew, done, _ = env.step(ac) total_r += rew if done: done = False break path = { "observation": np.array(obs[:-15]), "action": np.array(acs[:-15]) } if total_r > 50: timesteps_this_batch += len(path['action']) timesteps_this_batch -= 15 paths.append(path) print(timesteps_this_batch, total_r) if timesteps_this_batch > 1000: break ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) pkl.dump(ob_no, open('./simulation_0_1/obs_pg.p', 'wb')) pkl.dump(ac_na, open('./simulation_0_1/acts_pg.p', 'wb')) return _, best_rew = testing() for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough 
timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode = (len(paths) == 0 and (itr % 30 == 0) and animate) steps = 0 total_r = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) k = np.reshape(np.array(ob), newshape=(-1, len(ob))) # print(k.shape) # print('sampling an action...') one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k}) if discrete: ac = int(np.argmax(one_hot_ac)) else: ac = one_hot_ac acs.append(one_hot_ac) max_action = env.action_space.high ac = np.reshape(ac, newshape=(ac.shape[1])) # print('getting observations from env ..') ob, rew, done, _ = env.step( ac ) # transition dynamics P(s_t+1/s_t,a_t), r(s_t+1/s_t,a_t) total_r += rew rew = rew * 4 rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = { "observation": np.array(obs), "reward": np.array(rewards), "action": np.array(acs) } if total_r > 0: paths.append(path) timesteps_this_batch += pathlength(path) print(total_r) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) ac_na = ac_na.reshape([-1, ac_dim]) import pickle as pkl # pkl.dump(ob_no, open('./simulation_data/obs_'+str(itr)+'.p', 'wb')) # pkl.dump(ac_na, open('./simulation_data/act_'+str(itr)+'.p', 'wb')) print("hello..", ac_na.shape) # ====================================================================================# # ----------..---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. 
# # ====================================================================================# # DYNAMIC PROGRAMMING if reward_to_go: q_n = list() for path in paths: pLen = pathlength(path) q_p = np.zeros(pLen) q_p[pLen - 1] = path['reward'][pLen - 1] for t in reversed(range(pLen - 1)): q_p[t] = path['reward'][t] + gamma * q_p[t + 1] q_p = np.array(q_p) q_n.append(q_p) else: q_n = list() for path in paths: pLen = pathlength(path) q_p = 0 for t in range(pLen): q_p = q_p + (gamma**t) * (path['reward'][t]) q_n.append(q_p * np.ones(pLen)) q_n = np.concatenate(q_n) # print(q_n.shape) # ====================================================================================# # ----------SECTION 5---------- # Computing Baselines # ====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}) b_n = normalize(b_n, np.mean(q_n), np.std(q_n)) adv_n = q_n - b_n else: adv_n = q_n.copy() # ====================================================================================# # ----------SECTION 4---------- # Advantage Normalization # ====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. # YOUR_CODE_HERE adv_n = normalize(adv_n) # ====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline # ====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE sess.run(baseline_update_op, feed_dict={ sy_ob_no: ob_no, sy_rew_n: q_n }) # ====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update # ====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. 
t += 1 for i in range(1): print('updating model params..') sess.run(update_op, feed_dict={ sy_ac_na: ac_na, sy_ob_no: ob_no, sy_adv_n: adv_n }) _, new_r = testing() if new_r > best_rew: print('saving model params to, ', model_prefix) best_rew = new_r saver.save(sess, model_prefix) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
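# A minimal NumPy sketch of the reward-to-go computation used in the iteration above:
# the backward recursion q[t] = r[t] + gamma * q[t+1] equals the direct sum
# q[t] = sum_{t' >= t} gamma**(t'-t) * r[t']. Names here are illustrative only.
import numpy as np

def reward_to_go(rewards, gamma):
    q = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q

# Example: reward_to_go([1.0, 1.0, 1.0], 0.5) -> array([1.75, 1.5, 1.0])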
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=0.99, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=2e-2, reward_to_go=True, animate=False, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, activation='Tanh', #baseline_network arguments bl_learning_rate=1e-3, bl_n_layers=1, bl_size=32, bl_activation='Tanh', bl_n_iter=1): start = time.time() device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed) torch.backends.cudnn.deterministic = True torch.manual_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) env.seed(seed) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] ''' Do not need in PyTorch #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = TODO ''' #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. 
For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# def sampling(ob, sy_logstd): sy_logit = mlp(ob) if discrete: # YOUR_CODE_HERE sy_probs = F.softmax(sy_logit) sy_sampled_ac = torch.multinomial(sy_probs, 1) else: # YOUR_CODE_HERE sy_std = torch.exp(sy_logstd) z = torch.normal(torch.zeros(sy_logit.size())).to(device) sy_sampled_ac = sy_logit + z * sy_std return sy_sampled_ac '''Loss is defined in last section : "Performing the Policy Update" #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# loss = TODO # Loss function that we'll differentiate to get the policy gradient. update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) ''' #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = build_mlp(ob_dim, 1, bl_n_layers, bl_size, bl_activation).to(device) bl_optimizer = Adam(baseline_prediction.parameters(), lr=bl_learning_rate) #========================================================================================# # Training Loop #========================================================================================# mlp = build_mlp(ob_dim, ac_dim, n_layers, size, activation).to(device) sy_logstd = nn.Parameter(torch.zeros(1, ac_dim).to(device)) optimizer = Adam(list(mlp.parameters()) + [sy_logstd], lr=learning_rate) total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sampling(torch.FloatTensor(ob).to(device), sy_logstd) ac = ac.cpu().detach().numpy()[0] acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = { "observation": np.array(obs), "reward": np.array(rewards), "action": np.array(acs) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) returns = [path["reward"].sum() for path in paths] average_returns = (np.mean(returns)) print("average_rewards : ", average_returns) print("\n") if average_returns > 
env.spec.reward_threshold: print("task solved") #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# # YOUR_CODE_HERE q_n = [] if reward_to_go: for path in paths: qs = [] q = 0 for reward in reversed(path["reward"]): q = reward + q * gamma qs.append(q) q_n = q_n + qs[::-1] else: for path in paths: discounted_reward = [ path["reward"][i] * (gamma**i) for i in range(pathlength(path)) ] q_n = q_n + [np.sum(discounted_reward)] * pathlength(path) q_n = torch.FloatTensor(q_n).to(device) #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = baseline_prediction( Variable(torch.FloatTensor(ob_no)).to(device)).squeeze(1) b_n = torch.mean(q_n) + ( (b_n - torch.mean(b_n)) / torch.std(b_n)) * torch.std(q_n) adv_n = q_n - b_n else: adv_n = q_n.clone() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
adv_n = (adv_n - torch.mean(adv_n)) / torch.std(adv_n) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) normalize_q_n = (q_n - torch.mean(q_n)) / torch.std(q_n) for i in range(bl_n_iter): b_n = baseline_prediction( Variable(torch.FloatTensor(ob_no)).to(device)).squeeze(1) bl_loss = F.mse_loss(b_n, normalize_q_n) bl_optimizer.zero_grad() bl_loss.backward() bl_optimizer.step() #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. sy_logit = mlp(Variable(torch.FloatTensor(ob_no)).to(device)) if discrete: sy_logprob_n = -F.cross_entropy( sy_logit, torch.LongTensor(ac_na).to(device), reduce=False) else: sy_std = torch.exp(sy_logstd) sy_logprob_n = -0.5 * torch.sum( (((sy_logit - torch.FloatTensor(ac_na).to(device)) / sy_std)** 2), dim=1 ) # Hint: Use the log probability under a multivariate gaussian. weighted_negative_likelihoods = sy_logprob_n * adv_n loss = -torch.mean(weighted_negative_likelihoods) optimizer.zero_grad() loss.backward() optimizer.step() # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) if itr == 0: logz.G.first_row = True logz.dump_tabular()
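# For comparison with the hand-rolled quadratic log-likelihood term used above, a
# hedged PyTorch sketch of the full diagonal-Gaussian log-probability via
# torch.distributions; the names below are illustrative, not part of this file.
import torch
from torch.distributions import Normal

def gaussian_logprob(mean, logstd, actions):
    # mean: (N, ac_dim), logstd: (1, ac_dim) broadcast over the batch, actions: (N, ac_dim)
    dist = Normal(mean, torch.exp(logstd))
    return dist.log_prob(actions).sum(dim=1)  # shape (N,)

# The expression in the code keeps only -0.5 * ((a - mu) / sigma)**2 summed over the
# action dimensions; the -log(sigma) term it drops matters once sigma is learned.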
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, bootstrap=False): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] print ob_dim, ac_dim #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(shape=[None], name="advantage", dtype=tf.float32) #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. 
Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: # YOUR_CODE_HERE sy_logits_na = build_mlp(sy_ob_no, ac_dim, "discrete_mlp", n_layers=n_layers, size=size, activation=tf.nn.relu, output_activation=None) # print sy_logits_na sy_logprob_na = tf.nn.log_softmax(sy_logits_na) sy_sampled_ac = tf.multinomial(sy_logprob_na, 1) # Hint: Use the tf.multinomial op # print sy_sampled_ac batch_n = tf.shape(sy_ob_no)[0] act_index = tf.stack([tf.range(0, batch_n), sy_ac_na], axis=1) # sy_sampled_ac = tf.gather_nd(sy_sampled_ac,tf.range(0,batch_n)) # sy_sampled_ac = sy_sampled_ac[0] sy_logprob_n = tf.gather_nd(sy_logprob_na, act_index) else: # YOUR_CODE_HERE sy_mean = build_mlp(sy_ob_no, ac_dim, "continuous_mlp", n_layers=2, size=32, activation=tf.nn.relu, output_activation=None) sy_logstd = tf.Variable( tf.ones(batch_n), name="std" ) # logstd should just be a trainable variable, not a network output. sy_sampled_ac = sy_mean + sy_logstd * tf.random_normal( tf.shape(sy_mean)) sy_logprob_n = normal_log_prob( sy_ac_na, sy_mean, sy_log_std, ac_dim ) # Hint: Use the log probability under a multivariate gaussian. #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# loss = -tf.reduce_mean( sy_logprob_n * sy_adv_n ) # Loss function that we'll differentiate to get the policy gradient. update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=1, size=32, activation=tf.nn.relu, output_activation=None)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. 
# YOUR_CODE_HERE v_t = tf.placeholder("float", [None]) l_2 = 0.5 * tf.nn.l2_loss(v_t - baseline_prediction) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize( l_2) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards, obs_2 = [], [], [], [] animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) pi = sess.run(sy_logits_na, feed_dict={sy_ob_no: ob[None]}) # print pi ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) # print ac ac = ac[0][0] # print ac acs.append(ac) # print ac ob, rew, done, _ = env.step(ac) obs_2.append(ob) rewards.append(rew) steps += 1 if done or steps > max_path_length: terminated = done break path = { "observation": np.array(obs), "reward": np.array(rewards), "action": np.array(acs), "obs_next": np.array(obs_2) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) ob_next_no = np.concatenate([path["obs_next"] for path in paths]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. 
Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# # q_n = np.zero(q_n.shape) # YOUR_CODE_HERE if reward_to_go: q_n = [] # for path in paths.reverse(): # q_t = 0 # r_path = path["reward"].reverse() # path_len = pathlength(r_path) # for r in enumerate(r_path): # q_t = r + gamma*q_t # q_n[i] = q_t # i += 1 # q_n.reverse() if not bootstrap: for path in paths: rew_t = path["reward"] return_t = discount(rew_t, gamma) q_n.append(return_t) else: for path in paths: v_nxt = sess.run(baseline_prediction, feed_dict={sy_ob_no: path["obs_next"]}) q_target = v_nxt + path["reward"] q_n.append(q_target) q_n = np.concatenate(q_n) else: i = 0 q_n = np.concatenate([path["reward"] for path in paths]) for path in paths: q_t = 0 for idx, r in enumerate(path["reward"]): q_t += gamma**idx * r q_n[i] = q_t i += 1 #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}) adv_n = q_n - b_n else: adv_n = q_n.copy() # print q_n #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. # YOUR_CODE_HERE normal_adv = tf.nn.l2_normalize(sy_adv_n, 0, epsilon=1e-8, name="adv_normal") sess.run(normal_adv, feed_dict={sy_adv_n: adv_n}) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) 
# YOUR_CODE_HERE v_target = [] for path in paths: rew_t = path["reward"] return_t = discount(rew_t, gamma) v_target.append(return_t) v_target = np.concatenate(v_target) print v_target.shape for _ in range(40): sess.run(baseline_update_op, feed_dict={ sy_ob_no: ob_no, v_t: v_target }) #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE sess.run(update_op, feed_dict={ sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n }) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
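# The loop above calls a 'discount' helper that is not defined in this file. A common
# implementation (an assumption here, not necessarily the author's) computes the
# discounted reward-to-go in one pass with scipy's linear filter:
import numpy as np
import scipy.signal

def discount(x, gamma):
    # Returns y with y[t] = sum_{k >= t} gamma**(k - t) * x[k]
    return scipy.signal.lfilter([1.0], [1.0, -gamma], np.asarray(x)[::-1], axis=0)[::-1]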
def __init__(self, env_params=None, policy_params=None, num_workers=16, num_deltas=60, deltas_used=60, delta_std=0.003, logdir=None, model_path=None, save_path=None, rollout_length=1000, step_size=0.01, shift='constant zero', params=None, seed=123, eval_num=5): logz.configure_output_dir(logdir) logz.save_params(params) self.timesteps = 0 self.ob_size = policy_params["ob_dim"] self.action_size = policy_params["ac_dim"] self.num_deltas = num_deltas self.deltas_used = deltas_used self.rollout_length = rollout_length self.step_size = step_size self.delta_std = delta_std self.logdir = logdir self.model_path = model_path self.save_path = save_path self.shift = shift self.params = params self.max_past_avg_reward = float('-inf') self.num_episodes_used = float('inf') self.eval_num = eval_num self.best_score = -np.inf # create shared table for storing noise print("Creating deltas table.") deltas_id = create_shared_noise.remote() self.deltas = SharedNoiseTable(ray.get(deltas_id), seed=seed + 3) print('Created deltas table.') # initialize workers with different random seeds print('Initializing workers.') self.num_workers = num_workers self.workers = [ Worker.remote(seed + 7 * i, env_params=env_params, policy_params=policy_params, deltas=deltas_id, rollout_length=rollout_length, delta_std=delta_std) for i in range(num_workers) ] # initialize policy if policy_params['type'] == 'linear': self.policy = LinearPolicy(policy_params) else: self.policy = MLPPolicy("policy", policy_params["ob_dim"], policy_params["ac_dim"], policy_params["layer_norm"], tf.nn.selu, policy_params["layer_depth"],\ policy_params["layer_width"], self.save_path) # load model self.load_model() self.w_policy = self.policy.get_weights() # initialize optimization algorithm # self.optimizer = optimizers.SGD(self.w_policy, self.step_size) print("Initialization of ARS complete.")
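# For context, a NumPy sketch of the basic ARS weight update this class is set up to
# run (in the spirit of Mania et al., 2018). This is not the aggregate_rollouts code
# from this repository; all names are illustrative.
import numpy as np

def ars_step(w, deltas, rew_pos, rew_neg, step_size, top_k):
    # deltas: (num_deltas, w.size); rew_pos / rew_neg: returns of the +/- perturbations
    scores = np.maximum(rew_pos, rew_neg)
    idx = np.argsort(scores)[-top_k:]                      # keep the best directions
    r_pos, r_neg, d = rew_pos[idx], rew_neg[idx], deltas[idx]
    sigma_r = np.concatenate([r_pos, r_neg]).std() + 1e-8  # reward std for scaling
    grad = np.mean((r_pos - r_neg)[:, None] * d, axis=0)
    return w + (step_size / sigma_r) * grad.reshape(w.shape)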
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32 ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds torch.manual_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #todo: create Agent #todo: initilize Agent: #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = actor.run(ob) print("need to type-check action here:(two lines)") print(ac) print(ac.size()) acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break #One episode finishes; perform update here finish_episode(actor, actor_optimizer, critic=None, critic_optimizer=None, ) path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) 
logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
def setup_logger(logdir, env, locals_): # Configure output directory for logging logz.configure_output_dir(logdir) # Log experiment title based on env params = {"exp_name": env.spec.id} logz.save_params(params)
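# Illustrative usage of this logger variant; the output directory name is hypothetical:
# env = gym.make('CartPole-v0')
# setup_logger('data/CartPole-v0_experiment', env, locals())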
def train_SAC(env_name, exp_name, seed, reparametrize, two_qf, old_funct, logdir, debug, gpu): alpha = { 'Ant-v2': 0.1, 'HalfCheetah-v2': 0.2, 'Hopper-v2': 0.2, 'Humanoid-v2': 0.05, 'Walker2d-v2': 0.2, }.get(env_name, 0.2) algorithm_params = { 'alpha': alpha, 'batch_size': 256, 'discount': 0.99, 'learning_rate': 1e-3, 'reparameterize': reparametrize, 'tau': 0.01, 'epoch_length': 1000, 'n_epochs': 500, 'two_qf': two_qf, } sampler_params = { 'max_episode_length': 1000, 'prefill_steps': 1000, } replay_pool_params = { 'max_size': 1e6, } value_function_params = { 'hidden_layer_sizes': (128, 128), } q_function_params = { 'hidden_layer_sizes': (128, 128), } policy_params = { 'hidden_layer_sizes': (128, 128), } logz.configure_output_dir(logdir) params = { 'exp_name': exp_name, 'env_name': env_name, 'algorithm_params': algorithm_params, 'sampler_params': sampler_params, 'replay_pool_params': replay_pool_params, 'value_function_params': value_function_params, 'q_function_params': q_function_params, 'policy_params': policy_params } logz.save_params(params) env = gym.envs.make(env_name) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) env.seed(seed) sampler = utils.SimpleSampler(**sampler_params) replay_pool = utils.SimpleReplayPool( observation_shape=env.observation_space.shape, action_shape=env.action_space.shape, **replay_pool_params) q_function = nn.QFunction(name='q_function', **q_function_params) if algorithm_params.get('two_qf', False): q_function2 = nn.QFunction(name='q_function2', **q_function_params) else: q_function2 = None value_function = nn.ValueFunction(name='value_function', **value_function_params) target_value_function = nn.ValueFunction(name='target_value_function', **value_function_params) policy = nn.GaussianPolicy( action_dim=env.action_space.shape[0], reparameterize=algorithm_params['reparameterize'], old_funct=old_funct, **policy_params) sampler.initialize(env, policy, replay_pool) algorithm = SAC(**algorithm_params) gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=gpu) tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, gpu_options=gpu_options) with tf.Session(config=tf_config) as sess: if debug: sess = tf_debug.LocalCLIDebugWrapperSession(sess) algorithm.build(env=env, policy=policy, q_function=q_function, q_function2=q_function2, value_function=value_function, target_value_function=target_value_function) for epoch in algorithm.train(sampler, session=sess, n_epochs=algorithm_params.get( 'n_epochs', 1000)): logz.log_tabular('Iteration', epoch) for k, v in algorithm.get_statistics().items(): logz.log_tabular(k, v) for k, v in replay_pool.get_statistics().items(): logz.log_tabular(k, v) for k, v in sampler.get_statistics().items(): logz.log_tabular(k, v) logz.dump_tabular()
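# The 'tau' entry in algorithm_params controls how target_value_function tracks
# value_function. The SAC class itself is not shown in this file; a hedged TF1-style
# sketch of the Polyak averaging update that tau typically parameterizes:
import tensorflow as tf

def make_target_update_op(source_vars, target_vars, tau):
    # target <- tau * source + (1 - tau) * target, applied once per gradient step
    return tf.group(*[tf.assign(t, tau * s + (1.0 - tau) * t)
                      for s, t in zip(source_vars, target_vars)])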
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32 ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = TODO #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. 
# # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: # YOUR_CODE_HERE sy_logits_na = TODO sy_sampled_ac = TODO # Hint: Use the tf.multinomial op sy_logprob_n = TODO else: # YOUR_CODE_HERE sy_mean = TODO sy_logstd = TODO # logstd should just be a trainable variable, not a network output. sy_sampled_ac = TODO sy_logprob_n = TODO # Hint: Use the log probability under a multivariate gaussian. #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# loss = TODO # Loss function that we'll differentiate to get the policy gradient. update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze(build_mlp( sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. # YOUR_CODE_HERE baseline_update_op = TODO #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) ac = ac[0] acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you 
defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# # YOUR_CODE_HERE q_n = TODO #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = TODO adv_n = q_n - b_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. # YOUR_CODE_HERE pass #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE pass #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. 
# YOUR_CODE_HERE # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
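# The templates above repeatedly point to a 'build_mlp' utility ("the function you
# defined in utilities"), which is not part of this dump. A minimal TF1 sketch of
# such a helper, included as an assumption; signatures vary slightly across the
# variants above:
import tensorflow as tf

def build_mlp(input_placeholder, output_size, scope, n_layers=2, size=64,
              activation=tf.tanh, output_activation=None):
    with tf.variable_scope(scope):
        out = input_placeholder
        for _ in range(n_layers):
            out = tf.layers.dense(out, size, activation=activation)
        return tf.layers.dense(out, output_size, activation=output_activation)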