def setup_replay_buffer(self): """Set up the experience replay memory""" logger.info("setting up replay buffer") # In the discrete-action case, we store the action indices if isinstance(self.ac_space, spaces.Box): ac_shape_ = self.ac_shape elif isinstance(self.ac_space, spaces.Discrete): ac_shape_ = () else: raise RuntimeError("ac space is neither Box nor Discrete") xp_params = { 'limit': self.hps.mem_size, 'ob_shape': self.ob_shape, 'ac_shape': ac_shape_ } extra_xp_params = { 'alpha': self.hps.alpha, 'beta': self.hps.beta, 'ranked': self.hps.ranked } if self.hps.prioritized_replay: if self.hps.unreal: # Unreal prioritized experience replay self.replay_buffer = XP.UnrealRB(**xp_params) else: # Vanilla prioritized experience replay self.replay_buffer = XP.PrioritizedRB(**xp_params, **extra_xp_params) else: # Vanilla experience replay self.replay_buffer = XP.RB(**xp_params) # Summarize replay buffer creation (relies on `__repr__` method) logger.info(" {} configured".format(self.replay_buffer))
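# The sketch below illustrates how the `alpha` and `beta` hyperparameters passed to the
# prioritized buffer are typically used (Schaul et al., 'Prioritized Experience Replay'):
# `alpha` shapes the sampling distribution built from TD-error priorities, while `beta`
# controls the importance weights that correct for the induced sampling bias. This is a
# standalone illustration only; the actual `XP.PrioritizedRB` implementation may differ
# (e.g. rank-based vs proportional prioritization when `ranked` is set).
import numpy as np

def prioritized_sampling_sketch(td_errors, alpha=0.6, beta=0.4, eps=1e-6):
    """Return sampling probabilities and normalized importance weights."""
    priorities = (np.abs(td_errors) + eps) ** alpha
    probs = priorities / priorities.sum()
    iws = (len(td_errors) * probs) ** (-beta)
    iws /= iws.max()  # normalize so that the weights only scale updates down
    return probs, iws

_probs, _iws = prioritized_sampling_sketch(np.array([0.1, 2.0, 0.5, 0.0]))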
def setup_param_noise(self): """Set up two separate perturbed actors: one will be used only for interacting with the environment, while the other will be used exclusively for stddev adaptation. We use two instead of one for clarity purposes. """ # Define parameter corresponding to the current parameter noise stddev self.pn_cur_std = self.param_noise.cur_std # real value, not the placeholder logger.info("setting up param noise") # Configure parameter-noise-perturbed ('pnp') actor # Use: interact with the environment self.pnp_actor_pred = self.clip_acs(self.pnp_actor(self.obz0)) self.p_actor_updates = get_p_actor_updates(self.actor, self.pnp_actor, self.pn_std) logger.info("setting up adaptive param noise") # Configure adaptive-parameter-noise-perturbed ('apnp') actor # Use: adapt the standard deviation self.apnp_actor_pred = self.clip_acs(self.apnp_actor(self.obz0)) self.a_p_actor_updates = get_p_actor_updates(self.actor, self.apnp_actor, self.pn_std) self.a_dist = tf.sqrt( tf.reduce_mean(tf.square(self.actor_pred - self.apnp_actor_pred))) # Create callable objects # Act (and compute Q) according to the parameter-noise-perturbed actor if isinstance(self.ac_space, spaces.Box): self.p_act = TheanoFunction(inputs=[self.obs0], outputs=[self.pnp_actor_pred]) self.p_act_q = TheanoFunction( inputs=[self.obs0], outputs=[self.pnp_actor_pred, self.critic_pred_w_actor]) elif isinstance(self.ac_space, spaces.Discrete): # Note: actor network outputs softmax -> take argmax to pick one action self.pnp_actor_pred_ = tf.argmax(self.pnp_actor_pred, axis=-1) self.p_act = TheanoFunction(inputs=[self.obs0], outputs=[self.pnp_actor_pred_]) self.p_act_q = TheanoFunction( inputs=[self.obs0], outputs=[self.pnp_actor_pred_, self.critic_pred_w_actor]) # Compute the distance between the actor and adaptive-parameter-noise-perturbed actor predictions self.get_a_p_dist = TheanoFunction(inputs=[self.obs0, self.pn_std], outputs=self.a_dist) # Retrieve parameter-noise-perturbation updates self.apply_p_actor_updates = TheanoFunction( inputs=[self.pn_std], outputs=[self.p_actor_updates]) # Retrieve adaptive-parameter-noise-perturbation updates self.apply_a_p_actor_updates = TheanoFunction( inputs=[self.pn_std], outputs=[self.a_p_actor_updates])
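# For reference, the adaptation rule that the current stddev (`self.pn_cur_std`) typically
# follows (Plappert et al., 'Parameter Space Noise for Exploration'): after measuring `a_dist`
# between the actor and the adaptive-parameter-noise-perturbed actor on a batch of observations,
# the stddev is shrunk when the perturbed policy strays too far and grown when it stays too
# close. This is a minimal sketch, assuming `self.param_noise` exposes a target distance
# `delta`; the actual `AdaptiveParamNoise.adapt` method may differ.
def adapt_param_noise_std_sketch(cur_std, measured_dist, delta, factor=1.01):
    if measured_dist > delta:
        return cur_std / factor  # perturbed policy strayed too far -> shrink the noise
    return cur_std * factor      # perturbed policy too close -> grow the noise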
def close_video_recorder(self): """Close video recorder""" if self.recording: logger.info("saving video to:\n {}".format(self.video_recorder.path)) # If recording, close the recorder self.video_recorder.close() # Reset running statistics self.recording = False self.num_recorded_frames = 1
def get_benchmark(env_id): """Verify that the specified env is amongst the admissible ones""" envs = yaml.safe_load(open("admissible_envs.yml"))['environments'] benchmark = None for k, v in envs.items(): if env_id in list(v.keys()): benchmark = k assert benchmark is not None, "env not found in 'project_root/admissible_envs.yml'" logger.info("env_id = {} <- admissibility check passed!".format(env_id)) return benchmark
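# Hypothetical example of the 'admissible_envs.yml' structure the lookup above expects:
# benchmark names are top-level keys under 'environments', each mapping env ids to per-env
# metadata. The real file is project-specific; this only illustrates the traversal.
_example_envs = {
    'environments': {
        'mujoco': {'Hopper-v2': {}, 'Walker2d-v2': {}},
        'atari': {'PongNoFrameskip-v4': {}},
    },
}
assert 'Hopper-v2' in _example_envs['environments']['mujoco']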
def setup_actor(self): logger.info("setting up actor optimizer") losses = OrderedDict() # Create the Q loss as the negative of the cumulated Q values q_loss = -tf.reduce_mean(self.critic_pred_w_actor) q_loss *= self.hps.q_actor_loss_scale # Create the actor loss w/ the scaled Q loss loss = q_loss losses.update({'actor_q_loss': q_loss}) # Create the D loss as the negative of the cumulated D values d_loss = -tf.reduce_mean(self.d_pred_w_actor) d_loss *= self.hps.d_actor_loss_scale # Add the D loss to the actor loss loss += d_loss losses.update({'actor_d_loss': d_loss}) # Add assembled actor loss losses.update({'actor_total_loss': loss}) # Create gradients grads = flatgrad(loss, self.actor.trainable_vars, self.hps.clip_norm) # Create mpi adam optimizer optimizer = MpiAdamOptimizer(comm=self.comm, clip_norm=self.hps.clip_norm, learning_rate=self.hps.actor_lr, name='actor_adam') optimize_ = optimizer.minimize(loss=loss, var_list=self.actor.trainable_vars) # Create callable objects get_losses = TheanoFunction(inputs=[self.obs0], outputs=list(losses.values())) get_grads = TheanoFunction(inputs=[self.obs0], outputs=grads) optimize = TheanoFunction(inputs=[self.obs0], outputs=optimize_) # Log statistics log_module_info(logger, self.name, self.actor) # Return the actor ops return { 'names': list(losses.keys()), 'losses': get_losses, 'grads': get_grads, 'optimizer': optimizer, 'optimize': optimize }
def add_demo_transitions_to_mem(self, dset): """Add transitions from expert demonstration trajectories to memory""" # Ensure the replay buffer is empty as demos need to be first assert self.num_entries == 0 and self.num_demos == 0 logger.info("adding demonstrations to memory") # Zip transition atoms transitions = zipsame(dset.obs0, dset.acs, dset.env_rews, dset.obs1, dset.dones1) # Note: careful w/ the order, it should correspond to the order in `append` signature for transition in transitions: self.append(*transition, is_demo=True) self.num_demos += 1 assert self.num_demos == self.num_entries logger.info(" num entries in memory after addition: {}".format(self.num_entries))
def __init__(self, expert_arxiv, size, train_fraction=None, randomize=True, full=False): """Create a dataset from the `expert_arxiv` expert demonstration trajectories archive. Data structure of the archive in .npz format: the transitions are saved in python dictionary format with keys 'obs0', 'acs', 'env_rews', 'dones1', 'obs1', 'ep_lens', 'ep_env_rets'; the value of each item is a list storing the expert trajectories sequentially. Note that 'ep_lens' and 'ep_env_rets' are stored solely for monitoring purposes, and w/o them, a transition corresponds exactly to the format of transitions stored in memory. """ # Load the .npz archive file logger.info("loading expert demonstration trajectories from archive") traj_data = np.load(expert_arxiv) self.size = size assert 0 <= self.size <= len(traj_data['obs0']), "wrong demo dataset size" # Unpack # 1. Slice the desired quantity of trajectories # 2. Flatten the list of trajectories into a list of transitions # Unpacking is done separately for each atom self.obs0 = np.array(flatten(traj_data['obs0'][:self.size])) self.acs = np.array(flatten(traj_data['acs'][:self.size])) if full: self.env_rews = np.array(flatten(traj_data['env_rews'][:self.size])) self.dones1 = np.array(flatten(traj_data['dones1'][:self.size])) self.obs1 = np.array(flatten(traj_data['obs1'][:self.size])) self.ep_rets = traj_data['ep_env_rets'][:self.size] self.ep_lens = traj_data['ep_lens'][:self.size] # Compute dataset statistics self.ret_mean = np.mean(np.array(self.ep_rets)) self.ret_std = np.std(np.array(self.ep_rets)) self.len_mean = np.mean(np.array(self.ep_lens)) self.len_std = np.std(np.array(self.ep_lens)) # Create (obs0,acs) dataset self.randomize = randomize self.pair_dset = PairDataset(self.obs0, self.acs, self.randomize) if train_fraction is not None: # Split dataset into train and validation datasets (used in BC) # Note: split over transitions, not trajectories t_t_frontier = int(len(self.obs0) * train_fraction) self.pair_train_set = PairDataset(self.obs0[:t_t_frontier, :], self.acs[:t_t_frontier, :], self.randomize) self.pair_val_set = PairDataset(self.obs0[t_t_frontier:, :], self.acs[t_t_frontier:, :], self.randomize) # Log message upon successful trajectory dataset initialization self.log_info()
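# The `flatten` helper used above is assumed to concatenate a list of per-trajectory
# sequences into a single flat list of transitions (trajectories are sliced first, then
# flattened). A minimal version, purely for illustration, could look like this:
def flatten_sketch(list_of_trajs):
    """Flatten e.g. [[t0, t1], [t2]] into [t0, t1, t2]."""
    return [transition for traj in list_of_trajs for transition in traj]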
def parse_noise_type(self, noise_type): """Parse the `noise_type` hyperparameter""" ac_noise = None param_noise = None if isinstance(self.ac_space, spaces.Box): ac_dim = self.ac_space.shape[-1] # num dims elif isinstance(self.ac_space, spaces.Discrete): ac_dim = self.ac_space.n # num ac choices else: raise RuntimeError("ac space is neither Box nor Discrete") logger.info("parsing noise type") # Parse the comma-separated (with possible whitespace) list of noise params for cur_noise_type in noise_type.split(','): cur_noise_type = cur_noise_type.strip() # remove leading and trailing whitespace # If the specified noise type is literally 'none' if cur_noise_type == 'none': pass # If 'adaptive-param' is in the specified string for noise type elif 'adaptive-param' in cur_noise_type: # Set parameter noise from imitation.imitation_algorithms.param_noise import AdaptiveParamNoise if isinstance(self.ac_space, spaces.Box): _, std = cur_noise_type.split('_') std = float(std) param_noise = AdaptiveParamNoise(initial_std=std, delta=std) elif isinstance(self.ac_space, spaces.Discrete): _, init_eps = cur_noise_type.split('_') init_eps = float(init_eps) # Compute param noise threshold depending on eps, as explained in Appendix C.1 # of the paper 'Parameter Space Noise for Exploration', Plappert et al., ICLR 2018 init_delta = -np.log(1. - init_eps + (init_eps / float(ac_dim))) param_noise = AdaptiveParamNoise(delta=init_delta) self.setup_eps_greedy(init_eps) logger.info(" {} configured".format(param_noise)) elif 'normal' in cur_noise_type: assert isinstance(self.ac_space, spaces.Box), "must be continuous ac space" _, std = cur_noise_type.split('_') # Spherical (isotropic) gaussian action noise from imitation.imitation_algorithms.ac_noise import NormalAcNoise ac_noise = NormalAcNoise(mu=np.zeros(ac_dim), sigma=float(std) * np.ones(ac_dim)) logger.info(" {} configured".format(ac_noise)) elif 'ou' in cur_noise_type: assert isinstance(self.ac_space, spaces.Box), "must be continuous ac space" _, std = cur_noise_type.split('_') # Ornstein-Uhlenbeck action noise from imitation.imitation_algorithms.ac_noise import OUAcNoise ac_noise = OUAcNoise(mu=np.zeros(ac_dim), sigma=float(std) * np.ones(ac_dim)) logger.info(" {} configured".format(ac_noise)) else: raise RuntimeError("unknown specified noise type: '{}'".format(cur_noise_type)) return param_noise, ac_noise
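# Worked example of the discrete-action threshold computed above (Appendix C.1 of
# 'Parameter Space Noise for Exploration'): the initial delta is chosen so that parameter
# noise matches an epsilon-greedy policy with exploration rate `init_eps`. Illustrative
# values only.
import numpy as np

_init_eps, _ac_dim = 0.2, 6
_init_delta = -np.log(1. - _init_eps + (_init_eps / float(_ac_dim)))
# For init_eps=0.2 and 6 actions, init_delta is roughly 0.18.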
def setup_popart(self): """Preserve the magnitude of the return @ the critic output by rescaling the critic output vars (w + b) w/ the old running statistics Reference paper: https://arxiv.org/pdf/1602.07714.pdf """ logger.info("setting up popart") # Set up old and new stds and means self.old_std = tf.placeholder(name='old_std', dtype=tf.float32, shape=[1]) new_std = self.ret_rms.std self.old_mean = tf.placeholder(name='old_mean', dtype=tf.float32, shape=[1]) new_mean = self.ret_rms.mean self.popart_op = [] # Pass once in the critic and once in the target critic -> 2 loop steps for output_vars in [ self.critic.output_vars, self.targ_critic.output_vars ]: # Ensure the network only has 2 vars w/ 'final' in their names (w + b of output layer) assert len(output_vars) == 2, "only w + b of the critic output layer should be caught -> 2 vars" out_names = [var.name for var in output_vars] for out_name in out_names: # Log output variables involved in popart logger.info(" {}".format(out_name)) # Unpack weight and bias of output layer w, b = output_vars # Ensure that w is indeed a weight, and that b is indeed a bias assert 'kernel' in w.name, "'kernel' not in w.name" assert 'bias' in b.name, "'bias' not in b.name" # Ensure that both w and b are compatible w/ the critic spitting out a scalar assert w.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.popart_op += [w.assign(w * self.old_std / new_std)] self.popart_op += [ b.assign( (b * self.old_std + self.old_mean - new_mean) / new_std) ] # Create callable objects self.popart = TheanoFunction(inputs=[self.old_mean, self.old_std], outputs=[self.popart_op])
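# Sanity check of the rescaling above: with w' = w * old_std / new_std and
# b' = (b * old_std + old_mean - new_mean) / new_std, the de-normalized critic output
# new_std * (w' x + b') + new_mean equals old_std * (w x + b) + old_mean, so updating the
# return statistics leaves the un-normalized return prediction unchanged. Illustrative numbers:
import numpy as np

x = 3.0
w, b = 0.7, -0.2
old_mean, old_std = 1.0, 2.0
new_mean, new_std = 4.0, 5.0
w_new = w * old_std / new_std
b_new = (b * old_std + old_mean - new_mean) / new_std
assert np.isclose(old_std * (w * x + b) + old_mean,
                  new_std * (w_new * x + b_new) + new_mean)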
def setup_target_network_updates(self): logger.info("setting up target network updates") actor_args = [self.actor.vars, self.targ_actor.vars, self.hps.polyak] critic_args = [ self.critic.vars, self.targ_critic.vars, self.hps.polyak ] actor_hard_updates, actor_soft_updates = get_target_updates( *actor_args) critic_hard_updates, critic_soft_updates = get_target_updates( *critic_args) self.targ_hard_updates = [actor_hard_updates, critic_hard_updates] self.targ_soft_updates = [actor_soft_updates, critic_soft_updates] # Create callable objects self.perform_targ_hard_updates = TheanoFunction( inputs=[], outputs=[self.targ_hard_updates]) self.perform_targ_soft_updates = TheanoFunction( inputs=[], outputs=[self.targ_soft_updates])
def evaluate(env, trpo_agent_wrapper, discriminator_wrapper, num_trajs, sample_or_mode, render, exact_model_path=None, model_ckpt_dir=None): """Evaluate a trained GAIL agent""" # Only one of the two arguments can be provided assert sum([exact_model_path is None, model_ckpt_dir is None]) == 1 # Rebuild the computational graph to gain evaluation access to a learned and saved policy pi = trpo_agent_wrapper('pi') d = discriminator_wrapper('d') traj_gen = traj_ep_generator(env=env, pi=pi, d=d, sample_or_mode=sample_or_mode, render=render) # Initialize and load the previously learned weights into the freshly re-built graph initialize() if exact_model_path is not None: load_model(exact_model_path) logger.info( "model loaded from exact path:\n {}".format(exact_model_path)) else: # `exact_model_path` is None -> `model_ckpt_dir` is not None load_latest_checkpoint(model_ckpt_dir) logger.info("model loaded from ckpt dir:\n {}".format(model_ckpt_dir)) # Initialize the history data structures ep_lens = [] ep_syn_rets = [] ep_env_rets = [] # Collect trajectories for i in range(num_trajs): logger.info("evaluating [{}/{}]".format(i + 1, num_trajs)) traj = traj_gen.__next__() ep_len, ep_syn_ret, ep_env_ret = traj['ep_len'], traj[ 'ep_syn_ret'], traj['ep_env_ret'] # Aggregate to the history data structures ep_lens.append(ep_len) ep_syn_rets.append(ep_syn_ret) ep_env_rets.append(ep_env_ret) # Log some statistics of the collected trajectories sample_or_mode = 'sample' if sample_or_mode else 'mode' logger.info("action picking: {}".format(sample_or_mode)) ep_len_mean = np.mean(ep_lens) ep_syn_ret_mean = np.mean(ep_syn_rets) ep_env_ret_mean = np.mean(ep_env_rets) logger.record_tabular("ep_len_mean", ep_len_mean) logger.record_tabular("ep_syn_ret_mean", ep_syn_ret_mean) logger.record_tabular("ep_env_ret_mean", ep_env_ret_mean) logger.dump_tabular()
def get_target_updates(vars_, targ_vars, polyak): """Return assignment ops for target network updates. Hard updates are used for initialization only, while soft updates are used throughout the training process, at every iteration. Note that DQN uses hard updates while training, but those updates are not performed every iteration (only once every XX iterations). """ logger.info("setting up target updates") hard_updates = [] soft_updates = [] assert len(vars_) == len(targ_vars) for var_, targ_var in zipsame(vars_, targ_vars): logger.info(' {} <- {}'.format(targ_var.name, var_.name)) hard_updates.append(tf.assign(targ_var, var_)) soft_updates.append( tf.assign(targ_var, (1. - polyak) * targ_var + polyak * var_)) assert len(hard_updates) == len(vars_) assert len(soft_updates) == len(vars_) return tf.group(*hard_updates), tf.group( *soft_updates) # ops that group ops
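# Numerical illustration of the updates built above: the hard update copies the online
# parameter into the target, while the soft ('polyak') update makes the target track the
# online parameter with an exponential moving average. Illustrative scalar values only.
polyak = 0.005
var_, targ_var = 1.0, 0.0
targ_hard = var_                                         # hard update: target copies the online value
for _ in range(1000):
    targ_var = (1. - polyak) * targ_var + polyak * var_  # soft update: slow tracking
# After 1000 soft updates, targ_var is roughly 1 - (1 - polyak) ** 1000 ~= 0.993.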
def get_p_actor_updates(actor, perturbed_actor, pn_std): """Return assignment ops for actor parameter-noise perturbations. The perturbations consist in applying additive gaussian noise to the perturbable actor variables, while simply leaving the non-perturbable ones untouched. """ assert len(actor.vars) == len(perturbed_actor.vars) assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars) updates = [] for var_, perturbed_var in zipsame(actor.vars, perturbed_actor.vars): if var_ in actor.perturbable_vars: logger.info(" {} <- {} + noise".format(perturbed_var.name, var_.name)) noised_up_var = var_ + tf.random_normal( tf.shape(var_), mean=0., stddev=pn_std) updates.append(tf.assign(perturbed_var, noised_up_var)) else: logger.info(" {} <- {}".format(perturbed_var.name, var_.name)) updates.append(tf.assign(perturbed_var, var_)) assert len(updates) == len(actor.vars) return tf.group(*updates)
def test_logger(): info("hi") debug("shouldn't appear") set_level(DEBUG) debug("should appear") dir = "/tmp/testlogging" if osp.exists(dir): shutil.rmtree(dir) configure(dir_=dir) logkv("a", 3) logkv("b", 2.5) dumpkvs() logkv("b", -2.5) logkv("a", 5.5) dumpkvs() info("^^^ should see a = 5.5") logkv("b", -2.5) dumpkvs() logkv("a", "longasslongasslongasslongasslongasslongassvalue") dumpkvs()
def configure_logging(self): """Configure the experiment""" if self.comm is None or self.rank == 0: log_path = self.get_log_path() formats_strs = ['stdout', 'log', 'csv'] fmtstr = "configuring logger" if self.comm is not None and self.rank == 0: fmtstr += " [master]" logger.info(fmtstr) logger.configure(dir_=log_path, format_strs=formats_strs) fmtstr = "logger configured" if self.comm is not None and self.rank == 0: fmtstr += " [master]" logger.info(fmtstr) logger.info(" directory: {}".format(log_path)) logger.info(" output formats: {}".format(formats_strs)) # In the same log folder, log the args in a yaml file file_logger = FileLogger(uuid=self.uuid, path=self.get_log_path(), file_prefix=self.name_prefix) file_logger.set_info('note', self.args.note) file_logger.set_info('uuid', self.uuid) file_logger.set_info('task', self.args.task) file_logger.set_info('args', str(self.args)) fmtstr = "experiment configured" if self.comm is not None: fmtstr += " [{} MPI workers]".format(self.comm.Get_size()) logger.info(fmtstr) else: logger.info("configuring logger [worker #{}]".format(self.rank)) logger.configure(dir_=None, format_strs=None) logger.set_level(logger.DISABLED)
def log_info(self): logger.info("successfully initialized (obs0,acs) dataset, w/ statistics:") logger.info(" extracted num trajectories: {}".format(self.size)) logger.info(" extracted num transitions: {}".format(len(self.obs0))) logger.info(" trajectory return mean: {}".format(self.ret_mean)) logger.info(" trajectory return std: {}".format(self.ret_std)) logger.info(" trajectory length mean: {}".format(self.len_mean)) logger.info(" trajectory length std: {}".format(self.len_std))
def setup_critic(self): logger.info("setting up critic optimizer") losses = OrderedDict() phs = [self.obs0, self.acs] if self.hps.prioritized_replay: phs.append(self.iws) # Create the 1-step look-ahead TD error loss td_errors_1 = self.critic_pred - self.tc1z hubered_td_errors_1 = huber_loss(td_errors_1) if self.hps.prioritized_replay: # Adjust with importance weights hubered_td_errors_1 *= self.iws td_loss_1 = tf.reduce_mean(hubered_td_errors_1) td_loss_1 *= self.hps.td_loss_1_scale # Create the critic loss w/ the scaled 1-step TD loss loss = td_loss_1 losses.update({'critic_td_loss_1': td_loss_1}) phs.append(self.tc1s) if self.hps.n_step_returns: # Create the n-step look-ahead TD error loss td_errors_n = self.critic_pred - self.tcnz hubered_td_errors_n = huber_loss(td_errors_n) if self.hps.prioritized_replay: # Adjust with importance weights hubered_td_errors_n *= self.iws td_loss_n = tf.reduce_mean(hubered_td_errors_n) td_loss_n *= self.hps.td_loss_n_scale # Add the scaled n-step TD loss to the critic loss loss += td_loss_n losses.update({'critic_td_loss_n': td_loss_n}) phs.append(self.tcns) # Fetch critic's regularization losses (@property of the network) wd_loss = tf.reduce_sum(self.critic.regularization_losses) # Note: no need to multiply by a scale as it has already been scaled logger.info("setting up weight decay") if self.hps.wd_scale > 0: for var in self.critic.trainable_vars: if var in self.critic.decayable_vars: logger.info(" {} <- wd w/ scale {}".format( var.name, self.hps.wd_scale)) else: logger.info(" {}".format(var.name)) # Add critic weight decay regularization to the critic loss loss += wd_loss losses.update({'critic_wd': wd_loss}) # Add assembled critic loss losses.update({'critic_total_loss': loss}) # Create gradients grads = flatgrad(loss, self.critic.trainable_vars, self.hps.clip_norm) # Create mpi adam optimizer optimizer = MpiAdamOptimizer(comm=self.comm, clip_norm=self.hps.clip_norm, learning_rate=self.hps.critic_lr, name='critic_adam') optimize_ = optimizer.minimize(loss=loss, var_list=self.critic.trainable_vars) # Create callable objects get_losses = TheanoFunction(inputs=phs, outputs=list(losses.values())) get_grads = TheanoFunction(inputs=phs, outputs=grads) optimize = TheanoFunction(inputs=phs, outputs=optimize_) if self.hps.prioritized_replay: td_errors_ops = [td_errors_1] + ([td_errors_n] if self.hps.n_step_returns else []) get_td_errors = TheanoFunction(inputs=phs, outputs=td_errors_ops) # Log statistics log_module_info(logger, self.name, self.critic) # Return the critic ops out = { 'names': list(losses.keys()), 'losses': get_losses, 'grads': get_grads, 'optimizer': optimizer, 'optimize': optimize } if self.hps.prioritized_replay: out.update({'td_errors': get_td_errors}) return out
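# The `huber_loss` helper used above is assumed to follow the standard definition (quadratic
# near zero, linear in the tails), which keeps the critic update robust to large TD errors.
# A minimal numpy version, for reference only (the project helper may use a different delta
# or signature):
import numpy as np

def huber_loss_sketch(x, delta=1.0):
    return np.where(np.abs(x) <= delta,
                    0.5 * np.square(x),
                    delta * (np.abs(x) - 0.5 * delta))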
def learn(comm, env, xpo_agent_wrapper, sample_or_mode, gamma, save_frequency, ckpt_dir, summary_dir, timesteps_per_batch, batch_size, optim_epochs_per_iter, lr, experiment_name, ent_reg_scale, clipping_eps, gae_lambda, schedule, max_iters): rank = comm.Get_rank() # Create policies pi = xpo_agent_wrapper('pi') old_pi = xpo_agent_wrapper('old_pi') # Create and retrieve already-existing placeholders ob = get_placeholder_cached(name='ob') ac = pi.pd_type.sample_placeholder([None]) adv = tf.placeholder(name='adv', dtype=tf.float32, shape=[None]) ret = tf.placeholder(name='ret', dtype=tf.float32, shape=[None]) # Adaptive learning rate multiplier, updated with schedule lr_mult = tf.placeholder(name='lr_mult', dtype=tf.float32, shape=[]) # Build graphs kl_mean = tf.reduce_mean(old_pi.pd_pred.kl(pi.pd_pred)) ent_mean = tf.reduce_mean(pi.pd_pred.entropy()) ent_pen = (-ent_reg_scale) * ent_mean vf_err = tf.reduce_mean(tf.square(pi.v_pred - ret)) # MC error # The surrogate objective is defined as: advantage * pnew / pold ratio = tf.exp(pi.pd_pred.logp(ac) - old_pi.pd_pred.logp(ac)) # IS surr_gain = ratio * adv # surrogate objective (CPI) # Annealed clipping parameter epsilon clipping_eps = clipping_eps * lr_mult surr_gain_w_clipping = tf.clip_by_value(ratio, 1.0 - clipping_eps, 1.0 + clipping_eps) * adv # PPO's pessimistic surrogate (L^CLIP in paper) surr_loss = -tf.reduce_mean(tf.minimum(surr_gain, surr_gain_w_clipping)) # Assemble losses (including the value function loss) loss = surr_loss + ent_pen + vf_err losses = OrderedDict() # Add losses losses.update({'pol_kl_mean': kl_mean, 'pol_ent_mean': ent_mean, 'pol_ent_pen': ent_pen, 'pol_surr_loss': surr_loss, 'pol_vf_err': vf_err, 'pol_total_loss': loss}) # Make the current `pi` become the next `old_pi` zipped = zipsame(old_pi.vars, pi.vars) updates_op = [] for k, v in zipped: # Populate list of assignment operations logger.info("assignment: {} <- {}".format(k, v)) assign_op = tf.assign(k, v) updates_op.append(assign_op) assert len(updates_op) == len(pi.vars) # Create mpi adam optimizer optimizer = MpiAdamOptimizer(comm=comm, clip_norm=5.0, learning_rate=lr * lr_mult, name='adam') optimize = optimizer.minimize(loss=loss, var_list=pi.trainable_vars) # Create callable objects assign_old_eq_new = TheanoFunction(inputs=[], outputs=updates_op) compute_losses = TheanoFunction(inputs=[ob, ac, adv, ret, lr_mult], outputs=list(losses.values())) optimize = TheanoFunction(inputs=[ob, ac, adv, ret, lr_mult], outputs=optimize) # Initialise variables initialize() # Sync params of all processes with the params of the root process optimizer.sync_from_root(pi.trainable_vars) # Create context manager that records the time taken by encapsulated ops timed = timed_cm_wrapper(comm, logger) if rank == 0: # Create summary writer summary_writer = tf.summary.FileWriterCache.get(summary_dir) seg_gen = traj_segment_generator(env, pi, timesteps_per_batch, sample_or_mode) eps_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() # Define rolling buffers for recent stats aggregation maxlen = 100 len_buffer = deque(maxlen=maxlen) env_ret_buffer = deque(maxlen=maxlen) pol_losses_buffer = deque(maxlen=maxlen) while iters_so_far <= max_iters: pretty_iter(logger, iters_so_far) pretty_elapsed(logger, tstart) # Verify that the processes are still in sync if iters_so_far > 0 and iters_so_far % 10 == 0: optimizer.check_synced(pi.trainable_vars) logger.info("params still in sync across processes") # Manage lr multiplier schedule if schedule == 'constant': curr_lr_mult = 1.0 
elif schedule == 'linear': curr_lr_mult = max(1.0 - float(iters_so_far * timesteps_per_batch) / (max_iters * timesteps_per_batch), 0) else: raise NotImplementedError # Save the model if rank == 0 and iters_so_far % save_frequency == 0 and ckpt_dir is not None: model_path = osp.join(ckpt_dir, experiment_name) save_state(model_path, iters_so_far=iters_so_far) logger.info("saving model") logger.info(" @: {}".format(model_path)) with timed("sampling mini-batch"): seg = seg_gen.__next__() augment_segment_gae_stats(seg, gamma, gae_lambda, rew_key="env_rews") # Standardize advantage function estimate seg['advs'] = (seg['advs'] - seg['advs'].mean()) / (seg['advs'].std() + 1e-8) # Update running mean and std if hasattr(pi, 'obs_rms'): with timed("normalizing obs via rms"): pi.obs_rms.update(seg['obs'], comm) assign_old_eq_new({}) # Create Feeder object to iterate over (ob, ac, adv, td_lam_ret) tuples data_map = {'obs': seg['obs'], 'acs': seg['acs'], 'advs': seg['advs'], 'td_lam_rets': seg['td_lam_rets']} feeder = Feeder(data_map=data_map, enable_shuffle=True) # Update policy and state-value function with timed("updating policy and value function"): for _ in range(optim_epochs_per_iter): for minibatch in feeder.get_feed(batch_size=batch_size): feeds = {ob: minibatch['obs'], ac: minibatch['acs'], adv: minibatch['advs'], ret: minibatch['td_lam_rets'], lr_mult: curr_lr_mult} # Compute losses pol_losses = compute_losses(feeds) # Update the policy and value function optimize(feeds) # Store the losses pol_losses_buffer.append(pol_losses) # Log policy update statistics logger.info("logging training losses (log)") pol_losses_np_mean = np.mean(pol_losses_buffer, axis=0) pol_losses_mpi_mean = mpi_mean_reduce(pol_losses_buffer, comm, axis=0) zipped_pol_losses = zipsame(list(losses.keys()), pol_losses_np_mean, pol_losses_mpi_mean) logger.info(columnize(names=['name', 'local', 'global'], tuples=zipped_pol_losses, widths=[20, 16, 16])) # Log statistics logger.info("logging misc training stats (log + csv)") # Gather statistics across workers local_lens_rets = (seg['ep_lens'], seg['ep_env_rets']) gathered_lens_rets = comm.allgather(local_lens_rets) lens, env_rets = map(flatten_lists, zip(*gathered_lens_rets)) # Extend the deques of recorded statistics len_buffer.extend(lens) env_ret_buffer.extend(env_rets) ep_len_mpi_mean = np.mean(len_buffer) ep_env_ret_mpi_mean = np.mean(env_ret_buffer) logger.record_tabular('ep_len_mpi_mean', ep_len_mpi_mean) logger.record_tabular('ep_env_ret_mpi_mean', ep_env_ret_mpi_mean) eps_this_iter = len(lens) timesteps_this_iter = sum(lens) eps_so_far += eps_this_iter timesteps_so_far += timesteps_this_iter eps_this_iter_mpi_mean = mpi_mean_like(eps_this_iter, comm) timesteps_this_iter_mpi_mean = mpi_mean_like(timesteps_this_iter, comm) eps_so_far_mpi_mean = mpi_mean_like(eps_so_far, comm) timesteps_so_far_mpi_mean = mpi_mean_like(timesteps_so_far, comm) logger.record_tabular('eps_this_iter_mpi_mean', eps_this_iter_mpi_mean) logger.record_tabular('timesteps_this_iter_mpi_mean', timesteps_this_iter_mpi_mean) logger.record_tabular('eps_so_far_mpi_mean', eps_so_far_mpi_mean) logger.record_tabular('timesteps_so_far_mpi_mean', timesteps_so_far_mpi_mean) logger.record_tabular('elapsed time', prettify_time(time.time() - tstart)) # no mpi mean logger.record_tabular('ev_td_lam_before', explained_variance(seg['vs'], seg['td_lam_rets'])) iters_so_far += 1 if rank == 0: logger.dump_tabular() if rank == 0: # Add summaries summary = tf.summary.Summary() tab = 'ppo' # Episode stats
summary.value.add(tag="{}/{}".format(tab, 'mean_ep_len'), simple_value=ep_len_mpi_mean) summary.value.add(tag="{}/{}".format(tab, 'mean_ep_env_ret'), simple_value=ep_env_ret_mpi_mean) # Losses for name, loss in zipsame(list(losses.keys()), pol_losses_mpi_mean): summary.value.add(tag="{}/{}".format(tab, name), simple_value=loss) summary_writer.add_summary(summary, iters_so_far)
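# Standalone numpy rendition of the pessimistic clipped surrogate optimized above (L^CLIP in
# the PPO paper), for a single batch of probability ratios and advantages. Purely illustrative;
# the TensorFlow graph built above is the version actually trained.
import numpy as np

def ppo_clip_loss_sketch(ratio, adv, clipping_eps=0.2):
    unclipped = ratio * adv
    clipped = np.clip(ratio, 1.0 - clipping_eps, 1.0 + clipping_eps) * adv
    return -np.mean(np.minimum(unclipped, clipped))

_loss = ppo_clip_loss_sketch(np.array([0.8, 1.5, 1.0]), np.array([1.0, 1.0, -2.0]))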
def learn(comm, env, xpo_agent_wrapper, sample_or_mode, gamma, max_kl, save_frequency, ckpt_dir, summary_dir, timesteps_per_batch, batch_size, experiment_name, ent_reg_scale, gae_lambda, cg_iters, cg_damping, vf_iters, vf_lr, max_iters): rank = comm.Get_rank() # Create policies pi = xpo_agent_wrapper('pi') old_pi = xpo_agent_wrapper('old_pi') # Create and retrieve already-existing placeholders ob = get_placeholder_cached(name='ob') ac = pi.pd_type.sample_placeholder([None]) adv = tf.placeholder(name='adv', dtype=tf.float32, shape=[None]) ret = tf.placeholder(name='ret', dtype=tf.float32, shape=[None]) flat_tangent = tf.placeholder(name='flat_tan', dtype=tf.float32, shape=[None]) # Build graphs kl_mean = tf.reduce_mean(old_pi.pd_pred.kl(pi.pd_pred)) ent_mean = tf.reduce_mean(pi.pd_pred.entropy()) ent_bonus = ent_reg_scale * ent_mean vf_err = tf.reduce_mean(tf.square(pi.v_pred - ret)) # MC error # The surrogate objective is defined as: advantage * pnew / pold ratio = tf.exp(pi.pd_pred.logp(ac) - old_pi.pd_pred.logp(ac)) # IS surr_gain = tf.reduce_mean(ratio * adv) # surrogate objective (CPI) # Add entropy bonus optim_gain = surr_gain + ent_bonus losses = OrderedDict() # Add losses losses.update({ 'pol_kl_mean': kl_mean, 'pol_ent_mean': ent_mean, 'pol_ent_bonus': ent_bonus, 'pol_surr_gain': surr_gain, 'pol_optim_gain': optim_gain, 'pol_vf_err': vf_err }) # Build natural gradient material get_flat = GetFlat(pi.pol_trainable_vars) set_from_flat = SetFromFlat(pi.pol_trainable_vars) kl_grads = tf.gradients(kl_mean, pi.pol_trainable_vars) shapes = [var.get_shape().as_list() for var in pi.pol_trainable_vars] start = 0 tangents = [] for shape in shapes: sz = intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz # Create the gradient vector product gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(kl_grads, tangents) ]) # Create the Fisher vector product fvp = flatgrad(gvp, pi.pol_trainable_vars) # Make the current `pi` become the next `old_pi` zipped = zipsame(old_pi.vars, pi.vars) updates_op = [] for k, v in zipped: # Populate list of assignment operations logger.info("assignment: {} <- {}".format(k, v)) assign_op = tf.assign(k, v) updates_op.append(assign_op) assert len(updates_op) == len(pi.vars) # Create mpi adam optimizer for the value function vf_optimizer = MpiAdamOptimizer(comm=comm, clip_norm=5.0, learning_rate=vf_lr, name='vf_adam') optimize_vf = vf_optimizer.minimize(loss=vf_err, var_list=pi.vf_trainable_vars) # Create gradients grads = flatgrad(optim_gain, pi.pol_trainable_vars) # Create callable objects assign_old_eq_new = TheanoFunction(inputs=[], outputs=updates_op) compute_losses = TheanoFunction(inputs=[ob, ac, adv, ret], outputs=list(losses.values())) compute_losses_grads = TheanoFunction(inputs=[ob, ac, adv, ret], outputs=list(losses.values()) + [grads]) compute_fvp = TheanoFunction(inputs=[flat_tangent, ob, ac, adv], outputs=fvp) optimize_vf = TheanoFunction(inputs=[ob, ret], outputs=optimize_vf) # Initialise variables initialize() # Sync params of all processes with the params of the root process theta_init = get_flat() comm.Bcast(theta_init, root=0) set_from_flat(theta_init) vf_optimizer.sync_from_root(pi.vf_trainable_vars) # Create context manager that records the time taken by encapsulated ops timed = timed_cm_wrapper(comm, logger) if rank == 0: # Create summary writer summary_writer = tf.summary.FileWriterCache.get(summary_dir) # Create segment generator seg_gen = traj_segment_generator(env, pi, 
timesteps_per_batch, sample_or_mode) eps_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() # Define rolling buffers for recent stats aggregation maxlen = 100 len_buffer = deque(maxlen=maxlen) env_ret_buffer = deque(maxlen=maxlen) pol_losses_buffer = deque(maxlen=maxlen) while iters_so_far <= max_iters: pretty_iter(logger, iters_so_far) pretty_elapsed(logger, tstart) # Verify that the processes are still in sync if iters_so_far > 0 and iters_so_far % 10 == 0: vf_optimizer.check_synced(pi.vf_trainable_vars) logger.info("vf params still in sync across processes") # Save the model if rank == 0 and iters_so_far % save_frequency == 0 and ckpt_dir is not None: model_path = osp.join(ckpt_dir, experiment_name) save_state(model_path, iters_so_far=iters_so_far) logger.info("saving model") logger.info(" @: {}".format(model_path)) with timed("sampling mini-batch"): seg = seg_gen.__next__() augment_segment_gae_stats(seg, gamma, gae_lambda, rew_key="env_rews") # Standardize advantage function estimate seg['advs'] = (seg['advs'] - seg['advs'].mean()) / (seg['advs'].std() + 1e-8) # Update running mean and std if hasattr(pi, 'obs_rms'): with timed("normalizing obs via rms"): pi.obs_rms.update(seg['obs'], comm) def fisher_vector_product(p): computed_fvp = compute_fvp({ flat_tangent: p, ob: seg['obs'], ac: seg['acs'], adv: seg['advs'] }) return mpi_mean_like(computed_fvp, comm) + cg_damping * p assign_old_eq_new({}) # Compute gradients with timed("computing gradients"): *loss_before, g = compute_losses_grads({ ob: seg['obs'], ac: seg['acs'], adv: seg['advs'], ret: seg['td_lam_rets'] }) loss_before = mpi_mean_like(loss_before, comm) g = mpi_mean_like(g, comm) if np.allclose(g, 0): logger.info("got zero gradient -> not updating") else: with timed("performing conjugate gradient procedure"): step_direction = conjugate_gradient(f_Ax=fisher_vector_product, b=g, cg_iters=cg_iters, verbose=(rank == 0)) assert np.isfinite(step_direction).all() shs = 0.5 * step_direction.dot( fisher_vector_product(step_direction)) # shs is (1/2)*s^T*A*s in the paper lm = np.sqrt(shs / max_kl) # lm is 1/beta in the paper (max_kl is user-specified delta) full_step = step_direction / lm # beta*s expected_improve = g.dot(full_step) # project s on g surr_before = loss_before[4] # 5-th in loss list step_size = 1.0 theta_before = get_flat() with timed("updating policy"): for _ in range( 10): # trying (10 times max) until the stepsize is OK # Update the policy parameters theta_new = theta_before + full_step * step_size set_from_flat(theta_new) pol_losses = compute_losses({ ob: seg['obs'], ac: seg['acs'], adv: seg['advs'], ret: seg['td_lam_rets'] }) pol_losses_buffer.append(pol_losses) pol_losses_mpi_mean = mpi_mean_like(pol_losses, comm) surr = pol_losses_mpi_mean[4] kl = pol_losses_mpi_mean[0] actual_improve = surr - surr_before logger.info(" expected: {:.3f} | actual: {:.3f}".format( expected_improve, actual_improve)) if not np.isfinite(pol_losses_mpi_mean).all(): logger.info(" got non-finite value of losses :(") elif kl > max_kl * 1.5: logger.info( " violated KL constraint -> shrinking step.") elif actual_improve < 0: logger.info( " surrogate didn't improve -> shrinking step.") else: logger.info(" stepsize fine :)") break step_size *= 0.5 # backtracking when the step size is deemed inappropriate else: logger.info(" couldn't compute a good step") set_from_flat(theta_before) # Create Feeder object to iterate over (ob, ret) pairs feeder = Feeder(data_map={ 'obs': seg['obs'], 'td_lam_rets': seg['td_lam_rets'] }, 
enable_shuffle=True) # Update state-value function with timed("updating value function"): for _ in range(vf_iters): for minibatch in feeder.get_feed(batch_size=batch_size): optimize_vf({ ob: minibatch['obs'], ret: minibatch['td_lam_rets'] }) # Log policy update statistics logger.info("logging pol training losses (log)") pol_losses_np_mean = np.mean(pol_losses_buffer, axis=0) pol_losses_mpi_mean = mpi_mean_reduce(pol_losses_buffer, comm, axis=0) zipped_pol_losses = zipsame(list(losses.keys()), pol_losses_np_mean, pol_losses_mpi_mean) logger.info( columnize(names=['name', 'local', 'global'], tuples=zipped_pol_losses, widths=[20, 16, 16])) # Log statistics logger.info("logging misc training stats (log + csv)") # Gather statistics across workers local_lens_rets = (seg['ep_lens'], seg['ep_env_rets']) gathered_lens_rets = comm.allgather(local_lens_rets) lens, env_rets = map(flatten_lists, zip(*gathered_lens_rets)) # Extend the deques of recorded statistics len_buffer.extend(lens) env_ret_buffer.extend(env_rets) ep_len_mpi_mean = np.mean(len_buffer) ep_env_ret_mpi_mean = np.mean(env_ret_buffer) logger.record_tabular('ep_len_mpi_mean', ep_len_mpi_mean) logger.record_tabular('ep_env_ret_mpi_mean', ep_env_ret_mpi_mean) eps_this_iter = len(lens) timesteps_this_iter = sum(lens) eps_so_far += eps_this_iter timesteps_so_far += timesteps_this_iter eps_this_iter_mpi_mean = mpi_mean_like(eps_this_iter, comm) timesteps_this_iter_mpi_mean = mpi_mean_like(timesteps_this_iter, comm) eps_so_far_mpi_mean = mpi_mean_like(eps_so_far, comm) timesteps_so_far_mpi_mean = mpi_mean_like(timesteps_so_far, comm) logger.record_tabular('eps_this_iter_mpi_mean', eps_this_iter_mpi_mean) logger.record_tabular('timesteps_this_iter_mpi_mean', timesteps_this_iter_mpi_mean) logger.record_tabular('eps_so_far_mpi_mean', eps_so_far_mpi_mean) logger.record_tabular('timesteps_so_far_mpi_mean', timesteps_so_far_mpi_mean) logger.record_tabular('elapsed time', prettify_time(time.time() - tstart)) # no mpi mean logger.record_tabular( 'ev_td_lam_before', explained_variance(seg['vs'], seg['td_lam_rets'])) iters_so_far += 1 if rank == 0: logger.dump_tabular() if rank == 0: # Add summaries summary = tf.summary.Summary() tab = 'trpo' # Episode stats summary.value.add(tag="{}/{}".format(tab, 'mean_ep_len'), simple_value=ep_len_mpi_mean) summary.value.add(tag="{}/{}".format(tab, 'mean_ep_env_ret'), simple_value=ep_env_ret_mpi_mean) # Losses for name, loss in zipsame(list(losses.keys()), pol_losses_mpi_mean): summary.value.add(tag="{}/{}".format(tab, name), simple_value=loss) summary_writer.add_summary(summary, iters_so_far)
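# Illustration of how the trust-region step above is scaled: given the conjugate-gradient
# solution s of A s = g (A the Fisher matrix, g the policy gradient), shs = (1/2) s^T A s and
# the full step is s / sqrt(shs / max_kl), so the quadratic KL model of the step equals max_kl
# before any backtracking. Toy example with an explicit 2x2 stand-in Fisher matrix:
import numpy as np

A = np.array([[2.0, 0.0], [0.0, 0.5]])   # stand-in Fisher matrix
g = np.array([1.0, 1.0])                 # stand-in policy gradient
max_kl = 0.01
s = np.linalg.solve(A, g)                # what conjugate_gradient approximates
shs = 0.5 * s.dot(A.dot(s))
full_step = s / np.sqrt(shs / max_kl)
assert np.isclose(0.5 * full_step.dot(A.dot(full_step)), max_kl)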
def gather_trajectories(env, xpo_agent_wrapper, demos_dir, num_trajs, sample_or_mode, render, expert_arxiv_name, exact_model_path=None, model_ckpt_dir=None): """Gather trajectories from a trained `mlp_policy` agent""" # Only one of the two arguments can be provided assert sum([exact_model_path is None, model_ckpt_dir is None]) == 1 # Rebuild the computational graph to gain evaluation access to a learned and saved policy pi = xpo_agent_wrapper('pi') # Create episode generator traj_gen = traj_ep_generator(env=env, pi=pi, sample_or_mode=sample_or_mode, render=render) # Initialize and load the previously learned weights into the freshly re-built graph initialize() if exact_model_path is not None: load_model(exact_model_path) logger.info( "model loaded from exact path:\n {}".format(exact_model_path)) else: # `exact_model_path` is None -> `model_ckpt_dir` is not None load_latest_checkpoint(model_ckpt_dir) logger.info("model loaded from ckpt dir:\n {}".format(model_ckpt_dir)) # Initialize the history data structures obs0 = [] acs = [] env_rews = [] dones1 = [] obs1 = [] ep_env_rets = [] ep_lens = [] # Collect trajectories for i in range(num_trajs): logger.info("gathering [{}/{}]".format(i + 1, num_trajs)) traj = traj_gen.__next__() # Next two steps are separated to shrink line length ep_obs0, ep_acs, ep_env_rews = traj['obs0'], traj['acs'], traj[ 'env_rews'] ep_dones1, ep_obs1 = traj['dones1'], traj['obs1'] ep_len, ep_env_ret = traj['ep_len'], traj['ep_env_ret'] # Aggregate to the history data structures obs0.append(ep_obs0) acs.append(ep_acs) env_rews.append(ep_env_rews) dones1.append(ep_dones1) obs1.append(ep_obs1) ep_lens.append(ep_len) ep_env_rets.append(ep_env_ret) # Log some statistics of the collected trajectories sample_or_mode = 'sample' if sample_or_mode else 'mode' logger.info("action picking: {}".format(sample_or_mode)) ep_len_mean = np.mean(ep_lens) ep_len_std = np.std(ep_lens) ep_env_ret_mean = np.mean(ep_env_rets) ep_env_ret_std = np.std(ep_env_rets) ep_env_ret_min = np.amin(ep_env_rets) ep_env_ret_max = np.amax(ep_env_rets) logger.record_tabular("ep_len_mean", ep_len_mean) logger.record_tabular("ep_len_std", ep_len_std) logger.record_tabular("ep_env_ret_mean", ep_env_ret_mean) logger.record_tabular("ep_env_ret_std", ep_env_ret_std) logger.record_tabular("ep_env_ret_min", ep_env_ret_min) logger.record_tabular("ep_env_ret_max", ep_env_ret_max) logger.dump_tabular() # Assemble the file name path = osp.join(demos_dir, "{}.{}".format(expert_arxiv_name, sample_or_mode)) # Save the gathered data collections to the filesystem np.savez(path, obs0=np.array(obs0), acs=np.array(acs), env_rews=np.array(env_rews), dones1=np.array(dones1), obs1=np.array(obs1), ep_lens=np.array(ep_lens), ep_env_rets=np.array(ep_env_rets)) logger.info("saving demonstrations") logger.info(" @: {}.npz".format(path))
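# The archive written above can be read back with the same keys the demo dataset loader expects
# ('obs0', 'acs', 'env_rews', 'dones1', 'obs1', 'ep_lens', 'ep_env_rets'). Minimal sketch, with
# `arxiv_path` standing in for the '{}.npz' path logged above:
import numpy as np

def peek_demo_arxiv(arxiv_path):
    traj_data = np.load(arxiv_path, allow_pickle=True)  # episodes have varying lengths
    return {'num_trajs': len(traj_data['ep_lens']),
            'ep_len_mean': float(np.mean(traj_data['ep_lens'])),
            'ep_env_ret_mean': float(np.mean(traj_data['ep_env_rets']))}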