def display_var_info(vars):
  from third_party.baselines import logger
  count_params = 0
  for v in vars:
    name = v.name
    if "/Adam" in name or "beta1_power" in name or "beta2_power" in name:
      continue
    v_params = np.prod(v.shape.as_list())
    count_params += v_params
    if "/b:" in name or "/biases" in name:
      continue  # Wx+b, bias is not interesting to look at => count params, but not print
    logger.info(" %s%s %i params %s" % (name, " " * (55 - len(name)),
                                        v_params, str(v.shape)))
  logger.info("Total model parameters: %0.2f million" % (count_params * 1e-6))
def train(self, batch_gen, steps_per_epoch, num_epochs):
  mblossvals = []
  mbhistos = []
  mbscs = []
  mbascs = []
  for epoch in range(num_epochs):
    gather_histo = (epoch == num_epochs - 1)
    for step in range(steps_per_epoch):
      gather_sc = ((epoch == num_epochs - 1) and (step == steps_per_epoch - 1))
      obs, obs_next, acs = next(batch_gen)
      with logger.ProfileKV('train_ot_inner'):
        fetches = self._train(obs, obs_next, acs,
                              gather_histo=gather_histo, gather_sc=gather_sc)
      mblossvals.append(fetches['losses'])
      if gather_histo:
        mbhistos.append(fetches['stats_histo'])
      if gather_sc:
        mbscs.append(fetches['stats_sc'])
        mbascs.append(fetches['additional_sc'])

  lossvals = np.mean(mblossvals, axis=0)
  assert len(mbscs) == 1
  assert len(mbascs) == 1
  scalars = mbscs[0]
  additional_scalars = mbascs[0]
  histograms = {
      n: np.concatenate([f[n] for f in mbhistos], axis=0)
      for n in self._stats_histo_names
  }
  logger.info('RLBModelWrapper.train histograms: {}'.format(
      [(n, histograms[n].shape) for n in histograms.keys()]))

  for (lossval, lossname) in zip(lossvals, self._loss_names):
    logger.logkv(lossname, lossval)
  for n, v in scalars.items():
    logger.logkv(n, v)
  for n, v in additional_scalars.items():
    logger.logkv(n, v)
  for n, v in histograms.items():
    logger.logkv(n, v)
    logger.logkv('mean_' + n, np.mean(v))
    logger.logkv('std_' + n, np.std(v))
    logger.logkv('max_' + n, np.max(v))
    logger.logkv('min_' + n, np.min(v))
def _generate_batch(self, x1, *data):
  """Generate batches of data used to train the R network."""
  logger.info(
      'RLBTrainer._generate_batch. # batches per epoch: {}'.format(
          len(x1) // self._batch_size))
  while True:
    # Train for one epoch.
    sample_count = len(x1)
    number_of_batches = sample_count // self._batch_size
    for batch_index in range(number_of_batches):
      from_index = batch_index * self._batch_size
      to_index = (batch_index + 1) * self._batch_size
      yield (np.array(x1[from_index:to_index]),) + tuple(
          np.array(d[from_index:to_index]) for d in data)
    # After each epoch, shuffle the data.
    res = self._shuffle(x1, *data)
    x1 = res[0]
    data = res[1:]
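# Illustrative sketch (not part of the original source): _generate_batch above
# yields aligned mini-batches of (x1, *data) indefinitely and reshuffles once
# per epoch. A minimal standalone equivalent over plain numpy arrays, with a
# hypothetical batch size, could look like this:
import numpy as np

def toy_batch_gen(x1, acs, batch_size=32):
  """Yields (x1, acs) mini-batches forever, reshuffling after every epoch."""
  while True:
    for i in range(len(x1) // batch_size):
      sl = slice(i * batch_size, (i + 1) * batch_size)
      yield x1[sl], acs[sl]
    perm = np.random.permutation(len(x1))
    x1, acs = x1[perm], acs[perm]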
def create_environments(env_name,
                        num_envs,
                        r_network_weights_path=None,
                        dmlab_homepath='',
                        action_set='',
                        base_seed=123,
                        scale_task_reward_for_eval=1.0,
                        scale_surrogate_reward_for_eval=0.0,
                        online_r_training=False,
                        environment_engine='dmlab',
                        r_network_weights_store_path='',
                        level_cache_mode=False):
  """Creates environments with R-network-based curiosity reward.

  Args:
    env_name: Name of the DMLab environment.
    num_envs: Number of parallel environments to spawn.
    r_network_weights_path: Path to the weights of the R-network.
    dmlab_homepath: Path to the DMLab MPM. Required when running on borg.
    action_set: One of {'small', 'nofire', ''}. Which action set to use.
    base_seed: Each environment will use base_seed+env_index as seed.
    scale_task_reward_for_eval: Scale of the task reward to be used for
      valid/test environments.
    scale_surrogate_reward_for_eval: Scale of the surrogate reward to be used
      for valid/test environments.
    online_r_training: Whether to enable online training of the R-network.
    environment_engine: One of 'dmlab', 'atari', 'parkour'.
    r_network_weights_store_path: Directory where to store R checkpoints
      generated during online training of the R-network.
    level_cache_mode: If True, the process only generates and caches DMLab
      levels, then idles.

  Returns:
    A tuple (train, valid, test) of environments wrapped with the curiosity
    reward.
  """
  # Environments without intrinsic exploration rewards.
  # pylint: disable=g-long-lambda
  create_dmlab_single_env = functools.partial(
      create_single_env,
      dmlab_homepath=dmlab_homepath,
      action_set=action_set,
      level_cache_mode=level_cache_mode)

  if environment_engine == 'dmlab':
    create_env_fn = create_dmlab_single_env
    is_atari_environment = False
  elif environment_engine == 'atari':
    create_env_fn = create_single_atari_env
    is_atari_environment = True
  elif environment_engine == 'parkour':
    mujoco_key_path = ''
    create_env_fn = functools.partial(
        create_single_parkour_env, mujoco_key_path=mujoco_key_path)
    is_atari_environment = False
  else:
    raise ValueError('Unknown env engine {}'.format(environment_engine))

  # WARNING: python processes are not really compatible with other google3
  # code, which can lead to deadlock. See go/g3process. This is why you may
  # want to use ThreadedVecEnv.
  VecEnvClass = (subproc_vec_env.SubprocVecEnv
                 if FLAGS.vec_env_class == 'SubprocVecEnv'
                 else threaded_vec_env.ThreadedVecEnv)

  with logger.ProfileKV('create_envs'):
    vec_env = VecEnvClass([
        (lambda _i=i: create_env_fn(
            env_name, base_seed + _i, use_monitor=True, split='train'))
        for i in range(num_envs)
    ], level_cache_mode=level_cache_mode)
    valid_env = VecEnvClass([
        (lambda _i=i: create_env_fn(
            env_name, base_seed + _i, use_monitor=False, split='valid'))
        for i in range(num_envs)
    ], level_cache_mode=level_cache_mode)
    test_env = VecEnvClass([
        (lambda _i=i: create_env_fn(
            env_name, base_seed + _i, use_monitor=False, split='test'))
        for i in range(num_envs)
    ], level_cache_mode=level_cache_mode)
  if level_cache_mode:
    #logger.info('Starting the infinite map generation sequence...')
    logger.info('Starting the finite map generation sequence...')
    import time
    while True:
      time.sleep(10)
  # pylint: enable=g-long-lambda

  # Size of states when stored in the memory.
  embedding_size = models.EMBEDDING_DIM
  if not r_network_weights_path:
    # Empty string is equivalent to no R-network checkpoint.
    r_network_weights_path = None

  r_net = r_network.RNetwork(
      (84, 84, 4) if is_atari_environment else Const.OBSERVATION_SHAPE,
      r_network_weights_path)

  # Only for online training do we need to train the R-network.
  r_network_trainer = None
  if online_r_training:
    r_network_trainer = r_network_training.RNetworkTrainer(
        r_net._r_network,  # pylint: disable=protected-access
        checkpoint_dir=r_network_weights_store_path)

  # Creates the episodic memory that is attached to each of those envs.
  vec_episodic_memory = [
      episodic_memory.EpisodicMemory(
          observation_shape=[embedding_size],
          observation_compare_fn=r_net.embedding_similarity)
      for _ in range(num_envs)
  ]

  # The size of images is reduced to make training faster.
  # Note: using color images with DMLab makes it much easier to train a
  # policy, so there is no conversion to grayscale.
  target_image_shape = [84, 84, 4 if is_atari_environment else 3]
  env_wrapper = curiosity_env_wrapper.CuriosityEnvWrapper(
      vec_env, vec_episodic_memory, r_net.embed_observation,
      target_image_shape)
  if r_network_trainer is not None:
    env_wrapper.add_observer(r_network_trainer)

  valid_env_wrapper, test_env_wrapper = (
      curiosity_env_wrapper.CuriosityEnvWrapper(
          env, vec_episodic_memory, r_net.embed_observation,
          target_image_shape,
          exploration_reward=('none' if (is_atari_environment or
                                         environment_engine == 'parkour')
                              else 'oracle'),
          scale_task_reward=scale_task_reward_for_eval,
          scale_surrogate_reward=scale_surrogate_reward_for_eval)
      for env in [valid_env, test_env])

  return env_wrapper, valid_env_wrapper, test_env_wrapper
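# Illustrative usage sketch (not part of the original source); the argument
# values below are assumptions chosen only to show the calling convention of
# create_environments().
train_env, valid_env, test_env = create_environments(
    env_name='explore_goal_locations_small',
    num_envs=12,
    r_network_weights_path='/path/to/r_network_weights.h5',
    environment_engine='dmlab',
    online_r_training=True,
    r_network_weights_store_path='/tmp/r_checkpoints')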
def learn(policy,
          env,
          nsteps,
          total_timesteps,
          ent_coef,
          lr,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          train_callback=None,
          eval_callback=None,
          cloud_sync_callback=None,
          cloud_sync_interval=1000,
          workdir='',
          use_curiosity=False,
          curiosity_strength=0.01,
          forward_inverse_ratio=0.2,
          curiosity_loss_strength=10,
          random_state_predictor=False,
          use_rlb=False,
          checkpoint_path_for_debugging=None):
  if isinstance(lr, float):
    lr = constfn(lr)
  else:
    assert callable(lr)
  if isinstance(cliprange, float):
    cliprange = constfn(cliprange)
  else:
    assert callable(cliprange)
  total_timesteps = int(total_timesteps)

  nenvs = env.num_envs
  ob_space = env.observation_space
  ac_space = env.action_space
  nbatch = nenvs * nsteps
  nbatch_train = nbatch // nminibatches

  # pylint: disable=g-long-lambda
  make_model = lambda: Model(
      policy=policy,
      ob_space=ob_space,
      ac_space=ac_space,
      nbatch_act=nenvs,
      nbatch_train=nbatch_train,
      nsteps=nsteps,
      ent_coef=ent_coef,
      vf_coef=vf_coef,
      max_grad_norm=max_grad_norm,
      use_curiosity=use_curiosity,
      curiosity_strength=curiosity_strength,
      forward_inverse_ratio=forward_inverse_ratio,
      curiosity_loss_strength=curiosity_loss_strength,
      random_state_predictor=random_state_predictor,
      use_rlb=use_rlb)
  # pylint: enable=g-long-lambda
  if save_interval and workdir:
    with tf.gfile.Open(osp.join(workdir, 'make_model.pkl'), 'wb') as fh:
      fh.write(dill.dumps(make_model))

  saver = tf.train.Saver(max_to_keep=10000000)

  def save_state(fname):
    if not osp.exists(osp.dirname(fname)):
      os.makedirs(osp.dirname(fname))
    saver.save(tf.get_default_session(), fname)

  with tf.device('/gpu:0'):
    model = make_model()
  if load_path is not None:
    model.load(load_path)
  runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam,
                  eval_callback=eval_callback)

  if checkpoint_path_for_debugging is not None:
    tf_util.load_state(
        checkpoint_path_for_debugging,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope='rlb_model'))

  epinfobuf = deque(maxlen=100)
  tfirststart = time.time()

  nupdates = total_timesteps // nbatch
  for update in range(1, nupdates + 1):
    assert nbatch % nminibatches == 0
    nbatch_train = nbatch // nminibatches
    tstart = time.time()
    frac = 1.0 - (update - 1.0) / nupdates
    lrnow = lr(frac)
    cliprangenow = cliprange(frac)
    ((obs, next_obs, returns, masks, actions, values, neglogpacs), states,
     epinfos, (rewards, rewards_ext, rewards_int, rewards_int_raw,
               selected_infos, dones)) = runner.run()
    epinfobuf.extend(epinfos)
    mblossvals = []
    mbhistos = []
    mbscs = []

    #if model.all_rlb_args.debug_args['debug_tf_timeline'] and update % 5 == 0:
    if model.all_rlb_args.debug_args['debug_tf_timeline'] and update % 1 == 0:
      debug_timeliner = logger.TimeLiner()
    else:
      debug_timeliner = None

    if states is None:  # nonrecurrent version
      inds = np.arange(nbatch)
      for oe in range(noptepochs):
        gather_histo = (oe == noptepochs - 1)
        np.random.shuffle(inds)
        for start in range(0, nbatch, nbatch_train):
          gather_sc = ((oe == noptepochs - 1) and
                       (start + nbatch_train >= nbatch))
          end = start + nbatch_train
          mbinds = inds[start:end]
          slices = [arr[mbinds] for arr in (obs, returns, masks, actions,
                                            values, neglogpacs, next_obs)]
          with logger.ProfileKV('train'):
            fetches = model.train(lrnow, cliprangenow,
                                  slices[0], slices[6], slices[1], slices[2],
                                  slices[3], slices[4], slices[5],
                                  gather_histo=gather_histo,
                                  gather_sc=gather_sc,
                                  debug_timeliner=debug_timeliner)
          mblossvals.append(fetches['losses'])
          if gather_histo:
            mbhistos.append(fetches['stats_histo'])
          if gather_sc:
            mbscs.append(fetches['stats_sc'])
    else:  # recurrent version
      assert nenvs % nminibatches == 0
      envsperbatch = nenvs // nminibatches
      envinds = np.arange(nenvs)
      flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
      envsperbatch = nbatch_train // nsteps
      for oe in range(noptepochs):
        gather_histo = (oe == noptepochs - 1)
        np.random.shuffle(envinds)
        for start in range(0, nenvs, envsperbatch):
          gather_sc = ((oe == noptepochs - 1) and
                       (start + nbatch_train >= nbatch))
          end = start + envsperbatch
          mbenvinds = envinds[start:end]
          mbflatinds = flatinds[mbenvinds].ravel()
          slices = [arr[mbflatinds] for arr in (obs, returns, masks, actions,
                                                values, neglogpacs, next_obs)]
          mbstates = states[mbenvinds]
          fetches = model.train(lrnow, cliprangenow,
                                slices[0], slices[6], slices[1], slices[2],
                                slices[3], slices[4], slices[5], mbstates,
                                gather_histo=gather_histo,
                                gather_sc=gather_sc,
                                debug_timeliner=debug_timeliner)
          mblossvals.append(fetches['losses'])
          if gather_histo:
            mbhistos.append(fetches['stats_histo'])
          if gather_sc:
            mbscs.append(fetches['stats_sc'])

    if debug_timeliner is not None:
      with logger.ProfileKV('save_timeline_json'):
        debug_timeliner.save(
            osp.join(workdir, 'timeline_{}.json'.format(update)))

    lossvals = np.mean(mblossvals, axis=0)
    assert len(mbscs) == 1
    scalars = mbscs[0]
    histograms = {
        n: np.concatenate([f[n] for f in mbhistos], axis=0)
        for n in model.stats_histo_names
    }
    logger.info('Histograms: {}'.format(
        [(n, histograms[n].shape) for n in histograms.keys()]))
    #for v in histograms.values():
    #  assert len(v) == nbatch

    tnow = time.time()
    fps = int(nbatch / (tnow - tstart))
    if update % log_interval == 0 or update == 1:
      fps_total = int((update * nbatch) / (tnow - tfirststart))
      #tf_op_names = [i.name for i in tf.get_default_graph().get_operations()]
      #logger.info('#################### tf_op_names: {}'.format(tf_op_names))
      tf_num_ops = len(tf.get_default_graph().get_operations())
      logger.info('#################### tf_num_ops: {}'.format(tf_num_ops))
      logger.logkv('tf_num_ops', tf_num_ops)
      ev = explained_variance(values, returns)
      logger.logkv('serial_timesteps', update * nsteps)
      logger.logkv('nupdates', update)
      logger.logkv('total_timesteps', update * nbatch)
      logger.logkv('fps', fps)
      logger.logkv('fps_total', fps_total)
      logger.logkv('remaining_time',
                   float(tnow - tfirststart) / float(update) *
                   float(nupdates - update))
      logger.logkv('explained_variance', float(ev))
      logger.logkv('eprewmean',
                   safemean([epinfo['r'] for epinfo in epinfobuf]))
      logger.logkv('eplenmean',
                   safemean([epinfo['l'] for epinfo in epinfobuf]))
      if train_callback:
        train_callback(safemean([epinfo['l'] for epinfo in epinfobuf]),
                       safemean([epinfo['r'] for epinfo in epinfobuf]),
                       update * nbatch)
      logger.logkv('time_elapsed', tnow - tfirststart)
      for (lossval, lossname) in zip(lossvals, model.loss_names):
        logger.logkv(lossname, lossval)
      for n, v in scalars.items():
        logger.logkv(n, v)
      for n, v in histograms.items():
        logger.logkv(n, v)
        logger.logkv('mean_' + n, np.mean(v))
        logger.logkv('std_' + n, np.std(v))
        logger.logkv('max_' + n, np.max(v))
        logger.logkv('min_' + n, np.min(v))
      for n, v in locals().items():
        if n in ['rewards_int', 'rewards_int_raw']:
          logger.logkv(n, v)
        if n in ['rewards', 'rewards_ext', 'rewards_int', 'rewards_int_raw']:
          logger.logkv('mean_' + n, np.mean(v))
          logger.logkv('std_' + n, np.std(v))
          logger.logkv('max_' + n, np.max(v))
          logger.logkv('min_' + n, np.min(v))
      if model.rlb_model:
        if model.all_rlb_args.outer_args['rlb_normalize_ir']:
          logger.logkv('rlb_ir_running_mean', runner.irff_rms.mean)
          logger.logkv('rlb_ir_running_std', np.sqrt(runner.irff_rms.var))
      logger.dumpkvs()
    if (save_interval and (update % save_interval == 0 or update == 1) and
        workdir):
      checkdir = osp.join(workdir, 'checkpoints')
      if not tf.gfile.Exists(checkdir):
        tf.gfile.MakeDirs(checkdir)
      savepath = osp.join(checkdir, '%.5i' % update)
      print('Saving to', savepath)
      model.save(savepath)

      checkdir = osp.join(workdir, 'full_checkpoints')
      if not tf.gfile.Exists(checkdir):
        tf.gfile.MakeDirs(checkdir)
      savepath = osp.join(checkdir, '%.5i' % update)
      print('Saving to', savepath)
      save_state(savepath)
    if (cloud_sync_interval and update % cloud_sync_interval == 0 and
        cloud_sync_callback):
      cloud_sync_callback()
  env.close()
  return model
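# Illustrative sketch (not part of the original source): learn() anneals both
# the learning rate and the PPO clip range linearly over training via
# frac = 1 - (update - 1) / nupdates. A minimal standalone version of that
# schedule:
def linear_anneal(initial_value, update, nupdates):
  """Returns initial_value scaled by the remaining fraction of training."""
  frac = 1.0 - (update - 1.0) / nupdates
  return initial_value * frac

# For example, with an initial value of 2.5e-4 and 100 updates, update 1 uses
# 2.5e-4 and the final update uses 2.5e-6 (values chosen only for
# illustration).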
def __init__(self, policy, ob_space, ac_space, nbatch_act, nbatch_train,
             nsteps, ent_coef, vf_coef, max_grad_norm, use_curiosity,
             curiosity_strength, forward_inverse_ratio,
             curiosity_loss_strength, random_state_predictor, use_rlb):
  sess = tf.get_default_session()
  nenvs = nbatch_act
  act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False)
  train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps,
                       reuse=True)

  assert not (use_curiosity and use_rlb)

  if use_curiosity:
    hidden_layer_size = 256
    self.state_encoder_net = tf.make_template(
        'state_encoder_net', pathak_utils.universeHead,
        create_scope_now_=True,
        trainable=(not random_state_predictor))
    self.icm_forward_net = tf.make_template(
        'icm_forward', pathak_utils.icm_forward_model,
        create_scope_now_=True,
        num_actions=ac_space.n,
        hidden_layer_size=hidden_layer_size)
    self.icm_inverse_net = tf.make_template(
        'icm_inverse', pathak_utils.icm_inverse_model,
        create_scope_now_=True,
        num_actions=ac_space.n,
        hidden_layer_size=hidden_layer_size)
  else:
    self.state_encoder_net = None
    self.icm_forward_net = None
    self.icm_inverse_net = None

  A = train_model.pdtype.sample_placeholder([None])
  ADV = tf.placeholder(tf.float32, [None])
  R = tf.placeholder(tf.float32, [None])
  OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
  OLDVPRED = tf.placeholder(tf.float32, [None])
  LR = tf.placeholder(tf.float32, [])
  CLIPRANGE = tf.placeholder(tf.float32, [])
  # When computing intrinsic reward a different batch size is used (number
  # of parallel environments), thus we need to define separate placeholders
  # for them.
  X_NEXT, _ = observation_input(ob_space, nbatch_train)
  X_INTRINSIC_NEXT, _ = observation_input(ob_space, nbatch_act)
  X_INTRINSIC_CURRENT, _ = observation_input(ob_space, nbatch_act)

  trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

  self.all_rlb_args = get_rlb_args()
  if use_rlb:
    rlb_scope = 'rlb_model'
    #rlb_ir_weight = self.all_rlb_args.outer_args['rlb_ir_weight']
    rlb_loss_weight = self.all_rlb_args.outer_args['rlb_loss_weight']
    self.rlb_model = tf.make_template(
        rlb_scope, define_rlb_model,
        create_scope_now_=True,
        pdtype=train_model.pdtype,
        ac_space=ac_space,
        #nenvs=nenvs,
        optimizer=trainer,
        outer_scope=rlb_scope,
        **self.all_rlb_args.inner_args)
  else:
    self.rlb_model = None

  neglogpac = train_model.pd.neglogp(A)
  entropy = tf.reduce_mean(train_model.pd.entropy())

  vpred = train_model.vf
  vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                             -CLIPRANGE, CLIPRANGE)
  vf_losses1 = tf.square(vpred - R)
  vf_losses2 = tf.square(vpredclipped - R)
  vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
  ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
  pg_losses = -ADV * ratio
  pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                       1.0 + CLIPRANGE)
  pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
  approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
  clipfrac = tf.reduce_mean(
      tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
  curiosity_loss = self.compute_curiosity_loss(
      use_curiosity, train_model.X, A, X_NEXT,
      forward_inverse_ratio=forward_inverse_ratio,
      curiosity_loss_strength=curiosity_loss_strength)
  loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + curiosity_loss

  if use_curiosity:
    encoded_time_step = self.state_encoder_net(X_INTRINSIC_CURRENT)
    encoded_next_time_step = self.state_encoder_net(X_INTRINSIC_NEXT)
    intrinsic_reward = self.curiosity_forward_model_loss(
        encoded_time_step, A, encoded_next_time_step)
    intrinsic_reward = intrinsic_reward * curiosity_strength

  if self.rlb_model:
    assert 'intrinsic_reward' not in locals()
    intrinsic_reward = self.rlb_model(ph_set=construct_ph_set(
        x=X_INTRINSIC_CURRENT,
        x_next=X_INTRINSIC_NEXT,
        a=A)).int_rew
    #intrinsic_reward = intrinsic_reward * rlb_ir_weight

    rlb_out = self.rlb_model(ph_set=construct_ph_set(
        x=train_model.X,
        x_next=X_NEXT,
        a=A))
    loss = loss + rlb_loss_weight * rlb_out.aux_loss

  #with tf.variable_scope('model'):
  params = tf.trainable_variables()
  logger.info('{} trainable parameters: {}'.format(
      len(params), [p.name for p in params]))
  # For whatever reason Pathak multiplies the loss by 20.
  pathak_multiplier = 20 if use_curiosity else 1
  grads = tf.gradients(loss * pathak_multiplier, params)
  if max_grad_norm is not None:
    grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
  grads = list(zip(grads, params))
  #trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
  _train = trainer.apply_gradients(grads)

  if self.all_rlb_args.debug_args['debug_tf_timeline']:
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    builder = option_builder.ProfileOptionBuilder
    profiler_opts = builder(
        builder.time_and_memory()).order_by('micros').build()
  else:
    run_options = None

  def getIntrinsicReward(curr, next_obs, actions):
    with logger.ProfileKV('get_intrinsic_reward'):
      return sess.run(intrinsic_reward,
                      {X_INTRINSIC_CURRENT: curr,
                       X_INTRINSIC_NEXT: next_obs,
                       A: actions})

  def train(lr, cliprange, obs, next_obs, returns, masks, actions, values,
            neglogpacs, states=None, gather_histo=False, gather_sc=False,
            debug_timeliner=None):
    advs = returns - values
    advs = (advs - advs.mean()) / (advs.std() + 1e-8)
    td_map = {
        train_model.X: obs,
        A: actions,
        ADV: advs,
        R: returns,
        LR: lr,
        CLIPRANGE: cliprange,
        OLDNEGLOGPAC: neglogpacs,
        OLDVPRED: values,
        X_NEXT: next_obs
    }
    if states is not None:
      td_map[train_model.S] = states
      td_map[train_model.M] = masks
    fetches = {
        'train': _train,
        'losses': [pg_loss, vf_loss, entropy, approxkl, clipfrac,
                   curiosity_loss],
    }
    if self.rlb_model:
      fetches['losses'].append(rlb_out.aux_loss)
    if gather_histo:
      fetches.update({'stats_histo': {}})
      if self.rlb_model:
        fetches['stats_histo'].update({
            n: getattr(rlb_out.stats_histo, n)
            for n in self.stats_histo_names
        })
    if gather_sc:
      fetches.update({'stats_sc': {}})
      if self.rlb_model:
        fetches['stats_sc'].update({
            n: getattr(rlb_out.stats_sc, n)
            for n in self.stats_sc_names
        })
    if (debug_timeliner is not None and
        self.all_rlb_args.debug_args['debug_tf_timeline']):
      run_metadata = tf.RunMetadata()
      final_run_options = run_options
    else:
      run_metadata = None
      final_run_options = None
    with logger.ProfileKV('train_sess_run'):
      result = sess.run(
          fetches,
          td_map,
          options=final_run_options,
          run_metadata=run_metadata,
      )
    if (debug_timeliner is not None and
        self.all_rlb_args.debug_args['debug_tf_timeline']):
      fetched_timeline = timeline.Timeline(run_metadata.step_stats)
      chrome_trace = fetched_timeline.generate_chrome_trace_format(
          show_memory=True)
      debug_timeliner.update_timeline(chrome_trace)
      tf.profiler.profile(tf.get_default_graph(),
                          run_meta=run_metadata,
                          cmd='scope',
                          options=profiler_opts)
    return result

  self.loss_names = [
      'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac',
      'curiosity_loss'
  ]
  if self.rlb_model:
    self.loss_names.append('rlb_loss')
    self.stats_histo_names = sorted(list(rlb_out.stats_histo.__dict__.keys()))
    self.stats_sc_names = sorted(list(rlb_out.stats_sc.__dict__.keys()))
  else:
    self.stats_histo_names = []
    self.stats_sc_names = []

  def save(save_path):
    ps = sess.run(params)
    with tf.gfile.Open(save_path, 'wb') as fh:
      fh.write(dill.dumps(ps))
  def load(load_path):
    with tf.gfile.Open(load_path, 'rb') as fh:
      val = fh.read()
    loaded_params = dill.loads(val)
    restores = []
    for p, loaded_p in zip(params, loaded_params):
      restores.append(p.assign(loaded_p))
    sess.run(restores)
    # If you want to load weights, also save/load observation scaling inside
    # VecNormalize.

  self.getIntrinsicReward = getIntrinsicReward
  self.train = train
  self.train_model = train_model
  self.act_model = act_model
  self.step = act_model.step
  self.value = act_model.value
  self.initial_state = act_model.initial_state
  self.save = save
  self.load = load
  tf.global_variables_initializer().run(session=sess)  # pylint: disable=E1101
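# Illustrative sketch (not part of the original source): a numpy analogue of
# the clipped PPO objective built in the TF graph above, useful for sanity
# checking the loss terms on small arrays. All inputs are assumed to be 1-D
# float arrays of equal length.
import numpy as np

def ppo_losses_np(neglogpac, old_neglogpac, advs, returns, vpred, old_vpred,
                  cliprange):
  """Returns (pg_loss, vf_loss, approxkl, clipfrac) as in the graph above."""
  ratio = np.exp(old_neglogpac - neglogpac)
  pg_loss = np.mean(np.maximum(
      -advs * ratio,
      -advs * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)))
  vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
  vf_loss = .5 * np.mean(np.maximum(np.square(vpred - returns),
                                    np.square(vpred_clipped - returns)))
  approxkl = .5 * np.mean(np.square(neglogpac - old_neglogpac))
  clipfrac = np.mean((np.abs(ratio - 1.0) > cliprange).astype(np.float32))
  return pg_loss, vf_loss, approxkl, clipfrac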
def create_environments_with_rlb(env_name,
                                 num_envs,
                                 dmlab_homepath='',
                                 action_set='',
                                 base_seed=123,
                                 scale_task_reward_for_eval=1.0,
                                 scale_surrogate_reward_for_eval=0.0,
                                 online_r_training=False,
                                 environment_engine='dmlab',
                                 r_network_weights_store_path='',
                                 level_cache_mode=False,
                                 rlb_image_size=(84, 84)):
  """Creates environments with an RLB-based curiosity reward.

  Args:
    env_name: Name of the DMLab environment.
    num_envs: Number of parallel environments to spawn.
    dmlab_homepath: Path to the DMLab MPM. Required when running on borg.
    action_set: One of {'small', 'nofire', ''}. Which action set to use.
    base_seed: Each environment will use base_seed+env_index as seed.
    scale_task_reward_for_eval: Scale of the task reward to be used for
      valid/test environments.
    scale_surrogate_reward_for_eval: Scale of the surrogate reward to be used
      for valid/test environments.
    online_r_training: Whether to enable online training of the R-network.
    environment_engine: One of 'dmlab', 'atari', 'parkour'.
    r_network_weights_store_path: Directory where to store R checkpoints
      generated during online training of the R-network.
    level_cache_mode: If True, the process only generates and caches DMLab
      levels, then idles.
    rlb_image_size: (height, width) of the observations fed to the RLB model.

  Returns:
    A tuple (train, valid, test) of environments wrapped with the RLB
    curiosity reward.
  """
  # Environments without intrinsic exploration rewards.
  # pylint: disable=g-long-lambda
  create_dmlab_single_env = functools.partial(
      create_single_env,
      dmlab_homepath=dmlab_homepath,
      action_set=action_set,
      level_cache_mode=level_cache_mode)

  if environment_engine == 'dmlab':
    create_env_fn = create_dmlab_single_env
    is_atari_environment = False
  elif environment_engine == 'atari':
    create_env_fn = create_single_atari_env
    is_atari_environment = True
  elif environment_engine == 'parkour':
    mujoco_key_path = ''
    create_env_fn = functools.partial(
        create_single_parkour_env, mujoco_key_path=mujoco_key_path)
    is_atari_environment = False
  else:
    raise ValueError('Unknown env engine {}'.format(environment_engine))

  VecEnvClass = (subproc_vec_env.SubprocVecEnv
                 if FLAGS.vec_env_class == 'SubprocVecEnv'
                 else threaded_vec_env.ThreadedVecEnv)

  with logger.ProfileKV('create_envs'):
    vec_env = VecEnvClass([
        (lambda _i=i: create_env_fn(
            env_name, base_seed + _i, use_monitor=True, split='train'))
        for i in range(num_envs)
    ], level_cache_mode=level_cache_mode)
    valid_env = VecEnvClass([
        (lambda _i=i: create_env_fn(
            env_name, base_seed + _i, use_monitor=False, split='valid'))
        for i in range(num_envs)
    ], level_cache_mode=level_cache_mode)
    test_env = VecEnvClass([
        (lambda _i=i: create_env_fn(
            env_name, base_seed + _i, use_monitor=False, split='test'))
        for i in range(num_envs)
    ], level_cache_mode=level_cache_mode)
  if level_cache_mode:
    logger.info('Starting the infinite map generation sequence...')
    import time
    while True:
      time.sleep(10)
  # pylint: enable=g-long-lambda

  rlb_image_shape = tuple(rlb_image_size) + (
      (4 if is_atari_environment else 3),)

  rlb_model_wrapper = RLBModelWrapper(
      input_shape=rlb_image_shape,
      action_space=vec_env.action_space,
      max_grad_norm=0.5)
  rlb_model_trainer = RLBTrainer(
      rlb_model_wrapper, ensure_train_between_episodes=True)

  embedding_size = rlb_model_wrapper.rlb_all_z_dim
  vec_episodic_memory = [
      RLBEpisodicMemory(
          observation_shape=[embedding_size],
          replacement=rlb_model_wrapper.all_rlb_args.outer_args[
              'rlb_ot_memory_algo'],
          capacity=rlb_model_wrapper.all_rlb_args.outer_args[
              'rlb_ot_memory_capacity'])
      for _ in range(num_envs)
  ]

  exploration_reward_min_step = rlb_model_wrapper.all_rlb_args.outer_args[
      'rlb_ot_exploration_min_step']
  if exploration_reward_min_step < 0:
    exploration_reward_min_step = rlb_model_trainer.training_interval

  env_wrapper = RLBEnvWrapper(
      vec_env=vec_env,
      vec_episodic_memory=vec_episodic_memory,
      observation_embedding_fn=rlb_model_wrapper.embed_observation,
      intrinsic_reward_fn=rlb_model_wrapper.compute_intrinsic_rewards,
      rlb_image_shape=rlb_image_shape,
      #target_image_shape=None,
      target_image_shape=[84, 84, 4 if is_atari_environment else 3],
      exploration_reward='rlb',
      scale_surrogate_reward=rlb_model_wrapper.all_rlb_args.outer_args[
          'rlb_ir_weight'],
      ir_normalize_type=rlb_model_wrapper.all_rlb_args.outer_args[
          'rlb_normalize_ir'],
      ir_clip_low=rlb_model_wrapper.all_rlb_args.outer_args[
          'rlb_ot_ir_clip_low'],
      exploration_reward_min_step=exploration_reward_min_step,
      name='train')
  if rlb_model_trainer is not None:
    env_wrapper.add_observer(rlb_model_trainer)

  valid_env_wrapper, test_env_wrapper = (
      RLBEnvWrapper(
          vec_env=env,
          vec_episodic_memory=None,
          observation_embedding_fn=None,
          intrinsic_reward_fn=None,
          rlb_image_shape=None,
          target_image_shape=[84, 84, 4 if is_atari_environment else 3],
          exploration_reward=('none' if (is_atari_environment or
                                         environment_engine == 'parkour')
                              else 'oracle'),
          scale_task_reward=scale_task_reward_for_eval,
          scale_surrogate_reward=scale_surrogate_reward_for_eval,
          name=name)
      for env, name in [(valid_env, 'valid'), (test_env, 'test')])

  return env_wrapper, valid_env_wrapper, test_env_wrapper
def __init__(self,
             input_shape,
             action_space,
             max_grad_norm=0.5):
  """Inits the RLBModelWrapper.

  Args:
    input_shape: (height, width, channels) of the observations fed to the
      RLB model.
    action_space: Action space of the environment.
    max_grad_norm: Gradient norm clipping threshold (None disables clipping).
  """
  self.input_shape = input_shape
  self.all_rlb_args = get_rlb_args()
  trainer = tf.train.AdamOptimizer(
      learning_rate=self.all_rlb_args.outer_args['rlb_ot_lr'])

  policy_pdtype = make_pdtype(action_space)
  self.policy_pdtype = policy_pdtype

  train_batch_size = self.all_rlb_args.outer_args['rlb_ot_batch_size']
  ph_obs = tf.placeholder(shape=(train_batch_size,) + input_shape,
                          dtype=tf.uint8, name='obs')
  ph_obs_next = tf.placeholder(shape=(train_batch_size,) + input_shape,
                               dtype=tf.uint8, name='obs_next')
  ph_acs = policy_pdtype.sample_placeholder([train_batch_size])

  ph_emb_net_obs = tf.placeholder(shape=(None,) + input_shape,
                                  dtype=tf.uint8, name='emb_net_obs')

  self.rlb_all_z_dim = (self.all_rlb_args.inner_args['rlb_z_dim'] *
                        self.all_rlb_args.inner_args['rlb_num_z_variables'])
  ph_epimem_ir_emb_memory = tf.placeholder(
      shape=(None, None, self.rlb_all_z_dim),
      dtype=tf.float32, name='epimem_ir_emb_memory')
  ph_epimem_ir_emb_target = tf.placeholder(
      shape=(None, None, self.rlb_all_z_dim),
      dtype=tf.float32, name='epimem_ir_emb_target')

  rlb_scope = 'rlb_model'
  self._rlb_model = tf.make_template(
      rlb_scope, define_rlb_model,
      create_scope_now_=True,
      pdtype=policy_pdtype,
      ac_space=action_space,
      optimizer=trainer,
      outer_scope=rlb_scope,
      **self.all_rlb_args.inner_args)

  rlb_train_extra_kwargs = dict()
  rlb_train_out = self._rlb_model(
      ph_set=construct_ph_set(
          x=ph_obs,
          x_next=ph_obs_next,
          a=ph_acs),
      ph_set_for_embedding_net=None,
      ph_set_for_epimem_ir=None,
      **rlb_train_extra_kwargs)
  loss = rlb_train_out.aux_loss

  self._loss_names = ['rlb_loss']
  self._stats_histo_names = sorted(
      list(rlb_train_out.stats_histo.__dict__.keys()))
  self._stats_sc_names = sorted(
      list(rlb_train_out.stats_sc.__dict__.keys()))

  params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                             scope=rlb_scope)
  logger.info('RLBModelWrapper, {} trainable parameters: {}'.format(
      len(params), [p.name for p in params]))
  grads = tf.gradients(loss, params)
  grads_raw_global_norm = tf.global_norm(grads)
  if max_grad_norm is not None:
    grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    grads_clipped_global_norm = tf.global_norm(grads)
  grads = list(zip(grads, params))
  train_op = trainer.apply_gradients(grads)

  def _train(obs, obs_next, acs, gather_histo=False, gather_sc=False):
    fetches = {
        'train': train_op,
        'losses': [loss],
    }
    if gather_histo:
      fetches['stats_histo'] = {
          n: getattr(rlb_train_out.stats_histo, n)
          for n in self._stats_histo_names
      }
    if gather_sc:
      fetches['stats_sc'] = {
          n: getattr(rlb_train_out.stats_sc, n)
          for n in self._stats_sc_names
      }
      fetches['additional_sc'] = {
          'rlb_grads_raw_global_norm': grads_raw_global_norm,
      }
      if max_grad_norm is not None:
        fetches['additional_sc'].update({
            'rlb_grads_clipped_global_norm': grads_clipped_global_norm,
        })
    sess = tf.get_default_session()
    result = sess.run(fetches,
                      {ph_obs: obs, ph_obs_next: obs_next, ph_acs: acs})
    return result
  self._train = _train

  rlb_eval_extra_kwargs = dict()
  embedding_output = self._rlb_model(
      ph_set=None,
      ph_set_for_embedding_net=construct_ph_set_for_embedding_net(
          ph_emb_net_obs),
      ph_set_for_epimem_ir=None,
      **rlb_eval_extra_kwargs).z

  def _embedding_network(obs):
    sess = tf.get_default_session()
    return sess.run(embedding_output, {ph_emb_net_obs: obs})
  self._embedding_network = _embedding_network

  epimem_ir_output = self._rlb_model(
      ph_set=None,
      ph_set_for_embedding_net=None,
      ph_set_for_epimem_ir=construct_ph_set_for_epimem_ir(
          ph_epimem_ir_emb_memory, ph_epimem_ir_emb_target),
      **rlb_eval_extra_kwargs).epimem_ir

  def _ir_network(memory, x):
    sess = tf.get_default_session()
    ir = sess.run(epimem_ir_output,
                  {ph_epimem_ir_emb_memory: memory,
                   ph_epimem_ir_emb_target: x})
    # Don't multiply the IR weight here since it will be normalized in
    # RLBEnvWrapper.
    #ir = ir * self.all_rlb_args.outer_args['rlb_ir_weight']
    return ir
  self._ir_network = _ir_network
def __init__(self,
             vec_env,
             vec_episodic_memory,
             observation_embedding_fn,
             intrinsic_reward_fn,
             rlb_image_shape,
             target_image_shape,
             exploration_reward='rlb',
             scale_task_reward=1.0,
             scale_surrogate_reward=None,
             exploration_reward_min_step=0,
             ir_normalize_type=0,
             ir_clip_low=None,
             name=''):
  logger.info('RLBEnvWrapper args: {}'.format(locals()))
  if exploration_reward == 'rlb':
    if len(vec_episodic_memory) != vec_env.num_envs:
      raise ValueError('Each env must have a unique episodic memory.')

  if target_image_shape is None:
    target_image_shape = rlb_image_shape

  if self._should_process_observation(vec_env.observation_space.shape):
    observation_space_shape = target_image_shape[:]
    observation_space = gym.spaces.Box(
        low=0, high=255, shape=observation_space_shape, dtype=np.float)
  else:
    observation_space = vec_env.observation_space
  VecEnvWrapper.__init__(self, vec_env, observation_space=observation_space)

  self._vec_episodic_memory = vec_episodic_memory
  self._observation_embedding_fn = observation_embedding_fn
  self._intrinsic_reward_fn = intrinsic_reward_fn
  self._rlb_image_shape = rlb_image_shape
  self._target_image_shape = target_image_shape
  self._exploration_reward = exploration_reward
  self._scale_task_reward = scale_task_reward
  self._scale_surrogate_reward = scale_surrogate_reward
  self._exploration_reward_min_step = exploration_reward_min_step

  # Oracle reward.
  self._oracles = [
      oracle.OracleExplorationReward() for _ in range(self.venv.num_envs)
  ]

  self._ir_normalize_type = ir_normalize_type
  if self._ir_normalize_type == 0:
    pass
  elif self._ir_normalize_type == 1:
    ir_normalize_gamma = 0.99
    self._irff = RewardForwardFilter(ir_normalize_gamma)
    self._irff_rms = RunningMeanStd()
  elif self._ir_normalize_type == 2:
    self._ir_rms = RunningMeanStd()
  elif self._ir_normalize_type == 3:
    self._ir_rms = SimpleWeightedMovingScalarMeanStd(alpha=0.0001)
  else:
    assert False

  self._ir_clip_low = ir_clip_low

  self._name = name

  # Cumulative task reward over an episode.
  self._episode_task_reward = [0.0] * self.venv.num_envs
  self._episode_bonus_reward = [0.0] * self.venv.num_envs

  # Stats on the task and exploration reward.
  self._stats_task_reward = MovingAverage(capacity=100)
  self._stats_bonus_reward = MovingAverage(capacity=100)

  # Total number of steps so far per environment.
  self._step_count = 0

  # Observers are notified each time a new time step is generated by the
  # environment.
  self._observers = []

  self._bonus_reward_raw_history = [[] for _ in range(self.venv.num_envs)]
  self._bonus_reward_history = [[] for _ in range(self.venv.num_envs)]
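# Illustrative sketch (not part of the original source): for
# ir_normalize_type == 1 the wrapper keeps a discounted filter of the raw
# bonus and tracks its running statistics, in the style of the
# RewardForwardFilter / RunningMeanStd helpers referenced above. The toy
# class below is an assumption about that scheme, written as a standalone
# numpy example rather than the actual helpers used by this codebase.
import numpy as np

class _ToyRewardNormalizer(object):
  """Keeps a discounted sum of rewards and scales rewards by its running std."""

  def __init__(self, gamma=0.99):
    self._gamma = gamma
    self._rewems = None  # Per-env discounted reward sums.
    self._count = 1e-4
    self._mean = 0.0
    self._var = 1.0

  def normalize(self, rewards):
    rewards = np.asarray(rewards, dtype=np.float64)
    if self._rewems is None:
      self._rewems = np.zeros_like(rewards)
    self._rewems = self._rewems * self._gamma + rewards
    # Update running mean/variance of the discounted sums (parallel update).
    batch_mean = self._rewems.mean()
    batch_var = self._rewems.var()
    batch_count = len(self._rewems)
    delta = batch_mean - self._mean
    tot = self._count + batch_count
    self._mean += delta * batch_count / tot
    self._var = (self._var * self._count + batch_var * batch_count +
                 np.square(delta) * self._count * batch_count / tot) / tot
    self._count = tot
    # Scale the raw rewards by the running std of the discounted sums.
    return rewards / np.sqrt(self._var + 1e-8)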