Example #1
import numpy as np

def display_var_info(vars):
    from third_party.baselines import logger
    count_params = 0
    for v in vars:
        name = v.name
        if "/Adam" in name or "beta1_power" in name or "beta2_power" in name:
            continue
        v_params = np.prod(v.shape.as_list())
        count_params += v_params
        if "/b:" in name or "/biases" in name:
            continue  # Wx+b, bias is not interesting to look at => count params, but not print
        logger.info("   %s%s %i params %s" %
                    (name, " " * (55 - len(name)), v_params, str(v.shape)))

    logger.info("Total model parameters: %0.2f million" %
                (count_params * 1e-6))
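
A minimal usage sketch for display_var_info (illustrative only: the variable names and shapes are made up, and the call assumes the third_party.baselines logger is importable in your environment):

# Hedged usage sketch; not part of the original module.
import tensorflow as tf

with tf.Graph().as_default():
    tf.get_variable('conv1/w', shape=[8, 8, 4, 32])  # printed with its parameter count
    tf.get_variable('conv1/b', shape=[32])           # bias: counted, but not printed
    display_var_info(tf.trainable_variables())       # logs per-variable and total counts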
  def train(self, batch_gen, steps_per_epoch, num_epochs):
    mblossvals = []
    mbhistos = []
    mbscs = []
    mbascs = []
    for epoch in range(num_epochs):
      gather_histo = (epoch == num_epochs - 1)
      for step in range(steps_per_epoch):
        gather_sc = ((epoch == num_epochs - 1) and (step == steps_per_epoch - 1))
        obs, obs_next, acs = next(batch_gen)
        with logger.ProfileKV('train_ot_inner'):
          fetches = self._train(
              obs, obs_next, acs,
              gather_histo=gather_histo, gather_sc=gather_sc)
        mblossvals.append(fetches['losses'])
        if gather_histo:
          mbhistos.append(fetches['stats_histo'])
        if gather_sc:
          mbscs.append(fetches['stats_sc'])
          mbascs.append(fetches['additional_sc'])

    lossvals = np.mean(mblossvals, axis=0)
    assert len(mbscs) == 1
    assert len(mbascs) == 1
    scalars = mbscs[0]
    additional_scalars = mbascs[0]
    histograms = { n: np.concatenate([f[n] for f in mbhistos], axis=0) for n in self._stats_histo_names }
    logger.info('RLBModelWrapper.train histograms: {}'.format([(n, histograms[n].shape) for n in histograms.keys()]))

    for (lossval, lossname) in zip(lossvals, self._loss_names):
      logger.logkv(lossname, lossval)

    for n, v in scalars.items():
      logger.logkv(n, v)

    for n, v in additional_scalars.items():
      logger.logkv(n, v)

    for n, v in histograms.items():
      logger.logkv(n, v)
      logger.logkv('mean_' + n, np.mean(v))
      logger.logkv('std_' + n, np.std(v))
      logger.logkv('max_' + n, np.max(v))
      logger.logkv('min_' + n, np.min(v))
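
As a sketch of the interface this loop expects, batch_gen can be any generator yielding (obs, obs_next, acs) triples; the stub below is illustrative only (shapes, dtypes, and the wrapper call are assumptions):

# Illustrative stub; not part of the original trainer.
import numpy as np

def dummy_batch_gen(batch_size=32, obs_shape=(84, 84, 3), num_actions=4):
  """Yields endless random (obs, obs_next, acs) batches matching the loop above."""
  while True:
    obs = np.random.randint(0, 256, size=(batch_size,) + obs_shape, dtype=np.uint8)
    obs_next = np.random.randint(0, 256, size=(batch_size,) + obs_shape, dtype=np.uint8)
    acs = np.random.randint(0, num_actions, size=(batch_size,))
    yield obs, obs_next, acs

# Hypothetical call on an RLBModelWrapper instance:
# model_wrapper.train(dummy_batch_gen(), steps_per_epoch=100, num_epochs=4)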
Example #3
    def _generate_batch(self, x1, *data):
        """Generate batches of data used to train the R network."""
        logger.info(
            'RLBTrainer._generate_batch. # batches per epoch: {}'.format(
                len(x1) // self._batch_size))
        while True:
            # Train for one epoch.
            sample_count = len(x1)
            number_of_batches = sample_count // self._batch_size
            for batch_index in range(number_of_batches):
                from_index = batch_index * self._batch_size
                to_index = (batch_index + 1) * self._batch_size
                yield (np.array(x1[from_index:to_index]), ) + tuple(
                    np.array(d[from_index:to_index]) for d in data)

            # After each epoch, shuffle the data.
            res = self._shuffle(x1, *data)
            x1 = res[0]
            data = res[1:]
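
The loop above relies on self._shuffle returning all arrays re-ordered with one shared permutation; a compatible implementation could look like the following sketch (an assumption, not this project's code):

import numpy as np

def _shuffle(self, x1, *data):
    """Shuffle x1 and every companion array with a single shared permutation (sketch)."""
    perm = np.random.permutation(len(x1))
    return (np.asarray(x1)[perm],) + tuple(np.asarray(d)[perm] for d in data)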
Example #4
def create_environments(env_name,
                        num_envs,
                        r_network_weights_path = None,
                        dmlab_homepath = '',
                        action_set = '',
                        base_seed = 123,
                        scale_task_reward_for_eval = 1.0,
                        scale_surrogate_reward_for_eval = 0.0,
                        online_r_training = False,
                        environment_engine = 'dmlab',
                        r_network_weights_store_path = '',
                        level_cache_mode=False):
  """Creates a environments with R-network-based curiosity reward.

  Args:
    env_name: Name of the DMLab environment.
    num_envs: Number of parallel environments to spawn.
    r_network_weights_path: Path to the weights of the R-network.
    dmlab_homepath: Path to the DMLab MPM. Required when running on borg.
    action_set: One of {'small', 'nofire', ''}. Which action set to use.
    base_seed: Each environment will use base_seed+env_index as seed.
    scale_task_reward_for_eval: scale of the task reward to be used for
      valid/test environments.
    scale_surrogate_reward_for_eval: scale of the surrogate reward to be used
      for valid/test environments.
    online_r_training: Whether to enable online training of the R-network.
    environment_engine: either 'dmlab', 'atari', 'parkour'.
    r_network_weights_store_path: Directory where to store R checkpoints
      generated during online training of the R network.

  Returns:
    A (train, valid, test) tuple of curiosity-wrapped environments.
  """
  # Environments without intrinsic exploration rewards.
  # pylint: disable=g-long-lambda
  create_dmlab_single_env = functools.partial(create_single_env,
                                              dmlab_homepath=dmlab_homepath,
                                              action_set=action_set,
                                              level_cache_mode=level_cache_mode)

  if environment_engine == 'dmlab':
    create_env_fn = create_dmlab_single_env
    is_atari_environment = False
  elif environment_engine == 'atari':
    create_env_fn = create_single_atari_env
    is_atari_environment = True
  elif environment_engine == 'parkour':
    mujoco_key_path = ''
    create_env_fn = functools.partial(
        create_single_parkour_env, mujoco_key_path=mujoco_key_path)
    is_atari_environment = False
  else:
    raise ValueError('Unknown env engine {}'.format(environment_engine))

  # WARNING: python processes are not really compatible with other google3 code,
  # which can lead to deadlock. See go/g3process. This is why you may want to
  # use ThreadedVecEnv instead.
  VecEnvClass = (subproc_vec_env.SubprocVecEnv
                 if FLAGS.vec_env_class == 'SubprocVecEnv'
                 else threaded_vec_env.ThreadedVecEnv)

  with logger.ProfileKV('create_envs'):
    vec_env = VecEnvClass([
        (lambda _i=i: create_env_fn(env_name, base_seed + _i, use_monitor=True,
                                    split='train'))
        for i in range(num_envs)
    ], level_cache_mode=level_cache_mode)
    valid_env = VecEnvClass([
        (lambda _i=i: create_env_fn(env_name, base_seed + _i, use_monitor=False,
                                    split='valid'))
        for i in range(num_envs)
    ], level_cache_mode=level_cache_mode)
    test_env = VecEnvClass([
        (lambda _i=i: create_env_fn(env_name, base_seed + _i, use_monitor=False,
                                    split='test'))
        for i in range(num_envs)
    ], level_cache_mode=level_cache_mode)
  if level_cache_mode:
    #logger.info('Starting the infinite map generation sequence...')
    logger.info('Starting the finite map generation sequence...')
    import time
    while True:
      time.sleep(10)

  # pylint: enable=g-long-lambda

  # Size of states when stored in the memory.
  embedding_size = models.EMBEDDING_DIM

  if not r_network_weights_path:
    # Empty string equivalent to no R_network checkpoint.
    r_network_weights_path = None
  r_net = r_network.RNetwork(
      (84, 84, 4) if is_atari_environment else Const.OBSERVATION_SHAPE,
      r_network_weights_path)

  # Only for online training do we need to train the R-network.
  r_network_trainer = None
  if online_r_training:
    r_network_trainer = r_network_training.RNetworkTrainer(
        r_net._r_network,  # pylint: disable=protected-access
        checkpoint_dir=r_network_weights_store_path)

  # Creates the episodic memory that is attached to each of those envs.
  vec_episodic_memory = [
      episodic_memory.EpisodicMemory(
          observation_shape=[embedding_size],
          observation_compare_fn=r_net.embedding_similarity)
      for _ in range(num_envs)
  ]

  # Observations are resized to the target_image_shape below (84x84).
  # Note: using color images with DMLab makes it much easier to train a policy.
  # So no conversion to grayscale.
  target_image_shape = [84, 84, 4 if is_atari_environment else 3]
  env_wrapper = curiosity_env_wrapper.CuriosityEnvWrapper(
      vec_env, vec_episodic_memory, r_net.embed_observation, target_image_shape)
  if r_network_trainer is not None:
    env_wrapper.add_observer(r_network_trainer)

  valid_env_wrapper, test_env_wrapper = (
      curiosity_env_wrapper.CuriosityEnvWrapper(
          env, vec_episodic_memory, r_net.embed_observation,
          target_image_shape,
          exploration_reward=('none' if (is_atari_environment or
                                         environment_engine == 'parkour')
                              else 'oracle'),
          scale_task_reward=scale_task_reward_for_eval,
          scale_surrogate_reward=scale_surrogate_reward_for_eval)
      for env in [valid_env, test_env])

  return env_wrapper, valid_env_wrapper, test_env_wrapper
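
An illustrative invocation (the level name, environment count, and checkpoint path below are placeholders, not defaults from this module):

# Hypothetical call; requires DMLab and an R-network checkpoint to be available.
if __name__ == '__main__':
  train_env, valid_env, test_env = create_environments(
      env_name='explore_goal_locations_small',         # placeholder DMLab level
      num_envs=12,
      r_network_weights_path='/path/to/r_network.h5',  # placeholder checkpoint path
      environment_engine='dmlab')
  first_obs = train_env.reset()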
Example #5
def learn(policy,
          env,
          nsteps,
          total_timesteps,
          ent_coef,
          lr,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          train_callback=None,
          eval_callback=None,
          cloud_sync_callback=None,
          cloud_sync_interval=1000,
          workdir='',
          use_curiosity=False,
          curiosity_strength=0.01,
          forward_inverse_ratio=0.2,
          curiosity_loss_strength=10,
          random_state_predictor=False,
          use_rlb=False,
          checkpoint_path_for_debugging=None):
    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # pylint: disable=g-long-lambda
    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm,
                               use_curiosity=use_curiosity,
                               curiosity_strength=curiosity_strength,
                               forward_inverse_ratio=forward_inverse_ratio,
                               curiosity_loss_strength=curiosity_loss_strength,
                               random_state_predictor=random_state_predictor,
                               use_rlb=use_rlb)
    # pylint: enable=g-long-lambda
    if save_interval and workdir:
        with tf.gfile.Open(osp.join(workdir, 'make_model.pkl'), 'wb') as fh:
            fh.write(dill.dumps(make_model))
        saver = tf.train.Saver(max_to_keep=10000000)

        def save_state(fname):
            if not osp.exists(osp.dirname(fname)):
                os.makedirs(osp.dirname(fname))
            saver.save(tf.get_default_session(), fname)

    with tf.device('/gpu:0'):
        model = make_model()
    if load_path is not None:
        model.load(load_path)
    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    eval_callback=eval_callback)

    if checkpoint_path_for_debugging is not None:
        tf_util.load_state(checkpoint_path_for_debugging,
                           var_list=tf.get_collection(
                               tf.GraphKeys.GLOBAL_VARIABLES,
                               scope='rlb_model'))

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        (obs, next_obs, returns, masks, actions, values,
         neglogpacs), states, epinfos, (rewards, rewards_ext, rewards_int,
                                        rewards_int_raw, selected_infos,
                                        dones) = runner.run()
        epinfobuf.extend(epinfos)
        mblossvals = []
        mbhistos = []
        mbscs = []

        #if model.all_rlb_args.debug_args['debug_tf_timeline'] and update % 5 == 0:
        if model.all_rlb_args.debug_args[
                'debug_tf_timeline'] and update % 1 == 0:
            debug_timeliner = logger.TimeLiner()
        else:
            debug_timeliner = None

        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for oe in range(noptepochs):
                gather_histo = (oe == noptepochs - 1)
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    gather_sc = ((oe == noptepochs - 1)
                                 and (start + nbatch_train >= nbatch))
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = [
                        arr[mbinds] for arr in (obs, returns, masks, actions,
                                                values, neglogpacs, next_obs)
                    ]
                    with logger.ProfileKV('train'):
                        fetches = model.train(lrnow,
                                              cliprangenow,
                                              slices[0],
                                              slices[6],
                                              slices[1],
                                              slices[2],
                                              slices[3],
                                              slices[4],
                                              slices[5],
                                              gather_histo=gather_histo,
                                              gather_sc=gather_sc,
                                              debug_timeliner=debug_timeliner)
                    mblossvals.append(fetches['losses'])
                    if gather_histo:
                        mbhistos.append(fetches['stats_histo'])
                    if gather_sc:
                        mbscs.append(fetches['stats_sc'])
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for oe in range(noptepochs):
                gather_histo = (oe == noptepochs - 1)
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    gather_sc = ((oe == noptepochs - 1)
                                 and (start + nbatch_train >= nbatch))
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = [
                        arr[mbflatinds]
                        for arr in (obs, returns, masks, actions, values,
                                    neglogpacs, next_obs)
                    ]
                    mbstates = states[mbenvinds]
                    fetches = model.train(lrnow,
                                          cliprangenow,
                                          slices[0],
                                          slices[6],
                                          slices[1],
                                          slices[2],
                                          slices[3],
                                          slices[4],
                                          slices[5],
                                          mbstates,
                                          gather_histo=gather_histo,
                                          gather_sc=gather_sc,
                                          debug_timeliner=debug_timeliner)
                    mblossvals.append(fetches['losses'])
                    if gather_histo:
                        mbhistos.append(fetches['stats_histo'])
                    if gather_sc:
                        mbscs.append(fetches['stats_sc'])

        if debug_timeliner is not None:
            with logger.ProfileKV("save_timeline_json"):
                debug_timeliner.save(
                    osp.join(workdir, 'timeline_{}.json'.format(update)))

        lossvals = np.mean(mblossvals, axis=0)
        assert len(mbscs) == 1
        scalars = mbscs[0]
        histograms = {
            n: np.concatenate([f[n] for f in mbhistos], axis=0)
            for n in model.stats_histo_names
        }
        logger.info('Histograms: {}'.format([(n, histograms[n].shape)
                                             for n in histograms.keys()]))
        #for v in histograms.values():
        #  assert len(v) == nbatch
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            fps_total = int((update * nbatch) / (tnow - tfirststart))

            #tf_op_names = [i.name for i in tf.get_default_graph().get_operations()]
            #logger.info('#################### tf_op_names: {}'.format(tf_op_names))
            tf_num_ops = len(tf.get_default_graph().get_operations())
            logger.info(
                '#################### tf_num_ops: {}'.format(tf_num_ops))
            logger.logkv('tf_num_ops', tf_num_ops)
            ev = explained_variance(values, returns)
            logger.logkv('serial_timesteps', update * nsteps)
            logger.logkv('nupdates', update)
            logger.logkv('total_timesteps', update * nbatch)
            logger.logkv('fps', fps)
            logger.logkv('fps_total', fps_total)
            logger.logkv(
                'remaining_time',
                float(tnow - tfirststart) / float(update) *
                float(nupdates - update))
            logger.logkv('explained_variance', float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if train_callback:
                train_callback(safemean([epinfo['l'] for epinfo in epinfobuf]),
                               safemean([epinfo['r'] for epinfo in epinfobuf]),
                               update * nbatch)
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)

            for n, v in scalars.items():
                logger.logkv(n, v)

            for n, v in histograms.items():
                logger.logkv(n, v)
                logger.logkv('mean_' + n, np.mean(v))
                logger.logkv('std_' + n, np.std(v))
                logger.logkv('max_' + n, np.max(v))
                logger.logkv('min_' + n, np.min(v))

            for n, v in locals().items():
                if n in ['rewards_int', 'rewards_int_raw']:
                    logger.logkv(n, v)
                if n in [
                        'rewards', 'rewards_ext', 'rewards_int',
                        'rewards_int_raw'
                ]:
                    logger.logkv('mean_' + n, np.mean(v))
                    logger.logkv('std_' + n, np.std(v))
                    logger.logkv('max_' + n, np.max(v))
                    logger.logkv('min_' + n, np.min(v))

            if model.rlb_model:
                if model.all_rlb_args.outer_args['rlb_normalize_ir']:
                    logger.logkv('rlb_ir_running_mean', runner.irff_rms.mean)
                    logger.logkv('rlb_ir_running_std',
                                 np.sqrt(runner.irff_rms.var))

            logger.dumpkvs()
        if (save_interval and (update % save_interval == 0 or update == 1)
                and workdir):
            checkdir = osp.join(workdir, 'checkpoints')
            if not tf.gfile.Exists(checkdir):
                tf.gfile.MakeDirs(checkdir)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

            checkdir = osp.join(workdir, 'full_checkpoints')
            if not tf.gfile.Exists(checkdir):
                tf.gfile.MakeDirs(checkdir)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            save_state(savepath)
        if (cloud_sync_interval and update % cloud_sync_interval == 0
                and cloud_sync_callback):
            cloud_sync_callback()
    env.close()
    return model
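
A hedged sketch of wiring learn up to a wrapped environment (the policy class, workdir, and hyperparameter values are illustrative assumptions, not defaults of this function):

# Hypothetical driver; assumes a baselines-style CNN policy class and an active TF1 session.
# model = learn(policy=CnnPolicy,              # placeholder policy constructor
#               env=train_env,                 # e.g. the curiosity-wrapped training env
#               nsteps=128, total_timesteps=20_000_000,
#               ent_coef=0.01, lr=2.5e-4, cliprange=0.1,
#               use_rlb=True, workdir='/tmp/ppo_rlb')  # placeholder workdir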
Example #6
    def __init__(self, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm, use_curiosity,
                 curiosity_strength, forward_inverse_ratio,
                 curiosity_loss_strength, random_state_predictor, use_rlb):
        sess = tf.get_default_session()

        nenvs = nbatch_act
        act_model = policy(sess,
                           ob_space,
                           ac_space,
                           nbatch_act,
                           1,
                           reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nbatch_train,
                             nsteps,
                             reuse=True)

        assert not (use_curiosity and use_rlb)

        if use_curiosity:
            hidden_layer_size = 256
            self.state_encoder_net = tf.make_template(
                'state_encoder_net',
                pathak_utils.universeHead,
                create_scope_now_=True,
                trainable=(not random_state_predictor))
            self.icm_forward_net = tf.make_template(
                'icm_forward',
                pathak_utils.icm_forward_model,
                create_scope_now_=True,
                num_actions=ac_space.n,
                hidden_layer_size=hidden_layer_size)
            self.icm_inverse_net = tf.make_template(
                'icm_inverse',
                pathak_utils.icm_inverse_model,
                create_scope_now_=True,
                num_actions=ac_space.n,
                hidden_layer_size=hidden_layer_size)
        else:
            self.state_encoder_net = None
            self.icm_forward_net = None
            self.icm_inverse_net = None

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])
        # When computing intrinsic reward a different batch size is used (number
        # of parallel environments), thus we need to define separate
        # placeholders for them.
        X_NEXT, _ = observation_input(ob_space, nbatch_train)
        X_INTRINSIC_NEXT, _ = observation_input(ob_space, nbatch_act)
        X_INTRINSIC_CURRENT, _ = observation_input(ob_space, nbatch_act)

        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

        self.all_rlb_args = get_rlb_args()
        if use_rlb:
            rlb_scope = 'rlb_model'
            #rlb_ir_weight = self.all_rlb_args.outer_args['rlb_ir_weight']
            rlb_loss_weight = self.all_rlb_args.outer_args['rlb_loss_weight']
            self.rlb_model = tf.make_template(
                rlb_scope,
                define_rlb_model,
                create_scope_now_=True,
                pdtype=train_model.pdtype,
                ac_space=ac_space,
                #nenvs=nenvs,
                optimizer=trainer,
                outer_scope=rlb_scope,
                **self.all_rlb_args.inner_args)
        else:
            self.rlb_model = None

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        curiosity_loss = self.compute_curiosity_loss(
            use_curiosity,
            train_model.X,
            A,
            X_NEXT,
            forward_inverse_ratio=forward_inverse_ratio,
            curiosity_loss_strength=curiosity_loss_strength)
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + curiosity_loss

        if use_curiosity:
            encoded_time_step = self.state_encoder_net(X_INTRINSIC_CURRENT)
            encoded_next_time_step = self.state_encoder_net(X_INTRINSIC_NEXT)
            intrinsic_reward = self.curiosity_forward_model_loss(
                encoded_time_step, A, encoded_next_time_step)
            intrinsic_reward = intrinsic_reward * curiosity_strength

        if self.rlb_model:
            assert 'intrinsic_reward' not in locals()
            intrinsic_reward = self.rlb_model(ph_set=construct_ph_set(
                x=X_INTRINSIC_CURRENT, x_next=X_INTRINSIC_NEXT, a=A)).int_rew
            #intrinsic_reward = intrinsic_reward * rlb_ir_weight

            rlb_out = self.rlb_model(
                ph_set=construct_ph_set(x=train_model.X, x_next=X_NEXT, a=A))
            loss = loss + rlb_loss_weight * rlb_out.aux_loss

        #with tf.variable_scope('model'):
        params = tf.trainable_variables()
        logger.info('{} trainable parameters: {}'.format(
            len(params), [p.name for p in params]))
        # For whatever reason Pathak multiplies the loss by 20.
        pathak_multiplier = 20 if use_curiosity else 1
        grads = tf.gradients(loss * pathak_multiplier, params)
        if max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        #trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _train = trainer.apply_gradients(grads)

        if self.all_rlb_args.debug_args['debug_tf_timeline']:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            builder = option_builder.ProfileOptionBuilder
            profiler_opts = builder(
                builder.time_and_memory()).order_by('micros').build()
        else:
            run_options = None

        def getIntrinsicReward(curr, next_obs, actions):
            with logger.ProfileKV('get_intrinsic_reward'):
                return sess.run(
                    intrinsic_reward, {
                        X_INTRINSIC_CURRENT: curr,
                        X_INTRINSIC_NEXT: next_obs,
                        A: actions
                    })

        def train(lr,
                  cliprange,
                  obs,
                  next_obs,
                  returns,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  states=None,
                  gather_histo=False,
                  gather_sc=False,
                  debug_timeliner=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values,
                X_NEXT: next_obs
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            fetches = {
                'train':
                _train,
                'losses': [
                    pg_loss, vf_loss, entropy, approxkl, clipfrac,
                    curiosity_loss
                ],
            }
            if self.rlb_model:
                fetches['losses'].append(rlb_out.aux_loss)
            if gather_histo:
                fetches.update({'stats_histo': {}})
                if self.rlb_model:
                    fetches['stats_histo'].update({
                        n: getattr(rlb_out.stats_histo, n)
                        for n in self.stats_histo_names
                    })
            if gather_sc:
                fetches.update({'stats_sc': {}})
                if self.rlb_model:
                    fetches['stats_sc'].update({
                        n: getattr(rlb_out.stats_sc, n)
                        for n in self.stats_sc_names
                    })
            if debug_timeliner is not None and self.all_rlb_args.debug_args[
                    'debug_tf_timeline']:
                run_metadata = tf.RunMetadata()
                final_run_options = run_options
            else:
                run_metadata = None
                final_run_options = None
            with logger.ProfileKV('train_sess_run'):
                result = sess.run(
                    fetches,
                    td_map,
                    options=final_run_options,
                    run_metadata=run_metadata,
                )
            if debug_timeliner is not None and self.all_rlb_args.debug_args[
                    'debug_tf_timeline']:
                fetched_timeline = timeline.Timeline(run_metadata.step_stats)
                chrome_trace = fetched_timeline.generate_chrome_trace_format(
                    show_memory=True)
                debug_timeliner.update_timeline(chrome_trace)
                tf.profiler.profile(tf.get_default_graph(),
                                    run_meta=run_metadata,
                                    cmd='scope',
                                    options=profiler_opts)
            return result

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac', 'curiosity_loss'
        ]
        if self.rlb_model:
            self.loss_names.append('rlb_loss')
            self.stats_histo_names = sorted(
                list(rlb_out.stats_histo.__dict__.keys()))
            self.stats_sc_names = sorted(list(
                rlb_out.stats_sc.__dict__.keys()))
        else:
            self.stats_histo_names = []
            self.stats_sc_names = []

        def save(save_path):
            ps = sess.run(params)
            with tf.gfile.Open(save_path, 'wb') as fh:
                fh.write(dill.dumps(ps))

        def load(load_path):
            with tf.gfile.Open(load_path, 'rb') as fh:
                val = fh.read()
                loaded_params = dill.loads(val)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)
            # If you want to load weights, also save/load observation scaling inside
            # VecNormalize

        self.getIntrinsicReward = getIntrinsicReward
        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)  # pylint: disable=E1101
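
For intuition, the clipped surrogate objective assembled above (ratio, pg_losses, pg_losses2, pg_loss) corresponds to the small numpy mirror below; this is a sketch for reference, not code from this module:

import numpy as np

def ppo_clipped_pg_loss(advs, old_neglogpac, new_neglogpac, cliprange):
    """Numpy mirror of the clipped PPO policy-gradient loss built in the graph above."""
    ratio = np.exp(old_neglogpac - new_neglogpac)  # pi_new(a|s) / pi_old(a|s)
    pg_losses = -advs * ratio
    pg_losses2 = -advs * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    return np.mean(np.maximum(pg_losses, pg_losses2))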
Example #7
def create_environments_with_rlb(env_name,
                                 num_envs,
                                 dmlab_homepath = '',
                                 action_set = '',
                                 base_seed = 123,
                                 scale_task_reward_for_eval = 1.0,
                                 scale_surrogate_reward_for_eval = 0.0,
                                 online_r_training = False,
                                 environment_engine = 'dmlab',
                                 r_network_weights_store_path = '',
                                 level_cache_mode=False,
                                 rlb_image_size=(84, 84)):
  """Creates a environments with R-network-based curiosity reward.

  Args:
    env_name: Name of the DMLab environment.
    num_envs: Number of parallel environments to spawn.
    dmlab_homepath: Path to the DMLab MPM. Required when running on borg.
    action_set: One of {'small', 'nofire', ''}. Which action set to use.
    base_seed: Each environment will use base_seed+env_index as seed.
    scale_task_reward_for_eval: scale of the task reward to be used for
      valid/test environments.
    scale_surrogate_reward_for_eval: scale of the surrogate reward to be used
      for valid/test environments.
    online_r_training: Whether to enable online training of the R-network.
    environment_engine: either 'dmlab', 'atari', 'parkour'.
    r_network_weights_store_path: Directory where to store R checkpoints
      generated during online training of the R network.

  Returns:
    A (train, valid, test) tuple of curiosity-wrapped environments.
  """
  # Environments without intrinsic exploration rewards.
  # pylint: disable=g-long-lambda
  create_dmlab_single_env = functools.partial(create_single_env,
                                              dmlab_homepath=dmlab_homepath,
                                              action_set=action_set,
                                              level_cache_mode=level_cache_mode)

  if environment_engine == 'dmlab':
    create_env_fn = create_dmlab_single_env
    is_atari_environment = False
  elif environment_engine == 'atari':
    create_env_fn = create_single_atari_env
    is_atari_environment = True
  elif environment_engine == 'parkour':
    mujoco_key_path = ''
    create_env_fn = functools.partial(
        create_single_parkour_env, mujoco_key_path=mujoco_key_path)
    is_atari_environment = False
  else:
    raise ValueError('Unknown env engine {}'.format(environment_engine))

  VecEnvClass = (subproc_vec_env.SubprocVecEnv
                 if FLAGS.vec_env_class == 'SubprocVecEnv'
                 else threaded_vec_env.ThreadedVecEnv)

  with logger.ProfileKV('create_envs'):
    vec_env = VecEnvClass([
        (lambda _i=i: create_env_fn(env_name, base_seed + _i, use_monitor=True,
                                    split='train'))
        for i in range(num_envs)
    ], level_cache_mode=level_cache_mode)
    valid_env = VecEnvClass([
        (lambda _i=i: create_env_fn(env_name, base_seed + _i, use_monitor=False,
                                    split='valid'))
        for i in range(num_envs)
    ], level_cache_mode=level_cache_mode)
    test_env = VecEnvClass([
        (lambda _i=i: create_env_fn(env_name, base_seed + _i, use_monitor=False,
                                    split='test'))
        for i in range(num_envs)
    ], level_cache_mode=level_cache_mode)
  if level_cache_mode:
    logger.info('Starting the infinite map generation sequence...')
    import time
    while True:
      time.sleep(10)

  # pylint: enable=g-long-lambda

  rlb_image_shape = (84, 84, (4 if is_atari_environment else 3))

  rlb_model_wrapper = RLBModelWrapper(
      input_shape=rlb_image_shape,
      action_space=vec_env.action_space,
      max_grad_norm=0.5)

  rlb_model_trainer = RLBTrainer(
      rlb_model_wrapper,
      ensure_train_between_episodes=True)
      
  embedding_size = rlb_model_wrapper.rlb_all_z_dim
  vec_episodic_memory = [
      RLBEpisodicMemory(
          observation_shape=[embedding_size],
          replacement=rlb_model_wrapper.all_rlb_args.outer_args['rlb_ot_memory_algo'],
          capacity=rlb_model_wrapper.all_rlb_args.outer_args['rlb_ot_memory_capacity'])
      for _ in range(num_envs)
  ]

  exploration_reward_min_step = rlb_model_wrapper.all_rlb_args.outer_args['rlb_ot_exploration_min_step']
  if exploration_reward_min_step < 0:
    exploration_reward_min_step = rlb_model_trainer.training_interval

  env_wrapper = RLBEnvWrapper(
      vec_env=vec_env,
      vec_episodic_memory=vec_episodic_memory,
      observation_embedding_fn=rlb_model_wrapper.embed_observation,
      intrinsic_reward_fn=rlb_model_wrapper.compute_intrinsic_rewards,
      rlb_image_shape=rlb_image_shape,
      #target_image_shape=None,
      target_image_shape=[84, 84, 4 if is_atari_environment else 3],
      exploration_reward='rlb',
      scale_surrogate_reward=rlb_model_wrapper.all_rlb_args.outer_args['rlb_ir_weight'],
      ir_normalize_type=rlb_model_wrapper.all_rlb_args.outer_args['rlb_normalize_ir'],
      ir_clip_low=rlb_model_wrapper.all_rlb_args.outer_args['rlb_ot_ir_clip_low'],
      exploration_reward_min_step=exploration_reward_min_step,
      name='train')
  if rlb_model_trainer is not None:
    env_wrapper.add_observer(rlb_model_trainer)


  valid_env_wrapper, test_env_wrapper = (
      RLBEnvWrapper(
          vec_env=env,
          vec_episodic_memory=None,
          observation_embedding_fn=None,
          intrinsic_reward_fn=None,
          rlb_image_shape=None,
          target_image_shape=[84, 84, 4 if is_atari_environment else 3],
          exploration_reward=('none' if (is_atari_environment or
                                         environment_engine == 'parkour')
                              else 'oracle'),
          scale_task_reward=scale_task_reward_for_eval,
          scale_surrogate_reward=scale_surrogate_reward_for_eval,
          name=name)
      for env, name in [(valid_env, 'valid'), (test_env, 'test')])

  return env_wrapper, valid_env_wrapper, test_env_wrapper
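
An illustrative rollout over the returned training wrapper, following the baselines VecEnv interface (the random action sampling stands in for a real policy):

# Hypothetical rollout through the curiosity-wrapped training env.
# obs = env_wrapper.reset()
# for _ in range(100):
#   actions = np.stack([env_wrapper.action_space.sample()
#                       for _ in range(env_wrapper.num_envs)])
#   obs, rewards, dones, infos = env_wrapper.step(actions)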
  def __init__(self,
               input_shape,
               action_space,
               max_grad_norm=0.5,
               ):
    """Inits the RNetwork.

    Args:
      input_shape: (height, width, channel)
      weight_path: Path to the weights of the r_network.
    """

    self.input_shape = input_shape

    self.all_rlb_args = get_rlb_args()

    trainer = tf.train.AdamOptimizer(learning_rate=self.all_rlb_args.outer_args['rlb_ot_lr'])

    policy_pdtype = make_pdtype(action_space)
    self.policy_pdtype = policy_pdtype

    train_batch_size = self.all_rlb_args.outer_args['rlb_ot_batch_size']
    ph_obs = tf.placeholder(shape=(train_batch_size,) + input_shape, dtype=tf.uint8, name='obs')
    ph_obs_next = tf.placeholder(shape=(train_batch_size,) + input_shape, dtype=tf.uint8, name='obs_next')
    ph_acs = policy_pdtype.sample_placeholder([train_batch_size])

    ph_emb_net_obs = tf.placeholder(shape=(None,) + input_shape, dtype=tf.uint8, name='emb_net_obs')

    self.rlb_all_z_dim = self.all_rlb_args.inner_args['rlb_z_dim'] * self.all_rlb_args.inner_args['rlb_num_z_variables']
    ph_epimem_ir_emb_memory = tf.placeholder(shape=(None, None, self.rlb_all_z_dim), dtype=tf.float32, name='epimem_ir_emb_memory')
    ph_epimem_ir_emb_target = tf.placeholder(shape=(None, None, self.rlb_all_z_dim), dtype=tf.float32, name='epimem_ir_emb_target')

    rlb_scope = 'rlb_model'
    self._rlb_model = tf.make_template(
        rlb_scope, define_rlb_model,
        create_scope_now_=True,
        pdtype=policy_pdtype,
        ac_space=action_space,
        optimizer=trainer,
        outer_scope=rlb_scope,
        **self.all_rlb_args.inner_args)

    rlb_train_extra_kwargs = dict()
    rlb_train_out = self._rlb_model(
        ph_set=construct_ph_set(
            x=ph_obs,
            x_next=ph_obs_next,
            a=ph_acs),
        ph_set_for_embedding_net=None,
        ph_set_for_epimem_ir=None,
        **rlb_train_extra_kwargs
        )
    loss = rlb_train_out.aux_loss

    self._loss_names = ['rlb_loss']
    self._stats_histo_names = sorted(list(rlb_train_out.stats_histo.__dict__.keys()))
    self._stats_sc_names = sorted(list(rlb_train_out.stats_sc.__dict__.keys()))

    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=rlb_scope)
    logger.info('RLBModelWrapper, {} trainable parameters: {}'.format(len(params), [p.name for p in params]))
    grads = tf.gradients(loss, params)
    grads_raw_global_norm = tf.global_norm(grads)
    if max_grad_norm is not None:
      grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
      grads_clipped_global_norm = tf.global_norm(grads)
    grads = list(zip(grads, params))
    train_op = trainer.apply_gradients(grads)
    def _train(obs, obs_next, acs, gather_histo=False, gather_sc=False):
      fetches = {
        'train': train_op,
        'losses': [loss],
      }
      if gather_histo:
        fetches['stats_histo'] = { n: getattr(rlb_train_out.stats_histo, n) for n in self._stats_histo_names }
      if gather_sc:
        fetches['stats_sc'] = { n: getattr(rlb_train_out.stats_sc, n) for n in self._stats_sc_names }
        fetches['additional_sc'] = {
          'rlb_grads_raw_global_norm': grads_raw_global_norm,
        }
        if max_grad_norm is not None:
          fetches['additional_sc'].update({
            'rlb_grads_clipped_global_norm': grads_clipped_global_norm,
          })
      sess = tf.get_default_session()
      result = sess.run(fetches, {ph_obs: obs, ph_obs_next: obs_next, ph_acs: acs})
      return result

    self._train = _train

    rlb_eval_extra_kwargs = dict()
    embedding_output = self._rlb_model(
        ph_set=None,
        ph_set_for_embedding_net=construct_ph_set_for_embedding_net(
            ph_emb_net_obs),
        ph_set_for_epimem_ir=None,
        **rlb_eval_extra_kwargs
        ).z
    def _embedding_network(obs):
      sess = tf.get_default_session()
      return sess.run(embedding_output, {ph_emb_net_obs: obs})
    self._embedding_network = _embedding_network

    epimem_ir_output = self._rlb_model(
        ph_set=None,
        ph_set_for_embedding_net=None,
        ph_set_for_epimem_ir=construct_ph_set_for_epimem_ir(ph_epimem_ir_emb_memory, ph_epimem_ir_emb_target),
        **rlb_eval_extra_kwargs
        ).epimem_ir
    def _ir_network(memory, x):
      sess = tf.get_default_session()
      ir = sess.run(epimem_ir_output, {ph_epimem_ir_emb_memory: memory, ph_epimem_ir_emb_target: x})
      # Don't multiply the IR weight here since it will be normalized in RLBEnvWrapper.
      #ir = ir * self.all_rlb_args.outer_args['rlb_ir_weight']
      return ir
    self._ir_network = _ir_network
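
The two helpers defined above are thin session wrappers; the shapes they expect, inferred from the placeholders, are sketched below (batch, memory, and target sizes are made up):

# Hypothetical calls on an RLBModelWrapper instance `wrapper`.
# z = wrapper._embedding_network(obs_batch)   # obs_batch: (B,) + input_shape, uint8
#                                             # z:         (B, wrapper.rlb_all_z_dim)
# ir = wrapper._ir_network(memory, targets)   # memory:    (B, M, rlb_all_z_dim)
#                                             # targets:   (B, T, rlb_all_z_dim)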
    def __init__(
        self,
        vec_env,
        vec_episodic_memory,
        observation_embedding_fn,
        intrinsic_reward_fn,
        rlb_image_shape,
        target_image_shape,
        exploration_reward='rlb',
        scale_task_reward=1.0,
        scale_surrogate_reward=None,
        exploration_reward_min_step=0,
        ir_normalize_type=0,
        ir_clip_low=None,
        name='',
    ):
        logger.info('RLBEnvWrapper args: {}'.format(locals()))
        if exploration_reward == 'rlb':
            if len(vec_episodic_memory) != vec_env.num_envs:
                raise ValueError(
                    'Each env must have a unique episodic memory.')

        if target_image_shape is None:
            target_image_shape = rlb_image_shape

        if self._should_process_observation(vec_env.observation_space.shape):
            observation_space_shape = target_image_shape[:]
            observation_space = gym.spaces.Box(low=0,
                                               high=255,
                                               shape=observation_space_shape,
                                               dtype=np.float)
        else:
            observation_space = vec_env.observation_space

        VecEnvWrapper.__init__(self,
                               vec_env,
                               observation_space=observation_space)

        self._vec_episodic_memory = vec_episodic_memory
        self._observation_embedding_fn = observation_embedding_fn
        self._intrinsic_reward_fn = intrinsic_reward_fn
        self._rlb_image_shape = rlb_image_shape
        self._target_image_shape = target_image_shape

        self._exploration_reward = exploration_reward
        self._scale_task_reward = scale_task_reward
        self._scale_surrogate_reward = scale_surrogate_reward
        self._exploration_reward_min_step = exploration_reward_min_step

        # Oracle reward.
        self._oracles = [
            oracle.OracleExplorationReward() for _ in range(self.venv.num_envs)
        ]

        self._ir_normalize_type = ir_normalize_type
        if self._ir_normalize_type == 0:
            pass
        elif self._ir_normalize_type == 1:
            ir_normalize_gamma = 0.99
            self._irff = RewardForwardFilter(ir_normalize_gamma)
            self._irff_rms = RunningMeanStd()
        elif self._ir_normalize_type == 2:
            self._ir_rms = RunningMeanStd()
        elif self._ir_normalize_type == 3:
            self._ir_rms = SimpleWeightedMovingScalarMeanStd(alpha=0.0001)
        else:
            assert False

        self._ir_clip_low = ir_clip_low

        self._name = name

        # Cumulative task reward over an episode.
        self._episode_task_reward = [0.0] * self.venv.num_envs
        self._episode_bonus_reward = [0.0] * self.venv.num_envs

        # Stats on the task and exploration reward.
        self._stats_task_reward = MovingAverage(capacity=100)
        self._stats_bonus_reward = MovingAverage(capacity=100)

        # Total number of steps so far per environment.
        self._step_count = 0

        # Observers are notified each time a new time step is generated by the
        # environment.
        self._observers = []

        self._bonus_reward_raw_history = [[]
                                          for _ in range(self.venv.num_envs)]
        self._bonus_reward_history = [[] for _ in range(self.venv.num_envs)]
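
For context, the RewardForwardFilter used when ir_normalize_type == 1 is commonly a discounted running sum of rewards; the stand-alone sketch below illustrates that idea and is not this project's class:

import numpy as np

class RewardForwardFilterSketch:
    """Keeps a discounted running return per environment for reward normalization (sketch)."""

    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None  # running discounted returns, one per env

    def update(self, rews):
        rews = np.asarray(rews, dtype=np.float64)
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems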