Example #1
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
            processed_x = tf.layers.flatten(processed_x)
            pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:,0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)


        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
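
The constructor above is an MLP actor-critic policy in the OpenAI baselines style: two separate two-layer tanh branches produce the policy latent and the value estimate, and the step/value closures run them in the given session. Below is a minimal usage sketch; it assumes the constructor is wrapped in a class (called MlpPolicy here purely for illustration) and that the baselines helpers it uses (make_pdtype, observation_input, fc) are importable.

# Minimal usage sketch. Assumes the __init__ above is the constructor of a
# class named MlpPolicy (illustrative name) and a classic gym reset() API.
import gym
import numpy as np
import tensorflow as tf

env = gym.make('CartPole-v1')
with tf.Session() as sess:
    policy = MlpPolicy(sess, env.observation_space, env.action_space,
                       nbatch=1, nsteps=1)
    sess.run(tf.global_variables_initializer())
    ob = np.asarray(env.reset())[None]           # add a batch dimension
    action, value, _, neglogp = policy.step(ob)
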
Example #2
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
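
This recurrent variant threads an LSTM state S and a done-mask M through the rollout and assumes a specific ordering of the flat batch: nbatch = nenv * nsteps, grouped environment-major, which batch_to_seq and seq_to_batch unpack and repack. The numpy-only illustration below restates that layout assumption (inferred from how the snippet calls those helpers with nenv and nsteps); it is not the baselines code itself.

# Self-contained numpy illustration of the assumed batch layout: the first
# nsteps rows belong to environment 0, the next nsteps rows to environment 1,
# and so on.
import numpy as np

nenv, nsteps, nfeat = 2, 3, 5
flat = np.arange(nenv * nsteps * nfeat, dtype=np.float32).reshape(nenv * nsteps, nfeat)
seq = flat.reshape(nenv, nsteps, nfeat)            # batch_to_seq-style grouping
steps = [seq[:, t, :] for t in range(nsteps)]      # one (nenv, nfeat) slice per timestep
back = np.stack(steps, axis=1).reshape(nenv * nsteps, nfeat)  # seq_to_batch-style flattening
assert (back == flat).all()
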
Example #3
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(processed_x, **conv_kwargs)
            vf = fc(h, 'v', 1)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
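
Example #3 differs from Example #1 only in the feature extractor: nature_cnn, the convolutional trunk from the Nature DQN paper (Mnih et al., 2015), feeds one shared latent into both the policy head and the value head. For reference, a stand-alone tf.layers sketch of that kind of trunk follows; it is an illustrative stand-in, not the baselines nature_cnn function.

# Illustrative Nature-DQN-style CNN trunk in plain tf.layers. Assumes an
# 84x84x4 image input already cast/scaled to floats; not the baselines code.
import numpy as np
import tensorflow as tf

def nature_cnn_sketch(images):
    init = tf.orthogonal_initializer(np.sqrt(2))
    h = tf.layers.conv2d(images, 32, 8, strides=4, activation=tf.nn.relu,
                         kernel_initializer=init)
    h = tf.layers.conv2d(h, 64, 4, strides=2, activation=tf.nn.relu,
                         kernel_initializer=init)
    h = tf.layers.conv2d(h, 64, 3, strides=1, activation=tf.nn.relu,
                         kernel_initializer=init)
    h = tf.layers.flatten(h)
    return tf.layers.dense(h, 512, activation=tf.nn.relu, kernel_initializer=init)
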
Example #4
    def __init__(self, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm, use_curiosity,
                 curiosity_strength, forward_inverse_ratio,
                 curiosity_loss_strength, random_state_predictor):
        sess = tf.get_default_session()

        act_model = policy(sess,
                           ob_space,
                           ac_space,
                           nbatch_act,
                           1,
                           reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nbatch_train,
                             nsteps,
                             reuse=True)

        if use_curiosity:
            hidden_layer_size = 256
            self.state_encoder_net = tf.make_template(
                'state_encoder_net',
                pathak_utils.universeHead,
                create_scope_now_=True,
                trainable=(not random_state_predictor))
            self.icm_forward_net = tf.make_template(
                'icm_forward',
                pathak_utils.icm_forward_model,
                create_scope_now_=True,
                num_actions=ac_space.n,
                hidden_layer_size=hidden_layer_size)
            self.icm_inverse_net = tf.make_template(
                'icm_inverse',
                pathak_utils.icm_inverse_model,
                create_scope_now_=True,
                num_actions=ac_space.n,
                hidden_layer_size=hidden_layer_size)
        else:
            self.state_encoder_net = None
            self.icm_forward_net = None
            self.icm_inverse_net = None

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])
        # When computing intrinsic reward a different batch size is used (number
        # of parallel environments), thus we need to define separate
        # placeholders for them.
        X_NEXT, _ = observation_input(ob_space, nbatch_train)
        X_INTRINSIC_NEXT, _ = observation_input(ob_space, nbatch_act)
        X_INTRINSIC_CURRENT, _ = observation_input(ob_space, nbatch_act)

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        curiosity_loss = self.compute_curiosity_loss(
            use_curiosity,
            train_model.X,
            A,
            X_NEXT,
            forward_inverse_ratio=forward_inverse_ratio,
            curiosity_loss_strength=curiosity_loss_strength)
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + curiosity_loss

        if use_curiosity:
            encoded_time_step = self.state_encoder_net(X_INTRINSIC_CURRENT)
            encoded_next_time_step = self.state_encoder_net(X_INTRINSIC_NEXT)
            intrinsic_reward = self.curiosity_forward_model_loss(
                encoded_time_step, A, encoded_next_time_step)
            intrinsic_reward = intrinsic_reward * curiosity_strength

        with tf.variable_scope('model'):
            params = tf.trainable_variables()
        # For whatever reason Pathak multiplies the loss by 20.
        pathak_multiplier = 20 if use_curiosity else 1
        grads = tf.gradients(loss * pathak_multiplier, params)
        if max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _train = trainer.apply_gradients(grads)

        # Note: only meaningful when use_curiosity is True; otherwise
        # intrinsic_reward is never defined and this call would fail.
        def getIntrinsicReward(curr, next_obs, actions):
            return sess.run(intrinsic_reward, {
                X_INTRINSIC_CURRENT: curr,
                X_INTRINSIC_NEXT: next_obs,
                A: actions
            })

        def train(lr,
                  cliprange,
                  obs,
                  next_obs,
                  returns,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  states=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values,
                X_NEXT: next_obs
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run([
                pg_loss, vf_loss, entropy, approxkl, clipfrac, curiosity_loss,
                _train
            ], td_map)[:-1]

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac', 'curiosity_loss'
        ]

        def save(save_path):
            ps = sess.run(params)
            with tf.gfile.Open(save_path, 'wb') as fh:
                fh.write(dill.dumps(ps))

        def load(load_path):
            with tf.gfile.Open(load_path, 'rb') as fh:
                loaded_params = dill.load(fh, encoding="latin1")
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)
            # If you want to load weights, also save/load observation scaling inside
            # VecNormalize

        self.getIntrinsicReward = getIntrinsicReward
        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)  # pylint: disable=E1101
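
The Model constructor above assembles the standard PPO objective: a clipped policy surrogate (pg_loss), a clipped value loss (vf_loss), an entropy bonus, and optionally the curiosity loss. The short numpy restatement below mirrors the pg_loss and vf_loss graph ops and can serve as a reference check on small arrays; here ratio corresponds to exp(OLDNEGLOGPAC - neglogpac).

# Numpy restatement of the clipped PPO losses built above; a reference
# sketch, not part of the model.
import numpy as np

def ppo_losses(adv, ratio, vpred, old_vpred, returns, cliprange):
    # policy term: elementwise max of the unclipped and clipped surrogates
    pg_loss = np.mean(np.maximum(
        -adv * ratio,
        -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)))
    # value term: the new prediction is clipped around the old one
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = 0.5 * np.mean(np.maximum((vpred - returns) ** 2,
                                       (vpred_clipped - returns) ** 2))
    return pg_loss, vf_loss
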
Example #5
    def __init__(self, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm, use_curiosity,
                 curiosity_strength, forward_inverse_ratio,
                 curiosity_loss_strength, random_state_predictor, use_rlb):
        sess = tf.get_default_session()

        nenvs = nbatch_act
        act_model = policy(sess,
                           ob_space,
                           ac_space,
                           nbatch_act,
                           1,
                           reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nbatch_train,
                             nsteps,
                             reuse=True)

        assert not (use_curiosity and use_rlb)

        if use_curiosity:
            hidden_layer_size = 256
            self.state_encoder_net = tf.make_template(
                'state_encoder_net',
                pathak_utils.universeHead,
                create_scope_now_=True,
                trainable=(not random_state_predictor))
            self.icm_forward_net = tf.make_template(
                'icm_forward',
                pathak_utils.icm_forward_model,
                create_scope_now_=True,
                num_actions=ac_space.n,
                hidden_layer_size=hidden_layer_size)
            self.icm_inverse_net = tf.make_template(
                'icm_inverse',
                pathak_utils.icm_inverse_model,
                create_scope_now_=True,
                num_actions=ac_space.n,
                hidden_layer_size=hidden_layer_size)
        else:
            self.state_encoder_net = None
            self.icm_forward_net = None
            self.icm_inverse_net = None

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])
        # When computing intrinsic reward a different batch size is used (number
        # of parallel environments), thus we need to define separate
        # placeholders for them.
        X_NEXT, _ = observation_input(ob_space, nbatch_train)
        X_INTRINSIC_NEXT, _ = observation_input(ob_space, nbatch_act)
        X_INTRINSIC_CURRENT, _ = observation_input(ob_space, nbatch_act)

        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

        self.all_rlb_args = get_rlb_args()
        if use_rlb:
            rlb_scope = 'rlb_model'
            #rlb_ir_weight = self.all_rlb_args.outer_args['rlb_ir_weight']
            rlb_loss_weight = self.all_rlb_args.outer_args['rlb_loss_weight']
            self.rlb_model = tf.make_template(
                rlb_scope,
                define_rlb_model,
                create_scope_now_=True,
                pdtype=train_model.pdtype,
                ac_space=ac_space,
                #nenvs=nenvs,
                optimizer=trainer,
                outer_scope=rlb_scope,
                **self.all_rlb_args.inner_args)
        else:
            self.rlb_model = None

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        curiosity_loss = self.compute_curiosity_loss(
            use_curiosity,
            train_model.X,
            A,
            X_NEXT,
            forward_inverse_ratio=forward_inverse_ratio,
            curiosity_loss_strength=curiosity_loss_strength)
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + curiosity_loss

        if use_curiosity:
            encoded_time_step = self.state_encoder_net(X_INTRINSIC_CURRENT)
            encoded_next_time_step = self.state_encoder_net(X_INTRINSIC_NEXT)
            intrinsic_reward = self.curiosity_forward_model_loss(
                encoded_time_step, A, encoded_next_time_step)
            intrinsic_reward = intrinsic_reward * curiosity_strength

        if self.rlb_model:
            assert 'intrinsic_reward' not in locals()
            intrinsic_reward = self.rlb_model(ph_set=construct_ph_set(
                x=X_INTRINSIC_CURRENT, x_next=X_INTRINSIC_NEXT, a=A)).int_rew
            #intrinsic_reward = intrinsic_reward * rlb_ir_weight

            rlb_out = self.rlb_model(
                ph_set=construct_ph_set(x=train_model.X, x_next=X_NEXT, a=A))
            loss = loss + rlb_loss_weight * rlb_out.aux_loss

        #with tf.variable_scope('model'):
        params = tf.trainable_variables()
        logger.info('{} trainable parameters: {}'.format(
            len(params), [p.name for p in params]))
        # For whatever reason Pathak multiplies the loss by 20.
        pathak_multiplier = 20 if use_curiosity else 1
        grads = tf.gradients(loss * pathak_multiplier, params)
        if max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        #trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _train = trainer.apply_gradients(grads)

        if self.all_rlb_args.debug_args['debug_tf_timeline']:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            builder = option_builder.ProfileOptionBuilder
            profiler_opts = builder(
                builder.time_and_memory()).order_by('micros').build()
        else:
            run_options = None

        def getIntrinsicReward(curr, next_obs, actions):
            with logger.ProfileKV('get_intrinsic_reward'):
                return sess.run(
                    intrinsic_reward, {
                        X_INTRINSIC_CURRENT: curr,
                        X_INTRINSIC_NEXT: next_obs,
                        A: actions
                    })

        def train(lr,
                  cliprange,
                  obs,
                  next_obs,
                  returns,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  states=None,
                  gather_histo=False,
                  gather_sc=False,
                  debug_timeliner=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values,
                X_NEXT: next_obs
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            fetches = {
                'train':
                _train,
                'losses': [
                    pg_loss, vf_loss, entropy, approxkl, clipfrac,
                    curiosity_loss
                ],
            }
            if self.rlb_model:
                fetches['losses'].append(rlb_out.aux_loss)
            if gather_histo:
                fetches.update({'stats_histo': {}})
                if self.rlb_model:
                    fetches['stats_histo'].update({
                        n: getattr(rlb_out.stats_histo, n)
                        for n in self.stats_histo_names
                    })
            if gather_sc:
                fetches.update({'stats_sc': {}})
                if self.rlb_model:
                    fetches['stats_sc'].update({
                        n: getattr(rlb_out.stats_sc, n)
                        for n in self.stats_sc_names
                    })
            if debug_timeliner is not None and self.all_rlb_args.debug_args[
                    'debug_tf_timeline']:
                run_metadata = tf.RunMetadata()
                final_run_options = run_options
            else:
                run_metadata = None
                final_run_options = None
            with logger.ProfileKV('train_sess_run'):
                result = sess.run(
                    fetches,
                    td_map,
                    options=final_run_options,
                    run_metadata=run_metadata,
                )
            if debug_timeliner is not None and self.all_rlb_args.debug_args[
                    'debug_tf_timeline']:
                fetched_timeline = timeline.Timeline(run_metadata.step_stats)
                chrome_trace = fetched_timeline.generate_chrome_trace_format(
                    show_memory=True)
                debug_timeliner.update_timeline(chrome_trace)
                tf.profiler.profile(tf.get_default_graph(),
                                    run_meta=run_metadata,
                                    cmd='scope',
                                    options=profiler_opts)
            return result

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac', 'curiosity_loss'
        ]
        if self.rlb_model:
            self.loss_names.append('rlb_loss')
            self.stats_histo_names = sorted(
                list(rlb_out.stats_histo.__dict__.keys()))
            self.stats_sc_names = sorted(list(
                rlb_out.stats_sc.__dict__.keys()))
        else:
            self.stats_histo_names = []
            self.stats_sc_names = []

        def save(save_path):
            ps = sess.run(params)
            with tf.gfile.Open(save_path, 'wb') as fh:
                fh.write(dill.dumps(ps))

        def load(load_path):
            with tf.gfile.Open(load_path, 'rb') as fh:
                val = fh.read()
                loaded_params = dill.loads(val)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)
            # If you want to load weights, also save/load observation scaling inside
            # VecNormalize

        self.getIntrinsicReward = getIntrinsicReward
        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)  # pylint: disable=E1101
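
Both this example and the previous one gate an ICM-style curiosity branch behind use_curiosity: the intrinsic reward is the forward-model prediction error in the encoded feature space, scaled by curiosity_strength (Pathak et al., 2017). The numpy sketch below restates that idea; encode and forward_model are caller-supplied stand-ins for the state_encoder_net and icm_forward_net templates, and the exact scaling inside curiosity_forward_model_loss may differ.

# Hedged numpy sketch of an ICM-style intrinsic reward: squared forward-model
# error in feature space, scaled by a strength coefficient. `encode` and
# `forward_model` stand in for the TF templates used above.
import numpy as np

def icm_intrinsic_reward(encode, forward_model, obs, next_obs, actions, strength):
    phi = encode(obs)
    phi_next = encode(next_obs)
    phi_pred = forward_model(phi, actions)
    return strength * 0.5 * np.sum((phi_pred - phi_next) ** 2, axis=-1)
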