Example #1
    def __init__(self, ob_space, ac_space, policy_type, args):
        self.gamma = args.gamma
        self.lam = args.lam
        self.adam_epsilon = args.adam_epsilon
        self.clip_param = args.clip_param
        self.entcoeff = args.entcoeff
        self.optim_stepsize = args.optim_stepsize
        self.int_coeff = args.int_coeff
        self.ext_coeff = args.ext_coeff

        self.ob_space = ob_space
        self.ac_space = ac_space

        self.policy_type = policy_type
        if self.policy_type == "coord_cnn":
            self.pi = CoordConvPolicy("pi", self.ob_space, self.ac_space,
                                      args.hidden_size, args.num_hid_layers,
                                      args.kind)
            self.oldpi = CoordConvPolicy("oldpi", self.ob_space, self.ac_space,
                                         args.hidden_size, args.num_hid_layers,
                                         args.kind)
        else:
            # Without this guard self.pi would be unbound below.
            raise ValueError("Unsupported policy_type: %s" % self.policy_type)

        self.int_rew = RND("rnd_int_rew", self.pi.ob, args)
        self.rff_int = RewardForwardFilter(args.gamma)
        self.rff_rms_int = RunningMeanStd(comm=MPI.COMM_SELF, use_mpi=True)

        self.build_graph()
        U.initialize()
        self.adam.sync()
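
# RewardForwardFilter is not defined in this snippet. A minimal sketch,
# assuming the standard RND formulation: a per-env discounted running sum of
# intrinsic rewards, whose standard deviation (tracked by RunningMeanStd) is
# later used to rescale those rewards.
class RewardForwardFilterSketch(object):
    def __init__(self, gamma):
        self.rewems = None  # running discounted reward sum, one entry per env
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems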
    def __init__(self, ob_space, ac_space, nsteps, gamma, venvs, stochpol,
                 comm):
        # Number of envs per venv (e.g. 32).
        self.lump_stride = venvs[0].num_envs
        # The venvs themselves (e.g. 1 of them).
        self.venvs = venvs
        assert all(venv.num_envs == self.lump_stride for venv in
                   self.venvs[1:]), 'All venvs should have the same num_envs'
        self.nlump = len(venvs)
        nenvs = self.nenvs = self.nlump * self.lump_stride
        self.reset_counter = 0
        self.env_results = [None] * self.nlump
        self.buf_vpreds_int = np.zeros((nenvs, nsteps), np.float32)
        self.buf_vpreds_ext = np.zeros((nenvs, nsteps), np.float32)
        self.buf_nlps = np.zeros((nenvs, nsteps), np.float32)
        self.buf_advs = np.zeros((nenvs, nsteps), np.float32)
        self.buf_advs_int = np.zeros((nenvs, nsteps), np.float32)
        self.buf_advs_ext = np.zeros((nenvs, nsteps), np.float32)
        self.buf_rews_int = np.zeros((nenvs, nsteps), np.float32)
        self.buf_rews_ext = np.zeros((nenvs, nsteps), np.float32)
        self.buf_acs = np.zeros((nenvs, nsteps, *ac_space.shape),
                                ac_space.dtype)
        self.buf_obs = {
            k: np.zeros([nenvs, nsteps] +
                        stochpol.ph_ob[k].shape.as_list()[2:],
                        dtype=stochpol.ph_ob_dtypes[k])
            for k in stochpol.ph_ob_keys
        }
        self.buf_ob_last = {
            k: self.buf_obs[k][:, 0, ...].copy()
            for k in stochpol.ph_ob_keys
        }
        self.buf_epinfos = [{} for _ in range(self.nenvs)]
        self.buf_news = np.zeros((nenvs, nsteps), np.float32)
        self.buf_ent = np.zeros((nenvs, nsteps), np.float32)
        self.mem_state = stochpol.initial_state(nenvs)
        # Memory state at the beginning of this segment of timesteps.
        self.seg_init_mem_state = copy(self.mem_state)
        self.rff_int = RewardForwardFilter(gamma)
        self.rff_rms_int = RunningMeanStd(comm=comm, use_mpi=True)
        self.buf_new_last = self.buf_news[:, 0, ...].copy()
        self.buf_vpred_int_last = self.buf_vpreds_int[:, 0, ...].copy()
        self.buf_vpred_ext_last = self.buf_vpreds_ext[:, 0, ...].copy()
        # Counts timesteps of interaction with this set of environments.
        self.step_count = 0
        self.t_last_update = time.time()
        # Rolling windows for other stats, e.g. optimizer outputs.
        self.statlists = defaultdict(lambda: deque([], maxlen=100))
        self.stats = defaultdict(float)  # Counts episodes and timesteps.
        self.stats['epcount'] = 0
        self.stats['n_updates'] = 0
        self.stats['tcount'] = 0
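
    # A sketch of how rff_int / rff_rms_int are typically consumed at the end
    # of a rollout in RND-style runners (the actual call site is outside this
    # snippet; buf_rews_int is the intrinsic-reward buffer above):
    #
    #     rffs_int = np.array([self.rff_int.update(rew)
    #                          for rew in self.buf_rews_int.T])
    #     self.rff_rms_int.update(rffs_int.ravel())
    #     self.buf_rews_int[:] = self.buf_rews_int / np.sqrt(self.rff_rms_int.var)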
    def __init__(
        self,
        scope,
        ob_space,
        ac_space,
        policy_size="normal",
        extrahid=True,
        hidsize=128,
        memsize=128,
        rec_gate_init=0.0,
        update_ob_stats_independently_per_gpu=True,
        proportion_of_exp_used_for_predictor_update=1.0,
        dynamics_bonus=False,
        meta_rl=False,
    ):
        StochasticPolicy.__init__(self,
                                  scope,
                                  ob_space,
                                  ac_space,
                                  meta_rl=meta_rl)
        self.proportion_of_exp_used_for_predictor_update = (
            proportion_of_exp_used_for_predictor_update)
        enlargement = {"small": 1, "normal": 2, "large": 4}[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= enlargement
        hidsize *= enlargement
        convfeat = 16 * enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu,
        )
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name="state")
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        # Inputs to the policy and value function will have different shapes
        # depending on whether it is rollout or optimization time, so we treat
        # them separately.
        (
            self.pdparam_opt,
            self.vpred_int_opt,
            self.vpred_ext_opt,
            self.snext_opt,
        ) = self.apply_policy(
            self.ph_ob['obs'][:, :-1],
            reuse=False,
            scope=scope,
            hidsize=hidsize,
            memsize=memsize,
            extrahid=extrahid,
            sy_nenvs=self.sy_nenvs,
            sy_nsteps=self.sy_nsteps - 1,
            pdparamsize=pdparamsize,
            additional_inputs=self.ph_ob,
        )
        (
            self.pdparam_rollout,
            self.vpred_int_rollout,
            self.vpred_ext_rollout,
            self.snext_rollout,
        ) = self.apply_policy(
            self.ph_ob['obs'],
            reuse=True,
            scope=scope,
            hidsize=hidsize,
            memsize=memsize,
            extrahid=extrahid,
            sy_nenvs=self.sy_nenvs,
            sy_nsteps=self.sy_nsteps,
            pdparamsize=pdparamsize,
            additional_inputs=self.ph_ob,
        )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            self.define_self_prediction_rew(convfeat=convfeat,
                                            rep_size=rep_size,
                                            enlargement=enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate
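
# Why the "opt" graph above consumes ph_ob['obs'][:, :-1] while the "rollout"
# graph sees every step: rollouts store nsteps + 1 observations so the last one
# can bootstrap value targets, but only the first nsteps are optimized on.
# A runnable shape-only sketch (all sizes are assumptions for illustration):
import numpy as np
obs = np.zeros((4, 129, 84, 84, 1), np.float32)  # (nenvs, nsteps + 1, H, W, C)
train_obs = obs[:, :-1]                          # (4, 128, ...) -> "opt" head
assert train_obs.shape[1] == obs.shape[1] - 1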
Example #4
class CnnPolicy(StochasticPolicy):
    def __init__(self,
                 scope,
                 ob_space,
                 ac_space,
                 policy_size='normal',
                 maxpool=False,
                 extrahid=True,
                 hidsize=128,
                 memsize=128,
                 rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 dynamics_bonus=False,
                 action_balance_coef=1.,
                 array_action=True):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        self.action_balance_coef = action_balance_coef
        self.array_action = array_action

        self.enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        self.rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= self.enlargement
        hidsize *= self.enlargement
        self.convfeat = 16 * self.enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        # self.int_rew_ab = None
        # self.int_rew_ab_opt = None
        if self.action_balance_coef is not None:
            # self.action_one_hot_list_rollout = get_action_one_hot_list(self.ac_space.n, self.sy_nenvs, self.sy_nsteps)
            # self.action_one_hot_list_opt = get_action_one_hot_list(self.ac_space.n, self.sy_nenvs, self.sy_nsteps - 1)
            # with tf.device('/cpu:0'):
            self.action_one_hot_rollout = get_action_one_hot(
                self.ac_space.n, self.sy_nenvs, self.sy_nsteps)
            # self.action_one_hot_list_opt = get_action_one_hot(self.ac_space.n, self.sy_nenvs, self.sy_nsteps - 1)

            if self.array_action:
                # with tf.device('/cpu:0'):
                self.action_encode_array_rollout = get_action_encode_array(
                    self.ac_space.n, self.sy_nenvs, self.sy_nsteps,
                    ob_space.shape[:2])
                # self.action_encode_array_rollout, self.split_lengths = get_action_encode_array(
                #     self.ac_space.n, self.sy_nenvs, self.sy_nsteps, ob_space.shape[:2])

            self.feat_var_ab, self.max_feat_ab, self.int_rew_ab, self.int_rew_ab_rollout, self.aux_loss_ab = \
                self.define_action_balance_rew(ph_ob=self.ph_ob[None],
                                               action_one_hot=self.action_one_hot_rollout,
                                               convfeat=self.convfeat,
                                               rep_size=self.rep_size, enlargement=self.enlargement,
                                               sy_nenvs=self.sy_nenvs,
                                               sy_nsteps=self.sy_nsteps,
                                               )
            # self.feat_var_ab_opt, self.max_feat_ab_opt, self.int_rew_ab_opt, self.aux_loss_ab = \
            #     self.define_action_balance_rew(ph_ob=self.ph_ob[None][:, :-1],
            #                                    action_one_hot=self.action_one_hot_list_opt,
            #                                    convfeat=self.convfeat,
            #                                    rep_size=self.rep_size, enlargement=self.enlargement,
            #                                    sy_nenvs=self.sy_nenvs,
            #                                    sy_nsteps=self.sy_nsteps - 1,
            #                                    )

            self.pd_ab = self.pdtype.pdfromflat(self.int_rew_ab)

        # Inputs to the policy and value function will have different shapes
        # depending on whether it is rollout or optimization time, so we treat
        # them separately.
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt, self.logits_raw_opt = \
            self.apply_policy(self.ph_ob[None][:, :-1],
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize
                              )
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout, _ = \
            self.apply_policy(self.ph_ob[None],
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize
                              )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=self.convfeat,
                                                rep_size=self.rep_size,
                                                enlargement=self.enlargement)
        else:
            self.define_self_prediction_rew(convfeat=self.convfeat,
                                            rep_size=self.rep_size,
                                            enlargement=self.enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate
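
    # get_action_one_hot is not defined in this snippet. A hypothetical sketch
    # consistent with how action_one_hot is consumed below, where each
    # (env, step) sample is paired with a one-hot row for all n actions in
    # order:
    #
    #     def get_action_one_hot(n_actions, sy_nenvs, sy_nsteps):
    #         eye = tf.eye(n_actions)                         # (n, n)
    #         return tf.tile(eye, (sy_nenvs * sy_nsteps, 1))  # (nenvs*nsteps*n, n)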

    def apply_policy(
        self,
        ph_ob,
        reuse,
        scope,
        hidsize,
        memsize,
        extrahid,
        sy_nenvs,
        sy_nsteps,
        pdparamsize,
    ):
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnPolicy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(
                conv(X,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c2',
                     nf=64,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c3',
                     nf=64,
                     rf=4,
                     stride=1,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = to2d(X)
            mix_other_observations = [X]
            X = tf.concat(mix_other_observations, axis=1)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            additional_size = 448
            X = activ(
                fc(X,
                   'fc_additional',
                   nh=additional_size,
                   init_scale=np.sqrt(2)))
            snext = tf.zeros((sy_nenvs, memsize))
            mix_timeout = [X]

            Xtout = tf.concat(mix_timeout, axis=1)
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, 'fc2val', nh=additional_size, init_scale=0.1))
                X = X + activ(
                    fc(X, 'fc2act', nh=additional_size, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            logits_raw = pdparam

            if self.action_balance_coef is not None:
                # self.define_action_balance_rew(convfeat=self.convfeat, rep_size=self.rep_size, enlargement=self.enlargement)
                pdparam = pdparam + tf.stop_gradient(
                    self.int_rew_ab_rollout[:, :sy_nsteps] *
                    self.action_balance_coef)
                # pdparam = pdparam + tf.stop_gradient(self.int_rew_ab_rollout * self.action_balance_coef)

            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext, logits_raw

    def define_action_balance_rew(self,
                                  ph_ob,
                                  action_one_hot,
                                  convfeat,
                                  rep_size,
                                  enlargement,
                                  sy_nenvs,
                                  sy_nsteps,
                                  l2_normalize=True,
                                  sd_normalize=False):
        logger.info(
            "Using Action Balance BONUS ****************************************************"
        )

        with tf.variable_scope('action_balance', reuse=tf.AUTO_REUSE):
            # Random target network.
            ph = ph_ob
            assert len(ph.shape.as_list()) == 5  # B,T,H,W,C

            logger.info("CnnTarget: using '%s' shape %s as image input" %
                        (ph.name, str(ph.shape)))
            xr = ph
            xr = tf.cast(xr, tf.float32)
            xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
            xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0)

            def conv_layers(xr):
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))

                return xr

            if self.array_action:
                # with tf.device('/cpu:0'):
                xr = tf.reshape(tf.tile(xr, [1, self.ac_space.n, 1, 1]),
                                (-1, *xr.shape[1:]))
                xr = tf.concat(
                    [xr, self.action_encode_array_rollout[..., None]], axis=-1)
                xr = conv_layers(xr)

                # When n_env=128 the batch is too large for the GPU; the
                # commented-out block below split the inputs to use less memory.
                # xr_results = []
                # xr_list = tf.split(xr, num_or_size_splits=self.split_lengths)
                # state_shape = xr_list[0].shape[1:]

                # for i in range(len(xr_list)):
                #     action_array_tmp = tf.tile(self.action_encode_array_rollout, (self.split_lengths[i], 1, 1))
                #     xr = tf.reshape(tf.tile(xr_list[i], [1, self.ac_space.n, 1, 1]), (-1, *state_shape))
                #     # xr = tf.concat([xr, self.action_encode_array_list_rollout[i][..., None]], axis=-1)
                #     xr = tf.concat([xr, action_array_tmp[..., None]], axis=-1)
                #     xr = conv_layers(xr)
                #     xr_results.append(xr)
                # xr = tf.concat(xr_results, 0)
            else:
                xr = conv_layers(xr)
            rgbr = to2d(xr)

            if not self.array_action:
                # extend action dim
                rgbr_shape = rgbr.shape.as_list()
                rgbr = tf.reshape(tf.tile(rgbr, [1, self.ac_space.n]),
                                  (-1, rgbr_shape[1]))

            X_r = tf.nn.relu(
                fc(tf.concat([rgbr, action_one_hot], 1),
                   'fc1r',
                   nh=256,
                   init_scale=np.sqrt(2)))
            X_r = fc(tf.concat([X_r, action_one_hot], 1),
                     'fc2r',
                     nh=rep_size,
                     init_scale=np.sqrt(2))

            # Predictor network.
            logger.info("CnnTarget: using '%s' shape %s as image input" %
                        (ph.name, str(ph.shape)))
            # xrp = ph[:, :-1]
            xrp = ph
            xrp = tf.cast(xrp, tf.float32)
            xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))
            # Note: ph_mean and ph_std are 84x84x1, so every channel here is
            # normalized with the last channel's statistics.
            xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std, -5.0,
                                   5.0)

            xrp = tf.nn.leaky_relu(
                conv(xrp,
                     'c1rp_pred',
                     nf=convfeat,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(
                conv(xrp,
                     'c2rp_pred',
                     nf=convfeat * 2,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(
                conv(xrp,
                     'c3rp_pred',
                     nf=convfeat * 2,
                     rf=3,
                     stride=1,
                     init_scale=np.sqrt(2)))
            rgbrp = to2d(xrp)

            rgbrp_shape = rgbrp.shape.as_list()
            rgbrp = tf.reshape(tf.tile(rgbrp, [1, self.ac_space.n]),
                               (-1, rgbrp_shape[1]))
            X_r_hat = tf.nn.relu(
                fc(tf.concat([rgbrp, action_one_hot], 1),
                   'fc1r_hat1_pred',
                   nh=256 * enlargement,
                   init_scale=np.sqrt(2)))
            X_r_hat = tf.nn.relu(
                fc(tf.concat([X_r_hat, action_one_hot], 1),
                   'fc1r_hat2_pred',
                   nh=256 * enlargement,
                   init_scale=np.sqrt(2)))
            X_r_hat = fc(tf.concat([X_r_hat, action_one_hot], 1),
                         'fc1r_hat3_pred',
                         nh=rep_size,
                         init_scale=np.sqrt(2))

            X_r = tf.reshape(X_r,
                             (sy_nenvs, sy_nsteps, self.ac_space.n, rep_size))
            X_r_hat = tf.reshape(
                X_r_hat, (sy_nenvs, sy_nsteps, self.ac_space.n, rep_size))
            int_rew_ab_rollout = tf.reduce_mean(
                tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1)
            if l2_normalize:
                int_rew_ab_rollout = tf.math.l2_normalize(int_rew_ab_rollout,
                                                          axis=-1)
            elif sd_normalize:
                mean_tmp, var_tmp = tf.nn.moments(int_rew_ab_rollout,
                                                  axes=[-1],
                                                  keep_dims=True)
                int_rew_ab_rollout = (int_rew_ab_rollout -
                                      mean_tmp) / tf.math.sqrt(var_tmp)

            X_r = X_r[:, :-1]
            X_r_hat = X_r_hat[:, :-1]
            feat_var_ab = tf.reduce_mean(tf.nn.moments(X_r, axes=[0, 1])[1])
            max_feat_ab = tf.reduce_max(tf.abs(X_r))
            int_rew_ab = tf.reduce_mean(
                tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1)
            if l2_normalize:
                logger.info("Normalize logits:l2")
                int_rew_ab = tf.math.l2_normalize(int_rew_ab, axis=-1)
            elif sd_normalize:
                logger.info("Normalize logits:standard")
                mean_tmp, var_tmp = tf.nn.moments(int_rew_ab,
                                                  axes=[-1],
                                                  keep_dims=True)
                int_rew_ab = (int_rew_ab - mean_tmp) / tf.math.sqrt(var_tmp)

            # int_rew_ab = tf.reshape(int_rew_ab, (sy_nenvs, sy_nsteps, *int_rew_ab.shape.as_list()[1:]))
            # int_rew_ab = tf.reshape(int_rew_ab, (sy_nenvs, sy_nsteps, self.ac_space.n))

            # self.int_rew_ab = tf.reshape(self.int_rew_ab, (self.sy_nenvs, self.sy_nsteps - 1, self.ac_space.n))

            noisy_targets = tf.stop_gradient(X_r)
            # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
            aux_loss_ab = tf.reduce_mean(tf.square(noisy_targets - X_r_hat),
                                         [-1])
            mask = tf.random_uniform(shape=tf.shape(aux_loss_ab),
                                     minval=0.,
                                     maxval=1.,
                                     dtype=tf.float32)
            mask = tf.cast(
                mask < self.proportion_of_exp_used_for_predictor_update,
                tf.float32)
            aux_loss_ab = tf.reduce_sum(mask * aux_loss_ab) / tf.maximum(
                tf.reduce_sum(mask), 1.)

        return feat_var_ab, max_feat_ab, int_rew_ab, int_rew_ab_rollout, aux_loss_ab
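
    # A numpy sketch (shapes assumed for illustration) of the l2_normalize
    # branch above: per-action prediction errors are normalized over the action
    # axis before being added to the policy logits as an exploration bonus.
    #
    #     err = np.random.rand(2, 5, 18)  # (nenvs, nsteps, n_actions)
    #     bonus = err / np.linalg.norm(err, axis=-1, keepdims=True)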

    def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
        logger.info(
            "Using RND BONUS ****************************************************"
        )

        # RND bonus.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c1rp_pred',
                         nf=convfeat,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c2rp_pred',
                         nf=convfeat * 2,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c3rp_pred',
                         nf=convfeat * 2,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)
                # X_r_hat = tf.nn.relu(fc(rgb[0], 'fc1r_hat1', nh=256 * enlargement, init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(rgbrp,
                       'fc1r_hat1_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(X_r_hat,
                       'fc1r_hat2_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(X_r_hat,
                             'fc1r_hat3_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        targets = tf.stop_gradient(X_r)
        # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
        self.aux_loss = tf.reduce_mean(tf.square(targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
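
    # A numpy sketch of the masking above: a random fraction
    # proportion_of_exp_used_for_predictor_update of samples contributes to
    # the predictor update (the 0.25 and 8 below are illustrative assumptions):
    #
    #     p = 0.25
    #     loss = np.random.rand(8)
    #     mask = (np.random.rand(8) < p).astype(np.float32)
    #     masked = (mask * loss).sum() / max(mask.sum(), 1.0)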

    def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
        # Dynamics loss with random features.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
        assert ac_one_hot.get_shape().ndims == 3
        assert ac_one_hot.get_shape().as_list() == [
            None, None, self.ac_space.n
        ], ac_one_hot.get_shape().as_list()
        ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

        def cond(x):
            return tf.concat([x, ac_one_hot], 1)

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, :-1]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))
                # Note: ph_mean and ph_std are 84x84x1, so every channel here
                # is normalized with the last channel's statistics.
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c1rp_pred',
                         nf=convfeat,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c2rp_pred',
                         nf=convfeat * 2,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c3rp_pred',
                         nf=convfeat * 2,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)

                # X_r_hat = tf.nn.relu(fc(rgb[0], 'fc1r_hat1', nh=256 * enlargement, init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(cond(rgbrp),
                       'fc1r_hat1_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(cond(X_r_hat),
                       'fc1r_hat2_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(cond(X_r_hat),
                             'fc1r_hat3_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)

    def initial_state(self, n):
        return np.zeros((n, self.memsize), np.float32)

    def call(self, dict_obs, new, istate, update_obs_stats=False):
        for ob in dict_obs.values():
            if ob is not None:
                if update_obs_stats:
                    raise NotImplementedError
                    # Unreachable below the raise; kept as a reference:
                    # ob = ob.astype(np.float32)
                    # ob = ob.reshape(-1, *self.ob_space.shape)
                    # self.ob_rms.update(ob)
        # Note: if this fails with a ph vs observations inconsistency, check
        # whether you're loading the agent from disk: it will use whatever
        # observation spaces were saved to disk along with the other ctor params.
        feed1 = {self.ph_ob[k]: dict_obs[k][:, None] for k in self.ph_ob_keys}
        feed2 = {
            self.ph_istate: istate,
            self.ph_new: new[:, None].astype(np.float32)
        }
        feed1.update({
            self.ph_mean: self.ob_rms.mean,
            self.ph_std: self.ob_rms.var**0.5
        })
        a, vpred_int, vpred_ext, nlp, newstate, ent = tf.get_default_session().run(
            [
                self.a_samp, self.vpred_int_rollout, self.vpred_ext_rollout,
                self.nlp_samp, self.snext_rollout, self.entropy_rollout
            ],
            feed_dict={**feed1, **feed2})
        return (a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0], newstate,
                ent[:, 0])
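
# The trailing [:, 0] in the return above squeezes the singleton time axis that
# dict_obs[k][:, None] added when feeding a single step. A runnable sketch:
import numpy as np
x = np.arange(4).reshape(4, 1)  # (nenvs, 1): one timestep per env
assert x[:, 0].shape == (4,)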
Example #5
import time
class CnnPolicy(StochasticPolicy):
    def __init__(
        self,
        scope,
        ob_space,
        ac_space,
        policy_size="normal",
        extrahid=True,
        hidsize=128,
        memsize=128,
        rec_gate_init=0.0,
        update_ob_stats_independently_per_gpu=True,
        proportion_of_exp_used_for_predictor_update=1.0,
        dynamics_bonus=False,
        meta_rl=False,
    ):
        StochasticPolicy.__init__(self,
                                  scope,
                                  ob_space,
                                  ac_space,
                                  meta_rl=meta_rl)
        self.proportion_of_exp_used_for_predictor_update = (
            proportion_of_exp_used_for_predictor_update)
        enlargement = {"small": 1, "normal": 2, "large": 4}[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= enlargement
        hidsize *= enlargement
        convfeat = 16 * enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu,
        )
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name="state")
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        # Inputs to the policy and value function will have different shapes
        # depending on whether it is rollout or optimization time, so we treat
        # them separately.
        (
            self.pdparam_opt,
            self.vpred_int_opt,
            self.vpred_ext_opt,
            self.snext_opt,
        ) = self.apply_policy(
            self.ph_ob['obs'][:, :-1],
            reuse=False,
            scope=scope,
            hidsize=hidsize,
            memsize=memsize,
            extrahid=extrahid,
            sy_nenvs=self.sy_nenvs,
            sy_nsteps=self.sy_nsteps - 1,
            pdparamsize=pdparamsize,
            additional_inputs=self.ph_ob,
        )
        (
            self.pdparam_rollout,
            self.vpred_int_rollout,
            self.vpred_ext_rollout,
            self.snext_rollout,
        ) = self.apply_policy(
            self.ph_ob['obs'],
            reuse=True,
            scope=scope,
            hidsize=hidsize,
            memsize=memsize,
            extrahid=extrahid,
            sy_nenvs=self.sy_nenvs,
            sy_nsteps=self.sy_nsteps,
            pdparamsize=pdparamsize,
            additional_inputs=self.ph_ob,
        )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            self.define_self_prediction_rew(convfeat=convfeat,
                                            rep_size=rep_size,
                                            enlargement=enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate

    @staticmethod
    def apply_policy(
        ph_ob,
        reuse,
        scope,
        hidsize,
        memsize,
        extrahid,
        sy_nenvs,
        sy_nsteps,
        pdparamsize,
        additional_inputs=None,
    ):
        meta_rl = False  # NOTE: currently unused in this method
        data_format = "NHWC"
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info(
            f"CnnPolicy: using '{ph.name}' shape {ph.shape} as image input")
        X = tf.cast(ph, tf.float32) / 255.0
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device("/gpu:0" if yes_gpu else "/cpu:0"):
            X = activ(
                conv(
                    X,
                    "c1",
                    nf=32,
                    rf=8,
                    stride=4,
                    init_scale=np.sqrt(2),
                    data_format=data_format,
                ))
            X = activ(
                conv(
                    X,
                    "c2",
                    nf=64,
                    rf=4,
                    stride=2,
                    init_scale=np.sqrt(2),
                    data_format=data_format,
                ))
            X = activ(
                conv(
                    X,
                    "c3",
                    nf=64,
                    rf=4,
                    stride=1,
                    init_scale=np.sqrt(2),
                    data_format=data_format,
                ))
            X = to2d(X)
            mix_other_observations = [X]

            # Guard against the default additional_inputs=None.
            if (additional_inputs is not None
                    and 'prev_acs' in additional_inputs
                    and 'prev_rew' in additional_inputs):
                # Cast numpy arrays to tf tensors
                prev_acs = tf.cast(additional_inputs['prev_acs'], tf.float32)
                prev_rew = tf.cast(additional_inputs['prev_rew'], tf.float32)

                # Flatten out time dimension
                prev_acs = tf.reshape(prev_acs,
                                      (-1, *prev_acs.shape.as_list()[2:]))
                prev_rew = tf.reshape(prev_rew,
                                      (-1, *prev_rew.shape.as_list()[2:]))

                # Add to 2D features going to FC layers
                mix_other_observations.extend([prev_acs, prev_rew])

            X = tf.concat(mix_other_observations, axis=1)
            X = activ(fc(X, "fc1", nh=hidsize, init_scale=np.sqrt(2)))
            additional_size = 448
            X = activ(
                fc(X,
                   "fc_additional",
                   nh=additional_size,
                   init_scale=np.sqrt(2)))
            snext = tf.zeros((sy_nenvs, memsize))
            mix_timeout = [X]

            Xtout = tf.concat(mix_timeout, axis=1)
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, "fc2val", nh=additional_size, init_scale=0.1))
                X = X + activ(
                    fc(X, "fc2act", nh=additional_size, init_scale=0.1))
            pdparam = fc(X, "pd", nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, "vf_int", nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, "vf_ext", nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
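
    # A shape-only sketch of the meta-RL time-flattening applied above to
    # prev_acs / prev_rew before they join the fc inputs (sizes assumed):
    #
    #     prev_acs = np.zeros((4, 128, 18), np.float32)     # (nenvs, nsteps, n)
    #     flat = prev_acs.reshape(-1, *prev_acs.shape[2:])  # (512, 18)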

    def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
        logger.info(
            "Using RND BONUS ****************************************************"
        )

        # RND bonus.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info(
                    f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(
                        xr,
                        "c1r",
                        nf=convfeat * 1,
                        rf=8,
                        stride=4,
                        init_scale=np.sqrt(2),
                    ))
                xr = tf.nn.leaky_relu(
                    conv(
                        xr,
                        "c2r",
                        nf=convfeat * 2 * 1,
                        rf=4,
                        stride=2,
                        init_scale=np.sqrt(2),
                    ))
                xr = tf.nn.leaky_relu(
                    conv(
                        xr,
                        "c3r",
                        nf=convfeat * 2 * 1,
                        rf=3,
                        stride=1,
                        init_scale=np.sqrt(2),
                    ))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], "fc1r", nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info(
                    f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(
                        xrp,
                        "c1rp_pred",
                        nf=convfeat,
                        rf=8,
                        stride=4,
                        init_scale=np.sqrt(2),
                    ))
                xrp = tf.nn.leaky_relu(
                    conv(
                        xrp,
                        "c2rp_pred",
                        nf=convfeat * 2,
                        rf=4,
                        stride=2,
                        init_scale=np.sqrt(2),
                    ))
                xrp = tf.nn.leaky_relu(
                    conv(
                        xrp,
                        "c3rp_pred",
                        nf=convfeat * 2,
                        rf=3,
                        stride=1,
                        init_scale=np.sqrt(2),
                    ))
                rgbrp = to2d(xrp)
                X_r_hat = tf.nn.relu(
                    fc(
                        rgbrp,
                        "fc1r_hat1_pred",
                        nh=256 * enlargement,
                        init_scale=np.sqrt(2),
                    ))
                X_r_hat = tf.nn.relu(
                    fc(
                        X_r_hat,
                        "fc1r_hat2_pred",
                        nh=256 * enlargement,
                        init_scale=np.sqrt(2),
                    ))
                X_r_hat = fc(X_r_hat,
                             "fc1r_hat3_pred",
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.0,
                                 maxval=1.0,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.0)

    def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
        # Dynamics loss with random features.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info(
                    f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(
                        xr,
                        "c1r",
                        nf=convfeat * 1,
                        rf=8,
                        stride=4,
                        init_scale=np.sqrt(2),
                    ))
                xr = tf.nn.leaky_relu(
                    conv(
                        xr,
                        "c2r",
                        nf=convfeat * 2 * 1,
                        rf=4,
                        stride=2,
                        init_scale=np.sqrt(2),
                    ))
                xr = tf.nn.leaky_relu(
                    conv(
                        xr,
                        "c3r",
                        nf=convfeat * 2 * 1,
                        rf=3,
                        stride=1,
                        init_scale=np.sqrt(2),
                    ))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], "fc1r", nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
        assert ac_one_hot.get_shape().ndims == 3
        assert ac_one_hot.get_shape().as_list() == [
            None,
            None,
            self.ac_space.n,
        ], ac_one_hot.get_shape().as_list()
        ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

        def cond(x):
            return tf.concat([x, ac_one_hot], 1)

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info(
                    f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xrp = ph[:, :-1]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))
                # Note: ph_mean and ph_std are 84x84x1, so every channel here
                # is normalized with the last channel's statistics.
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(
                        xrp,
                        "c1rp_pred",
                        nf=convfeat,
                        rf=8,
                        stride=4,
                        init_scale=np.sqrt(2),
                    ))
                xrp = tf.nn.leaky_relu(
                    conv(
                        xrp,
                        "c2rp_pred",
                        nf=convfeat * 2,
                        rf=4,
                        stride=2,
                        init_scale=np.sqrt(2),
                    ))
                xrp = tf.nn.leaky_relu(
                    conv(
                        xrp,
                        "c3rp_pred",
                        nf=convfeat * 2,
                        rf=3,
                        stride=1,
                        init_scale=np.sqrt(2),
                    ))
                rgbrp = to2d(xrp)

                X_r_hat = tf.nn.relu(
                    fc(
                        cond(rgbrp),
                        "fc1r_hat1_pred",
                        nh=256 * enlargement,
                        init_scale=np.sqrt(2),
                    ))
                X_r_hat = tf.nn.relu(
                    fc(
                        cond(X_r_hat),
                        "fc1r_hat2_pred",
                        nh=256 * enlargement,
                        init_scale=np.sqrt(2),
                    ))
                X_r_hat = fc(cond(X_r_hat),
                             "fc1r_hat3_pred",
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.0,
                                 maxval=1.0,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.0)
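
    # A numpy sketch of the cond() helper above: each predictor layer is
    # conditioned on the action by concatenating its one-hot encoding onto the
    # layer's input features (shapes are illustrative assumptions):
    #
    #     feat = np.zeros((512, 256), np.float32)
    #     a_oh = np.zeros((512, 18), np.float32)
    #     conditioned = np.concatenate([feat, a_oh], axis=1)  # (512, 274)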

    def initial_state(self, n):
        return np.zeros((n, self.memsize), np.float32)

    def call(self, dict_obs, new, istate, update_obs_stats=False):
        for ob in dict_obs.values():
            if (ob is not None) and update_obs_stats:
                raise NotImplementedError
                # Unreachable below the raise; kept as a reference:
                # ob = ob.astype(np.float32)
                # ob = ob.reshape(-1, *self.ob_space.shape)
                # self.ob_rms.update(ob)
        # Note: if this fails with a ph vs observations inconsistency, check
        # whether you're loading the agent from disk: it will use whatever
        # observation spaces were saved to disk along with the other ctor params.
        feed1 = {
            self.ph_ob[k]: dict_obs[k]
            for k in self.ph_ob_keys if k != 'obs'
        }
        feed1.update({
            self.ph_mean: self.ob_rms.mean,
            self.ph_std: self.ob_rms.var**0.5
        })

        # Add an extra empty dimension to the primary observation if needed
        if len(dict_obs['obs'].shape) == 4:
            feed1[self.ph_ob['obs']] = dict_obs['obs'][:, None]
        else:
            feed1[self.ph_ob['obs']] = dict_obs['obs']

        feed2 = {
            self.ph_istate: istate,
            self.ph_new: new[:, None].astype(np.float32)
        }
        sess = tf.get_default_session()
        a, vpred_int, vpred_ext, nlp, newstate, ent = sess.run(
            [
                self.a_samp,
                self.vpred_int_rollout,
                self.vpred_ext_rollout,
                self.nlp_samp,
                self.snext_rollout,
                self.entropy_rollout,
            ],
            feed_dict={**feed1, **feed2},
        )
        return a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0], newstate, ent[:, 0]
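Note on the masked predictor update used throughout these examples: the predictor loss is averaged only over a random fraction (proportion_of_exp_used_for_predictor_update) of samples, with the denominator counting the kept samples. A minimal numpy sketch of just that masking step (variable names here are illustrative, not from the source):

import numpy as np

rng = np.random.default_rng(0)
aux_loss = rng.random(8)                 # per-sample predictor errors
proportion = 0.25                        # fraction of experience used
mask = (rng.random(8) < proportion).astype(np.float32)
# mean over kept samples; the max() guards against an all-zero mask
masked_loss = (mask * aux_loss).sum() / max(mask.sum(), 1.0)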
Example #7
class CnnGruPolicy(StochasticPolicy):
    def __init__(
        self,
        scope,
        ob_space,
        ac_space,
        policy_size='normal',
        maxpool=False,
        extrahid=True,
        hidsize=128,
        memsize=128,
        rec_gate_init=0.0,
        update_ob_stats_independently_per_gpu=True,
        proportion_of_exp_used_for_predictor_update=1.,
        dynamics_bonus=False,
    ):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= enlargement  #256
        hidsize *= enlargement  #256
        convfeat = 16 * enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_policy(self.ph_ob[None][:,:-1],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init
                              )
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_policy(self.ph_ob[None],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init
                              )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            self.define_self_prediction_rew(convfeat=convfeat,
                                            rep_size=rep_size,
                                            enlargement=enlargement)
            self.step_prediction(convfeat=convfeat,
                                 rep_size=rep_size,
                                 enlargement=enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate

    @staticmethod
    def apply_policy(ph_ob, ph_new, ph_istate, reuse, scope, hidsize, memsize,
                     extrahid, sy_nenvs, sy_nsteps, pdparamsize,
                     rec_gate_init):
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnGruPolicy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())

        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(
                conv(X,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c2',
                     nf=64,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c3',
                     nf=64,
                     rf=4,
                     stride=1,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = to2d(X)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
            # GRUCell here is the codebase's custom recurrent cell: it
            # consumes the (features, episode-start flag) pair so the state
            # is reset at episode boundaries.
            X, snext = tf.nn.dynamic_rnn(GRUCell(memsize,
                                                 rec_gate_init=rec_gate_init),
                                         (X, ph_new[:, :, None]),
                                         dtype=tf.float32,
                                         time_major=False,
                                         initial_state=ph_istate)
            X = tf.reshape(X, (-1, memsize))
            Xtout = X
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1))
                X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
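
    # Shape walk-through for apply_policy: ph_ob (B, T, H, W, C) is flattened
    # to (B*T, H, W, C), passed through the three convs and fc1 to
    # (B*T, hidsize), reshaped to (B, T, hidsize) for the GRU (ph_new resets
    # the recurrent state at episode starts), then flattened back to
    # (B*T, memsize) for the pd and value heads.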

    def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
        #RND.
        # Random target network.
        print('self_predict')
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c1rp_pred',
                         nf=convfeat,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c2rp_pred',
                         nf=convfeat * 2,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c3rp_pred',
                         nf=convfeat * 2,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)
                X_r_hat = tf.nn.relu(
                    fc(rgbrp,
                       'fc1r_hat1_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(X_r_hat,
                       'fc1r_hat2_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(X_r_hat,
                             'fc1r_hat3_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        # Note: self.stepvalues is not defined anywhere in this snippet; it
        # must be set elsewhere in the class. It shifts the predictor targets
        # by a step-count-dependent offset (the same sqrt(stepvalues / 512)
        # term that step_prediction below regresses against).
        self.aux_loss = tf.reduce_mean(
            tf.square(noisy_targets + tf.sqrt(self.stepvalues / 512) -
                      X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)

    def step_prediction(self, convfeat, rep_size, enlargement):
        #RND.
        # Random target network.
        print('step_predict')
        #for ph in self.ph_ob.values():
        #if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
        #logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
        #xr = ph[:,1:]
        #xr = tf.cast(xr, tf.float32)
        #xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
        #xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0)

        #xr = tf.nn.leaky_relu(conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2)))
        #xr = tf.nn.leaky_relu(conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2)))
        #xr = tf.nn.leaky_relu(conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2)))
        #rgbr = [to2d(xr)]
        #X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xstep = ph[:, 0:-1]
                xstep = tf.cast(xstep, tf.float32)
                xstep = tf.reshape(xstep,
                                   (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                   -1:]
                xstep = tf.clip_by_value((xstep - self.ph_mean) / self.ph_std,
                                         -5.0, 5.0)

                xstep = tf.nn.leaky_relu(
                    conv(xstep,
                         'c1step_pred',
                         nf=convfeat,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xstep = tf.nn.leaky_relu(
                    conv(xstep,
                         'c2step_pred',
                         nf=convfeat * 2,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xstep = tf.nn.leaky_relu(
                    conv(xstep,
                         'c3step_pred',
                         nf=convfeat * 2,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xstep)
                X_r_step = tf.nn.relu(
                    fc(rgbrp,
                       'fc1_step_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_step = tf.nn.relu(
                    fc(X_r_step,
                       'fc2_step_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_step = fc(X_r_step,
                              'fc3_step_pred',
                              nh=rep_size,
                              init_scale=np.sqrt(2))

        #self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        #self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_step_rew = tf.reduce_mean(tf.square(X_r_step),
                                           axis=-1,
                                           keep_dims=True)
        self.int_step_rew = tf.reshape(self.int_step_rew,
                                       (self.sy_nenvs, self.sy_nsteps - 1))

        #noisy_targets = tf.stop_gradient(X_r)
        self.step_loss = tf.reduce_mean(
            tf.square(tf.sqrt(self.stepvalues / 512) - X_r_step), -1)
        mask = tf.random_uniform(shape=tf.shape(self.step_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.step_loss = tf.reduce_sum(mask * self.step_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)

    def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
        #Dynamics based bonus.
        print('dynamics predict')
        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
        assert ac_one_hot.get_shape().ndims == 3
        assert ac_one_hot.get_shape().as_list() == [
            None, None, self.ac_space.n
        ], ac_one_hot.get_shape().as_list()
        ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

        def cond(x):
            return tf.concat([x, ac_one_hot], 1)

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, :-1]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))
                # ph_mean, ph_std are 84x84x1, so we subtract the average of the last channel from all channels. Is this ok?
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c1rp_pred',
                         nf=convfeat,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c2rp_pred',
                         nf=convfeat * 2,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c3rp_pred',
                         nf=convfeat * 2,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)

                # X_r_hat = tf.nn.relu(fc(rgb[0], 'fc1r_hat1', nh=256 * enlargement, init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(cond(rgbrp),
                       'fc1r_hat1_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(cond(X_r_hat),
                       'fc1r_hat2_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(cond(X_r_hat),
                             'fc1r_hat3_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)

    def initial_state(self, n):
        return np.zeros((n, self.memsize), np.float32)

    def call(self, dict_obs, new, istate, update_obs_stats=False):
        for ob in dict_obs.values():
            if ob is not None:
                if update_obs_stats:
                    # Not supported; the lines below are unreachable and only
                    # document the intent.
                    raise NotImplementedError
                    ob = ob.astype(np.float32)
                    ob = ob.reshape(-1, *self.ob_space.shape)
                    self.ob_rms.update(ob)
        # Note: if this fails with a ph vs. observations inconsistency, check
        # whether you're loading the agent from disk; it will use whatever
        # observation space was saved to disk along with the other ctor params.
        feed1 = {self.ph_ob[k]: dict_obs[k][:, None] for k in self.ph_ob_keys}
        feed2 = {
            self.ph_istate: istate,
            self.ph_new: new[:, None].astype(np.float32)
        }
        feed1.update({
            self.ph_mean: self.ob_rms.mean,
            self.ph_std: self.ob_rms.var**0.5
        })
        sess = tf.get_default_session()
        a, vpred_int, vpred_ext, nlp, newstate, ent = sess.run(
            [
                self.a_samp, self.vpred_int_rollout, self.vpred_ext_rollout,
                self.nlp_samp, self.snext_rollout, self.entropy_rollout
            ],
            feed_dict={**feed1, **feed2})
        return a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0], newstate, ent[:, 0]
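A hedged usage sketch for CnnGruPolicy.call (a constructed `policy`, an active TF session, and an `obs` batch of shape (nenvs, H, W, C) are assumed to exist; names are illustrative):

import numpy as np

state = policy.initial_state(nenvs)              # (nenvs, memsize) zeros
news = np.ones(nenvs, dtype=np.float32)          # 1.0 marks an episode start
# call() adds the time axis itself via [:, None] and strips it from outputs
a, vpred_int, vpred_ext, nlp, state, ent = policy.call(
    {'obs': obs}, news, state)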
Example #9
class CnnPolicy(StochasticPolicy):
    def __init__(self, scope, ob_space, ac_space,
                 policy_size='normal', maxpool=False, extrahid=True, hidsize=128, memsize=128, rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 exploration_type='bottleneck', beta=0.001, rew_counter=None
                 ):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)

        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        enlargement = {
            'small': 1,
            'normal': 2,
            'large': 4
        }[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2])+[1], name="obmean")  # (84, 84, 1)
        self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2])+[1], name="obstd")    # (84, 84, 1)
        memsize *= enlargement          # memsize = 256
        hidsize *= enlargement          # hidsize = 256
        convfeat = 16*enlargement       # convfeat = 32
        self.ob_rms = RunningMeanStd(shape=list(ob_space.shape[:2])+[1], use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,shape=(None, memsize), name='state')  # (None,256)
        pdparamsize = self.pdtype.param_shape()[0]     # 18, equal to the action dimension
        self.memsize = memsize

        # Inputs to policy and value function will have different shapes depending on whether it is rollout or optimization time, so we treat separately.
        
        # pdparam_opt.shape=(None, None, 18), vpred_int_opt.shape=(None, None), vpred_ext_opt.shape=(None, None), snext_opt.shape=(None, 256)
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_policy(self.ph_ob[None][:,:-1],
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,                  # 256
                              memsize=memsize,                  # 256
                              extrahid=extrahid,                # True
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize)           # 18
                              
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_policy(self.ph_ob[None],
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize)

        self.exploration_type = exploration_type
        self.max_table = 0

        self.define_bottleneck_rew(convfeat=convfeat, rep_size=rep_size // 8, enlargement=enlargement, beta=beta, rew_counter=rew_counter)  # rep_size // 8 = 64 (integer, as fc expects)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)    # distribution over actions (policy softmax)

        self.a_samp = pd.sample()                 # sample an action
        self.nlp_samp = pd.neglogp(self.a_samp)   # negative log-prob of the sampled action
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
        self.a_samp_opt = self.pd_opt.sample()

        self.ph_istate = ph_istate

        self.scope = scope

        
        #############################################
        ########## The code below is never actually used ##########
        #############################################
        # for gradcam policy
        a_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)   # (None,None) -> (None,None,18)
        # equivalent to selecting pdparam_opt at the positions of the executed (one-hot) actions
        loss_cam_pol = tf.reduce_mean(tf.multiply(self.pdparam_opt, a_one_hot))  # (None,)
        
        self.conv_out = tf.get_default_graph().get_tensor_by_name('ppo/pol/Relu_2:0')
        self.grads = tf.gradients(loss_cam_pol, self.conv_out)[0]
        # for gradcam aux
        loss_cam_aux = self.kl
        if int(str(tf.__version__).split('.')[1]) < 10:
            self.conv_aux_out = tf.get_default_graph().get_tensor_by_name('ppo/LeakyRelu_2/Maximum:0')
        else:
            self.conv_aux_out = tf.get_default_graph().get_tensor_by_name('ppo/LeakyRelu_2:0')
        self.grads_aux = tf.abs(tf.gradients(loss_cam_aux, self.conv_aux_out)[0])

        # self.cams is never actually used
        weights = tf.reduce_mean(tf.reduce_mean(self.grads, 2), 1)
        weights = tf.expand_dims(tf.expand_dims(weights, axis=1), axis=1)
        weights = tf.tile(weights, [1, 6, 6, 1])
        cams = tf.reduce_sum((weights * self.conv_out), axis=3)
        self.cams = tf.maximum(cams, tf.zeros_like(cams))

        # self.cams_aux is never actually used
        weights_aux = tf.reduce_mean(tf.reduce_mean(self.grads_aux, 2), 1)
        weights_aux = tf.expand_dims(tf.expand_dims(weights_aux, axis=1), axis=1)
        weights_aux = tf.tile(weights_aux, [1, 7, 7, 1])
        cams_aux = tf.nn.relu(tf.reduce_sum((weights_aux * self.conv_aux_out), axis=3))
        self.cams_aux = tf.maximum(cams_aux, tf.zeros_like(cams_aux))

    @staticmethod
    def apply_policy(ph_ob, reuse, scope, hidsize, memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize):
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))     # shape=(None, 84, 84, 4)

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            # shape: (None, 84, 84, 4) -> (None, 20, 20, 32)
            X = activ(conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format))
            # shape: (None, 20, 20, 32) -> (None, 9, 9, 64)
            X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
            # shape: (None, 9, 9, 64) -> (None, 6, 6, 64)
            X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))
            # (None, 6, 6, 64) -> (None, 2304)
            X = to2d(X)

            mix_other_observations = [X]
            X = tf.concat(mix_other_observations, axis=1)   # (None, 2304)

            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))  # (None, 256)

            additional_size = 448
            X = activ(fc(X, 'fc_additional', nh=additional_size, init_scale=np.sqrt(2)))  # (None, 448)

            snext = tf.zeros((sy_nenvs, memsize))   # (None, 256)
            mix_timeout = [X]
            Xtout = tf.concat(mix_timeout, axis=1)     # (None, 448)
            
            if extrahid:      # True
                Xtout = X + activ(fc(Xtout, 'fc2val', nh=additional_size, init_scale=0.1))   # (None, 448)
                X     = X + activ(fc(X, 'fc2act', nh=additional_size, init_scale=0.1))       # (None, 448)

            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)     # (None, 18)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)     # (None, 1)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)     # (None, 1)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))   # shape=(None, None, 18) 
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))            # shape=(None, None)
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))            # shape=(None, None)
        return pdparam, vpred_int, vpred_ext, snext

    def define_bottleneck_rew(self, convfeat, rep_size, enlargement, beta=1e-2, rew_counter=None):
        # convfeat=32, rep_size=64, enlargement=2, beta=0.001, rew_counter=None
        logger.info("Using Curiosity Bottleneck ****************************************************")
        v_target = tf.reshape(self.ph_ret_ext, (-1, 1))

        if rew_counter is None:
            sched_coef = 1.
        else:
            sched_coef = tf.minimum(rew_counter/1000, 1.)

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C.  ph.shape=(None,None,84,84,4)
                logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
                xr = ph[:,1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]     # (None, 84, 84, 1)
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0)   # (None, 84, 84, 1)
                
                xr = tf.nn.leaky_relu(conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2)))      # (None, 20, 20, 32)
                xr = tf.nn.leaky_relu(conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2)))  # (None, 9, 9, 64)
                xr = tf.nn.leaky_relu(conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2)))  # (None, 7, 7, 64)

                rgbr = [to2d(xr)]             # (None, 3136)
                mu = fc(rgbr[0], 'fc_mu', nh=rep_size, init_scale=np.sqrt(2))   # (None, 64)
                sigma = tf.nn.softplus(fc(rgbr[0], 'fc_sigma', nh=rep_size, init_scale=np.sqrt(2)))   # (None, 64)
                z = mu + sigma * tf.random_normal(tf.shape(mu), 0, 1, dtype=tf.float32)   # (None, 64)
                v = fc(z, 'value', nh=1, init_scale=np.sqrt(2))     # (None, 1)

        self.feat_var = tf.reduce_mean(sigma)
        self.max_feat = tf.reduce_max(tf.abs(z))

        self.kl = 0.5 * tf.reduce_sum(
            tf.square(mu) + tf.square(sigma) - tf.log(1e-8 + tf.square(sigma)) - 1,
            axis=-1, keep_dims=True)
        self.int_rew = tf.stop_gradient(self.kl)
        self.int_rew = tf.reshape(self.int_rew, (self.sy_nenvs, self.sy_nsteps - 1))

        self.aux_loss_raw = sched_coef * tf.square(v_target - v) + beta * self.kl
        # self.aux_loss_raw = beta * self.kl
        self.aux_loss = sched_coef * tf.square(v_target - v) + beta * self.kl   # (None, 1)
        
        # mask: uniform random numbers in [0, 1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0., maxval=1., dtype=tf.float32)  # (None, 1)
        # 1.0 where the sample is kept (all ones when the proportion is 1.)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32)         # (None, 1)
        # average aux_loss over the kept samples only
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(tf.reduce_sum(mask), 1.)   # scalar
        self.v_int = v                     # (None,1)

    def initial_state(self, n):
        return np.zeros((n, self.memsize), np.float32)

    def call(self, dict_obs, new, istate, update_obs_stats=False):
        """
        called when step()
        """
        for ob in dict_obs.values():
            if ob is not None:
                if update_obs_stats:
                    raise NotImplementedError
                    ob = ob.astype(np.float32)
                    ob = ob.reshape(-1, *self.ob_space.shape)
                    self.ob_rms.update(ob)
        # Note: if this fails with a ph vs. observations inconsistency, check
        # whether you're loading the agent from disk; it will use whatever
        # observation space was saved to disk along with the other ctor params.
        feed1 = { self.ph_ob[k]: dict_obs[k][:,None] for k in self.ph_ob_keys }
        feed2 = { self.ph_istate: istate, self.ph_new: new[:,None].astype(np.float32) }
        feed1.update({self.ph_mean: self.ob_rms.mean, self.ph_std: self.ob_rms.var ** 0.5})
        a, vpred_int, vpred_ext, nlp, newstate, ent = tf.get_default_session().run(
            [self.a_samp, self.vpred_int_rollout, self.vpred_ext_rollout,
             self.nlp_samp, self.snext_rollout, self.entropy_rollout],
            feed_dict={**feed1, **feed2})
        return a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0], newstate, ent[:, 0]
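The intrinsic reward in define_bottleneck_rew is the closed-form KL between the encoder's diagonal Gaussian N(mu, sigma^2) and a standard normal prior, KL = 0.5 * sum(mu^2 + sigma^2 - log(sigma^2) - 1). A quick numpy check of the formula (illustrative values, with the same 1e-8 stabilizer as the code):

import numpy as np

mu = np.array([0.5, -1.0])
sigma = np.array([1.2, 0.8])
kl = 0.5 * np.sum(mu**2 + sigma**2 - np.log(1e-8 + sigma**2) - 1.0)
# kl is 0 only when mu = 0 and sigma = 1, i.e. the posterior equals the prior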
Example #11
class PPO_RND(object):
    def __init__(self, ob_space, ac_space, policy_type, args):
        self.gamma = args.gamma
        self.lam = args.lam
        self.adam_epsilon = args.adam_epsilon
        self.clip_param = args.clip_param
        self.entcoeff = args.entcoeff
        self.optim_stepsize = args.optim_stepsize
        self.int_coeff = args.int_coeff
        self.ext_coeff = args.ext_coeff

        self.ob_space = ob_space
        self.ac_space = ac_space

        self.policy_type = policy_type
        if self.policy_type == "coord_cnn":
            self.pi = CoordConvPolicy("pi", self.ob_space, self.ac_space,
                                      args.hidden_size, args.num_hid_layers,
                                      args.kind)
            self.oldpi = CoordConvPolicy("oldpi", self.ob_space, self.ac_space,
                                         args.hidden_size, args.num_hid_layers,
                                         args.kind)

        self.int_rew = RND("rnd_int_rew", self.pi.ob, args)
        self.rff_int = RewardForwardFilter(args.gamma)
        self.rff_rms_int = RunningMeanStd(comm=MPI.COMM_SELF, use_mpi=True)

        self.build_graph()
        U.initialize()
        self.adam.sync()

    def build_graph(self):
        atarg = tf.placeholder(
            dtype=tf.float32,
            shape=[None])  # Target advantage function (if applicable)
        ret_ext = tf.placeholder(dtype=tf.float32,
                                 shape=[None])  # Extrinsic return
        ret_int = tf.placeholder(dtype=tf.float32,
                                 shape=[None])  # Intrinsic return

        lrmult = tf.placeholder(
            name='lrmult', dtype=tf.float32,
            shape=[])  # learning rate multiplier, updated with schedule
        clip_param = self.clip_param * lrmult  # Annealed clipping parameter epsilon

        ob = self.pi.ob
        ac = self.pi.pdtype.sample_placeholder([None])

        kloldnew = self.oldpi.pd.kl(self.pi.pd)
        ent = self.pi.pd.entropy()
        meankl = tf.reduce_mean(kloldnew)
        meanent = tf.reduce_mean(ent)
        pol_entpen = (-self.entcoeff) * meanent

        ratio = tf.exp(self.pi.pd.logp(ac) -
                       self.oldpi.pd.logp(ac))  # pnew / pold
        surr1 = ratio * atarg  # surrogate from conservative policy iteration
        surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                 1.0 + clip_param) * atarg  # clipped surrogate
        pol_surr = -tf.reduce_mean(tf.minimum(
            surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
        vf_ext_loss = tf.reduce_mean(tf.square(self.pi.vpred_ext - ret_ext))
        vf_int_loss = tf.reduce_mean(tf.square(self.pi.vpred_int - ret_int))
        vf_loss = vf_ext_loss + vf_int_loss
        total_loss = pol_surr + pol_entpen + vf_loss + self.int_rew.aux_loss

        self.losses = [
            pol_surr, pol_entpen, vf_ext_loss, vf_int_loss, meankl, meanent,
            self.int_rew.aux_loss
        ]
        self.loss_names = [
            "pol_surr", "pol_entpen", "vf_ext_loss", "vf_int_loss", "kl",
            "ent", "aux_loss"
        ]

        var_list = self.pi.get_trainable_variables(
        ) + self.int_rew.get_trainable_variables()

        self.lossandgrad = U.function(
            [ac, atarg, ret_ext, ret_int, lrmult] + ob,
            self.losses + [U.flatgrad(total_loss, var_list)])
        self.compute_losses = U.function(
            [ac, atarg, ret_ext, ret_int, lrmult] + ob, self.losses)

        self.adam = MpiAdam(var_list, epsilon=self.adam_epsilon)

        self.assign_old_eq_new = U.function(
            [], [],
            updates=[
                tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                    self.oldpi.get_variables(), self.pi.get_variables())
            ])
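
        # Numeric check of the clipped surrogate above: with clip_param = 0.2,
        # ratio = 1.5 and atarg = 2.0, surr1 = 3.0 while surr2 = 1.2 * 2.0 =
        # 2.4, so the objective takes the pessimistic min (pol_surr = -2.4);
        # with atarg < 0 the same min keeps the update pessimistic in the
        # other direction.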

    def train(self, seg, optim_batchsize, optim_epochs):
        # normalize the intrinsic reward by the running std of the discounted reward filter
        rffs_int = np.array(
            [self.rff_int.update(rew) for rew in seg["rew_int"]])
        self.rff_rms_int.update(rffs_int.ravel())
        seg["rew_int"] = seg["rew_int"] / np.sqrt(self.rff_rms_int.var)

        cur_lrmult = 1.0
        add_vtarg_and_adv(seg, self.gamma, self.lam)
        ob, unnorm_ac, atarg_ext, tdlamret_ext, atarg_int, tdlamret_int = seg[
            "ob"], seg["unnorm_ac"], seg["adv_ext"], seg["tdlamret_ext"], seg[
                "adv_int"], seg["tdlamret_int"]
        vpredbefore_ext, vpredbefore_int = seg["vpred_ext"], seg[
            "vpred_int"]  # predicted value functions before the update
        atarg_ext = (atarg_ext - atarg_ext.mean()) / atarg_ext.std(
        )  # standardized advantage function estimate
        atarg_int = (atarg_int - atarg_int.mean()) / atarg_int.std()
        atarg = self.int_coeff * atarg_int + self.ext_coeff * atarg_ext

        d = Dataset(dict(ob=ob,
                         ac=unnorm_ac,
                         atarg=atarg,
                         vtarg_ext=tdlamret_ext,
                         vtarg_int=tdlamret_int),
                    shuffle=not self.pi.recurrent)

        if hasattr(self.pi, "ob_rms"):
            self.pi.update_obs_rms(ob)  # update running mean/std for policy
        if hasattr(self.int_rew, "ob_rms"):
            self.int_rew.update_obs_rms(
                ob)  #update running mean/std for int_rew
        self.assign_old_eq_new(
        )  # set old parameter values to new parameter values
        logger.log2("Optimizing...")
        logger.log2(fmt_row(13, self.loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                lg = self.lossandgrad(batch["ac"], batch["atarg"],
                                      batch["vtarg_ext"], batch["vtarg_int"],
                                      cur_lrmult, *zip(*batch["ob"].tolist()))
                new_losses, g = lg[:-1], lg[-1]
                self.adam.update(g, self.optim_stepsize * cur_lrmult)
                losses.append(new_losses)
            logger.log2(fmt_row(13, np.mean(losses, axis=0)))

        logger.log2("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = self.compute_losses(batch["ac"], batch["atarg"],
                                            batch["vtarg_ext"],
                                            batch["vtarg_int"], cur_lrmult,
                                            *zip(*batch["ob"].tolist()))
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log2(fmt_row(13, meanlosses))

        for (lossval, name) in zipsame(meanlosses, self.loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular(
            "ev_tdlam_ext_before",
            explained_variance(vpredbefore_ext, tdlamret_ext))
        return meanlosses
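train() divides intrinsic rewards by the running standard deviation of a discounted reward filter rather than of the raw rewards. A minimal sketch of that filter, assuming the common definition from the RND codebase (rewems_t = gamma * rewems_{t-1} + r_t, tracked per environment):

import numpy as np

class RewardForwardFilter:
    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None       # running discounted sum, one entry per env

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems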
Example #12
    def __init__(self,
                 scope,
                 ob_space,
                 ac_space,
                 policy_size='normal',
                 maxpool=False,
                 extrahid=True,
                 hidsize=128,
                 memsize=128,
                 rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 dynamics_bonus=False,
                 action_balance_coef=1.,
                 array_action=True):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        self.action_balance_coef = action_balance_coef
        self.array_action = array_action

        self.enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        self.rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= self.enlargement
        hidsize *= self.enlargement
        self.convfeat = 16 * self.enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        # self.int_rew_ab = None
        # self.int_rew_ab_opt = None
        if self.action_balance_coef is not None:
            # self.action_one_hot_list_rollout = get_action_one_hot_list(self.ac_space.n, self.sy_nenvs, self.sy_nsteps)
            # self.action_one_hot_list_opt = get_action_one_hot_list(self.ac_space.n, self.sy_nenvs, self.sy_nsteps - 1)
            # with tf.device('/cpu:0'):
            self.action_one_hot_rollout = get_action_one_hot(
                self.ac_space.n, self.sy_nenvs, self.sy_nsteps)
            # self.action_one_hot_list_opt = get_action_one_hot(self.ac_space.n, self.sy_nenvs, self.sy_nsteps - 1)

            if self.array_action:
                # with tf.device('/cpu:0'):
                self.action_encode_array_rollout = get_action_encode_array(
                    self.ac_space.n, self.sy_nenvs, self.sy_nsteps,
                    ob_space.shape[:2])
                # self.action_encode_array_rollout, self.split_lengths = get_action_encode_array(
                #     self.ac_space.n, self.sy_nenvs, self.sy_nsteps, ob_space.shape[:2])

            self.feat_var_ab, self.max_feat_ab, self.int_rew_ab, self.int_rew_ab_rollout, self.aux_loss_ab = \
                self.define_action_balance_rew(ph_ob=self.ph_ob[None],
                                               action_one_hot=self.action_one_hot_rollout,
                                               convfeat=self.convfeat,
                                               rep_size=self.rep_size, enlargement=self.enlargement,
                                               sy_nenvs=self.sy_nenvs,
                                               sy_nsteps=self.sy_nsteps,
                                               )
            # self.feat_var_ab_opt, self.max_feat_ab_opt, self.int_rew_ab_opt, self.aux_loss_ab = \
            #     self.define_action_balance_rew(ph_ob=self.ph_ob[None][:, :-1],
            #                                    action_one_hot=self.action_one_hot_list_opt,
            #                                    convfeat=self.convfeat,
            #                                    rep_size=self.rep_size, enlargement=self.enlargement,
            #                                    sy_nenvs=self.sy_nenvs,
            #                                    sy_nsteps=self.sy_nsteps - 1,
            #                                    )

            self.pd_ab = self.pdtype.pdfromflat(self.int_rew_ab)

        # Inputs to policy and value function will have different shapes depending on whether it is rollout
        # or optimization time, so we treat separately.
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt, self.logits_raw_opt = \
            self.apply_policy(self.ph_ob[None][:, :-1],
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize
                              )
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout, _ = \
            self.apply_policy(self.ph_ob[None],
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize
                              )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=self.convfeat,
                                                rep_size=self.rep_size,
                                                enlargement=self.enlargement)
        else:
            self.define_self_prediction_rew(convfeat=self.convfeat,
                                            rep_size=self.rep_size,
                                            enlargement=self.enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate
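        # Note on the rollout-vs-optimization split above: the rollout tensors
        # cover all `sy_nsteps` observations, while the *_opt tensors were
        # built from ph_ob[None][:, :-1] and so cover `sy_nsteps - 1` steps;
        # the final observation is kept only to bootstrap value targets and
        # never needs policy gradients.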
Example #13
class MlpPolicy(StochasticPolicy):
    def __init__(
        self,
        scope,
        ob_space,
        ac_space,
        policy_size='small',
        maxpool=False,
        extrahid=True,
        hidsize=128,
        memsize=128,
        rec_gate_init=0.0,
        update_ob_stats_independently_per_gpu=True,
        proportion_of_exp_used_for_predictor_update=1.,
        dynamics_bonus=False,
    ):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape),
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape),
                                     name="obstd")
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape),
            use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]

        rep_size = 16
        memsize *= enlargement
        hidsize *= enlargement
        convfeat = 16 * enlargement

        # Inputs to the policy and value function have different shapes at
        # rollout time and at optimization time, so we treat them separately.
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_policy(self.ph_ob[None][:,:-1],
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize
                              )
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_policy(self.ph_ob[None],
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize
                              )
        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            self.define_self_prediction_rew(convfeat=convfeat,
                                            rep_size=rep_size,
                                            enlargement=enlargement)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate

    @staticmethod
    def apply_policy(ph_ob,
                     reuse,
                     scope,
                     hidsize,
                     memsize,
                     extrahid,
                     sy_nenvs,
                     sy_nsteps,
                     pdparamsize,
                     use_action_balance=None):
        ph = ph_ob
        assert len(ph.shape.as_list()) == 3  # B,T,S
        logger.info("Mlp Policy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32)
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-1:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(fc(X, 'fc_0', nh=hidsize, init_scale=np.sqrt(2)))
            mix_other_observations = [X]
            X = tf.concat(mix_other_observations, axis=1)
            X = activ(fc(X, 'fc_1', nh=hidsize, init_scale=np.sqrt(2)))
            additional_size = 64
            X = activ(
                fc(X,
                   'fc_additional',
                   nh=additional_size,
                   init_scale=np.sqrt(2)))

            snext = tf.zeros((sy_nenvs, memsize))
            mix_timeout = [X]

            Xtout = tf.concat(mix_timeout, axis=1)
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, 'fc2val', nh=additional_size, init_scale=0.1))
                X = X + activ(
                    fc(X, 'fc2act', nh=additional_size, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            # if use_action_balance:

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
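    # Shape contract for apply_policy above: inputs arrive as (B, T, S), are
    # flattened to (B*T, S) for the fully-connected stack, and are reshaped
    # back to per-step tensors at the end. `snext` is all-zeros because the
    # MLP policy keeps no recurrent state (the GRU variants below do).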

    def define_action_balance_rew(self, units, rep_size):
        logger.info(
            "Using Action Balance BONUS ****************************************************"
        )
        # (s, a) seen frequency as bonus
        with tf.variable_scope('action_balance', reuse=tf.AUTO_REUSE):
            ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
            assert ac_one_hot.get_shape().ndims == 3
            assert ac_one_hot.get_shape().as_list() == [
                None, None, self.ac_space.n
            ], ac_one_hot.get_shape().as_list()
            ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

            def cond(x):
                return tf.concat([x, ac_one_hot], 1)

            # Random target network.
            for ph in self.ph_ob.values():
                if len(ph.shape.as_list()) == 3:  # B,T,S
                    logger.info(
                        "Mlp Target: using '%s' shape %s as input" %
                        (ph.name, str(ph.shape)))
                    xr = ph[:, :-1]
                    xr = tf.cast(xr, tf.float32)
                    xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-1:]))
                    xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std,
                                          -5.0, 5.0)

                    xr = tf.nn.relu(
                        fc(cond(xr),
                           'fc_sa0_r',
                           nh=units,
                           init_scale=np.sqrt(2)))
                    xr = tf.nn.relu(
                        fc(cond(xr),
                           'fc_sa1_r',
                           nh=units,
                           init_scale=np.sqrt(2)))
                    X_r = fc(cond(xr),
                             'fc_sa2_r',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

            # Predictor network. Note: its layer names must differ from the
            # target network's ('fc_sa*_r'); with reuse=tf.AUTO_REUSE, reusing
            # those names here would request variables of mismatched shapes
            # (nh=units vs nh=units * 2) and fail at graph-construction time.
            for ph in self.ph_ob.values():
                if len(ph.shape.as_list()) == 3:  # B,T,S
                    logger.info(
                        "Mlp Predictor: using '%s' shape %s as input" %
                        (ph.name, str(ph.shape)))
                    xrp = ph[:, :-1]
                    xrp = tf.cast(xrp, tf.float32)
                    xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-1:]))
                    xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                           -5.0, 5.0)

                    xrp = tf.nn.relu(
                        fc(cond(xrp),
                           'fc_sa0_pred',
                           nh=units * 2,
                           init_scale=np.sqrt(2)))
                    xrp = tf.nn.relu(
                        fc(cond(xrp),
                           'fc_sa1_pred',
                           nh=units * 2,
                           init_scale=np.sqrt(2)))
                    X_r_hat = fc(cond(xrp),
                                 'fc_sa2_pred',
                                 nh=rep_size,
                                 init_scale=np.sqrt(2))

        self.feat_var_ab = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat_ab = tf.reduce_max(tf.abs(X_r))
        self.int_rew_ab = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew_ab = tf.reshape(self.int_rew_ab,
                                     (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
        self.aux_loss_ab = tf.reduce_mean(tf.square(noisy_targets - X_r_hat),
                                          -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss_ab),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss_ab = tf.reduce_sum(mask * self.aux_loss_ab) / tf.maximum(
            tf.reduce_sum(mask), 1.)
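    # The action-balance bonus is RND applied to (state, action) pairs: `cond`
    # concatenates the one-hot action onto the features before both the frozen
    # random target and the trained predictor, so rarely-tried actions in a
    # given state yield high prediction error. A standalone numpy sketch of
    # the masked predictor loss used here appears after this class.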

    def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
        logger.info(
            "Using RND BONUS ****************************************************"
        )
        hidden_size = convfeat * 2

        #RND bonus.

        activ = tf.nn.relu
        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B,T,S
                logger.info("Mlp Target: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]  # get next status index is 1:
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-1:]))
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = activ(
                    fc(xr, 'fc_0_r', nh=hidden_size, init_scale=np.sqrt(2)))
                xr = activ(
                    fc(xr, 'fc_1_r', nh=hidden_size, init_scale=np.sqrt(2)))
                X_r = fc(xr, 'fc_2_r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B,T,S
                logger.info("Mlp Target: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-1:]))
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = activ(
                    fc(xrp, 'fc_0_pred', nh=hidden_size,
                       init_scale=np.sqrt(2)))
                xrp = activ(
                    fc(xrp, 'fc_1_pred', nh=hidden_size,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(xrp,
                             'fc_2_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        targets = tf.stop_gradient(X_r)
        # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
        self.aux_loss = tf.reduce_mean(tf.square(targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
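    # RND bonus recap: X_r comes from a randomly initialized, never-trained
    # target network (note the stop_gradient), X_r_hat from a trained
    # predictor; their squared error on the *next* observation (ph[:, 1:]) is
    # the intrinsic reward, and the random `mask` trains the predictor on only
    # a fraction of samples so the error does not collapse too quickly.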

    def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
        #Dynamics loss with random features.

        activ = tf.nn.relu
        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B,T,S
                logger.info("Mlp Target: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]  # get next status index is 1:
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-1:]))
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = activ(fc(xr, 'fc_0_r', nh=32, init_scale=np.sqrt(2)))
                xr = activ(fc(xr, 'fc_1_r', nh=32, init_scale=np.sqrt(2)))
                X_r = fc(xr, 'fc_2_r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
        assert ac_one_hot.get_shape().ndims == 3
        assert ac_one_hot.get_shape().as_list() == [
            None, None, self.ac_space.n
        ], ac_one_hot.get_shape().as_list()
        ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

        def cond(x):
            return tf.concat([x, ac_one_hot], 1)

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B,T,S
                logger.info("Mlp Target: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-1:]))
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = activ(fc(xrp, 'fc_0_pred', nh=32, init_scale=np.sqrt(2)))
                xrp = activ(fc(xrp, 'fc_1_pred', nh=32, init_scale=np.sqrt(2)))
                X_r_hat = fc(xrp,
                             'fc_2r_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
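    # Dynamics variant: unlike the self-prediction bonus above, the predictor
    # here is conditioned on the one-hot action via `cond`, so it approximates
    # a forward model of the random target features rather than a pure novelty
    # detector.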

    def initial_state(self, n):
        return np.zeros((n, self.memsize), np.float32)

    def call(self, dict_obs, new, istate, update_obs_stats=False):
        for ob in dict_obs.values():
            if ob is not None:
                if update_obs_stats:
                    raise NotImplementedError
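                    # NOTE: the lines below are unreachable (dead code kept
                    # from the original observation-stats update path).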
                    ob = ob.astype(np.float32)
                    ob = ob.reshape(-1, *self.ob_space.shape)
                    self.ob_rms.update(ob)
        # Note: if it fails here with ph vs observations inconsistency, check if you're loading agent from disk.
        # It will use whatever observation spaces saved to disk along with other ctor params.
        feed1 = {self.ph_ob[k]: dict_obs[k][:, None] for k in self.ph_ob_keys}
        feed2 = {
            self.ph_istate: istate,
            self.ph_new: new[:, None].astype(np.float32)
        }
        feed1.update({
            self.ph_mean: self.ob_rms.mean,
            self.ph_std: self.ob_rms.var**0.5
        })
        sess = tf.get_default_session()
        a, vpred_int, vpred_ext, nlp, newstate, ent = sess.run(
            [self.a_samp, self.vpred_int_rollout, self.vpred_ext_rollout,
             self.nlp_samp, self.snext_rollout, self.entropy_rollout],
            feed_dict={**feed1, **feed2})
        return (a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0],
                newstate, ent[:, 0])
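
    # A hedged usage sketch for call() above (the names `policy`, `obs_dict`,
    # and `news` are hypothetical, not identifiers from this codebase): one
    # environment step of a rollout loop.
    #
    #     state = policy.initial_state(nenvs)
    #     acs, vint, vext, nlps, state, ents = policy.call(
    #         obs_dict, news, state, update_obs_stats=False)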
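
# A minimal, self-contained sketch (not part of the original file) of the
# masked RND predictor loss used throughout these policies; the name
# `masked_predictor_loss` and the argument `keep_prob` are illustrative
# assumptions, not identifiers from this codebase.
import numpy as np  # already imported at the top of this file


def masked_predictor_loss(per_sample_loss, keep_prob, rng=np.random):
    # Keep a random ~keep_prob fraction of samples, then average only over
    # the kept ones; this mirrors proportion_of_exp_used_for_predictor_update
    # in the TF graphs above.
    mask = (rng.uniform(size=per_sample_loss.shape) < keep_prob).astype(np.float32)
    return (mask * per_sample_loss).sum() / max(mask.sum(), 1.0)
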
class CnnGruPolicy(StochasticPolicy):
    def __init__(self,
                 scope,
                 ob_space,
                 ac_space,
                 policy_size='normal',
                 maxpool=False,
                 extrahid=True,
                 hidsize=128,
                 memsize=128,
                 rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 dynamics_bonus=False,
                 num_agents=1,
                 rnd_type='rnd',
                 div_type='oracle',
                 indep_rnd=False,
                 indep_policy=False,
                 sd_type='oracle',
                 rnd_mask_prob=1.):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        rep_size = 512

        self.rnd_mask = tf.placeholder(dtype=tf.float32,
                                       shape=(None, None, num_agents),
                                       name="rnd_mask")
        self.new_rnd_mask = tf.placeholder(dtype=tf.float32,
                                           shape=(None, None),
                                           name="new_rnd_mask")
        self.div_train_mask = tf.placeholder(dtype=tf.float32,
                                             shape=(None, None),
                                             name="div_train_mask")
        self.sample_agent_prob = tf.placeholder(dtype=tf.float32,
                                                shape=(
                                                    None,
                                                    None,
                                                ),
                                                name="sample_agent_prob")
        self.stage_label = tf.placeholder(dtype=tf.int32,
                                          shape=(None, None),
                                          name="stage_label")

        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        self.ph_count = tf.placeholder(dtype=tf.float32,
                                       shape=(),
                                       name="obcount")

        self.sep_ph_mean = tf.placeholder(dtype=tf.float32,
                                          shape=(
                                              None,
                                              None,
                                          ) + ob_space.shape[:2] + (1, ),
                                          name="sep_obmean")
        self.sep_ph_std = tf.placeholder(dtype=tf.float32,
                                         shape=(
                                             None,
                                             None,
                                         ) + ob_space.shape[:2] + (1, ),
                                         name="sep_obstd")
        self.sep_ph_count = tf.placeholder(dtype=tf.float32,
                                           shape=(),
                                           name="sep_obcount")

        self.game_score = tf.placeholder(dtype=tf.float32,
                                         shape=(None, None),
                                         name="game_score")
        self.last_rew_ob = tf.placeholder(dtype=ob_space.dtype,
                                          shape=(None, None) +
                                          tuple(ob_space.shape),
                                          name="last_rew_ob")

        self.div_ph_mean = tf.placeholder(dtype=tf.float32,
                                          shape=list(ob_space.shape[:2]) + [1],
                                          name="div_obmean")
        self.div_ph_std = tf.placeholder(dtype=tf.float32,
                                         shape=list(ob_space.shape[:2]) + [1],
                                         name="div_obstd")

        self.idle_agent_label = tf.placeholder(dtype=tf.int32,
                                               shape=(
                                                   None,
                                                   None,
                                               ),
                                               name="idle_agent_label")
        self.rew_agent_label = tf.placeholder(dtype=tf.int32,
                                              shape=(
                                                  None,
                                                  None,
                                              ),
                                              name="rew_agent_label")

        #self.var_ph_mean = tf.get_variable("var_ph_mean", list(ob_space.shape[:2])+[1], initializer=tf.constant_initializer(0.0))
        #self.var_ph_std = tf.get_variable("var_ph_std", list(ob_space.shape[:2])+[1], initializer=tf.constant_initializer(0.0))
        #self.var_ph_count = tf.get_variable("var_ph_count", (), initializer=tf.constant_initializer(0.0))

        self.sd_ph_mean = tf.placeholder(dtype=tf.float32,
                                         shape=list(ob_space.shape[:2]) + [1],
                                         name="sd_obmean")
        self.sd_ph_std = tf.placeholder(dtype=tf.float32,
                                        shape=list(ob_space.shape[:2]) + [1],
                                        name="sd_obstd")

        memsize *= enlargement
        hidsize *= enlargement
        convfeat = 16 * enlargement

        self.ob_rms_list = [RunningMeanStd(shape=list(ob_space.shape[:2])+[1], use_mpi= not update_ob_stats_independently_per_gpu) \
                                for _ in range(num_agents)]
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)

        self.diversity_ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)

        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]

        self.memsize = memsize
        self.num_agents = num_agents
        self.indep_rnd = indep_rnd
        self.indep_policy = indep_policy

        if num_agents <= 0:

            self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
                self.apply_policy(self.ph_ob[None][:,:-1],
                                  ph_new=self.ph_new,
                                  ph_istate=ph_istate,
                                  reuse=False,
                                  scope=scope,
                                  hidsize=hidsize,
                                  memsize=memsize,
                                  extrahid=extrahid,
                                  sy_nenvs=self.sy_nenvs,
                                  sy_nsteps=self.sy_nsteps - 1,
                                  pdparamsize=pdparamsize,
                                  rec_gate_init=rec_gate_init
                                  )
            self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
                self.apply_policy(self.ph_ob[None],
                                  ph_new=self.ph_new,
                                  ph_istate=ph_istate,
                                  reuse=True,
                                  scope=scope,
                                  hidsize=hidsize,
                                  memsize=memsize,
                                  extrahid=extrahid,
                                  sy_nenvs=self.sy_nenvs,
                                  sy_nsteps=self.sy_nsteps,
                                  pdparamsize=pdparamsize,
                                  rec_gate_init=rec_gate_init
                                  )
        else:

            self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
                self.apply_multi_head_policy(self.ph_ob[None][:,:-1],
                                  ph_new=self.ph_new,
                                  ph_istate=ph_istate,
                                  reuse=False,
                                  scope=scope,
                                  hidsize=hidsize,
                                  memsize=memsize,
                                  extrahid=extrahid,
                                  sy_nenvs=self.sy_nenvs,
                                  sy_nsteps=self.sy_nsteps - 1,
                                  pdparamsize=pdparamsize,
                                  rec_gate_init=rec_gate_init
                                  )
            self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
                self.apply_multi_head_policy(self.ph_ob[None],
                                  ph_new=self.ph_new,
                                  ph_istate=ph_istate,
                                  reuse=True,
                                  scope=scope,
                                  hidsize=hidsize,
                                  memsize=memsize,
                                  extrahid=extrahid,
                                  sy_nenvs=self.sy_nenvs,
                                  sy_nsteps=self.sy_nsteps,
                                  pdparamsize=pdparamsize,
                                  rec_gate_init=rec_gate_init
                                  )

        if dynamics_bonus:
            self.define_dynamics_prediction_rew(convfeat=convfeat,
                                                rep_size=rep_size,
                                                enlargement=enlargement)
        else:
            #self.define_self_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
            self.aux_loss, self.int_rew, self.feat_var, self.max_feat = self.define_multi_head_self_prediction_rew(
                convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)

        self.stage_rnd = tf.constant(1.)
        self.stage_prob = tf.constant(1.)

        if div_type == 'cls':
            with tf.variable_scope("div", reuse=False):
                #self.define_rew_discriminator(convfeat=convfeat, rep_size=256)
                with tf.variable_scope("int", reuse=False):
                    self.disc_logits, self.all_div_prob, self.sp_prob, self.div_rew, self.disc_pd, self.disc_nlp = self.define_rew_discriminator_v2(
                        convfeat=convfeat, rep_size=512, use_rew=True)
        else:
            self.div_rew = tf.constant(0.)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate

    @staticmethod
    def apply_policy(ph_ob, ph_new, ph_istate, reuse, scope, hidsize, memsize,
                     extrahid, sy_nenvs, sy_nsteps, pdparamsize,
                     rec_gate_init):
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnGruPolicy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())

        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(
                conv(X,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c2',
                     nf=64,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c3',
                     nf=64,
                     rf=4,
                     stride=1,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = to2d(X)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
            X, snext = tf.nn.dynamic_rnn(GRUCell(memsize,
                                                 rec_gate_init=rec_gate_init),
                                         (X, ph_new[:, :, None]),
                                         dtype=tf.float32,
                                         time_major=False,
                                         initial_state=ph_istate)
            X = tf.reshape(X, (-1, memsize))
            Xtout = X
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1))
                X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
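    # The dynamic_rnn above feeds the GRUCell a (features, ph_new) tuple:
    # ph_new flags episode starts, letting the custom cell mask out the
    # carried recurrent state at episode boundaries instead of leaking memory
    # across episodes.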

    def _build_policy_net(self, X, ph_new, ph_istate, reuse, scope, hidsize,
                          memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize,
                          rec_gate_init):
        activ = tf.nn.relu
        data_format = 'NHWC'

        with tf.variable_scope(scope, reuse=reuse):
            X = activ(
                conv(X,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c2',
                     nf=64,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c3',
                     nf=64,
                     rf=4,
                     stride=1,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = to2d(X)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
            X, snext = tf.nn.dynamic_rnn(GRUCell(memsize,
                                                 rec_gate_init=rec_gate_init),
                                         (X, ph_new[:, :, None]),
                                         dtype=tf.float32,
                                         time_major=False,
                                         initial_state=ph_istate)
            X = tf.reshape(X, (-1, memsize))
            Xtout = X
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1))
                X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

        return pdparam, vpred_int, vpred_ext, snext

    def apply_multi_head_policy(self, ph_ob, ph_new, ph_istate, reuse, scope,
                                hidsize, memsize, extrahid, sy_nenvs,
                                sy_nsteps, pdparamsize, rec_gate_init):

        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnGruPolicy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        yes_gpu = any(get_available_gpus())

        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):

            all_pdparam = []
            all_vint = []
            all_vext = []
            all_snext = []

            for i in range(self.num_agents):

                scope = 'agent_{}'.format(str(i))
                pdparam, vpred_int, vpred_ext, snext = self._build_policy_net(
                    X=X,
                    ph_new=ph_new,
                    ph_istate=ph_istate,
                    scope=scope,
                    reuse=False,
                    hidsize=hidsize,
                    memsize=memsize,
                    extrahid=extrahid,
                    sy_nenvs=sy_nenvs,
                    sy_nsteps=sy_nsteps,
                    pdparamsize=pdparamsize,
                    rec_gate_init=rec_gate_init)

                if i == 0:
                    #[batch, naction] -> [batch, 1, naction]
                    all_pdparam = tf.expand_dims(pdparam, axis=1)
                    #[batch,1] -> [batch,1,1]
                    all_vint = tf.expand_dims(vpred_int, axis=1)
                    all_vext = tf.expand_dims(vpred_ext, axis=1)
                    all_snext = tf.expand_dims(snext, axis=1)
                else:
                    all_pdparam = tf.concat(
                        [all_pdparam,
                         tf.expand_dims(pdparam, axis=1)], axis=1)
                    all_vint = tf.concat(
                        [all_vint, tf.expand_dims(vpred_int, axis=1)], axis=1)
                    all_vext = tf.concat(
                        [all_vext, tf.expand_dims(vpred_ext, axis=1)], axis=1)
                    all_snext = tf.concat(
                        [all_snext, tf.expand_dims(snext, axis=1)], axis=1)

            #[batch, nstep] -> [batch,nstep, ngroups]
            one_hot_gidx = tf.one_hot(self.ph_agent_idx,
                                      self.num_agents,
                                      axis=-1)
            #[batch,nstep, ngroups] -> [batch * nstep, ngroups,1]
            one_hot_gidx = tf.reshape(one_hot_gidx, (-1, self.num_agents, 1))

            pdparam = tf.reduce_sum(one_hot_gidx * all_pdparam, axis=1)
            vpred_int = tf.reduce_sum(one_hot_gidx * all_vint, axis=1)
            vpred_ext = tf.reduce_sum(one_hot_gidx * all_vext, axis=1)
            snext = tf.reduce_sum(one_hot_gidx * all_snext, axis=1)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
            snext = tf.reshape(snext, (sy_nenvs, memsize))

        return pdparam, vpred_int, vpred_ext, snext
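    # Head selection above avoids tf.gather: multiplying by the one-hot agent
    # index and reduce-summing over the head axis picks each sample's head.
    # Illustration (shapes as built above): with all_pdparam of shape
    # [B*T, num_agents, pdparamsize] and one_hot_gidx of [B*T, num_agents, 1],
    # reduce_sum(one_hot_gidx * all_pdparam, axis=1) -> [B*T, pdparamsize].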

    def _build_target_net(self, target_x, scope, reuse, convfeat, rep_size,
                          enlargement):

        with tf.variable_scope(scope, reuse=reuse):
            xr = tf.nn.leaky_relu(
                conv(target_x,
                     'c1r',
                     nf=convfeat * 1,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2)))
            xr = tf.nn.leaky_relu(
                conv(xr,
                     'c2r',
                     nf=convfeat * 2 * 1,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2)))
            xr = tf.nn.leaky_relu(
                conv(xr,
                     'c3r',
                     nf=convfeat * 2 * 1,
                     rf=3,
                     stride=1,
                     init_scale=np.sqrt(2)))
            rgbr = [to2d(xr)]
            X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        return X_r

    def _build_pred_net(self, pred_x, scope, reuse, convfeat, rep_size,
                        enlargement):

        with tf.variable_scope(scope, reuse=reuse):
            xrp = tf.nn.leaky_relu(
                conv(pred_x,
                     'c1rp_pred',
                     nf=convfeat,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(
                conv(xrp,
                     'c2rp_pred',
                     nf=convfeat * 2,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(
                conv(xrp,
                     'c3rp_pred',
                     nf=convfeat * 2,
                     rf=3,
                     stride=1,
                     init_scale=np.sqrt(2)))
            rgbrp = to2d(xrp)
            # X_r_hat = tf.nn.relu(fc(rgb[0], 'fc1r_hat1', nh=256 * enlargement, init_scale=np.sqrt(2)))
            X_r_hat = tf.nn.relu(
                fc(rgbrp,
                   'fc1r_hat1_pred',
                   nh=256 * enlargement,
                   init_scale=np.sqrt(2)))
            X_r_hat = tf.nn.relu(
                fc(X_r_hat,
                   'fc1r_hat2_pred',
                   nh=256 * enlargement,
                   init_scale=np.sqrt(2)))
            X_r_hat = fc(X_r_hat,
                         'fc1r_hat3_pred',
                         nh=rep_size,
                         init_scale=np.sqrt(2))

        return X_r_hat
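    # Note the capacity asymmetry between the two nets above: the target net
    # ends in a single linear layer, while the predictor adds two
    # 256 * enlargement ReLU layers, giving it enough capacity to fit the
    # random target on frequently visited observations (driving their bonus
    # toward zero).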

    def define_multi_head_self_prediction_rew(self, convfeat, rep_size,
                                              enlargement):
        logger.info(
            "Using multi-head RND BONUS ****************************************************"
        )

        #RND bonus.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]

                ph_mean = tf.reshape(
                    self.sep_ph_mean,
                    (-1, *self.sep_ph_mean.shape.as_list()[-3:]))
                ph_std = tf.reshape(
                    self.sep_ph_std,
                    (-1, *self.sep_ph_std.shape.as_list()[-3:]))

                target_x = xr = tf.clip_by_value((xr - ph_mean) / ph_std, -5.0,
                                                 5.0)

                all_target_out = []

                #target_out = self._build_target_net(target_x, 'target_net', False, convfeat, rep_size, enlargement)
                for i in range(self.num_agents):

                    scope = 'target_net_{}'.format(str(i))
                    target_out = self._build_target_net(
                        target_x, scope, tf.AUTO_REUSE, convfeat, rep_size,
                        enlargement)

                    if i == 0:
                        #[env*step, rep_size] -> [env*step, 1, rep_size]
                        all_target_out = tf.expand_dims(target_out, axis=1)
                    else:
                        #[env*step, 1, rep_size] -> [env*step, num_agents , rep_size]
                        all_target_out = tf.concat([
                            all_target_out,
                            tf.expand_dims(target_out, axis=1)
                        ],
                                                   axis=1)

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))

                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]

                ph_mean = tf.reshape(
                    self.sep_ph_mean,
                    (-1, *self.sep_ph_mean.shape.as_list()[-3:]))
                ph_std = tf.reshape(
                    self.sep_ph_std,
                    (-1, *self.sep_ph_std.shape.as_list()[-3:]))

                pred_x = xrp = tf.clip_by_value((xrp - ph_mean) / ph_std, -5.0,
                                                5.0)

                all_pred_out = []
                for i in range(self.num_agents):

                    scope = 'pred_net_{}'.format(str(i))
                    pred_out = self._build_pred_net(pred_x, scope,
                                                    tf.AUTO_REUSE, convfeat,
                                                    rep_size, enlargement)

                    if i == 0:
                        #[env*step, rep_size] -> [env*step, 1, rep_size]
                        all_pred_out = tf.expand_dims(pred_out, axis=1)
                    else:
                        #[env*step, 1, rep_size] -> [env*step, num_agents , rep_size]
                        all_pred_out = tf.concat(
                            [all_pred_out,
                             tf.expand_dims(pred_out, axis=1)],
                            axis=1)

        #[env*step, num_agents , rep_size] -> [env*step, num_agents , 1]
        all_loss = tf.reduce_mean(
            tf.square(tf.stop_gradient(all_target_out) - all_pred_out),
            axis=-1,
            keep_dims=True)

        #[batch, nstep] -> [batch,nstep, ngroups]
        one_hot_gidx = tf.one_hot(self.ph_agent_idx, self.num_agents, axis=-1)
        #[batch,nstep, ngroups] -> [batch * nstep, ngroups,1]
        one_hot_gidx = tf.reshape(one_hot_gidx, (-1, self.num_agents, 1))

        X_r = tf.reduce_sum(one_hot_gidx * all_target_out, axis=1)

        feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        max_feat = tf.reduce_max(tf.abs(X_r))
        #[env*step, num_agents , 1] -> [env*step, 1]
        int_rew = tf.reduce_sum(one_hot_gidx * all_loss, axis=1)
        int_rew = tf.reshape(int_rew, (self.sy_nenvs, self.sy_nsteps - 1))

        #[env*step, num_agents ,1]
        rnd_mask = tf.reshape(self.rnd_mask, (-1, self.num_agents, 1))
        rnd_mask = tf.cast(rnd_mask, tf.float32)

        #[env*step, num_agents , 1] -> [env*step]
        mask_loss = tf.reduce_sum(rnd_mask * all_loss,
                                  axis=[1, 2]) / tf.maximum(
                                      tf.reduce_sum(rnd_mask, axis=[1, 2]), 1.)
        aux_loss = mask_loss
        mask = tf.random_uniform(shape=tf.shape(aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        aux_loss = tf.reduce_sum(mask * aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)

        return aux_loss, int_rew, feat_var, max_feat
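
The masked mean at the end of this loss implements the usual RND trick for slowing the predictor relative to the amount of experience: each sample is kept with probability proportion_of_exp_used_for_predictor_update, and the loss is averaged only over kept samples. A minimal NumPy sketch of that masked mean (the function name is illustrative):

import numpy as np

def masked_predictor_loss(per_sample_loss, keep_prob, rng=np.random):
    # Keep each sample with probability keep_prob, then average only over
    # the kept samples; the max(., 1) guards against an all-zero mask.
    mask = (rng.uniform(size=per_sample_loss.shape) < keep_prob).astype(np.float32)
    return (mask * per_sample_loss).sum() / max(mask.sum(), 1.0)

loss = masked_predictor_loss(np.array([0.5, 2.0, 1.0, 3.0]), keep_prob=0.25)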

    def define_rew_discriminator_v2(self, convfeat, rep_size, use_rew=False):

        output_shape = [self.sy_nenvs * (self.sy_nsteps - 1)]

        sample_prob = tf.reshape(self.sample_agent_prob,
                                 tf.stack(output_shape))
        game_score = tf.reshape(
            self.game_score,
            tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1]))

        rew_agent_label = tf.reshape(
            self.rew_agent_label,
            tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1]))

        #rew_agent_label = tf.one_hot(self.rew_agent_label, self.num_agents, axis=-1)
        #rew_agent_label = tf.reshape(rew_agent_label,(-1,self.num_agents ))

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C

                phi = ph[:, 1:]
                phi = tf.cast(phi, tf.float32)
                phi = tf.reshape(phi, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]
                phi = phi / 255.

                last_rew_ob = self.last_rew_ob
                last_rew_ob = tf.cast(last_rew_ob, tf.float32)
                last_rew_ob = tf.reshape(
                    last_rew_ob,
                    (-1, *last_rew_ob.shape.as_list()[-3:]))[:, :, :, -1:]
                last_rew_ob = last_rew_ob / 255.

                if use_rew:
                    phi = tf.concat([phi, last_rew_ob], axis=-1)

                phi = tf.nn.leaky_relu(
                    conv(phi,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                #[20,20] [8,8]
                phi = tf.nn.leaky_relu(
                    conv(phi,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                #[9,9] [7,7]
                phi = tf.nn.leaky_relu(
                    conv(phi,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                phi = to2d(phi)

                phi = tf.nn.relu(
                    fc(phi, 'fc1r', nh=rep_size, init_scale=np.sqrt(2)))
                phi = tf.nn.relu(
                    fc(phi, 'fc2r', nh=rep_size, init_scale=np.sqrt(2)))
                disc_logits = fc(phi,
                                 'fc3r',
                                 nh=self.num_agents,
                                 init_scale=np.sqrt(2))

        one_hot_gidx = tf.one_hot(self.ph_agent_idx, self.num_agents, axis=-1)
        one_hot_gidx = tf.reshape(one_hot_gidx, (-1, self.num_agents))

        flatten_all_div_prob = tf.nn.softmax(disc_logits, axis=-1)
        all_div_prob = tf.reshape(
            flatten_all_div_prob,
            (self.sy_nenvs, self.sy_nsteps - 1, self.num_agents))

        sp_prob = tf.reduce_sum(one_hot_gidx * flatten_all_div_prob, axis=1)
        sp_prob = tf.reshape(sp_prob, (self.sy_nenvs, self.sy_nsteps - 1))

        div_rew = -1 * tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=disc_logits, labels=one_hot_gidx)
        # note: base_rew is currently unused in the reward computed below
        base_rew = tf.log(0.01)
        div_rew = div_rew - tf.log(sample_prob)

        div_rew = tf.reshape(div_rew, (self.sy_nenvs, self.sy_nsteps - 1))

        disc_pdtype = CategoricalPdType(self.num_agents)
        disc_pd = disc_pdtype.pdfromflat(disc_logits)

        disc_nlp = disc_pd.neglogp(rew_agent_label)

        return disc_logits, all_div_prob, sp_prob, div_rew, disc_pd, disc_nlp
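
The diversity reward above is the log-probability the discriminator assigns to the acting agent, minus the log of the probability with which that agent was sampled. A NumPy sketch of the same quantity, using a log-softmax in place of softmax_cross_entropy_with_logits_v2 (names are illustrative):

import numpy as np

def diversity_reward(disc_logits, agent_idx, sample_prob):
    # log-softmax of the discriminator logits, evaluated at the acting
    # agent, minus log(sample_prob) as a sampling correction.
    shifted = disc_logits - disc_logits.max(axis=-1, keepdims=True)
    logp = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
    return logp[np.arange(len(agent_idx)), agent_idx] - np.log(sample_prob)

rew = diversity_reward(np.array([[2.0, 0.5, -1.0]]), np.array([0]),
                       np.array([1.0 / 3.0]))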

    def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
        #RND.
        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c1rp_pred',
                         nf=convfeat,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c2rp_pred',
                         nf=convfeat * 2,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c3rp_pred',
                         nf=convfeat * 2,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)
                X_r_hat = tf.nn.relu(
                    fc(rgbrp,
                       'fc1r_hat1_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(X_r_hat,
                       'fc1r_hat2_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(X_r_hat,
                             'fc1r_hat3_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
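
define_self_prediction_rew is plain RND: a fixed, randomly initialized target network embeds each observation, a trained predictor tries to match that embedding, and the per-sample prediction error becomes the intrinsic reward. A toy NumPy version with linear stand-ins for the conv stacks (all names illustrative):

import numpy as np

rng = np.random.RandomState(0)
W_target = rng.randn(64, 16)   # frozen random "target" projection
W_pred = np.zeros((64, 16))    # predictor weights, trained to match it

def rnd_bonus(obs_feat):
    # Intrinsic reward = per-sample MSE between the fixed random target
    # embedding and the predictor's embedding; observations the predictor
    # has not yet fit (novel states) get larger bonuses.
    return np.mean((obs_feat @ W_target - obs_feat @ W_pred) ** 2, axis=-1)

bonus = rnd_bonus(rng.randn(8, 64))   # one bonus per observation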

    def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
        #Dynamics based bonus.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
        assert ac_one_hot.get_shape().ndims == 3
        assert ac_one_hot.get_shape().as_list() == [
            None, None, self.ac_space.n
        ], ac_one_hot.get_shape().as_list()
        ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

        def cond(x):
            # Condition the predictor's features on the action taken.
            return tf.concat([x, ac_one_hot], 1)

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, :-1]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))
                # ph_mean, ph_std are 84x84x1, so we subtract the average of the last channel from all channels. Is this ok?
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c1rp_pred',
                         nf=convfeat,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c2rp_pred',
                         nf=convfeat * 2,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c3rp_pred',
                         nf=convfeat * 2,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)

                # X_r_hat = tf.nn.relu(fc(rgb[0], 'fc1r_hat1', nh=256 * enlargement, init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(cond(rgbrp),
                       'fc1r_hat1_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(cond(X_r_hat),
                       'fc1r_hat2_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(cond(X_r_hat),
                             'fc1r_hat3_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
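
The difference from the RND head above is that this predictor is conditioned on the action through cond(), turning the bonus into a forward-dynamics error in the random target's feature space. A small NumPy sketch of the conditioning step (names illustrative):

import numpy as np

def one_hot(acs, n):
    out = np.zeros((len(acs), n), np.float32)
    out[np.arange(len(acs)), acs] = 1.0
    return out

def cond(x, ac_oh):
    # Mirrors the cond() helper above: append the one-hot action to the
    # feature vector before every fc layer of the predictor.
    return np.concatenate([x, ac_oh], axis=1)

feats = np.random.randn(4, 32).astype(np.float32)
conditioned = cond(feats, one_hot(np.array([0, 2, 1, 3]), n=4))  # (4, 36)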

    def initial_state(self, n):
        return np.zeros((n, self.memsize), np.float32)

    def call(self, dict_obs, new, istate, agent_idx, update_obs_stats=False):
        for ob in dict_obs.values():
            if ob is not None:
                if update_obs_stats:
                    raise NotImplementedError
                    ob = ob.astype(np.float32)
                    ob = ob.reshape(-1, *self.ob_space.shape)
                    self.ob_rms.update(ob)
        # Note: if it fails here with ph vs observations inconsistency, check if you're loading agent from disk.
        # It will use whatever observation spaces saved to disk along with other ctor params.
        feed1 = {self.ph_ob[k]: dict_obs[k][:, None] for k in self.ph_ob_keys}
        feed2 = {
            self.ph_istate: istate,
            self.ph_new: new[:, None].astype(np.float32)
        }
        #feed1.update({self.ph_mean: self.ob_rms.mean, self.ph_std: self.ob_rms.var ** 0.5})

        feed1.update({self.ph_agent_idx: agent_idx})
        # for f in feed1:
        #     print(f)
        a, vpred_int, vpred_ext, nlp, newstate, ent = tf_util.get_session().run(
            [self.a_samp, self.vpred_int_rollout, self.vpred_ext_rollout,
             self.nlp_samp, self.snext_rollout, self.entropy_rollout],
            feed_dict={**feed1, **feed2})

        base_vpred_ext = np.ones_like(vpred_ext)

        return (a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0],
                newstate, ent[:, 0], base_vpred_ext[:, 0])

    def get_ph_mean_std(self):
        mean, std = tf.get_default_session().run(
            [self.var_ph_mean, self.var_ph_std])

        return mean, std
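
get_ph_mean_std exposes the running observation statistics behind ph_mean and ph_std; the target and predictor inputs above are whitened with them and clipped to [-5, 5]. A minimal sketch of that normalization (the function name is illustrative):

import numpy as np

def normalize_obs(x, mean, std, clip=5.0):
    # Same whitening applied to the RND inputs above: standardize with
    # running stats, then clip to [-clip, clip].
    return np.clip((x - mean) / std, -clip, clip)

frame = np.random.randint(0, 255, (84, 84, 1)).astype(np.float32)
normed = normalize_obs(frame, mean=frame.mean(), std=frame.std() + 1e-8)
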
Example #16
0
class CnnGruPolicy(StochasticPolicy):
    def __init__(self,
                 scope,
                 ob_space,
                 ac_space,
                 policy_size='normal',
                 maxpool=False,
                 extrahid=True,
                 hidsize=128,
                 memsize=128,
                 rec_gate_init=0.0,
                 update_ob_stats_independently_per_gpu=True,
                 proportion_of_exp_used_for_predictor_update=1.,
                 exploration_type='bottleneck',
                 beta=1e-3,
                 rew_counter=None):
        StochasticPolicy.__init__(self, scope, ob_space, ac_space)
        self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
        enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
        rep_size = 512
        self.ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=list(ob_space.shape[:2]) + [1],
                                      name="obmean")
        self.ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=list(ob_space.shape[:2]) + [1],
                                     name="obstd")
        memsize *= enlargement
        hidsize *= enlargement
        convfeat = 16 * enlargement
        self.ob_rms = RunningMeanStd(
            shape=list(ob_space.shape[:2]) + [1],
            use_mpi=not update_ob_stats_independently_per_gpu)
        ph_istate = tf.placeholder(dtype=tf.float32,
                                   shape=(None, memsize),
                                   name='state')
        pdparamsize = self.pdtype.param_shape()[0]
        self.memsize = memsize

        # For training
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_policy(self.ph_ob[None][:,:-1],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init
                              )
        # For inference
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_policy(self.ph_ob[None],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init
                              )

        self.define_bottleneck_rew(convfeat=convfeat,
                                   rep_size=rep_size // 8,  # fc widths must be ints
                                   enlargement=enlargement,
                                   beta=beta,
                                   rew_counter=rew_counter)

        pd = self.pdtype.pdfromflat(self.pdparam_rollout)
        self.a_samp = pd.sample()
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.entropy_rollout = pd.entropy()
        self.pd_rollout = pd

        self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)

        self.ph_istate = ph_istate

    @staticmethod
    def apply_policy(ph_ob, ph_new, ph_istate, reuse, scope, hidsize, memsize,
                     extrahid, sy_nenvs, sy_nsteps, pdparamsize,
                     rec_gate_init):
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnGruPolicy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())

        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(
                conv(X,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c2',
                     nf=64,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c3',
                     nf=64,
                     rf=4,
                     stride=1,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = to2d(X)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
            X, snext = tf.nn.dynamic_rnn(GRUCell(memsize,
                                                 rec_gate_init=rec_gate_init),
                                         (X, ph_new[:, :, None]),
                                         dtype=tf.float32,
                                         time_major=False,
                                         initial_state=ph_istate)
            X = tf.reshape(X, (-1, memsize))
            Xtout = X
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1))
                X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
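
apply_policy runs the convolutional stack on observations flattened to [B*T, H, W, C] and restores the [B, T] leading dimensions only around the GRU. A NumPy sketch of that reshape round trip (shapes illustrative):

import numpy as np

nenvs, nsteps, h, w, c = 2, 3, 84, 84, 4
x = np.zeros((nenvs, nsteps, h, w, c), np.float32)

flat = x.reshape(-1, h, w, c)            # (B*T, H, W, C): what the convs see
feat = flat.reshape(nenvs, nsteps, -1)   # back to (B, T, features) for the GRU
assert feat.shape[:2] == (nenvs, nsteps)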

    def define_bottleneck_rew(self,
                              convfeat,
                              rep_size,
                              enlargement,
                              beta=1e-2,
                              rew_counter=None):
        logger.info(
            "Using Curiosity Bottleneck ****************************************************"
        )
        v_target = tf.reshape(self.ph_ret_ext, (-1, 1))

        if rew_counter is None:
            sched_coef = 1.
        else:
            sched_coef = tf.minimum(rew_counter / 1000, 1.)

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                mu = fc(rgbr[0], 'fc_mu', nh=rep_size, init_scale=np.sqrt(2))
                sigma = tf.nn.softplus(
                    fc(rgbr[0], 'fc_sigma', nh=rep_size,
                       init_scale=np.sqrt(2)))
                z = mu + sigma * tf.random_normal(
                    tf.shape(mu), 0, 1, dtype=tf.float32)
                v = fc(z, 'value', nh=1, init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(sigma)
        self.max_feat = tf.reduce_max(tf.abs(z))

        self.kl = 0.5 * tf.reduce_sum(tf.square(mu) + tf.square(sigma) -
                                      tf.log(1e-8 + tf.square(sigma)) - 1,
                                      axis=-1,
                                      keep_dims=True)
        self.int_rew = tf.stop_gradient(self.kl)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        self.aux_loss = sched_coef * tf.square(v_target - v) + beta * self.kl
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
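
The kl term above is the closed-form KL divergence between the encoder's diagonal Gaussian N(mu, sigma^2) and the standard normal prior, KL = 0.5 * sum(mu^2 + sigma^2 - log(sigma^2) - 1). A NumPy check of the same expression (the epsilon mirrors the 1e-8 in the graph):

import numpy as np

def kl_to_standard_normal(mu, sigma, eps=1e-8):
    # Closed-form KL( N(mu, diag(sigma^2)) || N(0, I) ), matching the
    # tf expression above; eps keeps the log finite as sigma -> 0.
    return 0.5 * np.sum(mu**2 + sigma**2 - np.log(eps + sigma**2) - 1.0,
                        axis=-1, keepdims=True)

kl = kl_to_standard_normal(np.zeros((1, 4)), np.ones((1, 4)))  # ~0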

    def initial_state(self, n):
        return np.zeros((n, self.memsize), np.float32)

    def call(self, dict_obs, new, istate, update_obs_stats=False):
        for ob in dict_obs.values():
            if ob is not None:
                if update_obs_stats:
                    raise NotImplementedError
                    ob = ob.astype(np.float32)
                    ob = ob.reshape(-1, *self.ob_space.shape)
                    self.ob_rms.update(ob)
        # Note: if it fails here with ph vs observations inconsistency, check if you're loading agent from disk.
        # It will use whatever observation spaces saved to disk along with other ctor params.
        feed1 = {self.ph_ob[k]: dict_obs[k][:, None] for k in self.ph_ob_keys}
        feed2 = {
            self.ph_istate: istate,
            self.ph_new: new[:, None].astype(np.float32)
        }
        feed1.update({
            self.ph_mean: self.ob_rms.mean,
            self.ph_std: self.ob_rms.var**0.5
        })
        # for f in feed1:
        #     print(f)
        a, vpred_int, vpred_ext, nlp, newstate, ent = tf.get_default_session().run(
            [self.a_samp, self.vpred_int_rollout, self.vpred_ext_rollout,
             self.nlp_samp, self.snext_rollout, self.entropy_rollout],
            feed_dict={**feed1, **feed2})
        return (a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0],
                newstate, ent[:, 0])

    def __init__(self, ob_space, ac_space, nsteps, gamma, venvs, stochpol, comm):
        self.lump_stride = venvs[0].num_envs
        self.venvs = venvs
        assert all(venv.num_envs == self.lump_stride for venv in self.venvs[1:]), 'All venvs should have the same num_envs'
        self.nlump = len(venvs)
        nenvs = self.nenvs = self.nlump * self.lump_stride
        self.reset_counter = 0
        self.env_results = [None] * self.nlump
        self.buf_vpreds_int = np.zeros((nenvs, nsteps), np.float32)
        self.buf_vpreds_ext = np.zeros((nenvs, nsteps), np.float32)
        self.buf_nlps = np.zeros((nenvs, nsteps), np.float32)
        self.buf_advs = np.zeros((nenvs, nsteps), np.float32)
        self.buf_advs_int = np.zeros((nenvs, nsteps), np.float32)
        self.buf_advs_ext = np.zeros((nenvs, nsteps), np.float32)
        self.buf_rews_int = np.zeros((nenvs, nsteps), np.float32)
        self.buf_rews_ext = np.zeros((nenvs, nsteps), np.float32)

        self.buf_rews_ec = np.zeros((nenvs, nsteps), np.float32)

        self.buf_acs = np.zeros((nenvs, nsteps, *ac_space.shape), ac_space.dtype)
        self.buf_obs = { k: np.zeros(
                            [nenvs, nsteps] + stochpol.ph_ob[k].shape.as_list()[2:],
                            dtype=stochpol.ph_ob_dtypes[k])
                        for k in stochpol.ph_ob_keys }
        self.buf_ob_last = { k: self.buf_obs[k][:, 0, ...].copy() for k in stochpol.ph_ob_keys }
        self.buf_epinfos = [{} for _ in range(self.nenvs)]
        self.buf_news = np.zeros((nenvs, nsteps), np.float32)
        self.buf_ent = np.zeros((nenvs, nsteps), np.float32)
        self.mem_state = stochpol.initial_state(nenvs)
        self.seg_init_mem_state = copy(self.mem_state) # Memory state at beginning of segment of timesteps
        self.rff_int = RewardForwardFilter(gamma)
        self.rff_rms_int = RunningMeanStd(comm=comm, use_mpi=True)
        self.buf_new_last = self.buf_news[:, 0, ...].copy()
        self.buf_vpred_int_last = self.buf_vpreds_int[:, 0, ...].copy()
        self.buf_vpred_ext_last = self.buf_vpreds_ext[:, 0, ...].copy()
        self.step_count = 0 # counts number of timesteps that you've interacted with this set of environments
        self.t_last_update = time.time()
        self.statlists = defaultdict(lambda : deque([], maxlen=100)) # Count other stats, e.g. optimizer outputs
        self.stats = defaultdict(float) # Count episodes and timesteps
        self.stats['epcount'] = 0
        self.stats['n_updates'] = 0
        self.stats['tcount'] = 0
        self.stats['nbatch'] = 0

        self.buf_scores = np.zeros((nenvs), np.float32)
        self.buf_nsteps = np.zeros((nenvs), np.float32)
        self.buf_reset = np.zeros((nenvs), np.float32)

        self.buf_ep_raminfos = [{} for _ in range(self.nenvs)]

        self.oracle_visited_count = oracle.OracleExplorationRewardForAllEpisodes()

        self.cur_gen_idx = 0
        self.rews_found_by_ancestors = {}
        self.last_gen_policy = None
        self.last_gen_rnd = None

        self.target_max_rews = 20000
        self.max_npolicies = 5
        self.max_nsteps_training_single_policy = 10 * 1e6

        self.plan_step = 3

        self.reset_for_new_generation()
        self.reset_for_new_policy()
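
rff_int and rff_rms_int are set up so intrinsic rewards can be scaled by a running estimate of the standard deviation of their discounted sum, the usual RND normalization. A simplified plain-Python sketch of that scheme (the class is a stand-in consistent with how RewardForwardFilter is used here, and the crude variance update substitutes for RunningMeanStd):

import numpy as np

class RewardForwardFilterSketch:
    # Discounted running sum of intrinsic rewards, kept per environment.
    def __init__(self, gamma):
        self.gamma, self.rewems = gamma, None

    def update(self, rews):
        self.rewems = rews if self.rewems is None else self.rewems * self.gamma + rews
        return self.rewems

rff, running_var, count = RewardForwardFilterSketch(0.99), 1.0, 1e-4
for _ in range(10):
    rews_int = np.random.rand(8).astype(np.float32)      # raw bonuses
    rffs = rff.update(rews_int)
    batch = len(rffs)
    running_var += (np.var(rffs) - running_var) * batch / (count + batch)
    count += batch
    normed = rews_int / np.sqrt(running_var)             # what training uses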