Example #1
    def apply_policy(ph_ob, ph_new, ph_istate, reuse, scope, hidsize, memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize, rec_gate_init):
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnGruPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())

        with tf.variable_scope(scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format))
            X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
            X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))
            X = to2d(X)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
            X, snext = tf.nn.dynamic_rnn(
                GRUCell(memsize, rec_gate_init=rec_gate_init), (X, ph_new[:,:,None]),
                dtype=tf.float32, time_major=False, initial_state=ph_istate)
            X = tf.reshape(X, (-1, memsize))
            Xtout = X
            if extrahid:
                Xtout = X + activ(fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1))
                X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
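The GRU-based examples (#1, #7, #8) pass a `(features, new_episode_flags)` tuple into a custom `GRUCell(memsize, rec_gate_init=...)` rather than TensorFlow's stock cell. Below is a minimal sketch of the kind of mask-resetting cell they assume, written against TF 1.x with `tf.layers.dense` standing in for the `fc` helper; the class name, gate layout, and scope names here are illustrative, not the exact upstream implementation.

    import tensorflow as tf

    class MaskedGRUCell(tf.nn.rnn_cell.RNNCell):
        """GRU cell that zeroes its state wherever the `new` flag is 1."""

        def __init__(self, num_units, rec_gate_init=-1.0):
            super(MaskedGRUCell, self).__init__()
            self._num_units = num_units
            self._rec_gate_init = rec_gate_init

        @property
        def state_size(self):
            return self._num_units

        @property
        def output_size(self):
            return self._num_units

        def call(self, inputs, state):
            x, new = inputs                  # new: 1.0 at episode boundaries
            h = state * (1.0 - new)          # reset memory at episode starts
            hx = tf.concat([h, x], axis=1)
            # Update/reset gates; rec_gate_init sets their initial bias.
            mr = tf.sigmoid(tf.layers.dense(
                hx, 2 * self._num_units, name='mr',
                bias_initializer=tf.constant_initializer(self._rec_gate_init)))
            m, r = tf.split(mr, 2, axis=1)
            htil = tf.tanh(tf.layers.dense(
                tf.concat([r * h, x], axis=1), self._num_units, name='htil'))
            h = m * h + (1.0 - m) * htil
            return h, h

With such a cell, `tf.nn.dynamic_rnn(MaskedGRUCell(memsize, rec_gate_init), (X, ph_new[:, :, None]), ...)` matches the call sites in the examples.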
Example #2
    def apply_policy(ph_ob, reuse, scope, hidsize, memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize):
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format))
            X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
            X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))
            X = to2d(X)
            mix_other_observations = [X]
            X = tf.concat(mix_other_observations, axis=1)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            additional_size = 448
            X = activ(fc(X, 'fc_additional', nh=additional_size, init_scale=np.sqrt(2)))
            snext = tf.zeros((sy_nenvs, memsize))
            mix_timeout = [X]

            Xtout = tf.concat(mix_timeout, axis=1)
            if extrahid:
                Xtout = X + activ(fc(Xtout, 'fc2val', nh=additional_size, init_scale=0.1))
                X     = X + activ(fc(X, 'fc2act', nh=additional_size, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int   = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext   = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
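Across these examples, `pdparam` parameterizes the action distribution while the two heads `vpred_int`/`vpred_ext` predict intrinsic and extrinsic returns separately. For a discrete action space the returned logits might be consumed as follows (hypothetical usage; the distribution wiring lives elsewhere in the original code):

    # pdparam: (sy_nenvs, sy_nsteps, pdparamsize) logits over discrete actions
    pd = tf.distributions.Categorical(logits=pdparam)
    sampled_ac = pd.sample()            # (sy_nenvs, sy_nsteps) integer actions
    neglogp = -pd.log_prob(sampled_ac)  # per-step -log pi(a|s) for the PPO loss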
Example #3
    def apply_policy(ph_ob,
                     reuse,
                     scope,
                     hidsize,
                     memsize,
                     extrahid,
                     sy_nenvs,
                     sy_nsteps,
                     pdparamsize,
                     use_action_balance=None):
        ph = ph_ob
        assert len(ph.shape.as_list()) == 3  # B,T,S
        logger.info("Mlp Policy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32)
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-1:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(fc(X, 'fc_0', nh=hidsize, init_scale=np.sqrt(2)))
            mix_other_observations = [X]
            X = tf.concat(mix_other_observations, axis=1)
            X = activ(fc(X, 'fc_1', nh=hidsize, init_scale=np.sqrt(2)))
            additional_size = 64
            X = activ(
                fc(X,
                   'fc_additional',
                   nh=additional_size,
                   init_scale=np.sqrt(2)))

            snext = tf.zeros((sy_nenvs, memsize))
            mix_timeout = [X]

            Xtout = tf.concat(mix_timeout, axis=1)
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, 'fc2val', nh=additional_size, init_scale=0.1))
                X = X + activ(
                    fc(X, 'fc2act', nh=additional_size, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            # if use_action_balance:

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
Example #4
    def __init__(self, orders, d, coeffs, name=None):
        self.orders = orders
        self.d = d
        self.coeffs = coeffs

        self.input_dim = self.orders.shape[1]
        self.name = name

        gpu_devices = U.get_available_gpus()
        if gpu_devices:
            device = gpu_devices[0]
        else:
            cpu_devices = U.get_available_cpus()
            device = cpu_devices[0]

        with tf.device(device):
            self.declare_vars()
Example #5
    def __init__(self,
                 res=None,
                 activation=None,
                 keras=False,
                 model=None,
                 reuse=False):
        if not keras:
            # activation type
            activations = activation.split('_')
            if len(activations) > 1:
                self.activation = activations[0]
                self.last_layer_activation = activations[1]
            else:
                self.activation = activation
                self.last_layer_activation = None
            # affine mapping of the output
            self.offset = res[-2]
            self.scale_factor = res[-1]

            # parse structure of neural networks
            self.num_of_inputs = int(res[0])
            self.num_of_outputs = int(res[1])
            self.num_of_hidden_layers = int(res[2])
            self.network_structure = np.zeros(self.num_of_hidden_layers + 1,
                                              dtype=int)

            self.activations = [self.activation
                                ] * (self.num_of_hidden_layers + 1)
            if self.last_layer_activation is not None:
                self.activations[-1] = self.last_layer_activation

            # pointer is current reading index
            self.pointer = 3

            # num of neurons of each layer
            for i in range(self.num_of_hidden_layers):
                self.network_structure[i] = int(res[self.pointer])
                self.pointer += 1

            # output layer
            self.network_structure[-1] = self.num_of_outputs

            # all values from the text file
            self.param = res

            # store the weights and bias in two lists
            # self.weights
            # self.bias
            gpu_devices = U.get_available_gpus()
            if gpu_devices:
                device = gpu_devices[0]
            else:
                cpu_devices = U.get_available_cpus()
                device = cpu_devices[0]
            with tf.device(device):
                self.parse_w_b()
                self.x = tf.placeholder(tf.float64,
                                        shape=[None, self.num_of_inputs],
                                        name='input')
                self.y = self.tensorflow_representation(self.x, reuse=reuse)
        else:
            params = []
            self.weights = []
            self.bias = []
            for layer in model.layers:
                params.append(layer.get_weights())  # list of numpy arrays
            for param in params:
                if len(param) == 0:
                    continue
                else:
                    self.weights.append(param[0])
                    self.bias.append(param[1])
            self.model = model
Example #6
    def apply_policy(
        ph_ob,
        reuse,
        scope,
        hidsize,
        memsize,
        extrahid,
        sy_nenvs,
        sy_nsteps,
        pdparamsize,
        additional_inputs=None,
    ):
        meta_rl = False
        data_format = "NHWC"
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info(
            f"CnnPolicy: using '{ph.name}' shape {ph.shape} as image input")
        X = tf.cast(ph, tf.float32) / 255.0
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device("/gpu:0" if yes_gpu else "/cpu:0"):
            X = activ(
                conv(
                    X,
                    "c1",
                    nf=32,
                    rf=8,
                    stride=4,
                    init_scale=np.sqrt(2),
                    data_format=data_format,
                ))
            X = activ(
                conv(
                    X,
                    "c2",
                    nf=64,
                    rf=4,
                    stride=2,
                    init_scale=np.sqrt(2),
                    data_format=data_format,
                ))
            X = activ(
                conv(
                    X,
                    "c3",
                    nf=64,
                    rf=4,
                    stride=1,
                    init_scale=np.sqrt(2),
                    data_format=data_format,
                ))
            X = to2d(X)
            mix_other_observations = [X]

            # Guard against the default additional_inputs=None
            if additional_inputs and ('prev_acs' in additional_inputs) and (
                    'prev_rew' in additional_inputs):
                # Cast numpy arrays to tf tensors
                prev_acs = tf.cast(additional_inputs['prev_acs'], tf.float32)
                prev_rew = tf.cast(additional_inputs['prev_rew'], tf.float32)

                # Flatten out time dimension
                prev_acs = tf.reshape(prev_acs,
                                      (-1, *prev_acs.shape.as_list()[2:]))
                prev_rew = tf.reshape(prev_rew,
                                      (-1, *prev_rew.shape.as_list()[2:]))

                # Add to 2D features going to FC layers
                mix_other_observations.extend([prev_acs, prev_rew])

            X = tf.concat(mix_other_observations, axis=1)
            X = activ(fc(X, "fc1", nh=hidsize, init_scale=np.sqrt(2)))
            additional_size = 448
            X = activ(
                fc(X,
                   "fc_additional",
                   nh=additional_size,
                   init_scale=np.sqrt(2)))
            snext = tf.zeros((sy_nenvs, memsize))
            mix_timeout = [X]

            Xtout = tf.concat(mix_timeout, axis=1)
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, "fc2val", nh=additional_size, init_scale=0.1))
                X = X + activ(
                    fc(X, "fc2act", nh=additional_size, init_scale=0.1))
            pdparam = fc(X, "pd", nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, "vf_int", nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, "vf_ext", nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
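Example #6 conditions the policy on the previous action and reward, a common meta-RL ingredient. The `additional_inputs` dict it expects could be assembled from placeholders along these lines (hypothetical names and shapes; `n_actions` is assumed):

    # One-hot previous actions and scalar previous rewards, per env and step.
    ph_prev_acs = tf.placeholder(tf.float32, (None, None, n_actions), name='prev_acs')
    ph_prev_rew = tf.placeholder(tf.float32, (None, None, 1), name='prev_rew')
    additional_inputs = {'prev_acs': ph_prev_acs, 'prev_rew': ph_prev_rew}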
Example #7
    def apply_policy(self, ph_ob, ph_new, ph_istate, reuse, scope, hidsize,
                     memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize,
                     rec_gate_init):
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnGruPolicy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        # (None, 84, 84, 4) in case of MontezumaRevengeNoFrameskip
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())

        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(
                conv(X,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            #X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
            #X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))

            # over 14k rewards with these 2 and only the first conv layer
            # with tf.variable_scope("augmented1"):
            #     X = self.augmented_conv2d(X, 256, dk=24, dv=24)

            # with tf.variable_scope("augmented2"):
            #     X = self.augmented_conv2d(X, 256, dk=24, dv=24)

            # 5.8k rewards 3 levels with these 2 and the first 2 conv layers
            # with tf.variable_scope("augmented1"):
            #     X = self.augmented_conv2d(X, 512, dk=256, dv=256)

            # with tf.variable_scope("augmented2"):
            #     X = self.augmented_conv2d(X, 512, dk=256, dv=256)

            with tf.variable_scope("augmented1"):
                X = self.augmented_conv2d(X, 256, dk=24, dv=24)

            with tf.variable_scope("augmented2"):
                X = self.augmented_conv2d(X, 256, dk=24, dv=24)

            X = to2d(X)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])

            X, snext = tf.nn.dynamic_rnn(GRUCell(memsize,
                                                 rec_gate_init=rec_gate_init),
                                         (X, ph_new[:, :, None]),
                                         dtype=tf.float32,
                                         time_major=False,
                                         initial_state=ph_istate)

            X = tf.reshape(X, (-1, memsize))
            Xtout = X
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1))
                X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1))

            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
Example #8
    def apply_policy(
        ph_ob,
        ph_new,
        ph_istate,
        reuse,
        scope,
        hidsize,
        memsize,
        extrahid,
        sy_nenvs,
        sy_nsteps,
        pdparamsize,
        rec_gate_init,
    ):
        ph = ph_ob
        logger.info(
            f"CnnGruPolicy: using '{ph.name}' shape {ph.shape} as input")
        assert len(ph.shape.as_list()) == 3  # B,T,S
        X = tf.cast(ph, tf.float32) / 255.0
        # Flatten batch/time, keeping only the feature dimension
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-1:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())

        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device("/gpu:0" if yes_gpu else "/cpu:0"):
            X = activ(fc(
                X,
                "fc1",
                nh=32,
                init_scale=np.sqrt(2),
            ))
            X = activ(fc(
                X,
                "fc2",
                nh=64,
                init_scale=np.sqrt(2),
            ))
            X = activ(fc(
                X,
                "fc3",
                nh=64,
                init_scale=np.sqrt(2),
            ))
            X = to2d(X)
            # Use a scope name distinct from 'fc1' above to avoid a variable clash
            X = activ(fc(X, "fc4", nh=hidsize, init_scale=np.sqrt(2)))
            X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
            X, snext = tf.nn.dynamic_rnn(
                GRUCell(memsize, rec_gate_init=rec_gate_init),
                (X, ph_new[:, :, None]),
                dtype=tf.float32,
                time_major=False,
                initial_state=ph_istate,
            )
            X = tf.reshape(X, (-1, memsize))
            Xtout = X
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, "fc2val", nh=memsize, init_scale=0.1))
                X = X + activ(fc(X, "fc2act", nh=memsize, init_scale=0.1))
            pdparam = fc(X, "pd", nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, "vf_int", nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, "vf_ext", nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
Example #9
    def apply_multi_head_policy(self, ph_ob, ph_new, ph_istate, reuse, scope,
                                hidsize, memsize, extrahid, sy_nenvs,
                                sy_nsteps, pdparamsize, rec_gate_init):

        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnGruPolicy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        yes_gpu = any(get_available_gpus())

        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):

            all_pdparam = []
            all_vint = []
            all_vext = []
            all_snext = []

            for i in range(self.num_agents):

                scope = 'agent_{}'.format(str(i))
                pdparam, vpred_int, vpred_ext, snext = self._build_policy_net(
                    X=X,
                    ph_new=ph_new,
                    ph_istate=ph_istate,
                    scope=scope,
                    reuse=False,
                    hidsize=hidsize,
                    memsize=memsize,
                    extrahid=extrahid,
                    sy_nenvs=sy_nenvs,
                    sy_nsteps=sy_nsteps,
                    pdparamsize=pdparamsize,
                    rec_gate_init=rec_gate_init)

                if i == 0:
                    #[batch,naction] - > [batch, 1, naction]
                    all_pdparam = tf.expand_dims(pdparam, axis=1)
                    #[batch,1] -> [batch,1,1]
                    all_vint = tf.expand_dims(vpred_int, axis=1)
                    all_vext = tf.expand_dims(vpred_ext, axis=1)
                    all_snext = tf.expand_dims(snext, axis=1)
                else:
                    all_pdparam = tf.concat(
                        [all_pdparam,
                         tf.expand_dims(pdparam, axis=1)], axis=1)
                    all_vint = tf.concat(
                        [all_vint, tf.expand_dims(vpred_int, axis=1)], axis=1)
                    all_vext = tf.concat(
                        [all_vext, tf.expand_dims(vpred_ext, axis=1)], axis=1)
                    all_snext = tf.concat(
                        [all_snext, tf.expand_dims(snext, axis=1)], axis=1)

            #[batch, nstep] -> [batch,nstep, ngroups]
            one_hot_gidx = tf.one_hot(self.ph_agent_idx,
                                      self.num_agents,
                                      axis=-1)
            #[batch,nstep, ngroups] -> [batch * nstep, ngroups,1]
            one_hot_gidx = tf.reshape(one_hot_gidx, (-1, self.num_agents, 1))

            pdparam = tf.reduce_sum(one_hot_gidx * all_pdparam, axis=1)
            vpred_int = tf.reduce_sum(one_hot_gidx * all_vint, axis=1)
            vpred_ext = tf.reduce_sum(one_hot_gidx * all_vext, axis=1)
            snext = tf.reduce_sum(one_hot_gidx * all_snext, axis=1)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
            snext = tf.reshape(snext, (sy_nenvs, memsize))

        return pdparam, vpred_int, vpred_ext, snext
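The one-hot multiply-and-reduce in Example #9 is a per-row selection of one agent's head. On TF 1.14+ the same selection can be written with batched `tf.gather` (an illustrative equivalent, not the original code); the one-hot form above has the advantage of working on any TF 1.x version:

    # all_pdparam: [batch*nstep, num_agents, pdparamsize]
    flat_idx = tf.reshape(self.ph_agent_idx, [-1])            # [batch*nstep]
    pdparam = tf.gather(all_pdparam, flat_idx, batch_dims=1)  # [batch*nstep, pdparamsize]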
Example #10
    def define_self_prediction_rew(self, convfeat, rep_size, enlargement,
                                   scope):
        # RND: random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Random features of expert-agent (demonstration) observations.
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(
                tf.get_variable_scope(),
                reuse=True), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X_im = np.load(os.getcwd() + '/policies/obs.npy')
            Xr_im = tf.cast(X_im, tf.float32) / 255.
            Xr_im = tf.reshape(Xr_im, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]
            Xr_im = tf.clip_by_value((Xr_im - tf.reduce_mean(Xr_im)) /
                                     (tf.math.reduce_std(Xr_im)**0.5), -5.0,
                                     5.0)
            Xr_im = tf.nn.leaky_relu(
                conv(Xr_im,
                     'c1r',
                     nf=convfeat * 1,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2)))
            Xr_im = tf.nn.leaky_relu(
                conv(Xr_im,
                     'c2r',
                     nf=convfeat * 2 * 1,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2)))
            Xr_im = tf.nn.leaky_relu(
                conv(Xr_im,
                     'c3r',
                     nf=convfeat * 2 * 1,
                     rf=3,
                     stride=1,
                     init_scale=np.sqrt(2)))
            Xr_im = [to2d(Xr_im)[::self.demonstration_stride]]
            Xr_im = fc(Xr_im[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))
            Xr_im = tf.stop_gradient(Xr_im)

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C (batch, time, height, width, channels)
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c1rp_pred',
                         nf=convfeat,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c2rp_pred',
                         nf=convfeat * 2,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c3rp_pred',
                         nf=convfeat * 2,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)
                X_r_hat = tf.nn.relu(
                    fc(rgbrp,
                       'fc1r_hat1_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(X_r_hat,
                       'fc1r_hat2_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(X_r_hat,
                             'fc1r_hat3_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))
        ####
        #self.im_rew =  tf.math.maximum(1 - tf.divide(tf.reduce_mean(tf.square(self.Xr_im[:(X_r).shape[0]] - X_r), axis=-1, keep_dims=True),tf.add(tf.reduce_mean(tf.square(self.Xr_im[:X_r.shape[0]]), axis=-1, keep_dims=True),tf.reduce_mean(tf.square(X_r), axis=-1, keep_dims=True))),tf.constant(0.5))
        im_rew = tf.reduce_mean(tf.tensordot(tf.stop_gradient(X_r),
                                             Xr_im,
                                             axes=[[1], [1]]),
                                axis=1)
        im_rew = tf.reshape(im_rew, (self.sy_nenvs, self.sy_nsteps - 1))
        #self.int_rew =tf.math.maximum(self.im_rew,self.int_rew)
        self.int_rew = self.int_rew * (1 + tf.math.tanh(im_rew / 100))
        ####

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
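The random mask at the end trains the predictor on only a `proportion_of_exp_used_for_predictor_update` fraction of transitions, RND's standard way of slowing the predictor down when many parallel environments generate experience. The trick in isolation, with random stand-ins for the target features and predictor output (hypothetical values):

    targets = tf.stop_gradient(tf.random_normal((128, 512)))  # stand-in target features
    preds = tf.random_normal((128, 512))                       # stand-in predictor output
    aux_loss = tf.reduce_mean(tf.square(targets - preds), -1)  # [128] per-transition loss
    keep = tf.cast(tf.random_uniform(tf.shape(aux_loss)) < 0.25, tf.float32)
    loss = tf.reduce_sum(keep * aux_loss) / tf.maximum(tf.reduce_sum(keep), 1.)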