Пример #1
0
    def initial_state(self, inputs, time_major, batch_size = None, dtype = tf.float32, trainable = False,
                      trainable_initializers = None, trainable_regularizers = None,
                      state_initializer = tf.zeros_initializer()):
        """Create the initial state tensor for this cell.

        The state has shape
        ``[batch_size] + self._spatial_size + [2 * self._num_channels]`` and is
        produced by calling ``state_initializer`` with that shape and ``dtype``.

        When ``batch_size`` is falsy (``None`` or ``0``) it is looked up with
        ``get_batch_size(inputs, time_major)`` instead.

        ``trainable``, ``trainable_initializers`` and ``trainable_regularizers``
        are accepted but not read by this implementation.
        """
        effective_batch = batch_size if batch_size else get_batch_size(inputs, time_major)
        state_shape = [effective_batch] + self._spatial_size + [2 * self._num_channels]
        return state_initializer(state_shape, dtype = dtype)
Пример #2
0
    def test(self, visual: utils.TensorOrSequence,
             selfbboxes: utils.TensorOrSequence,
             bboxes: utils.TensorOrSequence, max_len: int, eos_idx: int,
             **kwargs) -> utils.Tuple[torch.Tensor, torch.Tensor]:
        """Decode greedily for ``max_len`` steps, feeding each argmax back in.

        Every step calls ``self.step`` in ``'feedback'`` mode with the previous
        step's argmax (``None`` on the first step). A per-sequence mask starts
        at one and turns to zero from the first step whose argmax equals
        ``eos_idx``; each step's log-probabilities are multiplied by that mask
        before being stored. The selected indices are stored unmasked.

        Returns:
            ``(outputs, log_probs)``: the per-step argmax indices and the masked
            per-step log-probabilities, each concatenated along dim 1.
        """
        batch = utils.get_batch_size(visual)
        dev = utils.get_device(visual)
        picked_tokens, masked_scores = [], []

        # 1.0 while a sequence has not emitted eos_idx yet, 0.0 afterwards.
        alive = torch.ones((batch, ), device=dev)

        with self.statefulness(batch):
            prev_tokens = None
            for step_idx in range(max_len):
                step_scores = self.step(step_idx,
                                        prev_tokens,
                                        visual,
                                        selfbboxes,
                                        bboxes,
                                        None,
                                        mode='feedback',
                                        **kwargs)
                # torch.max returns (values, indices); only the indices are fed back.
                _, prev_tokens = torch.max(step_scores, -1)
                not_eos = (prev_tokens.squeeze(-1) != eos_idx).float()
                alive = alive * not_eos
                # Add two trailing axes so the mask broadcasts over the step scores.
                alive_3d = alive.unsqueeze(-1).unsqueeze(-1)
                masked_scores.append(step_scores * alive_3d)
                picked_tokens.append(prev_tokens)

        return torch.cat(picked_tokens, 1), torch.cat(masked_scores, 1)
Пример #3
0
    def sample_rl(self, visual: utils.TensorOrSequence,
                  selfbboxes: utils.TensorOrSequence,
                  bboxes: utils.TensorOrSequence, max_len: int,
                  **kwargs) -> utils.Tuple[torch.Tensor, torch.Tensor]:
        """Sample a sequence step by step and record each sample's log-prob.

        Every step calls ``self.step`` in ``'feedback'`` mode with the previously
        sampled token (``None`` on the first step), builds a categorical
        distribution from ``logits[:, 0]`` and draws one sample per batch row.

        Returns:
            ``(outputs, log_probs)``: the sampled indices and the log-probability
            of each sampled index under its own row's distribution, each
            concatenated along dim 1 (one column per step).
        """
        b_s = utils.get_batch_size(visual)
        outputs = []
        log_probs = []

        with self.statefulness(b_s):
            out = None
            for t in range(max_len):
                step_logits = self.step(t,
                                        out,
                                        visual,
                                        selfbboxes,
                                        bboxes,
                                        None,
                                        mode='feedback',
                                        **kwargs)
                # NOTE(review): assumes step output is (batch, 1, vocab) — confirm.
                distr = distributions.Categorical(logits=step_logits[:, 0])
                sampled = distr.sample()
                # Score the sample while it still has the distribution's batch
                # shape. Passing the unsqueezed (batch, 1) tensor to log_prob
                # would broadcast against the batch axis and yield a
                # (batch, batch) matrix instead of one value per row.
                log_probs.append(distr.log_prob(sampled).unsqueeze(1))
                out = sampled.unsqueeze(1)
                outputs.append(out)

        return torch.cat(outputs, 1), torch.cat(log_probs, 1)
Пример #4
0
    def mmd_penalty(self, sample_pz, sample_qz):
        """Return a multi-scale inverse-multiquadratic MMD statistic.

        Uses the kernel ``k(x, y) = C / (C + ||x - y||^2)`` summed over seven
        scales of ``C``. Within-sample kernel sums exclude the diagonal and are
        divided by ``n * n - n``; the cross term is divided by ``n * n`` and
        counted twice. ``n`` is taken from ``utils.get_batch_size(sample_qz)``.
        """
        prior_var = 1.
        n = tf.cast(utils.get_batch_size(sample_qz), tf.int32)
        nf = tf.cast(n, tf.float32)

        def _sq_norms(x):
            # Row-wise squared norm, kept as a column.
            return tf.reduce_sum(tf.square(x), axis=1, keepdims=True)

        def _sq_dists(x, x_norms, y, y_norms):
            # ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2 <x_i, y_j>
            return x_norms + tf.transpose(y_norms) - 2. * tf.matmul(
                x, y, transpose_b=True)

        pz_norms = _sq_norms(sample_pz)
        d_pz = _sq_dists(sample_pz, pz_norms, sample_pz, pz_norms)

        qz_norms = _sq_norms(sample_qz)
        d_qz = _sq_dists(sample_qz, qz_norms, sample_qz, qz_norms)

        d_cross = _sq_dists(sample_qz, qz_norms, sample_pz, pz_norms)

        base_c = 2. * self.opts['zdim'] * prior_var
        off_diagonal = 1. - tf.eye(n)  # zeroes the i == j kernel entries

        stat = 0.
        for scale in (.1, .2, .5, 1., 2., 5., 10.):
            C = base_c * scale
            within = C / (C + d_qz) + C / (C + d_pz)
            within = tf.multiply(within, off_diagonal)
            within = tf.reduce_sum(within) / (nf * nf - nf)
            cross = tf.reduce_sum(C / (C + d_cross)) * 2. / (nf * nf)
            stat += within - cross
        return stat
Пример #5
0
    def mmd_penalty(self, sample_qz, sample_pz):
        """Return a multi-scale inverse-multiquadratic MMD statistic.

        The kernel is ``k(x, y) = C / (C + ||x - y||^2)`` with
        ``C = 2 * opts['zdim'] * opts['pz_scale']**2 * scale`` for seven fixed
        scales. Within-sample sums skip the diagonal and are normalised by
        ``n * n - n``; the cross term is normalised by ``n * n`` and counted
        twice. ``n`` comes from ``utils.get_batch_size(sample_qz)``.
        """
        settings = self.opts
        prior_var = settings['pz_scale']**2
        n = tf.cast(utils.get_batch_size(sample_qz), tf.int32)
        nf = tf.cast(n, tf.float32)

        def _sq_norms(x):
            # Row-wise squared norm, kept as a column.
            return tf.reduce_sum(tf.square(x), axis=1, keep_dims=True)

        def _sq_dists(x, x_norms, y, y_norms):
            # ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2 <x_i, y_j>
            return x_norms + tf.transpose(y_norms) - 2. * tf.matmul(
                x, y, transpose_b=True)

        pz_norms = _sq_norms(sample_pz)
        d_pz = _sq_dists(sample_pz, pz_norms, sample_pz, pz_norms)

        qz_norms = _sq_norms(sample_qz)
        d_qz = _sq_dists(sample_qz, qz_norms, sample_qz, qz_norms)

        d_cross = _sq_dists(sample_qz, qz_norms, sample_pz, pz_norms)

        base_c = 2. * settings['zdim'] * prior_var
        off_diagonal = 1. - tf.eye(n)  # zeroes the i == j kernel entries

        stat = 0.
        for scale in (.1, .2, .5, 1., 2., 5., 10.):
            C = base_c * scale
            within = C / (C + d_qz) + C / (C + d_pz)
            within = tf.multiply(within, off_diagonal)
            within = tf.reduce_sum(within) / (nf * nf - nf)
            cross = tf.reduce_sum(C / (C + d_cross)) * 2. / (nf * nf)
            stat += within - cross
        return stat
Пример #6
0
    def mmd_penalty(self, sample_pz, sample_qz):
        """Return an MMD statistic between two batches of latent samples.

        The kernel is selected by ``opts['mmd_kernel']``:

        * ``'RBF'``: Gaussian kernel whose ``sigma^2`` is the sum of the
          ``half_size``-th largest entry of the cross distances and of the
          ``sample_qz`` self distances (median heuristic).
        * ``'IMQ'``: ``k(x, y) = C / (C + ||x - y||^2)`` summed over seven
          scales of a base ``C`` that depends on ``opts['pz']``.

        Within-sample kernel sums skip the diagonal and are normalised by
        ``n * n - n``; the cross term is normalised by ``n * n`` and counted
        twice. ``n`` comes from ``utils.get_batch_size(sample_qz)``.

        Raises:
            ValueError: if ``opts['mmd_kernel']`` is neither ``'RBF'`` nor
                ``'IMQ'``, or if ``opts['pz']`` is not one of ``'normal'``,
                ``'sphere'``, ``'uniform'`` when the IMQ kernel is used.
        """
        opts = self.opts
        sigma2_p = opts['pz_scale']**2
        kernel = opts['mmd_kernel']
        n = utils.get_batch_size(sample_qz)
        n = tf.cast(n, tf.int32)
        nf = tf.cast(n, tf.float32)
        # Floor division keeps this an int32 tensor. True division would turn
        # it into a float, which is not a valid `k` for tf.nn.top_k nor a valid
        # index. n * n - n is always even, so nothing is lost.
        half_size = (n * n - n) // 2

        norms_pz = tf.reduce_sum(tf.square(sample_pz), axis=1, keepdims=True)
        dotprods_pz = tf.matmul(sample_pz, sample_pz, transpose_b=True)
        distances_pz = norms_pz + tf.transpose(norms_pz) - 2. * dotprods_pz

        norms_qz = tf.reduce_sum(tf.square(sample_qz), axis=1, keepdims=True)
        dotprods_qz = tf.matmul(sample_qz, sample_qz, transpose_b=True)
        distances_qz = norms_qz + tf.transpose(norms_qz) - 2. * dotprods_qz

        dotprods = tf.matmul(sample_qz, sample_pz, transpose_b=True)
        distances = norms_qz + tf.transpose(norms_pz) - 2. * dotprods

        off_diagonal = 1. - tf.eye(n)  # zeroes the i == j kernel entries

        if kernel == 'RBF':
            # Median heuristic for the sigma^2 of Gaussian kernel
            sigma2_k = tf.nn.top_k(tf.reshape(distances, [-1]),
                                   half_size).values[half_size - 1]
            sigma2_k += tf.nn.top_k(tf.reshape(distances_qz, [-1]),
                                    half_size).values[half_size - 1]

            if opts['verbose']:
                sigma2_k = tf.Print(sigma2_k, [sigma2_k], 'Kernel width:')
            res1 = tf.exp(-distances_qz / 2. / sigma2_k)
            res1 += tf.exp(-distances_pz / 2. / sigma2_k)
            res1 = tf.multiply(res1, off_diagonal)
            res1 = tf.reduce_sum(res1) / (nf * nf - nf)
            res2 = tf.exp(-distances / 2. / sigma2_k)
            res2 = tf.reduce_sum(res2) * 2. / (nf * nf)
            stat = res1 - res2
        elif kernel == 'IMQ':
            # k(x, y) = C / (C + ||x - y||^2)
            if opts['pz'] == 'normal':
                Cbase = 2. * opts['zdim'] * sigma2_p
            elif opts['pz'] == 'sphere':
                Cbase = 2.
            elif opts['pz'] == 'uniform':
                # E ||x - y||^2 = E[sum (xi - yi)^2]
                #               = zdim E[(xi - yi)^2]
                #               = const * zdim
                Cbase = opts['zdim']
            else:
                raise ValueError(
                    "Unsupported opts['pz'] for the IMQ kernel: %r" %
                    (opts['pz'], ))
            stat = 0.
            for scale in [.1, .2, .5, 1., 2., 5., 10.]:
                C = Cbase * scale
                res1 = C / (C + distances_qz)
                res1 += C / (C + distances_pz)
                res1 = tf.multiply(res1, off_diagonal)
                res1 = tf.reduce_sum(res1) / (nf * nf - nf)
                res2 = C / (C + distances)
                res2 = tf.reduce_sum(res2) * 2. / (nf * nf)
                stat += res1 - res2
        else:
            raise ValueError("Unsupported opts['mmd_kernel']: %r" %
                             (kernel, ))
        return stat
Пример #7
0
    def mmd_penalty(self, sample_qz, sample_pz):
        """Return an MMD statistic between two batches of latent samples.

        Pairwise squared distances come from ``square_dist_broadcast``. The
        kernel is chosen by ``opts['mmd_kernel']``:

        * ``'RBF'``: Gaussian kernel; ``sigma^2`` is the sum of the
          ``half_size``-th largest cross distance and ``sample_qz`` self
          distance (median heuristic).
        * ``'IMQ'``: ``C / (C + d)`` summed over seven scales of
          ``C = 2 * opts['zdim'] * opts['pz_scale']**2``.
        * ``'RQ'``: ``(1 + d / scale / 2) ** (-scale)`` summed over the same
          seven scales.

        Within-sample kernel sums skip the diagonal and are normalised by
        ``n * n - n``; the cross term is normalised by ``n * n`` and counted
        twice.
        """
        opts = self.opts
        sigma2_p = opts['pz_scale']**2
        kernel = opts['mmd_kernel']
        n = tf.cast(utils.get_batch_size(sample_qz), tf.int32)
        nf = tf.cast(n, tf.float32)
        half_size = tf.cast((n * n - n) / 2, tf.int32)

        d_pz = square_dist_broadcast(sample_pz, sample_pz)
        d_qz = square_dist_broadcast(sample_qz, sample_qz)
        d_cross = square_dist_broadcast(sample_qz, sample_pz)

        scales = (.1, .2, .5, 1., 2., 5., 10.)

        def _u_statistic(k_qz, k_pz, k_cross):
            # Off-diagonal mean of the within-sample kernels minus twice the
            # mean of the cross kernel.
            within = tf.multiply(k_qz + k_pz, 1. - tf.eye(n))
            within = tf.reduce_sum(within) / (nf * nf - nf)
            cross = tf.reduce_sum(k_cross) * 2. / (nf * nf)
            return within - cross

        if kernel == 'RBF':
            # Median heuristic for the sigma^2 of Gaussian kernel
            sigma2_k = tf.nn.top_k(tf.reshape(d_cross, [-1]),
                                   half_size).values[half_size - 1]
            sigma2_k += tf.nn.top_k(tf.reshape(d_qz, [-1]),
                                    half_size).values[half_size - 1]
            if opts['verbose']:
                sigma2_k = tf.Print(sigma2_k, [sigma2_k], 'Kernel width:')
            stat = _u_statistic(tf.exp(-d_qz / 2. / sigma2_k),
                                tf.exp(-d_pz / 2. / sigma2_k),
                                tf.exp(-d_cross / 2. / sigma2_k))
        elif kernel == 'IMQ':
            base_c = 2 * opts['zdim'] * sigma2_p
            stat = 0.
            for scale in scales:
                C = base_c * scale
                stat += _u_statistic(C / (C + d_qz),
                                     C / (C + d_pz),
                                     C / (C + d_cross))
        elif kernel == 'RQ':
            stat = 0.
            for scale in scales:
                stat += _u_statistic((1. + d_qz / scale / 2.)**(-scale),
                                     (1. + d_pz / scale / 2.)**(-scale),
                                     (1. + d_cross / scale / 2.)**(-scale))
        return stat
Пример #8
0
    def apply(self,
              visual: utils.TensorOrSequence,
              selfbboxes: utils.TensorOrSequence,
              bboxes: utils.TensorOrSequence,
              out_size=1,
              return_probs=False,
              **kwargs):
        """Run ``self.max_len`` search steps and return the top sequences.

        Resets the per-call search state on ``self`` (batch size, device,
        sequence mask, running sequence log-probabilities, collected
        log-probs), then calls ``self.iter`` once per time step inside the
        model's ``statefulness`` context. Afterwards the beams are reordered by
        ``self.seq_logprob`` in descending order and the first ``out_size``
        entries along dim 1 are kept.

        Args:
            visual, selfbboxes, bboxes: inputs passed through ``self.iter``,
                which may return replaced versions of them each step.
            out_size: how many beams to keep; when 1, dim 1 is squeezed away.
            return_probs: also collect and return ``self.all_log_probs``.
            **kwargs: forwarded to ``self.iter``.

        Returns:
            ``(outputs, log_probs)``, or ``(outputs, log_probs, all_log_probs)``
            when ``return_probs`` is true. ``all_log_probs`` is reordered but
            not truncated to ``out_size``.
        """
        self.b_s = utils.get_batch_size(visual)
        self.device = utils.get_device(visual)
        self.seq_mask = torch.ones((self.b_s, self.beam_size, 1),
                                   device=self.device)
        self.seq_logprob = torch.zeros((self.b_s, 1, 1), device=self.device)
        self.log_probs = []
        self.selected_words = None
        if return_probs:
            self.all_log_probs = []

        outputs = []
        state = None  # no state before the first step
        with self.model.statefulness(self.b_s):
            for t in range(self.max_len):
                visual, selfbboxes, bboxes, state, outputs = self.iter(
                    t, visual, selfbboxes, bboxes, state, outputs,
                    return_probs, **kwargs)

        # Only the permutation is needed; the sorted values are discarded.
        _, sort_idxs = torch.sort(self.seq_logprob, 1, descending=True)

        outputs = torch.cat(outputs, -1)
        beam_order = sort_idxs.expand(self.b_s, self.beam_size, self.max_len)
        outputs = torch.gather(outputs, 1, beam_order)

        log_probs = torch.cat(self.log_probs, -1)
        log_probs = torch.gather(log_probs, 1, beam_order)

        if return_probs:
            all_log_probs = torch.cat(self.all_log_probs, 2)
            # One extra trailing axis so the index covers the last dimension.
            full_order = sort_idxs.unsqueeze(-1).expand(
                self.b_s, self.beam_size, self.max_len,
                all_log_probs.shape[-1])
            all_log_probs = torch.gather(all_log_probs, 1, full_order)

        outputs = outputs.contiguous()[:, :out_size]
        log_probs = log_probs.contiguous()[:, :out_size]
        if out_size == 1:
            outputs = outputs.squeeze(1)
            log_probs = log_probs.squeeze(1)

        if return_probs:
            return outputs, log_probs, all_log_probs
        return outputs, log_probs
Пример #9
0
def sinkhorn_it(opts, C):
    """Run log-domain Sinkhorn updates on cost matrix ``C``.

    Uses the kernel ``log K = -C / opts['epsilon']`` and alternates updates of
    ``log_u`` (logsumexp over axis 0) and ``log_v`` (logsumexp over axis 1).
    After every ``log_u`` update the value
    ``sum(exp(log_u + log K + log_v) * C)`` is recorded.

    Returns:
        The list of recorded values: one per loop iteration
        (``opts['L'] - 1`` of them) plus a final one.
    """
    # Result is not used below; the lookup is kept so the call still happens.
    M = utils.get_batch_size(C)
    log_kernel = -C / opts['epsilon']

    def _update_log_u(log_v):
        return -logsumexp(log_kernel + log_v, axis=0, keepdims=True)

    def _cost(log_u, log_v):
        return tf.reduce_sum(tf.exp(log_u + log_kernel + log_v) * C)

    log_v = -logsumexp(log_kernel, axis=1, keepdims=True)
    costs = []
    for _ in range(opts['L'] - 1):
        log_u = _update_log_u(log_v)
        costs.append(_cost(log_u, log_v))
        log_v = -logsumexp(log_kernel + log_u, axis=1, keepdims=True)
    # Final half-step: update log_u once more and record the cost.
    log_u = _update_log_u(log_v)
    costs.append(_cost(log_u, log_v))
    return costs
Пример #10
0
def sinkhorn_it_v2(opts, C):
    """Run Sinkhorn updates on potentials ``u`` and ``v`` for cost ``C``.

    ``u`` and ``v`` are initialised from ``opts['epsilon']``, the log of the
    batch size ``M`` and logsumexp reductions of ``-C`` (axis 1 for ``u``,
    axis 0 for ``v``). Each of the ``opts['L'] - 1`` iterations then subtracts
    a correction from ``u`` and from ``v``. The value
    ``sum(exp((-C + u + v) / epsilon) * C)`` is recorded after initialisation
    and after every iteration.

    Returns:
        The list of recorded values.
    """
    M = utils.get_batch_size(C)
    eps = opts['epsilon']
    log_m = tf.compat.v1.log(M)

    def _cost(u, v):
        return tf.reduce_sum(tf.exp((-C + u + v) / eps) * C)

    # Initialization
    u = eps * (log_m - logsumexp(-C / eps, axis=1, keepdims=True))
    v = eps * (log_m - logsumexp((-C + u) / eps, axis=0, keepdims=True))
    costs = [_cost(u, v)]

    # Sinkhorn iterations; v is updated with the already-updated u.
    for _ in range(opts['L'] - 1):
        u -= eps * (log_m + logsumexp(
            (-C + u + v) / eps, axis=1, keepdims=True))
        v -= eps * (log_m + logsumexp(
            (-C + u + v) / eps, axis=0, keepdims=True))
        costs.append(_cost(u, v))
    return costs
Пример #11
0
    def total_correlation(self, z, z_mean, z_logvar):
        """Estimate the batch means used for a total-correlation decomposition.

        Returns three scalars, each a mean over the batch:
        ``log q(z)``, ``log prod_l q(z_l)`` and ``log prod_l p(z_l)`` with
        ``p ~ N(0, 1)``. The ``q`` terms are normalised by
        ``log(N * M)`` where ``M`` is the batch size of ``z`` and ``N`` is
        ``opts['dataset_size']``.
        """
        M = utils.get_batch_size(z)
        N = self.opts['dataset_size']
        log_norm = tf.math.log(N * M)

        # log q(z(x_j) | x_i) for every pair in the batch: a tensor of size
        # [batch_size, batch_size, num_latents], indexed below as [j, i, l].
        pairwise_log_q = utils.gaussian_log_density(
            tf.expand_dims(z, 1),
            tf.expand_dims(z_mean, 0),
            tf.expand_dims(z_logvar, 0))

        # log prod_l q(z(x_j)_l): logsumexp over i for each latent, normalise,
        # then sum over l. Vector of size [batch_size].
        per_latent = tf.reduce_logsumexp(
            pairwise_log_q, axis=1, keepdims=False) - log_norm
        log_qz_product = tf.reduce_sum(per_latent, axis=1, keepdims=False)

        # log q(z(x_j)): sum over l first (product over latents), then
        # logsumexp over i and normalise. Vector of size [batch_size].
        joint_log_q = tf.reduce_sum(pairwise_log_q, axis=2, keepdims=False)
        log_qz = tf.reduce_logsumexp(
            joint_log_q, axis=1, keepdims=False) - log_norm

        # log prod_l p(z_l) under a standard normal, one value per sample.
        pi = tf.constant(math.pi)
        per_latent_prior = -0.5 * (tf.math.log(2 * pi) + tf.square(z))
        log_pz_product = tf.reduce_sum(per_latent_prior,
                                       axis=1,
                                       keepdims=False)

        mean_log_qz = tf.reduce_mean(log_qz)
        mean_log_qz_product = tf.reduce_mean(log_qz_product)
        mean_log_pz_product = tf.reduce_mean(log_pz_product)
        return mean_log_qz, mean_log_qz_product, mean_log_pz_product
Пример #12
0
    def mmd_penalty(self, sample_qz, sample_pz):
        """Return an MMD statistic between two batches of latent samples.

        The kernel is selected by ``opts['mmd_kernel']``:

        * ``'RBF'``: Gaussian kernel whose ``sigma^2`` is the sum of the
          ``half_size``-th largest entry of the cross distances and of the
          ``sample_qz`` self distances (median heuristic).
        * ``'IMQ'``: ``k(x, y) = C / (C + ||x - y||^2)`` summed over seven
          scales of ``C = 2 * opts['zdim'] * opts['pz_scale']**2``.

        Within-sample kernel sums skip the diagonal and are normalised by
        ``n * n - n``; the cross term is normalised by ``n * n`` and counted
        twice. ``n`` comes from ``utils.get_batch_size(sample_qz)``.

        Raises:
            ValueError: if ``opts['mmd_kernel']`` is neither ``'RBF'`` nor
                ``'IMQ'``.
        """
        opts = self.opts
        sigma2_p = opts['pz_scale']**2
        kernel = opts['mmd_kernel']
        n = utils.get_batch_size(sample_qz)
        n = tf.cast(n, tf.int32)
        nf = tf.cast(n, tf.float32)
        # Floor division keeps this an int32 tensor. True division would turn
        # it into a float, which is not a valid `k` for tf.nn.top_k nor a valid
        # index. n * n - n is always even, so nothing is lost.
        half_size = (n * n - n) // 2

        norms_pz = tf.reduce_sum(tf.square(sample_pz), axis=1, keep_dims=True)
        dotprods_pz = tf.matmul(sample_pz, sample_pz, transpose_b=True)
        distances_pz = norms_pz + tf.transpose(norms_pz) - 2. * dotprods_pz

        norms_qz = tf.reduce_sum(tf.square(sample_qz), axis=1, keep_dims=True)
        dotprods_qz = tf.matmul(sample_qz, sample_qz, transpose_b=True)
        distances_qz = norms_qz + tf.transpose(norms_qz) - 2. * dotprods_qz

        dotprods = tf.matmul(sample_qz, sample_pz, transpose_b=True)
        distances = norms_qz + tf.transpose(norms_pz) - 2. * dotprods

        off_diagonal = 1. - tf.eye(n)  # zeroes the i == j kernel entries

        if kernel == 'RBF':
            # Median heuristic for the sigma^2 of Gaussian kernel
            sigma2_k = tf.nn.top_k(tf.reshape(distances, [-1]),
                                   half_size).values[half_size - 1]
            sigma2_k += tf.nn.top_k(tf.reshape(distances_qz, [-1]),
                                    half_size).values[half_size - 1]
            if opts['verbose']:
                sigma2_k = tf.Print(sigma2_k, [sigma2_k], 'Kernel width:')
            res1 = tf.exp(-distances_qz / 2. / sigma2_k)
            res1 += tf.exp(-distances_pz / 2. / sigma2_k)
            res1 = tf.multiply(res1, off_diagonal)
            res1 = tf.reduce_sum(res1) / (nf * nf - nf)
            res2 = tf.exp(-distances / 2. / sigma2_k)
            res2 = tf.reduce_sum(res2) * 2. / (nf * nf)
            stat = res1 - res2
        elif kernel == 'IMQ':
            # k(x, y) = C / (C + ||x - y||^2)
            Cbase = 2 * opts['zdim'] * sigma2_p
            stat = 0.
            for scale in [.1, .2, .5, 1., 2., 5., 10.]:
                C = Cbase * scale
                res1 = C / (C + distances_qz)
                res1 += C / (C + distances_pz)
                res1 = tf.multiply(res1, off_diagonal)
                res1 = tf.reduce_sum(res1) / (nf * nf - nf)
                res2 = C / (C + distances)
                res2 = tf.reduce_sum(res2) * 2. / (nf * nf)
                stat += res1 - res2
        else:
            raise ValueError("Unsupported opts['mmd_kernel']: %r" %
                             (kernel, ))
        return stat
Пример #13
0
def model_fn(features, labels, mode, params):
    # Get global step
    global_step = tf.train.get_global_step()

    # Construct mtf graph + mesh from params
    graph = mtf.Graph()
    mesh_shape = mtf.convert_to_shape(params["mesh_shape"])
    layout_rules = mtf.convert_to_layout_rules(params["layout"])

    # Mesh setup
    if params["use_tpu"]:
        var_placer, mesh_impl = simd_mesh_setup(params, mesh_shape,
                                                layout_rules)
    else:
        var_placer = None
        gpu_ids = params["gpu_ids"]
        mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
            mesh_shape, layout_rules, gpu_ids)

    # Trainable variable precision
    # Store to checkpoints in master type, train in slice type, compute in activation type
    if params["precision"] == "bfloat16":
        variable_dtype = mtf.VariableDType(master_dtype=tf.bfloat16,
                                           slice_dtype=tf.float32,
                                           activation_dtype=tf.bfloat16)
    else:
        variable_dtype = mtf.VariableDType(master_dtype=tf.float32,
                                           slice_dtype=tf.float32,
                                           activation_dtype=tf.float32)

    # Build mtf mesh object
    mesh = mtf.Mesh(graph, "my_mesh", var_placer)

    # Build mtf_features & seq length dict for getting number of microbatches
    # We need to pack inputs into a dict to pass into serialize_training_step
    features_dict = {"inputs": features, "labels": labels}
    sequence_length_dict = {
        "inputs": params["n_ctx"],
        "labels": params["n_ctx"]
    }

    params = add_mode_to_params(params, mode)
    batch_size = get_batch_size(params)

    batch_dim = mtf.Dimension("batch", batch_size)
    batch_dims = [batch_dim]
    feature_length = sequence_length_dict["inputs"]
    length_dim = mtf.Dimension("sequence", feature_length)

    mtf_features = {}
    for key, x in features_dict.items():
        if x is not None:
            feature_shape = mtf.Shape(batch_dims + [length_dim])
            if type(features_dict[key]) == dict:
                features_dict[key] = features_dict[key]["feature"]
            x = tf.cast(features_dict[key], tf.int32)
            x = tf.reshape(x, feature_shape.to_integer_list)
            mtf_features[key] = mtf.import_fully_replicated(mesh,
                                                            x,
                                                            feature_shape,
                                                            name=key)

    # Instantiate dict for dimensions, bias, etc that can be calculated here once then passed into model
    other_features = {}
    memory_length_dim = mtf.Dimension("memory_length", length_dim.size)

    attn_bias = biasmask_attn_weights(
        mesh, length_dim, memory_length_dim,
        variable_dtype) if params["causal"] else None

    # Add attn_bias into mtf_features
    other_features["attn_bias"] = attn_bias

    # Define other Dimensions that we'll need inside the model
    embd_dim = mtf.Dimension("embd", params["n_embd"])
    vocab_dim = mtf.Dimension("vocab", params["n_vocab"])
    # We need this because gathering when both the args have the same dimension in them breaks things
    # This dim is specifically for the weights
    # This prevents the "Einsum has lhs dimension without corresponding rhs or output dimension." error
    embed_sequence_dim = mtf.Dimension("embed_sequence", params["n_ctx"])

    other_features["embd_dim"] = embd_dim
    other_features["vocab_dim"] = vocab_dim
    other_features["embed_sequence_dim"] = embed_sequence_dim
    other_features["memory_length_dim"] = memory_length_dim

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Set up the model for prediction
        inputs = mtf_features["inputs"]
        if params["remove_partial_sequences"] is None:
            params["remove_partial_sequences"] = False

        export = params.get("export", False)

        if not export:
            mtf_samples = sample_autoregressive(
                inputs,
                other_features=other_features,
                params=params,
                variable_dtype=variable_dtype,
                remove_partial_sequences=params["remove_partial_sequences"],
                stop_at_token=params["eos_id"],
                sampling_use_entmax=params['sampling_use_entmax'])

        else:
            with mtf.utils.outside_all_rewrites():
                with tf.variable_scope('gpt2'):
                    mtf_samples, loss, loss_batch = gpt2.model(
                        mtf_features,
                        other_features,
                        params,
                        mesh,
                        variable_dtype=variable_dtype,
                        context=None)

        mtf_samples = mtf.anonymize(mtf_samples)
        inputs = mtf.anonymize(inputs)
        lowering = mtf.Lowering(graph, {mesh: mesh_impl}, autostack=True)
        inputs = lowering.export_to_tf_tensor(inputs)
        outputs = lowering.export_to_tf_tensor(mtf_samples)
        predictions = {"inputs": inputs, "outputs": outputs}

        def scaffold_fn():
            return tf.train.Scaffold(
                local_init_op=tf.group(
                    tf.train.Scaffold.default_local_init_op(),
                    lowering.copy_masters_to_slices(),
                    name="mtf_local_init_op"),
                ready_op=tf.concat([
                    tf.report_uninitialized_variables(),
                    resources.report_uninitialized_resources()
                ],
                                   axis=0,
                                   name="mtf_ready_op"))

        return tpu_estimator.TPUEstimatorSpec(
            mode=tf.estimator.ModeKeys.PREDICT,
            predictions=predictions,
            scaffold_fn=scaffold_fn,
            prediction_hooks=[mtf.MtfRestoreHook(lowering)])

    # We're not predicting, so we better be training or evaluating
    assert (mode == tf.estimator.ModeKeys.TRAIN
            or mode == tf.estimator.ModeKeys.EVAL)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Gets number of microbatches per batch for serialized training
        # if param tokens_per_mb_per_replica = None, this defaults to 1 and no microbatching is performed
        num_microbatches = int(
            mtf_transformer.utils.serialize_num_microbatches(
                batch_dim=batch_dim,
                sequence_length=sequence_length_dict,
                mesh_shape=mesh_shape,
                layout_rules=layout_rules,
                tokens_per_microbatch_per_replica=params[
                    "tokens_per_mb_per_replica"]))
    else:
        num_microbatches = 1

    params[
        "num_microbatches"] = num_microbatches  # Add num microbatches to params

    if num_microbatches > 1:

        # For serialize_training_step we need to modify the model to output results in a dict
        def serialized_fn(mtf_features):
            if params["model"] == "GPT":
                with tf.variable_scope('gpt2'):
                    logits, loss, loss_batch = gpt2.model(
                        mtf_features,
                        other_features,
                        params,
                        mesh,
                        variable_dtype=variable_dtype)
                return {
                    "logits": logits,
                    "loss": loss,
                    "loss_batch": loss_batch
                }
            else:
                raise Exception(
                    f"'{params['model']}' is not a valid model - please select from [GPT]"
                )

        # Serialize the training step - Gradients are accumulated locally and reduced once.
        var_grads, output_dict = mtf.serialize_training_step(
            mtf_features, serialized_fn, batch_dim, num_microbatches)
        loss = output_dict["loss"]
        loss_batch = output_dict["loss_batch"]
        logits = output_dict["logits"]
    else:
        # If we're not splitting into microbatches, return logits & loss as is
        if params["model"] == "GPT":
            with mtf.utils.outside_all_rewrites():
                with tf.variable_scope('gpt2'):
                    logits, loss, loss_batch = gpt2.model(
                        mtf_features,
                        other_features,
                        params,
                        mesh,
                        variable_dtype=variable_dtype,
                        context=None)
        else:
            raise Exception(
                f"'{params['model']}' is not a valid model - please select from [GPT]"
            )

    # Auto layout generation
    if params["auto_layout"]:
        auto_layout(graph, mesh_shape, logits, loss)
    if params["auto_layout_and_mesh_shape"]:
        auto_layout_and_mesh_shape(graph, params["num_cores"], logits, loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # In TRAIN mode, get optimizer
        if params["num_microbatches"] > 1:
            # If we are splitting the batch into microbatches, var grads are created in the serialize_training_step fn
            # So we pass them in here
            _, update_ops, var_grads = get_optimizer(
                mesh,
                loss,
                params,
                variable_dtype=variable_dtype,
                inp_var_grads=var_grads)
        else:
            # Otherwise, they are created in the get_optimizer fn, so we leave inp_var_grads blank
            _, update_ops, var_grads = get_optimizer(
                mesh, loss, params, variable_dtype=variable_dtype)
        # Log summaries to tensorboard
        mtf.scalar_summary("loss", loss)
        # Log gradients if in params
        if params["log_grads"] not in [None, False]:
            for g in var_grads:
                grad_norm = mtf.sqrt(mtf.reduce_sum(mtf.square(g)))
                mtf.scalar_summary("grads/norm" + g.name[:-2], grad_norm)
    else:
        # For now, we can only export fully-replicated tensors.
        # This has to be done before lowering or they will not be included in the graph
        mean_logits = mtf.reduce_mean(logits, reduced_dim=vocab_dim)
        max_logits = mtf.argmax(logits, vocab_dim)
        del logits
        fully_replicated_mean_logits = mtf.anonymize(mean_logits)
        fully_replicated_max_logits = mtf.anonymize(max_logits)
        fully_replicated_loss_batch = mtf.anonymize(loss_batch)

    # Gets & prints info about no. trainable vars in the model & dimension names
    get_graph_info(graph)

    # 'lowers' mtf tensors into a tf graph - this enables us to export results as tf tensors
    lowering = mtf.Lowering(graph, {mesh: mesh_impl}, autostack=True)
    tf_loss = lowering.export_to_tf_tensor(loss)
    tf_loss = tf.cast(tf_loss, tf.float32)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Use our patched version until mtf updates theirs
        host_call = create_host_call(params['model_path'])
        mtf.utils.remove_summaries()

        # Creates train_op
        tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
        tf_update_ops.append(tf.assign_add(
            global_step, 1))  # Need to manually increment global_step
        tf.logging.info(f"tf_update_ops: {tf_update_ops}")
        train_op = tf.group(tf_update_ops)
    else:
        tf_mean_logits = lowering.export_to_tf_tensor(
            fully_replicated_mean_logits)
        tf_max_logits = lowering.export_to_tf_tensor(
            fully_replicated_max_logits)
        tf_loss_batch = tf.to_float(
            lowering.export_to_tf_tensor(fully_replicated_loss_batch))

    with mtf.utils.outside_all_rewrites():
        # Copy master variables to slices. Must be called first.
        restore_hook = mtf.MtfRestoreHook(lowering)
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Set up the checkpoint server and return the TPUEstimatorSpec
            saver = tf.train.Saver(tf.global_variables(),
                                   sharded=True,
                                   max_to_keep=10,
                                   keep_checkpoint_every_n_hours=2,
                                   defer_build=False,
                                   save_relative_paths=True)
            tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
            saver_listener = mtf.MtfCheckpointSaverListener(lowering)
            saver_hook = tf.train.CheckpointSaverHook(
                params["model_path"],
                save_steps=params["steps_per_checkpoint"],
                saver=saver,
                listeners=[saver_listener])

            return tpu_estimator.TPUEstimatorSpec(
                tf.estimator.ModeKeys.TRAIN,
                loss=tf_loss,
                host_call=host_call,
                train_op=train_op,
                training_hooks=[restore_hook, saver_hook])

        elif mode == tf.estimator.ModeKeys.EVAL:
            # Evaluation metrics
            def _perplexity(loss):
                perplexity = tf.exp(loss)
                return tf.metrics.mean(perplexity)

            def _bits_per_byte(loss):
                bpb = loss * (0.29335 / math.log(2))
                return tf.metrics.mean(bpb)

            def _metric_fn(tf_mean_logits, tf_loss_batch):
                mean_logits = tf.metrics.mean(tf_mean_logits)
                loss = tf.reduce_mean(tf_loss_batch)
                perp = _perplexity(loss)
                bpb = _bits_per_byte(loss)
                return {
                    "mean_logits": mean_logits,
                    "perplexity": perp,
                    "bits per byte": bpb
                }

            def _lambada_metric_fn(labels, tf_max_logits, tf_loss_batch):
                eos_token = params["eos_id"]
                answer_positions = tf.where(
                    tf.math.not_equal(labels, eos_token))

                correct_answers = tf.gather_nd(
                    tf.math.equal(tf_max_logits, labels), answer_positions)
                accuracy = tf.metrics.mean(tf.cast(correct_answers,
                                                   tf.float32))

                # I guess tf_loss_batch has z_loss and maybe other stuff added to it
                # so maybe this should be calculated separately in the future
                answer_loss = tf.gather_nd(tf_loss_batch, answer_positions)
                log_perplexity = tf.metrics.mean(answer_loss)

                return {
                    "lambada_acc": accuracy,
                    "lambada_log_ppl": log_perplexity
                }

            eval_task = params["eval_task"]
            if eval_task == "lambada":
                eval_metrics = (_lambada_metric_fn,
                                [labels, tf_max_logits, tf_loss_batch])
            else:
                eval_metrics = (_metric_fn, [tf_mean_logits, tf_loss_batch])

            return tpu_estimator.TPUEstimatorSpec(
                tf.estimator.ModeKeys.EVAL,
                evaluation_hooks=[restore_hook],
                loss=tf_loss,
                eval_metrics=eval_metrics)
Пример #14
0
def mmd(opts, pi0, pi, sample_pz, sample_qz):
    """
    Compute MMD between prior and aggregated posterior.

    Only the 'IMQ' kernel, k(x, y) = C / (C + ||x - y||^2) summed over a
    fixed list of scales, is implemented.

    opts: dict; reads 'pz_scale', 'mmd_kernel' and 'zdim'
    pi0: prior weights [K]
    pi: variational weights [batch,K]
    sample_pz: latent samples paired with the prior weights
    sample_qz: latent samples paired with the variational weights
        (NOTE(review): sample shapes are whatever square_dist expects;
        that helper is not visible here -- confirm against it)

    Returns a scalar tensor: the statistic accumulated over all IMQ scales.

    Raises:
        NotImplementedError: if opts['mmd_kernel'] is 'RBF'.
        ValueError: if opts['mmd_kernel'] is any other name than 'IMQ'.
    """
    sigma2_p = opts['pz_scale'] ** 2
    kernel = opts['mmd_kernel']

    # Validate the kernel before any graph op is created. The RBF kernel was
    # previously guarded by `assert False`, which is removed under `python -O`
    # and then fell through to code that could not run.
    if kernel == 'RBF':
        raise NotImplementedError('RBF kernel is not implemented')
    if kernel != 'IMQ':
        raise ValueError('%s Unknown kernel' % kernel)

    n = utils.get_batch_size(sample_pz)
    n = tf.cast(n, tf.int32)
    nf = tf.cast(n, tf.float32)
    norms_pz = tf.reduce_sum(tf.square(sample_pz), axis=-1, keepdims=True)
    norms_qz = tf.reduce_sum(tf.square(sample_qz), axis=-1, keepdims=True)
    distances_pz = square_dist(sample_pz, norms_pz, sample_pz, norms_pz)
    distances_qz = square_dist(sample_qz, norms_qz, sample_qz, norms_qz)
    distances = square_dist(sample_qz, norms_qz, sample_pz, norms_pz)

    # k(x, y) = C / (C + ||x - y||^2)
    Cbase = 2 * opts['zdim'] * sigma2_p
    res = 0.
    for scale in [.1, .2, .5, 1., 2., 5., 10.]:
        C = Cbase * scale
        # First 2 terms of the MMD
        res1_qz = C / (C + distances_qz)
        res1_qz = tf.multiply(tf.expand_dims(pi, axis=-1),
                              tf.multiply(res1_qz, tf.transpose(pi)))
        res1_pz = (C / (C + distances_pz))
        res1_pz = tf.multiply(res1_pz,
                              tf.expand_dims(tf.square(pi0), axis=-1))
        res1 = res1_qz + res1_pz
        # Correcting for diagonal terms
        res1_diag = tf.trace(tf.transpose(res1, perm=[1, 0, 2]))
        res1 = (tf.reduce_sum(res1, axis=[0, -1]) - res1_diag) / (nf * nf - nf)
        # Cross term of the MMD
        res2 = C / (C + distances)
        res2 = tf.multiply(tf.expand_dims(pi, axis=-1),
                           tf.multiply(res2, tf.expand_dims(pi0, axis=-1)))
        res2 = tf.reduce_sum(res2, axis=[0, -1]) / (nf * nf)
        res += tf.reduce_sum(tf.div(res1 - 2. * res2, tf.square(pi0)))
    return res
Пример #15
0
def _predict_with_tta(model, plain_generator, augmented_generator, n_samples):
    """Blend one plain prediction pass with averaged augmented passes.

    Calls `model.predict_generator` once on `plain_generator` and
    TEST_TIME_AUGMENT times on `augmented_generator`, then returns the
    equal-weight mix of the plain prediction and the mean augmented one.

    n_samples: size of the first axis of the (n_samples, N_LABELS) float32
        accumulator that sums the augmented passes.
    """
    plain_pred = model.predict_generator(plain_generator,
                                         use_multiprocessing=True,
                                         workers=8)
    augmented_sum = np.zeros((n_samples, N_LABELS), dtype=np.float32)
    for _ in range(TEST_TIME_AUGMENT):
        augmented_sum += model.predict_generator(augmented_generator,
                                                 use_multiprocessing=True,
                                                 workers=8)

    return 0.5 * plain_pred + 0.5 * augmented_sum / TEST_TIME_AUGMENT


def main():
    """Predict on a K-fold split and on the test set, saving both results.

    Loads the saved model for the experiment described by the parsed
    arguments, predicts the training samples selected by the split file's
    `test_indexes` and then the test data, each time blending a plain pass
    with test-time-augmented passes, and writes one `.npz` file per set.

    Raises:
        FileNotFoundError: if the weights file for the experiment is missing.
    """
    args = parse_args()

    print("load the model configuration...", file=sys.stderr)
    print("=======================================================",
          file=sys.stderr)

    exp_config = generate_exp_config(args.net_name, args.pre_trained,
                                     args.include_fc, args.k_fold)
    weights_path = get_weights_path(net_name=args.net_name)

    net = importlib.import_module("Nets." + args.net_name)

    batch_size = get_batch_size(args.net_name, args.pre_trained)
    input_shape = get_input_shape(args.net_name, args.pre_trained)

    preprocessing_function = net.preprocess_input if args.pre_trained else None

    weights_filename = os.path.join(weights_path, "{}.h5".format(exp_config))

    # Explicit check rather than `assert`: assertions are removed under
    # `python -O`, and the old assertion carried `None` as its message.
    if not os.path.exists(weights_filename):
        print("the model doesn't exist...", file=sys.stderr)
        raise FileNotFoundError(weights_filename)
    model = load_model(weights_filename)

    # Arguments shared by every generator built below.
    common_kwargs = dict(
        batch_size=batch_size,
        shuffle=False,
        output_shape=(input_shape[0], input_shape[1]),
        n_channels=input_shape[2],
        preprocessing_function=preprocessing_function)

    # Extra arguments for the generators that always augment their samples.
    augment_kwargs = dict(
        augment=True,
        rotation_range=AUGMENT_PARAMETERS.get('rotation_range', 0.),
        width_shift_range=AUGMENT_PARAMETERS.get('width_shift_range', 0.),
        height_shift_range=AUGMENT_PARAMETERS.get('height_shift_range', 0.),
        shear_range=AUGMENT_PARAMETERS.get('shear_range', 0.),
        zoom_range=AUGMENT_PARAMETERS.get('zoom_range', 0.),
        fill_mode=AUGMENT_PARAMETERS.get('fill_mode', 'nearest'),
        cval=AUGMENT_PARAMETERS.get('cval', 0.),
        horizontal_flip=AUGMENT_PARAMETERS.get('horizontal_flip', True),
        vertical_flip=AUGMENT_PARAMETERS.get('vertical_flip', True),
        augment_prob=1.0)

    # output path
    training_predict_path = get_training_predict_path(args.net_name)
    test_predict_path = get_test_predict_path(args.net_name)

    print("load training data...", file=sys.stderr)
    print("=======================================================",
          file=sys.stderr)

    img, label = load_data(dataset="train")

    split_filename = os.path.join(DATA_DIR, "KFold_{}.npz".format(args.k_fold))
    split = np.load(split_filename)

    test_indexes = split['test_indexes']

    print("validate the model on {} samples".format(test_indexes.shape[0]),
          file=sys.stderr)

    valid_generator = ImageDataGenerator(x=img[test_indexes],
                                         y=None,
                                         augment=False,
                                         **common_kwargs)
    valid_generator_aug = ImageDataGenerator(x=img[test_indexes],
                                             y=None,
                                             **common_kwargs,
                                             **augment_kwargs)
    valid_pred = _predict_with_tta(model, valid_generator,
                                   valid_generator_aug,
                                   test_indexes.shape[0])

    filename = os.path.join(training_predict_path, "{}.npz".format(exp_config))
    np.savez(file=filename, pred=valid_pred, label=label[test_indexes])

    print("load test data...", file=sys.stderr)
    print("=======================================================",
          file=sys.stderr)

    x_test = load_data(dataset="test")

    # The test generators are built without `y`, exactly as before.
    test_generator = ImageDataGenerator(x=x_test,
                                        augment=False,
                                        **common_kwargs)
    test_generator_aug = ImageDataGenerator(x=x_test,
                                            **common_kwargs,
                                            **augment_kwargs)
    test_pred = _predict_with_tta(model, test_generator, test_generator_aug,
                                  x_test.shape[0])

    filename = os.path.join(test_predict_path, "{}.npz".format(exp_config))
    np.savez(file=filename, pred=test_pred)
Пример #16
0
def MMD(opts, resp_qz, sample_qz, resp_pz, sample_pz):
    """
    Compute MMD between prior and aggregated posterior
    opts: dict; reads 'mmd_kernel' and, for the IMQ kernel, 'x_var'
    resp_pz: prior mixture resp. [K]
    resp_qz: variational mixture resp. [batch,K]
    sample_qz/sample_pz: latent samples [batch,K,zdim]

    Returns a scalar tensor. For 'RBF' it is the statistic at a single
    bandwidth; for 'IMQ' it is the sum of the statistic over a fixed list
    of scales.

    Raises:
        ValueError: if opts['mmd_kernel'] is neither 'RBF' nor 'IMQ'.
    """
    kernel = opts['mmd_kernel']
    # Reject an unknown kernel before any graph op is created.
    if kernel not in ('RBF', 'IMQ'):
        raise ValueError('%s Unknown kernel' % kernel)

    # Unpacking two values also enforces that sample_qz has rank 3.
    _, zdim = sample_qz.get_shape().as_list()[1:]
    nf = tf.cast(utils.get_batch_size(sample_qz), tf.float32)
    # reshape resp_pz to be broadcastable along batch dim
    resp_pz = tf.expand_dims(resp_pz, axis=0)  #[1,K]
    # get pairwise distances
    distances_pz = square_dist(sample_pz, sample_pz)  #[batch,K,K,batch]
    distances_qz = square_dist(sample_qz, sample_qz)  #[batch,K,K,batch]
    distances = square_dist(sample_qz, sample_pz)  #[batch,K,K,batch]

    # The responsibility weights do not depend on the kernel or its scale,
    # so they are built once instead of once per branch and per IMQ scale.
    resp_qz_broadcast_1 = tf.expand_dims(tf.expand_dims(resp_qz, axis=2),
                                         axis=2)  #[batch,K,1,1]
    resp_qz_broadcast_2 = tf.expand_dims(tf.expand_dims(
        tf.transpose(resp_qz), axis=0), axis=0)  #[1,1,K,batch]
    resp_pz_broadcast_1 = tf.expand_dims(tf.expand_dims(resp_pz, axis=2),
                                         axis=2)  #[1,K,1,1]
    resp_pz_broadcast_2 = tf.expand_dims(tf.expand_dims(
        tf.transpose(resp_pz), axis=0), axis=0)  #[1,1,K,1]
    weights_qq = resp_qz_broadcast_1 * resp_qz_broadcast_2
    weights_pp = resp_pz_broadcast_1 * resp_pz_broadcast_2
    weights_qp = resp_qz_broadcast_1 * resp_pz_broadcast_2

    def _mmd_from_kernel(kernel_fn):
        # Turn an elementwise kernel on squared distances into the statistic.
        within = (kernel_fn(distances_qz) * weights_qq
                  + kernel_fn(distances_pz) * weights_pp)
        #correction term
        res1 = tf.reduce_sum(within) - tf.linalg.trace(
            tf.reduce_sum(within, axis=[1, 2]))
        res1 /= nf * nf - nf
        # cross term
        res2 = tf.reduce_sum(kernel_fn(distances) * weights_qp) / (nf * nf)
        # mmd
        return res1 - 2. * res2

    if kernel == 'RBF':
        half_size = tf.cast((nf * nf - nf) / 2, tf.int32)
        # Median heuristic for the sigma^2 of Gaussian kernel. The distances
        # are flattened, so top_k returns a rank-1 tensor and one index picks
        # a scalar; the former `[:, half_size - 1]` index failed on rank 1.
        sigma2_k = tf.nn.top_k(tf.reshape(distances, [-1]),
                               half_size).values[half_size - 1]
        sigma2_k += tf.nn.top_k(tf.reshape(distances_qz, [-1]),
                                half_size).values[half_size - 1]
        return _mmd_from_kernel(lambda d: tf.exp(-d / 2. / sigma2_k))

    # k(x, y) = C / (C + ||x - y||^2)
    # NOTE(review): (x_var + x_var) / 2 is just x_var; possibly two different
    # scales were meant to be averaged here -- confirm with the author.
    Cbase = 2 * zdim * ((opts['x_var'] + opts['x_var']) / 2.)**2
    res = 0.
    for scale in [.1, .2, .5, 1., 2., 5., 10.]:
        C = Cbase * scale
        res += _mmd_from_kernel(lambda d, C=C: C / (C + d))

    return res