Exemplo n.º 1
0
def contrastive_loss(margin, im, sents):
    """
    Sum the pairwise ranking (hinge) loss of the images against each language.

    Every entry of ``sents`` is scored against ``im`` on its own; sentences of
    different languages are never compared with each other.

    :param margin: value added inside the hinge before clipping at zero.
    :param im: image embeddings; ``tensor.dot(im, s.T)`` must be defined for
        every ``s`` in ``sents``.
    :param sents: sequence of sentence embeddings, one entry per language.
    :return: the accumulated cost (``0.`` when ``sents`` is empty).
    """
    def language_cost(lang_sents):
        # image-sentence score matrix; the diagonal holds the matching pairs
        sim = tensor.dot(im, lang_sents.T)
        matched = sim.diagonal()
        # each score against the matching score of its column
        # (all contrastive images for each sentence)
        per_sentence = tensor.maximum(0, margin - matched + sim)
        # each score against the matching score of its row
        # (all contrastive sentences for each image)
        per_image = tensor.maximum(0, margin - matched.reshape((-1, 1)) + sim)
        # a matching pair is never contrasted with itself
        per_sentence = fill_diagonal(per_sentence, 0)
        per_image = fill_diagonal(per_image, 0)
        return per_sentence.sum() + per_image.sum()

    total = 0.
    for lang_sents in sents:
        total += language_cost(lang_sents)
    return total
Exemplo n.º 2
0
def contrastive_loss_all(margin, im, sents, lambda_img_sent=0.5, lambda_sent_sent=0.5):
    """
    Weighted pairwise ranking loss over image-language and language-language pairs.

    The image embeddings are contrasted with every language, and every ordered
    pair of distinct languages is contrasted as well (both ``(i, j)`` and
    ``(j, i)`` contribute).

    :param margin: value added inside the hinge before clipping at zero.
    :param im: image embeddings.
    :param sents: sequence of sentence embeddings, one entry per language.
    :param lambda_img_sent: weight applied to each image-language term.
    :param lambda_sent_sent: weight applied to each language-language term.
    :return: the accumulated cost (``0.`` when ``sents`` is empty).
    """
    def ranking_cost(left, right):
        # score matrix between the two views; matching pairs lie on the diagonal
        sim = tensor.dot(left, right.T)
        matched = sim.diagonal()
        # each score against the matching score of its column
        by_column = tensor.maximum(0, margin - matched + sim)
        # each score against the matching score of its row
        by_row = tensor.maximum(0, margin - matched.reshape((-1, 1)) + sim)
        # a matching pair is never contrasted with itself
        by_column = fill_diagonal(by_column, 0)
        by_row = fill_diagonal(by_row, 0)
        return by_column.sum() + by_row.sum()

    total = 0.

    # image <-> sentence terms, one per language
    for lang_sents in sents:
        total += lambda_img_sent * ranking_cost(im, lang_sents)

    # sentence <-> sentence terms for every ordered pair of distinct languages
    for i, first in enumerate(sents):
        for j, second in enumerate(sents):
            if i != j:
                total += lambda_sent_sent * ranking_cost(first, second)

    return total
Exemplo n.º 3
0
def contrastive_loss(s, im, options):
    """
    For a minibatch of sentence and image embeddings, compute the pairwise contrastive loss.

    :param s: sentence embeddings (a symbolic matrix).
    :param im: image embeddings (a symbolic matrix).
    :param options: mapping providing ``'margin'`` and ``'method'``; the method
        must be ``'order'`` or ``'cosine'``. It is also forwarded to
        ``order_violations`` for the ``'order'`` method.
    :return: sum of the hinge costs over all non-matching pairs.
    :raises ValueError: if ``options['method']`` is not a supported method.
    """
    margin = options['margin']
    method = options['method']

    if method == 'order':
        # add broadcastable axes so every sentence is paired with every image
        im2 = im.dimshuffle(('x', 0, 1))
        s2 = s.dimshuffle((0, 'x', 1))
        errors = order_violations(s2, im2, options).sum(axis=2)
    elif method == 'cosine':
        # negative because error is the opposite of (cosine) similarity
        errors = -tensor.dot(im, s.T)
    else:
        # previously fell through and failed later with an unbound `errors`
        raise ValueError("Unsupported method: %r" % (method,))

    # errors of the matching pairs
    diagonal = errors.diagonal()

    # compare every diagonal score to scores in its column
    # (all contrastive images for each sentence)
    cost_s = tensor.maximum(0, margin - errors + diagonal)
    # compare every diagonal score to scores in its row
    # (all contrastive sentences for each image)
    cost_im = tensor.maximum(0, margin - errors + diagonal.reshape((-1, 1)))

    cost_tot = cost_s + cost_im

    # clear diagonals: a matching pair is never contrasted with itself
    cost_tot = fill_diagonal(cost_tot, 0)

    return cost_tot.sum()
Exemplo n.º 4
0
def contrastive_loss(s, im, options):
    """
    For a minibatch of sentence and image embeddings, compute the pairwise contrastive loss.

    :param s: sentence embeddings (a symbolic matrix).
    :param im: image embeddings (a symbolic matrix).
    :param options: mapping providing ``'margin'`` and ``'method'``; the method
        must be ``'order'`` or ``'cosine'``. It is also forwarded to
        ``order_violations`` for the ``'order'`` method.
    :return: sum of the hinge costs over all non-matching pairs.
    :raises ValueError: if ``options['method']`` is not a supported method.
    """
    margin = options['margin']
    method = options['method']

    if method == 'order':
        # add broadcastable axes so every sentence is paired with every image
        im2 = im.dimshuffle(('x', 0, 1))
        s2 = s.dimshuffle((0, 'x', 1))
        errors = order_violations(s2, im2, options).sum(axis=2)
    elif method == 'cosine':
        # The hinge below treats larger values as worse (as the 'order'
        # violations are), so the similarity has to be negated; feeding the raw
        # dot product would reward mismatched pairs for being more similar
        # than the matching ones.
        errors = -tensor.dot(im, s.T)
    else:
        # previously failed later with an AttributeError on None
        raise ValueError("Unsupported method: %r" % (method,))

    # errors of the matching pairs
    diagonal = errors.diagonal()

    # compare every diagonal score to scores in its column
    # (all contrastive images for each sentence)
    cost_s = tensor.maximum(0, margin - errors + diagonal)
    # compare every diagonal score to scores in its row
    # (all contrastive sentences for each image)
    cost_im = tensor.maximum(0, margin - errors + diagonal.reshape((-1, 1)))

    cost_tot = cost_s + cost_im

    # clear diagonals: a matching pair is never contrasted with itself
    cost_tot = fill_diagonal(cost_tot, 0)

    return cost_tot.sum()
Exemplo n.º 5
0
    def loss(lv1, lv2):
        """Pairwise hinge loss on the dot-product scores of two embedding views.

        Uses ``gamma`` from the enclosing scope as the margin and returns the
        sum of the column-wise and row-wise hinge terms, excluding the diagonal.
        """
        # score matrix between the two views; matching pairs lie on the diagonal
        sim = T.dot(lv1, lv2.T)
        matched = sim.diagonal()

        # each score against the matching score of its column, diagonal cleared
        by_column = fill_diagonal(T.maximum(0, gamma - matched + sim), 0)
        # each score against the matching score of its row, diagonal cleared
        by_row = fill_diagonal(
            T.maximum(0, gamma - matched.reshape((-1, 1)) + sim), 0)

        return by_column.sum() + by_row.sum()
def contrastive_loss(margin, im, s):
    """
    Pairwise ranking (hinge) loss between image and sentence embeddings.

    :param margin: value added inside the hinge before clipping at zero.
    :param im: image embeddings.
    :param s: sentence embeddings.
    :return: sum of the column-wise and row-wise hinge terms, diagonal excluded.
    """
    # image-sentence score matrix; matching pairs lie on the diagonal
    sim = tensor.dot(im, s.T)
    matched = sim.diagonal()

    hinge_terms = (
        # each score against the matching score of its column
        # (all contrastive images for each sentence)
        tensor.maximum(0, margin - matched + sim),
        # each score against the matching score of its row
        # (all contrastive sentences for each image)
        tensor.maximum(0, margin - matched.reshape((-1, 1)) + sim),
    )

    # a matching pair is never contrasted with itself
    by_column, by_row = [fill_diagonal(term, 0) for term in hinge_terms]

    return by_column.sum() + by_row.sum()
def contrastive_loss(margin, im, s):
    """
    Compute the contrastive (pairwise ranking) loss of images against sentences.

    :param margin: value added inside the hinge before clipping at zero.
    :param im: image embeddings.
    :param s: sentence embeddings.
    :return: total hinge cost over all non-matching image-sentence pairs,
        counted once per column and once per row.
    """
    similarity = tensor.dot(im, s.T)
    positives = similarity.diagonal()

    # offset shared by a whole column (all contrastive images for each sentence)
    column_offset = margin - positives
    # offset shared by a whole row (all contrastive sentences for each image)
    row_offset = margin - positives.reshape((-1, 1))

    sentence_cost = tensor.maximum(0, column_offset + similarity)
    image_cost = tensor.maximum(0, row_offset + similarity)

    # zero the diagonal so matching pairs do not contribute
    sentence_total = fill_diagonal(sentence_cost, 0).sum()
    image_total = fill_diagonal(image_cost, 0).sum()

    return sentence_total + image_total
Exemplo n.º 8
0
def contrastive_loss(tparams, options, im, sents):
    """
    Compute contrastive loss.
    Contrastive loss is computed between each language-image pair
    but not between sentences in different languages.

    :param tparams: mapping of shared parameters; for the ``'general'``
        attention type it must hold ``'image_sentence_<i>_mapping'`` for every
        language index ``i``.
    :param options: mapping providing ``'margin'`` and, optionally,
        ``'attention_type'`` (``'dot'`` by default, or ``'general'``).
    :param im: image embeddings.
    :param sents: sequence of sentence embeddings, one entry per language.
    :return: the accumulated cost (``0.`` when ``sents`` is empty).
    :raises ValueError: if the attention type is not supported.
    """
    margin = options['margin']
    attention_type = options.get('attention_type', 'dot')

    final_cost = 0.
    # compute cost for each language and aggregate on final cost
    for i, s_lang in enumerate(sents):
        # compute image-sentence score matrix
        if attention_type == 'dot':
            scores_lang = tensor.dot(im, s_lang.T)
        elif attention_type == 'general':
            # bilinear score through the per-language mapping matrix
            sents_img = tparams['image_sentence_%i_mapping' % i]
            scores_lang = im.dot(sents_img).dot(s_lang.T)
        else:
            # ValueError is an Exception subclass, so existing handlers still match
            raise ValueError("Attention type not supported: %s" %
                             attention_type)

        diagonal_lang = scores_lang.diagonal()
        # compare every diagonal score to scores in its column
        # (i.e, all contrastive images for each sentence)
        cost_sent_lang = tensor.maximum(0,
                                        margin - diagonal_lang + scores_lang)
        # compare every diagonal score to scores in its row
        # (i.e, all contrastive sentences for each image)
        cost_im_lang = tensor.maximum(
            0, margin - diagonal_lang.reshape((-1, 1)) + scores_lang)
        # clear diagonals: a matching pair is never contrasted with itself
        cost_sent_lang = fill_diagonal(cost_sent_lang, 0)
        cost_im_lang = fill_diagonal(cost_im_lang, 0)

        # aggregate
        final_cost += cost_sent_lang.sum() + cost_im_lang.sum()

    return final_cost
Exemplo n.º 9
0
    def contrastive(self, i, s, margin=0.2):
        """Mean pairwise hinge loss on the negated cosine matrix of ``i`` and ``s``.

        i: (fixed) image embedding
        s: sentence embedding
        margin: value added inside the hinge before clipping at zero
        """
        # larger value = worse match, hence the negated similarity
        neg_sim = - util.cosine_matrix(i, s)
        matched = neg_sim.diagonal()
        # each entry against the matching entry of its column
        # (all contrastive images for each sentence)
        by_column = T.maximum(0, margin - neg_sim + matched)
        # each entry against the matching entry of its row
        # (all contrastive sentences for each image)
        by_row = T.maximum(0, margin - neg_sim + matched.reshape((-1, 1)))
        # zero the diagonal so matching pairs do not contribute
        combined = fill_diagonal(by_column + by_row, 0)

        return combined.mean()
Exemplo n.º 10
0
def contrastive(i, s, margin=0.2):
    """Mean pairwise hinge loss on the negated cosine matrix of ``i`` and ``s``.

    i: (fixed) image embedding
    s: sentence embedding
    margin: value added inside the hinge before clipping at zero
    """
    # larger value = worse match, hence the negated similarity
    distance = -cosine_matrix(i, s)
    positives = distance.diagonal()

    hinge_terms = [
        # column-wise: all contrastive images for each sentence
        T.maximum(0, margin - distance + positives),
        # row-wise: all contrastive sentences for each image
        T.maximum(0, margin - distance + positives.reshape((-1, 1))),
    ]
    total = hinge_terms[0] + hinge_terms[1]

    # zero the diagonal so matching pairs do not contribute
    return fill_diagonal(total, 0).mean()
Exemplo n.º 11
0
    def test_perform(self):
        """Run fill_diagonal on 2-d inputs of several shapes and on a 3-d cube."""
        mat_var = tensor.matrix()
        fill_var = tensor.scalar()
        fill_2d = function([mat_var, fill_var], fill_diagonal(mat_var, fill_var))
        for shape in ((8, 8), (5, 8), (8, 5)):
            data = numpy.random.rand(*shape).astype(config.floatX)
            fill = numpy.cast[config.floatX](numpy.random.rand())
            result = fill_2d(data, fill)
            # numpy.fill_diagonal is deliberately not used as the reference
            # (it is considered bugged), so the diagonal is checked directly.
            assert numpy.allclose(numpy.diag(result), fill)
            # only the diagonal entries may equal the fill value
            assert (result == fill).sum() == min(data.shape)

        # test for 3d tensor
        cube = numpy.random.rand(3, 3, 3).astype(config.floatX)
        cube_var = tensor.tensor3()
        fill_var = tensor.scalar()
        fill_3d = function([cube_var, fill_var],
                           fill_diagonal(cube_var, fill_var))
        # the +10 offset puts the fill value outside rand()'s [0, 1) range
        fill = numpy.cast[config.floatX](numpy.random.rand() + 10)
        result = fill_3d(cube, fill)
        for k in range(3):
            assert result[k, k, k] == fill
        assert (result == fill).sum() == min(cube.shape)
Exemplo n.º 12
0
    def test_perform(self):
        """Check fill_diagonal output for matrices and for a 3x3x3 tensor."""

        def compile_fill(symbolic_input):
            # build a callable taking (array, scalar) and returning the filled array
            symbolic_value = tensor.scalar()
            return function([symbolic_input, symbolic_value],
                            fill_diagonal(symbolic_input, symbolic_value))

        run_2d = compile_fill(tensor.matrix())
        for rows, cols in [(8, 8), (5, 8), (8, 5)]:
            source = numpy.random.rand(rows, cols).astype(config.floatX)
            target = numpy.cast[config.floatX](numpy.random.rand())
            filled = run_2d(source, target)
            # numpy.fill_diagonal is not used as the reference because it is
            # considered bugged; inspect the diagonal directly instead.
            assert numpy.allclose(numpy.diag(filled), target)
            # exactly one hit per diagonal position
            assert (filled == target).sum() == min(source.shape)

        # test for 3d tensor
        source = numpy.random.rand(3, 3, 3).astype(config.floatX)
        run_3d = compile_fill(tensor.tensor3())
        # the +10 offset puts the fill value outside rand()'s [0, 1) range
        target = numpy.cast[config.floatX](numpy.random.rand() + 10)
        filled = run_3d(source, target)
        assert filled[0, 0, 0] == target
        assert filled[1, 1, 1] == target
        assert filled[2, 2, 2] == target
        assert (filled == target).sum() == min(source.shape)
Exemplo n.º 13
0
def contrastive_loss(labels, predict):
    """For a minibatch of sentence and image embeddings, compute the pairwise contrastive loss.

    ``predict`` is split along its last axis into two parts of
    ``model_config['output_dim']`` columns each: the first is used as the
    sentence embeddings, the second as the image embeddings. The error of a
    pair is the squared Euclidean distance between its two embeddings.

    :param labels: unused; presumably present only to satisfy the
        ``(labels, predict)`` loss signature of the caller -- TODO confirm.
    :param predict: symbolic tensor holding both embeddings side by side.
    :return: sum of the hinge costs over all non-matching pairs.
    """
    # Configuration is read from the module-level ``model_config``. The former
    # ``global model_options`` statement named a variable this function never
    # uses and has been dropped; it had no effect.
    margin = model_config['margin']
    output_dim = model_config['output_dim']
    res = theano.tensor.split(predict, [output_dim, output_dim], 2, axis=-1)
    s = res[0]
    im = res[1]
    # add broadcastable axes so every sentence is paired with every image
    im2 = im.dimshuffle(('x', 0, 1))
    s2 = s.dimshuffle((0, 'x', 1))
    errors = tensor.pow(im2 - s2, 2).sum(axis=2)
    diagonal = errors.diagonal()
    # compare every diagonal score to scores in its column (all contrastive images for each sentence)
    cost_s = tensor.maximum(0, margin - errors + diagonal)
    # all contrastive sentences for each image
    cost_im = tensor.maximum(0, margin - errors + diagonal.reshape((-1, 1)))
    cost_tot = cost_s + cost_im
    # matching pairs must not contribute to the loss
    cost_tot = fill_diagonal(cost_tot, 0)
    return cost_tot.sum()
Exemplo n.º 14
0
def contrastive_loss(labels, predict):
    """For a minibatch of sentence and image embeddings, compute the pairwise contrastive loss.

    ``predict`` is split along its last axis into two parts of
    ``model_config['output_dim']`` columns each: the first is used as the
    sentence embeddings, the second as the image embeddings. The error of a
    pair is the squared norm of the positive part of ``image - sentence``, so
    only coordinates where the image exceeds the sentence are penalised.

    :param labels: unused; presumably present only to satisfy the
        ``(labels, predict)`` loss signature of the caller -- TODO confirm.
    :param predict: symbolic tensor holding both embeddings side by side.
    :return: sum of the hinge costs over all non-matching pairs.
    """
    # Configuration is read from the module-level ``model_config``. The former
    # ``global model_options`` statement named a variable this function never
    # uses and has been dropped; it had no effect.
    margin = model_config['margin']
    output_dim = model_config['output_dim']
    res = theano.tensor.split(predict, [output_dim, output_dim], 2, axis=-1)
    s = res[0]
    im = res[1]
    # add broadcastable axes so every sentence is paired with every image
    im2 = im.dimshuffle(('x', 0, 1))
    s2 = s.dimshuffle((0, 'x', 1))
    errors = tensor.pow(tensor.maximum(0, im2 - s2), 2).sum(axis=2)
    diagonal = errors.diagonal()
    # compare every diagonal score to scores in its column (all contrastive images for each sentence)
    cost_s = tensor.maximum(0, margin - errors + diagonal)
    # all contrastive sentences for each image
    cost_im = tensor.maximum(0, margin - errors + diagonal.reshape((-1, 1)))
    cost_tot = cost_s + cost_im
    # matching pairs must not contribute to the loss
    cost_tot = fill_diagonal(cost_tot, 0)
    return cost_tot.sum()
Exemplo n.º 15
0
def contrastive_loss_all(tparams,
                         options,
                         im,
                         sents,
                         lambda_img_sent=0.5,
                         lambda_sent_sent=0.5):
    """
    Compute contrastive loss.
    Contrastive loss is computed between each language-image pair
    as well as between sentences in different languages.

    :param tparams: mapping of shared parameters; for the ``'general'``
        attention type it must hold ``'image_sentence_<i>_mapping'`` for every
        language index ``i`` and ``'sentence_<i>_sentence_<j>_mapping'`` for
        every pair ``i < j``.
    :param options: mapping providing ``'margin'`` and, optionally,
        ``'attention_type'`` (``'dot'`` by default, or ``'general'``).
    :param im: image embeddings.
    :param sents: sequence of sentence embeddings, one entry per language.
    :param lambda_img_sent: weight applied to each image-language term.
    :param lambda_sent_sent: weight applied to each language-language term.
    :return: the accumulated cost (``0.`` when ``sents`` is empty).
    :raises ValueError: if the attention type is not supported.
    """
    margin = options['margin']
    attention_type = options.get('attention_type', 'dot')

    def score_matrix(left, right, mapping_name):
        # plain dot product, or a bilinear form through a learned mapping
        if attention_type == 'dot':
            return tensor.dot(left, right.T)
        if attention_type == 'general':
            return left.dot(tparams[mapping_name]).dot(right.T)
        # ValueError is an Exception subclass, so existing handlers still match
        raise ValueError("Attention type not supported: %s" %
                         attention_type)

    def ranking_cost(scores):
        diagonal = scores.diagonal()
        # compare every diagonal score to scores in its column
        cost_cols = tensor.maximum(0, margin - diagonal + scores)
        # compare every diagonal score to scores in its row
        cost_rows = tensor.maximum(
            0, margin - diagonal.reshape((-1, 1)) + scores)
        # clear diagonals: a matching pair is never contrasted with itself
        cost_cols = fill_diagonal(cost_cols, 0)
        cost_rows = fill_diagonal(cost_rows, 0)
        return cost_cols.sum() + cost_rows.sum()

    n_langs = len(sents)
    final_cost = 0.

    # compute costs for each language-image pair and aggregate final cost
    for i in range(n_langs):
        scores_lang = score_matrix(im, sents[i],
                                   'image_sentence_%i_mapping' % i)
        final_cost += lambda_img_sent * ranking_cost(scores_lang)

    # compute costs for each language-language pair and aggregate final cost;
    # every unordered pair is visited once (i < j)
    for i in range(n_langs):
        for j in range(i + 1, n_langs):
            scores_lang = score_matrix(
                sents[i], sents[j],
                'sentence_%i_sentence_%i_mapping' % (i, j))
            final_cost += lambda_sent_sent * ranking_cost(scores_lang)

    return final_cost
Exemplo n.º 16
0
    def build_mlp(theano_params,
                  trng,
                  enc_states,
                  activation_mlp='relu',
                  dropout=False,
                  dropout_hid=0.,
                  use_noise=False,
                  **kwargs):
        """
        Builds an MLP scoring function for use during training.

        A single feed-forward layer (prefix ``'mlp'``) maps ``enc_states`` to a
        prediction that is compared with the target matrix ``y`` (the original
        notes describe it as one FC7 vector of dimensionality 4096 per
        instance).

        The cost is selected by ``kwargs['loss']``:

        * ``'mse'`` -- mean squared error between prediction and target;
        * ``'constrastive'`` (historical spelling) or ``'contrastive'`` --
          pairwise margin cost on the dot products of the L2-normalised
          prediction and target rows;
        * ``'dot'`` -- the same margin cost on the raw dot products.

        :param theano_params: parameters handed to the feed-forward layer.
        :param trng: random stream used to draw the dropout mask.
        :param enc_states: symbolic encoder output fed to the MLP.
        :param activation_mlp: ``'relu'`` or ``'tanh'``.
        :param dropout: enables dropout on ``enc_states`` (together with
            ``dropout_hid > 0`` and ``use_noise``).
        :param dropout_hid: dropout probability; the mask keeps units with
            probability ``1 - dropout_hid``.
        :param use_noise: whether noise (dropout) is active.
        :param kwargs: must contain ``'loss'`` and ``'verbose'``; ``'margin'``
            is required for the margin-based losses.
        :return: tuple ``(y, loss, output)`` of symbolic target, cost and
            MLP prediction.
        :raises ValueError: if ``kwargs['loss']`` is not a supported loss.

        """
        # Validate the requested loss before any graph is built: an unknown
        # name used to surface only at the very end as an unbound `loss`.
        loss_type = kwargs['loss']
        if loss_type not in ('mse', 'constrastive', 'contrastive', 'dot'):
            raise ValueError('Unsupported loss: {!r}'.format(loss_type))

        # apply dropout on encoder states
        if dropout and dropout_hid > 0. and use_noise:
            logger.warn('Applying dropout mask on bi-states')
            mask = inv_dropout_mask(enc_states.shape, trng, 1 - dropout_hid)
            enc_states *= mask
        else:
            logger.warn('No dropout on Encoder output')

        # set MLP activation function
        assert activation_mlp in ('relu', 'tanh'), \
            'MLP activation function must be tanh or relu'

        # the activation is handed to the layer as source text
        activation_mlp = 'lambda x: tensor.nnet.relu(x)' \
            if activation_mlp == 'relu' else 'lambda x: tensor.tanh(x)'
        logger.info('Using MLP activation function: {}'.format(activation_mlp))

        verbose = kwargs['verbose']
        if verbose:
            logger.warn(enc_states.tag.test_value)

        # targets for the MLP -- dim: batch, fc_7 vector
        y = tensor.matrix('y', dtype='float64')
        y.tag.test_value = np.ones((2, 4096))
        if verbose:
            logger.warn(y.tag.test_value)

        # train a single layer MLP to do everything
        output = get_layer('ff')[1](theano_params,
                                    enc_states,
                                    prefix='mlp',
                                    activ=activation_mlp)

        if verbose:
            logger.warn("MLP output {}".format(output.tag.test_value))

        def margin_cost(errors, margin):
            # Shared by the contrastive and dot losses.
            # NOTE(review): `errors` holds dot products here, while the hinge
            # below is smallest when off-diagonal entries exceed the diagonal;
            # sibling implementations negate the similarity first -- confirm
            # the intended sign before relying on these losses.
            diag = errors.diagonal()
            # compare every diagonal score to scores in its column (all contrastive images for each sentence)
            cost_s = tensor.maximum(0, margin - errors + diag)
            # all contrastive sentences for each image
            cost_i = tensor.maximum(0, margin - errors + diag.reshape((-1, 1)))
            cost_tot = cost_s + cost_i
            # clear diagonals
            cost_tot = fill_diagonal(cost_tot, 0)
            if verbose:
                logger.warn("Full cost matrix {}".format(
                    cost_tot.tag.test_value))
            return cost_tot.mean()

        if loss_type == 'mse':
            loss = ((output.flatten() - y.flatten())**2).mean()
        elif loss_type in ('constrastive', 'contrastive'):
            margin = kwargs['margin']
            # L2-normalise every row so the dot product is a cosine
            U_norm = output / output.norm(2, axis=1).reshape(
                (output.shape[0], 1))
            V_norm = y / y.norm(2, axis=1).reshape((y.shape[0], 1))
            loss = margin_cost(tensor.dot(U_norm, V_norm.T), margin)
        else:  # 'dot'
            margin = kwargs['margin']
            loss = margin_cost(tensor.dot(output, y.T), margin)

        if verbose:
            logger.warn("Batch loss {}".format(loss.tag.test_value))

        return y, loss, output