def contrastive_loss(margin, im, sents):
    """
    Sum the pairwise ranking (contrastive) loss over every language.

    Each entry of ``sents`` is scored against ``im`` on its own; sentences
    of different languages are never compared with each other.

    :param margin: margin added to every hinge term.
    :param im: image embeddings (assumed (batch, dim) -- TODO confirm).
    :param sents: sequence of sentence-embedding matrices, one per language
        (assumed row-aligned with ``im`` -- TODO confirm).
    :return: the aggregated cost (``0.`` when ``sents`` is empty).
    """
    total = 0.
    for sent_emb in sents:
        # sim[a, b] is the dot product of image a with sentence b
        sim = tensor.dot(im, sent_emb.T)
        matched = sim.diagonal()

        # column-wise hinge: every score against the matching score of its
        # column (all contrastive images for each sentence); diagonal cleared
        per_sentence = fill_diagonal(
            tensor.maximum(0, margin - matched + sim), 0)
        # row-wise hinge: every score against the matching score of its row
        # (all contrastive sentences for each image); diagonal cleared
        per_image = fill_diagonal(
            tensor.maximum(0, margin - matched.reshape((-1, 1)) + sim), 0)

        total += per_sentence.sum() + per_image.sum()
    return total
def contrastive_loss_all(margin, im, sents, lambda_img_sent=0.5,
                         lambda_sent_sent=0.5):
    """
    Ranking (contrastive) loss over image-sentence and sentence-sentence pairs.

    The image-sentence term is computed once per language and weighted by
    ``lambda_img_sent``. The sentence-sentence term is computed for every
    ordered pair of distinct languages (so both (i, j) and (j, i) contribute)
    and weighted by ``lambda_sent_sent``.

    :param margin: margin added to every hinge term.
    :param im: image embeddings (assumed (batch, dim) -- TODO confirm).
    :param sents: sequence of sentence-embedding matrices, one per language.
    :param lambda_img_sent: weight of each image-sentence sub-cost.
    :param lambda_sent_sent: weight of each sentence-sentence sub-cost.
    :return: the aggregated cost (``0.`` when ``sents`` is empty).
    """
    def _two_way_hinge(left, right):
        # sim[a, b] is the dot product of left row a with right row b
        sim = tensor.dot(left, right.T)
        matched = sim.diagonal()
        # every score against the matching score of its column
        by_column = fill_diagonal(
            tensor.maximum(0, margin - matched + sim), 0)
        # every score against the matching score of its row
        by_row = fill_diagonal(
            tensor.maximum(0, margin - matched.reshape((-1, 1)) + sim), 0)
        return by_column.sum() + by_row.sum()

    total = 0.

    # image <-> sentence sub-costs, one per language
    for sent_emb in sents:
        total += lambda_img_sent * _two_way_hinge(im, sent_emb)

    # sentence <-> sentence sub-costs, every ordered pair of distinct languages
    for a, first in enumerate(sents):
        for b, second in enumerate(sents):
            if a != b:
                total += lambda_sent_sent * _two_way_hinge(first, second)

    return total
def contrastive_loss(s, im, options):
    """
    For a minibatch of sentence and image embeddings, compute the pairwise
    contrastive loss.

    :param s: sentence embeddings (assumed (batch, dim) -- TODO confirm).
    :param im: image embeddings, row-aligned with ``s``.
    :param options: mapping providing ``'margin'`` and ``'method'``
        (``'order'`` or ``'cosine'``); it is also forwarded to
        ``order_violations`` for the ``'order'`` method.
    :return: sum of all off-diagonal hinge costs.
    :raises ValueError: if ``options['method']`` is not a supported method.
    """
    margin = options['margin']
    method = options['method']

    if method == 'order':
        # add broadcast axes so every sentence is paired with every image
        im2 = im.dimshuffle(('x', 0, 1))
        s2 = s.dimshuffle((0, 'x', 1))
        errors = order_violations(s2, im2, options).sum(axis=2)
    elif method == 'cosine':
        # negative because error is the opposite of (cosine) similarity
        errors = -tensor.dot(im, s.T)
    else:
        # previously fell through and failed later with an unbound `errors`
        raise ValueError("Unsupported contrastive method: %r" % (method,))

    diagonal = errors.diagonal()

    # compare every diagonal score to scores in its column
    # (all contrastive images for each sentence)
    cost_s = tensor.maximum(0, margin - errors + diagonal)
    # compare every diagonal score to scores in its row
    # (all contrastive sentences for each image)
    cost_im = tensor.maximum(0, margin - errors + diagonal.reshape((-1, 1)))

    # clear diagonals: a matching pair is not its own contrastive example
    cost_tot = fill_diagonal(cost_s + cost_im, 0)

    return cost_tot.sum()
def contrastive_loss(s, im, options):
    """
    For a minibatch of sentence and image embeddings, compute the pairwise
    contrastive loss.

    The hinge below is written for *error* scores (lower is better for a
    matching pair). The ``'order'`` method already yields errors; for the
    ``'cosine'`` method the dot-product similarity is negated so that
    minimising the loss pulls matching pairs together rather than apart.

    :param s: sentence embeddings (assumed (batch, dim) -- TODO confirm).
    :param im: image embeddings, row-aligned with ``s``.
    :param options: mapping providing ``'margin'`` and ``'method'``
        (``'order'`` or ``'cosine'``); it is also forwarded to
        ``order_violations`` for the ``'order'`` method.
    :return: sum of all off-diagonal hinge costs.
    :raises ValueError: if ``options['method']`` is not a supported method.
    """
    margin = options['margin']
    method = options['method']

    if method == 'order':
        # add broadcast axes so every sentence is paired with every image
        im2 = im.dimshuffle(('x', 0, 1))
        s2 = s.dimshuffle((0, 'x', 1))
        scores = order_violations(s2, im2, options).sum(axis=2)
    elif method == 'cosine':
        # negated: the hinge expects errors, and error is the opposite of
        # similarity (using the raw similarity inverts the objective)
        scores = -tensor.dot(im, s.T)
    else:
        # previously left `scores` as None and crashed on `.diagonal()`
        raise ValueError("Unsupported contrastive method: %r" % (method,))

    diagonal = scores.diagonal()

    # compare every diagonal score to scores in its column
    # (all contrastive images for each sentence)
    cost_s = tensor.maximum(0, margin - scores + diagonal)
    # compare every diagonal score to scores in its row
    # (all contrastive sentences for each image)
    cost_im = tensor.maximum(0, margin - scores + diagonal.reshape((-1, 1)))

    # clear diagonals: a matching pair is not its own contrastive example
    cost_tot = fill_diagonal(cost_s + cost_im, 0)

    return cost_tot.sum()
def loss(lv1, lv2):
    """
    Contrastive optimization target over dot-product scores.

    The margin is read from the free variable ``gamma`` (defined outside
    this function).

    :param lv1: first embedding matrix (assumed (batch, dim) -- TODO confirm).
    :param lv2: second embedding matrix, row-aligned with ``lv1``.
    :return: sum of all off-diagonal hinge costs in both directions.
    """
    # sim[a, b] is the dot product of lv1 row a with lv2 row b
    sim = T.dot(lv1, lv2.T)
    matched = sim.diagonal()

    # column direction: each score against the matching score of its column
    by_column = fill_diagonal(T.maximum(0, gamma - matched + sim), 0)
    # row direction: each score against the matching score of its row
    by_row = fill_diagonal(
        T.maximum(0, gamma - matched.reshape((-1, 1)) + sim), 0)

    return by_column.sum() + by_row.sum()
def contrastive_loss(margin, im, s):
    """
    Pairwise ranking (contrastive) loss between images and sentences.

    :param margin: margin added to every hinge term.
    :param im: image embeddings (assumed (batch, dim) -- TODO confirm).
    :param s: sentence embeddings, row-aligned with ``im``.
    :return: sum of all off-diagonal hinge costs in both directions.
    """
    # sim[a, b] is the dot product of image a with sentence b
    sim = tensor.dot(im, s.T)
    matched = sim.diagonal()

    # all contrastive images for each sentence (column-wise comparison)
    sentence_side = fill_diagonal(
        tensor.maximum(0, margin - matched + sim), 0)
    # all contrastive sentences for each image (row-wise comparison)
    image_side = fill_diagonal(
        tensor.maximum(0, margin - matched.reshape((-1, 1)) + sim), 0)

    return sentence_side.sum() + image_side.sum()
def contrastive_loss(tparams, options, im, sents):
    """
    Compute contrastive loss.

    Contrastive loss is computed between each language-image pair but not
    between sentences in different languages.

    :param tparams: parameter mapping; for the ``'general'`` attention type it
        must contain ``'image_sentence_<i>_mapping'`` for every language ``i``.
    :param options: mapping providing ``'margin'`` and, optionally,
        ``'attention_type'`` (``'dot'`` by default, or ``'general'``).
    :param im: image embeddings (assumed (batch, dim) -- TODO confirm).
    :param sents: sequence of sentence-embedding matrices, one per language.
    :return: the aggregated cost (``0.`` when ``sents`` is empty).
    :raises ValueError: if the attention type is not supported.
    """
    margin = options['margin']
    attention_type = options.get('attention_type', 'dot')

    final_cost = 0.

    # compute cost for each language and aggregate on final cost
    for i, s_lang in enumerate(sents):
        # compute image-sentence score matrix
        if attention_type == 'dot':
            scores_lang = tensor.dot(im, s_lang.T)
        elif attention_type == 'general':
            # bilinear score through a per-language mapping matrix
            sents_img = tparams['image_sentence_%i_mapping' % i]
            scores_lang = im.dot(sents_img).dot(s_lang.T)
        else:
            raise ValueError(
                "Attention type not supported: %s" % attention_type)

        diagonal_lang = scores_lang.diagonal()

        # compare every diagonal score to scores in its column
        # (i.e, all contrastive images for each sentence)
        cost_sent_lang = tensor.maximum(
            0, margin - diagonal_lang + scores_lang)
        # compare every diagonal score to scores in its row
        # (i.e, all contrastive sentences for each image)
        cost_im_lang = tensor.maximum(
            0, margin - diagonal_lang.reshape((-1, 1)) + scores_lang)

        # clear diagonals
        cost_sent_lang = fill_diagonal(cost_sent_lang, 0)
        cost_im_lang = fill_diagonal(cost_im_lang, 0)

        # aggregate
        final_cost += cost_sent_lang.sum() + cost_im_lang.sum()

    return final_cost
def contrastive(self, i, s, margin=0.2):
    """
    Mean pairwise contrastive cost between image and sentence embeddings.

    :param i: (fixed) image embedding.
    :param s: sentence embedding.
    :param margin: margin added to every hinge term.
    :return: mean over the whole cost matrix (cleared diagonal included).
    """
    # error is the negated output of util.cosine_matrix
    # (presumably pairwise cosine similarity -- confirm against util)
    neg_sim = - util.cosine_matrix(i, s)
    matched = neg_sim.diagonal()

    # all contrastive images for each sentence (column-wise comparison)
    sentence_side = T.maximum(0, margin - neg_sim + matched)
    # all contrastive sentences for each image (row-wise comparison)
    image_side = T.maximum(0, margin - neg_sim + matched.reshape((-1, 1)))

    # matching pairs do not count as their own contrastive example
    combined = fill_diagonal(sentence_side + image_side, 0)
    return combined.mean()
def contrastive(i, s, margin=0.2):
    """
    Mean pairwise contrastive cost between image and sentence embeddings.

    :param i: (fixed) image embedding.
    :param s: sentence embedding.
    :param margin: margin added to every hinge term.
    :return: mean over the whole cost matrix (cleared diagonal included).
    """
    # error is the negated output of cosine_matrix
    # (presumably pairwise cosine similarity -- confirm against its definition)
    neg_sim = -cosine_matrix(i, s)
    matched = neg_sim.diagonal()

    # all contrastive images for each sentence (column-wise comparison)
    sentence_side = T.maximum(0, margin - neg_sim + matched)
    # all contrastive sentences for each image (row-wise comparison)
    image_side = T.maximum(0, margin - neg_sim + matched.reshape((-1, 1)))

    # matching pairs do not count as their own contrastive example
    combined = fill_diagonal(sentence_side + image_side, 0)
    return combined.mean()
def test_perform(self):
    """
    Check ``fill_diagonal`` on square and rectangular matrices and on a
    3-d tensor: the diagonal holds the fill value and nothing else does.
    """
    # --- matrices: square, wide and tall ---
    mat_var = tensor.matrix()
    fill_var = tensor.scalar()
    fill_fn = function([mat_var, fill_var], fill_diagonal(mat_var, fill_var))
    for shape in ((8, 8), (5, 8), (8, 5)):
        data = numpy.random.rand(*shape).astype(config.floatX)
        fill = numpy.cast[config.floatX](numpy.random.rand())
        result = fill_fn(data, fill)
        # We can't use numpy.fill_diagonal as it is bugged.
        assert numpy.allclose(numpy.diag(result), fill)
        # exactly one filled entry per diagonal position
        assert (result == fill).sum() == min(data.shape)

    # --- 3-d tensor ---
    data = numpy.random.rand(3, 3, 3).astype(config.floatX)
    cube_var = tensor.tensor3()
    fill_var = tensor.scalar()
    fill_fn = function([cube_var, fill_var],
                       fill_diagonal(cube_var, fill_var))
    # offset by 10 so the fill value cannot collide with data drawn in [0, 1)
    fill = numpy.cast[config.floatX](numpy.random.rand() + 10)
    result = fill_fn(data, fill)
    # We can't use numpy.fill_diagonal as it is bugged.
    for k in range(3):
        assert result[k, k, k] == fill
    assert (result == fill).sum() == min(data.shape)
def contrastive_loss(labels, predict):
    """
    For a minibatch of sentence and image embeddings, compute the pairwise
    contrastive loss.

    ``predict`` is split along its last axis into two chunks of
    ``model_config['output_dim']`` columns: sentence embeddings first, image
    embeddings second. The error of a (sentence, image) pair is the squared
    Euclidean distance between the two embeddings.

    :param labels: unused; kept so the signature stays unchanged
        (presumably a framework-imposed ``(y_true, y_pred)`` signature --
        confirm against the caller).
    :param predict: matrix holding sentence and image embeddings side by side.
    :return: sum of all off-diagonal hinge costs.
    """
    # NOTE: settings are read from the module-level `model_config`; the former
    # `global model_options` declaration named a variable this function never
    # touches and has been dropped (reading a global needs no declaration).
    margin = model_config['margin']
    output_dim = model_config['output_dim']

    res = theano.tensor.split(predict, [output_dim, output_dim], 2, axis=-1)
    s = res[0]
    im = res[1]

    # add broadcast axes so every sentence is paired with every image
    im2 = im.dimshuffle(('x', 0, 1))
    s2 = s.dimshuffle((0, 'x', 1))
    errors = tensor.pow(im2 - s2, 2).sum(axis=2)
    diagonal = errors.diagonal()

    # compare every diagonal score to scores in its column
    # (all contrastive images for each sentence)
    cost_s = tensor.maximum(0, margin - errors + diagonal)
    # all contrastive sentences for each image
    cost_im = tensor.maximum(0, margin - errors + diagonal.reshape((-1, 1)))

    # clear diagonals: a matching pair is not its own contrastive example
    cost_tot = fill_diagonal(cost_s + cost_im, 0)

    return cost_tot.sum()
def contrastive_loss(labels, predict):
    """
    For a minibatch of sentence and image embeddings, compute the pairwise
    contrastive loss.

    ``predict`` is split along its last axis into two chunks of
    ``model_config['output_dim']`` columns: sentence embeddings first, image
    embeddings second. The error of a (sentence, image) pair is the squared
    positive part of ``image - sentence``, summed over the embedding axis.

    :param labels: unused; kept so the signature stays unchanged
        (presumably a framework-imposed ``(y_true, y_pred)`` signature --
        confirm against the caller).
    :param predict: matrix holding sentence and image embeddings side by side.
    :return: sum of all off-diagonal hinge costs.
    """
    # NOTE: settings are read from the module-level `model_config`; the former
    # `global model_options` declaration named a variable this function never
    # touches and has been dropped (reading a global needs no declaration).
    margin = model_config['margin']
    output_dim = model_config['output_dim']

    res = theano.tensor.split(predict, [output_dim, output_dim], 2, axis=-1)
    s = res[0]
    im = res[1]

    # add broadcast axes so every sentence is paired with every image
    im2 = im.dimshuffle(('x', 0, 1))
    s2 = s.dimshuffle((0, 'x', 1))
    # only coordinates where the image exceeds the sentence are penalised
    errors = tensor.pow(tensor.maximum(0, im2 - s2), 2).sum(axis=2)
    diagonal = errors.diagonal()

    # compare every diagonal score to scores in its column
    # (all contrastive images for each sentence)
    cost_s = tensor.maximum(0, margin - errors + diagonal)
    # all contrastive sentences for each image
    cost_im = tensor.maximum(0, margin - errors + diagonal.reshape((-1, 1)))

    # clear diagonals: a matching pair is not its own contrastive example
    cost_tot = fill_diagonal(cost_s + cost_im, 0)

    return cost_tot.sum()
def contrastive_loss_all(tparams, options, im, sents, lambda_img_sent=0.5,
                         lambda_sent_sent=0.5):
    """
    Compute contrastive loss.

    Contrastive loss is computed between each language-image pair as well as
    between sentences in different languages. Every unordered pair of
    distinct languages (i < j) contributes exactly once.

    :param tparams: parameter mapping; for the ``'general'`` attention type it
        must contain ``'image_sentence_<i>_mapping'`` for every language and
        ``'sentence_<i>_sentence_<j>_mapping'`` for every pair with i < j.
    :param options: mapping providing ``'margin'`` and, optionally,
        ``'attention_type'`` (``'dot'`` by default, or ``'general'``).
    :param im: image embeddings (assumed (batch, dim) -- TODO confirm).
    :param sents: sequence of sentence-embedding matrices, one per language.
    :param lambda_img_sent: weight of each image-sentence sub-cost.
    :param lambda_sent_sent: weight of each sentence-sentence sub-cost.
    :return: the aggregated cost (``0.`` when ``sents`` is empty).
    :raises ValueError: if the attention type is not supported.
    """
    margin = options['margin']
    attention_type = options.get('attention_type', 'dot')

    def _score_matrix(left, right, mapping_name):
        # pairwise scores between the rows of `left` and the rows of `right`
        if attention_type == 'dot':
            return tensor.dot(left, right.T)
        if attention_type == 'general':
            # bilinear score through the mapping matrix stored in tparams
            return left.dot(tparams[mapping_name]).dot(right.T)
        raise ValueError("Attention type not supported: %s" % attention_type)

    def _two_way_hinge(scores):
        # summed hinge cost in both directions, matching pairs excluded
        diagonal = scores.diagonal()
        # compare every diagonal score to scores in its column
        cost_cols = tensor.maximum(0, margin - diagonal + scores)
        # compare every diagonal score to scores in its row
        cost_rows = tensor.maximum(
            0, margin - diagonal.reshape((-1, 1)) + scores)
        # clear diagonals
        cost_cols = fill_diagonal(cost_cols, 0)
        cost_rows = fill_diagonal(cost_rows, 0)
        return cost_cols.sum() + cost_rows.sum()

    n_langs = len(sents)
    final_cost = 0.

    # compute costs for each language-image pair and aggregate final cost
    for i in range(n_langs):
        scores_lang = _score_matrix(
            im, sents[i], 'image_sentence_%i_mapping' % i)
        final_cost += lambda_img_sent * _two_way_hinge(scores_lang)

    # compute costs for each language-language pair (i < j only, so a pair
    # is never counted twice) and aggregate final cost
    for i in range(n_langs):
        for j in range(i + 1, n_langs):
            scores_lang = _score_matrix(
                sents[i], sents[j],
                'sentence_%i_sentence_%i_mapping' % (i, j))
            final_cost += lambda_sent_sent * _two_way_hinge(scores_lang)

    return final_cost
def build_mlp(theano_params, trng, enc_states, activation_mlp='relu',
              dropout=False, dropout_hid=0., use_noise=False, **kwargs):
    """
    Builds an MLP scoring function for use during training.

    A single feed-forward layer (``get_layer('ff')``, prefix ``'mlp'``) maps
    ``enc_states`` to a predicted visual feature vector, which is compared
    with the target matrix ``y`` (test value shaped (2, 4096), i.e. one FC7
    vector per instance) using the loss selected by ``kwargs['loss']``.

    :param theano_params: parameters handed to the feed-forward layer.
    :param trng: random stream used to draw the dropout mask.
    :param enc_states: encoder output fed to the MLP
        (assumed (batch, dim) -- TODO confirm).
    :param activation_mlp: ``'relu'`` or ``'tanh'``.
    :param dropout: enable dropout on ``enc_states``.
    :param dropout_hid: dropout rate; the mask is built with ``1 - dropout_hid``
        (presumably the keep probability -- confirm in ``inv_dropout_mask``).
    :param use_noise: dropout is only applied when this is truthy.
    :param kwargs:
        ``loss`` (required): ``'mse'``, ``'contrastive'`` (the legacy
        spelling ``'constrastive'`` is still accepted) or ``'dot'``;
        ``margin`` (required for the two ranking losses);
        ``verbose`` (optional, default ``False``): log test values.
    :return: tuple ``(y, loss, output)``.
    :raises ValueError: if ``kwargs['loss']`` is not a supported loss.
    """
    verbose = kwargs.get('verbose', False)
    loss_type = kwargs['loss']

    contrastive_names = ('constrastive', 'contrastive')
    if loss_type != 'mse' and loss_type != 'dot' \
            and loss_type not in contrastive_names:
        # previously an unknown loss left `loss` unbound and failed at the end
        raise ValueError("Loss not supported: %s" % loss_type)

    def _ranking_cost_matrix(scores, margin):
        # The hinge works on errors, so the similarity scores are negated;
        # feeding raw similarities would reward mismatched pairs.
        errors = -scores
        diag = errors.diagonal()
        # compare every diagonal score to scores in its column
        # (all contrastive images for each sentence)
        cost_s = tensor.maximum(0, margin - errors + diag)
        # all contrastive sentences for each image
        cost_i = tensor.maximum(0, margin - errors + diag.reshape((-1, 1)))
        # clear diagonals
        return fill_diagonal(cost_s + cost_i, 0)

    # apply dropout on encoder states
    if dropout and dropout_hid > 0. and use_noise:
        logger.warning('Applying dropout mask on bi-states')
        mask = inv_dropout_mask(enc_states.shape, trng, 1 - dropout_hid)
        enc_states *= mask
    else:
        logger.warning('No dropout on Encoder output')

    # set MLP activation function (passed to the layer as a source string)
    assert activation_mlp in ('relu', 'tanh'), \
        'MLP activation function must be tanh or relu'
    activation_mlp = 'lambda x: tensor.nnet.relu(x)' \
        if activation_mlp == 'relu' else 'lambda x: tensor.tanh(x)'
    logger.info('Using MLP activation function: {}'.format(activation_mlp))

    # The input to the MLP will be the mean value of the hidden states for
    # each instance in the minibatch.
    if verbose:
        logger.warning(enc_states.tag.test_value)

    # targets for the MLP -- dim: batch, fc_7 vector
    y = tensor.matrix('y', dtype='float64')
    y.tag.test_value = np.ones((2, 4096))
    if verbose:
        logger.warning(y.tag.test_value)

    # train a single layer MLP to do everything
    output = get_layer('ff')[1](theano_params, enc_states, prefix='mlp',
                                activ=activation_mlp)
    if verbose:
        logger.warning("MLP output {}".format(output.tag.test_value))

    if loss_type == 'mse':
        loss = ((output.flatten() - y.flatten())**2).mean()
    else:
        margin = kwargs['margin']
        if loss_type == 'dot':
            # raw dot-product similarity
            scores = tensor.dot(output, y.T)
        else:
            # cosine similarity: L2-normalise the rows before the dot product
            U_norm = output / output.norm(2, axis=1).reshape(
                (output.shape[0], 1))
            V_norm = y / y.norm(2, axis=1).reshape((y.shape[0], 1))
            scores = tensor.dot(U_norm, V_norm.T)

        cost_tot = _ranking_cost_matrix(scores, margin)
        if verbose:
            logger.warning("Full cost matrix {}".format(
                cost_tot.tag.test_value))
        loss = cost_tot.mean()

    if verbose:
        logger.warning("Batch loss {}".format(loss.tag.test_value))

    return y, loss, output