Exemplo n.º 1
0
 def do_prepare(self, params, prepare):
     """Select the similarity function, then run the caller's prepare hook.

     :param params: mapping of evaluation parameters; if it holds a
         ``'similarity'`` entry, that callable is stored as the scorer
     :param prepare: callable invoked as ``prepare(params, self.samples)``
     :return: whatever ``prepare`` returns
     """
     # The key has to be tested as a string and the callable read back from
     # ``params``; a bare ``similarity`` name is not defined in this scope.
     if 'similarity' in params:
         self.similarity = params['similarity']
     else:  # Default similarity is cosine
         self.similarity = lambda s1, s2: cosine(np.nan_to_num(s1),
                                                 np.nan_to_num(s2))
     return prepare(params, self.samples)
Exemplo n.º 2
0
    def run(self, batcher, params):
        """Score every dataset and correlate the scores with the gold labels.

        For each dataset the two input lists are encoded batch by batch with
        ``batcher(batch, params)``, each encoded pair is scored with
        ``cosine`` (NaNs replaced through ``np.nan_to_num``), and the scores
        are correlated with the gold scores via ``pearsonr`` / ``spearmanr``.

        Returns a dict keyed by dataset name, plus an ``'all'`` entry holding
        the unweighted mean of the per-dataset correlation coefficients.
        """
        results = {}
        for dataset in self.datasets:
            left_inputs, right_inputs, gs_scores = self.data[dataset]
            sys_scores = []
            for start in range(0, len(gs_scores), params.batch_size):
                stop = start + params.batch_size
                left = left_inputs[start:stop]
                right = right_inputs[start:stop]

                # we assume that the get_batch function already throws out the faulty ones
                if len(left) == 0 or len(left) != len(right):
                    continue

                enc1 = batcher(left, params)
                enc2 = batcher(right, params)
                sys_scores.extend(
                    cosine(np.nan_to_num(enc1[row]), np.nan_to_num(enc2[row]))
                    for row in range(enc2.shape[0]))

            pearson = pearsonr(sys_scores, gs_scores)
            spearman = spearmanr(sys_scores, gs_scores)
            results[dataset] = {'pearson': pearson, 'spearman': spearman}
            logging.debug('{0} : pearson = {1}, spearman = {2}'.format(dataset, pearson, spearman))

        # Only per-dataset entries exist at this point; 'all' is added below.
        per_dataset = list(results.values())
        avg_pearson = np.mean([entry['pearson'][0] for entry in per_dataset])
        avg_spearman = np.mean([entry['spearman'][0] for entry in per_dataset])
        results['all'] = {'pearson': avg_pearson, 'spearman': avg_spearman}
        logging.debug('Results (all) : Pearson = {0}, Spearman = {1}\n'.format(avg_pearson, avg_spearman))

        return results
Exemplo n.º 3
0
 def scoring_fn(i, j):
     """
     Score two triples by the cosine of their entries in ``marg``
     :param i: triple index
     :param j: triple index
     :return: result of ``cosine(marg[i], marg[j])``
     """
     first_entity = marg[i]
     second_entity = marg[j]
     return cosine(first_entity, second_entity)
Exemplo n.º 4
0
def getdistinctivefeatures(lang1, lang2, phonemeMap):
    """
    Average the pairwise phoneme similarity between two languages.

    Contrast this with getF1.

    NOTE: the original author flagged this function as not working
    correctly; the averaging logic itself is left unchanged here.

    Every phoneme of ``lang1`` is paired with every phoneme of ``lang2``.
    A pair whose symbols are both keys of ``phonemeMap`` scores
    ``1 - utils.cosine(...)`` of the mapped values; that score is memoised
    in the module-level ``phonedist`` dict under the sorted symbol pair.
    Any other pair scores 0.

    :param lang1: a set of Phonemes
    :param lang2: a set of Phonemes
    :param phonemeMap: mapping from a phoneme symbol (``p.Phoneme``) to the
        value handed to ``utils.cosine``
    :return: the mean score over all ``len(lang1) * len(lang2)`` pairs, or
        -1 if either language is empty.
    """

    # Parenthesised single-argument print works on both Python 2 and 3.
    if len(lang1) == 0:
        print("ERROR: first lang is empty or doesn't exist")
        return -1
    if len(lang2) == 0:
        print("ERROR: second lang is empty or doesn't exist")
        return -1

    total = 0

    # loop over all pairs.
    for p in lang1:
        pu1 = p.Phoneme
        for p2 in lang2:
            pu2 = p2.Phoneme
            if pu1 in phonemeMap and pu2 in phonemeMap:
                # Sorted key so (a, b) and (b, a) share one cache entry.
                ps = tuple(sorted([pu1, pu2]))
                if ps in phonedist:
                    sim = phonedist[ps]
                else:
                    sim = 1 - utils.cosine(phonemeMap[pu1], phonemeMap[pu2])
                    phonedist[ps] = sim
            else:
                # At least one phoneme is missing from the map.
                sim = 0
            total += sim

    total /= float(len(lang1) * len(lang2))

    return total
Exemplo n.º 5
0
 def score_fn(term, description, verb_weight=1, head_weight=1):
     "Score a term against the weighted sum of a triple's vectors using cosine"
     which, triple = description
     # Position 0 always carries the verb weight; 'which' decides which of the
     # other two positions carries the head weight.
     if which == 'SBJ':
         second_weight, third_weight = head_weight, 1
     elif which == 'OBJ':
         second_weight, third_weight = 1, head_weight
     else:
         raise ValueError(which)
     weights = [verb_weight, second_weight, third_weight]
     scaled = [vec[token] * wei for token, wei in zip(triple, weights)]
     combined = sum(scaled)
     target = vec[term]
     return cosine(target, combined)
Exemplo n.º 6
0
def compare(lang1, lang2):
    """
    Given two language names, get distance according to phonology scores.

    The names are resolved with ``comparefeats``; when both resolve to truthy
    objects, the result of ``utils.cosine`` over their ``phon_feats()`` is
    returned.

    :param lang1: name of first lang (eg English)
    :param lang2: name of second lang
    :return: the distance of the languages, or -1 if one or both langs not found.
    """
    l1, l2 = comparefeats(lang1, lang2)

    if l1 and l2:
        return utils.cosine(l1.phon_feats(), l2.phon_feats())

    # Parenthesised single-argument print works on both Python 2 and 3.
    print("One or both langs not found: {0}, {1}".format(l1, l2))
    return -1
Exemplo n.º 7
0
 def scoring_fn(i, j):
     "Compare verb i with verb j, each summed with triple j's arguments, by cosine"
     # Get tokens: only the verb of triple i is used; triple j supplies the rest
     verb_i = raw_triples[i][0]
     verb_j, subj, obj = raw_triples[j]
     if with_lookup:
         verb_i, verb_j = (pred_name[lookup['v'][tok]] for tok in (verb_i, verb_j))
         subj, obj = (pred_name[lookup['n'][tok]] for tok in (subj, obj))
     # Lookup vectors
     tokens = (verb_i, verb_j, subj, obj)
     verb_i_vec, verb_j_vec, subj_vec, obj_vec = (
         vec[model.vocab[tok].index] for tok in tokens)
     composed_i = verb_i_vec + subj_vec + obj_vec
     composed_j = verb_j_vec + subj_vec + obj_vec
     return cosine(composed_i, composed_j)
Exemplo n.º 8
0
def calculate_distance_offset(pairs, df, offset, offset2=None):
    # CONVERTED TO calculate_distance_mode
    """
    From a set of pairs, calculate the distance of the offset of the 
    pair in relation to the global offset.

    Parameters:
    -----------
    pairs: list
        list containing tuples of IDs of norms [(id1,id2),(id3,id4)...]
    df: pandas.dataframe
        dataframe containing ids and embeddings of sentences
    offset: np.array
        vector containing the global offset (offset of all conflicts)
    offset2: np.array, optional
        second reference offset; when given and non-empty, distances to it
        are appended to every row

    Returns:
    --------
    list
        one entry per pair: ``(cos, euc)`` when no second offset is used,
        otherwise ``[cos, euc, cos2, euc2]``
    """
    # offset2 defaults to None, so it must be ruled out before calling len().
    use_second = offset2 is not None and len(offset2) > 0

    vdist = []
    pb = progressbar.ProgressBar(len(pairs))
    for arr in pairs:
        emb1 = df.id2embed(arr[0])
        emb2 = df.id2embed(arr[1])
        local_offset = emb1 - emb2

        # cosine (similar:0->2:not_similar)
        cos = utils.cosine(local_offset, offset)
        # euclidean distance (similar:0->inf:not_similar)
        euc = utils.euclidean(local_offset, offset)
        if use_second:
            cos2 = utils.cosine(local_offset, offset2)
            euc2 = utils.euclidean(local_offset, offset2)
            vdist.append([cos, euc, cos2, euc2])
        else:
            vdist.append((cos, euc))
        pb.update()
    return vdist
Exemplo n.º 9
0
def compare(lang1, lang2):
    """
    Given two language names, get distance according to phonology scores.

    Both names are looked up through ``comparefeats``; if both lookups yield
    truthy objects, ``utils.cosine`` of their ``phon_feats()`` is returned.

    :param lang1: name of first lang (eg English)
    :param lang2: name of second lang
    :return: the distance of the languages, or -1 if one or both langs not found.
    """
    l1, l2 = comparefeats(lang1, lang2)

    if not (l1 and l2):
        # Single-argument print call: valid on Python 2 and Python 3 alike.
        print("One or both langs not found: {0}, {1}".format(l1, l2))
        return -1

    return utils.cosine(l1.phon_feats(), l2.phon_feats())
Exemplo n.º 10
0
    def run(self, batcher, params):
        """Score every dataset and summarise the correlation with gold labels.

        Each dataset's two input lists are encoded batch by batch with
        ``batcher(batch, params)``; every encoded pair is scored with
        ``cosine`` after ``np.nan_to_num``.  Per dataset the result holds the
        ``pearsonr`` / ``spearmanr`` outputs and the number of scored pairs.

        The ``'all'`` entry holds, for both correlations, the plain mean
        (``'mean'``) and the mean weighted by sample count (``'wmean'``).
        """
        results = {}
        for dataset in self.datasets:
            sents_a, sents_b, gs_scores = self.data[dataset]
            sys_scores = []
            for offset in range(0, len(gs_scores), params.batch_size):
                upper = offset + params.batch_size
                chunk_a = sents_a[offset:upper]
                chunk_b = sents_b[offset:upper]

                # we assume that the get_batch function already throws out the faulty ones
                if len(chunk_a) == 0 or len(chunk_a) != len(chunk_b):
                    continue

                enc1 = batcher(chunk_a, params)
                enc2 = batcher(chunk_b, params)
                sys_scores.extend(
                    cosine(np.nan_to_num(enc1[row]), np.nan_to_num(enc2[row]))
                    for row in range(enc2.shape[0]))

            pearson = pearsonr(sys_scores, gs_scores)
            spearman = spearmanr(sys_scores, gs_scores)
            results[dataset] = {
                'pearson': pearson,
                'spearman': spearman,
                'nsamples': len(sys_scores),
            }
            logging.debug('%s : pearson = %.4f, spearman = %.4f' %
                          (dataset, pearson[0], spearman[0]))

        # Only per-dataset entries exist at this point; 'all' is added below.
        per_dataset = list(results.values())
        weights = [entry['nsamples'] for entry in per_dataset]
        list_prs = np.array([entry['pearson'][0] for entry in per_dataset])
        list_spr = np.array([entry['spearman'][0] for entry in per_dataset])

        avg_pearson = np.average(list_prs)
        avg_spearman = np.average(list_spr)
        wavg_pearson = np.average(list_prs, weights=weights)
        wavg_spearman = np.average(list_spr, weights=weights)

        results['all'] = {
            'pearson': {'mean': avg_pearson, 'wmean': wavg_pearson},
            'spearman': {'mean': avg_spearman, 'wmean': wavg_spearman},
        }
        logging.debug(
            'ALL (weighted average) : Pearson = %.4f, Spearman = %.4f' %
            (wavg_pearson, wavg_spearman))
        logging.debug('ALL (average) : Pearson = %.4f, Spearman = %.4f\n' %
                      (avg_pearson, avg_spearman))

        return results
Exemplo n.º 11
0
 def neighbours(self, word, size = 10):
     """
     Yield (word, score) pairs for the KDTree neighbours of a word, best score first.

     The stripped query word itself scores 1.0; any other neighbour scores
     ``utils.cosine`` against the query vector, with negative values replaced
     by their absolute value and the result capped at 1.0.
     """
     word = word.strip()
     v = self.word_vec(word)
     dist_rows, index_rows = self.kdt.query(array([v]), k = size, return_distance = True)
     # A single query vector goes in, so exactly one row is expected back.
     [distances] = dist_rows
     [points] = index_rows
     assert len(distances) == len(points), "distances and points should be in same shape."
     ranked = []
     scores = {}
     for idx, _ in zip(points, distances):
         candidate = self.index2word[idx]
         if candidate == word:
             score = 1.0
         else:
             score = utils.cosine(v, self.syn0[idx])
         if score < 0:
             score = abs(score)
         ranked.append(candidate)
         scores[candidate] = min(score, 1.0)
     ranked.sort(key=scores.get, reverse=True)
     for candidate in ranked:
         yield candidate, scores[candidate]
Exemplo n.º 12
0
    def calc_word_similarity(self, test_file, embed_vec):
        """Evaluate embeddings on a word-similarity file and print a report.

        Every line of the file is split into two words and a score.  Pairs
        with both words present in ``embed_vec`` are scored with ``cosine``;
        the report prints the file name, the found / not-found line counts
        and ``rho`` of the gold scores against the predictions.

        Arguments:
            test_file {str} -- similarity test file e.g wordsim-240 and wordsim-297
            embed_vec {Keyedvectors} -- A pre-load gensim word vectors
        """
        with open(test_file, 'r', encoding='utf8') as fr:
            lines = fr.readlines()

        pred, label = [], []
        for line in lines:
            w1, w2, score = line.split()
            if w1 not in embed_vec or w2 not in embed_vec:
                continue
            pred.append(cosine(embed_vec[w1], embed_vec[w2]))
            label.append(float(score))
        # pred and label grow together, so either length is the hit count.
        found = len(label)
        missing = len(lines) - found

        file_name = test_file.rsplit("/", 1)[-1].replace('.txt', '')
        print(f"Test File: {file_name}")
        print(f"Numbers of words Found: {found}")
        print(f"Numbers of words Not Found: {missing}")
        coefficient = rho(label, pred)
        print(f"Spearman's Rank Coeficient: {coefficient}")
Exemplo n.º 13
0
        # construction of the query vector    
        parsed_dir = "C:\\Users\\nbonardo\\movies_parsed"
        n_files = len([name for name in os.listdir(parsed_dir) if os.path.isfile(os.path.join(parsed_dir, name))])
        queryVector = dict()
        nWordsQ = len(term_ids)
        for term_id in term_ids:                        
            term_idf = 1 + math.log(n_files / len(idx[str(term_id)]))        
            queryVector[term_id] = term_idf / nWordsQ
        #print("Query vector", queryVector)

        # calculation of the cosine similarity between each document vector of the result and the query vector
        # storing the result (as tuple) in a heap structure
        h = []
        for doc in final:
            cos = utils.cosine(queryVector, docVectors[doc])
            #print("Cosine similarity of doc", doc, "=", cos, utils.getMovieTitle(doc))
            heapq.heappush(h, (cos, doc))

        # output of the top-K movies according to cosine similarity
        K = 5
        K = min(K, len(final))
        print("*** TOP-", K, " results ***")
        print(" Id  |        Title                             | Cosine Similarity")
        print("-----+------------------------------------------+------------------")
        for _ in range(K):
            movieTup = heapq.heappop(h)
            #print(movieTup[1], utils.getMovieTitle(movieTup[1]), movieTup[0])
            print("%4s | %40s | %.6s" % (movieTup[1]+1, utils.getMovieTitle(movieTup[1]), movieTup[0]))
        
    elif search_engine == '3':
Exemplo n.º 14
0
 def relevance(self, title, content, sentences):
     """Compute each sentence's relevance to the title and to the body text.

     Each score blends ``cosine(sentence, content)`` weighted by
     ``self.weight`` with ``cosine(sentence, title)`` weighted by
     ``1 - self.weight``; scores are returned in sentence order.
     """
     return [
         self.weight * cosine(s, content) + (1 - self.weight) * cosine(s, title)
         for s in sentences
     ]
    # F6: Proper Noun
    for Si in sents:
        Si_propnouns = np.intersect1d(Si, propernoun)
        F6.append(len(Si_propnouns) / len(Si))

    # F7: Similarities Between Sentences
    vocab = sorted(set(flat))

    TF = get_TF(sents, vocab)

    sim_SiSj = []
    for i, Si in enumerate(TF):
        temp = []
        for j, Sj in enumerate(TF):
            if i == j: continue
            temp.append(cosine(Si, Sj))
        sim_SiSj.append(sum(temp))
    max_simSiSj = max(sim_SiSj)

    for sim_Si in sim_SiSj:
        F7.append(sim_Si / max_simSiSj)

    # F8: Term Weight
    TFIDF = get_TFIDF(sents, vocab)

    sum_TFIDF = []
    for tfidf in TFIDF:
        sum_TFIDF.append(sum(tfidf))
    max_sum_TFIDF = max(sum_TFIDF)

    for sum_tfidf in sum_TFIDF:
Exemplo n.º 16
0
    ids = torch.tensor(query_tokens['input_ids']).unsqueeze(0)
    mask = torch.tensor(query_tokens['attention_mask']).unsqueeze(0)

    pred = classifier(ids, mask)
    top_val, top_idx = torch.topk(pred[0], 3, dim=1)
    pred_categories = model_classes[top_idx].tolist()[0]
    topics = [cat_map[cat] for cat in pred_categories]

    ### Encode query to embedding and return top item

    # encode user query
    query_embedding = sentence_transformer.encode(query)

    # filter df to relevant categories and grab embeddings
    relevant_docs = df[df['categories'].isin(pred_categories)]
    relevant_embeddings = embeddings[relevant_docs.index]

    # Calculate cosine similarity of user query and relevant embeddings
    sims = []
    for doc in relevant_embeddings:
        doc_sim = cosine(query_embedding, doc)
        sims.append(doc_sim)

    top_matches = np.argsort(sims)[::-1]
    top_item = relevant_docs.iloc[top_matches[0]]

    print(f'User Query: {query}\n')
    print(f'Predicted Topics {topics}\n')
    print(f'Recomended Paper:\n {top_item.title}\n')
    print(f'Abstract:\n{top_item.abstract}\n')
Exemplo n.º 17
0
  def compute_loss(self, input_image, pca_render, gcn_render, pca_texture, gcn_texture, proj_color,
                   pca_color, gcn_color, input_feat, gcn_feat, regularization, get_inter=False):
    """Adds to the inference model the layers required to generate loss.

    Builds the individual loss terms under the ``loss`` name scope, combines
    them into a single loss and registers a ``tf.summary.scalar`` for each.

    Args:
      input_image: tensor whose first three channels (``[..., :3]``) are the
        target of both render losses and whose remaining channels
        (``[..., 3:]``) are eroded into the skin mask.
      pca_render: render whose first three channels feed ``pca_render_loss``.
      gcn_render: render whose first three channels feed ``gcn_render_loss``;
        its remaining channels are rounded to build the render mask.
      pca_texture: target of ``refine_loss``.
      gcn_texture: prediction of ``refine_loss``; also feeds the variance and
        symmetry terms.
      proj_color: first three channels are the target of the projection term;
        the remaining channels weight the projected vs. PCA colour terms.
      pca_color: target for the term weighted by ``1 - proj_color[..., 3:]``.
      gcn_color: prediction used by both projection terms.
      input_feat: first argument of ``utils.cosine`` in the perception term.
      gcn_feat: second argument of ``utils.cosine`` in the perception term.
      regularization: factor multiplied by ``tf.add_n(self.regularizers)``.
      get_inter: not read by the active code (only referenced in a
        commented-out section below).

    Returns:
      Tuple ``(loss, pca_render_loss, gcn_render_loss, project_loss,
      refine_loss, perception_loss, var_loss, sym_loss)``.
    """
    with tf.name_scope('loss'):
      with tf.name_scope('data_loss'):
        # Mask = eroded extra channels of the input, restricted to where the
        # rounded extra channels of the GCN render are non-zero.
        # NOTE(review): channels 3+ are presumably alpha/skin masks -- confirm.
        skin_mask = self._erosion2d(input_image[..., 3:])
        gcn_render_mask = tf.round(gcn_render[..., 3:]) * skin_mask

        # Masked absolute-difference sums, normalised by the mask sum.
        # NOTE(review): nothing guards against tf.reduce_sum(gcn_render_mask)
        # being zero in the two divisions below.
        # pca_render_loss = tf.losses.mean_squared_error(
        pca_render_loss = tf.losses.absolute_difference(
            predictions=pca_render[..., :3] * gcn_render_mask, labels=input_image[..., :3] *
            gcn_render_mask, reduction=tf.losses.Reduction.SUM) / tf.reduce_sum(gcn_render_mask)

        # gcn_render_loss = tf.losses.mean_squared_error(
        gcn_render_loss = tf.losses.absolute_difference(
            predictions=gcn_render[..., :3] * gcn_render_mask, labels=input_image[..., :3] *
            gcn_render_mask, reduction=tf.losses.Reduction.SUM) / tf.reduce_sum(gcn_render_mask)

        # GCN colours vs. projected colours, weighted by proj_color[..., 3:].
        # project_loss_image = tf.losses.mean_squared_error(
        project_loss_image = tf.losses.absolute_difference(
            predictions=gcn_color * proj_color[..., 3:],
            labels=proj_color[..., :3] * proj_color[..., 3:], reduction=tf.losses.Reduction.MEAN)

        # GCN colours vs. PCA colours, weighted by the complementary weight.
        # project_loss_pca = tf.losses.mean_squared_error(
        project_loss_pca = tf.losses.absolute_difference(
            predictions=gcn_color * (1 - proj_color[..., 3:]),
            labels=pca_color * (1 - proj_color[..., 3:]), reduction=tf.losses.Reduction.MEAN)

        project_loss = project_loss_image + 0.3 * project_loss_pca

        # refine_loss = tf.losses.mean_squared_error(
        refine_loss = tf.losses.absolute_difference(predictions=gcn_texture, labels=pca_texture,
                                                    reduction=tf.losses.Reduction.MEAN)

        # One minus the mean of utils.cosine over the two feature tensors.
        perception_loss = 1 - tf.reduce_mean(utils.cosine(input_feat, gcn_feat))

        # Mean of the per-channel variance (moments over axis 1) of the
        # texture entries gathered at self.bfm.skin_index.
        var_losses = []
        gcn_skin_texture = tf.gather(gcn_texture, self.bfm.skin_index, axis=1)
        for i in range(3):
          _, variance = tf.nn.moments(gcn_skin_texture[..., i], axes=1)
          var_losses.append(variance)
        var_loss = tf.reduce_mean(var_losses)

        # Mean norm of the difference between the texture entries gathered at
        # left_index and right_index. 1e-16 is added to every squared term
        # before the sum (presumably to keep the sqrt away from zero).
        sym_diff = tf.gather(gcn_texture, self.bfm.left_index, axis=1) - tf.gather(
            gcn_texture, self.bfm.right_index, axis=1)
        sym_loss = tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(sym_diff) + 1e-16, axis=-1)))

        # Disabled experiment: adjacency-based smoothness loss.
        # adj_tensor = tf.constant(self.adjacent.reshape(
        #     [1, self.num_vert, self.num_vert, 1]),
        #                          dtype=tf.int32,
        #                          shape=[1, self.num_vert, self.num_vert, 1])
        # coo = self.adjacent.tocoo()

        # indices = np.mat([0, self.adjacent.row, self.adjacent.col, 0]).transpose()
        # values = np.ones_like(self.adjacent.data, np.float32)
        # adj_tensor = tf.SparseTensor(indices, values, self.adjacent.shape)
        # # adj_tensor = tf.SparseTensor(self.adjacent.indices,
        # #                             np.clip(self.adjacent.data, 0, 1),
        # #                             self.adjacent.shape)
        # expand = tf.ones([1, self.num_vert, self.num_vert, 3], dtype=tf.float32)
        # expand = expand * tf.expand_dims(gcn_texture, axis=1)
        # exp_trans = tf.transpose(expand, [0, 2, 1, 3])
        # # vertical = tf.ones([self.num_vert, self.num_vert, 3], dtype=tf.float32)
        # # vertical = vertical * tf.expand_dims(gcn_texture, axis=2)
        # smooth_loss = tf.abs((expand - exp_trans) * adj_tensor)
        # test = tf.sparse_to_dense(smooth_loss.indices, )

        #TODO: need attention
        # data_loss = self.ph_ref_lambda * refine_loss + self.ph_ren_lambda * (
        #     gcn_render_loss + 0.2 * project_loss +
        #     0.2 * perception_loss) + 0.1 * sym_loss
        # Active combination. Neither pca_render_loss nor gcn_render_loss is
        # part of data_loss; they are only summarised and returned.
        data_loss = self.ph_ref_lambda * refine_loss + self.ph_ren_lambda * (
            project_loss + 0.2 * perception_loss + 0.5 * sym_loss + 0.01 * var_loss)

        # if not get_inter:
        #   self.skin_mask = skin_mask
        #   self.gcn_render_mask = gcn_render_mask
        #   self.gcn_render_image = gcn_render[..., :3]
        #   self.input_image_rgb = input_image[..., :3]
        #   self.pca_render_image = pca_render[..., :3]

      with tf.name_scope('regularization'):
        # Scale the incoming factor by the sum of the model's regularizers.
        regularization *= tf.add_n(self.regularizers)
      loss = data_loss + regularization

      tf.summary.scalar('loss/data_loss', data_loss)
      tf.summary.scalar('loss/pca_render_loss', pca_render_loss)
      tf.summary.scalar('loss/gcn_render_loss', gcn_render_loss)
      tf.summary.scalar('loss/project_loss', project_loss)
      tf.summary.scalar('loss/refine_loss', refine_loss)
      tf.summary.scalar('loss/perception_loss', perception_loss)
      tf.summary.scalar('loss/var_loss', var_loss)
      tf.summary.scalar('loss/sym_loss', sym_loss)
      tf.summary.scalar('loss/regularization', regularization)

      logger.info('Successfully Computed Losses')

      return loss, pca_render_loss, gcn_render_loss, project_loss, refine_loss, perception_loss, var_loss, sym_loss
Exemplo n.º 18
0
 def sim(a, b):
     "Return the cosine of the vectors stored in vec under keys a and b"
     first_vector = vec[a]
     second_vector = vec[b]
     return cosine(first_vector, second_vector)