Exemplo n.º 1
0
# Tokens that are ignored when summing word vectors.
_W2V_PUNCTUATION = frozenset(", . ? ! # $ % ^ & * ( ) { } [ ]".split())


def _sum_word_vectors(tokens, vec):
    """Sum the embeddings of all non-punctuation tokens found in ``vec``.

    Returns the integer ``0`` when no token has an embedding.
    """
    total = 0
    for word in tokens:
        if word in _W2V_PUNCTUATION:
            continue
        try:
            total += vec[word]
        except KeyError:
            # Out-of-vocabulary token: it simply does not contribute.
            continue
    return total


def W2V_Vec(sent_A, sent_B, vec):
    """Score two sentences by comparing their summed word vectors.

    Each sentence is tokenized with ``tokenize``, the embeddings of its
    non-punctuation tokens are looked up in ``vec`` and summed, and the two
    sums are passed to ``cos``.

    Args:
        sent_A: First sentence. If its length is <= 1, ``'none'`` is appended
            before tokenizing.
        sent_B: Second sentence, padded the same way as ``sent_A``.
        vec: Mapping from token to embedding vector; tokens that raise
            ``KeyError`` are skipped.

    Returns:
        The value computed by ``cos`` for the two summed vectors, or ``0.0``
        when ``cos`` fails (e.g. no token of a sentence had an embedding) or
        yields NaN.
    """
    # Pad each sentence independently. The former ``elif`` skipped the
    # padding of sent_B whenever sent_A was the short one.
    if len(sent_A) <= 1:
        sent_A += 'none'
    if len(sent_B) <= 1:
        sent_B += 'none'

    vec1 = _sum_word_vectors(tokenize(sent_A), vec)
    vec2 = _sum_word_vectors(tokenize(sent_B), vec)

    try:
        result = cos(vec1, vec2)
    except Exception:
        # Deliberate best-effort: any failure of the comparison (for example
        # a sum that is still the scalar 0) is reported as a score of 0.0.
        return 0.0

    return 0.0 if np.isnan(result) else result
Exemplo n.º 2
0
    def get_negative_data(self, numpy_data, check_sim, limit):
        """Build negative samples by shuffling a copy of ``numpy_data``.

        Args:
            numpy_data: Sequence of row vectors; it is copied, not modified.
            check_sim: When truthy, every shuffled row is compared with the
                row at the same index in ``numpy_data`` and re-drawn while
                ``1 - cos(negative, original)`` is greater than ``limit``.
            limit: Upper bound for ``1 - cos(negative, original)``.

        Returns:
            The shuffled (and, if requested, re-drawn) copy of the data.

        NOTE(review): assuming ``cos`` is a cosine *distance*, the checked
        quantity is a cosine similarity - confirm against the file's imports.
        The re-draw loop has no retry cap, so it does not terminate if no
        row of the negative set satisfies ``limit`` for some index.
        """
        shuffled = numpy_data.copy()
        np.random.shuffle(shuffled)
        if not check_sim:
            return shuffled

        n_changes = 0
        for idx in range(len(shuffled)):
            # Keep re-drawing this row from the (partly modified) negative
            # set until it is dissimilar enough from its original.
            while 1 - cos(shuffled[idx], numpy_data[idx]) > limit:
                n_changes += 1
                pick = int(random.random() * np.shape(shuffled)[0])
                shuffled[idx] = shuffled[pick]

        print("Negative sample build. #Changes: ", n_changes)
        return shuffled
Exemplo n.º 3
0
def words_score(sentence, words_infos):
    """Classify a new sentence by comparing it with known entries.

    The sentence is converted to a vector with ``AvgWord2vec`` and compared,
    via ``cos``, with the ``"words_vec"`` of each entry in ``words_infos``.

    Args:
        sentence: The new sentence to classify.
        words_infos: Iterable of dict-like entries providing ``"words_vec"``
            and ``"intent"`` keys.

    Returns:
        The ``"intent"`` of the first entry whose score is below the
        module-level ``best_score`` threshold, or ``"匹配失败"`` when no
        entry qualifies (including when ``words_infos`` is empty).
    """
    s2v = AvgWord2vec()
    words_vec = s2v.transfrom_sentence_to_vec(sentence)
    for words_info in words_infos:
        score = cos(words_vec, words_info.get("words_vec"))
        print(score)
        # The smaller the angle (score), the more similar the vectors are.
        if score < best_score:
            return words_info.get("intent")
    # Only give up after every candidate has been examined; previously the
    # failure value was returned right after the first comparison.
    return "匹配失败"
Exemplo n.º 4
0
    def is_word_embed_match(self, mention_x: MentionDataLight, mention_y: MentionDataLight):
        """
        Check whether two mentions match by word-embedding similarity.

        The feature vectors of both mentions are fetched from
        ``self.embedding``; the mentions match when ``1 - cos(x, y)`` is
        greater than 0.65.

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight

        Returns:
            bool: True on a match; False when either vector is None, when
            ``cos`` yields NaN, or when the similarity is 0.65 or lower.
        """
        vec_x = self.embedding.get_feature_vector(mention_x)
        vec_y = self.embedding.get_feature_vector(mention_y)

        # A missing vector means the mention has no usable embedding
        # (e.g. 'unk/None/0'), so there is nothing to compare.
        if vec_x is None or vec_y is None:
            return False

        distance = cos(vec_x, vec_y)
        if math.isnan(distance):
            return False

        return bool(1 - distance > 0.65)
Exemplo n.º 5
0
    # Fragment of a larger loop body (the loop header is outside this
    # excerpt). Encodes the query and the candidate abstract with `model`,
    # extracts span representations and records three per-example distances.
    with torch.no_grad():
        # Attention mask marks every non-zero token id
        # (presumably 0 is the padding id - TODO confirm).
        query_bert_outputs, _ = model(query,
                                      attention_mask=(query > 0).long(),
                                      token_type_ids=None,
                                      output_all_encoded_layers=True)
        # [-1:] keeps only the last encoded layer, so the concatenation is
        # over a single tensor.
        query_bert_outputs = torch.cat(query_bert_outputs[-1:], dim=-1)
        pred = span_extractor(query_bert_outputs, pos).squeeze(0)
        candidate_abstract_output, _ = model(
            candidate_abstract,
            attention_mask=(candidate_abstract > 0).long(),
            token_type_ids=None,
            output_all_encoded_layers=True)
        abstract_bert_outputs = torch.cat(candidate_abstract_output[-1:],
                                          dim=-1)
        label = span_extractor(abstract_bert_outputs, pos_answer).squeeze(0)
    #print(pred.size(),type.size(),torch.max(type))
    # Move both span representations to host memory as NumPy arrays.
    pred = pred.cpu().numpy()
    label = label.cpu().numpy()
    # One entry per element along the first dimension of `query`.
    for i in range(query.size()[0]):
        mse_distance.append(mse(pred[i], label[i]))
        # Sum of the element-wise product (a dot product if the rows are
        # 1-D - TODO confirm shape).
        point_distance.append(sum(pred[i] * label[i]))
        cos_distance.append(cos(pred[i], label[i]))
    #print('loss',loss)

#pred_set = np.concatenate(pred_set, axis=0)
#label_set = np.concatenate(label_set, axis=0)
# Collect the accumulated distance lists into one frame and pickle it.
bert_dist = pd.DataFrame()
bert_dist['bert_cos_distance'] = cos_distance
bert_dist['bert_point_distance'] = point_distance
bert_dist['bert_mse_distance'] = mse_distance
bert_dist.to_pickle('data/bert_dis_test.pkl')
Exemplo n.º 6
0
import json
from pprint import pprint
import numpy as np
import pandas as pd

# Load one JSON document per line. The context manager closes the file
# handle, which a bare open() call would leave to the garbage collector.
with open('gensim.json', 'r') as json_file:
    data = [json.loads(line) for line in json_file]

# For every record, average the first layer's 'values' over all of its
# features (element-wise, axis=0), giving one vector per record.
xx = [
    np.mean(
        [feature['layers'][0]['values'] for feature in record['features']],
        axis=0,
    )
    for record in data
]

from scipy.spatial.distance import cosine as cos

df = pd.read_csv('gensim.csv', encoding="latin-1", header=None)
print(df.shape)

print(len(data))

# Convert the frame to an array once instead of on every iteration.
rows = np.array(df)

# Print each CSV row next to the `cos` value between the record at index 3
# (the fixed reference) and that row's vector.
for i, vector in enumerate(xx):
    print(rows[i], cos(xx[3], vector))