Example #1 (score: 0)
File: cal_cosine.py — Project: fay128/citi
def cal_cosine():
    """Compute and print the cosine similarity between the BERT sentence
    embeddings of two hard-coded Chinese sentences.

    Each sentence is segmented with jieba (accurate mode), every token is
    embedded with BERT, and the token vectors are summed into a single
    sentence vector before cosine similarity is computed.

    Returns:
        float: the cosine similarity between the two sentence vectors.
    """
    bert = BERTEmbedding(chinese_bert_file,
                         task=kashgari.CLASSIFICATION,
                         sequence_length=10)

    # Segment each sentence into tokens (cut_all=False → accurate mode).
    seg_list1 = list(jieba.cut("我来到北京清华大学", cut_all=False))
    seg_list2 = list(jieba.cut("天然矿泉水是指从地下深处自然涌出的或钻井采集的", cut_all=False))

    # Embed one token sequence each; presumably (seq_len, embed_dim) arrays.
    embed_tensor1 = bert.embed_one(seg_list1)
    embed_tensor2 = bert.embed_one(seg_list2)

    print(embed_tensor1.shape)
    print(embed_tensor2.shape)

    # Sum token vectors into one (1, embed_dim) sentence vector per sentence.
    # The original loop used a hard-coded (1, 3072) accumulator and iterated
    # only over embed_tensor1's length while indexing BOTH tensors, silently
    # assuming equal sequence lengths; summing each tensor independently is
    # vectorized and correct for any sequence length / embedding size.
    embedding1 = embed_tensor1.sum(axis=0, keepdims=True)
    embedding2 = embed_tensor2.sum(axis=0, keepdims=True)

    print(embedding1)
    print(embedding2)

    cos_value = cosine_similarity(embedding1, embedding2)
    print('cos_value =', str(cos_value[0][0]))
    # Return the scalar so callers can use the result instead of parsing stdout
    # (backward-compatible: previous callers ignored the implicit None).
    return cos_value[0][0]
Example #2 (score: 0)
    return attention1, attention2


'''获取词向量'''
'''加载bert预训权重,https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
   需要解压文件,将文件夹路径放入BERTEmbedding中'''
# Get word vectors: load pretrained Chinese BERT weights (download from the
# URL above, unzip, and point BERTEmbedding at the extracted folder).
bert = BERTEmbedding(
    '/media/ding/Files/ubuntu_study/Datasets/chinese-bert_chinese_wwm_L-12_H-768_A-12',
    task=kashgari.CLASSIFICATION,
    sequence_length=7)

# Strip the final period so splitting yields exactly 7 words.
sents = 'The quick fox jumped over lazy dog.'
sents = sents.replace('.', ' ')
# NOTE(review): `input` shadows the builtin; kept as-is because code below
# (and presumably past this chunk) references this name.
input = sents.split()

embed_tensor = bert.embed_one(input)  # shape (7, 3072); 7 is the sequence length

# Build (7, 3072) q, k, v.
# Initialise each word's q, k, v to its own embedding; in a real network
# q, k, v are learned projections.
'''Attention1'''
index1 = input.index('fox')
index2 = input.index('jumped')
q1, k1, v1 = embed_tensor[index1], embed_tensor[index1], embed_tensor[index1]
q2, k2, v2 = embed_tensor[index2], embed_tensor[index2], embed_tensor[index2]

# scaled_dot_product_attention is defined elsewhere in this file; assumed to
# return one attention row per query — TODO confirm against its definition.
attention1, attention2 = scaled_dot_product_attention(q1, k1, v1, q2, k2, v2)
print('fox 与自己的attention', attention1[0][index1])
print('fox 与jumped的attention', attention2[0][index2])
'''Attention2'''
# Second demo pair ('over' vs 'dog'); the computation using these indices
# continues past this chunk.
index1 = input.index('over')
index2 = input.index('dog')