import jieba
import kashgari
import numpy as np
from kashgari.embeddings import BERTEmbedding
from sklearn.metrics.pairwise import cosine_similarity


def cal_cosine():
    # chinese_bert_file: path to the unzipped Chinese BERT checkpoint directory
    bert = BERTEmbedding(chinese_bert_file,
                         task=kashgari.CLASSIFICATION,
                         sequence_length=10)
    # Segment each sentence into words before embedding
    seg_list1 = jieba.cut("我来到北京清华大学", cut_all=False)
    seg_list2 = jieba.cut("天然矿泉水是指从地下深处自然涌出的或钻井采集的", cut_all=False)
    seg_list1 = list(seg_list1)
    seg_list2 = list(seg_list2)
    # Call for a single embedding per sentence
    embed_tensor1 = bert.embed_one(seg_list1)
    embed_tensor2 = bert.embed_one(seg_list2)
    # embed_tensor1 = bert.embed_one(['今', '天', '天', '气', '不', '错'])
    # embed_tensor2 = bert.embed_one(['我', '住', '在', '南', '京'])
    print(embed_tensor1.shape)
    print(embed_tensor2.shape)
    # Sum the per-token vectors into a single sentence vector each
    embedding1 = np.zeros(shape=(1, 3072))
    embedding2 = np.zeros(shape=(1, 3072))
    for i in range(embed_tensor1.shape[0]):
        # print(embed_tensor1[i][:])
        embedding1 += embed_tensor1[i][:]
        embedding2 += embed_tensor2[i][:]
    print(embedding1)
    print(embedding2)
    cos_value = cosine_similarity(embedding1, embedding2)
    print('cos_value =', str(cos_value[0][0]))
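# For reference, a minimal sketch (an assumption, not part of the original
# code) of what cosine_similarity computes for the two summed sentence
# vectors: cos(a, b) = a·b / (|a| * |b|).
def manual_cosine(a, b):
    # a, b: 1-D numpy arrays, e.g. embedding1[0] and embedding2[0]
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
# manual_cosine(embedding1[0], embedding2[0]) should match cos_value[0][0]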
    return attention1, attention2  # tail of scaled_dot_product_attention (a sketch follows below)


'''Get the word vectors'''
# Load the pretrained BERT weights from
# https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
# Unzip the archive and pass the folder path to BERTEmbedding.
bert = BERTEmbedding(
    '/media/ding/Files/ubuntu_study/Datasets/chinese-bert_chinese_wwm_L-12_H-768_A-12',
    task=kashgari.CLASSIFICATION,
    sequence_length=7)

sents = 'The quick fox jumped over lazy dog.'
sents = sents.replace('.', ' ')
input = sents.split()  # note: shadows the built-in input()
embed_tensor = bert.embed_one(input)  # shape (7, 3072); 7 is the sequence length

# Take q, k, v for each word from the (7, 3072) tensor.
# Each word's q, k, v are initialized to its own embedding;
# in a real network q, k, v are learned vectors.

'''Attention1'''
index1 = input.index('fox')
index2 = input.index('jumped')
q1, k1, v1 = embed_tensor[index1], embed_tensor[index1], embed_tensor[index1]
q2, k2, v2 = embed_tensor[index2], embed_tensor[index2], embed_tensor[index2]
attention1, attention2 = scaled_dot_product_attention(q1, k1, v1, q2, k2, v2)
print('attention of fox with itself:', attention1[0][index1])
print('attention of fox with jumped:', attention2[0][index2])

'''Attention2'''
index1 = input.index('over')
index2 = input.index('dog')
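# For reference, a minimal sketch of scaled_dot_product_attention, whose body
# is not shown above. This is an assumption, not the original implementation:
# it treats the rows of the global embed_tensor as the keys for every position
# (consistent with q, k, v being initialized to the raw embeddings) and
# returns each query's softmax attention weights over the sequence, shape
# (1, sequence_length), matching the attention1[0][index1] indexing used
# above. The k2/v1/v2 arguments are kept only to mirror the call site.
def scaled_dot_product_attention(q1, k1, v1, q2, k2, v2):
    d_k = k1.shape[-1]

    def softmax(x):
        e = np.exp(x - x.max(axis=-1, keepdims=True))  # stabilized softmax
        return e / e.sum(axis=-1, keepdims=True)

    # (1, 3072) @ (3072, 7) -> (1, 7) scaled scores, then softmax over
    # the sequence dimension
    attention1 = softmax(q1[np.newaxis, :] @ embed_tensor.T / np.sqrt(d_k))
    attention2 = softmax(q2[np.newaxis, :] @ embed_tensor.T / np.sqrt(d_k))
    return attention1, attention2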