def carga():
    """Store the pairwise text similarity of every corpus file in MongoDB.

    Reads every file found under ``corpus_root`` and, for each ordered pair
    of files (a file is also paired with itself, and both orderings of a
    pair are stored), inserts one document of the form
    ``{"f1": <file id>, "f2": <file id>, "value": <compare_texts score>}``
    into the ``SIMILITUD`` collection of the ``docs`` database.

    Returns:
        None. The only effect is the inserted documents.
    """
    newcorpus = PlaintextCorpusReader(corpus_root, '.*')
    fileids = newcorpus.fileids()

    # Read each file once up front; the pair loop below would otherwise
    # re-read both files from disk for every one of the n*n combinations.
    textos = {fileid: newcorpus.raw(fileid) for fileid in fileids}

    # The context manager closes the client connection when the load is
    # finished, including when an insert raises part-way through.
    with pymongo.MongoClient(MONGODB_URI) as client:
        docs = client.docs.SIMILITUD
        for fileid in fileids:
            for file2 in fileids:
                docs.insert_one({
                    "f1": fileid,
                    "f2": file2,
                    "value": compare_texts(textos[fileid], textos[file2]),
                })
def similaridad_cosine(self, pregunta):
    """Rank the sentences in ``self.sents`` by similarity to ``pregunta``.

    Only sentences whose pieces, concatenated without a separator, are
    longer than ``set.SIZE_PARRAFOS`` characters are considered; how many
    passed that filter is recorded in ``self.totalsentencias``.

    Each kept sentence is joined with spaces, lower-cased and scored
    against ``pregunta`` with ``compare_texts``.

    Returns:
        A one-element list wrapping the list of
        ``[sentence, score, self.nombre]`` entries for the
        ``set.TOTAL_RESPUESTAS`` best-scoring sentences, highest score first.
    """
    # NOTE(review): ``set`` must be a module-level name that exposes
    # SIZE_PARRAFOS / TOTAL_RESPUESTAS (it shadows the builtin) -- confirm
    # against the file's imports.
    candidatas = [
        trozos for trozos in self.sents
        if len("".join(trozos)) > set.SIZE_PARRAFOS
    ]
    self.totalsentencias = len(candidatas)

    puntuadas = [
        [trozos, compare_texts(pregunta, " ".join(trozos).lower())]
        for trozos in candidatas
    ]
    # Stable descending sort: ties keep their original relative order.
    puntuadas.sort(key=lambda par: par[1], reverse=True)

    mejores = [
        [par[0], par[1], self.nombre]
        for par in puntuadas[:set.TOTAL_RESPUESTAS]
    ]
    return [mejores]
# Fragment of a larger routine; the enclosing function and the loop that
# presumably supplies `index` / `document` are not visible here -- confirm.
# It stores the cleaned status on `paper`, appends it to `nodes`, then
# compares `document` with every entry of `doc` that has a larger index and
# appends a "relation" edge to `links` when the compare_texts score of the
# two abstracts exceeds 0.3.
# NOTE(review): the stop-word removal uses substring replace, so 'the', 'of'
# and 'and' are also stripped from inside longer words (e.g. 'other' -> 'or').
paper["status"] = document["status"].replace("\r", "") nodes.append(paper) # compute the cosine similarity between the abstracts of two nodes for index1, document1 in enumerate(doc): # one-way comparison; the abstract attribute (a string) must not be empty if ((index < index1) and document["abstract"].strip() and document1["abstract"].strip()): # simple preprocessing of the two abstracts being compared (strip punctuation, lowercase, drop a few stop words) # cosine similarity for short texts: https://github.com/sergeio/text_comparer ct = compare_texts( document["abstract"].replace(',', '').replace( '.', '').lower().replace('the', '').replace('of', '').replace('and', ''), document1["abstract"].replace(',', '').replace( '.', '').lower().replace('the', '').replace('of', '').replace('and', '')) if (ct > 0.3): #print ct edge = {} edge["source"] = document["title"] edge["target"] = document1["title"] edge["type"] = "relation" edge["value"] = ct links.append(edge) endTime = time.time()
def test_text_comparison(self):
    """Check that a text sharing a word outscores one sharing none.

    ``t1`` and ``t2`` both contain "chicken" (``t2`` with different case
    context, punctuation and an embedded newline), while ``t1`` and ``t3``
    have no word in common.
    """
    t1 = "Crispy chicken sandwich."
    t2 = "You are a \n chicken!"
    t3 = "You are an apple?"
    # assertGreater reports both scores on failure, unlike assertTrue(a > b),
    # which only says "False is not true".
    self.assertGreater(compare_texts(t1, t2), compare_texts(t1, t3))