예제 #1
0
def carga():
    """Store the pairwise similarity of every corpus file in MongoDB.

    Loads every file under ``corpus_root`` and, for each ordered pair of
    files (a file is also paired with itself), inserts one document of the
    form ``{"f1": <file id>, "f2": <file id>, "value": <score>}`` into the
    ``SIMILITUD`` collection of the ``docs`` database. ``<score>`` is
    whatever ``compare_texts`` returns for the two raw texts.

    Returns:
        None. The function is used for its database side effect only.
    """
    client = pymongo.MongoClient(MONGODB_URI)
    try:
        docs = client.docs.SIMILITUD
        newcorpus = PlaintextCorpusReader(corpus_root, '.*')

        # The file list does not change while we iterate, so fetch it once
        # instead of once per outer iteration.
        fileids = newcorpus.fileids()
        for fileid in fileids:
            # Loop-invariant for the inner loop: read the left-hand text a
            # single time rather than once per comparison.
            texto1 = newcorpus.raw(fileid)
            for file2 in fileids:
                docs.insert_one({
                    "f1": fileid,
                    "f2": file2,
                    "value": compare_texts(texto1, newcorpus.raw(file2)),
                })
    finally:
        # Release the connection pool even if a read or insert fails.
        client.close()
예제 #2
0
    def similaridad_cosine(self, pregunta):
        """Rank this document's sentences by similarity to *pregunta*.

        Only sentences whose tokens, concatenated without separators, are
        longer than ``set.SIZE_PARRAFOS`` characters are considered; their
        count is stored in ``self.totalsentencias``. Each remaining sentence
        is joined with spaces, lower-cased and scored with
        ``compare_texts(pregunta, sentence)``.

        NOTE(review): ``set`` is a name from the surrounding file that must
        expose ``SIZE_PARRAFOS`` and ``TOTAL_RESPUESTAS``; it is not the
        builtin ``set`` type.

        Args:
            pregunta: the query text passed as first argument to
                ``compare_texts``.

        Returns:
            A one-element list wrapping the list of the best
            ``set.TOTAL_RESPUESTAS`` matches, highest score first. Each match
            is ``[sentence, score, self.nombre]``. When no sentence
            qualifies the inner list is empty.
        """
        sentencias = [
            s for s in self.sents if len("".join(s)) > set.SIZE_PARRAFOS
        ]
        self.totalsentencias = len(sentencias)

        resultado = [
            [s, compare_texts(pregunta, " ".join(s).lower())]
            for s in sentencias
        ]
        # Stable sort, best score first (ties keep their original order).
        resultado.sort(key=lambda res: res[1], reverse=True)

        registro = [
            [sentencia, valor, self.nombre]
            for sentencia, valor in resultado[:set.TOTAL_RESPUESTAS]
        ]
        # The return used to sit inside the loop that filled ``registro``, so
        # only the single best match was ever returned (and ``None`` when
        # there were no matches). Return the complete top-N list instead.
        return [registro]
예제 #3
0
파일: main.py 프로젝트: wkw307/policykg
    # Finish the node for the current document: strip carriage returns from
    # its status and register it in the node list.
    # NOTE(review): `paper`, `document` and `index` are defined before this
    # fragment, presumably by an enclosing loop over `doc` -- confirm.
    paper["status"] = document["status"].replace("\r", "")
    nodes.append(paper)

    # Compute the cosine similarity between the abstracts of two nodes.
    for index1, document1 in enumerate(doc):
        # One-directional comparison (only entries with a larger index than
        # `index`); the abstract attribute (a string) must not be empty.
        if ((index < index1) and document["abstract"].strip()
                and document1["abstract"].strip()):
            # Apply simple preprocessing to both abstracts being compared
            # (remove punctuation, lowercase, drop a few stop words).
            # Short-text cosine similarity:
            # https://github.com/sergeio/text_comparer
            # NOTE(review): only ',' and '.' are removed, and str.replace
            # works on substrings, so 'the' / 'of' / 'and' are also removed
            # from inside longer words (e.g. "other" becomes "or").
            ct = compare_texts(
                document["abstract"].replace(',', '').replace(
                    '.',
                    '').lower().replace('the',
                                        '').replace('of',
                                                    '').replace('and', ''),
                document1["abstract"].replace(',', '').replace(
                    '.',
                    '').lower().replace('the',
                                        '').replace('of',
                                                    '').replace('and', ''))
            # Only pairs scoring above 0.3 become an edge in the graph.
            if (ct > 0.3):
                #print ct
                edge = {}
                edge["source"] = document["title"]
                edge["target"] = document1["title"]
                edge["type"] = "relation"
                edge["value"] = ct
                links.append(edge)

# Timestamp taken once the processing above is finished.
endTime = time.time()
예제 #4
0
 def test_text_comparison(self):
     """A text sharing the word "chicken" must outscore one sharing none."""
     reference = "Crispy chicken sandwich."
     related = "You are a \n chicken!"
     unrelated = "You are an apple?"

     related_score = compare_texts(reference, related)
     unrelated_score = compare_texts(reference, unrelated)

     self.assertTrue(related_score > unrelated_score)