def calcularfrecuencia(self, texto, palabra=None):
    """Return True when *palabra* has a nonzero TF-IDF similarity to *texto*.

    Args:
        texto: word list indexed as the single document "informacion".
        palabra: query word list (defaults to an empty query).

    Returns:
        bool: True iff the similarity score of the first result is > 0.0.
    """
    # A `palabra=[]` default would be created once and shared across all
    # calls (mutable-default pitfall); use None as the sentinel instead.
    if palabra is None:
        palabra = []
    table = TfIdf()
    table.add_document("informacion", texto)
    resultado = table.similarities(palabra)[0][1]
    # Directly return the comparison instead of if/return True/return False.
    return resultado > 0.0
def test_similarity(self):
    """similarities() returns the expected [name, score] pairs for a
    three-document corpus."""
    table = TfIdf()
    corpus = {
        "foo": ["a", "b", "c", "d", "e", "f", "g", "h"],
        "bar": ["a", "b", "c", "i", "j", "k"],
        "baz": ["k", "l", "m", "n"],
    }
    # dicts preserve insertion order, so documents are added foo, bar, baz
    # exactly as before.
    for doc_name, words in corpus.items():
        table.add_document(doc_name, words)
    expected = [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]]
    self.assertEqual(table.similarities(["a", "b", "c"]), expected)
class SearchEngine:
    """TF-IDF backed search over documents parsed with AKParser."""

    def __init__(self):
        self.tfidf = TfIdf()

    def load_documents(self, documents):
        """Index every document: parse to plain text, tokenize, add to table.

        Args:
            documents: iterable of objects with a ``name`` attribute that
                AKParser can parse.
        """
        for doc in documents:
            name = doc.name
            text = self.doc_to_text(doc)
            words = self.text_to_word_array(text)
            self.tfidf.add_document(name, words)

    def query(self, query):
        """Return TF-IDF similarity scores for an already-tokenized query."""
        return self.tfidf.similarities(query)

    def doc_to_text(self, doc):
        """Flatten the parse tree of *doc* into a single whitespace-separated
        string.

        Traversal is depth-first (the deque is used as a stack: append/pop
        both on the right).  Nodes tagged 'link' contribute only the part of
        their first child's value before '|' and their subtree is skipped.
        """
        parser = AKParser()
        tree = parser.parse(doc)
        text = ''
        q = deque()
        q.append(tree)
        # was: `while True: if not q: break` — same loop, idiomatic form
        while q:
            node = q.pop()
            if node.tag:
                if 'link' == node.tag.lower():
                    val = node.children[0].value.split('|')[0]
                    text += f' {val} '
                    continue
            if node.value:
                text += f' {node.value} '
            children = node.children
            if children:
                for c in children:
                    q.append(c)
        # Collapse runs of spaces introduced by the f-string padding above.
        return re.sub(' +', ' ', text)

    def text_to_word_array(self, text):
        """Lower-case *text*, drop everything except letters and whitespace,
        and return the resulting word list."""
        # Raw string avoids the invalid-escape-sequence warning '\s' caused.
        regex = re.compile(r'[^a-zA-Z\s]')
        text = regex.sub('', text)
        text = re.sub(' +', ' ', text)
        return text.lower().split()
def test_similarity(self):
    """Run the full tf → idf → tf-idf pipeline on three sample documents
    and print the similarity scores for ["life", "learning"]."""
    table = TfIdf()
    corpus = [
        ("doc1", ["The", "game", "of", "life", "is", "a", "game", "of",
                  "everlasting", "learning"]),
        ("doc2", ["The", "unexamined", "life", "is", "not", "worth",
                  "living"]),
        ("doc3", ["Never", "stop", "learning"]),
    ]
    for doc_name, words in corpus:
        table.add_document(doc_name, words)
    table.calculate_tf()
    table.calculate_idf()
    table.calculate_tf_idf()
    # Disabled assertion kept from the original author (document names do
    # not match this corpus):
    """self.assertEqual(
        table.similarities(["life","learning"]),
        [["foo", 1.0], ["bar", 0.707106781], ["baz", 0.707106781]])"""
    print(table.similarities(["life", "learning"]))
def calculateTFIDFofNew(self, inputTitle, inputBody):
    """Score each sentence of *inputBody* against *inputTitle* via TF-IDF.

    Args:
        inputTitle: title text, tokenized with self.textToWordsArray.
        inputBody: body text, split into per-sentence word arrays with
            self.textArrayToWordsArray.

    Returns:
        list: one similarity score per sentence, zero-padded to at least
        5 entries; [] when the body has no sentences.
    """
    title = self.textToWordsArray(inputTitle)
    sentences = self.textArrayToWordsArray(inputBody)
    if not sentences:
        return []
    table = TfIdf()
    # One pseudo-document per sentence: "sentences0", "sentences1", ...
    for i, sentence in enumerate(sentences):
        table.add_document("sentences" + str(i), sentence)
    # similarities() yields [name, score] pairs; keep only the scores.
    result = [similarity[1] for similarity in table.similarities(title)]
    # Pad with zeros so callers always receive at least 5 scores
    # (no-op when there are already 5 or more).
    result.extend([0] * (5 - len(result)))
    return result
# NOTE(review): fragment — the header of the enclosing loop over result
# objects (and the `if` matching the trailing `else`) are outside this view;
# the indentation below is reconstructed and should be verified.
        soup = BeautifulSoup(obj["content"], "html.parser",
                             from_encoding="iso-8859-1")
        # Stemmed tokens from the text of every <p> element on the page.
        joinedText = [
            ps.stem(word) for word in ' '.join(
                [p.get_text() for p in soup.find_all("p", text=True)]).split()
        ]
        # NOTE(review): joinedText is a list, so .lower() raises
        # AttributeError here — presumably ' '.join(joinedText) was intended;
        # `line` is also unused in the visible code. Confirm against original.
        line = set(
            re.sub(r'[^a-zA-Z0-9]', ' ', joinedText.lower()).split())
        #response = vectorizer.fit_transform([])
        table.add_document(obj["url"], joinedText)
        # Build the link graph consumed by PageRank below.
        doc = html.fromstring(obj["content"])
        for l in doc.iterlinks():
            G.add_edge(obj["url"], l)
        #print(obj["url"])
    # Combine TF-IDF similarity and PageRank (alpha=0.4) scores per URL.
    tf_idf = table.similarities(query)
    for x in tf_idf:
        rankDict[x[0]] += x[1]
    #G = nx.barabasi_albert_graph(60, 41)
    pr = nx.pagerank(G, 0.4)
    for key, val in pr.items():
        rankDict[key] += val
    # Print URLs best-first by combined score.
    for y in sorted(rankDict.items(), key=lambda x: x[1], reverse=True):
        print(y[0])
else:
    print("No results were found for this query.")
def menu():
    """Interactive console menu (Portuguese UI) driven by stdin lines.

    Options: 1 = run the web scraper, 2 = run TF-IDF over the scraped
    articles for user-supplied words, 3 = quit.

    NOTE(review): formatting reconstructed from a collapsed source line —
    the state machine is driven by `word` (dialogue progress / words read),
    `nword` (promised word count) and `j` (end-of-input phase). Verify the
    nesting against the original file.
    """
    print("Que deseja fazer?")
    print("1 - Consultar a informação do site do jornal ABola")
    print("2 - Aplicar o algoritmo do TFIDF")
    print("3 - Sair")
    # word: TF-IDF dialogue progress; nword: expected number of words;
    # narray: collected query words; j: termination/output phase flag.
    word = 0
    nword = 0
    narray = []
    j = 0
    for line in fileinput.input():
        if line.replace("\n", "") == "1":
            # Option 1: run the scraper, then show the menu again.
            os.system('python3 web_scraper.py')
            print("Que deseja fazer?")
            print("1 - Consultar a informação do site do jornal ABola")
            print("2 - Aplicar o algoritmo do TFIDF")
            print("3 - Sair")
        elif (line.replace("\n", "") == "2") or (word > 0):
            # Option 2, or a TF-IDF dialogue already in progress.
            if word == 0 and j == 0:
                if (os.path.isdir("artigos") == False):
                    # No scraped articles yet — user must pick option 1 first.
                    print(
                        'Necessita de gerar primeiro o conteúdo. Escolha a opção 1'
                    )
                    print("Que deseja fazer?")
                    print("1 - Consultar a informação do site do jornal ABola")
                    print("2 - Aplicar o algoritmo do TFIDF")
                    print("3 - Sair")
                else:
                    # Index every article: the part before the first 'h2'
                    # marker is the title, the part after is the body text.
                    filesA = os.listdir('artigos')
                    table = TfIdf()
                    for i in filesA:
                        with open('artigos/{}'.format(i), 'r') as content:
                            #print(content.read().split('h2'))
                            val = content.read().split('h2')
                            firstVal = val[0]
                            secondVal = val[1]
                            table.add_document(
                                'title{}'.format(i),
                                re.sub(r'[\W]', ' ', firstVal).lower().split())
                            table.add_document(
                                'text{}'.format(i),
                                re.sub(r'[\W]', ' ', secondVal).lower().split())
                    word += 1
                    print('Indique quantas palavras quer comparar:')
            elif (word == 1) and (j == 0):
                # Waiting for the word count (must be > 1).
                if (line.replace("\n", "").isnumeric() and int(line) > 1):
                    nword = int(line)
                    word += 1
                else:
                    print('Digite um número maior que 1')
            elif (word > 1) and (word <= nword) and (j == 0):
                # Collecting query words; a blank line ends the collection.
                if (line.replace("\n", "") != ''):
                    narray.append(line.replace("\n", "").lower())
                    word += 1
                else:
                    j = 1
            if (j == 1):
                # Take a final word (if the line was not blank), then move
                # to the output phase.
                if line.replace("\n", "") != '':
                    narray.append(line.replace("\n", "").lower())
                j += 1
            if (j == 2):
                # Weighted TF-IDF: title matches weighted 0.7, body matches
                # 0.3; results written to an HTML report and opened in the
                # default browser.
                print(narray)
                fTDIDF = open('output' + narray[0] + '.html', 'w+')
                fTDIDF.write(
                    '<h2>Resultados da aplicação do algoritmo:<h2>')
                splitArray = {}
                for s in table.similarities(narray):
                    if s[0].startswith('title'):
                        s[0] = s[0].replace('title', '')
                        if s[0] in splitArray.keys():
                            splitArray[s[0]] += s[1] * 0.7
                        else:
                            splitArray[s[0]] = s[1] * 0.7
                    elif s[0].startswith('text'):
                        s[0] = s[0].replace('text', '')
                        if s[0] in splitArray.keys():
                            splitArray[s[0]] += s[1] * 0.3
                        else:
                            splitArray[s[0]] = s[1] * 0.3
                for elem in splitArray.keys():
                    fTDIDF.write(
                        '<p><h5><a href="artigos/{}" >'.format(elem) + elem +
                        '</a> -> ' + str(splitArray[elem]) + '</h5></p>')
                new = 2  # open in a new tab, if possible
                url = "file:///home/ze/SPLN/WebScraper/output" + narray[
                    0] + ".html"
                webbrowser.open(url, new=new)
                # Reset the dialogue state and show the menu again.
                # NOTE(review): `j` is never reset to 0 here, so a later
                # option "2" re-enters this output branch immediately —
                # confirm whether that is intended.
                word = 0
                nword = 0
                narray = []
                print("Que deseja fazer?")
                print("1 - Consultar a informação do site do jornal ABola")
                print("2 - Aplicar o algoritmo do TFIDF")
                print("3 - Sair")
        elif (line.replace("\n", "") == "3") and (word == 0):
            # Option 3: quit.
            print("Obrigado pela sua visita")
            fileinput.close()
# NOTE(review): fragment — the `def` of the word-preprocessing function and
# its loop header sit above this view; indentation is reconstructed.
        # NOTE(review): both results are discarded — str.lower() and
        # lemmatize() return new values; as written these lines are no-ops.
        i.lower()
        lemmatizer.lemmatize(i)
    # Presumably `list` is a local accumulator (it shadows the builtin) —
    # TODO confirm against the full function.
    return list


if __name__ =="__main__":
    # Read URLs, index them, then rank them against a user query via TF-IDF.
    print("Please enter the URLs saperated by a space" )
    urls=input().strip().split()
    index(urls)
    l1=[]
    l2=tagrem.text_parser(urls,l1)
    print("Please enter the query saperated by space")
    query=input().strip().split()
    print("The entered urls are")
    for i in range(len(urls)):
        # NOTE(review): `i` is an int, so `i + ":"` raises TypeError —
        # str(i) was probably intended.
        print(i +":"+urls[i])
    query=word_preprocessing(query)
    table = TfIdf()
    # One document per URL, keyed by its position in `urls`.
    for i in range(len(urls)):
        table.add_document(str(i),l2[i])
    x=table.similarities(query)
    print("URLs Indexed by cosine rankings are")
    for i in x:
        # NOTE(review): `i` is an element of x (a [name, score] pair), so
        # x[i] will fail — likely i[0] was meant. Verify before relying on
        # this output loop.
        a=int(x[i][0])
        print(i+":"+urls[a])
# Index the "criadorCanvas" document (Portuguese text on the origin of the
# Business Model Canvas) into the shared TF-IDF table. `table` is defined
# elsewhere in this script. NOTE: the `print` statement below means this
# chunk is Python 2 code.
table.add_document("criadorCanvas", [
    "Por", "muitos", "anos", "o", "termo", "modelo", "de", "negócios", "foi",
    "usado", "sem", "um", "consenso", "na", "sua", "definição", ".", "Muitos",
    "autores", "o", "mencionavam", "sem", "explicitar", "do", "que",
    "exatamente", "falavam", "E", "foi", "exatamente", "pensando", "nisso",
    "que", "o", "consultor", "suíço", "Alexander", "Osterwalder", "começou",
    "a", "desenvolver", "sua", "tese", "de", "doutorado", "que", "daria",
    "origem", "ao", "Business", "Model", "Canvas.", "Alexander", "percebeu",
    "que", "definir", "o", "termo", "não", "seria", "suficiente.", "Era",
    "necessário", "criar", "algo", "que", "incentivasse", "a", "inovação,",
    "a", "prototipação", "e", "co-criação", "(criação", "colaborativa).",
    "Utilizando-se,", "assim,", "de", "conceitos", "de", "design", "thinking",
    "Alexander", "começou", "com", "um", "simples", "gráfico", "feito", "em",
    "powerpoint", "que", "anos", "mais", "tarde", "se", "tornaria", "uma",
    "bela", "tela", "(canvas)", "separada", "em", "nove", "blocos."
])
# Rank every indexed document against the question "o que é canvas" and
# print the matches best-first.
ordena = table.similarities(["o", "que", "é", "canvas"])
dicio = dict(ordena)
print sorted(dicio.items(), key=itemgetter(1), reverse=True)
# Commented-out manual selection-sort of the similarity list, superseded by
# sorted(...) above.
#tableOrdenada = []tableCopia = ordena[:]
#while len(tableOrdenada) < len(ordena):
#maior = tableCopia[0]
#for a in tableCopia:
#if a[1] > maior[1]:
#maior = a
#tableOrdenada.append(maior)
#tableCopia.remove(maior)
#print(tableOrdenada)
# Index the "quandoUsar" document (Portuguese text on when to use the
# Business Model Canvas) into the shared TF-IDF table; `table` is defined
# elsewhere in this script.
table.add_document("quandoUsar", [
    "lancar", "o", "produto", "perfeito", "nao", "é", "mais", "suficiente",
    "afinal", "nao", "basta", "ser", "é", "preciso", "parecer", "em",
    "outras", "palavras", "tao", "importante", "quanto", "ter", "produtos",
    "ou", "servicos", "de", "alto", "valor", "agregado", "é", "saber",
    "transmitir", "seu", "modelo", "de", "negocios", "com", "clareza", "e",
    "precisaonesse", "sentido", "o", "canvas", "é", "muito", "utilizado",
    "para", "colocar", "as", "ideias", "de", "negocio", "no", "papel",
    "assim", "é", "possivel", "discutir", "e", "persuadir", "socios",
    "sobre", "estratégias", "de", "acao", "bem", "como", "apresentar", "a",
    "empresa", "aos", "investidores", "ainda", "assim", "é", "na", "area",
    "de", "tecnologia", "da", "informacao", "que", "esse", "modelo", "vem",
    "sendo", "cada", "vez", "mais", "demandado"
])
# Rank the corpus against four canned questions; only the last result is
# live — the print statements (Python 2 syntax) are all commented out, and
# `dicio` is overwritten by each query.
oQueE = table.similarities(["o", "que", "e", "canvas"])
dicio = dict(oQueE)
#print sorted(dicio.items(), key=itemgetter(1),reverse=True)
ondeUsar = table.similarities(["onde", "posso", "usar", "o", "canvas"])
dicio = dict(ondeUsar)
#print sorted(dicio.items(), key=itemgetter(1),reverse=True)
comoAplicar = table.similarities(["como", "posso", "aplicar", "o", "canvas"])
dicio = dict(comoAplicar)
#print sorted(dicio.items(),key=itemgetter(1),reverse=True)
quandoAplicar = table.similarities(
    ["quando", "devo", "aplicar", "o", "canvas"])
dicio = dict(quandoAplicar)
#print sorted(dicio.items(),key=itemgetter(1),reverse=True)