def calcularfrecuencia(self, texto, palabra=None):
    """Return True when *palabra* has a nonzero TF-IDF similarity to *texto*.

    Args:
        texto: word list indexed as the single document "informacion".
        palabra: query word list (defaults to an empty query).

    Returns:
        bool: True iff the similarity score of the first result is > 0.0.
    """
    # A `palabra=[]` default would be created once and shared across all
    # calls (mutable-default pitfall); use None as the sentinel instead.
    if palabra is None:
        palabra = []
    table = TfIdf()
    table.add_document("informacion", texto)
    resultado = table.similarities(palabra)[0][1]
    # Directly return the comparison instead of if/return True/return False.
    return resultado > 0.0
def test_similarity(self):
    """similarities() returns the expected [name, score] pairs for a
    three-document corpus."""
    table = TfIdf()
    corpus = {
        "foo": ["a", "b", "c", "d", "e", "f", "g", "h"],
        "bar": ["a", "b", "c", "i", "j", "k"],
        "baz": ["k", "l", "m", "n"],
    }
    # dicts preserve insertion order, so documents are added foo, bar, baz
    # exactly as before.
    for doc_name, words in corpus.items():
        table.add_document(doc_name, words)
    expected = [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]]
    self.assertEqual(table.similarities(["a", "b", "c"]), expected)
class SearchEngine:
    """TF-IDF backed search over documents parsed with AKParser."""

    def __init__(self):
        self.tfidf = TfIdf()

    def load_documents(self, documents):
        """Index every document: parse to plain text, tokenize, add to table.

        Args:
            documents: iterable of objects with a ``name`` attribute that
                AKParser can parse.
        """
        for doc in documents:
            name = doc.name
            text = self.doc_to_text(doc)
            words = self.text_to_word_array(text)
            self.tfidf.add_document(name, words)

    def query(self, query):
        """Return TF-IDF similarity scores for an already-tokenized query."""
        return self.tfidf.similarities(query)

    def doc_to_text(self, doc):
        """Flatten the parse tree of *doc* into a single whitespace-separated
        string.

        Traversal is depth-first (the deque is used as a stack: append/pop
        both on the right).  Nodes tagged 'link' contribute only the part of
        their first child's value before '|' and their subtree is skipped.
        """
        parser = AKParser()
        tree = parser.parse(doc)
        text = ''
        q = deque()
        q.append(tree)
        # was: `while True: if not q: break` — same loop, idiomatic form
        while q:
            node = q.pop()
            if node.tag:
                if 'link' == node.tag.lower():
                    val = node.children[0].value.split('|')[0]
                    text += f' {val} '
                    continue
            if node.value:
                text += f' {node.value} '
            children = node.children
            if children:
                for c in children:
                    q.append(c)
        # Collapse runs of spaces introduced by the f-string padding above.
        return re.sub(' +', ' ', text)

    def text_to_word_array(self, text):
        """Lower-case *text*, drop everything except letters and whitespace,
        and return the resulting word list."""
        # Raw string avoids the invalid-escape-sequence warning '\s' caused.
        regex = re.compile(r'[^a-zA-Z\s]')
        text = regex.sub('', text)
        text = re.sub(' +', ' ', text)
        return text.lower().split()
def test_similarity(self):
    """Run the full tf → idf → tf-idf pipeline on three sample documents
    and print the similarity scores for ["life", "learning"]."""
    table = TfIdf()
    corpus = [
        ("doc1", ["The", "game", "of", "life", "is", "a", "game", "of",
                  "everlasting", "learning"]),
        ("doc2", ["The", "unexamined", "life", "is", "not", "worth",
                  "living"]),
        ("doc3", ["Never", "stop", "learning"]),
    ]
    for doc_name, words in corpus:
        table.add_document(doc_name, words)
    table.calculate_tf()
    table.calculate_idf()
    table.calculate_tf_idf()
    # Disabled assertion kept from the original author (document names do
    # not match this corpus):
    """self.assertEqual(
        table.similarities(["life","learning"]),
        [["foo", 1.0], ["bar", 0.707106781], ["baz", 0.707106781]])"""
    print(table.similarities(["life", "learning"]))
def calculateTFIDFofNew(self, inputTitle, inputBody):
    """Score each sentence of *inputBody* against *inputTitle* via TF-IDF.

    Args:
        inputTitle: title text, tokenized with self.textToWordsArray.
        inputBody: body text, split into per-sentence word arrays with
            self.textArrayToWordsArray.

    Returns:
        list: one similarity score per sentence, zero-padded to at least
        5 entries; [] when the body has no sentences.
    """
    title = self.textToWordsArray(inputTitle)
    sentences = self.textArrayToWordsArray(inputBody)
    if not sentences:
        return []
    table = TfIdf()
    # One pseudo-document per sentence: "sentences0", "sentences1", ...
    for i, sentence in enumerate(sentences):
        table.add_document("sentences" + str(i), sentence)
    # similarities() yields [name, score] pairs; keep only the scores.
    result = [similarity[1] for similarity in table.similarities(title)]
    # Pad with zeros so callers always receive at least 5 scores
    # (no-op when there are already 5 or more).
    result.extend([0] * (5 - len(result)))
    return result
# NOTE(review): fragment — the header of the enclosing loop over result
# objects (and the `if` matching the trailing `else`) are outside this view;
# the indentation below is reconstructed and should be verified.
        soup = BeautifulSoup(obj["content"], "html.parser",
                             from_encoding="iso-8859-1")
        # Stemmed tokens from the text of every <p> element on the page.
        joinedText = [
            ps.stem(word) for word in ' '.join(
                [p.get_text() for p in soup.find_all("p", text=True)]).split()
        ]
        # NOTE(review): joinedText is a list, so .lower() raises
        # AttributeError here — presumably ' '.join(joinedText) was intended;
        # `line` is also unused in the visible code. Confirm against original.
        line = set(
            re.sub(r'[^a-zA-Z0-9]', ' ', joinedText.lower()).split())
        #response = vectorizer.fit_transform([])
        table.add_document(obj["url"], joinedText)
        # Build the link graph consumed by PageRank below.
        doc = html.fromstring(obj["content"])
        for l in doc.iterlinks():
            G.add_edge(obj["url"], l)
        #print(obj["url"])
    # Combine TF-IDF similarity and PageRank (alpha=0.4) scores per URL.
    tf_idf = table.similarities(query)
    for x in tf_idf:
        rankDict[x[0]] += x[1]
    #G = nx.barabasi_albert_graph(60, 41)
    pr = nx.pagerank(G, 0.4)
    for key, val in pr.items():
        rankDict[key] += val
    # Print URLs best-first by combined score.
    for y in sorted(rankDict.items(), key=lambda x: x[1], reverse=True):
        print(y[0])
else:
    print("No results were found for this query.")
def menu():
    """Interactive console menu (Portuguese UI) driven by stdin lines.

    Options: 1 = run the web scraper, 2 = run TF-IDF over the scraped
    articles for user-supplied words, 3 = quit.

    NOTE(review): formatting reconstructed from a collapsed source line —
    the state machine is driven by `word` (dialogue progress / words read),
    `nword` (promised word count) and `j` (end-of-input phase). Verify the
    nesting against the original file.
    """
    print("Que deseja fazer?")
    print("1 - Consultar a informação do site do jornal ABola")
    print("2 - Aplicar o algoritmo do TFIDF")
    print("3 - Sair")
    # word: TF-IDF dialogue progress; nword: expected number of words;
    # narray: collected query words; j: termination/output phase flag.
    word = 0
    nword = 0
    narray = []
    j = 0
    for line in fileinput.input():
        if line.replace("\n", "") == "1":
            # Option 1: run the scraper, then show the menu again.
            os.system('python3 web_scraper.py')
            print("Que deseja fazer?")
            print("1 - Consultar a informação do site do jornal ABola")
            print("2 - Aplicar o algoritmo do TFIDF")
            print("3 - Sair")
        elif (line.replace("\n", "") == "2") or (word > 0):
            # Option 2, or a TF-IDF dialogue already in progress.
            if word == 0 and j == 0:
                if (os.path.isdir("artigos") == False):
                    # No scraped articles yet — user must pick option 1 first.
                    print(
                        'Necessita de gerar primeiro o conteúdo. Escolha a opção 1'
                    )
                    print("Que deseja fazer?")
                    print("1 - Consultar a informação do site do jornal ABola")
                    print("2 - Aplicar o algoritmo do TFIDF")
                    print("3 - Sair")
                else:
                    # Index every article: the part before the first 'h2'
                    # marker is the title, the part after is the body text.
                    filesA = os.listdir('artigos')
                    table = TfIdf()
                    for i in filesA:
                        with open('artigos/{}'.format(i), 'r') as content:
                            #print(content.read().split('h2'))
                            val = content.read().split('h2')
                            firstVal = val[0]
                            secondVal = val[1]
                            table.add_document(
                                'title{}'.format(i),
                                re.sub(r'[\W]', ' ', firstVal).lower().split())
                            table.add_document(
                                'text{}'.format(i),
                                re.sub(r'[\W]', ' ', secondVal).lower().split())
                    word += 1
                    print('Indique quantas palavras quer comparar:')
            elif (word == 1) and (j == 0):
                # Waiting for the word count (must be > 1).
                if (line.replace("\n", "").isnumeric() and int(line) > 1):
                    nword = int(line)
                    word += 1
                else:
                    print('Digite um número maior que 1')
            elif (word > 1) and (word <= nword) and (j == 0):
                # Collecting query words; a blank line ends the collection.
                if (line.replace("\n", "") != ''):
                    narray.append(line.replace("\n", "").lower())
                    word += 1
                else:
                    j = 1
            if (j == 1):
                # Take a final word (if the line was not blank), then move
                # to the output phase.
                if line.replace("\n", "") != '':
                    narray.append(line.replace("\n", "").lower())
                j += 1
            if (j == 2):
                # Weighted TF-IDF: title matches weighted 0.7, body matches
                # 0.3; results written to an HTML report and opened in the
                # default browser.
                print(narray)
                fTDIDF = open('output' + narray[0] + '.html', 'w+')
                fTDIDF.write(
                    '<h2>Resultados da aplicação do algoritmo:<h2>')
                splitArray = {}
                for s in table.similarities(narray):
                    if s[0].startswith('title'):
                        s[0] = s[0].replace('title', '')
                        if s[0] in splitArray.keys():
                            splitArray[s[0]] += s[1] * 0.7
                        else:
                            splitArray[s[0]] = s[1] * 0.7
                    elif s[0].startswith('text'):
                        s[0] = s[0].replace('text', '')
                        if s[0] in splitArray.keys():
                            splitArray[s[0]] += s[1] * 0.3
                        else:
                            splitArray[s[0]] = s[1] * 0.3
                for elem in splitArray.keys():
                    fTDIDF.write(
                        '<p><h5><a href="artigos/{}" >'.format(elem) + elem +
                        '</a> -> ' + str(splitArray[elem]) + '</h5></p>')
                new = 2  # open in a new tab, if possible
                url = "file:///home/ze/SPLN/WebScraper/output" + narray[
                    0] + ".html"
                webbrowser.open(url, new=new)
                # Reset the dialogue state and show the menu again.
                # NOTE(review): `j` is never reset to 0 here, so a later
                # option "2" re-enters this output branch immediately —
                # confirm whether that is intended.
                word = 0
                nword = 0
                narray = []
                print("Que deseja fazer?")
                print("1 - Consultar a informação do site do jornal ABola")
                print("2 - Aplicar o algoritmo do TFIDF")
                print("3 - Sair")
        elif (line.replace("\n", "") == "3") and (word == 0):
            # Option 3: quit.
            print("Obrigado pela sua visita")
            fileinput.close()
# NOTE(review): fragment — the `def` of the word-preprocessing function and
# its loop header sit above this view; indentation is reconstructed.
        # NOTE(review): both results are discarded — str.lower() and
        # lemmatize() return new values; as written these lines are no-ops.
        i.lower()
        lemmatizer.lemmatize(i)
    # Presumably `list` is a local accumulator (it shadows the builtin) —
    # TODO confirm against the full function.
    return list


if __name__ =="__main__":
    # Read URLs, index them, then rank them against a user query via TF-IDF.
    print("Please enter the URLs saperated by a space" )
    urls=input().strip().split()
    index(urls)
    l1=[]
    l2=tagrem.text_parser(urls,l1)
    print("Please enter the query saperated by space")
    query=input().strip().split()
    print("The entered urls are")
    for i in range(len(urls)):
        # NOTE(review): `i` is an int, so `i + ":"` raises TypeError —
        # str(i) was probably intended.
        print(i +":"+urls[i])
    query=word_preprocessing(query)
    table = TfIdf()
    # One document per URL, keyed by its position in `urls`.
    for i in range(len(urls)):
        table.add_document(str(i),l2[i])
    x=table.similarities(query)
    print("URLs Indexed by cosine rankings are")
    for i in x:
        # NOTE(review): `i` is an element of x (a [name, score] pair), so
        # x[i] will fail — likely i[0] was meant. Verify before relying on
        # this output loop.
        a=int(x[i][0])
        print(i+":"+urls[a])
# Index the "criadorCanvas" document (Portuguese text on the origin of the
# Business Model Canvas) into the shared TF-IDF table. `table` is defined
# elsewhere in this script. NOTE: the `print` statement below means this
# chunk is Python 2 code.
table.add_document("criadorCanvas", [
    "Por", "muitos", "anos", "o", "termo", "modelo", "de", "negócios", "foi",
    "usado", "sem", "um", "consenso", "na", "sua", "definição", ".", "Muitos",
    "autores", "o", "mencionavam", "sem", "explicitar", "do", "que",
    "exatamente", "falavam", "E", "foi", "exatamente", "pensando", "nisso",
    "que", "o", "consultor", "suíço", "Alexander", "Osterwalder", "começou",
    "a", "desenvolver", "sua", "tese", "de", "doutorado", "que", "daria",
    "origem", "ao", "Business", "Model", "Canvas.", "Alexander", "percebeu",
    "que", "definir", "o", "termo", "não", "seria", "suficiente.", "Era",
    "necessário", "criar", "algo", "que", "incentivasse", "a", "inovação,",
    "a", "prototipação", "e", "co-criação", "(criação", "colaborativa).",
    "Utilizando-se,", "assim,", "de", "conceitos", "de", "design", "thinking",
    "Alexander", "começou", "com", "um", "simples", "gráfico", "feito", "em",
    "powerpoint", "que", "anos", "mais", "tarde", "se", "tornaria", "uma",
    "bela", "tela", "(canvas)", "separada", "em", "nove", "blocos."
])
# Rank every indexed document against the question "o que é canvas" and
# print the matches best-first.
ordena = table.similarities(["o", "que", "é", "canvas"])
dicio = dict(ordena)
print sorted(dicio.items(), key=itemgetter(1), reverse=True)
# Commented-out manual selection-sort of the similarity list, superseded by
# sorted(...) above.
#tableOrdenada = []tableCopia = ordena[:]
#while len(tableOrdenada) < len(ordena):
#maior = tableCopia[0]
#for a in tableCopia:
#if a[1] > maior[1]:
#maior = a
#tableOrdenada.append(maior)
#tableCopia.remove(maior)
#print(tableOrdenada)
# Index the "quandoUsar" document (Portuguese text on when to use the
# Business Model Canvas) into the shared TF-IDF table; `table` is defined
# elsewhere in this script.
table.add_document("quandoUsar", [
    "lancar", "o", "produto", "perfeito", "nao", "é", "mais", "suficiente",
    "afinal", "nao", "basta", "ser", "é", "preciso", "parecer", "em",
    "outras", "palavras", "tao", "importante", "quanto", "ter", "produtos",
    "ou", "servicos", "de", "alto", "valor", "agregado", "é", "saber",
    "transmitir", "seu", "modelo", "de", "negocios", "com", "clareza", "e",
    "precisaonesse", "sentido", "o", "canvas", "é", "muito", "utilizado",
    "para", "colocar", "as", "ideias", "de", "negocio", "no", "papel",
    "assim", "é", "possivel", "discutir", "e", "persuadir", "socios",
    "sobre", "estratégias", "de", "acao", "bem", "como", "apresentar", "a",
    "empresa", "aos", "investidores", "ainda", "assim", "é", "na", "area",
    "de", "tecnologia", "da", "informacao", "que", "esse", "modelo", "vem",
    "sendo", "cada", "vez", "mais", "demandado"
])
# Rank the corpus against four canned questions; only the last result is
# live — the print statements (Python 2 syntax) are all commented out, and
# `dicio` is overwritten by each query.
oQueE = table.similarities(["o", "que", "e", "canvas"])
dicio = dict(oQueE)
#print sorted(dicio.items(), key=itemgetter(1),reverse=True)
ondeUsar = table.similarities(["onde", "posso", "usar", "o", "canvas"])
dicio = dict(ondeUsar)
#print sorted(dicio.items(), key=itemgetter(1),reverse=True)
comoAplicar = table.similarities(["como", "posso", "aplicar", "o", "canvas"])
dicio = dict(comoAplicar)
#print sorted(dicio.items(),key=itemgetter(1),reverse=True)
quandoAplicar = table.similarities(
    ["quando", "devo", "aplicar", "o", "canvas"])
dicio = dict(quandoAplicar)
#print sorted(dicio.items(),key=itemgetter(1),reverse=True)