def calcularfrecuencia(self, texto, palabra=None):
    """Return True when `palabra` scores a nonzero TF-IDF similarity in `texto`.

    :param texto: token list for the single document to index
    :param palabra: query token list (defaults to an empty query)
    :return: bool — True iff the similarity score is strictly positive
    """
    # Original used a mutable default argument (palabra=[]); use the
    # None-sentinel idiom so no list object is shared across calls.
    if palabra is None:
        palabra = []
    table = TfIdf()
    table.add_document("informacion", texto)
    # similarities() returns [name, score] pairs; only one document is loaded,
    # so [0][1] is that document's score for the query.
    resultado = table.similarities(palabra)[0][1]
    return resultado > 0.0
class SearchEngine:
    """A minimal TF-IDF search index over AK-parsed documents."""

    def __init__(self):
        self.tfidf = TfIdf()

    def load_documents(self, documents):
        """Index every document in `documents` under its `.name`."""
        for document in documents:
            tokens = self.text_to_word_array(self.doc_to_text(document))
            self.tfidf.add_document(document.name, tokens)

    def query(self, query):
        """Return [name, score] similarity pairs for the query tokens."""
        return self.tfidf.similarities(query)

    def doc_to_text(self, doc):
        """Flatten the parse tree of `doc` into one space-separated string."""
        stack = deque()
        stack.append(AKParser().parse(doc))
        pieces = []
        while stack:
            node = stack.pop()  # LIFO pop — same traversal order as before
            if node.tag and node.tag.lower() == 'link':
                # Link nodes contribute only the target before the '|',
                # and their children are not traversed.
                target = node.children[0].value.split('|')[0]
                pieces.append(f' {target} ')
                continue
            if node.value:
                pieces.append(f' {node.value} ')
            if node.children:
                stack.extend(node.children)
        return re.sub(' +', ' ', ''.join(pieces))

    def text_to_word_array(self, text):
        """Lower-case `text` and split it into purely alphabetic words."""
        letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
        collapsed = re.sub(' +', ' ', letters_only)
        return collapsed.lower().split()
def build_tfidf_model(self, files):
    """Build the Tf-Idf model.

    :param files: list of file paths of the corpora
    :return: a TfIdf object with the model loaded
    """
    model = TfIdf()
    for path in files:
        # Document name is the path's final component.
        doc_name = path.split('/')[-1]
        with open(path) as handle:
            # Only the first line of each file is tokenised (as before).
            model.add_document(doc_name, handle.readline().split())
    return model
def calculateTFIDFofNew(self, inputTitle, inputBody):
    """Score each body sentence against the title via TF-IDF.

    :param inputTitle: raw title text
    :param inputBody: raw body text, split into sentences by a helper
    :return: list of similarity scores, zero-padded to at least five entries
             (empty list when the body yields no sentences)
    """
    title_words = self.textToWordsArray(inputTitle)
    sentence_words = self.textArrayToWordsArray(inputBody)
    if not sentence_words:
        return []
    table = TfIdf()
    for index, words in enumerate(sentence_words):
        table.add_document("sentences" + str(index), words)
    # similarities() yields [name, score] pairs; keep only the scores.
    scores = [pair[1] for pair in table.similarities(title_words)]
    # Guarantee a minimum of five entries by padding with zeros.
    while len(scores) < 5:
        scores.append(0)
    return scores
def test_similarity(self):
    """Similarity scores for a three-document corpus must match exactly."""
    corpus = {
        "foo": ["a", "b", "c", "d", "e", "f", "g", "h"],
        "bar": ["a", "b", "c", "i", "j", "k"],
        "baz": ["k", "l", "m", "n"],
    }
    table = TfIdf()
    for doc_name, words in corpus.items():
        table.add_document(doc_name, words)
    expected = [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]]
    self.assertEqual(table.similarities(["a", "b", "c"]), expected)
def test_similarity(self):
    """Run the tf/idf pipeline and print the resulting similarities."""
    table = TfIdf()
    corpus = {
        "doc1": ["The", "game", "of", "life", "is", "a", "game", "of",
                 "everlasting", "learning"],
        "doc2": ["The", "unexamined", "life", "is", "not", "worth", "living"],
        "doc3": ["Never", "stop", "learning"],
    }
    for doc_name, words in corpus.items():
        table.add_document(doc_name, words)
    table.calculate_tf()
    table.calculate_idf()
    table.calculate_tf_idf()
    # Disabled expected-value check kept from the original:
    # self.assertEqual(table.similarities(["life", "learning"]),
    #                  [["foo", 1.0], ["bar", 0.707106781], ["baz", 0.707106781]])
    print(table.similarities(["life", "learning"]))
class TestSequenceFunctions(unittest.TestCase):
    """Unit tests for the TfIdf vocabulary / frequency model."""

    def setUp(self):
        self.unk_cutoff = 2
        self.vocab = TfIdf(unk_cutoff=self.unk_cutoff)

    def test_vocab(self):
        """Words below the cutoff collapse onto a single unknown token."""
        self.vocab.train_seen("a", 300)
        for rare in ("b", "c"):
            self.vocab.train_seen(rare)
        self.vocab.finalize()

        lookup = self.vocab.vocab_lookup
        # Both infrequent words map to the same token...
        self.assertEqual(lookup("b"), lookup("c"))
        # ...which is also the token assigned to a never-seen word.
        self.assertEqual(lookup("b"), lookup("d"), "")
        # The frequent word keeps its own identity.
        self.assertNotEqual(lookup("a"), lookup("b"))

    def test_censor(self):
        """Tokenisation is deterministic and position-consistent."""
        self.vocab.train_seen("a", 300)
        for rare in ("b", "c"):
            self.vocab.train_seen(rare)
        self.vocab.finalize()

        censored_a = [str(tok) for tok in self.vocab.tokenize("a b d")]
        censored_b = [str(tok) for tok in self.vocab.tokenize("d b a")]
        censored_c = [str(tok) for tok in self.vocab.tokenize("a b d")]
        censored_d = [str(tok) for tok in self.vocab.tokenize("b d a")]
        self.assertEqual(censored_a, censored_c)
        self.assertEqual(censored_b, censored_d)

        # Should add start and end tag
        print(censored_a)
        self.assertEqual(len(censored_a), 3)
        self.assertEqual(censored_a[0], censored_b[2])
        self.assertEqual(censored_a[1], censored_b[0])

    def test_tf(self):
        """MLE term frequencies; unseen words share the unknown token."""
        self.vocab.train_seen("a", 300)
        self.vocab.finalize()
        self.vocab.add_document("a a b")

        lookup = self.vocab.vocab_lookup
        for word, expected in (("a", 0.66666666),
                               ("b", 0.33333333),
                               ("c", 0.33333333)):
            self.assertAlmostEqual(self.vocab.term_freq(lookup(word)), expected)

    def test_df(self):
        """Inverse document frequency over a four-document corpus."""
        self.vocab.train_seen("a", 300)
        self.vocab.train_seen("b", 100)
        self.vocab.finalize()
        for document in ("a a b", "b b c", "a a a", "a a a"):
            self.vocab.add_document(document)

        lookup = self.vocab.vocab_lookup
        for word, ratio in (("a", 1.3333333),
                            ("b", 2.0),
                            ("c", 4.0),
                            ("d", 4.0)):
            self.assertAlmostEqual(self.vocab.inv_docfreq(lookup(word)),
                                   log10(ratio))
# NOTE(review): script fragment — `results`, `urls`, `ps`, `table`, `G`,
# `query`, `rankDict`, `nx` and the imports are defined outside this chunk,
# and the final `for y ...` loop's body continues beyond it. Indentation
# below is reconstructed from a whitespace-mangled source — verify against
# the original file.
for x in results:
    # `urls` appears to map 1-based result ids to file paths — TODO confirm.
    f = open(urls[x - 1][0])  # NOTE(review): handle never closed; use `with`
    obj = json.load(f)
    soup = BeautifulSoup(obj["content"], "html.parser",
                         from_encoding="iso-8859-1")
    # Stem every word of the concatenated <p> text.
    joinedText = [
        ps.stem(word) for word in ' '.join(
            [p.get_text() for p in soup.find_all("p", text=True)]).split()
    ]
    # NOTE(review): BUG — `joinedText` is a list; list has no `.lower()`,
    # so this line raises AttributeError if executed. Probably intended
    # `' '.join(joinedText).lower()`. `line` is also never used afterwards.
    line = set(
        re.sub(r'[^a-zA-Z0-9]', ' ', joinedText.lower()).split())
    #response = vectorizer.fit_transform([])
    table.add_document(obj["url"], joinedText)
    # Build the link graph for PageRank from the raw HTML.
    doc = html.fromstring(obj["content"])
    for l in doc.iterlinks():
        G.add_edge(obj["url"], l)
    #print(obj["url"])
# Combine TF-IDF similarity scores with PageRank scores per URL.
tf_idf = table.similarities(query)
for x in tf_idf:
    rankDict[x[0]] += x[1]
#G = nx.barabasi_albert_graph(60, 41)
pr = nx.pagerank(G, 0.4)
for key, val in pr.items():
    rankDict[key] += val
# NOTE(review): loop body continues beyond this chunk.
for y in sorted(rankDict.items(), key=lambda x: x[1], reverse=True):
def menu():
    """Interactive console menu: scrape the ABola site (option 1), run a
    TF-IDF comparison over the scraped articles (option 2), or quit (3).

    State machine driven by counters read across input lines:
      word  — 0: idle; 1: waiting for the word count; >1: collecting words
      nword — how many comparison words the user asked for
      j     — 1 once collection ends, 2 when results should be produced

    NOTE(review): indentation reconstructed from a whitespace-mangled
    source — verify nesting against the original file. `table` is only
    bound after option 2 succeeds, and `fTDIDF` is never closed.
    """
    print("Que deseja fazer?")
    print("1 - Consultar a informação do site do jornal ABola")
    print("2 - Aplicar o algoritmo do TFIDF")
    print("3 - Sair")
    word = 0
    nword = 0
    narray = []
    j = 0
    for line in fileinput.input():
        if line.replace("\n", "") == "1":
            # Option 1: run the scraper, then show the menu again.
            os.system('python3 web_scraper.py')
            print("Que deseja fazer?")
            print("1 - Consultar a informação do site do jornal ABola")
            print("2 - Aplicar o algoritmo do TFIDF")
            print("3 - Sair")
        elif (line.replace("\n", "") == "2") or (word > 0):
            if word == 0 and j == 0:
                # Option 2 just selected: need the scraped articles first.
                if (os.path.isdir("artigos") == False):
                    print(
                        'Necessita de gerar primeiro o conteúdo. Escolha a opção 1'
                    )
                    print("Que deseja fazer?")
                    print("1 - Consultar a informação do site do jornal ABola")
                    print("2 - Aplicar o algoritmo do TFIDF")
                    print("3 - Sair")
                else:
                    # Index each article twice: title part and text part,
                    # split on the literal marker 'h2'.
                    filesA = os.listdir('artigos')
                    table = TfIdf()
                    for i in filesA:
                        with open('artigos/{}'.format(i), 'r') as content:
                            #print(content.read().split('h2'))
                            val = content.read().split('h2')
                            firstVal = val[0]
                            secondVal = val[1]
                            table.add_document(
                                'title{}'.format(i),
                                re.sub(r'[\W]', ' ', firstVal).lower().split())
                            table.add_document(
                                'text{}'.format(i),
                                re.sub(r'[\W]', ' ', secondVal).lower().split())
                    word += 1
                    print('Indique quantas palavras quer comparar:')
            elif (word == 1) and (j == 0):
                # Expecting the number of comparison words (> 1).
                if (line.replace("\n", "").isnumeric() and int(line) > 1):
                    nword = int(line)
                    word += 1
                else:
                    print('Digite um número maior que 1')
            elif (word > 1) and (word <= nword) and (j == 0):
                # Collecting the comparison words, one per line;
                # an empty line ends collection early.
                if (line.replace("\n", "") != ''):
                    narray.append(line.replace("\n", "").lower())
                    word += 1
                else:
                    j = 1
            if (j == 1):
                if line.replace("\n", "") != '':
                    narray.append(line.replace("\n", "").lower())
                    j += 1
            if (j == 2):
                # All words collected: score them and write an HTML report.
                print(narray)
                fTDIDF = open('output' + narray[0] + '.html', 'w+')
                fTDIDF.write(
                    '<h2>Resultados da aplicação do algoritmo:<h2>')
                # Weighted merge: title matches count 0.7, text matches 0.3.
                splitArray = {}
                for s in table.similarities(narray):
                    if s[0].startswith('title'):
                        s[0] = s[0].replace('title', '')
                        if s[0] in splitArray.keys():
                            splitArray[s[0]] += s[1] * 0.7
                        else:
                            splitArray[s[0]] = s[1] * 0.7
                    elif s[0].startswith('text'):
                        s[0] = s[0].replace('text', '')
                        if s[0] in splitArray.keys():
                            splitArray[s[0]] += s[1] * 0.3
                        else:
                            splitArray[s[0]] = s[1] * 0.3
                for elem in splitArray.keys():
                    fTDIDF.write(
                        '<p><h5><a href="artigos/{}" >'.format(elem) + elem +
                        '</a> -> ' + str(splitArray[elem]) + '</h5></p>')
                new = 2  # open in a new tab, if possible
                # NOTE(review): hard-coded absolute path to the author's home.
                url = "file:///home/ze/SPLN/WebScraper/output" + narray[
                    0] + ".html"
                webbrowser.open(url, new=new)
                # Reset state and show the menu again.
                word = 0
                nword = 0
                narray = []
                print("Que deseja fazer?")
                print("1 - Consultar a informação do site do jornal ABola")
                print("2 - Aplicar o algoritmo do TFIDF")
                print("3 - Sair")
        elif (line.replace("\n", "") == "3") and (word == 0):
            print("Obrigado pela sua visita")
            fileinput.close()
# NOTE(review): chunk starts mid-function — the enclosing def (presumably
# `word_preprocessing`, called below) begins outside this view; indentation
# of the first three lines is reconstructed and must be verified.
    # NOTE(review): both results are discarded — str.lower() returns a new
    # string and lemmatize() returns the lemma; likely meant to be collected.
    i.lower()
    lemmatizer.lemmatize(i)
    return list  # NOTE(review): presumably a local that shadows builtin `list`


if __name__ =="__main__":
    print("Please enter the URLs saperated by a space" )
    urls=input().strip().split()
    index(urls)
    l1=[]
    l2=tagrem.text_parser(urls,l1)
    print("Please enter the query saperated by space")
    query=input().strip().split()
    print("The entered urls are")
    for i in range(len(urls)):
        # NOTE(review): BUG — int + str raises TypeError; needs str(i).
        print(i +":"+urls[i])
    query=word_preprocessing(query)
    table = TfIdf()
    # One TF-IDF document per URL, keyed by its index.
    for i in range(len(urls)):
        table.add_document(str(i),l2[i])
    x=table.similarities(query)
    print("URLs Indexed by cosine rankings are")
    for i in x:
        # NOTE(review): BUG — `i` is an element of `x` ([name, score] pair),
        # not an index; `x[i]` and `i+":"` both raise TypeError.
        a=int(x[i][0])
        print(i+":"+urls[a])
Created on 15 Jul 2018

@author: goksukara
'''
from tfidf import TfIdf


def addallfilesinpath(path):
    # TODO: unimplemented stub — presumably meant to index every file
    # under `path`.
    pass


if __name__ == "__main__":
    # Smoke-test driver for the custom TfIdf class ('s' argument semantics
    # are defined in tfidf.py — not visible here).
    Tf_idf = TfIdf('s')
    # NOTE(review): `list` shadows the builtin of the same name.
    list=[['human', 'human', 'interface'],['ship', 'human', 'interface']]
    list1=[['ship', 'humasn', 'interface']]
    list2=[['human', 'human', 'am']]
    list3=[['humafn', 'humasn', 'am1']]
    #map(unicode,list)
    # Add documents in two batches, persisting intermediate state between.
    Tf_idf.add_document(list)
    Tf_idf.add_document(list1)
    Tf_idf.Saverelatedwords()
    Tf_idf.add_document(list2)
    Tf_idf.add_document(list3)
    Tf_idf.SaveCorpusdic()
    # Reload the saved dictionary, build the model, and emit TF-IDF scores.
    Tf_idf.loaddictionary()
    Tf_idf.buildmodel()
    #Tf_idf.listnhighIdfs(10)
    Tf_idf.getTF_IDF()
from tfidf import TfIdf import unittest from operator import itemgetter table = TfIdf() table.add_document("oqueE", [ "Business", "Model", "Generation", "ou", "simplesmente", "Canvas", "É", "uma", "metodologia", "criada", "em", "meados", "dos", "anos", "2000", "pelo", "Suíço", "Alex", "Osterwalder", "durante", "sua", "Tese", "de", "Doutorado", "na", "prestigiada", "HEC", "Lausanne", ",e", "Yves", "Pigneur.", "O", "Canvas", "é", "um", "esquema", "visual", "que", "possibilita", "as", "pessoas", "cocriarem", "modelos", "de", "negócios", "analisando", "9", "elementos", "que", "toda", "empresa", "ou", "organização", "possuem:", "proposta", "de", "valor", "parcerias", "chaves", ",atividades", "chaves", ",recursos", "chaves", ",relacionamento", "com", "clientes", ",segmentos", "de", "clientes", ",canais", "de", "distribuição", ",estrutura", "de", "custos", "e", "fluxo", "de", "receitas", "(HSM", ",2017", "organização", "do", "empreendedor", "de", "seus", "concorrentes", "ou", "qual", "quer", "outra", "empresa", "Conforme", "Osterwalder", "e", "Pigneur", "o", "conceito", "Canvas", "já", "foi", "aplicado", "e", "testado", "em", "todo", "o", "mundo", "e", "já", "é", "utilizado", "por", "grandes", "organizações", "como", "IBM", "Ericsson", "Deloitte", "Public", "Works", "o", "governo", "do", "Canadá", "entre", "outras" ]) table.add_document("ondeUsar", [ "O", "canvas", "pode", "ser", "usado", "em", "companhia", "firma", "casa", "negócio", "sociedade", "entidade", "estabelecimento", "instituição", "organização", "empregador", "parceria", "corporação", "cometimento", "desígnio", "empreendimento", "feito", "intento", "tentativa", "projeto",
from tfidf import TfIdf import unittest from operator import itemgetter table = TfIdf() table.add_document("oqueE", [ "business", "model", "generation", "ou", "simplesmente", "canvas", "e", "uma", "metodologia", "criada", "em", "meados", "dos", "anos", "2000", "pelo", "suico", "alex", "osterwalder", "durante", "sua", "tese", "de", "doutorado", "na", "prestigiada", "hec", "lausanne", ",e", "yves", "pigneur.", "o", "canvas", "e", "um", "esquema", "visual", "que", "possibilita", "as", "pessoas", "cocriarem", "modelos", "de", "negócios", "analisando", "9", "elementos", "que", "toda", "empresa", "ou", "organização", "possuem:", "proposta", "de", "valor", "parcerias", "chaves", ",atividades", "chaves", ",recursos", "chaves", ",relacionamento", "com", "clientes", ",segmentos", "de", "clientes", "canais", "de", "distribuição", ",estrutura", "de", "custos", "e", "fluxo", "de", "receitas", "hsm", ",2017", "organização", "do", "empreendedor", "de", "seus", "concorrentes", "ou", "qual", "quer", "outra", "empresa", "conforme", "osterwalder", "e", "pigneur", "o", "conceito", "canvas", "ja", "foi", "aplicado", "e", "testado", "em", "todo", "o", "mundo", "e", "ja", "e", "utilizado", "por", "grandes", "organizações", "como", "ibm", "ericsson", "deloitte", "public", "works", "o", "governo", "do", "canada", "entre", "outras" ]) table.add_document("ondeUsar", [ "O", "canvas", "pode", "ser", "usado", "em", "companhia", "firma", "casa", "negócio", "sociedade", "entidade", "estabelecimento", "instituição", "organização", "empregador", "parceria", "corporação", "cometimento", "desígnio", "empreendimento", "feito", "intento", "tentativa", "projeto",