Example #1
    def calcularfrecuencia(self, texto, palabra=None):
        # Use None instead of a mutable default argument.
        if palabra is None:
            palabra = []

        table = TfIdf()
        table.add_document("informacion", texto)
        # similarities() returns [doc_name, score] pairs; grab the score
        # for the single document added above.
        resultado = table.similarities(palabra)[0][1]
        return resultado > 0.0
Example #2
import re
from collections import deque

from tfidf import TfIdf
# AKParser (used below) comes from elsewhere in the original project.


class SearchEngine:
    def __init__(self):
        self.tfidf = TfIdf()

    def load_documents(self, documents):
        for doc in documents:
            name = doc.name
            text = self.doc_to_text(doc)
            words = self.text_to_word_array(text)

            self.tfidf.add_document(name, words)

    def query(self, query):
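        # Score every loaded document against the query terms.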
        return self.tfidf.similarities(query)

    def doc_to_text(self, doc):
        parser = AKParser()
        tree = parser.parse(doc)

        text = ''

        q = deque()
        q.append(tree)

        # Depth-first walk of the parse tree.
        while q:
            node = q.pop()

            # "link" nodes contribute only the target before the '|'.
            if node.tag and node.tag.lower() == 'link':
                val = node.children[0].value.split('|')[0]
                text += f' {val} '
                continue

            if node.value:
                text += f' {node.value} '

            children = node.children

            if children:
                for c in children:
                    q.append(c)

        return re.sub(' +', ' ', text)

    def text_to_word_array(self, text):
        # Raw string avoids the invalid "\s" escape warning; keep only
        # ASCII letters and whitespace.
        regex = re.compile(r'[^a-zA-Z\s]')

        text = regex.sub('', text)
        text = re.sub(' +', ' ', text)

        return text.lower().split()
Example #3
    def build_tfidf_model(self, files):
        '''
        Builds the Tf-Idf model.
        :param files: list of file paths that make up the corpus
        :return: a TfIdf object with the model loaded
        '''
        tfidf = TfIdf()
        for file_path in files:
            with open(file_path) as f:
                doc_name = file_path.split('/')[-1]
                # Only the first line of each file is read, so each
                # document is expected to fit on a single line.
                doc_text = f.readline().split()
                tfidf.add_document(doc_name, doc_text)
        return tfidf
Example #4
    def calculateTFIDFofNew(self, inputTitle, inputBody):
        title = self.textToWordsArray(inputTitle)
        sentences = self.textArrayToWordsArray(inputBody)
        if not sentences:
            return []

        table = TfIdf()
        # One document per sentence, named "sentences0", "sentences1", ...
        for i, sentence in enumerate(sentences):
            table.add_document("sentences" + str(i), sentence)

        # Keep only the scores from the [doc_name, score] pairs.
        result = [similarity[1] for similarity in table.similarities(title)]

        # Pad with zeros so the caller always gets at least 5 scores.
        while len(result) < 5:
            result.append(0)
        return result
Example #5
    def test_similarity(self):
        table = TfIdf()
        table.add_document("foo", ["a", "b", "c", "d", "e", "f", "g", "h"])
        table.add_document("bar", ["a", "b", "c", "i", "j", "k"])
        table.add_document("baz", ["k", "l", "m", "n"])

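        # similarities() returns one [doc_name, score] pair per document,
        # in the order the documents were added.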
        self.assertEqual(
            table.similarities(["a", "b", "c"]),
            [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]])
Example #6
    def test_similarity(self):
        table = TfIdf()
        table.add_document("doc1", [
            "The", "game", "of", "life", "is", "a", "game", "of",
            "everlasting", "learning"
        ])
        table.add_document(
            "doc2",
            ["The", "unexamined", "life", "is", "not", "worth", "living"])
        table.add_document("doc3", ["Never", "stop", "learning"])

        table.calculate_tf()
        table.calculate_idf()
        table.calculate_tf_idf()
        """self.assertEqual(
            table.similarities(["life","learning"]),
            [["foo", 1.0], ["bar", 0.707106781], ["baz", 0.707106781]])"""

        print(table.similarities(["life", "learning"]))
Example #7
class TestSequenceFunctions(unittest.TestCase):
    def setUp(self):
        self.unk_cutoff = 2
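        # Words seen fewer than unk_cutoff times are mapped to a shared
        # "unknown" token (exercised by the vocab tests below).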
        self.vocab = TfIdf(unk_cutoff=self.unk_cutoff)

    def test_vocab(self):
        self.vocab.train_seen("a", 300)

        self.vocab.train_seen("b")
        self.vocab.train_seen("c")
        self.vocab.finalize()

        # Infrequent words should look the same
        self.assertEqual(self.vocab.vocab_lookup("b"),
                         self.vocab.vocab_lookup("c"))

        # Infrequent words should look the same as never seen words
        self.assertEqual(self.vocab.vocab_lookup("b"),
                         self.vocab.vocab_lookup("d"))

        # The frequent word should be different from the infrequent word
        self.assertNotEqual(self.vocab.vocab_lookup("a"),
                            self.vocab.vocab_lookup("b"))

    def test_censor(self):
        self.vocab.train_seen("a", 300)

        self.vocab.train_seen("b")
        self.vocab.train_seen("c")
        self.vocab.finalize()

        censored_a = [str(x) for x in self.vocab.tokenize("a b d")]
        censored_b = [str(x) for x in self.vocab.tokenize("d b a")]
        censored_c = [str(x) for x in self.vocab.tokenize("a b d")]
        censored_d = [str(x) for x in self.vocab.tokenize("b d a")]

        self.assertEqual(censored_a, censored_c)
        self.assertEqual(censored_b, censored_d)

        # Should add start and end tag
        self.assertEqual(len(censored_a), 3)
        self.assertEqual(censored_a[0], censored_b[2])
        self.assertEqual(censored_a[1], censored_b[0])

    def test_tf(self):
        self.vocab.train_seen("a", 300)
        self.vocab.finalize()

        self.vocab.add_document("a a b")

        # Test MLE
        word_a = self.vocab.vocab_lookup("a")
        word_b = self.vocab.vocab_lookup("b")
        word_c = self.vocab.vocab_lookup("c")

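        # "b" and "c" fall below unk_cutoff, so both resolve to the shared
        # unknown token and report the same term frequency.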
        self.assertAlmostEqual(self.vocab.term_freq(word_a), 0.66666666)
        self.assertAlmostEqual(self.vocab.term_freq(word_b), 0.33333333)
        self.assertAlmostEqual(self.vocab.term_freq(word_c), 0.33333333)

    def test_df(self):
        self.vocab.train_seen("a", 300)
        self.vocab.train_seen("b", 100)
        self.vocab.finalize()

        self.vocab.add_document("a a b")
        self.vocab.add_document("b b c")
        self.vocab.add_document("a a a")
        self.vocab.add_document("a a a")

        # Test inverse document frequency
        word_a = self.vocab.vocab_lookup("a")
        word_b = self.vocab.vocab_lookup("b")
        word_c = self.vocab.vocab_lookup("c")
        word_d = self.vocab.vocab_lookup("d")

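        # "c" and "d" both map to the unknown token, which appears in one
        # of the four documents, hence identical inverse document frequencies.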
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_a),
                               log10(1.3333333))
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_b), log10(2.0))
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_c), log10(4.0))
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_d), log10(4.0))
Example #8
        for x in results:
            with open(urls[x - 1][0]) as f:
                obj = json.load(f)

            soup = BeautifulSoup(obj["content"],
                                 "html.parser",
                                 from_encoding="iso-8859-1")
            # Stem every word from the page's <p> elements.
            joinedText = [
                ps.stem(word) for word in ' '.join(
                    [p.get_text()
                     for p in soup.find_all("p", text=True)]).split()
            ]
            # joinedText is a list, so join it before lowercasing.
            line = set(
                re.sub(r'[^a-zA-Z0-9]', ' ',
                       ' '.join(joinedText).lower()).split())
            table.add_document(obj["url"], joinedText)

            doc = html.fromstring(obj["content"])
            # iterlinks() yields (element, attribute, link, pos) tuples;
            # only the link target is added to the graph.
            for _, _, link, _ in doc.iterlinks():
                G.add_edge(obj["url"], link)

        # Combine the tf-idf similarity scores with PageRank scores.
        tf_idf = table.similarities(query)
        for x in tf_idf:
            rankDict[x[0]] += x[1]
        pr = nx.pagerank(G, 0.4)
        for key, val in pr.items():
            rankDict[key] += val

        for y in sorted(rankDict.items(), key=lambda x: x[1], reverse=True):
Example #9
def print_menu():
    print("What would you like to do?")
    print("1 - Fetch the information from the ABola newspaper site")
    print("2 - Apply the TFIDF algorithm")
    print("3 - Exit")


def menu():
    print_menu()
    word = 0
    nword = 0
    narray = []
    j = 0
    for line in fileinput.input():
        if line.replace("\n", "") == "1":
            os.system('python3 web_scraper.py')
            print_menu()
        elif (line.replace("\n", "") == "2") or (word > 0):
            if word == 0 and j == 0:
                if not os.path.isdir("artigos"):
                    print('You need to generate the content first. '
                          'Choose option 1')
                    print_menu()
                else:
                    filesA = os.listdir('artigos')
                    table = TfIdf()
                    for i in filesA:
                        with open('artigos/{}'.format(i), 'r') as content:
                            # Each article file holds the title and the body
                            # separated by an 'h2' marker.
                            val = content.read().split('h2')
                            firstVal = val[0]
                            secondVal = val[1]
                            table.add_document(
                                'title{}'.format(i),
                                re.sub(r'[\W]', ' ', firstVal).lower().split())
                            table.add_document(
                                'text{}'.format(i),
                                re.sub(r'[\W]', ' ',
                                       secondVal).lower().split())
                    word += 1
                    print('Enter how many words you want to compare:')
            elif (word == 1) and (j == 0):
                if line.replace("\n", "").isnumeric() and int(line) > 1:
                    nword = int(line)
                    word += 1
                else:
                    print('Enter a number greater than 1')
            elif (word > 1) and (word <= nword) and (j == 0):
                if line.replace("\n", "") != '':
                    narray.append(line.replace("\n", "").lower())
                    word += 1
            else:
                j = 1
                if line.replace("\n", "") != '':
                    narray.append(line.replace("\n", "").lower())
                    j += 1
                if j == 2:
                    print(narray)
                    fTDIDF = open('output' + narray[0] + '.html', 'w+')
                    fTDIDF.write(
                        '<h2>Results of applying the algorithm:</h2>')
                    # Title matches are weighted 0.7 and body matches 0.3,
                    # accumulated per article.
                    splitArray = {}
                    for s in table.similarities(narray):
                        if s[0].startswith('title'):
                            s[0] = s[0].replace('title', '')
                            if s[0] in splitArray:
                                splitArray[s[0]] += s[1] * 0.7
                            else:
                                splitArray[s[0]] = s[1] * 0.7
                        elif s[0].startswith('text'):
                            s[0] = s[0].replace('text', '')
                            if s[0] in splitArray:
                                splitArray[s[0]] += s[1] * 0.3
                            else:
                                splitArray[s[0]] = s[1] * 0.3

                    for elem in splitArray.keys():
                        fTDIDF.write(
                            '<p><h5><a href="artigos/{}" >'.format(elem) +
                            elem + '</a> -> ' + str(splitArray[elem]) +
                            '</h5></p>')
                    fTDIDF.close()

                    new = 2  # open in a new tab, if possible
                    url = "file:///home/ze/SPLN/WebScraper/output" + narray[
                        0] + ".html"
                    webbrowser.open(url, new=new)
                    word = 0
                    nword = 0
                    narray = []
                    print_menu()
        elif (line.replace("\n", "") == "3") and (word == 0):
            print("Thank you for visiting")
            fileinput.close()
Example #10
        # Keep the lowercased, lemmatized form; the original calls discarded
        # their results. `list` is assumed to be the accumulator defined in
        # the truncated part of this function.
        list.append(lemmatizer.lemmatize(i.lower()))
    return list


if __name__ == "__main__":
    print("Please enter the URLs separated by a space")
    urls = input().strip().split()
    index(urls)
    l1 = []
    l2 = tagrem.text_parser(urls, l1)
    print("Please enter the query separated by spaces")
    query = input().strip().split()
    print("The entered urls are")
    for i in range(len(urls)):
        # str() is needed to concatenate the integer index.
        print(str(i) + ":" + urls[i])
    query = word_preprocessing(query)
    table = TfIdf()
    for i in range(len(urls)):
        table.add_document(str(i), l2[i])
    x = table.similarities(query)
    print("URLs indexed by cosine rankings are")
    # Each entry of x is a [doc_name, score] pair; the name is the
    # stringified URL index assigned above.
    for rank, (name, _score) in enumerate(x):
        print(str(rank) + ":" + urls[int(name)])
Example #11
'''
Created on 15 Jul 2018

@author: goksukara
'''
from tfidf import TfIdf

def addallfilesinpath(path):
    pass

if __name__ == "__main__":
    Tf_idf = TfIdf('s')
    # Renamed from `list`/`list1`/... to avoid shadowing the built-in.
    docs = [['human', 'human', 'interface'], ['ship', 'human', 'interface']]
    docs1 = [['ship', 'humasn', 'interface']]
    docs2 = [['human', 'human', 'am']]
    docs3 = [['humafn', 'humasn', 'am1']]

    Tf_idf.add_document(docs)
    Tf_idf.add_document(docs1)
    Tf_idf.Saverelatedwords()
    Tf_idf.add_document(docs2)
    Tf_idf.add_document(docs3)
    Tf_idf.SaveCorpusdic()
    Tf_idf.loaddictionary()
    Tf_idf.buildmodel()
    #Tf_idf.listnhighIdfs(10)
    Tf_idf.getTF_IDF()
Example #12
from tfidf import TfIdf
import unittest
from operator import itemgetter

table = TfIdf()
table.add_document("oqueE", [
    "Business", "Model", "Generation", "ou", "simplesmente", "Canvas", "É",
    "uma", "metodologia", "criada", "em", "meados", "dos", "anos", "2000",
    "pelo", "Suíço", "Alex", "Osterwalder", "durante", "sua", "Tese", "de",
    "Doutorado", "na", "prestigiada", "HEC", "Lausanne", ",e", "Yves",
    "Pigneur.", "O", "Canvas", "é", "um", "esquema", "visual", "que",
    "possibilita", "as", "pessoas", "cocriarem", "modelos", "de", "negócios",
    "analisando", "9", "elementos", "que", "toda", "empresa", "ou",
    "organização", "possuem:", "proposta", "de", "valor", "parcerias",
    "chaves", ",atividades", "chaves", ",recursos", "chaves",
    ",relacionamento", "com", "clientes", ",segmentos", "de", "clientes",
    ",canais", "de", "distribuição", ",estrutura", "de", "custos", "e",
    "fluxo", "de", "receitas", "(HSM", ",2017", "organização", "do",
    "empreendedor", "de", "seus", "concorrentes", "ou", "qual", "quer",
    "outra", "empresa", "Conforme", "Osterwalder", "e", "Pigneur", "o",
    "conceito", "Canvas", "já", "foi", "aplicado", "e", "testado", "em",
    "todo", "o", "mundo", "e", "já", "é", "utilizado", "por", "grandes",
    "organizações", "como", "IBM", "Ericsson", "Deloitte", "Public", "Works",
    "o", "governo", "do", "Canadá", "entre", "outras"
])

table.add_document("ondeUsar", [
    "O", "canvas", "pode", "ser", "usado", "em", "companhia", "firma", "casa",
    "negócio", "sociedade", "entidade", "estabelecimento", "instituição",
    "organização", "empregador", "parceria", "corporação", "cometimento",
    "desígnio", "empreendimento", "feito", "intento", "tentativa", "projeto",
Example #13
from tfidf import TfIdf
import unittest
from operator import itemgetter

table = TfIdf()

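# A lowercased variant of the documents from Example #12.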
table.add_document("oqueE", [
    "business", "model", "generation", "ou", "simplesmente", "canvas", "e",
    "uma", "metodologia", "criada", "em", "meados", "dos", "anos", "2000",
    "pelo", "suico", "alex", "osterwalder", "durante", "sua", "tese", "de",
    "doutorado", "na", "prestigiada", "hec", "lausanne", ",e", "yves",
    "pigneur.", "o", "canvas", "e", "um", "esquema", "visual", "que",
    "possibilita", "as", "pessoas", "cocriarem", "modelos", "de", "negócios",
    "analisando", "9", "elementos", "que", "toda", "empresa", "ou",
    "organização", "possuem:", "proposta", "de", "valor", "parcerias",
    "chaves", ",atividades", "chaves", ",recursos", "chaves",
    ",relacionamento", "com", "clientes", ",segmentos", "de", "clientes",
    "canais", "de", "distribuição", ",estrutura", "de", "custos", "e", "fluxo",
    "de", "receitas", "hsm", ",2017", "organização", "do", "empreendedor",
    "de", "seus", "concorrentes", "ou", "qual", "quer", "outra", "empresa",
    "conforme", "osterwalder", "e", "pigneur", "o", "conceito", "canvas", "ja",
    "foi", "aplicado", "e", "testado", "em", "todo", "o", "mundo", "e", "ja",
    "e", "utilizado", "por", "grandes", "organizações", "como", "ibm",
    "ericsson", "deloitte", "public", "works", "o", "governo", "do", "canada",
    "entre", "outras"
])

table.add_document("ondeUsar", [
    "O", "canvas", "pode", "ser", "usado", "em", "companhia", "firma", "casa",
    "negócio", "sociedade", "entidade", "estabelecimento", "instituição",
    "organização", "empregador", "parceria", "corporação", "cometimento",
    "desígnio", "empreendimento", "feito", "intento", "tentativa", "projeto",