Example #1
def main():
    tfidf = TfIdf(corpus_filename="moviecorpus.txt")
    # tfidf.add_document_to_corpus()
    # print(tfidf.term_freq)
    # print(tfidf.num_words)
    for line in tfidf.get_summary('oblivion.txt', 5):
        print(line)
Example #2
    def createTFIDFTopics(self):
        self.db = psycopg2.connect("dbname=%s user=%s password=%s host=%s" % (
            self.dbname, self.dbuser, self.dbpass, self.dbhost))
        c = self.db.cursor()

        headlines = {}
        c.execute(
            "SELECT article_day,country,title,url,article_hash FROM articles_headlines")
        for row in c.fetchall():
            title = row[2]
            # c.execute('SELECT content from articles where hash = ?',(row[4],))
            # content = c.fetchone()[0]

            lista = headlines.get(str(row[0])+'-'+row[1])
            if lista is None:
                # headlines[str(row[0])+'-'+row[1]] = [title + ' ' + content]
                headlines[str(row[0])+'-'+row[1]] = [title]
            else:
                # headlines[str(row[0])+'-'+row[1]].append(title + ' ' + content)
                headlines[str(row[0])+'-'+row[1]].append(title)
        self.db.close()

        for hd, contents in headlines.items():
            print(f'>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {hd}')
            with open('stopwords.txt', 'r') as st:
                tfidf = TfIdf(stopwords=[x.strip() for x in st.readlines()])
                tfidf.parse(contents)
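For reference, the headlines dict built above maps an "article_day-country" key to the list of titles collected for that group; an invented illustration of its shape (dates and titles are placeholders, not real data):

headlines = {
    '2021-03-14-US': ['First headline of the day', 'Second headline of the day'],
    '2021-03-14-FR': ['Front page headline'],
}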
Example #3
    def createTFIDFTopics(self):
        self.db = sqlite3.connect(self.dbname,
                                  detect_types=sqlite3.PARSE_DECLTYPES)
        c = self.db.cursor()

        headlines = {}
        c.execute(
            "SELECT article_day,country,title,url,article_hash FROM articles_headlines"
        )
        for row in c.fetchall():
            title = row[2]
            # c.execute('SELECT content from articles where hash = ?',(row[4],))
            # content = c.fetchone()[0]

            lista = headlines.get(str(row[0]) + '-' + row[1])
            if lista is None:
                # headlines[str(row[0])+'-'+row[1]] = [title + ' ' + content]
                headlines[str(row[0]) + '-' + row[1]] = [title]
            else:
                # headlines[str(row[0])+'-'+row[1]].append(title + ' ' + content)
                headlines[str(row[0]) + '-' + row[1]].append(title)
        self.db.close()

        for hd, contents in headlines.items():
            print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ' + hd)
            with open('stopwords.txt', 'r') as st:
                tfidf = TfIdf(stopwords=[x.strip() for x in st.readlines()])
                tfidf.parse(contents)
Example #4
    def calcularfrecuencia(self, texto, palabra=[]):
        # Returns True when the query terms in `palabra` have a non-zero
        # tf-idf similarity with `texto`, registered here as a single document.
        table = TfIdf()
        table.add_document("informacion", texto)
        resultado = table.similarities(palabra)[0][1]
        return resultado > 0.0
Example #5
    def __init__(self, sql_obj=None):
        if not sql_obj:
            self.sql = SQLQuery()
        else:
            self.sql = sql_obj

        self.tfidf_obj = TfIdf()
        self.ids = None
Example #6
    def test_similarity(self):
        table = TfIdf()
        table.add_document("foo", ["a", "b", "c", "d", "e", "f", "g", "h"])
        table.add_document("bar", ["a", "b", "c", "i", "j", "k"])
        table.add_document("baz", ["k", "l", "m", "n"])

        self.assertEqual(
            table.similarities(["a", "b", "c"]),
            [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]])
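This test pins down the interface most of the examples here rely on: add_document(name, tokens) plus similarities(query_tokens) returning [name, score] pairs. A minimal, self-contained sketch of that interface follows, for orientation only; it is not the library under test, and its scoring (summed tf-idf of the matching query terms) will generally not reproduce the exact values asserted above.

import math
from collections import Counter

class ToyTfIdf:
    """Illustrative stand-in for the TfIdf table used in these examples."""

    def __init__(self):
        self.documents = {}  # name -> Counter of term frequencies

    def add_document(self, name, tokens):
        self.documents[name] = Counter(tokens)

    def similarities(self, query_tokens):
        # Score each document by the summed tf-idf of the query terms it contains.
        n_docs = len(self.documents)
        results = []
        for name, counts in self.documents.items():
            doc_len = sum(counts.values())
            score = 0.0
            for term in query_tokens:
                if term not in counts:
                    continue
                df = sum(1 for c in self.documents.values() if term in c)
                tf = counts[term] / doc_len
                idf = math.log(n_docs / df) + 1.0
                score += tf * idf
            results.append([name, score])
        return results

toy = ToyTfIdf()
toy.add_document("foo", ["a", "b", "c", "d"])
toy.add_document("bar", ["a", "e"])
print(toy.similarities(["a", "b"]))  # one [name, score] pair per document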
Example #7
def test_provider():
    vocab = load_pickled('vocab.dat')
    tfidf = TfIdf(vocab, ['./data/train_pos.txt', './data/train_neg.txt'])
    lda = LdaLoader('topics.lda', 200)
    label_vectorizer = LabelVectorizer(load_pickled('labels.dat'))
    stemmer = MemoizedStemmer()

    pos_provider = TrainingSampleProvider('./data/train_pos.txt', 1, vocab,
                                          tfidf, lda, label_vectorizer,
                                          stemmer)

    return pos_provider
Example #8
def gen_extra_sentences():
    word_idf_file = 'e:/el/tmpres/demo/merge/word_idf.txt'
    tfidf = TfIdf(word_idf_file)

    mesh_id_wid_file = 'e:/el/tmpres/demo/merge/mesh_id_wid.txt'
    merged_desc_file = 'e:/el/tmpres/demo/merge/merged_descriptions.txt'
    merged_tokenized_desc_file = 'e:/el/tmpres/demo/merge/merged_descriptions_tokenized.txt'
    extra_sentence_file = 'e:/el/tmpres/demo/merge/wiki_extra_sentences.txt'

    mesh_ids = list()
    wids = list()
    fin = open(mesh_id_wid_file, 'r')
    for line in fin:
        vals = line.strip().split('\t')
        mesh_ids.append(vals[0])
        wids.append(int(vals[1]))
    fin.close()

    fin_desc = open(merged_desc_file, 'r')
    fin_token_desc = open(merged_tokenized_desc_file, 'r')
    fout = open(extra_sentence_file, 'w')
    # The description files interleave MeSH and Wikipedia lines: each loop
    # iteration consumes one MeSH line here and one Wikipedia line below.
    for idx, (mesh_id, mesh_desc, mesh_token_desc) in enumerate(
            zip(mesh_ids, fin_desc, fin_token_desc)):
        mesh_token_desc = mesh_token_desc.strip()
        mesh_desc_words = mesh_token_desc.split(' ')
        mesh_sentence_ends = find_sentence_ends(mesh_desc_words)

        wiki_desc = next(fin_desc).strip()
        wiki_token_desc = next(fin_token_desc).strip()
        wiki_desc_words = wiki_token_desc.split(' ')
        wiki_sentence_ends = find_sentence_ends(wiki_desc_words)

        extra_sentence_indices = get_sentences_to_add(mesh_desc_words,
                                                      mesh_sentence_ends,
                                                      wiki_desc_words,
                                                      wiki_sentence_ends,
                                                      tfidf)

        wiki_words_to_pos_list = tokenized_text_match(wiki_desc,
                                                      wiki_desc_words)
        original_sentences = get_original_sentences(wiki_desc,
                                                    wiki_words_to_pos_list,
                                                    wiki_sentence_ends)
        fout.write('%s\t%d\n' % (mesh_id, len(extra_sentence_indices)))
        for j in extra_sentence_indices:
            fout.write('%s\n' % original_sentences[j])

        # if idx == 10000:
        #     break
    fin_desc.close()
    fin_token_desc.close()
    fout.close()
Example #9
    def build_tfidf_model(self, files):
        '''
        It builds the Tf-Idf model
        :param files: List of files of the corpora
        :return: A Tf-Idf object with the model loaded
        '''
        tfidf = TfIdf()
        for file_path in files:
            with open(file_path) as f:
                doc_name = file_path.split('/')[-1]
                doc_text = f.readline().split()
                tfidf.add_document(doc_name, doc_text)
        return tfidf
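A hypothetical call, assuming indexer is an instance of the class this method belongs to and that each corpus file holds one whitespace-tokenized document on its first line (the paths are placeholders):

files = ['corpus/doc_a.txt', 'corpus/doc_b.txt']  # placeholder paths
model = indexer.build_tfidf_model(files)
# model now contains one document per file, keyed by the file's base name.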
Example #10
def main(args):
    summarizer = {
        'tfidf': TfIdf(),
        'cluster': Cluster(),
        'svd': SVD(),
        'pagerank': PageRank()
    }[args['alg']]

    summarizer.initialize(args['tf'], args['df'])
    summary = summarizer.summarize(args['doc'])

    for s in summary:
        print(s, end=' ')  # keep the summary sentences on one line
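The shape of the args mapping can be read off the lookups above; a hypothetical invocation, where only the keys and the four algorithm names come from the code and the file paths are placeholders:

main({
    'alg': 'tfidf',          # or 'cluster', 'svd', 'pagerank'
    'tf': 'term_freqs.txt',  # placeholder: term-frequency statistics file
    'df': 'doc_freqs.txt',   # placeholder: document-frequency statistics file
    'doc': 'article.txt',    # placeholder: document to summarize
})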
Example #11
File: test.py Project: mvj3/tfidf
    def test_tfidf(self):
        clean_tmp()

        t = TfIdf(self.data, root_dir)
        self.assertTrue(t.idf_cache['I'] < t.idf_cache['hello'])
        self.assertTrue(t.idf_cache['I'] < t.idf_cache['You'])
        self.assertTrue(t.idf_cache['I'] < t.idf_cache['not exist feature'],
                        "test default idf_default_val")

        result1 = t.tfidf_in_a_doc(self.data[1])
        self.assertTrue(result1['I'] < result1['You'])
        self.assertTrue(result1['You'] < result1['hello'])
        self.assertTrue(result1['hello'] == result1['world'])

        clean_tmp()
Example #12
def tfidf_matrix(text_generator):
    """Builds a tf-idf matrix from the documents yielded by text_generator."""

    ti = TfIdf()
    # Materialize the generator: the documents are needed twice, once to build
    # the index and once to fill the matrix.
    documents = list(text_generator)
    for doc in documents:
        ti.add_input_document(doc)

    A = np.zeros([ti.num_docs, len(ti.term_num_docs)])

    for i_ind, doc in enumerate(documents):
        for j_ind, score in enumerate(ti.get_tfidf(doc)):
            A[i_ind, j_ind] = score
    return A, ti
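A hypothetical call, assuming this TfIdf variant's add_input_document and get_tfidf accept plain strings (the sample texts are invented):

texts = ['the gold market', 'the silver market', 'gold and silver prices']
A, ti = tfidf_matrix(texts)
print(A.shape)  # (number of documents, number of distinct terms)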
Example #13
    def test_similarity(self):
        table = TfIdf()
        table.add_document("doc1", [
            "The", "game", "of", "life", "is", "a", "game", "of",
            "everlasting", "learning"
        ])
        table.add_document(
            "doc2",
            ["The", "unexamined", "life", "is", "not", "worth", "living"])
        table.add_document("doc3", ["Never", "stop", "learning"])

        table.calculate_tf()
        table.calculate_idf()
        table.calculate_tf_idf()
        """self.assertEqual(
            table.similarities(["life","learning"]),
            [["foo", 1.0], ["bar", 0.707106781], ["baz", 0.707106781]])"""

        print(table.similarities(["life", "learning"]))
Example #14
    def calculateTFIDFofNew(self, inputTitle, inputBody):
        title = self.textToWordsArray(inputTitle)
        sentences = self.textArrayToWordsArray(inputBody)
        if len(sentences) < 1:
            return []

        table = TfIdf()
        for i in range(0, len(sentences)):
            table.add_document("sentences" + str(i), sentences[i])

        result = []
        similarities = table.similarities(title)
        for similarity in similarities:
            result.append(similarity[1])

        # Pad with zeros so at least five similarity values are returned.
        resLen = len(result)
        for i in range(resLen, 5):
            result.append(0)
        return result
Example #15
def train_setup(vocab_file, pos_file, neg_file, cluster_labels_file,
                validation_file):
    vocab = load_pickled(vocab_file)
    tfidf = TfIdf(vocab, [pos_file, neg_file])
    label_vectorizer = LabelVectorizer(load_pickled(cluster_labels_file))
    stemmer = MemoizedStemmer()

    pos_provider = TrainingSampleProvider(pos_file, 1, vocab, tfidf,
                                          label_vectorizer, stemmer)
    neg_provider = TrainingSampleProvider(neg_file, -1, vocab, tfidf,
                                          label_vectorizer, stemmer)

    merged = SampleMerger(pos_provider, neg_provider)

    validation_provider = ValidationSampleProvider(validation_file, None,
                                                   vocab, tfidf,
                                                   label_vectorizer, stemmer)

    return merged, validation_provider
Example #16
def train_setup():
    vocab = load_pickled('vocab.dat')
    tfidf = TfIdf(vocab, ['./data/train_pos.txt', './data/train_neg.txt'])
    lda = LdaLoader('topics.lda', 200)
    label_vectorizer = LabelVectorizer(load_pickled('labels.dat'))
    stemmer = MemoizedStemmer()

    pos_provider = TrainingSampleProvider('./data/train_pos.txt', 1, vocab,
                                          tfidf, lda, label_vectorizer,
                                          stemmer)
    neg_provider = TrainingSampleProvider('./data/train_neg.txt', -1, vocab,
                                          tfidf, lda, label_vectorizer,
                                          stemmer)

    merged = SampleMerger(pos_provider, neg_provider)

    validation_provider = ValidationSampleProvider('./data/test_data.txt',
                                                   None, vocab, tfidf, lda,
                                                   label_vectorizer, stemmer)

    return merged, validation_provider
Example #17
    class CleanedTfIdf(TfIdf):
        """Overrides the default TfIdf class to only produce correctly spelled
        English-language words. Instantiated here to avoid an nltk dependency
        unless needed."""
        def get_tokens(self, str):
            """Overrides the default tokenizer to use only correctly spelled
            English-language words.
            Break a string into tokens, preserving URL tags as an entire token.
            This implementation does not preserve case.
            """
            raw_tokens = re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", str.lower())
            return [t for t in raw_tokens if wordnet.synsets(t)]

    comment_model = CleanedTfIdf(stopword_filename="stopwords.txt",
                                 DEFAULT_IDF=None)
else:
    comment_model = TfIdf(stopword_filename="stopwords.txt", DEFAULT_IDF=None)

if "--in-mem" in args or "-m" in args:
    #pop the arg
    if "--in-mem" in args:
        args.pop(args.index("--in-mem"))
    if "-m" in args:
        args.pop(args.index("-m"))
    print("shifting db to memory")
    # Read database to tempfile
    tempfile = StringIO.StringIO()
    for line in conn.iterdump():
        tempfile.write(u'{0}\n'.format(line))
    conn.close()
    tempfile.seek(0)
Example #18
    def __init__(self):
        self.tfidf = TfIdf()
Example #19
except IndexError:
    save_file = 'pickled_tfidf.pickle'

print "saving to ", save_file

try:
  with open(save_file) as rh:
    top_100 = cPickle.load(rh)
except IOError:
  top_100 = {}
print "proceeding with", len(top_100), "previous tfidf docs"

with open(save_file, 'w') as wh:
    wh.write('0\n')

comment_model = TfIdf(corpus_filename="idf_model_filteredsorted.txt", stopword_filename="curated_stopwords.txt", 
                        DEFAULT_IDF=0.0000001) #if not in idf model, give very low score, since model is filtered

#find the number of beers for progress indication
c.execute("SELECT id from beer")
total_beers = len(list(c.fetchall()))
print "calculating tfidf of ", total_beers, "beers."

c.execute("SELECT id, name FROM beer")
idx = 0  # don't want to unwrap the generator so we'll index this way
worked = 0
for beer_id, name in c.fetchall():
    if idx % 1000 == 0:
        print """*-*-*-* Finished {0:.1%} of the processing.""".format(float(idx)/total_beers)
        with open(save_file, 'w') as wh:
            cPickle.dump(top_100, wh)
    idx += 1
Example #20
from tfidf import TfIdf
import pandas as pd
corpuspath = '/Users/goksukara/Desktop/Projects/EclipseWorkspace/Specilization/PhytonCode/Data/'

if __name__ == "__main__":
    Tf_idf = TfIdf(corpuspath + 'Gensim_output')
    Tf_idf.loaddictionary()
    Tf_idf.buildmodel()
    Tf_idf.saveModel()
    Tf_idf.getTF_IDF()
    #print(Tf_idf.corpus_dict)
    #Tf_idf.listnhighIdfs(4)
Example #21
    def __init__(self):
        pio.renderers.default = 'browser'
        tfidf = TfIdf()
        self.ids, self.titles, self.matrix = tfidf.get_matrix()
        self.vectorizer = tfidf.get_vectorizer()
Example #22
def menu():
    print("Que deseja fazer?")
    print("1 - Consultar a informação do site do jornal ABola")
    print("2 - Aplicar o algoritmo do TFIDF")
    print("3 - Sair")
    word = 0
    nword = 0
    narray = []
    j = 0
    for line in fileinput.input():
        if line.replace("\n", "") == "1":
            os.system('python3 web_scraper.py')
            print("Que deseja fazer?")
            print("1 - Consultar a informação do site do jornal ABola")
            print("2 - Aplicar o algoritmo do TFIDF")
            print("3 - Sair")
        elif (line.replace("\n", "") == "2") or (word > 0):
            if word == 0 and j == 0:
                if not os.path.isdir("artigos"):
                    print(
                        'Necessita de gerar primeiro o conteúdo. Escolha a opção 1'
                    )
                    print("Que deseja fazer?")
                    print("1 - Consultar a informação do site do jornal ABola")
                    print("2 - Aplicar o algoritmo do TFIDF")
                    print("3 - Sair")
                else:
                    filesA = os.listdir('artigos')
                    table = TfIdf()
                    for i in filesA:
                        with open('artigos/{}'.format(i), 'r') as content:
                            #print(content.read().split('h2'))
                            val = content.read().split('h2')
                            firstVal = val[0]
                            secondVal = val[1]
                            table.add_document(
                                'title{}'.format(i),
                                re.sub(r'[\W]', ' ', firstVal).lower().split())
                            table.add_document(
                                'text{}'.format(i),
                                re.sub(r'[\W]', ' ',
                                       secondVal).lower().split())
                    word += 1
                    print('Indique quantas palavras quer comparar:')
            elif (word == 1) and (j == 0):
                if (line.replace("\n", "").isnumeric() and int(line) > 1):
                    nword = int(line)
                    word += 1
                else:
                    print('Digite um número maior que 1')
            elif (word > 1) and (word <= nword) and (j == 0):
                if (line.replace("\n", "") != ''):
                    narray.append(line.replace("\n", "").lower())
                    word += 1
            else:
                j = 1
                if (j == 1):
                    if line.replace("\n", "") != '':
                        narray.append(line.replace("\n", "").lower())
                        j += 1
                if (j == 2):
                    print(narray)
                    fTDIDF = open('output' + narray[0] + '.html', 'w+')
                    fTDIDF.write(
                        '<h2>Resultados da aplicação do algoritmo:<h2>')
                    splitArray = {}
                    for s in table.similarities(narray):
                        if s[0].startswith('title'):
                            s[0] = s[0].replace('title', '')
                            if s[0] in splitArray.keys():
                                splitArray[s[0]] += s[1] * 0.7
                            else:
                                splitArray[s[0]] = s[1] * 0.7
                        elif s[0].startswith('text'):
                            s[0] = s[0].replace('text', '')
                            if s[0] in splitArray.keys():
                                splitArray[s[0]] += s[1] * 0.3
                            else:
                                splitArray[s[0]] = s[1] * 0.3

                    for elem in splitArray.keys():
                        fTDIDF.write(
                            '<p><h5><a href="artigos/{}" >'.format(elem) +
                            elem + '</a> -> ' + str(splitArray[elem]) +
                            '</h5></p>')
                    fTDIDF.close()

                    new = 2  # open in a new tab, if possible
                    url = "file:///home/ze/SPLN/WebScraper/output" + narray[
                        0] + ".html"
                    webbrowser.open(url, new=new)
                    word = 0
                    nword = 0
                    narray = []
                    print("Que deseja fazer?")
                    print("1 - Consultar a informação do site do jornal ABola")
                    print("2 - Aplicar o algoritmo do TFIDF")
                    print("3 - Sair")
        elif (line.replace("\n", "") == "3") and (word == 0):
            print("Obrigado pela sua visita")
            fileinput.close()
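The scoring rule buried in the branch above gives the title similarity a 0.7 weight and the body similarity a 0.3 weight per article; the same rule as a standalone helper (hypothetical, not part of the original script):

def combined_score(title_similarity, text_similarity):
    # Title matches count for 70% of the article's score, body matches for 30%.
    return 0.7 * title_similarity + 0.3 * text_similarity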
Example #23
    def idf_cache(self):
        """Training and test corpora are all included here."""
        result = TfIdf(self.documents_with_segments, self.cache_dir).idf_cache
        self.idf_file = result  # use our own IDF for FeaturesWeight
        return result
Example #24
# ]

q = 'gold'

dictOf = {i: document[i] for i in range(0, len(document))}

print('+----------------------------------+')
print('documents : ')
pprint.pprint(dictOf)
print('')
print('query :' + q)
print('+----------------------------------+')

print('')
print('Pembobotan TF-IDF')
tfidf = TfIdf().transform(q=q, document=document)
print("Bobot rata-rata: " + str(tfidf.weight_average()))
pprint.pprint(tfidf.get_weight())
print("+---------------------------------+")

print('')
print('Pembobotan W-IDF')
widf = WIdf().transform(q=q, document=document)
print("Bobot rata-rata: " + str(widf.weight_average()))
pprint.pprint(widf.get_weight())
print("+---------------------------------+")

print('')
print('Pembobotan TFRF')
tfrf = TFRF().transform(q=q, document=document)
print("Bobot rata-rata: " + str(tfrf.weight_average()))
Example #25
except IndexError:
    if (len(results) == 0):
        print("No results were found for this query.")
        exit()
    else:
        pass

for posting in list_of_postings:
    results = set(results).intersection(posting)

if (len(important) > 0):
    for res in results:
        important.append(res)

#vectorizer = TfidfVectorizer()
table = TfIdf()
G = nx.Graph()
#return urls corresponding to numbers
with open("url_files.csv") as f:
    urls = [row for row in csv.reader(f)]

    if len(results) != 0:

        for x in results:
            f = open(urls[x - 1][0])
            obj = json.load(f)

            soup = BeautifulSoup(obj["content"],
                                 "html.parser",
                                 from_encoding="iso-8859-1")
            joinedText = [
Example #26
def extract_keywords(db, tokens):
    dfs = db.get_dfs()
    tfidf = TfIdf(dfs)
    return tfidf.new_keywords(tokens)
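A hypothetical call, where db is assumed to expose the get_dfs() document-frequency table this TfIdf variant expects and the token list is invented:

tokens = ['gold', 'price', 'market', 'gold']
keywords = extract_keywords(db, tokens)  # whatever tfidf.new_keywords returns for these tokens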
Example #27
        print("Getting data from " + url.strip() + "...", end="", flush=True)
        response = get(url=url)

        print("done!\nParsing HTML data...", end="", flush=True)
        parser.feed(response.text)
        print("done!")

        keydict = parser.get_keydict()

        urldata = {"url": url, "keywords": keydict}

        id_md5 = hashlib.md5(url.encode()).hexdigest()

        docs[id_md5] = urldata

ti = TfIdf(docs)

for kd, d in docs.items():
    print("Processing document " + kd + "...", end="", flush=True)
    for kw, t in d['keywords'].items():
        docs[kd]['keywords'][kw]['tf_idf'] = ti.tf_idf(kw, kd)
    print("done!")

fname = 'webdirectory.txt'
print("Saving to file " + fname + "...", end="", flush=True)
with open(fname, 'w') as file:
    file.write(json.dumps(docs, sort_keys=False))

print("done!\nCompleted!")
Example #28
    def setUp(self):
        self.unk_cutoff = 2
        self.vocab = TfIdf(unk_cutoff=self.unk_cutoff)
Example #29
'''
Created on 15 Jul 2018

@author: goksukara
'''
from tfidf import TfIdf

def addallfilesinpath(path):
    pass

if __name__ == "__main__":
    
    Tf_idf = TfIdf('s')
    doc_list = [['human', 'human', 'interface'], ['ship', 'human', 'interface']]
    list1 = [['ship', 'humasn', 'interface']]
    list2 = [['human', 'human', 'am']]
    list3 = [['humafn', 'humasn', 'am1']]
    # map(unicode, doc_list)
     
    Tf_idf.add_document(doc_list)
    Tf_idf.add_document(list1)
    Tf_idf.Saverelatedwords()
    Tf_idf.add_document(list2)
    Tf_idf.add_document(list3)
    Tf_idf.SaveCorpusdic()
    Tf_idf.loaddictionary()
    Tf_idf.buildmodel()
    #Tf_idf.listnhighIdfs(10)
    Tf_idf.getTF_IDF()