Example No. 1
 def get(self):
     # standardize
     keywords = self.trimmed_stopwords(self.tokenize(self.theme, pos='noun_verbs'))
     # search about theme
     articles = self.search_articles([keyword.surface for keyword in keywords][:3])
     # clean
     docs = map(self.clean, articles)
     # divide sentences
     sentences_cand = map(self.divide, docs)
     sent = []
     for s in sentences_cand:
         sent.append(list(filter(self.is_sentence, s)))
     sentences = list(chain.from_iterable(sent))
     # tfidf format
     sentence_tokens = []
     for sentence in sentences:
         noun_tokens = [token.surface for token in self.tokenize(sentence, pos='noun')]
         sentence_tokens.append(' '.join(noun_tokens))
     # vectorize
     vector = TfIdf.vector(sentence_tokens)
     # clustering
     cluster = numpy.array(TfIdf.cluster(vector, clusters=3))
     # retrieve opinion with tf
     tfidf_score_index = numpy.argsort(numpy.array([sum(v) for v in vector.toarray()]))[::-1]
     opinions = []
     for i in range(3):
         # retrieve vector index by cluster
         c_index = numpy.where(cluster == i)
         for k in tfidf_score_index:
             if k in c_index[0]:
                 opinions.append(sentences[k])
                 break
     theme = namedtuple('Theme', 'keywords, opinions')
     return theme(' '.join([keyword.surface for keyword in keywords][:3]), opinions)
Example No. 2
 def __init__(self,
              corpus_filename=None,
              stopword_filename=None,
              DEFAULT_IDF=1.5):
     TfIdf.__init__(self, corpus_filename=corpus_filename,
                    stopword_filename=stopword_filename, DEFAULT_IDF=DEFAULT_IDF)
     self.init_file_count()
Example No. 3
def main():
    # SETTINGS
    NUM_PAGES = 10000
    corpus_filename = "corpus10k.txt"
    stopwords_filename = "stopwords10k.txt"

    myTfIdf = TfIdf(corpus_filename, stopwords_filename)

    content = []
    worker_threads = []

    url = "http://en.wikipedia.org/wiki/Special:Random"

    for i in range(NUM_PAGES):
        t = threading.Thread(target=clean_html_thread, args=(url, content))
        t.start()
        worker_threads.append(t)

    for t in worker_threads:
        t.join()

    for t in worker_threads:
        if not t.is_alive():
            # get results from thread
            t.handled = True
    worker_threads = [t for t in worker_threads if not t.handled]

    for document in content:
        myTfIdf.add_input_document(document)
        print_keywords(document)

    myTfIdf.save_corpus_to_file(corpus_filename, stopwords_filename)
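Note: the loop above launches one thread per page (NUM_PAGES of them at once). Purely as a sketch, assuming the same clean_html_thread(url, content) helper used above is available, a bounded worker pool could look like this; it is not part of the original example.

def fetch_all(url, num_pages, max_workers=32):
    # Sketch only: clean_html_thread(url, content) is assumed to append its result to `content`.
    from concurrent.futures import ThreadPoolExecutor

    content = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(clean_html_thread, url, content) for _ in range(num_pages)]
        for future in futures:
            future.result()  # re-raise any exception from a worker
    return content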
Example No. 4
    def createTFIDFTopics(self):
        self.db = sqlite3.connect(self.dbname,
                                  detect_types=sqlite3.PARSE_DECLTYPES)
        c = self.db.cursor()

        headlines = {}
        c.execute(
            "SELECT article_day,country,title,url,article_hash FROM articles_headlines"
        )
        for row in c.fetchall():
            title = row[2]
            # c.execute('SELECT content from articles where hash = ?',(row[4],))
            # content = c.fetchone()[0]

            lista = headlines.get(str(row[0]) + '-' + row[1])
            if lista is None:
                # headlines[str(row[0])+'-'+row[1]] = [title + ' ' + content]
                headlines[str(row[0]) + '-' + row[1]] = [title]
            else:
                # headlines[str(row[0])+'-'+row[1]].append(title + ' ' + content)
                headlines[str(row[0]) + '-' + row[1]].append(title)
        self.db.close()

        for hd, contents in headlines.iteritems():
            print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ' + hd
            with open('stopwords.txt', 'r') as st:
                tfidf = TfIdf(stopwords=[x.strip() for x in st.readlines()])
                tfidf.parse(contents)
Example No. 5
def main():
    tfidf = TfIdf(corpus_filename="moviecorpus.txt")
    #	tfidf.add_document_to_corpus()
    #	print tfidf.term_freq
    #	print tfidf.num_words
    for line in tfidf.get_summary('oblivion.txt', 5):
        print line
Example No. 6
    def createTFIDFTopics(self):
        self.db = psycopg2.connect("dbname=%s user=%s password=%s host=%s" % (
            self.dbname, self.dbuser, self.dbpass, self.dbhost))
        c = self.db.cursor()

        headlines = {}
        c.execute(
            "SELECT article_day,country,title,url,article_hash FROM articles_headlines")
        for row in c.fetchall():
            title = row[2]
            # c.execute('SELECT content from articles where hash = ?',(row[4],))
            # content = c.fetchone()[0]

            lista = headlines.get(str(row[0])+'-'+row[1])
            if lista is None:
                # headlines[str(row[0])+'-'+row[1]] = [title + ' ' + content]
                headlines[str(row[0])+'-'+row[1]] = [title]
            else:
                # headlines[str(row[0])+'-'+row[1]].append(title + ' ' + content)
                headlines[str(row[0])+'-'+row[1]].append(title)
        self.db.close()

        for hd, contents in headlines.items():
            print(f'>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {hd}')
            with open('stopwords.txt', 'r') as st:
                tfidf = TfIdf(stopwords=[x.strip() for x in st.readlines()])
                tfidf.parse(contents)
Example No. 7
    def __init__(self, sql_obj=None):
        if not sql_obj:
            self.sql = SQLQuery()
        else:
            self.sql = sql_obj

        self.tfidf_obj = TfIdf()
        self.ids = None
Example No. 8
    def calcularfrecuencia(self, texto, palabra=None):
        # avoid a shared mutable default argument
        palabra = palabra if palabra is not None else []

        table = TfIdf()
        table.add_document("informacion", texto)
        resultado = table.similarities(palabra)[0][1]
        return resultado > 0.0
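A hypothetical call to the method above (identifiers kept in the original Spanish); `analizador` stands in for an instance of the owning class, and `palabra` is the list of query words handed to table.similarities():

matches = analizador.calcularfrecuencia(["algunos", "tokens", "del", "documento"],
                                        palabra=["documento"])
print(matches)  # True when the query shares any weighted term with the document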
Example No. 9
    def get(self):
        # standardize
        keywords = self.trimmed_stopwords(
            self.tokenize(self.opinion, pos='noun_verbs'))
        # search about opinion with keywords
        articles = self.search_articles(
            self.keywords + [keyword.surface for keyword in keywords][:3])
        # clean
        docs = map(self.clean, articles)
        # divide sentences
        sentences_cand = map(self.divide, docs)
        sent = []
        for s in sentences_cand:
            sent.append(list(filter(self.is_sentence, s)))
        sentences = list(chain.from_iterable(sent))
        # tfidf format
        sentence_tokens = []
        for sentence in sentences:
            noun_tokens = [
                token.surface for token in self.tokenize(sentence, pos='noun')
            ]
            sentence_tokens.append(' '.join(noun_tokens))
        # vectorize
        vector = TfIdf.vector(sentence_tokens)
        # clustering
        cluster = numpy.array(TfIdf.cluster(vector, clusters=3))
        # retrieve opinion with tf
        tfidf_score = numpy.array([sum(v) for v in vector.toarray()])
        # retrieve opinion with senti
        # senti_score = numpy.array([self.senti(s) for s in sentences])
        senti_score = []
        # for s in sentences:
        #     senti_score.append(self.senti(s))

        for sentence in sentences:
            senti_tokens = [
                token.surface for token in self.tokenize(sentence, pos='senti')
            ]
            senti_score.append(self.senti(senti_tokens))

        senti_score = numpy.array(senti_score)
        score_index = numpy.argsort(tfidf_score * senti_score)
        positives = []
        negatives = []
        for i in range(3):
            # retrieve vector index by cluster
            c_index = numpy.where(cluster == i)
            for k in score_index:
                if k in c_index[0]:
                    negatives.append(sentences[k])
                    break
            for k in score_index[::-1]:
                if k in c_index[0]:
                    positives.append(sentences[k])
                    break
        opinion = namedtuple('Opinion', 'positives, negatives')
        return opinion(positives, negatives)
Example No. 10
def save_tfidf_like(parl_counter,sort_tfidf_like, counter_list,tot_counter,counter_list_parl):
    dic = dict(sort_tfidf_like)
    f = open(dir_out+"tfidf_like_parametros.csv", 'w')
    f.write("palavra;valor;frequencia;entropia maxima;entropia da palavra;prob_politica;entropia entre deputados\n")
    for word in parl_counter:
        f.write(word+";"+str(dic[word])+";"+ '%.4f'%(TfIdf.tf(word,parl_counter))+";"+
             '%.4f'%(math.log2(len(counter_list)))+";"+ '%.4f'%(TfIdf.entropy(word,tot_counter,counter_list))+";"+
             '%.4f'%(TfIdf.parl_prob(word,parl_counter,counter_list))+";"+ '%.4f'%(TfIdf.parl_entropy(word, tot_counter, counter_list_parl))+"\n")
    f.close()
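The same file could be written with csv.writer instead of manual string concatenation. A sketch only, assuming the TfIdf helpers (tf, entropy, parl_prob, parl_entropy) behave exactly as they are called above:

import csv
import math

def save_tfidf_like_csv(path, parl_counter, sort_tfidf_like, counter_list, tot_counter, counter_list_parl):
    # Same columns and ';' delimiter as the original writer above.
    dic = dict(sort_tfidf_like)
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(["palavra", "valor", "frequencia", "entropia maxima",
                         "entropia da palavra", "prob_politica", "entropia entre deputados"])
        for word in parl_counter:
            writer.writerow([word, dic[word],
                             '%.4f' % TfIdf.tf(word, parl_counter),
                             '%.4f' % math.log2(len(counter_list)),
                             '%.4f' % TfIdf.entropy(word, tot_counter, counter_list),
                             '%.4f' % TfIdf.parl_prob(word, parl_counter, counter_list),
                             '%.4f' % TfIdf.parl_entropy(word, tot_counter, counter_list_parl)])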
Example No. 11
class SearchEngine:
    def __init__(self):
        self.tfidf = TfIdf()

    def load_documents(self, documents):
        for doc in documents:
            name = doc.name
            text = self.doc_to_text(doc)
            words = self.text_to_word_array(text)

            self.tfidf.add_document(name, words)

    def query(self, query):
        return self.tfidf.similarities(query)

    def doc_to_text(self, doc):
        parser = AKParser()
        tree = parser.parse(doc)

        text = ''

        q = deque()
        q.append(tree)

        while q:
            node = q.pop()

            if node.tag:
                if 'link' == node.tag.lower():
                    val = node.children[0].value.split('|')[0]
                    text += f' {val} '
                    continue

            if node.value:
                text += f' {node.value} '

            children = node.children

            if children:
                for c in children:
                    q.append(c)

        return re.sub(' +', ' ', text)

    def text_to_word_array(self, text):
        regex = re.compile(r'[^a-zA-Z\s]')

        text = regex.sub('', text)
        text = re.sub(' +', ' ', text)

        return text.lower().split()
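A hypothetical usage of the SearchEngine class above; `documents` is assumed to be an iterable of objects exposing .name and the structure AKParser expects, and similarities() is assumed to return [name, score] pairs as in the test example that follows.

engine = SearchEngine()
engine.load_documents(documents)  # `documents` is a placeholder for the caller's parsed documents
for name, score in engine.query(["tfidf", "ranking"]):
    print(name, score)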
Example No. 12
    def test_similarity(self):
        table = TfIdf()
        table.add_document("foo", ["a", "b", "c", "d", "e", "f", "g", "h"])
        table.add_document("bar", ["a", "b", "c", "i", "j", "k"])
        table.add_document("baz", ["k", "l", "m", "n"])

        self.assertEqual(
            table.similarities(["a", "b", "c"]),
            [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]])
Example No. 13
 def build_tfidf_model(self, files):
     '''
     It builds the Tf-Idf model
     :param files: List of files of the corpora
     :return: A Tf-Idf object with the model loaded
     '''
     tfidf = TfIdf()
     for file_path in files:
         with open(file_path) as f:
             doc_name = file_path.split('/')[-1]
             doc_text = f.readline().split()
             tfidf.add_document(doc_name, doc_text)
     return tfidf
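A hypothetical follow-up query against the returned model, assuming the same add_document()/similarities() API exercised elsewhere on this page; the file paths and `builder` instance are placeholders.

# `builder` stands in for an instance of the class that defines build_tfidf_model().
model = builder.build_tfidf_model(["corpora/doc_a.txt", "corpora/doc_b.txt"])
for doc_name, score in model.similarities(["term", "frequency"]):
    print(doc_name, score)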
Example No. 14
File: test.py Project: mvj3/tfidf
    def test_tfidf(self):
        clean_tmp()

        t = TfIdf(self.data, root_dir)
        self.assertTrue(t.idf_cache['I'] < t.idf_cache['hello'])
        self.assertTrue(t.idf_cache['I'] < t.idf_cache['You'])
        self.assertTrue(t.idf_cache['I'] < t.idf_cache['not exist feature'],
                        "test default idf_default_val")

        result1 = t.tfidf_in_a_doc(self.data[1])
        self.assertTrue(result1['I'] < result1['You'])
        self.assertTrue(result1['You'] < result1['hello'])
        self.assertTrue(result1['hello'] == result1['world'])

        clean_tmp()
Example No. 15
    def postprocess_query(self, query):
        scores = sorted([(TfIdf.similarity(self, query, document), did)
                         for did, document in self.documents], reverse=True)

        for _, did in scores[:PseudoFeedback.__num_expansions__]:
            query.union(self.documents[did])
        return query
Example No. 16
def tfidf_month(tw_month,random_list):
    tweets = list(itertools.chain.from_iterable(itertools.chain.from_iterable(tw_month)))
    tot_counter = Counter(tweets)
    dep_counts = list()
    for dep in tw_month:
        tw = list(itertools.chain.from_iterable(dep))
        print(tw)
        dep_counts.append(Counter(tw))
    docs_counter = docs_counters(random_list,tot_counter)
    tfidf = TfIdf()
    tfidf_like = list()
    for word in tot_counter:
        tfidf_like.append(tfidf.tf(word,tot_counter)*tfidf.idf_like(word,tot_counter,tot_counter,docs_counter, dep_counts))
    sort_tfidf_like = list(zip(tot_counter.keys(), tfidf_like))
    sort_tfidf_like = sorted(sort_tfidf_like, key=lambda x: x[1], reverse=True)
    return sort_tfidf_like
Example No. 17
 def __init__(self):
     self.query_expander = QueryExpander()
     self.query_expander.loadKeywords(self.__directories['Keywords'])
     #inits window and connects delete event
     print gtk.pygtk_version
     self.is_query_expanding_active = True
     self.tfidf = TfIdf(self.__directories['Documents'], self.__directories['Keywords'])
     print 'Dokument', self.tfidf.print_documents()
     self.window = gtk.Window(gtk.WINDOW_TOPLEVEL)
     self.window.set_title("Czesc Milosz!")
     self.window.connect("delete_event", self.delete_event)
     self.window.set_default_size(600,500)
     self.window.set_border_width(5)
     
     #prepare layout
     self.box1 = gtk.VBox(False, 5)
     self.window.add(self.box1)
     document_view = self.get_document_textarea_layout()
     self.box1.pack_start(self.get_menu_box(), False, False, 0)
     self.box1.pack_start(self.get_search_panel_layout(), False, False, 0)
     self.box1.pack_start(self.get_query_expander_view(), True, True, 0)
     self.box1.pack_start(self.get_result_layaut(), True, True, 0)
     self.box1.pack_start(document_view, True, True, 0)
     self.box1.pack_start(self.get_keywords_layout(), True, True, 0)
     self.show_keywords()
     #self.tfidf.print_stemmed_keywords()
     self.window.show_all()
Example No. 18
def initialisation():
    pm = Parsemail()  # what was the mechanism for overriding methods again?? overloading, constructors
    listeDmail, listeDmailRaci = pm.parsemail()

    fi = FichierIverse()
    dicoInv, nbmotsdocs, nbmotCorpus = fi.fichInv(listeDmail)
    dicoInvR, nbmotsdocsR, nbmotCorpusR = fi.fichInv(listeDmailRaci)

    ti = TfIdf(False, dicoInv, nbmotsdocs, nbmotCorpus)
    ti.calcul()
    ti.serialisation()
    ti2 = TfIdf(True, dicoInvR, nbmotsdocsR, nbmotCorpusR)  # True allows concatenation with the previous dictionary
    ti2.calcul()
    ti2.serialisation()
Example No. 20
    def calculateTFIDFofNew(self, inputTitle, inputBody):
        title = self.textToWordsArray(inputTitle)
        sentences = self.textArrayToWordsArray(inputBody)
        if len(sentences) < 1:
            return []

        table = TfIdf()
        for i in range(0, len(sentences)):
            table.add_document("sentences" + str(i), sentences[i])

        result = []
        similarities = table.similarities(title)
        for similarity in similarities:
            result.append(similarity[1])

        resLen = len(result)
        for i in range(resLen, 5):
            result.append(0)
        return result
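A hypothetical call to the method above; textToWordsArray and textArrayToWordsArray are the class's own helpers, and `extractor` is a placeholder instance. The return value is a list of title-to-sentence similarity scores, zero-padded to at least five entries.

features = extractor.calculateTFIDFofNew(
    "Sample headline", "First sentence of the body. Second sentence of the body.")
print(features)  # at least five similarity scores, padded with zeros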
Example No. 21
def tfidf_matrix(text_generator):
    """Builds tf-idf matrix from records from fname, using fields to create a text describing them
    
    """

    ti = TfIdf()
    #print "building tfidf indices"
    for i in text_generator:
        ti.add_input_document(i)

    A = np.zeros([ti.num_docs, len(ti.term_num_docs)])

    for i_ind, i in enumerate(text_generator):
        #print "-i_ind, i:", i_ind, i
        for j_ind, j in enumerate(ti.get_tfidf(i)):
            #print "-----j_ind, j:", j_ind, j
            A[i_ind, j_ind] = j
    #print ti.term_num_docs
    return A, ti
Example No. 23
def test_provider():
    vocab = load_pickled('vocab.dat')
    tfidf = TfIdf(vocab, ['./data/train_pos.txt', './data/train_neg.txt'])
    lda = LdaLoader('topics.lda', 200)
    label_vectorizer = LabelVectorizer(load_pickled('labels.dat'))
    stemmer = MemoizedStemmer()

    pos_provider = TrainingSampleProvider('./data/train_pos.txt', 1, vocab,
                                          tfidf, lda, label_vectorizer,
                                          stemmer)

    return pos_provider
Example No. 24
def gen_extra_sentences():
    word_idf_file = 'e:/el/tmpres/demo/merge/word_idf.txt'
    tfidf = TfIdf(word_idf_file)

    mesh_id_wid_file = 'e:/el/tmpres/demo/merge/mesh_id_wid.txt'
    merged_desc_file = 'e:/el/tmpres/demo/merge/merged_descriptions.txt'
    merged_tokenized_desc_file = 'e:/el/tmpres/demo/merge/merged_descriptions_tokenized.txt'
    extra_sentence_file = 'e:/el/tmpres/demo/merge/wiki_extra_sentences.txt'

    mesh_ids = list()
    wids = list()
    fin = open(mesh_id_wid_file, 'rb')
    for line in fin:
        vals = line.strip().split('\t')
        mesh_ids.append(vals[0])
        wids.append(int(vals[1]))
    fin.close()

    fin_desc = open(merged_desc_file, 'rb')
    fin_token_desc = open(merged_tokenized_desc_file, 'rb')
    fout = open(extra_sentence_file, 'wb')
    for idx, (mesh_id, mesh_desc, mesh_token_desc) in enumerate(
            izip(mesh_ids, fin_desc, fin_token_desc)):
        mesh_token_desc = mesh_token_desc.strip()
        mesh_desc_words = mesh_token_desc.split(' ')
        mesh_sentence_ends = find_sentence_ends(mesh_desc_words)

        wiki_desc = fin_desc.next().strip()
        wiki_token_desc = fin_token_desc.next().strip()
        wiki_desc_words = wiki_token_desc.split(' ')
        wiki_sentence_ends = find_sentence_ends(wiki_desc_words)

        extra_sentence_indices = get_sentences_to_add(mesh_desc_words,
                                                      mesh_sentence_ends,
                                                      wiki_desc_words,
                                                      wiki_sentence_ends,
                                                      tfidf)

        wiki_words_to_pos_list = tokenized_text_match(wiki_desc,
                                                      wiki_desc_words)
        original_sentences = get_original_sentences(wiki_desc,
                                                    wiki_words_to_pos_list,
                                                    wiki_sentence_ends)
        fout.write('%s\t%d\n' % (mesh_id, len(extra_sentence_indices)))
        for j in extra_sentence_indices:
            fout.write('%s\n' % original_sentences[j])

        # if idx == 10000:
        #     break
    fin_desc.close()
    fin_token_desc.close()
    fout.close()
Example No. 25
def main(args):
    summarizer = {
        'tfidf': TfIdf(),
        'cluster': Cluster(),
        'svd': SVD(),
        'pagerank': PageRank()
    }[args['alg']]

    summarizer.initialize(args['tf'], args['df'])
    summary = summarizer.summarize(args['doc'])

    for s in summary:
        print(s, end=' ')
Example No. 26
def callback(ch, method, properties, body):
    global tweet_obj, done, starttime, skipped, processed, systime
    json_tweet = json.loads(body)
    msg = json_tweet['sanitized_text']
    ts = int(json_tweet['timestamp'])
    msgid = int(json_tweet['id'])
    uid = int(json_tweet['user']['id'])
    tweet_obj = Tweet(msg, ts, msgid, uid)
    if utils.qualified(tweet_obj, TAGS, IGNORE, MIN_TOKENS):
        incr = TfIdf.getVals(tweet_obj)
        # print tweet.getVector()
        buckets.updateRndVec(incr)
        closeBuck = getClosestNeighborBuckets(tweet_obj)
        print(msg)
        if closeBuck[0] is not None:
            print("CLOSE BUCK: {0}, {1}".format(closeBuck[0].msg,
                                                closeBuck[1]))
        closeRecent = getClosestNeighborRecent(tweet_obj, closeBuck[1])
        closeoverall = decideClosest(closeBuck, closeRecent)
        other = closeoverall[0]
        if other:
            json_tweet['nearneigh'] = other.msgid
        else:
            json_tweet['nearneigh'] = -1
        json_tweet['cossim'] = closeoverall[1]
        channel.basic_publish(
            exchange='',
            routing_key='FYP.Q.GetStories.ClusteredTweetMessage',
            body=json.dumps(json_tweet),
            properties=pika.BasicProperties(
                delivery_mode=2,  # make message persistent
            ))
        # sys.stdout.write(json.dumps(t) + '\n')
    else:
        print('skipped')
        skipped += 1
    done += 1
    current = int(ts) / 1000
    if current - starttime > 900:
        aftertime = datetime.now()
        delta = aftertime - systime
        systime = aftertime
        dt = divmod(delta.seconds, 60)
        sys.stderr.write(
            str(done) + ' Tweets done in ' + str(dt[0]) + ' min ' +
            str(dt[1]) + ' sec.\n')
        starttime = current

    processed += 1
    if processed % 100 == 0:
        print('{0} tweets processed'.format(processed))
Example No. 27
    def process_texts(self):
        relevant_words = []
        path = os.path.join('data', 'wiki')
        file_names = os.listdir(path)
        documents = []
        for file_name in file_names:
            file_path = os.path.join(path, file_name)
            f = open(file_path)
            documents.append((file_name, TextBlob(str.decode(f.read(), 'UTF-8', 'ignore'))))
            f.close()

        tfidf = TfIdf(documents)
        for file_name, document in documents:
            print file_name
            scores = {word: tfidf.compute_tfidf(word, document) for word in document.words}
            selected_scores = {}
            for word in scores:
                similars = sorted(self.get_similar(scores.keys(), word))
                selected_scores[similars[-1]] = scores[word]
            sorted_words = sorted(selected_scores.items(), key=lambda x: x[1], reverse=True)
            for word, score in sorted_words[:10]:
                if word not in relevant_words:
                    relevant_words.append(word)
        return set(relevant_words)
Example No. 28
    def open_file(self, widget,  name):
        text = "Select {0} Source File".format(name)
        filechooserdialog = gtk.FileChooserDialog(text, None, gtk.FILE_CHOOSER_ACTION_OPEN, (gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL, gtk.STOCK_OK, gtk.RESPONSE_OK))
        response = filechooserdialog.run()

        if response == gtk.RESPONSE_OK:
            self.__directories[name] = filechooserdialog.get_filename()
            self.tfidf = TfIdf(self.__directories['Documents'], self.__directories['Keywords'])
            self.query_expander = QueryExpander()
            self.query_expander.loadKeywords(self.__directories['Keywords'])
            print "directories"
            print self.__directories
            #self.tfidf.print_stemmed_keywords()
            self.show_keywords()

        filechooserdialog.destroy()
Example No. 29
def callback(ch, method, properties, body):
    global tweet_obj, done, starttime, skipped, processed, systime
    json_tweet = json.loads(body)
    msg = json_tweet["sanitized_text"]
    ts = int(json_tweet["timestamp"])
    msgid = int(json_tweet["id"])
    uid = int(json_tweet["user"]["id"])
    tweet_obj = Tweet(msg, ts, msgid, uid)
    if utils.qualified(tweet_obj, TAGS, IGNORE, MIN_TOKENS):
        incr = TfIdf.getVals(tweet_obj)
        # print tweet.getVector()
        buckets.updateRndVec(incr)
        closeBuck = getClosestNeighborBuckets(tweet_obj)
        print msg
        if closeBuck[0] is not None:
            print "CLOSE BUCK: {0}, {1}".format(closeBuck[0].msg, closeBuck[1])
        closeRecent = getClosestNeighborRecent(tweet_obj, closeBuck[1])
        closeoverall = decideClosest(closeBuck, closeRecent)
        other = closeoverall[0]
        if other:
            json_tweet["nearneigh"] = other.msgid
        else:
            json_tweet["nearneigh"] = -1
        json_tweet["cossim"] = closeoverall[1]
        channel.basic_publish(
            exchange="",
            routing_key="FYP.Q.GetStories.ClusteredTweetMessage",
            body=json.dumps(json_tweet),
            properties=pika.BasicProperties(delivery_mode=2),  # make message persistent
        )
        # sys.stdout.write(json.dumps(t) + '\n')
    else:
        print "skipped"
        skipped += 1
    done += 1
    current = int(ts) / 1000
    if current - starttime > 900:
        aftertime = datetime.now()
        delta = aftertime - systime
        systime = aftertime
        dt = divmod(delta.seconds, 60)
        sys.stderr.write(str(done) + " Tweets done in " + str(dt[0]) + " min " + str(dt[1]) + " sec.\n")
        starttime = current

    processed += 1
    if processed % 100 == 0:
        print "{0} tweets processed".format(processed)
Example No. 30
def get_sentences_to_add(prev_text_words, prev_sentence_ends, new_text_words,
                         new_sentence_ends, tfidf):
    prev_tfidf_vecs = get_tfidf_of_sentences(prev_text_words,
                                             prev_sentence_ends, tfidf)
    new_tfidf_vecs = get_tfidf_of_sentences(new_text_words, new_sentence_ends,
                                            tfidf)
    wanted_sentence_indices = list()
    for nidx, new_tfidf_vec in enumerate(new_tfidf_vecs):
        to_add = True
        for pidx, prev_tfidf_vec in enumerate(prev_tfidf_vecs):
            sim_val = TfIdf.sim(new_tfidf_vec, prev_tfidf_vec)
            if sim_val > 0.95:
                to_add = False
                # print sim_val, 'too similar'
                break
        if to_add:
            wanted_sentence_indices.append(nidx)
    return wanted_sentence_indices
Example No. 31
def train_setup(vocab_file, pos_file, neg_file, cluster_labels_file,
                validation_file):
    vocab = load_pickled(vocab_file)
    tfidf = TfIdf(vocab, [pos_file, neg_file])
    label_vectorizer = LabelVectorizer(load_pickled(cluster_labels_file))
    stemmer = MemoizedStemmer()

    pos_provider = TrainingSampleProvider(pos_file, 1, vocab, tfidf,
                                          label_vectorizer, stemmer)
    neg_provider = TrainingSampleProvider(neg_file, -1, vocab, tfidf,
                                          label_vectorizer, stemmer)

    merged = SampleMerger(pos_provider, neg_provider)

    validation_provider = ValidationSampleProvider(validation_file, None,
                                                   vocab, tfidf,
                                                   label_vectorizer, stemmer)

    return merged, validation_provider
Example No. 32
def train_setup():
    vocab = load_pickled('vocab.dat')
    tfidf = TfIdf(vocab, ['./data/train_pos.txt', './data/train_neg.txt'])
    lda = LdaLoader('topics.lda', 200)
    label_vectorizer = LabelVectorizer(load_pickled('labels.dat'))
    stemmer = MemoizedStemmer()

    pos_provider = TrainingSampleProvider('./data/train_pos.txt', 1, vocab,
                                          tfidf, lda, label_vectorizer,
                                          stemmer)
    neg_provider = TrainingSampleProvider('./data/train_neg.txt', -1, vocab,
                                          tfidf, lda, label_vectorizer,
                                          stemmer)

    merged = SampleMerger(pos_provider, neg_provider)

    validation_provider = ValidationSampleProvider('./data/test_data.txt',
                                                   None, vocab, tfidf, lda,
                                                   label_vectorizer, stemmer)

    return merged, validation_provider
Example No. 33
class KeyWordGetter():
    """
    Class to determine the significant unique keywords of a page.
    Uses the TF-IDF algorithm (http://en.wikipedia.org/wiki/Tf%E2%80%93idf).
    """

    def __init__(self):
        self.myTfIdf = TfIdf("corpus10k.txt", "stopwords10k.txt")

        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        self.opener = opener

    def get_keywords_from_url(self, url, num_words=5, MAX_STR_LEN=1000):
        """
        Returns a list of (keyword, significance) tuples for the page at the given URL,
        where significance is a TF-IDF score between 0 and 1.
        """
        clean_text = self.get_clean_text(url)

        if len(clean_text) > MAX_STR_LEN:
            clean_text = clean_text[:MAX_STR_LEN]

        keywords = []
        for pair in self.myTfIdf.get_doc_keywords(clean_text)[0: num_words]:
            keywords.append(pair)

        return keywords

    def get_clean_text(self, URL):
        """
        Returns the contents of a url's html page with tags removed
        """
        response = self.opener.open(URL)

        html = response.read()

        return nltk.clean_html(html)
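For reference, a minimal sketch of the TF-IDF weighting the docstring above points to (one common smoothed variant; the exact formula implemented by this project's TfIdf class is not shown in these examples).

import math

def tf_idf_weight(term, doc_tokens, corpus_docs):
    # term frequency within one tokenized document
    tf = doc_tokens.count(term) / float(len(doc_tokens))
    # number of corpus documents containing the term
    df = sum(1 for doc in corpus_docs if term in doc)
    # smoothed inverse document frequency
    idf = math.log((1.0 + len(corpus_docs)) / (1.0 + df)) + 1.0
    return tf * idf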
Example No. 34
        alea_processed.append(get_bigrams(temp,3,True))

    with open(dir_out+"list_alea_bigrams.pck", 'wb') as handle:
        pickle.dump(alea_processed, handle)
    with open(dir_out+"list_alea_trigrams.pck", 'wb') as handle:
        pickle.dump(alea_tri_processed, handle)

    

    bgr_counter = parl_bigrams.ngram_fd
    parl_bgr_counter = [l.ngram_fd for l in parl_processed]
    docs_bgr_counter = [l.ngram_fd for l in alea_processed]
    docs_bgr_counter.append(bgr_counter)


    tfidf = TfIdf()
    tfidf_smooth = list() 
    for bgr in bgr_counter:
        tfidf_smooth.append(tfidf.tf(bgr,bgr_counter)*tfidf.idf_smooth(bgr,docs_bgr_counter))

    dic_tfidf_smooth = list(zip(bgr_counter.keys(), tfidf_smooth))
    dic_tfidf_smooth = sorted(dic_tfidf_smooth, key=lambda x: x[1], reverse=True)

    tot_counter = dict()
    for y in docs_bgr_counter:
        for k in y.keys():
            tot_counter[k] = tot_counter.get(k, 0) + y[k]

tfidf_like = list()
for bgr in bgr_counter:
    tfidf_like.append(tfidf.tf(bgr,bgr_counter)*tfidf.idf_like(bgr,bgr_counter,tot_counter,docs_bgr_counter, parl_bgr_counter))
Example No. 35
class TweetAdder:

    #@perftest
    def __init__(self, sql_obj=None):
        if not sql_obj:
            self.sql = SQLQuery()
        else:
            self.sql = sql_obj

        self.tfidf_obj = TfIdf()
        self.ids = None

    def addTimelineTweet(self, timeline_tweet):
        """
        Converts timeline tweet to search api format and adds it as a celebrity tweet.
        """
        tweet = self.convertTimelineTweet(timeline_tweet)
        self.add(tweet, created_at_is_obj=True, tweet_table="tweets")

    def addNonCelebTimelineTweet(self, timeline_tweet):
        """
        Converts timeline tweet to Search API format and adds it as a non-celebrity tweet.
        """
        self.add(self.convertTimelineTweet(timeline_tweet), created_at_is_obj=True, tweet_table="tweets_non_celeb")

    def convertTimelineTweet(self, timeline_tweet):
        """
        Converts a timeline tweet to the format returned by the Search API.
        """
        tweet = {}
        created_at = replaceMonth(timeline_tweet['created_at'])
        dt = datetime.datetime(int(created_at[25:]), int(created_at[4:6]), int(created_at[7:9]),
            int(created_at[10:12]), int(created_at[13:15]), int(created_at[16:18]))
        tweet['created_at'] = dt

        tweet['from_user']         = timeline_tweet['user']['screen_name']
        tweet['from_user_id']      = timeline_tweet['user']['id']
        tweet['from_user_name']    = timeline_tweet['user']['name']
        tweet['geo']               = timeline_tweet['user']['location']
        tweet['id']                = timeline_tweet['id']
        tweet['iso_language_code'] = timeline_tweet['user']['lang']
        tweet['metadata']          = {'result_type':'timeline'}
        tweet['profile_image_url'] = timeline_tweet['user']['profile_image_url']
        tweet['source']            = timeline_tweet['source']
        tweet['text']              = timeline_tweet['text']
        tweet['to_user']           = timeline_tweet['in_reply_to_screen_name']
        tweet['to_user_id']        = timeline_tweet['in_reply_to_user_id']
        tweet['to_user_name']      = None

        return tweet
        
    def add(self, tweet, created_at_is_obj=False, tweet_table="tweets"):
        """
        Adds a tweet to tweet_table (celebrity tweet table by default).
        Tweet must be in the format provided by Search API.
        """

        if not self.ids:
            self.ids = [i[0] for i in self.sql.q("SELECT id FROM tweets")]

        debuglog.msg("Inserting tweet", tweet['id'])
        #debuglog.pprint_msg(tweet)

        if not created_at_is_obj:
            dt = datetime.datetime.strptime(replaceMonth(tweet['created_at'][5:25]),"%d %m %Y %H:%M:%S")
        else:
            dt = tweet['created_at']
            
        created_at = dt.strftime("%Y-%m-%d %H:%M:%S")
        
        dicvals = {'created_at':created_at,
                   'from_user':tweet['from_user'],
                   'from_user_id':tweet['from_user_id'],
                   'from_user_name':tweet['from_user_name'],
                   'geo':str(tweet['geo']),
                   'id':tweet['id'],
                   'iso_language_code':tweet['iso_language_code'],
                   'metadata':str(tweet['metadata']),
                   'profile_image_url':tweet['profile_image_url'],
                   'source':tweet['source'],
                   'text':tweet['text'],
                   'to_user':tweet['to_user'],
                   'to_user_id':tweet['to_user_id'],
                   'to_user_name':tweet['to_user_name']}


        dicq= "INSERT IGNORE INTO " + tweet_table

        dicq += """ VALUES(%(created_at)s,
                           %(from_user)s,
                           %(from_user_id)s,
                           %(from_user_name)s,
                           %(geo)s,
                           %(id)s,
                           %(iso_language_code)s,
                           %(metadata)s,
                           %(profile_image_url)s,
                           %(source)s,
                           %(text)s,
                           %(to_user)s,
                           %(to_user_id)s,
                           %(to_user_name)s)"""
        
        if tweet['id'] not in self.ids:
            succeeded = False
            try:
                self.sql.q(dicq,dicvals)
                succeeded = True
            except UnicodeEncodeError:
                try:
                    debuglog.msg("\tUNIDECODE ERROR, trying decode...")
                    for k in dicvals:
                        dicvals[k] = unidecode(dicvals[k])
                    self.sql.q(dicq,dicvals)
                    succeeded = True
                except:
                    debuglog.msg("\tUnidecode failed :(")

            
            if succeeded and tweet_table == 'tweets':
                tokens = self.tfidf_obj.get_tokens(tweet['text'])
                self.addTokens(tweet,tokens)
                self.addTokenMapping(tweet, tokens)

            return succeeded

        debuglog.msg("\ttweet already existed")
        return False

    def addTokens(self, tweet, tokens=None):
        if tokens is None:
            txt = tweet['text']
            tokens = self.tfidf_obj.get_tokens(txt)

        if not tokens or not len(tokens):
            return

        count = 0
        vals = {}
        q =  "INSERT IGNORE INTO tokens (token, type) VALUES"

        for token in tokens:
            #print(token)
            vals['token'+str(count)] = token[0]
            vals['type'+str(count)] = token[1]
            q += "(%(token"+str(count)+")s, %(type"+str(count)+")s),"
            count += 1
            
        q = q[:len(q)-1] #remove last comma
        self.sql.q(q,vals)

    def addTokenMapping(self, tweet, tokens=None):
        if tokens is None:
            txt = tweet['text']
            tokens = self.tfidf_obj.get_tokens(txt)

        if not tokens or not len(tokens):
            return

        count = 0
        vals = {'user':tweet['from_user'], 'tweet_id':tweet['id']}
        q = "INSERT INTO token_user_mapping (user, token, tweet_id) VALUES"
        for token in tokens:
            q += "(%(user)s, %(token"+str(count)+")s, %(tweet_id)s),"
            vals['token'+str(count)] = token[0]
            count+=1

        #print("token mapping query",q)
        q = q[:len(q)-1] #remove last comma
        self.sql.q(q,vals)

    def deleteCeleb(self, celeb):
        """
        Back up and delete data for a celeb who doesn't make the cut.
        """
        self.backupCeleb(celeb)

        vals = {'celeb':celeb}

        q = "DELETE FROM celebs WHERE user=%(celeb)s"
        self.sql.q(q, vals)

        q = "DELETE FROM celeb_tfidf WHERE user=%(celeb)s"
        self.sql.q(q, vals)

        q = "DELETE FROM token_user_mapping WHERE user=%(celeb)s"
        self.sql.q(q, vals)

        q = "DELETE FROM tweets WHERE from_user=%(celeb)s"
        self.sql.q(q, vals)

        print("Deleted",celeb)


    def backupCeleb(self, celeb):
        """
        Back up data for a celeb (before deleting them).
        """
        vals = {'celeb':celeb}

        q = "INSERT INTO celebs_deleted (user) VALUES(%(celeb)s)"
        self.sql.q(q, vals)

        q = "INSERT INTO celeb_tfidf_deleted (SELECT * FROM celeb_tfidf_all WHERE user=%(celeb)s)"
        self.sql.q(q, vals)

        q = "INSERT INTO token_user_mapping_deleted (SELECT * FROM token_user_mapping WHERE user=%(celeb)s)"
        self.sql.q(q, vals)

        q = "INSERT INTO tweets_deleted (SELECT * FROM tweets WHERE from_user=%(celeb)s)"
        self.sql.q(q, vals)

    def fixTokens(self):
        q = "SELECT text, from_user, id FROM tweets"

        results = self.sql.q(q)
        failures = []
        f = open('token_fix_failures.txt','w')
        for result in results:
            debuglog.msg("Adding tokens for tweet",result[2])
            try:
                self.addTokens({'text':result[0], 'from_user':result[1]})
                self.addTokenMapping({'text':result[0], 'from_user':result[1], 'id':result[2]})
            except:
                failures.append(result[2])
                debuglog.msg("\tAdding tokens failed!")
                debuglog.msg("\tFailures so far:",len(failures))
                f.write(result[2]+"\n")

        f.close()

        debuglog.msg(failures)

    def identityMissingTweets(self):
        f = open('added_tweets.txt')
        added_tweet_ids = [int(line.replace('\n','')) for line in f.readlines()]
        f.close()

        #pprint.pprint(added_tweet_ids)

        q = "SELECT id FROM tweets"
        all_tweet_ids = [result[0] for result in self.sql.q(q)]


        missing_tweets = list(filter(lambda x: x not in added_tweet_ids, all_tweet_ids))

        pprint.pprint(missing_tweets)

    def fixTokensInterrupted(self):
        f = open('missing_tweets2.txt')
        missing_tweets = [line.replace('\n','') for line in f.readlines()]

        q = "SELECT text, from_user, id FROM tweets WHERE id IN("
        vals = {}

        q += ','.join(missing_tweets) + ')'

        results = self.sql.q(q)
        #pprint.pprint(results)
        #return

        failures = []
        f = open('token_fix_failures.txt','w')
        for result in results:
            debuglog.msg("Adding tokens for tweet",result[2])
            try:
                self.addTokens({'text':result[0], 'from_user':result[1]})
                self.addTokenMapping({'text':result[0], 'from_user':result[1], 'id':result[2]})
            except:
                failures.append(result[2])
                debuglog.msg("\tAdding tokens failed!")
                debuglog.msg("\tFailures so far:",len(failures))
                f.write(result[2]+"\n")

        f.close()

        debuglog.msg(failures)

    def addTweetIdsToTokenUserMapping(self):
        q = "SELECT user FROM celebs WHERE user!='ladygaga'"
        celebs = [result[0] for result in self.sql.q(q)]

        count = 0
        for celeb in celebs:
            print("Adding tweet ids for", celeb)
            q = "SELECT id, text FROM tweets WHERE from_user=%(celeb)s"
            vals = {'celeb':celeb}
            celeb_tweets = self.sql.q(q, vals)

            num_tweets = str(len(celeb_tweets))
            tweet_count = 0
            for tweet in celeb_tweets:
                tokens = self.tfidf_obj.get_tokens(tweet[1])
                for token in tokens:
                    vals = {'celeb':celeb, 'token':token[0], 'tweet_id':tweet[0]}
                    q = "UPDATE token_user_mapping SET tweet_id=%(tweet_id)s WHERE user=%(celeb)s AND token=%(token)s AND tweet_id is null LIMIT 1;"

                    self.sql.q(q, vals)

                tweet_count += 1
                print("\t%s/%s tweets."%(tweet_count, num_tweets))

            count += 1
            print("%s%% of celebs updated."% str(100*count/float(len(celebs))))
Example No. 36
from tfidf import TfIdf
import pandas as pd
corpuspath = '/Users/goksukara/Desktop/Projects/EclipseWorkspace/Specilization/PhytonCode/Data/'

if __name__ == "__main__":
    Tf_idf = TfIdf(corpuspath + 'Gensim_output')
    Tf_idf.loaddictionary()
    Tf_idf.buildmodel()
    Tf_idf.saveModel()
    Tf_idf.getTF_IDF()
    #print(Tf_idf.corpus_dict)
    #Tf_idf.listnhighIdfs(4)
Example No. 37
dir_in = "/Users/lucasso/Dropbox/UFMG/Processamento de Linguagem Natural/random_pck/docs/"
dir_parl = "/Users/lucasso/Documents/pck/"
dir_out = "/Users/lucasso/Dropbox/UFMG/Processamento de Linguagem Natural/"
file_parl = "/Users/lucasso/Dropbox/UFMG/Processamento de Linguagem Natural/random_pck/docs/deputados.pck"
tfidf_n = list()
tf_log_idf = list()
tfidf_like = list() 
corr = ""

with open(file_parl, 'rb') as handle:
    parl_counter = pickle.load(handle)

tot_counter,counter_list,_ = loadCounters(dir_in)
tot_counter_dep,counter_list_dep,pck= loadCounters(dir_parl)
tfidf = TfIdf()
for word in parl_counter:
    tf = tfidf.tf(word, parl_counter)
    idf = tfidf.idf(word,counter_list)
    log_idf = tfidf.idf_smooth(word,counter_list)
    ent_idf = tfidf.idf_like(word,parl_counter, tot_counter, counter_list, counter_list_dep)
    tfidf_n.append(tf*idf)
    tf_log_idf.append(tf*log_idf)
    tfidf_like.append(tf*ent_idf)

dic_tfidf= list(zip(parl_counter.keys(), tfidf_n))
dic_tf_log_idf= list(zip(parl_counter.keys(), tf_log_idf))
dic_tfidf_like= list(zip(parl_counter.keys(), tfidf_like))

"""
corr +=  "tfidf X tfidf_smooth: "+str(stats.spearmanr([v for i,v in dic_tfidf] ,[v for i,v in dic_tf_log_idf]))+"\n"
Example No. 38
class TestSequenceFunctions(unittest.TestCase):
    def setUp(self):
        self.unk_cutoff = 2
        self.vocab = TfIdf(unk_cutoff=self.unk_cutoff)

    def test_vocab(self):
        self.vocab.train_seen("a", 300)

        self.vocab.train_seen("b")
        self.vocab.train_seen("c")
        self.vocab.finalize()

        # Infrequent words should look the same
        self.assertEqual(self.vocab.vocab_lookup("b"),
                         self.vocab.vocab_lookup("c"))

        # Infrequent words should look the same as never seen words
        self.assertEqual(self.vocab.vocab_lookup("b"),
                         self.vocab.vocab_lookup("d"), "")

        # The frequent word should be different from the infrequent word
        self.assertNotEqual(self.vocab.vocab_lookup("a"),
                            self.vocab.vocab_lookup("b"))

    def test_censor(self):
        self.vocab.train_seen("a", 300)

        self.vocab.train_seen("b")
        self.vocab.train_seen("c")
        self.vocab.finalize()

        censored_a = [str(x) for x in self.vocab.tokenize("a b d")]
        censored_b = [str(x) for x in self.vocab.tokenize("d b a")]
        censored_c = [str(x) for x in self.vocab.tokenize("a b d")]
        censored_d = [str(x) for x in self.vocab.tokenize("b d a")]

        self.assertEqual(censored_a, censored_c)
        self.assertEqual(censored_b, censored_d)

        # Should add start and end tag
        print(censored_a)
        self.assertEqual(len(censored_a), 3)
        self.assertEqual(censored_a[0], censored_b[2])
        self.assertEqual(censored_a[1], censored_b[0])

    def test_tf(self):
        self.vocab.train_seen("a", 300)
        self.vocab.finalize()

        self.vocab.add_document("a a b")

        # Test MLE
        word_a = self.vocab.vocab_lookup("a")
        word_b = self.vocab.vocab_lookup("b")
        word_c = self.vocab.vocab_lookup("c")

        self.assertAlmostEqual(self.vocab.term_freq(word_a), 0.66666666)
        self.assertAlmostEqual(self.vocab.term_freq(word_b), 0.33333333)
        self.assertAlmostEqual(self.vocab.term_freq(word_c), 0.33333333)

    def test_df(self):
        self.vocab.train_seen("a", 300)
        self.vocab.train_seen("b", 100)
        self.vocab.finalize()

        self.vocab.add_document("a a b")
        self.vocab.add_document("b b c")
        self.vocab.add_document("a a a")
        self.vocab.add_document("a a a")

        # Test MLE
        word_a = self.vocab.vocab_lookup("a")
        word_b = self.vocab.vocab_lookup("b")
        word_c = self.vocab.vocab_lookup("c")
        word_d = self.vocab.vocab_lookup("d")

        self.assertAlmostEqual(self.vocab.inv_docfreq(word_a),
                               log10(1.3333333))
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_b), log10(2.0))
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_c), log10(4.0))
        self.assertAlmostEqual(self.vocab.inv_docfreq(word_d), log10(4.0))
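The constants expected above are consistent with term_freq being a within-document frequency and inv_docfreq being log10(N / df) over the four added documents, with words below unk_cutoff collapsed into one shared unknown id (which is why "b", "c" and "d" can score alike). A quick check of that reading, stated only as an assumption about this TfIdf class:

from math import log10

print(2 / 3.0, 1 / 3.0)    # test_tf: "a" is 2 of the 3 tokens in "a a b"; "b"/"c" map to <unk>, 1 of 3
N = 4                      # documents added in test_df
print(log10(N / 3.0))      # ~ log10(1.3333...): "a" occurs in 3 of the 4 documents
print(log10(N / 2.0))      # = log10(2.0): "b" occurs in 2 documents
print(log10(N / 1.0))      # = log10(4.0): "c" and the unseen "d" share <unk>, which occurs only in "b b c"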
Example No. 39
        categories_counter.append(Counter(tmp))
        test_data.append(categ[:k])


    print("process tfidf")
    tfidf_entropy = list()
    tfidf_smooth = list()
    tfidf_like = list()

    for i , data in enumerate(categories_counter):
        tmp_smooth = dict()
        tmp_like = dict()
        tmp_entropy = dict()
        print("dataset: " + str(i))
        for word in data:
            tf = TfIdf.tf(word, data)
            tmp_entropy[word] = tf * TfIdf.idf_entropy(word, i, categories_counter)
            tmp_smooth[word] = tf * TfIdf.idf_smooth(word, categories_counter)
            tmp_like[word] = tf * TfIdf.idf_like(word, i, categories_counter)
        tfidf_smooth.append(tmp_smooth)
        tfidf_like.append(tmp_like)
        tfidf_entropy.append(tmp_entropy)

    print("processing softmax confusion matrix")
    confusion_like = np.zeros(shape=(len(test_data), len(test_data)))
    confusion_smooth = np.zeros(shape=(len(test_data), len(test_data)))
    confusion_entropy = np.zeros(shape=(len(test_data), len(test_data)))
    for i, data in enumerate(test_data):
        for tw in data:
            j, value = classifier_s(tw, tfidf_like)
            confusion_like[i, j] += 1
Example No. 40
except IndexError:
    save_file = 'pickled_tfidf.pickle'

print "saving to ", save_file

try:
  with open(save_file) as rh:
    top_100 = cPickle.load(rh)
except IOError:
  top_100 = {}
print "proceeding with", len(top_100), "previous tfidf docs"

with open(save_file, 'w') as wh:
    wh.write('0\n')

comment_model = TfIdf(corpus_filename="idf_model_filteredsorted.txt", stopword_filename="curated_stopwords.txt", 
                        DEFAULT_IDF=0.0000001) #if not in idf model, give very low score, since model is filtered

#find the number of beers for progress indication
c.execute("SELECT id from beer")
total_beers = len(list(c.fetchall()))
print "calculating tfidf of ", total_beers, "beers."

c.execute("SELECT id, name FROM beer")
idx = 0 #don't want to unwrap the generator so we'll index this way
worked = 0
for beer_id, name in c.fetchall():
    if idx%1000 == 0:
        print """*-*-*-* Finished {0}% of the processing.""".format(float(idx)/total_beers)
        with open(save_file, 'w') as wh:
            cPickle.dump(top_100, wh)
    idx += 1
Example No. 41
	def __init__(self, corpus_filename = None, stopword_filename = None, DEFAULT_IDF = 1.5):
		TfIdf.__init__(self, corpus_filename = corpus_filename, \
			stopword_filename = stopword_filename, DEFAULT_IDF = DEFAULT_IDF)
		self.init_file_count()
Example No. 42
    tweets = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(parl_tw_processed))))
    tot_counter = Counter(tweets)


    parl_counters = list()
    for parl in parl_tw_processed:
        tw = list(itertools.chain.from_iterable(parl))
        parl_counters.append(Counter(tw))


    docs_counter =list()
    docs_counter.append(tot_counter)
    docs_counter.append(coleta1)
    docs_counter.append(coleta2)

    tfidf = TfIdf()

    tfidf_like = list()
    for word in tot_counter:
        tfidf_like.append(tfidf.tf(word,tot_counter)*tfidf.idf_like(word,tot_counter,tot_counter,docs_counter, parl_counters))

    sort_tfidf_like = list(zip(tot_counter.keys(), tfidf_like))
    sort_tfidf_like = sorted(sort_tfidf_like, key=lambda x: x[1], reverse=True)

    with open(dir_rob+"sort_tfidf_like.pck", 'wb') as handle:
        pickle.dump(sort_tfidf_like, handle)

    with open(dir_rob+"tfidf_like.pck", 'wb') as handle:
        pickle.dump(tfidf_like, handle)

    with open(dir_rob+"parl_tw_processed.pck", 'wb') as handle:
Example No. 43
class TweetAdder:

    #@perftest
    def __init__(self, sql_obj=None):
        if not sql_obj:
            self.sql = SQLQuery()
        else:
            self.sql = sql_obj

        self.tfidf_obj = TfIdf()
        self.ids = None

    def addTimelineTweet(self, timeline_tweet):
        """
        Converts timeline tweet to search api format and adds it as a celebrity tweet.
        """
        tweet = self.convertTimelineTweet(timeline_tweet)
        self.add(tweet, created_at_is_obj=True, tweet_table="tweets")

    def addNonCelebTimelineTweet(self, timeline_tweet):
        """
        Converts timeline tweet to Search API format and adds it as a non-celebrity tweet.
        """
        self.add(self.convertTimelineTweet(timeline_tweet),
                 created_at_is_obj=True,
                 tweet_table="tweets_non_celeb")

    def convertTimelineTweet(self, timeline_tweet):
        """
        Converts a timeline tweet to the format returned by the Search API.
        """
        tweet = {}
        created_at = replaceMonth(timeline_tweet['created_at'])
        dt = datetime.datetime(int(created_at[25:]), int(created_at[4:6]),
                               int(created_at[7:9]), int(created_at[10:12]),
                               int(created_at[13:15]), int(created_at[16:18]))
        tweet['created_at'] = dt

        tweet['from_user'] = timeline_tweet['user']['screen_name']
        tweet['from_user_id'] = timeline_tweet['user']['id']
        tweet['from_user_name'] = timeline_tweet['user']['name']
        tweet['geo'] = timeline_tweet['user']['location']
        tweet['id'] = timeline_tweet['id']
        tweet['iso_language_code'] = timeline_tweet['user']['lang']
        tweet['metadata'] = {'result_type': 'timeline'}
        tweet['profile_image_url'] = timeline_tweet['user'][
            'profile_image_url']
        tweet['source'] = timeline_tweet['source']
        tweet['text'] = timeline_tweet['text']
        tweet['to_user'] = timeline_tweet['in_reply_to_screen_name']
        tweet['to_user_id'] = timeline_tweet['in_reply_to_user_id']
        tweet['to_user_name'] = None

        return tweet

    def add(self, tweet, created_at_is_obj=False, tweet_table="tweets"):
        """
        Adds a tweet to tweet_table (celebrity tweet table by default).
        Tweet must be in the format provided by Search API.
        """

        if not self.ids:
            self.ids = [i[0] for i in self.sql.q("SELECT id FROM tweets")]

        debuglog.msg("Inserting tweet", tweet['id'])
        #debuglog.pprint_msg(tweet)

        if not created_at_is_obj:
            dt = datetime.datetime.strptime(
                replaceMonth(tweet['created_at'][5:25]), "%d %m %Y %H:%M:%S")
        else:
            dt = tweet['created_at']

        created_at = dt.strftime("%Y-%m-%d %H:%M:%S")

        dicvals = {
            'created_at': created_at,
            'from_user': tweet['from_user'],
            'from_user_id': tweet['from_user_id'],
            'from_user_name': tweet['from_user_name'],
            'geo': str(tweet['geo']),
            'id': tweet['id'],
            'iso_language_code': tweet['iso_language_code'],
            'metadata': str(tweet['metadata']),
            'profile_image_url': tweet['profile_image_url'],
            'source': tweet['source'],
            'text': tweet['text'],
            'to_user': tweet['to_user'],
            'to_user_id': tweet['to_user_id'],
            'to_user_name': tweet['to_user_name']
        }

        dicq = "INSERT IGNORE INTO " + tweet_table

        dicq += """ VALUES(%(created_at)s,
                           %(from_user)s,
                           %(from_user_id)s,
                           %(from_user_name)s,
                           %(geo)s,
                           %(id)s,
                           %(iso_language_code)s,
                           %(metadata)s,
                           %(profile_image_url)s,
                           %(source)s,
                           %(text)s,
                           %(to_user)s,
                           %(to_user_id)s,
                           %(to_user_name)s)"""

        if tweet['id'] not in self.ids:
            succeeded = False
            try:
                self.sql.q(dicq, dicvals)
                succeeded = True
            except UnicodeEncodeError:
                try:
                    debuglog.msg("\tUNIDECODE ERROR, trying decode...")
                    for k in dicvals:
                        dicvals[k] = unidecode(dicvals[k])
                    self.sql.q(dicq, dicvals)
                    succeeded = True
                except:
                    debuglog.msg("\tUnidecode failed :(")

            if succeeded and tweet_table == 'tweets':
                tokens = self.tfidf_obj.get_tokens(tweet['text'])
                self.addTokens(tweet, tokens)
                self.addTokenMapping(tweet, tokens)

            return succeeded

        debuglog.msg("\ttweet already existed")
        return False

    def addTokens(self, tweet, tokens=None):
        if tokens is None:
            txt = tweet['text']
            tokens = self.tfidf_obj.get_tokens(txt)

        if not tokens or not len(tokens):
            return

        count = 0
        vals = {}
        q = "INSERT IGNORE INTO tokens (token, type) VALUES"

        for token in tokens:
            #print(token)
            vals['token' + str(count)] = token[0]
            vals['type' + str(count)] = token[1]
            q += "(%(token" + str(count) + ")s, %(type" + str(count) + ")s),"
            count += 1

        q = q[:len(q) - 1]  #remove last comma
        self.sql.q(q, vals)

    def addTokenMapping(self, tweet, tokens=None):
        if tokens is None:
            txt = tweet['text']
            tokens = self.tfidf_obj.get_tokens(txt)

        if not tokens or not len(tokens):
            return

        count = 0
        vals = {'user': tweet['from_user'], 'tweet_id': tweet['id']}
        q = "INSERT INTO token_user_mapping (user, token, tweet_id) VALUES"
        for token in tokens:
            q += "(%(user)s, %(token" + str(count) + ")s, %(tweet_id)s),"
            vals['token' + str(count)] = token[0]
            count += 1

        #print("token mapping query",q)
        q = q[:len(q) - 1]  #remove last comma
        self.sql.q(q, vals)

    def deleteCeleb(self, celeb):
        """
        Back up and delete data for a celeb who doesn't make the cut.
        """
        self.backupCeleb(celeb)

        vals = {'celeb': celeb}

        q = "DELETE FROM celebs WHERE user=%(celeb)s"
        self.sql.q(q, vals)

        q = "DELETE FROM celeb_tfidf WHERE user=%(celeb)s"
        self.sql.q(q, vals)

        q = "DELETE FROM token_user_mapping WHERE user=%(celeb)s"
        self.sql.q(q, vals)

        q = "DELETE FROM tweets WHERE from_user=%(celeb)s"
        self.sql.q(q, vals)

        print("Deleted", celeb)

    def backupCeleb(self, celeb):
        """
        Back up data for a celeb (before deleting them).
        """
        vals = {'celeb': celeb}
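        # Copy the rows into the *_deleted shadow tables with INSERT ... SELECT
        # before deleteCeleb() drops them from the live tables.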

        q = "INSERT INTO celebs_deleted (user) VALUES(%(celeb)s)"
        self.sql.q(q, vals)

        q = "INSERT INTO celeb_tfidf_deleted (SELECT * FROM celeb_tfidf_all WHERE user=%(celeb)s)"
        self.sql.q(q, vals)

        q = "INSERT INTO token_user_mapping_deleted (SELECT * FROM token_user_mapping WHERE user=%(celeb)s)"
        self.sql.q(q, vals)

        q = "INSERT INTO tweets_deleted (SELECT * FROM tweets WHERE from_user=%(celeb)s)"
        self.sql.q(q, vals)

    def fixTokens(self):
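        # Re-tokenize every stored tweet and repopulate the tokens and
        # token_user_mapping tables; ids that fail are written to a log file.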
        q = "SELECT text, from_user, id FROM tweets"

        results = self.sql.q(q)
        failures = []
        f = open('token_fix_failures.txt', 'w')
        for result in results:
            debuglog.msg("Adding tokens for tweet", result[2])
            try:
                self.addTokens({'text': result[0], 'from_user': result[1]})
                self.addTokenMapping({
                    'text': result[0],
                    'from_user': result[1],
                    'id': result[2]
                })
            except Exception:
                failures.append(result[2])
                debuglog.msg("\tAdding tokens failed!")
                debuglog.msg("\tFailures so far:", len(failures))
                f.write(str(result[2]) + "\n")

        f.close()

        debuglog.msg(failures)

    def identifyMissingTweets(self):
        f = open('added_tweets.txt')
        added_tweet_ids = [
            int(line.replace('\n', '')) for line in f.readlines()
        ]
        f.close()

        #pprint.pprint(added_tweet_ids)

        q = "SELECT id FROM tweets"
        all_tweet_ids = [result[0] for result in self.sql.q(q)]

        missing_tweets = list(
            filter(lambda x: x not in added_tweet_ids, all_tweet_ids))

        pprint.pprint(missing_tweets)

    def fixTokensInterrupted(self):
        with open('missing_tweets2.txt') as f:
            missing_tweets = [line.replace('\n', '') for line in f]

        q = "SELECT text, from_user, id FROM tweets WHERE id IN("
        q += ','.join(missing_tweets) + ')'

        results = self.sql.q(q)
        #pprint.pprint(results)
        #return

        failures = []
        f = open('token_fix_failures.txt', 'w')
        for result in results:
            debuglog.msg("Adding tokens for tweet", result[2])
            try:
                self.addTokens({'text': result[0], 'from_user': result[1]})
                self.addTokenMapping({
                    'text': result[0],
                    'from_user': result[1],
                    'id': result[2]
                })
            except Exception:
                failures.append(result[2])
                debuglog.msg("\tAdding tokens failed!")
                debuglog.msg("\tFailures so far:", len(failures))
                f.write(str(result[2]) + "\n")

        f.close()

        debuglog.msg(failures)

    def addTweetIdsToTokenUserMapping(self):
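        # Backfill tweet_id on token_user_mapping: for each token of each tweet,
        # claim one matching (user, token) row whose tweet_id is still NULL.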
        q = "SELECT user FROM celebs WHERE user!='ladygaga'"
        celebs = [result[0] for result in self.sql.q(q)]

        count = 0
        for celeb in celebs:
            print("Adding tweet ids for", celeb)
            q = "SELECT id, text FROM tweets WHERE from_user=%(celeb)s"
            vals = {'celeb': celeb}
            celeb_tweets = self.sql.q(q, vals)

            num_tweets = str(len(celeb_tweets))
            tweet_count = 0
            for tweet in celeb_tweets:
                tokens = self.tfidf_obj.get_tokens(tweet[1])
                for token in tokens:
                    vals = {
                        'celeb': celeb,
                        'token': token[0],
                        'tweet_id': tweet[0]
                    }
                    q = "UPDATE token_user_mapping SET tweet_id=%(tweet_id)s WHERE user=%(celeb)s AND token=%(token)s AND tweet_id is null LIMIT 1;"

                    self.sql.q(q, vals)

                tweet_count += 1
                print("\t%s/%s tweets." % (tweet_count, num_tweets))

            count += 1
            print("%s%% of celebs updated." %
                  str(100 * count / float(len(celebs))))
Exemplo n.º 44
0
        print("Getting data from " + url.strip() + "...", end="", flush=True)
        response = get(url=url)

        print("done!\nParsing HTML data...", end="", flush=True)
        parser.feed(response.text)
        print("done!")

        keydict = parser.get_keydict()

        urldata = {"url": url, "keywords": keydict}

        id_md5 = hashlib.md5(url.encode()).hexdigest()

        docs[id_md5] = urldata

ti = TfIdf(docs)
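# Score every harvested keyword per document; ti.tf_idf(kw, kd) presumably uses
# the other entries in `docs` as the corpus for the idf term.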

for kd, d in docs.items():
    print("Processing document " + kd + "...", end="", flush=True)
    for kw, t in d['keywords'].items():
        docs[kd]['keywords'][kw]['tf_idf'] = ti.tf_idf(kw, kd)
    print("done!")

fname = 'webdirectory.txt'
print("Saving to file " + fname + "...", end="", flush=True)
with open(fname, 'w') as file:
    file.write(json.dumps(docs, sort_keys=False))

print("done!\nCompleted!")
Exemplo n.º 45
0


if __name__=='__main__':

    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_rob = path['dir_rob']

    tot_counter, parl_counter_list = load_counters(dir_out)
    tp = TextProcessor()
    tfidf = TfIdf()

    word_entropy = dict()
    for word in tot_counter:
        word_entropy[word] = tfidf.parl_entropy(word,tot_counter,parl_counter_list)

    # 2**entropy is the perplexity of each word's distribution over deputies,
    # i.e. roughly how many deputies used the word (see the axis labels below).
    freq = [int(math.pow(2, x)) for x in word_entropy.values()]
    plt.hist(freq, 15)
    plt.xticks(np.arange(0, max(freq), 20))
    #plt.gca().set_yscale("log")
    plt.xlabel("# de deputados que utilizaram a palavra")
    plt.ylabel("# palavras utilizadas pelos deputados")
    plt.show()
    plt.clf()

Exemplo n.º 46
0
except IndexError:
    if (len(results) == 0):
        print("No results were found for this query.")
        exit()
    else:
        pass

for posting in list_of_postings:
    results = set(results).intersection(posting)

if (len(important) > 0):
    for res in results:
        important.append(res)

#vectorizer = TfidfVectorizer()
table = TfIdf()
G = nx.Graph()
#return urls corresponding to numbers
with open("url_files.csv") as f:
    urls = [row for row in csv.reader(f)]

    if len(results) != 0:

        for x in results:
            f = open(urls[x - 1][0])
            obj = json.load(f)

            soup = BeautifulSoup(obj["content"],
                                 "html.parser",
                                 from_encoding="iso-8859-1")
            joinedText = [
Exemplo n.º 47
0
class GUI:

    __directories = {
        "Documents": "data//documents-lab1.txt",
        "Keywords": "data//keywords-lab1.txt"
    }

    def do_search(self, widget, data):
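        # Rank the documents for the query, display the results and, if query
        # expansion is enabled, show expanded query suggestions as well.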
        query = self.entry.get_text()
        result = self.tfidf.rank(query)
        self.result_view.show_documents(result)
        if self.is_query_expanding_active:
            new_queries = self.query_expander.expand(query)
            list = []
            for new_query in new_queries:
                if len(new_query) >= 1:
                    list.append(" ".join(new_query))
            self.query_expander_view.show_queries(list)

        #self.text_area.get_buffer().set_text(self.tfidf.get_result())
        
    def delete_event(self, widget, event, data=None):
        gtk.main_quit()
        return False

    def show_keywords(self):
        self.keywords_area.get_buffer().set_text(self.tfidf.get_keywords_string())

    def toggle_query_expanding(self, widget):
        self.is_query_expanding_active = not self.is_query_expanding_active
        if self.is_query_expanding_active:
            self.query_expander_container.show()
        else:
            self.query_expander_container.hide()
            self.query_expander_view.remove_old_buttons()
        print self.is_query_expanding_active

    def open_file(self, widget,  name):
        text = "Select {0} Source File".format(name)
        filechooserdialog = gtk.FileChooserDialog(text, None, gtk.FILE_CHOOSER_ACTION_OPEN, (gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL, gtk.STOCK_OK, gtk.RESPONSE_OK))
        response = filechooserdialog.run()

        if response == gtk.RESPONSE_OK:
            self.__directories[name] = filechooserdialog.get_filename()
            self.tfidf = TfIdf(self.__directories['Documents'], self.__directories['Keywords'])
            self.query_expander = QueryExpander()
            self.query_expander.loadKeywords(self.__directories['Keywords'])
            print "directories"
            print self.__directories
            #self.tfidf.print_stemmed_keywords()
            self.show_keywords()

        filechooserdialog.destroy()

    def __init__(self):
        self.query_expander = QueryExpander()
        self.query_expander.loadKeywords(self.__directories['Keywords'])
        #inits window and connects delete event
        print gtk.pygtk_version
        self.is_query_expanding_active = True
        self.tfidf = TfIdf(self.__directories['Documents'], self.__directories['Keywords'])
        print 'Dokument', self.tfidf.print_documents()
        self.window = gtk.Window(gtk.WINDOW_TOPLEVEL)
        self.window.set_title("Czesc Milosz!")
        self.window.connect("delete_event", self.delete_event)
        self.window.set_default_size(600,500)
        self.window.set_border_width(5)
        
        #prepare layout
        self.box1 = gtk.VBox(False, 5)
        self.window.add(self.box1)
        document_view = self.get_document_textarea_layout()
        self.box1.pack_start(self.get_menu_box(), False, False, 0)
        self.box1.pack_start(self.get_search_panel_layout(), False, False, 0)
        self.box1.pack_start(self.get_query_expander_view(), True, True, 0)
        self.box1.pack_start(self.get_result_layout(), True, True, 0)
        self.box1.pack_start(document_view, True, True, 0)
        self.box1.pack_start(self.get_keywords_layout(), True, True, 0)
        self.show_keywords()
        #self.tfidf.print_stemmed_keywords()
        self.window.show_all()

    def get_query_expander_view(self):
        box = gtk.VBox()
        check_box = gtk.CheckButton("Query Expanding")
        check_box.set_active(True)
        check_box.connect("clicked", self.toggle_query_expanding)
        box.pack_start(check_box, False, False, 0)
        self.query_expander_container = gtk.ScrolledWindow()
        self.query_expander_container.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        self.query_expander_view = QueryExpanderView(self)
        self.query_expander_container.add_with_viewport(self.query_expander_view)
        box.pack_start(self.query_expander_container, True, True, 0)

        return box

    def get_menu_box(self):
        #menu
        mb = gtk.MenuBar()

        filemenu = gtk.Menu()
        file_item = gtk.MenuItem("File")
        file_item.set_submenu(filemenu)

        open_document_item = gtk.MenuItem("Open Documents Source")
        open_document_item.connect("activate", self.open_file, "Documents")

        open_keywords_item = gtk.MenuItem("Open Keywords Source")
        open_keywords_item.connect("activate", self.open_file, "Keywords")

        exit_item = gtk.MenuItem("Exit")
        exit_item.connect("activate", gtk.main_quit)

        filemenu.append(open_document_item)
        filemenu.append(open_keywords_item)
        filemenu.append(exit_item)

        mb.append(file_item)
        return mb

    def get_search_panel_layout(self):
        #prepare layout
        self.similar_list = []
        hbox = gtk.HBox(False, 5)
        self.entry = gtk.Entry()

        completion = gtk.EntryCompletion()
        self.liststore = gtk.ListStore(str)

        self.entry.set_completion(completion)
        completion.set_model(self.liststore)
        completion.set_text_column(0)

        for item in self.similar_list:
            self.liststore.append([item])



        hbox.pack_start(self.entry, True, True, 0)
        #prepare search button
        btn_search = gtk.Button("Szukaj")
        btn_search.connect("clicked", self.do_search, "button 2")
        hbox.pack_start(btn_search, True, True, 0)
        return hbox

    def get_result_layout(self):
        sw = gtk.ScrolledWindow()
        sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        self.result_view = ResultView(self.document_area)
        sw.add_with_viewport(self.result_view)

        #prepare text area for results
        #self.text_area = gtk.TextView()
        #sw = gtk.ScrolledWindow()
        #sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        #self.text_area = gtk.TextView()
        #sw.add(self.text_area)
        return sw

    #def get_bottomarea_layout(self):
        #bottom = gtk.HBox(False, 0)
        #bottom.pack_start(self.get_document_textarea_layout(), False, False, 0)
        #bottom.pack_start(self.get_keywords_layout(), False, False, 0)
        #return bottom

    def get_document_textarea_layout(self):
        #prepare text area for results
        frame = gtk.Frame()
        sw = gtk.ScrolledWindow()
        sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        self.document_area = gtk.TextView()
        sw.add(self.document_area)
        frame.set_label("Document:")
        frame.add(sw)
        return frame

    def get_keywords_layout(self):
        #prepare text area for results
        frame = gtk.Frame()
        sw = gtk.ScrolledWindow()
        sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_AUTOMATIC)
        self.keywords_area = gtk.TextView()
        sw.add(self.keywords_area)
        frame.set_label("Keywords:")
        frame.add(sw)
        return frame
Exemplo n.º 48
0
    with open(dir_ale+"coleta3.pck", 'rb') as data_file:
        dataset.append(remove_irrelevant(pickle.load(data_file)))


    print("process tfidf")
    tfidf_entropy = list()
    tfidf_smooth = list()
    tfidf_like = list()

    # Score each word of every dataset with three idf variants (entropy,
    # smooth, like) that share the same term frequency tf.
    for i, data in enumerate(dataset):
        tmp_smooth = dict()
        tmp_like = dict()
        tmp_entropy = dict()
        print("dataset: " + str(i))
        for word in data:
            tf = TfIdf.tf(word, data)
            tmp_entropy[word] = tf * TfIdf.idf_entropy(word, i, dataset)
            tmp_smooth[word] = tf * TfIdf.idf_smooth(word, dataset)
            tmp_like[word] = tf * TfIdf.idf_like(word, i, dataset)
        tfidf_smooth.append(tmp_smooth)
        tfidf_like.append(tmp_like)
        tfidf_entropy.append(tmp_entropy)


    print("save tfidf")
    with open(dir_out+"tfidf_entropy.pck", 'wb') as handle:
        pickle.dump(tfidf_entropy, handle)

    with open(dir_out+"tfidf_smooth.pck", 'wb') as handle:
        pickle.dump(tfidf_smooth, handle)
    
Exemplo n.º 49
0
def idf_like( word,parl_counter, tot_counter,doc_counter, counter_list_parl):
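    # Product of three factors: (max document entropy - the word's document
    # entropy), the word's probability for this parliamentarian, and its
    # entropy across parliamentarians.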
    return ((math.log2(len(doc_counter))-TfIdf.entropy(word,tot_counter,doc_counter))
        *TfIdf.parl_prob(word,parl_counter,doc_counter)*TfIdf.parl_entropy(word, tot_counter, counter_list_parl))
Exemplo n.º 50
0
    def test_similarity(self):
        table = TfIdf()
        table.add_document("doc1", [
            "The", "game", "of", "life", "is", "a", "game", "of",
            "everlasting", "learning"
        ])
        table.add_document(
            "doc2",
            ["The", "unexamined", "life", "is", "not", "worth", "living"])
        table.add_document("doc3", ["Never", "stop", "learning"])

        table.calculate_tf()
        table.calculate_idf()
        table.calculate_tf_idf()
        """self.assertEqual(
            table.similarities(["life","learning"]),
            [["foo", 1.0], ["bar", 0.707106781], ["baz", 0.707106781]])"""

        print(table.similarities(["life", "learning"]))
Exemplo n.º 51
0
def idf_pow( word,parl_counter, tot_counter,doc_counter, counter_list_parl,b1,b2):
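        # Weights the word by an exponential pdf of its document entropy and a
        # beta(b1, b2) pdf of 2**h_word / 2**h_max, scaled by the word's
        # probability for this parliamentarian.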
        h_max = math.log2(len(doc_counter))
        h_word = TfIdf.entropy(word,tot_counter,doc_counter)
        x = math.pow(2,h_word)/math.pow(2,h_max)
        return (expon.pdf(h_word,scale=0.2)
            *TfIdf.parl_prob(word,parl_counter,doc_counter)*beta.pdf(x,b1,b2))
Exemplo n.º 52
0
 def setUp(self):
     self.unk_cutoff = 2
     self.vocab = TfIdf(unk_cutoff=self.unk_cutoff)
Exemplo n.º 53
0
def idf_like( word,parl_counter, tot_counter,doc_counter, counter_list_parl):
    return ((math.log2(len(doc_counter))-TfIdf.entropy(word,tot_counter,doc_counter))
        *TfIdf.parl_prob(word,parl_counter,doc_counter)*TfIdf.parl_entropy(word, tot_counter, counter_list_parl))


if __name__=='__main__':

    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_pck = path['dir_pck']
    tfidf = TfIdf()

    with open(dir_out+"list_alea_bigrams.pck", 'rb') as handle:
        ale_tweets = pickle.load(handle)
    with open(dir_out+"list_dept_bigrams_.pck", 'rb') as handle:
        parl_tweets = pickle.load(handle)

    parl_bgr_counter = [l.ngram_fd for l in parl_tweets]
    docs_bgr_counter = [l.ngram_fd for l in ale_tweets]
    bgr_counter = dict()
    for y in parl_bgr_counter:
        for k in y.keys():
            bgr_counter[k] = bgr_counter.get(k, 0) + y[k]

    docs_bgr_counter.append(bgr_counter)

    tot_counter = dict()
Exemplo n.º 54
0
    def __init__(self):
        self.myTfIdf = TfIdf("corpus10k.txt", "stopwords10k.txt")

        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        self.opener = opener
Exemplo n.º 55
0
 def __init__(self):
     pio.renderers.default = 'browser'
     tfidf = TfIdf()
     self.ids, self.titles, self.matrix = tfidf.get_matrix()
     self.vectorizer = tfidf.get_vectorizer()
Exemplo n.º 56
0
def menu():
    print("Que deseja fazer?")
    print("1 - Consultar a informação do site do jornal ABola")
    print("2 - Aplicar o algoritmo do TFIDF")
    print("3 - Sair")
    word = 0
    nword = 0
    narray = []
    j = 0
    for line in fileinput.input():
        if line.replace("\n", "") == "1":
            os.system('python3 web_scraper.py')
            print("Que deseja fazer?")
            print("1 - Consultar a informação do site do jornal ABola")
            print("2 - Aplicar o algoritmo do TFIDF")
            print("3 - Sair")
        elif (line.replace("\n", "") == "2") or (word > 0):
            if word == 0 and j == 0:
                if (os.path.isdir("artigos") == False):
                    print(
                        'Necessita de gerar primeiro o conteúdo. Escolha a opção 1'
                    )
                    print("Que deseja fazer?")
                    print("1 - Consultar a informação do site do jornal ABola")
                    print("2 - Aplicar o algoritmo do TFIDF")
                    print("3 - Sair")
                else:
                    filesA = os.listdir('artigos')
                    table = TfIdf()
                    for i in filesA:
                        with open('artigos/{}'.format(i), 'r') as content:
                            #print(content.read().split('h2'))
                            val = content.read().split('h2')
                            firstVal = val[0]
                            secondVal = val[1]
                            table.add_document(
                                'title{}'.format(i),
                                re.sub(r'[\W]', ' ', firstVal).lower().split())
                            table.add_document(
                                'text{}'.format(i),
                                re.sub(r'[\W]', ' ',
                                       secondVal).lower().split())
                    word += 1
                    print('Indique quantas palavras quer comparar:')
            elif (word == 1) and (j == 0):
                if (line.replace("\n", "").isnumeric() and int(line) > 1):
                    nword = int(line)
                    word += 1
                else:
                    print('Digite um número maior que 1')
            elif (word > 1) and (word <= nword) and (j == 0):
                if (line.replace("\n", "") != ''):
                    narray.append(line.replace("\n", "").lower())
                    word += 1
            else:
                j = 1
                if (j == 1):
                    if line.replace("\n", "") != '':
                        narray.append(line.replace("\n", "").lower())
                        j += 1
                if (j == 2):
                    print(narray)
                    fTDIDF = open('output' + narray[0] + '.html', 'w+')
                    fTDIDF.write(
                        '<h2>Resultados da aplicação do algoritmo:<h2>')
                    splitArray = {}
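                    # Combine title and body scores per article:
                    # title similarity weighs 0.7, body text weighs 0.3.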
                    for s in table.similarities(narray):
                        if s[0].startswith('title'):
                            s[0] = s[0].replace('title', '')
                            if s[0] in splitArray.keys():
                                splitArray[s[0]] += s[1] * 0.7
                            else:
                                splitArray[s[0]] = s[1] * 0.7
                        elif s[0].startswith('text'):
                            s[0] = s[0].replace('text', '')
                            if s[0] in splitArray.keys():
                                splitArray[s[0]] += s[1] * 0.3
                            else:
                                splitArray[s[0]] = s[1] * 0.3

                    for elem in splitArray.keys():
                        fTDIDF.write(
                            '<p><h5><a href="artigos/{}" >'.format(elem) +
                            elem + '</a> -> ' + str(splitArray[elem]) +
                            '</h5></p>')
                    fTDIDF.close()

                    new = 2  # open in a new tab, if possible
                    url = "file:///home/ze/SPLN/WebScraper/output" + narray[
                        0] + ".html"
                    webbrowser.open(url, new=new)
                    word = 0
                    nword = 0
                    narray = []
                    print("Que deseja fazer?")
                    print("1 - Consultar a informação do site do jornal ABola")
                    print("2 - Aplicar o algoritmo do TFIDF")
                    print("3 - Sair")
        elif (line.replace("\n", "") == "3") and (word == 0):
            print("Obrigado pela sua visita")
            fileinput.close()
Exemplo n.º 57
0

tweets = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(parl_tweets))))
tot_counter = Counter(tweets)

docs_counter = list()
for alea_tw in alea_tweets:
    tw = list(itertools.chain.from_iterable(alea_tw))
    docs_counter.append(Counter(tw))
docs_counter.append(tot_counter)

parl_counters = list()
for parl in parl_tweets:
    tw = list(itertools.chain.from_iterable(parl))
    parl_counters.append(Counter(tw))

tfidf = TfIdf()
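# Rank every n-gram by tf * idf_like over the document counters and the
# per-parliamentarian counters, then sort descending and pickle the result.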
tfidf_like_bi_trigrams = list()
for word in tot_counter:
    tfidf_like_bi_trigrams.append(
        tfidf.tf(word, tot_counter) *
        tfidf.idf_like(word, tot_counter, tot_counter, docs_counter, parl_counters))

sort_tfidf_like = list(zip(tot_counter.keys(), tfidf_like_bi_trigrams))
sort_tfidf_like = sorted(sort_tfidf_like, key=lambda x: x[1], reverse=True)

with open(dir_out+"sort_tfidf_like_bi_trigram.pck", 'wb') as handle:
    pickle.dump(sort_tfidf_like, handle)