Example No. 1
def test_multiple_topics_with_stop_words(self):
    article = u"the banana a banana an banana or orange and orange"
    topics = get_topics(article)
    self.assertEqual(len(topics), 2)
    self.assertTrue("banana" in topics)
    self.assertTrue("orange" in topics)
    self.assertFalse("the" in topics)
Example No. 2
def get_synonyms(word):
    # Assumes module-level imports in the source file: import word2vec; import topics
    # Clean the input word (lowercase, stemming, etc.)
    cleanedWord = word

    # Load the trained model. NOTE: change this path to the location of your model.
    model = word2vec.load(
        "/Users/thiagolobo/Desktop/irproject/repo_new/stalkr/stalkr/vectors.bin"
    )

    try:
        # Get the most similar words from the model.
        indexes, metrics = model.cosine(cleanedWord)
        synonyms = model.vocab[indexes]

        # Clean the data: run the raw neighbours back through the topic extractor.
        cleanedSynonyms = topics.get_topics(" ".join(synonyms))

        # Remove duplicates of the search word itself.
        wordList = list(cleanedSynonyms)
        while cleanedWord in wordList:
            wordList.remove(cleanedWord)

    except KeyError:
        # Return an empty list if the word was not present in the model.
        wordList = []

    return wordList
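A hedged usage sketch, assuming the model file above exists and "banana" is in its vocabulary:

print(get_synonyms("banana"))  # e.g. a list of related topic words, minus "banana" itself
print(get_synonyms("qwzxv"))   # [] for an out-of-vocabulary word (the KeyError path)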
Example No. 3
def test_punctuation(self):
    article = u"the cat was on the mat."
    topics = get_topics(article)
    self.assertEqual(len(topics), 2)
    self.assertTrue("cat" in topics)
    self.assertTrue("mat" in topics)
    self.assertFalse("." in topics)
Example No. 4
def tokenize(query):
    # Map each topic token in the query to its occurrence count.
    tokens = topics.get_topics(query)
    tokenized = {}
    for token in tokens:
        if token in tokenized:
            tokenized[token] += 1
        else:
            tokenized[token] = 1
    return tokenized
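If get_topics strips stop words as in the test examples above, tokenize(u"the banana a banana an orange") would return {'banana': 2, 'orange': 1}. Note that collections.Counter(tokens) would express the same counting in one line.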
Example No. 5
def list_topics(area_id):
    if users.login_id() == 0:
        return render_template("index.html", login="******")
    else:
        area_name = areas.get_areaname(area_id)
        tlist = topics.get_topics(area_id)
        return render_template("topics.html",
                               area_id=area_id,
                               area_name=area_name,
                               topics=tlist)
Example No. 6
    def __init__(self, grouped_tweets_csv_filename, plot_year):
        self.grouped_tweets_csv_filename = grouped_tweets_csv_filename
        self.year = plot_year

        self.topics = t.get_topics()
        self.female_topics = [
            'womens_rights', 'education', 'healthcare', 'familycare'
        ]
        self.male_topics = [
            'military', 'economics', 'foreign_affairs', 'policing'
        ]

        self.party_and_gender_colors = t.get_colors('gender_and_party')
        self.gender_colors = t.get_colors('gender')
        self.party_colors = t.get_colors('party')
Example No. 7
def main():
    print('Loading corpus...')
    chapters = load_corpus()

    print('Finding named entities...')
    entities_list = get_named_entities(chapters)
    entities_list = filter_entities(entities_list)

    print('Processing corpus...')
    chapters_processed = preprocess(chapters)

    print('Creating TermDocumentMatrix...')
    vocabulary, vector_space = get_termDocumentMatrix(chapters_processed)

    print('Computing entities relationships...')
    entity_matrix = get_entity_matrix(entities_list, vocabulary, vector_space)
    
    print('Topic extraction...')
    topics = get_topics(chapters_processed)

    print('Entities graph...')
    draw_entities_relation(entity_matrix, topics)
Example No. 8
    def get(self):
        cypher = self.db.cypher
        query = self.get_argument('q')
        alpha = float(self.get_argument('a', 0.5))
        prtype = self.get_argument('t', 'rankb')
        limit = int(self.get_argument('l', 50))

        users = recommend(query, alpha=alpha, pr_type=prtype, limit=limit)
        tokens = get_topics(query)

        rawtokens = query.split(" ")
        # Get synonyms for all tokens present in supplied query and flatten the
        # result into a single list.
        allsyn = sum([get_synonyms(token) for token in rawtokens], [])
        # Quote all synonyms.
        allsyn = [ "'" + s + "'" for s in allsyn]
        # Only keep the synonyms that exists in the database.
        cursor = self.db.cypher.execute("MATCH (w:Word) WHERE w.name IN [" + ",".join(allsyn) + "] RETURN w.name")
        synonyms = [w["w.name"] for w in cursor]

        res = {'users': users, 'tokens': tokens, 'synonyms': synonyms}
        self.write(res)
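Building the Cypher statement by string concatenation is injection-prone: a synonym containing a quote character would break or alter the query. Assuming the underlying driver accepts a parameter dict, as py2neo's legacy cypher.execute(statement, parameters) does, a parameterized sketch (using the raw synonyms, so the manual quoting step is no longer needed):

# Sketch only; assumes execute() forwards the parameter dict to Cypher.
cursor = self.db.cypher.execute(
    "MATCH (w:Word) WHERE w.name IN {names} RETURN w.name",
    {"names": allsyn})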
Example No. 9
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import topics as t

topics = t.get_topics()
topic_colors = t.get_colors('topics')
topic_gendered_colors = t.get_colors('topics_gendered')


def replace_party(party):
    if party == 'Republican Party':
        return 'Republican'
    elif party == 'Democratic Party':
        return 'Democratic'
    else:
        return party


def date_to_month_and_year(date_string):
    date_string = standardize_date_string(date_string)
    month, day, year = date_string.split('/')
    month_and_year = month + '/' + year
    return month_and_year


def standardize_date_string(date_string):
    # Truncated in the original snippet; a plausible completion zero-pads
    # the month and day so that '1/5/2018' becomes '01/05/2018'.
    month, day, year = date_string.split('/')
    return month.zfill(2) + '/' + day.zfill(2) + '/' + year
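With that assumed completion, for example:

date_to_month_and_year('1/5/2018')  # -> '01/2018'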
Example No. 10
"""Test topic matching and the like"""
import os
import unittest

import lib
import topics

os.chdir(lib.get_project_dir())
topic_list = topics.get_topics()


class TestMatching(unittest.TestCase):
    
    def test_matching(self):
        title_list = "the quick brown fox".split()
        for word in title_list:
            self.assertTrue(topics.matching(word, title_list))
        for word in title_list:
            self.assertTrue(topics.matching(word[1:-1], title_list))
        self.assertTrue(topics.matching("", []))
        self.assertTrue(topics.matching("unicode", ["unicode?"]))
if __name__ == "__main__":
    unittest.main()
Example No. 11
def main():
    only_max_topics = False  # True: keep only the highest-scoring topic(s); False: keep all topics above the threshold
    skip_officials = False  # True: exclude official Twitter accounts from the analysis

    threshold_percent = 0.1
    input_csv = 'C:\\Users\\Eleanor\\Dropbox\\Honors Research\\Tweets\\TwitterAPIWrapper\\tweets_2018.csv'
    # input_csv = 'sample_tweets.csv'
    output_csv = '2018_categorized_tweets_all_2.csv'

    topics = t.get_topics()

    start = time.time()
    print('reading')
    tweets_df = pd.read_csv(input_csv)
    print('cleaning')
    clean_data(tweets_df)

    if skip_officials:
        tweets_df = tweets_df[tweets_df['twitter_type'] != 'official_twitter']

    tweets_df['category'] = ''
    tweets_df['category_word_matches'] = ''
    tweets_df['category_topic_percent'] = 0

    for topic in topics:
        tweets_df[topic] = 0

    print('processing')
    for index, row in tweets_df.iterrows():
        if index % 1000 == 0:
            print(index)

        counter = {topic: 0 for topic in topics.keys()}
        counter_words = {topic: [] for topic in topics.keys()}
        for topic in topics:
            for phrase in topics[topic]:
                if ' ' in phrase:
                    joined_clean_text = ' '.join(row['cleaned_text'])
                    if phrase in joined_clean_text:
                        if not really_in(joined_clean_text, phrase):
                            continue
                        counter[topic] += 1
                        counter_words[topic].append(phrase)
                else:
                    if phrase in row['cleaned_text']:
                        counter[topic] += 1
                        counter_words[topic].append(phrase)

        if only_max_topics:
            max_topic = max(counter, key=lambda val: counter[val])
            max_topic_phrases = counter_words[max_topic]
            max_topic_percent = counter[max_topic] / len(
                row['cleaned_text']) if len(row['cleaned_text']) != 0 else 0
        else:
            max_topic = ''
            max_topic_phrases = []
            # Sentinel just above the threshold so the check below always passes.
            max_topic_percent = 1.0001 * threshold_percent

        if max_topic_percent <= threshold_percent:
            continue
        for topic in topics.keys():
            if topic == max_topic:
                continue
            topic_percent = counter[topic] / len(row['cleaned_text']) if len(
                row['cleaned_text']) != 0 else 0
            if topic_percent >= max_topic_percent:
                max_topic += ',' + topic
                max_topic_phrases += counter_words[topic]

        if max_topic == '':
            continue
        elif max_topic[0] == ',':
            max_topic = max_topic[1:]

        tweets_df.loc[index, ['category']] = max_topic
        tweets_df.loc[index,
                      ['category_word_matches']] = str(max_topic_phrases)
        tweets_df.loc[index, ['category_topic_percent']] = max_topic_percent

        for topic in max_topic.split(','):
            tweets_df.loc[index, [topic.strip()]] = 1

    print('writing')
    tweets_df.to_csv(output_csv)
    print(time.time() - start)
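really_in is not defined in this snippet; from its use above it presumably verifies that a multi-word phrase occurs on word boundaries rather than inside longer tokens. A hypothetical sketch:

import re

def really_in(text, phrase):
    # Hypothetical helper: accept the phrase only on word boundaries,
    # so 'gun control' does not match inside 'shotgun controller'.
    return re.search(r'\b' + re.escape(phrase) + r'\b', text) is not None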
Example No. 12
    def group(categorized_tweets_csv_filename, candidates_csv_filename,
              output_filename):
        topics = get_topics()
        categorized_tweets_df = pd.read_csv(categorized_tweets_csv_filename)

        cols = [
            'candidate_id', 'tweet_favorites', 'tweet_retweets', 'category'
        ]
        new_tweets_df = pd.DataFrame(columns=cols)

        for index, row in categorized_tweets_df.iterrows():
            if index % 500 == 0:
                print(index)
            for topic in topics:
                if row[topic] == 1:
                    new_row = {
                        'candidate_id': row['candidate_id'],
                        'tweet_favorites': row['tweet_favorites'],
                        'tweet_retweets': row['tweet_retweets'],
                        'category': topic
                    }
                    new_tweets_df = new_tweets_df.append(new_row,
                                                         ignore_index=True)

        new_tweets_df['tweet_favorites'] = new_tweets_df[
            'tweet_favorites'].astype(int)
        new_tweets_df['tweet_retweets'] = new_tweets_df[
            'tweet_retweets'].astype(int)

        new_tweets_df = new_tweets_df.groupby(['candidate_id', 'category'
                                               ]).agg(['count', 'mean', 'sum'])
        new_tweets_df.columns = new_tweets_df.columns.map(
            '{0[0]}_{0[1]}'.format)
        new_tweets_df = new_tweets_df.reset_index()
        new_tweets_df.drop(['tweet_retweets_count'], axis=1, inplace=True)
        new_tweets_df.columns = [
            'candidate_id', 'category', 'count', 'tweet_favorites_mean',
            'tweet_favorites_sum', 'tweet_retweets_mean', 'tweet_retweets_sum'
        ]

        cols = ['candidate_id']
        for topic in topics:
            cols.append(topic + '_count')
            cols.append(topic + '_average_favorites')
            cols.append(topic + '_sum_favorites')
            cols.append(topic + '_average_retweets')
            cols.append(topic + '_sum_retweets')

        candidate_category_info_df = pd.DataFrame(columns=cols)
        current_cand_id = -1
        current_row = {}
        for index, row in new_tweets_df.iterrows():
            if index % 500 == 0:
                print(index)
            if row['candidate_id'] != current_cand_id:
                for topic in topics:
                    if topic + '_count' not in current_row:
                        current_row[topic + '_count'] = 0
                        current_row[topic + '_average_favorites'] = 0
                        current_row[topic + '_sum_favorites'] = 0
                        current_row[topic + '_average_retweets'] = 0
                        current_row[topic + '_sum_retweets'] = 0

                if current_cand_id != -1:
                    candidate_category_info_df = candidate_category_info_df.append(
                        current_row, ignore_index=True)
                current_row = {'candidate_id': row['candidate_id']}
                current_cand_id = row['candidate_id']

            category = row['category']
            current_row[category + '_count'] = row['count']
            current_row[category +
                        '_average_favorites'] = row['tweet_favorites_mean']
            current_row[category +
                        '_sum_favorites'] = row['tweet_favorites_sum']
            current_row[category +
                        '_average_retweets'] = row['tweet_retweets_mean']
            current_row[category + '_sum_retweets'] = row['tweet_retweets_sum']

        # Flush the final candidate's row: the loop above only appends a row
        # when the candidate id changes, so the last candidate would otherwise
        # be dropped.
        if current_cand_id != -1:
            for topic in topics:
                if topic + '_count' not in current_row:
                    current_row[topic + '_count'] = 0
                    current_row[topic + '_average_favorites'] = 0
                    current_row[topic + '_sum_favorites'] = 0
                    current_row[topic + '_average_retweets'] = 0
                    current_row[topic + '_sum_retweets'] = 0
            candidate_category_info_df = candidate_category_info_df.append(
                current_row, ignore_index=True)

        engagement_df = CandidateGrouper.get_engagement_df(
            categorized_tweets_df)
        candidates_df = pd.read_csv(candidates_csv_filename)
        candidates_df = candidates_df[['candidate_id', 'party', 'gender']]

        candidate_category_info_df = candidate_category_info_df.merge(
            engagement_df, on='candidate_id')
        candidate_category_info_df = candidate_category_info_df.merge(
            candidates_df, on='candidate_id')

        # This part calculates masculine/feminine engagement
        feminine_topics = [
            'womens_rights', 'education', 'healthcare', 'familycare'
        ]
        masculine_topics = [
            'military', 'economics', 'foreign_affairs', 'policing'
        ]
        candidate_category_info_df['masculine_count'] = sum(
            candidate_category_info_df['{0}_count'.format(topic)]
            for topic in masculine_topics)
        candidate_category_info_df['feminine_count'] = sum(
            candidate_category_info_df['{0}_count'.format(topic)]
            for topic in feminine_topics)

        candidate_category_info_df['masculine_total_engagement'] = sum(
            candidate_category_info_df['{0}_sum_favorites'.format(topic)]
            for topic in masculine_topics) + sum(
                candidate_category_info_df['{0}_sum_retweets'.format(topic)]
                for topic in masculine_topics)
        candidate_category_info_df['feminine_total_engagement'] = sum(
            candidate_category_info_df['{0}_sum_favorites'.format(topic)]
            for topic in feminine_topics) + sum(
                candidate_category_info_df['{0}_sum_retweets'.format(topic)]
                for topic in feminine_topics)

        candidate_category_info_df[
            'masculine_average_engagement'] = candidate_category_info_df[
                'masculine_total_engagement'] / candidate_category_info_df[
                    'masculine_count']
        candidate_category_info_df[
            'feminine_average_engagement'] = candidate_category_info_df[
                'feminine_total_engagement'] / candidate_category_info_df[
                    'feminine_count']

        candidate_category_info_df.to_csv(output_filename)
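pandas.DataFrame.append, used above to build new_tweets_df and candidate_category_info_df row by row, was deprecated in pandas 1.4 and removed in 2.0; it also copies the whole frame on every call. The usual replacement is to collect rows in a list and build the frame once, e.g. for the first loop:

rows = []
for index, row in categorized_tweets_df.iterrows():
    for topic in topics:
        if row[topic] == 1:
            rows.append({
                'candidate_id': row['candidate_id'],
                'tweet_favorites': row['tweet_favorites'],
                'tweet_retweets': row['tweet_retweets'],
                'category': topic,
            })
new_tweets_df = pd.DataFrame(rows, columns=cols)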
Example No. 13
    def __init__(self, grouped_tweets_csv_filename):
        self.grouped_tweets_csv_filename = grouped_tweets_csv_filename
        self.topics = t.get_topics()
        self.topic_colors = t.get_colors('topics')

        self.last_ys = None
Example No. 14
def process_text(data):
    # Thin wrapper: topic extraction doubles as tokenization here.
    tokens = topics.get_topics(data)
    return tokens
Example No. 15
def test_topics_without_stop_words(self):
    article = u"banana banana banana"
    topics = get_topics(article)
    self.assertEqual(len(topics), 1)
    self.assertTrue("banana" in topics)
Example No. 16
def test_multiple_topics_without_stop_words(self):
    article = u"banana banana banana orange orange"
    topics = get_topics(article)
    self.assertEqual(len(topics), 2)
    self.assertTrue("banana" in topics)
    self.assertTrue("orange" in topics)