Example #1
 def __init__(self):
     self.db_connection = DBConnection()
     self.twitter_api = Twitter(os.environ.get(CREDS.TWITTER_KEY),
                                os.environ.get(CREDS.TWITTER_SECRET),
                                os.environ.get(CREDS.TWITTER_TOKEN),
                                os.environ.get(CREDS.TWITTER_TOKEN_SECRET),
                                self.db_connection)
Example #2
class DBFindTest(unittest.TestCase):
    def setUp(self):
        self.db_connection = DBConnection()

    def tearDown(self):
        self.db_connection.close()

    def test_find_document(self):
        result = self.db_connection.find_document(
            collection=DB.MP_COLLECTION,
            filter={"twitter_handle": "@theresa_may"},
            projection={
                "name": 1,
                "_id": 0
            })

        self.assertEqual(result[0]["name"], "Theresa May")

    def test_validate_twitter(self):
        twitter_api = Twitter(os.getenv(CREDS.TWITTER_KEY),
                              os.getenv(CREDS.TWITTER_SECRET),
                              os.getenv(CREDS.TWITTER_TOKEN),
                              os.getenv(CREDS.TWITTER_TOKEN_SECRET),
                              self.db_connection)

        self.assertTrue(expr=twitter_api.verify_credentials(),
                        msg="Could not validate Twitter credentials.")
Example #3
 def __init__(self):
     self.db_connection = DBConnection()
     self.logger = logging.getLogger(__name__)
     self.api = PageviewsClient(
         "Mozilla/5.0 (X11; Linux x86_64)"
         " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
     )
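The `PageviewsClient` used above comes from the `mwviews` package, a thin wrapper around the Wikimedia pageview API. As a rough, hedged sketch of how such a client is typically queried (the article title, date range, and the `article_views` call below are illustrative assumptions, not taken from this project):

from mwviews.api import PageviewsClient

# Illustrative only: the user agent, article title and date range are placeholders.
api = PageviewsClient("my-app/0.1 (contact@example.com)")
views = api.article_views("en.wikipedia", ["Theresa May"],
                          start="20180101", end="20180107")
for day in sorted(views):
    print(day, views[day])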
Example #4
    def __init__(self):
        self.api = praw.Reddit(client_id='DiI57R025MBQLQ',
                               client_secret='4IaDtRqQrX4jIEDZeYqh_y4cJCA',
                               user_agent='script')

        self.db_connection = DBConnection()
        self.subreddit_list = []
        self.submission_list = []
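The `praw.Reddit` client built here is the standard entry point to the Reddit API. A minimal, self-contained sketch of how such a client is typically used (the credentials and subreddit name below are placeholders, not the project's values):

import praw

api = praw.Reddit(client_id="<client-id>",
                  client_secret="<client-secret>",
                  user_agent="script")

# Print the five newest submissions from an arbitrary subreddit.
for submission in api.subreddit("news").new(limit=5):
    print(submission.title, submission.score)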
Example #5
 def __init__(self):
     self.client = crowdflower.client.Client(
         os.getenv("CROWDFLOWER_API_KEY"))
     self.db_connection = DBConnection()
     self.api_key = os.getenv("CROWDFLOWER_API_KEY")
     self.judgements_session = requests.session()
     self.nlu = NaturalLanguageUnderstandingV1(
         version='2017-02-27',
         username="******",
         password="******")
Example #6
    def __init__(self, parent):
        self.db_connection = DBConnection()
        self.bulk_count = 0

        tk.Frame.__init__(self, parent)
        # create a prompt, an input box, an output label,
        # and a button to do the computation
        self.prompt = tk.Label(self,
                               text="Enter a number:",
                               anchor="w",
                               wraplength=500)
        # self.entry = tk.Entry(self)
        self.relevant = tk.Button(self,
                                  text="Relevant",
                                  command=self.calculate1)
        self.not_relevant = tk.Button(self,
                                      text="Not Relevant",
                                      command=self.calculate2)
        self.output = tk.Label(self, text="")

        # lay the widgets out on the screen.
        self.prompt.pack(side="top", fill="x")
        # self.entry.pack(side="top", fill="x", padx=20)
        self.output.pack(side="top", fill="x", expand=True)
        self.not_relevant.pack(side="bottom")
        self.relevant.pack(side="bottom")

        self.tweets = self.db_connection.find_document(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            filter={
                "$and": [{
                    "crowdsourced": {
                        "$exists": False
                    }
                }, {
                    TWEET.SET_TO_FACTCHECK: {
                        "$exists": False
                    }
                }, {
                    TWEET.TOPICS: {
                        "$exists": True
                    }
                }]
            },
            #                                                                   {TWEET.SET_TO_FACTCHECK,
            projection={"text": 1},
            sort=True,
            sort_field="retweet_count",
            limit=500)
        self.current = self.tweets.next()
        self.bulk_op = self.db_connection.start_bulk_upsert(
            collection=DB.RELEVANT_TWEET_COLLECTION)
        self.bulk_count = 0
        self.prompt.configure(text=self.current["text"])
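This snippet wires two Tk buttons to `calculate1`/`calculate2` callbacks that are not shown. A self-contained sketch of the same prompt-plus-two-buttons labelling pattern, using a plain Python list instead of the MongoDB cursor (all names below are illustrative, not the project's code):

import Tkinter as tk  # Python 2; on Python 3 use `import tkinter as tk`


class LabelTool(tk.Frame):
    def __init__(self, parent, items):
        tk.Frame.__init__(self, parent)
        self.items = iter(items)
        self.labels = []  # collected (text, is_relevant) pairs

        self.prompt = tk.Label(self, text="", anchor="w", wraplength=500)
        self.relevant = tk.Button(self, text="Relevant",
                                  command=lambda: self.record(True))
        self.not_relevant = tk.Button(self, text="Not Relevant",
                                      command=lambda: self.record(False))
        self.prompt.pack(side="top", fill="x")
        self.not_relevant.pack(side="bottom")
        self.relevant.pack(side="bottom")

        self.current = next(self.items)
        self.prompt.configure(text=self.current)

    def record(self, is_relevant):
        # Store the label for the current item and advance to the next one.
        self.labels.append((self.current, is_relevant))
        try:
            self.current = next(self.items)
            self.prompt.configure(text=self.current)
        except StopIteration:
            self.prompt.configure(text="No more items.")


if __name__ == "__main__":
    root = tk.Tk()
    LabelTool(root, ["tweet one", "tweet two"]).pack(fill="both", expand=True)
    root.mainloop()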
Example #7
    def __init__(self):
        self.db_connection = DBConnection()
        self.sid = SentimentIntensityAnalyzer()
        self.nlu = NaturalLanguageUnderstandingV1(
            version='2017-02-27',
            username="******",
            password="******")

        self.twitter = Twitter(os.environ.get(CREDS.TWITTER_KEY),
                               os.environ.get(CREDS.TWITTER_SECRET),
                               os.environ.get(CREDS.TWITTER_TOKEN),
                               os.environ.get(CREDS.TWITTER_TOKEN_SECRET),
                               self.db_connection)
        self.session = requests.session()
        self.resolved_urls = []
Example #8
class TweetHandler(object):
    def __init__(self):
        self.db_connection = DBConnection()

    def get_clean(self, filter):
        """
        Get tweets for specific MP and clean tweet
        :param filter: Filter for selecting tweets to clean
        :return: Clean tweets for a given MP
        """
        tweets = self.db_connection.find_document(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            filter=filter,
            projection={"text": 1})

        stopword_list = []
        with open('stopwords.txt', 'r') as stopword_file:
            for line in stopword_file:
                stopword_list.append(line.strip())
        stopword_list = stopword_list + stopwords.words('english')
        stop_words = set(stopword_list)
        tweets = map(lambda x: x["text"].lower(),
                     tweets)  # Reduce each document to its lower-cased text

        regex_remove = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|&|amp|(\w+:\/\/\S+)|^RT|http.+?"
        tweets = [re.sub(regex_remove, '', tweet).strip() for tweet in tweets]
        clean_tweets = []
        # Stop word removal from tweet
        for tweet in tweets:
            clean_tweets.append(" ".join(word for word in tweet.split()
                                         if word not in stop_words))

        return clean_tweets
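`get_clean` boils down to a regex strip followed by stop-word removal. The same two steps on a single hard-coded tweet, with a stand-in stop-word set instead of `stopwords.txt` plus NLTK's list:

import re

regex_remove = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|&|amp|(\w+:\/\/\S+)|^RT|http.+?"
stop_words = {"the", "a", "is", "to"}  # stand-in for the real stop-word set

tweet = "RT @user: The vote is going to pass! https://t.co/abc"
tweet = re.sub(regex_remove, '', tweet.lower()).strip()
clean = " ".join(word for word in tweet.split() if word not in stop_words)
print(clean)  # -> "rt vote going pass"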
Example #9
    def resolve_url(self, urls):
        db_connection = DBConnection()
        url_list = []
        try:
            r = requests.get(urls[1])
            if r.status_code != 200:
                longurl = None
            else:
                longurl = r.url

            self.resolved_urls.append((urls[0], longurl))
            r.close()

        except requests.exceptions.RequestException:
            return None
Example #10
def main():
    db_connection = DBConnection()
    twitter_api = Twitter(os.environ.get(CREDS.TWITTER_KEY),
                          os.environ.get(CREDS.TWITTER_SECRET),
                          os.environ.get(CREDS.TWITTER_TOKEN),
                          os.environ.get(CREDS.TWITTER_TOKEN_SECRET),
                          db_connection)

    if "trends" in sys.argv:
        if "historic" in sys.argv:
            date = datetime.today()
            day_end = date.day - 1
            month_end = date.month
            month = 1
            day = 2
            while month != month_end or day != day_end:
                twitter_api.get_historic_trends(month=month, day=day)
                time.sleep(3)
                day += 1
                if day % 30 == 0:
                    month += 1
                    day = 1

        globally = "global" in sys.argv
        is_uk = "UK" in sys.argv

        location = WOEIDS.UK
        if not is_uk and len(
                sys.argv) > 2:  # Check that no location has been specified
            location = WOEIDS.USA

        while True:
            twitter_api.get_trends(location=location, globally=globally)
            time.sleep(60 * 60 * 2)  # Run every 2 hours

    elif "tweets" in sys.argv:
        historic = "historical" in sys.argv
        while True:
            twitter_api.update_all_tweets(historic=historic)
            if historic:
                break
Example #11
class TopicModel(object):
    def __init__(self):
        self.db_connection = DBConnection()

    def clean_tweet(self, tweet):
        regex_remove = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|&|amp|(\w+:\/\/\S+)|^RT|http.+?"
        tweet_text = re.sub(regex_remove, '', tweet["text"]).strip()
        tweet_id = tweet["_id"]

        stopword_list = []
        with open('stopwords.txt', 'r') as stopword_file:
            for line in stopword_file:
                stopword_list.append(line.strip())
        stopword_list = stopword_list + stopwords.words('english')
        stop_words = set(stopword_list)
        tweet_text = " ".join(word.lower() for word in tweet_text.split()
                              if word.lower() not in stop_words)
        tweet["text"] = tweet_text
        return tweet

    def get_final_topics(self, topics):
        kw_list = []
        intact_kw_list = []
        for topic_kws in topics:
            topic_kws = re.findall('"([^"]*)"', topic_kws[1])
            kw_list = kw_list + topic_kws
            intact_kw_list.append(topic_kws)
            # clean_topics.append(clean_topics)
        top_kws = [kw for kw, kw_count in Counter(kw_list).most_common(30)]
        return (top_kws, intact_kw_list)
        # pass

    def model(self, mp_id):
        '''
        Topic model by MP
        :return:
        '''
        tweet_docs = []
        tweets = self.db_connection.find_document(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            filter={"author_id": mp_id},
            projection={"text": 1})

        if tweets.count() > 0:
            for tweet in tweets:
                tweet_docs.append(self.clean_tweet(tweet))

            # dictionary = gensim.corpora.Dictionary(gen_docs)
            # corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
            # tf_idf = gensim.models.TfidfModel(corpus)

            gen_docs = [[
                w.lower() for w in word_tokenize(tweet['text'].lower())
            ] for tweet in tweet_docs]
            dictionary = corpora.Dictionary(gen_docs)
            # dictionary.save(os.path.join(TEMP_FOLDER, 'elon.dict'))  # store the dictionary, for future reference

            corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
            # corpora.MmCorpus.serialize()
            # corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'elon.mm'), corpus)  # store to disk, for later use

            tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
            corpus_tfidf = tfidf[corpus]

            total_topics = 5

            total_topic_aggregation = 2
            i = 0
            possible_topics = []
            while i < total_topic_aggregation:
                possible_topics = possible_topics + models.LdaModel(
                    corpus, id2word=dictionary,
                    num_topics=total_topics).show_topics(total_topics, 5)
                i += 1

            topic_data = self.get_final_topics(topics=possible_topics)
            final_topics = []
            print "Top keywords:  %s " % topic_data[0]
            for batch in topic_data[1]:
                print batch
                print "----"
                decision = None
                while decision != "":
                    decision = raw_input()
                    if decision:
                        if decision.lower() not in final_topics:
                            final_topics.append(decision.lower())

            if final_topics:
                self.db_connection.update_mp(user_id=mp_id,
                                             update={MP.TOPICS: final_topics})
                for final_topic in final_topics:
                    self.db_connection.increment_field(
                        collection=DB.RELEVANT_TOPICS,
                        query={"name": final_topic},
                        field=TOPIC.IDENTIFIED_AS_TOPIC)

    def evaluate(self, mp_id):
        '''
        Topic model by MP
        :return:
        '''

        tweet_docs = []
        tweets = self.db_connection.find_document(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            filter={"author_id": mp_id["_id"]},
            projection={"text": 1})

        tweet_count = tweets.count()
        if tweets.count() > 0:
            for tweet in tweets:
                tweet_docs.append(self.clean_tweet(tweet))

            # dictionary = gensim.corpora.Dictionary(gen_docs)
            # corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
            # tf_idf = gensim.models.TfidfModel(corpus)

            gen_docs = [[
                w.lower() for w in word_tokenize(tweet['text'].lower())
            ] for tweet in tweet_docs]
            dictionary = corpora.Dictionary(gen_docs)
            # dictionary.save(os.path.join(TEMP_FOLDER, 'elon.dict'))  # store the dictionary, for future reference

            corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
            # corpora.MmCorpus.serialize()
            # corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'elon.mm'), corpus)  # store to disk, for later use

            tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
            corpus_tfidf = tfidf[corpus]

            total_topics = 5

            total_topic_aggregation = 2
            i = 0
            possible_topics = []

            # Fit LDA models over a range of topic counts and compare their
            # perplexity bounds (61152 appears to be the size of the full
            # tweet collection, so tweet_count / 61152 is the sampled fraction).
            topic_counts = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
            perplexities = []
            for num_topics in topic_counts:
                lda_model = models.LdaModel(corpus,
                                            id2word=dictionary,
                                            num_topics=num_topics)
                perplexities.append(
                    lda_model.bound(corpus=corpus,
                                    subsample_ratio=float(tweet_count) / 61152))

            return [perplexities, topic_counts, mp_id["name"]]
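Both `model` and `evaluate` follow the usual gensim pipeline: tokenise, build a `Dictionary`, convert each document to a bag-of-words, then fit an `LdaModel`. A stripped-down version of that pipeline on a toy corpus (the documents below are invented; NLTK's `punkt` tokeniser data must be available):

from gensim import corpora, models
from nltk.tokenize import word_tokenize

docs = ["the vote on the budget passed today",
        "budget cuts were debated in parliament",
        "a new film was released this weekend"]

gen_docs = [[w.lower() for w in word_tokenize(doc)] for doc in docs]
dictionary = corpora.Dictionary(gen_docs)
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

lda = models.LdaModel(corpus, id2word=dictionary, num_topics=2)
for topic_id, topic in lda.show_topics(num_topics=2, num_words=5):
    print(topic_id, topic)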
Example #12
 def __init__(self):
     self.logger = logging.getLogger(__name__)
     self.api = NewsApiClient(api_key="0d0fe7063a414d63ad34d037d87ca92f")
     self.db_connection = DBConnection()
Example #13
class NewsClient(object):
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.api = NewsApiClient(api_key="0d0fe7063a414d63ad34d037d87ca92f")
        self.db_connection = DBConnection()

    def get_sources(self):
        '''
        Get all the sources used by NEWSAPI to insert into database
        :return:
        '''
        sources = self.api.get_sources()
        sources = sources[NEWS_API_PARAMS.SOURCE]
        sources_to_insert = []
        for source in sources:
            if source[NEWS_SOURCE.COUNTRY] in [
                    NEWS_COUNTRIES.UK, NEWS_COUNTRIES.USA
            ]:
                sources_to_insert.append({
                    NEWS_SOURCE.DESCRIPTION:
                    source[NEWS_SOURCE.DESCRIPTION],
                    NEWS_SOURCE.CATEGORY:
                    source[NEWS_SOURCE.CATEGORY],
                    NEWS_SOURCE.COUNTRY:
                    source[NEWS_SOURCE.COUNTRY],
                    NEWS_SOURCE.LANGUAGE:
                    source[NEWS_SOURCE.LANGUAGE],
                    NEWS_SOURCE.NAME:
                    source[NEWS_SOURCE.NAME],
                    NEWS_SOURCE.URL:
                    source[NEWS_SOURCE.URL],
                    NEWS_SOURCE.NEWS_API_ID:
                    source["id"],
                    NEWS_SOURCE.NEWS_API_FRIENDLY:
                    True
                })

        self.db_connection.bulk_insert(data=sources_to_insert,
                                       collection=DB.SOURCES_COLLECTION)

    def get_timestamps(self):
        news = self.db_connection.find_document(
            collection=DB.NEWS_ARTICLES,
            filter={},
            projection={NEWS_ARTICLE.PUBLISH_DATE})

        for piece in news:
            timestamp = calendar.timegm(piece['published_at'].timetuple())
            result_piece = self.db_connection.find_and_update(
                collection=DB.NEWS_ARTICLES,
                query={"_id": piece["_id"]},
                update={"$set": {
                    "timestamp": timestamp
                }})

    def get_articles(self, query=None, since=None):
        """
        :param query: Query for specific articles
        :param since: Datetime of the earliest date the articles can be
        :return:
        """
        articles_to_insert = []
        batch_size = 300
        article_count = 0
        page_no = 1
        stop_words = re.compile("|".join([
            "sport", "entertainment"
        ]))  # words, categories etc that are not important to collect

        sort_by = NEWS_API_PARAMS.SORT_BY_NEWEST
        sources = list(
            self.db_connection.find_document(
                collection=DB.SOURCES_COLLECTION,
                filter={NEWS_SOURCE.COUNTRY: NEWS_COUNTRIES.UK},
                projection={
                    NEWS_SOURCE.NEWS_API_ID: 1,
                    "_id": 0
                }))

        sources = map(lambda x: x[NEWS_SOURCE.NEWS_API_ID], sources)
        sources = ','.join(sources)

        if query:  # Sort by relevancy instead of newest if query placed
            sort_by = NEWS_API_PARAMS.SORT_BY_RELEVANCY

        if not since:
            since = datetime.now() - timedelta(days=30)

        count = 0
        while True:
            news_payload = self.api.get_everything(
                q=query,
                language='en',
                sources=sources,
                from_parameter=since,
                to='2018-01-15',
                sort_by=sort_by,
                page=page_no,
                page_size=NEWS_API_PARAMS.PAGE_SIZE)
            count += 1
            if 'articles' not in news_payload:
                self.logger.info("hit API limit, stopping")
                break

            total_articles = None
            if "totalResults" in news_payload:
                total_articles = news_payload["totalResults"]

            if "totalResults" in news_payload:
                total_articles = news_payload["totalResults"]

            raw_articles = None
            if "articles" in news_payload:
                article_count += len(news_payload["articles"])
                raw_articles = news_payload["articles"]

            if raw_articles:
                for article in raw_articles:
                    if not stop_words.search(
                            article["url"]
                    ):  # Avoid URLs with the given stop words in them
                        date = datetime.strptime(article["publishedAt"],
                                                 '%Y-%m-%dT%H:%M:%SZ')
                        doc = {
                            NEWS_ARTICLE.DESCRIPTION:
                            article["description"],
                            NEWS_ARTICLE.TITLE:
                            article["title"],
                            NEWS_ARTICLE.URL:
                            article["url"],
                            NEWS_ARTICLE.SOURCE:
                            article["source"]["name"],
                            NEWS_ARTICLE.PUBLISH_DATE:
                            date,
                            NEWS_ARTICLE.TIMESTAMP:
                            calendar.timegm(date.timetuple())
                        }
                        self.db_connection.insert_news_article(article=doc)

                        # articles_to_insert.append({
                        #     NEWS_ARTICLE.DESCRIPTION: article["description"],
                        #     NEWS_ARTICLE.TITLE: article["title"],
                        #     NEWS_ARTICLE.URL: article["url"],
                        #     NEWS_ARTICLE.SOURCE: article["source"]["name"],
                        #     NEWS_ARTICLE.PUBLISH_DATE: date,
                        #     NEWS_ARTICLE.TIMESTAMP: calendar.timegm(date.timetuple())
                        # })

            page_no += 1

            # if count >= 240:
            #     self.logger.info("Stopping news collection due to API limits")
            #     self.logger.info("last timestamp: %s" % calendar.timegm(date.timetuple()))
            #     break

            # if raw_articles:
            #     self.db_connection.bulk_insert(data=articles_to_insert, collection=DB.NEWS_ARTICLES)
            #     articles_to_insert = []

            if not raw_articles:
                break
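`get_timestamps` turns each article's `published_at` datetime into a UTC epoch timestamp. The conversion is the standard-library pattern below (the date string is arbitrary):

import calendar
from datetime import datetime

published_at = datetime.strptime("2018-01-15T09:30:00Z", '%Y-%m-%dT%H:%M:%SZ')
timestamp = calendar.timegm(published_at.timetuple())
print(timestamp)  # 1516008600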
Example #14
class Classifier(object):
    def __init__(self):
        self.db_connection = DBConnection()
        # self.iris = datasets.load_iris()
        # self.digits = datasets.load_digits()
        # self.classifier = svm.SVC(probability=True, kernel='linear')
        self.classifier = svm.SVC(probability=True,
                                  kernel='linear',
                                  C=1,
                                  gamma=1)
        self.clean_train_data = []
        self.classifier_predictions = None
        self.gold_results = None
        Cs = [0.001, 0.01, 0.1, 1, 10]
        gammas = [0.001, 0.01, 0.1, 1]
        self.coef = None
        self.raw_tweets = []
        self.features_names = [
            "tweet_%s" % TWEET.CHARACTER_COUNT,
            "tweet_%s" % TWEET.WORD_COUNT,
            "tweet_%s" % TWEET.CONTAINS_QM,
            "tweet_%s" % TWEET.CONTAINS_EM,
            "tweet_%s" % TWEET.CONTAINS_MULTIPLE_MARKS,
            "tweet_%s" % TWEET.FRACTION_CAPITALISED,
            "tweet_%s" % TWEET.CONTAINS_HAPPY_EMOJI,
            "tweet_%s" % TWEET.CONTAINS_SAD_EMOJI,
            "tweet_%s" % TWEET.CONTAINS_HAPPY_EMOTICON,
            "tweet_%s" % TWEET.CONTAINS_SAD_EMOTICON,
            "tweet_%s" % TWEET.CONTAINS_PRONOUNS,
            "tweet_%s" % TWEET.CONTAINS_DOMAIN_TOP10,
            "tweet_%s" % TWEET.CONTAINS_DOMAIN_TOP30,
            "tweet_%s" % TWEET.CONTAINS_DOMAIN_TOP50,
            "tweet_%s" % TWEET.MENTIONS_USER,
            "tweet_%s" % TWEET.CONTAINS_STOCK_SYMBOL,
            "tweet_%s" % TWEET.PUBLISH_WEEKDAY,
            "tweet_%s" % TWEET.POSITIVE_WORD_COUNT,
            "tweet_%s" % TWEET.NEGATIVE_WORD_COUNT,
            "tweet_%s" % TWEET.SENTIMENT_SCORE,
            "tweet_%s" % TWEET.AVERAGE_ENTITY_CERTAINTY,
            "tweet_%s" % TWEET.AVERAGE_KEYWORD_CERTAINTY,
            "tweet_%s" % TWEET.ENTITIES_COUNT,
            "tweet_%s" % TWEET.KEYWORDS_COUNT,
            "tweet_%s" % TWEET.RELEVANCY_DAY,
            "tweet_%s" % TWEET.RELEVANCY_WEEK,
            "tweet_%s" % TWEET.RELEVANCY_TWO_WEEKS,
            "tweet_%s" % TWEET.CONTAINS_FIGURES,
            "tweet_%s" % TWEET.FRAC_NOT_IN_DICT,
            "mp_%s" % MP.FOLLOWERS_COUNT,
            "mp_%s" % MP.FRIENDS_COUNT,
            "mp_%s" % MP.TWEET_COUNT,
            "mp_%s" % MP.IS_VERIFIED,
            "mp_%s" % MP.AVERAGE_NO_RETWEETS,
            "mp_%s" % MP.AVERAGE_NO_FAVOURITES,
            "mp_%s" % MP.ACCOUNT_DAYS,
            "topic_%s" % TOPIC.TWEET_COUNT,
            "topic_%s" % TOPIC.TWEET_AVERAGE_LENGTH,
            "topic_%s" % TOPIC.FRAC_CONTAINING_QM,
            "topic_%s" % TOPIC.FRAC_CONTAINING_EM,
            "topic_%s" % TOPIC.FRAC_CONTAINING_MULTIPLE_MARKS,
            "topic_%s" % TOPIC.FRAC_CONTAINING_HAPPY_EMOTICON,
            "topic_%s" % TOPIC.FRAC_CONTAINING_SAD_EMOTICON,
            "topic_%s" % TOPIC.FRAC_CONTAINING_HAPPY_EMOJI,
            "topic_%s" % TOPIC.FRAC_CONTAINING_SAD_EMOJI,
            "topic_%s" % TOPIC.FRAC_CONTAINING_PRONOUNS,
            "topic_%s" % TOPIC.FRAC_CONTAINING_FIGURES,
            "topic_%s" % TOPIC.FRAC_CONTAINING_UPPERCASE,
            "topic_%s" % TOPIC.FRAC_CONTAINING_URL,
            "topic_%s" % TOPIC.FRAC_CONTAINING_USER_MENTION,
            "topic_%s" % TOPIC.FRAC_CONTAINING_HASHTAGS,
            "topic_%s" % TOPIC.FRAC_CONTAINING_STOCK_SYMBOLS,
            "topic_%s" % TOPIC.AVERAGE_SENTIMENT_SCORE,
            "topic_%s" % TOPIC.FRAC_CONTAINING_POSITIVE_SENTIMENT,
            "topic_%s" % TOPIC.FRAC_CONTAINING_NEGATIVE_SENTIMENT,
            "topic_%s" % TOPIC.FRAC_CONTAINING_DOMAIN10,
            "topic_%s" % TOPIC.FRAC_CONTAINING_DOMAIN30,
            "topic_%s" % TOPIC.FRAC_CONTAINING_DOMAIN50,
            "topic_%s" % TOPIC.DISTINCT_URLS_COUNT,
            "topic_%s" % TOPIC.FRAC_CONTAINING_MOST_VISITED_URL,
            "topic_%s" % TOPIC.DISTINCT_HASHTAG_COUNT,
            "topic_%s" % TOPIC.FRAC_CONTAINING_MOST_USED_HASHTAG,
            "topic_%s" % TOPIC.DISTINCT_USER_MENTION_COUNT,
            "topic_%s" % TOPIC.FRAC_CONTAINING_MOST_MENTIONED_USER,
            "topic_%s" % TOPIC.DISTINCT_TWEET_AUTHOR_COUNT,
            "topic_%s" % TOPIC.FRAC_CONTAINING_TOP_AUTHOR,
            "topic_%s" % TOPIC.AVERAGE_AUTHOR_TWITTER_LIFE,
            "topic_%s" % TOPIC.AVERAGE_AUTHOR_TWEET_COUNT,
            "topic_%s" % TOPIC.AVERAGE_AUTHOR_FOLLOWER_COUNT,
            "topic_%s" % TOPIC.AVERAGE_AUTHOR_FRIEND_COUNT,
            "topic_%s" % TOPIC.FRAC_FROM_VERIFIED,
            "topic_%s" % TOPIC.AVERAGE_DAY_RELEVANCE,
            "topic_%s" % TOPIC.AVERAGE_WEEK_RELEVANCE,
            "topic_%s" % TOPIC.AVERAGE_2WEEK_RELEVANCE,
            "topic_%s" % TOPIC.AVERAGE_WORDS_NOT_IN_DICT
        ]

    def train(self, train_data, train_target):
        '''
        Trains SVM classifier based on the feature set acquired in feature_extractor
        Normalises data for optimal results
        Gets class decision and probabilistic result
        :return:
        '''
        # self.clean_train_data = []

        # Cleaning of data in preparation for training
        for tweet in train_data:
            tweet_block = [
                tweet[TWEET.CHARACTER_COUNT], tweet[TWEET.WORD_COUNT],
                int(tweet[TWEET.CONTAINS_QM]),
                int(tweet[TWEET.CONTAINS_EM]),
                int(tweet[TWEET.CONTAINS_MULTIPLE_MARKS]),
                tweet[TWEET.FRACTION_CAPITALISED],
                int(tweet[TWEET.CONTAINS_HAPPY_EMOJI]),
                int(tweet[TWEET.CONTAINS_SAD_EMOJI]),
                int(tweet[TWEET.CONTAINS_HAPPY_EMOTICON]),
                int(tweet[TWEET.CONTAINS_SAD_EMOTICON]),
                int(tweet[TWEET.CONTAINS_PRONOUNS]),
                int(tweet[TWEET.CONTAINS_DOMAIN_TOP10]),
                int(tweet[TWEET.CONTAINS_DOMAIN_TOP30]),
                int(tweet[TWEET.CONTAINS_DOMAIN_TOP50]),
                int(tweet[TWEET.MENTIONS_USER]),
                int(tweet[TWEET.CONTAINS_STOCK_SYMBOL]),
                tweet[TWEET.PUBLISH_WEEKDAY], tweet[TWEET.POSITIVE_WORD_COUNT],
                tweet[TWEET.NEGATIVE_WORD_COUNT], tweet[TWEET.SENTIMENT_SCORE],
                tweet[TWEET.AVERAGE_ENTITY_CERTAINTY],
                tweet[TWEET.AVERAGE_KEYWORD_CERTAINTY],
                tweet[TWEET.ENTITIES_COUNT], tweet[TWEET.KEYWORDS_COUNT],
                tweet[TWEET.RELEVANCY_DAY], tweet[TWEET.RELEVANCY_WEEK],
                tweet[TWEET.RELEVANCY_TWO_WEEKS],
                int(tweet[TWEET.CONTAINS_FIGURES]),
                tweet[TWEET.FRAC_NOT_IN_DICT]
            ]

            mp_data = self.db_connection.find_document(
                collection=DB.MP_COLLECTION,
                filter={"_id": tweet[TWEET.AUTHOR_ID]},
                projection={
                    MP.FOLLOWERS_COUNT: 1,
                    MP.FRIENDS_COUNT: 1,
                    MP.TWEET_COUNT: 1,
                    MP.IS_VERIFIED: 1,
                    MP.AVERAGE_NO_RETWEETS: 1,
                    MP.AVERAGE_NO_FAVOURITES: 1,
                    MP.ACCOUNT_DAYS: 1
                })
            for mp in mp_data:
                mp_block = [
                    mp[MP.FOLLOWERS_COUNT], mp[MP.FRIENDS_COUNT],
                    mp[MP.TWEET_COUNT],
                    int(mp[MP.IS_VERIFIED]), mp[MP.AVERAGE_NO_RETWEETS],
                    mp[MP.AVERAGE_NO_FAVOURITES], mp[MP.ACCOUNT_DAYS]
                ]
                break

            top_topic = max(tweet[TWEET.TOPICS],
                            key=lambda x: x[TOPIC.IDENTIFIED_AS_TOPIC])
            topics = self.db_connection.find_document(
                collection=DB.RELEVANT_TOPICS,
                filter={"_id": top_topic["_id"]})

            for topic in topics:
                topic_block = [
                    topic[TOPIC.TWEET_COUNT],
                    topic[TOPIC.TWEET_AVERAGE_LENGTH],
                    topic[TOPIC.FRAC_CONTAINING_QM],
                    topic[TOPIC.FRAC_CONTAINING_EM],
                    topic[TOPIC.FRAC_CONTAINING_MULTIPLE_MARKS],
                    topic[TOPIC.FRAC_CONTAINING_HAPPY_EMOTICON],
                    topic[TOPIC.FRAC_CONTAINING_SAD_EMOTICON],
                    topic[TOPIC.FRAC_CONTAINING_HAPPY_EMOJI],
                    topic[TOPIC.FRAC_CONTAINING_SAD_EMOJI],
                    topic[TOPIC.FRAC_CONTAINING_PRONOUNS],
                    topic[TOPIC.FRAC_CONTAINING_FIGURES],
                    topic[TOPIC.FRAC_CONTAINING_UPPERCASE],
                    topic[TOPIC.FRAC_CONTAINING_URL],
                    topic[TOPIC.FRAC_CONTAINING_USER_MENTION],
                    topic[TOPIC.FRAC_CONTAINING_HASHTAGS],
                    topic[TOPIC.FRAC_CONTAINING_STOCK_SYMBOLS],
                    topic[TOPIC.AVERAGE_SENTIMENT_SCORE],
                    topic[TOPIC.FRAC_CONTAINING_POSITIVE_SENTIMENT],
                    topic[TOPIC.FRAC_CONTAINING_NEGATIVE_SENTIMENT],
                    topic[TOPIC.FRAC_CONTAINING_DOMAIN10],
                    topic[TOPIC.FRAC_CONTAINING_DOMAIN30],
                    topic[TOPIC.FRAC_CONTAINING_DOMAIN50],
                    topic[TOPIC.DISTINCT_URLS_COUNT],
                    topic[TOPIC.FRAC_CONTAINING_MOST_VISITED_URL],
                    topic[TOPIC.DISTINCT_HASHTAG_COUNT],
                    topic[TOPIC.FRAC_CONTAINING_MOST_USED_HASHTAG],
                    topic[TOPIC.DISTINCT_USER_MENTION_COUNT],
                    topic[TOPIC.FRAC_CONTAINING_MOST_MENTIONED_USER],
                    topic[TOPIC.DISTINCT_TWEET_AUTHOR_COUNT],
                    topic[TOPIC.FRAC_CONTAINING_TOP_AUTHOR],
                    topic[TOPIC.AVERAGE_AUTHOR_TWITTER_LIFE],
                    topic[TOPIC.AVERAGE_AUTHOR_TWEET_COUNT],
                    topic[TOPIC.AVERAGE_AUTHOR_FOLLOWER_COUNT],
                    topic[TOPIC.AVERAGE_AUTHOR_FRIEND_COUNT],
                    topic[TOPIC.FRAC_FROM_VERIFIED],
                    topic[TOPIC.AVERAGE_DAY_RELEVANCE],
                    topic[TOPIC.AVERAGE_WEEK_RELEVANCE],
                    topic[TOPIC.AVERAGE_2WEEK_RELEVANCE],
                    topic[TOPIC.AVERAGE_WORDS_NOT_IN_DICT]
                ]
                break

            data_block = tweet_block + mp_block + topic_block
            self.clean_train_data.append(data_block)

        X = np.array(self.clean_train_data)
        X = preprocessing.scale(X)
        # self.get_best_hyperparameters(X=X[:-60], y=train_target)
        # self.classifier.fit(X=X[:-60], y=train_target)
        self.classifier.fit(X=X[:-150], y=train_target)
        self.coef = self.classifier.coef_  # here the weights of the features will be stored

        # self.get_feature_importance(self.classifier, feature_names=self.features_names, top_features=10)

        return X

    def predict(self, target_data):
        '''
        Get predictions for the target data
        :param target_data: array of feature sets for each tweet
        :return:
        '''
        predictions = self.classifier.predict(target_data)
        self.classifier_predictions = predictions.tolist()
        class_probabilities = self.classifier.predict_proba(target_data)
        # print predictions
        print class_probabilities
        print len(class_probabilities)

        for pos, tweet in enumerate(self.raw_tweets):
            prediction = predictions[pos]
            confidence_score = class_probabilities[pos, prediction]
            # confidence_score = class_probabilities[pos]
            verdict = bool(prediction == 1)

            self.db_connection.find_and_update(
                collection=DB.RELEVANT_TWEET_COLLECTION,
                query={"_id": tweet[TWEET.ID]},
                update={
                    "$set": {
                        TWEET.CONFIDENCE_SCORE: confidence_score,
                        TWEET.PREDICTED_VERDICT: verdict
                    }
                })
            print pos
            print tweet
            print "----"

    def evaluate_classifier(self):
        target_names = ["false", "true"]
        # labeler1 = [2, 0, 2, 2, 0, 1]
        # labeler2 = [0, 0, 2, 2, 0, 2]
        print "predictions - true: %s" % self.classifier_predictions.count(1)
        print "predictions - false: %s" % self.classifier_predictions.count(0)
        print "ground truth - true: %s" % self.gold_results.count(1)
        print "ground truth - false: %s" % self.gold_results.count(0)
        kappa_score = cohen_kappa_score(self.classifier_predictions,
                                        self.gold_results)
        print("kappa score: %s" % kappa_score)
        print(
            classification_report(self.gold_results,
                                  self.classifier_predictions,
                                  target_names=target_names))
        tn, fp, fn, tp = confusion_matrix(self.gold_results,
                                          self.classifier_predictions).ravel()
        print(tn, fp, fn, tp)

    def get_best_hyperparameters(self, X, y):
        Cs = [0.001, 0.01, 0.1, 1, 10]
        gammas = [0.001, 0.01, 0.1, 1]
        param_grid = {'C': Cs, 'gamma': gammas}
        grid_search = GridSearchCV(svm.SVC(kernel='linear'), param_grid, cv=25)
        grid_search.fit(X, y)
        print grid_search.best_params_

    def get_feature_importance(self,
                               classifier,
                               feature_names,
                               top_features=20):
        rcParams.update({'figure.autolayout': True})
        # imp = self.coef
        # imp, names = zip(*sorted(zip(imp, self.features_names)))
        # plt.barh(range(len(names)), imp, align='center')
        # plt.yticks(range(len(names)), names)
        # plt.show()
        coef = classifier.coef_.ravel()
        top_positive_coefficients = np.argsort(coef)[-top_features:]
        top_negative_coefficients = np.argsort(coef)[:top_features]
        top_coefficients = np.hstack(
            [top_negative_coefficients, top_positive_coefficients])
        # create plot
        plt.figure(figsize=(10, 10))
        colors = ["red" if c < 0 else "green" for c in coef[top_coefficients]]
        plt.bar(np.arange(2 * top_features),
                coef[top_coefficients],
                color=colors)
        feature_names = np.array(feature_names)
        plt.xticks(np.arange(0, 2 * top_features),
                   feature_names[top_coefficients],
                   rotation=60,
                   ha="right")
        # plt.xticks(np.arange(0, 1 + 2 * top_features), feature_names[top_coefficients], rotation=90)
        plt.ylabel("Feature coefficient")
        plt.xlabel("Feature name")
        plt.show()

    def get_ground_truth_set(self):
        tweets = self.db_connection.find_document(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            filter={
                "$and": [{
                    TWEET.AGGREGATE_LABEL: {
                        "$exists": False
                    }
                }, {
                    TWEET.TOPICS: {
                        "$exists": True
                    }
                }, {
                    TWEET.ENTITIES_COUNT: {
                        "$gt": 0
                    }
                }, {
                    TWEET.SET_TO_FACTCHECK: True
                }]
            },
            projection={"text": 1})

        print tweets.count()
        total_count = 0
        for tweet in tweets:
            print tweet
            print "-------------"
            verdict = raw_input("Is the tweet true?\n")
            verdict = int("y" in verdict)
            self.db_connection.find_and_update(
                collection=DB.RELEVANT_TWEET_COLLECTION,
                query={"_id": tweet["_id"]},
                update={
                    "$set": {
                        TWEET.AGGREGATE_LABEL: verdict,
                        TWEET.GOLDEN: True
                    }
                })

            total_count += 1

    def populate_authors(self):
        valid_tweets = self.db_connection.find_document(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            filter={TWEET.CONFIDENCE_SCORE: {
                "$exists": True
            }},
            projection={
                TWEET.AUTHOR_ID: 1,
                TWEET.CONFIDENCE_SCORE: 1
            })

        for tweet in valid_tweets:
            self.db_connection.find_and_update(
                collection=DB.MP_COLLECTION,
                query={MP.ID: tweet[TWEET.AUTHOR_ID]},
                update={
                    "$inc": {
                        MP.FACTUAL_SCORE: tweet[TWEET.CONFIDENCE_SCORE],
                        MP.NO_FACT_CHECKED_TWEETS: 1
                    }
                })
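Stripped of the MongoDB feature assembly, the core of `train`/`predict` is a linear-kernel `svm.SVC` fitted on scaled features. A self-contained sketch with a toy feature matrix (the numbers and labels are invented, not the project's features):

import numpy as np
from sklearn import preprocessing, svm

# Each row is one item's numeric features; labels are 1 = true, 0 = false.
X = np.array([[140, 20, 1, 0], [80, 12, 0, 1], [200, 30, 1, 1],
              [50, 8, 0, 0], [160, 25, 1, 0], [70, 10, 0, 1],
              [180, 28, 1, 1], [60, 9, 0, 0], [150, 22, 1, 0],
              [90, 14, 0, 1]])
y = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0])

X = preprocessing.scale(X)  # zero mean, unit variance per column
clf = svm.SVC(probability=True, kernel='linear', C=1, gamma=1)
clf.fit(X, y)

print(clf.predict(X))        # hard class decisions
print(clf.predict_proba(X))  # per-class probabilities
print(clf.coef_)             # feature weights of the linear kernel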
Example #15
class Reddit(object):
    def __init__(self):
        self.api = praw.Reddit(client_id='DiI57R025MBQLQ',
                               client_secret='4IaDtRqQrX4jIEDZeYqh_y4cJCA',
                               user_agent='script')

        self.db_connection = DBConnection()
        self.subreddit_list = []
        self.submission_list = []

    def get_subreddit(self, name):
        # self.api is the praw.Reddit instance created in __init__
        subreddit = self.api.subreddit(name)
        self.subreddit_list.append(subreddit)

        # print(subreddit.display_name)
        # print(subreddit.title)
        # print(subreddit.description)

    def get_top_comments(self, subreddit):
        count = 0
        for submission in subreddit.new():
        # for submission in subreddit.top(time_filter='month'):
        # for submission in subreddit.new(limit=1000):
            count += 1
            # print(dir(submission))
            if "spoiler" in submission.title.lower() or submission.spoiler:
                # comments = submission.comments.list()
                # for comment in comments:
                #     print(comment.body)
                print(submission.title)
                print(submission.selftext)
                if submission.selftext != "":
                    doc = {
                        SPOILER.ID: submission.id,
                        SPOILER.TITLE: submission.title,
                        SPOILER.CONTENT: submission.selftext,
                        SPOILER.SHOW: 'Breaking Bad',
                    }
                    try:
                        self.db_connection.insert(doc)
                    except Exception:
                        pass



            # print(submission.title)
            # print(submission.selftext)
            # comments = submission.comments.list()
            # for comment in comments:
            #     print(comment.body)
            # print(submission.title)  # Output: the submission's title
            # print(submission.score)  # Output: the submission's score
            # print(submission.id)  # Output: the submission's ID
            # print(submission.url)
            # submission.# Output: the URL the submission points to
            # print("-------")
            # or the submission's URL if it's a self post

            # top_level_comments = list(submission.comments)
            # all_comments = submission.comments.list()
        print(count)

if __name__ == "__main__":
    r = Reddit()
    r.get_subreddit(name="breakingbad")
    r.get_top_comments(subreddit=r.subreddit_list[0])
Example #16
 def __init__(self):
     self.db_connection = DBConnection()
     # self.iris = datasets.load_iris()
     # self.digits = datasets.load_digits()
     # self.classifier = svm.SVC(probability=True, kernel='linear')
     self.classifier = svm.SVC(probability=True,
                               kernel='linear',
                               C=1,
                               gamma=1)
     self.clean_train_data = []
     self.classifier_predictions = None
     self.gold_results = None
     Cs = [0.001, 0.01, 0.1, 1, 10]
     gammas = [0.001, 0.01, 0.1, 1]
     self.coef = None
     self.raw_tweets = []
     self.features_names = [
         "tweet_%s" % TWEET.CHARACTER_COUNT,
         "tweet_%s" % TWEET.WORD_COUNT,
         "tweet_%s" % TWEET.CONTAINS_QM,
         "tweet_%s" % TWEET.CONTAINS_EM,
         "tweet_%s" % TWEET.CONTAINS_MULTIPLE_MARKS,
         "tweet_%s" % TWEET.FRACTION_CAPITALISED,
         "tweet_%s" % TWEET.CONTAINS_HAPPY_EMOJI,
         "tweet_%s" % TWEET.CONTAINS_SAD_EMOJI,
         "tweet_%s" % TWEET.CONTAINS_HAPPY_EMOTICON,
         "tweet_%s" % TWEET.CONTAINS_SAD_EMOTICON,
         "tweet_%s" % TWEET.CONTAINS_PRONOUNS,
         "tweet_%s" % TWEET.CONTAINS_DOMAIN_TOP10,
         "tweet_%s" % TWEET.CONTAINS_DOMAIN_TOP30,
         "tweet_%s" % TWEET.CONTAINS_DOMAIN_TOP50,
         "tweet_%s" % TWEET.MENTIONS_USER,
         "tweet_%s" % TWEET.CONTAINS_STOCK_SYMBOL,
         "tweet_%s" % TWEET.PUBLISH_WEEKDAY,
         "tweet_%s" % TWEET.POSITIVE_WORD_COUNT,
         "tweet_%s" % TWEET.NEGATIVE_WORD_COUNT,
         "tweet_%s" % TWEET.SENTIMENT_SCORE,
         "tweet_%s" % TWEET.AVERAGE_ENTITY_CERTAINTY,
         "tweet_%s" % TWEET.AVERAGE_KEYWORD_CERTAINTY,
         "tweet_%s" % TWEET.ENTITIES_COUNT,
         "tweet_%s" % TWEET.KEYWORDS_COUNT,
         "tweet_%s" % TWEET.RELEVANCY_DAY,
         "tweet_%s" % TWEET.RELEVANCY_WEEK,
         "tweet_%s" % TWEET.RELEVANCY_TWO_WEEKS,
         "tweet_%s" % TWEET.CONTAINS_FIGURES,
         "tweet_%s" % TWEET.FRAC_NOT_IN_DICT,
         "mp_%s" % MP.FOLLOWERS_COUNT,
         "mp_%s" % MP.FRIENDS_COUNT,
         "mp_%s" % MP.TWEET_COUNT,
         "mp_%s" % MP.IS_VERIFIED,
         "mp_%s" % MP.AVERAGE_NO_RETWEETS,
         "mp_%s" % MP.AVERAGE_NO_FAVOURITES,
         "mp_%s" % MP.ACCOUNT_DAYS,
         "topic_%s" % TOPIC.TWEET_COUNT,
         "topic_%s" % TOPIC.TWEET_AVERAGE_LENGTH,
         "topic_%s" % TOPIC.FRAC_CONTAINING_QM,
         "topic_%s" % TOPIC.FRAC_CONTAINING_EM,
         "topic_%s" % TOPIC.FRAC_CONTAINING_MULTIPLE_MARKS,
         "topic_%s" % TOPIC.FRAC_CONTAINING_HAPPY_EMOTICON,
         "topic_%s" % TOPIC.FRAC_CONTAINING_SAD_EMOTICON,
         "topic_%s" % TOPIC.FRAC_CONTAINING_HAPPY_EMOJI,
         "topic_%s" % TOPIC.FRAC_CONTAINING_SAD_EMOJI,
         "topic_%s" % TOPIC.FRAC_CONTAINING_PRONOUNS,
         "topic_%s" % TOPIC.FRAC_CONTAINING_FIGURES,
         "topic_%s" % TOPIC.FRAC_CONTAINING_UPPERCASE,
         "topic_%s" % TOPIC.FRAC_CONTAINING_URL,
         "topic_%s" % TOPIC.FRAC_CONTAINING_USER_MENTION,
         "topic_%s" % TOPIC.FRAC_CONTAINING_HASHTAGS,
         "topic_%s" % TOPIC.FRAC_CONTAINING_STOCK_SYMBOLS,
         "topic_%s" % TOPIC.AVERAGE_SENTIMENT_SCORE,
         "topic_%s" % TOPIC.FRAC_CONTAINING_POSITIVE_SENTIMENT,
         "topic_%s" % TOPIC.FRAC_CONTAINING_NEGATIVE_SENTIMENT,
         "topic_%s" % TOPIC.FRAC_CONTAINING_DOMAIN10,
         "topic_%s" % TOPIC.FRAC_CONTAINING_DOMAIN30,
         "topic_%s" % TOPIC.FRAC_CONTAINING_DOMAIN50,
         "topic_%s" % TOPIC.DISTINCT_URLS_COUNT,
         "topic_%s" % TOPIC.FRAC_CONTAINING_MOST_VISITED_URL,
         "topic_%s" % TOPIC.DISTINCT_HASHTAG_COUNT,
         "topic_%s" % TOPIC.FRAC_CONTAINING_MOST_USED_HASHTAG,
         "topic_%s" % TOPIC.DISTINCT_USER_MENTION_COUNT,
         "topic_%s" % TOPIC.FRAC_CONTAINING_MOST_MENTIONED_USER,
         "topic_%s" % TOPIC.DISTINCT_TWEET_AUTHOR_COUNT,
         "topic_%s" % TOPIC.FRAC_CONTAINING_TOP_AUTHOR,
         "topic_%s" % TOPIC.AVERAGE_AUTHOR_TWITTER_LIFE,
         "topic_%s" % TOPIC.AVERAGE_AUTHOR_TWEET_COUNT,
         "topic_%s" % TOPIC.AVERAGE_AUTHOR_FOLLOWER_COUNT,
         "topic_%s" % TOPIC.AVERAGE_AUTHOR_FRIEND_COUNT,
         "topic_%s" % TOPIC.FRAC_FROM_VERIFIED,
         "topic_%s" % TOPIC.AVERAGE_DAY_RELEVANCE,
         "topic_%s" % TOPIC.AVERAGE_WEEK_RELEVANCE,
         "topic_%s" % TOPIC.AVERAGE_2WEEK_RELEVANCE,
         "topic_%s" % TOPIC.AVERAGE_WORDS_NOT_IN_DICT
     ]
Example #17
from __future__ import unicode_literals
from __future__ import division

import json

from bson.objectid import ObjectId
from django.http import HttpResponse
from ingest_engine.twitter_ingest import Twitter
from cons import CREDS, DB, TWEET, MP
from db_engine import DBConnection
from django.shortcuts import render
from .models import MemberParliament, Tweet
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
db_connection = DBConnection()
twitter_api = Twitter(os.environ.get(CREDS.TWITTER_KEY),
                      os.environ.get(CREDS.TWITTER_SECRET),
                      os.environ.get(CREDS.TWITTER_TOKEN),
                      os.environ.get(CREDS.TWITTER_TOKEN_SECRET),
                      db_connection)


def index(request):
    if 'mp_search' in request.GET:
        mp_name = request.GET['mp_search']
        mp_list = db_connection.find_document(collection=DB.MP_COLLECTION,
                                              filter={"name": {"$regex": mp_name.title()}})
    # if 'mp_search' in request.GET:
    #     mp_name = request.GET['mp_search']
    #     mp_list = MemberParliament.objects.filter(name__contains=mp_name.title())
Example #18
    def __init__(self, parent):
        self.db_connection = DBConnection()
        self.bulk_count = 0

        tk.Frame.__init__(self, parent)
        # create a prompt, an input box, an output label,
        # and a button to do the computation
        self.prompt = tk.Label(self,
                               text="Enter a number:",
                               anchor="w",
                               wraplength=500)
        self.entities_prompt = tk.Label(self,
                                        text="entities",
                                        anchor="w",
                                        wraplength=500)
        # self.entry = tk.Entry(self)
        self.tp = tk.Button(self,
                            text="Is an entity and API says it's an entity",
                            command=self.calculate1)
        self.tn = tk.Button(self,
                            text="Is not an entity, API does not include it",
                            command=self.calculate2)
        self.fp = tk.Button(self,
                            text='Is not an entity, API includes it',
                            command=self.calculate3)
        self.fn = tk.Button(self,
                            text='Is an entity, API does not include it',
                            command=self.calculate4)
        self.output = tk.Label(self, text="")

        # lay the widgets out on the screen.
        self.prompt.pack(side="top", fill="x")
        self.entities_prompt.pack(side="bottom")
        # self.entry.pack(side="top", fill="x", padx=20)
        self.output.pack(side="top", fill="x", expand=True)
        self.fn.pack(side="bottom")
        self.fp.pack(side="bottom")
        self.tn.pack(side="bottom")
        self.tp.pack(side="bottom")

        self.tweets = self.db_connection.get_random_sample(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            query={
                "$and": [{
                    TWEET.SET_TO_FACTCHECK: True
                }, {
                    TWEET.ENTITIES_COUNT: {
                        "$eq": 1
                    }
                }]
            },
            size=200)

        self.current = self.tweets.next()
        self.prompt.configure(text=self.current["text"])
        self.entities_prompt.configure(
            text="Entities: %s" %
            [x['entity'] for x in self.current["entities"]])
        self.tp = 0
        self.tn = 0
        self.fp = 0
        self.fn = 0
Example #19
class CrowdFlower(object):
    def __init__(self):
        self.client = crowdflower.client.Client(
            os.getenv("CROWDFLOWER_API_KEY"))
        self.db_connection = DBConnection()
        self.api_key = os.getenv("CROWDFLOWER_API_KEY")
        self.judgements_session = requests.session()
        self.nlu = NaturalLanguageUnderstandingV1(
            version='2017-02-27',
            username="******",
            password="******")

        # self.connection = crowdflower.Connection(api_key=os.getenv("CROWDFLOWER_API_KEY"))

    def chunks(self, l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def get_jobs(self):
        job = self.client.get_job(1239688)
        job.cml = """
<div class="html-element-wrapper">
	<h2>The tweet you are evaluating is:</h2>
	<p>{{tweet_content}}</p>
	<h2><strong>This tweet's entities are:</strong></h2>
	<ul>
		{% for entity in entity_list %}
			<li>{{ entity_list[forloop.index0] }}</li>
		{% endfor %}
	</ul>
</div>
<cml:radios label="Do you understand the tweet?" validates="required" gold="true">
	<cml:radio label="Yes" value="yes" />
	<cml:radio label="No" value="no" />
</cml:radios>
<cml:select label="Please indicate the first entity of your relation: NOTE: THIS MUST BE DIFFERENT FROM THE SECOND ENTITY." validates="required" gold="true">
	{% for entity in entity_list %}
		<cml:option label="{{ entity_list[forloop.index0] }}" value="{{ entity_list[forloop.index0] }}" />
	{% endfor %}
</cml:select>
<cml:select label="Please indicate the second entity of your relation: NOTE: THIS MUST BE DIFFERENT FROM THE FIRST ENTITY." validates="required" gold="true">
	{% for entity in entity_list %}
		<cml:option label="{{ entity_list[forloop.index0] }}" value="{{ entity_list[forloop.index0] }}" />
	{% endfor %}
</cml:select>
<cml:text label="What is the SIMPLE relationship between the entities you have chosen" validates="required" gold="true" />
	<cml:radios label="Do you think the topic of the tweet is politically important?" validates="required" gold="true">
	<cml:radio label="Yes" value="yes" />
	<cml:radio label="No" value="no" />
</cml:radios><cml:text label="What is the first word in the tweet?" validates="required" gold="true" />
        """
        job.update()

    def get_judgements(self, job_id):
        page_no = 1
        index_resolver = {
            "tweet1": 0,
            "tweet2": 1,
            "tweet3": 2,
            "tweet4": 3,
            "tweet5": 4,
            "tweet6": 5,
            "tweet7": 6,
            "tweet8": 7,
            "tweet9": 8,
            "tweet10": 9
        }
        results = self.judgements_session.get(
            url=
            "https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
            % (job_id, self.api_key, page_no))

        content = json.loads(results.content)
        no_count = 0
        yes_count = 0
        for key, result in content.iteritems():
            answers = result[CF.FACTCHECKABLE_ANSWERS]
            answers = answers['res']
            tweets_to_check = {}
            for answer in answers:
                if len(answer) != 10:
                    for tweet in answer:
                        if tweet not in tweets_to_check:
                            tweets_to_check[tweet] = 1

                        else:
                            tweets_to_check[tweet] = tweets_to_check[tweet] + 1

            tweet_list = result[CF.TWEET_LIST]

            for tweet, occurrence in tweets_to_check.iteritems():
                text = tweet_list[index_resolver.get(tweet)]
                if occurrence > 1:
                    yes_count += 1
                    # text = tweet_list[index_resolver.get(tweet)]
                    # self.db_connection.find_and_update(collection=DB.RELEVANT_TWEET_COLLECTION,
                    #                                    query={"text": text, TWEET.SET_TO_FACTCHECK: {"$exists": False}},
                    #                                    update={"$set": {TWEET.SET_TO_FACTCHECK: True}})

                else:
                    self.db_connection.find_and_update(
                        collection=DB.RELEVANT_TWEET_COLLECTION,
                        query={
                            "text": text,
                            TWEET.SET_TO_FACTCHECK: {
                                "$exists": False
                            }
                        },
                        update={"$set": {
                            TWEET.SET_TO_FACTCHECK: False
                        }})

        print yes_count
        print no_count

        # json_result = json.loads(results)

    def get_fact_opinion(self, job_id):
        crowd_data = []
        tweet_list = []
        job = self.client.get_job(job_id)
        tweets = self.db_connection.find_document(
            collection=DB.TWEET_COLLECTION,
            filter={
                "$and": [{
                    "created_at_epoch": {
                        "$gt": 1520812800
                    }
                }, {
                    "created_at_epoch": {
                        "$lt": 1523491200
                    }
                }, {
                    "entities": {
                        "$exists": True
                    }
                }, {
                    "keywords": {
                        "$exists": True
                    }
                }]
            },
            projection={
                "text": 1,
                "entities": 1,
                "keywords": 1
            },
            limit=2000,
            sort=True,
            sort_field="retweet_count")

        for tweet in tweets:
            if len(tweet['entities']) > 2 and len(tweet['keywords']) > 2:
                tweet_list.append(tweet['text'])

        data_list = list(self.chunks(tweet_list, 10))  # Chunk data
        for data in data_list:
            if len(data) == 10:
                crowd_data.append({"tweet_list": data})

        job.upload(data=crowd_data, force=True)

    def process_job(self):
        data_list = []
        job = self.client.get_job(1256982)
        tweets = self.db_connection.find_document(
            collection=DB.TWEET_COLLECTION,
            filter={
                "$and": [{
                    "created_at_epoch": {
                        "$gt": 1520812800
                    }
                }, {
                    "created_at_epoch": {
                        "$lt": 1523491200
                    }
                }, {
                    "entities": {
                        "$exists": True
                    }
                }, {
                    "keywords": {
                        "$exists": True
                    }
                }]
            },
            projection={
                "text": 1,
                "entities": 1,
                "keywords": 1
            })

        for tweet in tweets:
            if len(tweet['entities']) > 2 and len(tweet['keywords']) > 2:
                entities = []
                for entity_data in tweet['entities']:
                    if entity_data['entity'] not in entities:
                        entities.append(entity_data['entity'])

                data_list.append({
                    "tweet_content": tweet["text"],
                    "entity_list": entities,
                    "keyword_list": tweet["keywords"],
                    "full_list": entities + tweet["keywords"]
                })

        job.upload(data=data_list, force=True)

    def update_data(self, tweet_content, entity_list):
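        '''
        Uploads a single row containing the tweet text plus up to ten entity and
        dropdown columns (entity1..entity10) to Figure Eight job 1239688
        :param tweet_content:
        :param entity_list:
        :return:
        '''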
        job = self.client.get_job(1239688)
        data_list = []
        entity_amount = 0
        data = {CF.TWEET_CONTENT: tweet_content, "entity_list": entity_list}
        for index, entity in enumerate(entity_list):
            entity_no = index + 1
            data['entity%s' % entity_no] = entity
            data['dropdown%s' % entity_no] = entity
            entity_amount += 1

        data[CF.ENTITY_AMOUNT] = entity_amount

        if len(entity_list) < 10:
            # empty_entities = 10 - len(entity_list)
            for i in range(len(entity_list) + 1, 11):
                data['entity%s' % i] = ""

        data_list.append(data)

        job.upload(data=data_list, force=True)

    def fact_checking_processing(self, job_id):
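        '''
        Takes a random sample of 300 relevant tweets that are set_to_factcheck
        but not yet crowdsourced, marks them as crowdsourced in the database and
        uploads them to the given job
        :param job_id:
        :return:
        '''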
        data_list = []
        job = self.client.get_job(job_id)
        # tweets = self.db_connection.find_document(collection=DB.RELEVANT_TWEET_COLLECTION,
        #                                           filter={"$and":[{TWEET.SET_TO_FACTCHECK: True},
        #                                                           {"crowdsourced":{"$exists": False}}]},
        #                                           projection={TWEET.TEXT: 1})

        tweets = list(
            self.db_connection.get_random_sample(
                collection=DB.RELEVANT_TWEET_COLLECTION,
                query={
                    "$and": [{
                        "set_to_factcheck": True
                    }, {
                        "crowdsourced": {
                            "$exists": False
                        }
                    }]
                },
                size=300))

        bulk_op = self.db_connection.start_bulk_upsert(
            collection=DB.RELEVANT_TWEET_COLLECTION)
        # print tweets.count()
        for tweet in tweets:
            data_list.append({"tweet": tweet["text"]})
            self.db_connection.add_to_bulk_upsert(query={"_id": tweet["_id"]},
                                                  data={"crowdsourced": True},
                                                  bulk_op=bulk_op)

        self.db_connection.end_bulk_upsert(bulk_op=bulk_op)

        job.upload(data=data_list, force=True)

    def get_old_judgements(self, job_id):
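        '''
        Downloads a page of judgements for the given job and interactively
        tallies true positives, false negatives and relation quality via
        raw_input prompts
        :param job_id:
        :return:
        '''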
        page_no = 1
        crowd_results = {}
        results = self.judgements_session.get(
            url=
            "https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
            % (job_id, self.api_key, page_no))

        content = json.loads(results.content)
        for key, result in content.iteritems():
            crowd_results[result['tweet_content']] = {
                "first_entity":
                result[
                    "please_indicate_the_first_entity_of_your_link_note_this_must_be_different_from_the_second_entity"]
                ["res"],
                "second_entity":
                result[
                    "please_indicate_the_second_entity_of_your_link_note_this_must_be_different_from_the_first_entity"]
                ["res"],
                "simple_relation":
                result[
                    "what_is_the_simple_link_between_the_entities_you_have_chosen"],
                "entity_list":
                result["entity_list"]
            }

        total_tp = 0
        total_fn = 0
        correct_relations = 0
        incorrect_relations = 0
        wrong_instructions = 0
        for key, value in crowd_results.iteritems():
            print key
            print "---------------------------------------"
            print "entity list: %s" % value["entity_list"]
            print "first_entities: %s" % value['first_entity']
            print "Second entities: %s" % value['second_entity']
            print "Simple relation: %s" % value['simple_relation']
            print "---------------------------------------"
            tp = int(raw_input("tp?\n"))
            total_tp += tp
            fn = int(raw_input("fn?\n"))
            total_fn += fn
            corr_r = int(raw_input("correct relations (small verb)?\n"))
            correct_relations += corr_r
            incc_r = int(raw_input("incorrect relations (small verb) ?\n"))
            incorrect_relations += incc_r
            wrong_ins = int(raw_input("wrong instructions (long phrase) ?\n"))
            wrong_instructions += wrong_ins
            print "---------------------------------------\n\n\n\n\n\n\n\n\n\n\n"

        print "tp: %d" % total_tp
        print "fn: %d" % total_fn
        print "correct relations: %d" % correct_relations
        print "incorrect relations: %d" % incorrect_relations
        print "wrong instructions: %d" % wrong_instructions

    def check_relations(self, job_id):
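        '''
        Prints the subject/verb/object triples returned by Watson NLU semantic
        role analysis for each judged tweet so they can be manually marked as
        valid
        :param job_id:
        :return:
        '''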
        page_no = 1
        tweet_list = []
        results = self.judgements_session.get(
            url=
            "https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
            % (job_id, self.api_key, page_no))

        content = json.loads(results.content)
        for key, result in content.iteritems():
            tweet_list.append(result['tweet_content'])

        total_relations = len(tweet_list)
        valid_relations = 0
        for tweet in tweet_list:
            relations = self.nlu.analyze(
                text=tweet,
                features=Features(semantic_roles=SemanticRolesVerb()))
            print tweet
            semantic_roles = relations["semantic_roles"]
            for entry in semantic_roles:
                print "subject: %s" % entry["subject"]["text"]
                print "verb: %s" % entry["action"]["text"]
                if "object" in entry:
                    print "object: %s" % entry["object"]["text"]
                print "------------------------------------------"
                valid = raw_input("valid?\n")
                if valid == "y":
                    valid_relations += 1

        print valid_relations

    def get_factchecking_judgements(self, job_id):
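        '''
        Downloads a page of fact-checking judgements and writes the per-tweet
        rating counts, aggregate label, evidence, sources and authors back to
        the relevant tweet collection
        :param job_id:
        :return:
        '''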
        index_resolver = {
            'almost_definitely_true': 1,
            'likely_to_be_false': 0,
            'almost_definitely_false': 0,
            'very_ambiguous__i_really_cant_decide': -1
        }

        page_no = 2
        results = self.judgements_session.get(
            url=
            "https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
            % (job_id, self.api_key, page_no))

        content = json.loads(results.content)
        for key, result in content.iteritems():
            almost_definitely_true_count = 0
            likely_to_be_false_count = 0
            almost_definitely_false_count = 0
            ambiguous_count = 0

            tweet = result['tweet']
            evidence = result['evidence']['res']
            source_list = result['source']
            author_list = result['author']
            aggregate_rating = index_resolver.get(result['rating']['agg'])
            for value in result['rating']['res']:
                if value == 'almost_definitely_true':
                    almost_definitely_true_count += 1

                elif value == 'likely_to_be_false':
                    likely_to_be_false_count += 1

                elif value == 'almost_definitely_false':
                    almost_definitely_false_count += 1

                elif value == 'very_ambiguous__i_really_cant_decide':
                    ambiguous_count += 1

            doc = {
                TWEET.ALMOST_DEFINITELY_TRUE_COUNT:
                almost_definitely_true_count,
                TWEET.LIKELY_TO_BE_FALSE_COUNT:
                likely_to_be_false_count,
                TWEET.ALMOST_DEFINITELY_FALSE_COUNT:
                almost_definitely_false_count,
                TWEET.AMBIGUOUS_COUNT:
                ambiguous_count,
                TWEET.AGGREGATE_LABEL:
                aggregate_rating,
                TWEET.TOTAL_CROWDSOURCING_COUNT:
                almost_definitely_true_count + likely_to_be_false_count +
                almost_definitely_false_count + ambiguous_count,
                TWEET.EVIDENCE:
                evidence,
                TWEET.CROWDSOURCING_SOURCE_LIST:
                source_list,
                TWEET.CROWDSOURCING_AUTHOR_LIST:
                author_list
            }

            self.db_connection.find_and_update(
                collection=DB.RELEVANT_TWEET_COLLECTION,
                query={"text": tweet},
                update={"$set": doc})

    def evaluate_interesting_statements(self, job_id):
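        '''
        Compares the crowd's selections of fact-checkable tweets against the
        set_to_factcheck labels stored in the database and prints the resulting
        confusion matrix counts
        :param job_id:
        :return:
        '''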
        # index_resolver = {
        #     "tweet1": res
        # }
        page_no = 2
        tp = 0
        tn = 0
        fp = 0
        fn = 0

        results = self.judgements_session.get(
            url=
            "https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
            % (job_id, self.api_key, page_no))

        content = json.loads(results.content)
        total_judgements = 0
        for key, result in content.iteritems():
            labels = result[
                'tick_the_box_of_the_tweets_that_are_politically_important_andor_worth_factchecking'][
                    'res']
            for entry in labels:
                all_tweets = [
                    "tweet1", "tweet2", "tweet3", "tweet4", "tweet5", "tweet6",
                    "tweet7", "tweet8", "tweet9", "tweet10"
                ]
                for tweet_value in entry:
                    tweet = self.index_resolver(
                        list_to_check=result['tweet_list'], value=tweet_value)
                    verdict = self.db_connection.find_document(
                        collection=DB.RELEVANT_TWEET_COLLECTION,
                        filter={
                            "text": tweet
                        },
                        projection={
                            TWEET.SET_TO_FACTCHECK: 1
                        }).next()

                    if TWEET.SET_TO_FACTCHECK not in verdict:
                        verdict = False

                    else:
                        verdict = verdict[TWEET.SET_TO_FACTCHECK]

                    if verdict:
                        tp += 1

                    else:
                        fp += 1

                    all_tweets.remove(tweet_value)
                    total_judgements += 1

                for tweet_value in all_tweets:
                    tweet = self.index_resolver(
                        list_to_check=result['tweet_list'], value=tweet_value)
                    verdict = self.db_connection.find_document(
                        collection=DB.RELEVANT_TWEET_COLLECTION,
                        filter={
                            "text": tweet
                        },
                        projection={
                            TWEET.SET_TO_FACTCHECK: 1
                        }).next()

                    if TWEET.SET_TO_FACTCHECK not in verdict:
                        verdict = False

                    else:
                        verdict = verdict[TWEET.SET_TO_FACTCHECK]

                    if verdict:
                        fn += 1

                    else:
                        tn += 1

        print "tp: %s" % tp
        print "tn: %s" % tn
        print "fp: %s" % fp
        print "fn: %s" % fn
        print "total judgements: %s" % total_judgements

        # for value in result["tick_the_box_of_the_tweets_that_are_politically_important_andor_worth_factchecking"]["res"]:

    def evaluate_factchecking(self, job_id):
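        '''
        Asks the operator whether each judged tweet is actually true and scores
        the crowd ratings against that verdict as tp/tn/fp/fn
        :param job_id:
        :return:
        '''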
        page_no = 4
        tp = 0
        tn = 0
        fp = 0
        fn = 0
        index_resolver = {
            'almost_definitely_true': 1,
            'likely_to_be_false': 0,
            'almost_definitely_false': 0,
            'very_ambiguous__i_really_cant_decide': -1
        }

        results = self.judgements_session.get(
            url=
            "https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
            % (job_id, self.api_key, page_no))

        content = json.loads(results.content)
        total_judgements = 0
        for key, result in content.iteritems():
            tweet = result['tweet']
            print tweet
            ratings = result['rating']['res']
            print ratings
            verdict = raw_input("Is the tweet true?\n")
            verdict = verdict == "y"
            for rating in ratings:

                judgement = index_resolver.get(rating)
                if verdict:
                    if judgement == 1:
                        tp += 1

                    elif judgement == 0:
                        fn += 1

                else:
                    if judgement == 1:
                        fp += 1

                    elif judgement == 0:
                        tn += 1

                total_judgements += 1

        print "tp: %s" % tp
        print "tn: %s" % tn
        print "fp: %s" % fp
        print "fn: %s" % fn
        print "total judgements: %s" % total_judgements

    def index_resolver(self, list_to_check, value):
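        '''
        Maps a "tweetN" field name to the corresponding element of list_to_check
        :param list_to_check:
        :param value:
        :return:
        '''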
        resolver = {
            "tweet1": list_to_check[0],
            "tweet2": list_to_check[1],
            "tweet3": list_to_check[2],
            "tweet4": list_to_check[3],
            "tweet5": list_to_check[4],
            "tweet6": list_to_check[5],
            "tweet7": list_to_check[6],
            "tweet8": list_to_check[7],
            "tweet9": list_to_check[8],
            "tweet10": list_to_check[9],
        }

        return resolver.get(value)

    def evaluate_worker_background(self):
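        '''
        Counts how many crowd workers came from each country listed in
        background.txt
        :return:
        '''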
        countries = {}
        total = 0
        with open("background.txt") as f:
            for line in f:
                line = line.strip()
                total += 1
                if line not in countries:
                    countries[line] = 1

                else:
                    countries[line] = countries[line] + 1

        print countries
        print "total responses: %d" % total

    def evalute_factchecking_info(self, job_id1, job_id2):
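        '''
        Aggregates the crowd rating counts (almost definitely true, likely to be
        false, almost definitely false, ambiguous) across the first three pages
        of judgements for the two given fact-checking jobs
        :param job_id1:
        :param job_id2:
        :return:
        '''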
        almost_definitely_true_count = 0
        likely_to_be_false_count = 0
        almost_definitely_false_count = 0
        ambiguous_count = 0

        # Fetch the first three pages of judgements for both jobs.
        contents = []
        for page_no in (1, 2, 3):
            for job_id in (job_id1, job_id2):
                results = self.judgements_session.get(
                    url=
                    "https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
                    % (job_id, self.api_key, page_no))
                contents.append(json.loads(results.content))

        for content in contents:
            for key, result in content.iteritems():
                source_list = result['source']
                author_list = result['author']
                # Aggregation of sources and authors into frequency
                # dictionaries was disabled in the original implementation.

                for value in result['rating']['res']:
                    if value == 'almost_definitely_true':
                        almost_definitely_true_count += 1

                    elif value == 'likely_to_be_false':
                        likely_to_be_false_count += 1

                    elif value == 'almost_definitely_false':
                        almost_definitely_false_count += 1

                    elif value == 'very_ambiguous__i_really_cant_decide':
                        ambiguous_count += 1

        print "almost definitely true: %d" % almost_definitely_true_count
        print "likely to be false: %d" % likely_to_be_false_count
        print "almost definitely false: %d" % almost_definitely_false_count
        print "ambiguous: %d" % ambiguous_count

class WikiIngest(object):
    def __init__(self):
        self.db_connection = DBConnection()
        self.logger = logging.getLogger(__name__)
        self.api = PageviewsClient(
            "Mozilla/5.0 (X11; Linux x86_64)"
            " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        )

    def get_top_articles(self, time_collect=None, historic=False):
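        '''
        Fetches the top viewed English Wikipedia articles for the previous day
        (or for time_collect when run historically) and stores them in the wiki
        trends collection, using a bulk upsert for historic back-fills
        :param time_collect:
        :param historic:
        :return:
        '''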
        if not historic:
            time_collect = datetime.now() - timedelta(days=1)

        results = self.api.top_articles(project=WIKI_SOURCES.ENGLISH_WIKIPEDIA,
                                        year=time_collect.year,
                                        month=time_collect.month,
                                        day=time_collect.day)

        timestamp = calendar.timegm(time_collect.timetuple())
        articles_to_insert = []
        bulk_op = None
        if historic:
            bulk_op = self.db_connection.start_bulk_upsert(
                collection=DB.WIKI_TRENDS)
        for result in results:
            name = result["article"]
            if "_" in name:
                name = name.replace("_", " ")

            doc = {
                WIKI_TREND.NAME: name,
                WIKI_TREND.RANK: int(result["rank"]),
                WIKI_TREND.VIEWS: int(result["views"]),
                WIKI_TREND.TIMESTAMP: timestamp,
                WIKI_TREND.DATE_OBJECT: time_collect,
                WIKI_TREND.DATE_STRING: time_collect.strftime("%A %B %d %Y"),
                WIKI_TREND.MONTH: time_collect.strftime("%B").lower(),
                WIKI_TREND.WEEKDAY: time_collect.strftime("%A").lower(),
                WIKI_TREND.MONTH_DAY: int(time_collect.strftime("%d")),
                WIKI_TREND.YEAR: time_collect.strftime("%Y")
            }

            if historic:
                self.db_connection.add_to_bulk_upsert(query={
                    "$and": [{
                        WIKI_TREND.NAME: name
                    }, {
                        WIKI_TREND.DATE_STRING:
                        time_collect.strftime("%A %B %d %Y")
                    }]
                },
                                                      data=doc,
                                                      bulk_op=bulk_op)

            else:
                articles_to_insert.append(doc)

        if historic:
            self.db_connection.end_bulk_upsert(bulk_op=bulk_op)

        else:
            self.db_connection.bulk_insert(data=articles_to_insert,
                                           collection=DB.WIKI_TRENDS)
Exemplo n.º 21
0
class FeatureExtractor(object):
    def __init__(self):
        self.db_connection = DBConnection()
        self.sid = SentimentIntensityAnalyzer()
        self.nlu = NaturalLanguageUnderstandingV1(
            version='2017-02-27',
            username="******",
            password="******")

        self.twitter = Twitter(os.environ.get(CREDS.TWITTER_KEY),
                               os.environ.get(CREDS.TWITTER_SECRET),
                               os.environ.get(CREDS.TWITTER_TOKEN),
                               os.environ.get(CREDS.TWITTER_TOKEN_SECRET),
                               self.db_connection)
        self.session = requests.session()
        self.resolved_urls = []
        # self.session = requests.session()

    def get_extra_features(self, tweets):
        '''
        Gets extra features such as whether tweet contains figures and percentage of words not in dictionary
        :param tweets:
        :return:
        '''
        english_dict = enchant.Dict("en_GB")
        bulk_op = self.db_connection.start_bulk_upsert(
            collection=DB.RELEVANT_TWEET_COLLECTION)
        bulk_count = 0
        for tweet in tweets:
            not_english = 0
            text = re.sub(r"http\S+", "", tweet['text'])
            figures = re.findall("-?\d+", text)
            no_words = len(re.findall(r'\w+', text))
            has_figures = len(figures) > 0
            clean_text = ''.join([i for i in text if not i.isdigit()])
            clean_text = re.sub(r'[^\w]', ' ', clean_text)
            for word in clean_text.split():
                if not english_dict.check(word):
                    not_english += 1

            doc = {
                TWEET.CONTAINS_FIGURES: has_figures,
                TWEET.FRAC_NOT_IN_DICT:
                float(not_english) / no_words if no_words else 0
            }

            self.db_connection.add_to_bulk_upsert(query={"_id": tweet["_id"]},
                                                  data=doc,
                                                  bulk_op=bulk_op)
            bulk_count += 1
            if bulk_count % 100 == 0:
                self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
                bulk_op = self.db_connection.start_bulk_upsert(
                    collection=DB.RELEVANT_TWEET_COLLECTION)
                logger.info("Pushing 100 extra featured tweets to DB")

        if bulk_count > 0 and bulk_count % 100 != 0:
            self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
            logger.info("Final DB push for tweets with extra features")

    def chunks(self, l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def resolve_url(self, urls):
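        '''
        Follows a (tweet_id, short_url) pair with a GET request and appends
        (tweet_id, resolved_url) to self.resolved_urls; returns None on request
        errors
        :param urls:
        :return:
        '''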
        try:
            r = requests.get(urls[1])
            if r.status_code != 200:
                longurl = None
            else:
                longurl = r.url

            self.resolved_urls.append((urls[0], longurl))
            r.close()

        except requests.exceptions.RequestException:
            return None

    def fetch_url(self, url):
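        '''
        Resolves a (tweet_id, url) pair with a HEAD request, following
        redirects, and appends (tweet_id, final_url) to self.resolved_urls
        :param url:
        :return:
        '''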
        # urlHandler = urllib2.urlopen(url[1])
        # print urlHandler
        # session = requests.Session()  # so connections are recycled
        resp = requests.head(url[1], allow_redirects=True, timeout=3)
        # if resp.status_code == 200 or resp.status_code == 302:
        self.resolved_urls.append((url[0], resp.url))
        resp.close()
        # print "appended"

    def convert_weekday(self, weekday):
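        '''
        Converts a weekday name such as "Monday" into its WEEKDAY constant
        :param weekday:
        :return:
        '''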
        week_dict = {
            "Monday": WEEKDAY.MONDAY,
            "Tuesday": WEEKDAY.TUESDAY,
            "Wednesday": WEEKDAY.WEDNESDAY,
            "Thursday": WEEKDAY.THURSDAY,
            "Friday": WEEKDAY.FRIDAY,
            "Saturday": WEEKDAY.SATURDAY,
            "Sunday": WEEKDAY.SUNDAY
        }

        return week_dict.get(weekday)

    def get_top_websites(self):
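        '''
        Parses the top_news_domains file and inserts each domain with its rank
        into the top news domains collection
        :return:
        '''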
        domains_to_insert = []
        rank = 0
        with open("top_news_domains", "rb") as f:
            for line in f:
                line = line.decode("utf8").strip()
                if "Website" in line:
                    rank += 1
                    domain_info = {
                        DOMAIN.URL: line.split(" ")[1],
                        DOMAIN.RANK: rank
                    }
                    domains_to_insert.append(domain_info)

        self.db_connection.bulk_insert(data=domains_to_insert,
                                       collection=DB.TOP_NEWS_DOMAINS)

    def aggregate_urls(self, tweets):
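        '''
        Extracts URLs from tweet texts, resolves them in parallel with a thread
        pool and pushes the resolved URLs onto each tweet document in bulk
        :param tweets:
        :return:
        '''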
        urls_list = []
        resolved_urls = []
        bulk_count = 0
        bulk_op = self.db_connection.start_bulk_upsert(
            collection=DB.RELEVANT_TWEET_COLLECTION)
        # pool = ThreadPool(100)
        for tweet in tweets:
            # urls = re.findall(r'(https?://[^\s]+)', tweet["text"])
            urls = re.findall(
                'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                tweet["text"])
            if len(urls) > 0:
                for url in urls:
                    urls_list.append((tweet["_id"], url))

        url_chunks = self.chunks(urls_list, 100)
        for chunk in url_chunks:

            pool = ThreadPool(100)
            pool.imap_unordered(self.fetch_url, chunk)
            pool.close()
            pool.join()
            pool.terminate()

            for tweet_id, long_url in self.resolved_urls:
                self.db_connection.add_to_bulk_upsert_push(
                    query={"_id": tweet_id},
                    field=TWEET.RESOLVED_URLS,
                    value=long_url,
                    bulk_op=bulk_op)

                bulk_count += 1

            try:
                if bulk_count != 0:
                    self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
                    bulk_op = self.db_connection.start_bulk_upsert(
                        collection=DB.RELEVANT_TWEET_COLLECTION)
                    logger.info("pushing %d updates to database" % bulk_count)
                    bulk_count = 0

            except InvalidOperation as e:
                logger.warn(e)

            urls_list = []
            # resolved_urls = []
            self.resolved_urls = []

        if bulk_count != 0:
            self.db_connection.end_bulk_upsert(bulk_op=bulk_op)

    def get_tweet_urls(self, tweets):
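        '''
        Extracts and resolves tweet URLs, then flags tweets whose resolved
        domains appear in the top 10/30/50 news domains
        :param tweets:
        :return:
        '''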

        urls_list = []
        resolved_urls = []
        bulk_count = 0
        bulk_op = self.db_connection.start_bulk_upsert(
            collection=DB.RELEVANT_TWEET_COLLECTION)
        # pool = ThreadPool(100)
        for tweet in tweets:
            # urls = re.findall(r'(https?://[^\s]+)', tweet["text"])
            urls = re.findall(
                'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                tweet["text"])
            if len(urls) > 0:
                for url in urls:
                    urls_list.append((tweet["_id"], url))

        url_chunks = self.chunks(urls_list, 100)
        for chunk in url_chunks:

            # if len(urls_list) != 0 and len(urls_list) % 100 == 0:
            # threads = [threading.Thread(target=self.fetch_url, args=(url,)) for url in urls_list]
            # for thread in threads:
            #     thread.start()
            # for thread in threads:
            #     thread.join()
            pool = ThreadPool(100)
            pool.imap_unordered(self.fetch_url, chunk)
            pool.close()
            pool.join()
            pool.terminate()

            # rs = (grequests.head(u[1], timeout=2) for u in urls_list)
            # resolved = grequests.map(rs, exception_handler=exception_handler)
            # for index, long_url in enumerate(self.resolved_urls):
            # for tweet_id, long_url in self.resolved_urls:
            # if long_url:
            # long_url = long_url.url
            # tweet_id = urls_list[index][0]

            # for tweet_id, long_url in pool.map(self.resolve_url, urls_list):
            #     resolved_urls.append((tweet_id, long_url))

            for tweet_id, long_url in self.resolved_urls:
                top10 = False
                top30 = False
                top50 = False
                doc = {TWEET.VERIFIED_URLS: True}
                url = long_url.split("://")[1]
                if re.match(r'^www.', url):
                    try:
                        url = url.split("www.")[1]
                    except IndexError:
                        continue

                if "/" in url:
                    url = url.split("/")[0]

                if len(url.split('.')[0]) > 1:
                    # regexp = re.compile("/.*%s.*/" % url, re.IGNORECASE)
                    regexp = "/.*%s.*/" % url
                    # match = self.db_connection.find_document(collection=DB.TOP_NEWS_DOMAINS,
                    #                                          filter={"url": {"$regex": url}})

                    match = self.db_connection.find_document(
                        collection=DB.TOP_NEWS_DOMAINS, filter={"url": url})

                    for domain in match:
                        rank = domain["rank"]
                        if not top10:
                            top10 = rank <= 10

                        if not top30:
                            top30 = 11 <= rank <= 30

                        if not top50:
                            top50 = 31 <= rank <= 50

                    if top10:
                        doc[TWEET.CONTAINS_DOMAIN_TOP10] = top10

                    if top30:
                        doc[TWEET.CONTAINS_DOMAIN_TOP30] = top30

                    if top50:
                        doc[TWEET.CONTAINS_DOMAIN_TOP50] = top50

                self.db_connection.add_to_bulk_upsert(query={"_id": tweet_id},
                                                      data=doc,
                                                      bulk_op=bulk_op)
                bulk_count += 1

            try:
                if bulk_count != 0:
                    self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
                    bulk_op = self.db_connection.start_bulk_upsert(
                        collection=DB.RELEVANT_TWEET_COLLECTION)
                    logger.info("pushing %d updates to database" % bulk_count)
                    bulk_count = 0

            except InvalidOperation as e:
                logger.warn(e)

            urls_list = []
            # resolved_urls = []
            self.resolved_urls = []

        if bulk_count != 0:
            self.db_connection.end_bulk_upsert(bulk_op=bulk_op)

    def get_tweet_features(self, tweets):
        '''
        Given a list of tweets, extracts the necessary features for this tweet for the classifier
        This includes a tweet's:
        - Number of characters
        - Number of words
        - Contains a question mark
        - Contains an exclamation mark
        - Fraction of capital letters
        - Are there multiple exclamation marks or question marks
        - Contains happy emoji(s)
        - Contains unhappy emoji(s)
        - Contains happy emoticon
        - Contains unhappy emoticon
        - Contains pronouns
        - No.of URLS
        - Contains popular domain top 10
        - Contains popular domain top 30
        - Contains popular domain top 50
        - Mentions user
        - Contains hashtag
        - Contains stock symbol e.g. $GOOGL
        - Day of the week in which tweet was made: - Monday = 1 ...... Sunday = 7
        - No.of positive words
        - No.of negative words
        - Total final sentiment score
        - Relevance score from news: day, week, 2weeks
        - No.of entities extracted
        - No.of keywords extracted
        - Average certainty of entities extracted
        - Average relevance of keywords extracted
        :param tweets: List of tweets to perform feature extraction
        :return:
        '''

        bulk_op = self.db_connection.start_bulk_upsert(
            collection=DB.RELEVANT_TWEET_COLLECTION)
        bulk_count = 0
        for tweet in tweets:
            text = re.sub(r'http\S+', '', tweet['text'])  # Remove links
            capitalised = sum(1 for c in text if c.isupper())
            text = text.lower()
            timestamp = tweet['created_at_epoch']
            no_chars = len(re.sub(r"\s+", "", text))
            no_words = len(re.findall(r'\w+', text))
            capitalised = float(capitalised) / no_chars if no_chars else 0
            contains_qm = "?" in text
            contains_em = "!" in text
            multiple_marks = text.count("?") > 1 or text.count("!") > 1
            # happy_emoji = []

            # Pronoun extraction
            tokens = nltk.word_tokenize(text)
            pos_tags = nltk.pos_tag(tokens)
            has_personal_pronoun = False
            for tag in pos_tags:
                has_personal_pronoun = tag[1] in ['PRP', 'PRP$']  # POS tag is the second element
                if has_personal_pronoun:
                    break

            # Extracting user mentions
            user_mentions = re.findall("(^|[^@\w])@(\w{1,15})", text)
            user_mentions = [mention[1] for mention in user_mentions]
            # Extracting stock symbols
            stock_result = re.findall(r"\$([a-zA-Z0-9]{1,15})", text)  # escape "$" to match stock symbols

            day_of_week = datetime.fromtimestamp(timestamp).strftime("%A")

            # Extracting emoticons
            happy_emoticons = """
            :‑) :) :-] :] :-3 :3 :-> :> 8-) 8) :-} :} :o) :c) :^) =] =) :‑D :D 8‑D 8D x‑D xD X‑D XD =D =3 B^D :-)) :'‑) 
            :') :‑P :P :‑p :p =p >:P
            """.split()

            sad_emoticons = """
            :‑( :( :‑c :c :‑< :< :‑[ :[ :-|| >:[ :{	:@ >:( :'‑( :'( D‑': D:< D: D8 D; D= DX 
            :‑/ :/ :‑. >:\ >:/ :\ =/ =\	:L =L :S
            """.split()

            happy_emoticon_pattern = "|".join(map(re.escape, happy_emoticons))
            sad_emoticon_pattern = "|".join(map(re.escape, sad_emoticons))

            happy_emoticon_count = re.findall(happy_emoticon_pattern, text)
            sad_emoticon_count = re.findall(sad_emoticon_pattern, text)

            # Extracting emojis
            happy_emoji_count = len(
                [c for c in text.split() if c in EMOJI_HAPPY])
            sad_emoji_count = len(
                [c for c in text.split() if c in EMOJI_UNHAPPY])

            # Extracting sentiment score and its components

            sentiment_score = 0
            pos_word_count = 0
            neg_word_count = 0

            # Load the sentiment word lists once rather than re-reading the
            # files for every word in the tweet.
            with open('positive_words.txt') as positive_file:
                positive_words = set(positive_file.read().split())

            with open('negative_words.txt') as negative_file:
                negative_words = set(negative_file.read().split())

            for word in text.split():
                if word in positive_words:
                    pos_word_count += 1

                elif word in negative_words:
                    neg_word_count += 1

            # Domain extraction
            top10 = False
            top30 = False
            top50 = False
            if TWEET.LINKS in tweet:
                for url in tweet[TWEET.LINKS]:
                    try:
                        url = requests.head(url, allow_redirects=True).url
                        url = url.split("://")[1]
                        if re.match(r'^www.', url):
                            try:
                                url = url.split("www.")[1]
                            except IndexError:
                                url = url.split("www3.")[1]

                        if "/" in url:
                            url = url.split("/")[0]

                        if len(url.split('.')[0]) > 1:
                            # regexp = re.compile("/.*%s.*/" % url, re.IGNORECASE)
                            regexp = "/.*%s.*/" % url
                            match = self.db_connection.find_document(
                                collection=DB.TOP_NEWS_DOMAINS,
                                filter={"url": {
                                    "$regex": url
                                }})

                            for domain in match:
                                rank = domain["rank"]
                                top10 = top10 or rank <= 10
                                top30 = top30 or 11 <= rank <= 30
                                top50 = top50 or 31 <= rank <= 50
                    except ConnectionError as e:
                        logger.warn(e)

            # Certainty extraction
            entity_certainty = 0
            keyword_certainty = 0
            for entity in tweet[TWEET.ENTITIES]:
                entity_certainty += entity['certainty']

            for keyword in tweet[TWEET.KEYWORDS]:
                keyword_certainty += keyword['certainty']

            # Sentiment extraction

            try:
                sentiment_response = self.nlu.analyze(
                    text=text, features=Features(sentiment=SentimentOptions()))
                sentiment_score += sentiment_response['sentiment']['document'][
                    'score']
            except WatsonApiException as e:
                logger.warn(e.message)
                sentiment_score = 0

            doc = {
                TWEET.CHARACTER_COUNT: no_chars,
                TWEET.WORD_COUNT: no_words,
                TWEET.CONTAINS_QM: contains_qm,
                TWEET.CONTAINS_EM: contains_em,
                TWEET.CONTAINS_MULTIPLE_MARKS: multiple_marks,
                TWEET.FRACTION_CAPITALISED: capitalised,
                TWEET.CONTAINS_HAPPY_EMOJI: happy_emoji_count > 0,
                TWEET.CONTAINS_SAD_EMOJI: sad_emoji_count > 0,
                TWEET.CONTAINS_HAPPY_EMOTICON: len(happy_emoticon_count) > 0,
                TWEET.CONTAINS_SAD_EMOTICON: len(sad_emoticon_count) > 0,
                TWEET.CONTAINS_PRONOUNS: has_personal_pronoun,
                TWEET.MENTIONED_USERS: user_mentions,
                TWEET.MENTIONS_USER: len(user_mentions) > 0,
                TWEET.CONTAINS_STOCK_SYMBOL: len(stock_result) > 0,
                TWEET.PUBLISH_WEEKDAY: self.convert_weekday(day_of_week),
                TWEET.POSITIVE_WORD_COUNT: pos_word_count,
                TWEET.NEGATIVE_WORD_COUNT: neg_word_count,
                TWEET.SENTIMENT_SCORE: sentiment_score,
                TWEET.ENTITIES_COUNT: len(tweet[TWEET.ENTITIES]),
                TWEET.KEYWORDS_COUNT: len(tweet[TWEET.KEYWORDS]),
                TWEET.CONTAINS_DOMAIN_TOP10: top10,
                TWEET.CONTAINS_DOMAIN_TOP30: top30,
                TWEET.CONTAINS_DOMAIN_TOP50: top50
            }

            if len(tweet[TWEET.ENTITIES]) == 0:
                doc[TWEET.AVERAGE_ENTITY_CERTAINTY] = 0

            else:
                doc[TWEET.AVERAGE_ENTITY_CERTAINTY] = entity_certainty / len(
                    tweet[TWEET.ENTITIES])

            if len(tweet[TWEET.KEYWORDS]) == 0:
                doc[TWEET.AVERAGE_KEYWORD_CERTAINTY] = 0
            else:
                doc[TWEET.AVERAGE_KEYWORD_CERTAINTY] = keyword_certainty / len(
                    tweet[TWEET.KEYWORDS])
            # TWEET.AVERAGE_ENTITY_CERTAINTY: entity_certainty / len(tweet[TWEET.ENTITIES]),
            # TWEET.AVERAGE_KEYWORD_CERTAINTY: keyword_certainty / len(tweet[TWEET.KEYWORDS]),

            self.db_connection.add_to_bulk_upsert(query={"_id": tweet["_id"]},
                                                  data=doc,
                                                  bulk_op=bulk_op)

            bulk_count += 1

            if bulk_count % 100 == 0:
                self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
                bulk_op = self.db_connection.start_bulk_upsert(
                    collection=DB.RELEVANT_TWEET_COLLECTION)

        if bulk_count % 100 != 0:
            self.db_connection.end_bulk_upsert(bulk_op=bulk_op)

    def get_user_features(self, users):
        '''
        Given a list of users, extracts the necessary features for this user for the classifier
        The feature list includes:
        - Amount of days until now since user created account
        - Number of tweets
        - Number of followers
        - Number of followees
        - Is verified (1 if verified)
        - Has non empty description
        - Average number of retweets
        - Average number of favourites

        :param users:
        :return:
        '''

        for user in users:
            tweet_info = self.db_connection.find_document(
                collection=DB.RELEVANT_TWEET_COLLECTION,
                filter={"author_handle": user["twitter_handle"]},
                projection={
                    "retweet_count": 1,
                    "favourites_count": 1
                })
            cursor_count = tweet_info.count()
            total_retweets = 0
            total_favourites = 0
            if cursor_count > 0:
                for tweet in tweet_info:
                    total_favourites += tweet["favourites_count"]
                    total_retweets += tweet["retweet_count"]

                total_retweets = float(total_retweets) / cursor_count
                total_favourites = float(total_favourites) / cursor_count

            user_data = self.twitter.api.GetUser(user_id=user["_id"])
            created_at = datetime.strptime(user_data.created_at,
                                           '%a %b %d %H:%M:%S +0000 %Y')
            final_date = datetime(year=2018, month=4, day=15)
            days_since = (final_date - created_at).days
            timestamp = calendar.timegm(created_at.timetuple())

            if user_data.status:
                doc = {
                    MP.IS_VERIFIED: user_data.verified,
                    MP.FRIENDS_COUNT: user_data.friends_count,
                    MP.AVERAGE_NO_FAVOURITES: total_favourites,
                    MP.AVERAGE_NO_RETWEETS: total_retweets,
                    MP.NON_EMPTY_DESCRIPTION: len(user_data.description) > 0,
                    MP.ACCOUNT_DAYS: days_since,
                    MP.CREATED_AT: created_at,
                    MP.CREATED_AT_EPOCH: timestamp
                }
                # Only update users with at least one status, since doc is only
                # defined inside the guard above.
                self.db_connection.find_and_update(
                    collection=DB.MP_COLLECTION,
                    query={"_id": user["_id"]},
                    update={"$set": doc})

    def get_topic_features(self, topics):
        '''
        Extract features for a given topic, including:
        - amount of tweets
        - Average length
        - Fraction containing questioning mark
        - Fraction containing exclamation mark
        - Fraction containing multiple question marks/multiple exclamation marks
        - Fraction containing happy emoticon, sad emoticon, happy emoji, sad emoji
        - Fraction containing pronouns
        - Fraction containing 30% of characters uppercased
        - Fraction containing a URL
        - Fraction containing a user mention
        - Fraction containing hashtags
        - Fraction containing stock symbols
        - Average sentiment score
        - Fraction containing positive sentiment score
        - Fraction containing negative sentiment score
        - Fraction containing popular domain top 10
        - Fraction containing popular domain top 30
        - Fraction containing popular domain top 50
        - Number of distinct URLs
        - Fraction containing most visited URL
        - Number of distinct short URLs
        - Number of distinct hashtags
        - Fraction containing most used hashtag
        - Number of distinct users mentioned
        - Fraction containing most mentioned user
        - Number of distinct tweet authors
        - Fraction of tweets by most frequent author
        - Author average twitter life
        - Author average amount of tweets
        - Author average amount of followers
        - Author average amount of friends
        - Fraction of tweets from verified users
        - Fraction with authors with description
        :param topics:
        :return:
        '''

        for topic in topics:
            tweet_bulk_op = self.db_connection.start_bulk_upsert(
                collection=DB.RELEVANT_TWEET_COLLECTION)
            # matching_tweets = self.db_connection.find_document(collection=DB.RELEVANT_TWEET_COLLECTION,
            #                                                    filter={"$and":[{"text": {"$regex": " %s | #%s " % topic["name"],
            #                                                                     "$options": "-i"}},
            #                                                                    {"text": {
            #                                                                        "$regex": " #%s " % topic["name"],
            #                                                                        "$options": "-i"}}]})

            matching_tweets = self.db_connection.find_document(
                collection=DB.RELEVANT_TWEET_COLLECTION,
                filter={
                    "text": {
                        "$regex":
                        " %s | #%s |%s | %s|#%s | #%s" %
                        (topic["name"], topic["name"], topic["name"],
                         topic["name"], topic["name"], topic["name"]),
                        "$options":
                        "-i"
                    }
                })

            # matching_tweets = self.db_connection.find_document(collection=DB.RELEVANT_TWEET_COLLECTION,
            #                                                    filter={"text": {"$regex": " %s | #" % topic["name"],
            #                                                                     "$options": "-i"}})

            # matching_tweets1 = list(matching_tweets1)
            #
            # matching_tweets2 = self.db_connection.find_document(collection=DB.RELEVANT_TWEET_COLLECTION,
            #                                                    filter={"text": {"$regex": " #%s " % topic["name"],
            #                                                                     "$options": "-i"}})

            total = matching_tweets.count()
            print total
            tweet_length = 0
            contains_qm = 0
            contains_em = 0
            contains_multiple_marks = 0
            contains_happy_emoticon = 0
            contains_sad_emoticon = 0
            contains_happy_emoji = 0
            contains_sad_emoji = 0
            contains_pronouns = 0
            contains_uppercase = 0
            contains_figures = 0
            contains_url = 0
            contains_user_mention = 0
            contains_hashtag = 0
            contains_stock_symbols = 0
            sentiment_score = 0
            positive_sentiment = 0
            negative_sentiment = 0
            top10 = 0
            top30 = 0
            top50 = 0

            distinct_urls_count = 0
            most_visited_url_count = 0
            distinct_hashtag_count = 0
            most_used_hashtag_count = 0
            distinct_user_mention_count = 0
            most_mentioned_user_count = 0
            distinct_tweet_author_count = 0
            top_author_tweets_count = 0
            author_twitter_life = 0
            author_follower_count = 0
            author_friend_count = 0
            author_tweet_count = 0
            verified = 0
            day_relevance = 0
            week_relevance = 0
            two_week_relevance = 0
            words_not_in_dict = 0

            # Distinctions

            distinct_urls = {}
            distinct_hashtags = {}
            distinct_user_mentions = {}
            distinct_authors = {}

            # total_tweets = list(matching_tweets1) + list(matching_tweets2)

            if total > 0:
                for tweet in matching_tweets:
                    self.db_connection.add_to_bulk_upsert_addtoset(
                        query={TWEET.ID: tweet["_id"]},
                        field=TWEET.TOPICS,
                        value={
                            "_id":
                            topic["_id"],
                            TOPIC.IDENTIFIED_AS_TOPIC:
                            topic[TOPIC.IDENTIFIED_AS_TOPIC]
                        },
                        bulk_op=tweet_bulk_op)

                    # {"_id": topic["_id"],
                    # TOPIC.IDENTIFIED_AS_TOPIC: topic[TOPIC.IDENTIFIED_AS_TOPIC]}}},
                    #   bulk_op=tweet_bulk_op)

                    tweet_length += tweet[TWEET.CHARACTER_COUNT]
                    if tweet[TWEET.CONTAINS_QM]:
                        contains_qm += 1

                    if tweet[TWEET.CONTAINS_EM]:
                        contains_em += 1

                    if tweet[TWEET.CONTAINS_MULTIPLE_MARKS]:
                        contains_multiple_marks += 1

                    if tweet[TWEET.CONTAINS_HAPPY_EMOTICON]:
                        contains_happy_emoticon += 1

                    if tweet[TWEET.CONTAINS_SAD_EMOTICON]:
                        contains_sad_emoticon += 1

                    if tweet[TWEET.CONTAINS_HAPPY_EMOJI]:
                        contains_happy_emoji += 1

                    if tweet[TWEET.CONTAINS_SAD_EMOJI]:
                        contains_sad_emoji += 1

                    if tweet[TWEET.CONTAINS_PRONOUNS]:
                        contains_pronouns += 1

                    if tweet[TWEET.CONTAINS_FIGURES]:
                        contains_figures += 1

                    if tweet[TWEET.FRACTION_CAPITALISED] >= 0.3:
                        contains_uppercase += 1

                    urls = re.findall(r'(https?://[^\s]+)', tweet[TWEET.TEXT])
                    if len(urls) > 0:
                        contains_url += 1
                        if TWEET.RESOLVED_URLS in tweet:
                            for c, url in enumerate(
                                    tweet[TWEET.RESOLVED_URLS]):
                                if url not in distinct_urls:
                                    if url.split("//")[1].split(
                                            "/"
                                    )[0] != "twitter.com":  # Ignore twitter domain URLs
                                        distinct_urls[url] = 1

                                else:
                                    distinct_urls[url] = distinct_urls[url] + 1

                    if tweet[TWEET.MENTIONS_USER]:
                        contains_user_mention += 1

                    if TWEET.MENTIONED_USERS in tweet:
                        if len(tweet[TWEET.MENTIONED_USERS]) > 0:
                            for mentioned_user in tweet[TWEET.MENTIONED_USERS]:
                                if mentioned_user not in distinct_user_mentions:
                                    distinct_user_mentions[mentioned_user] = 1

                                else:
                                    distinct_user_mentions[
                                        mentioned_user] = distinct_user_mentions[
                                            mentioned_user] + 1

                    if TWEET.HASHTAGS in tweet:
                        if len(tweet[TWEET.HASHTAGS]) > 0:
                            contains_hashtag += 1
                            for hashtag in tweet[TWEET.HASHTAGS]:
                                if hashtag not in distinct_hashtags:
                                    distinct_hashtags[hashtag] = 1

                                else:
                                    distinct_hashtags[
                                        hashtag] = distinct_hashtags[
                                            hashtag] + 1

                    if tweet[TWEET.CONTAINS_STOCK_SYMBOL]:
                        contains_stock_symbols += 1

                    sentiment_score += tweet[TWEET.SENTIMENT_SCORE]
                    if tweet[TWEET.SENTIMENT_SCORE] >= 0:
                        positive_sentiment += 1

                    if tweet[TWEET.SENTIMENT_SCORE] < 0:
                        negative_sentiment += 1

                    if tweet[TWEET.CONTAINS_DOMAIN_TOP10]:
                        top10 += 1

                    if tweet[TWEET.CONTAINS_DOMAIN_TOP30]:
                        top30 += 1

                    if tweet[TWEET.CONTAINS_DOMAIN_TOP50]:
                        top50 += 1

                    author_info = self.db_connection.find_document(
                        collection=DB.MP_COLLECTION,
                        filter={"_id": tweet[TWEET.AUTHOR_ID]})[0]

                    if author_info[MP.TWITTER_HANDLE] not in distinct_authors:
                        distinct_authors[author_info[MP.TWITTER_HANDLE]] = 1
                        if author_info[MP.IS_VERIFIED]:
                            verified += 1

                        author_twitter_life += author_info[MP.ACCOUNT_DAYS]
                        author_follower_count += author_info[
                            MP.FOLLOWERS_COUNT]
                        author_friend_count += author_info[MP.FRIENDS_COUNT]
                        author_tweet_count += author_info[MP.TWEET_COUNT]

                    else:
                        distinct_authors[author_info[
                            MP.TWITTER_HANDLE]] = distinct_authors[author_info[
                                MP.TWITTER_HANDLE]] + 1

                        # if author_info[MP.IS_VERIFIED]:
                        #     verified += 1

                    day_relevance += tweet[TWEET.RELEVANCY_DAY]
                    week_relevance += tweet[TWEET.RELEVANCY_WEEK]
                    two_week_relevance += tweet[TWEET.RELEVANCY_TWO_WEEKS]
                    words_not_in_dict += tweet[TWEET.FRAC_NOT_IN_DICT]

                distinct_urls_count += len(distinct_urls)
                if distinct_urls_count > 0:
                    top_url = max(distinct_urls.iteritems(),
                                  key=operator.itemgetter(1))[0]

                distinct_hashtag_count += len(distinct_hashtags)
                if distinct_hashtag_count > 0:
                    top_hashtag = max(distinct_hashtags.iteritems(),
                                      key=operator.itemgetter(1))[0]

                distinct_user_mention_count += len(distinct_user_mentions)
                if distinct_user_mention_count > 0:
                    top_user_mention = max(distinct_user_mentions.iteritems(),
                                           key=operator.itemgetter(1))[0]

                distinct_tweet_author_count += len(distinct_authors)
                if distinct_tweet_author_count > 0:
                    top_author = max(distinct_authors.iteritems(),
                                     key=operator.itemgetter(1))[0]

                # for tweet in matching_tweets:
                #     if top_url in tweet[TWEET.RESOLVED_URLS]:
                #         most_visited_url_count += 1
                #
                #     if top_hashtag in tweet[TWEET.HASHTAGS]:
                #         most_used_hashtag_count += 1
                #
                #     if top_user_mention in tweet[TWEET.MENTIONED_USERS]:
                #         most_mentioned_user_count += 1
                #
                #     if tweet[TWEET.AUTHOR_HANDLE] == top_author:
                #         top_author_tweets_count += 1

                # Cast to float so the per-tweet fractions and averages below are
                # not truncated by Python 2 integer division.
                total = float(total)

                doc = {
                    TOPIC.TWEET_COUNT:
                    int(total),
                    TOPIC.TWEET_AVERAGE_LENGTH:
                    tweet_length / total,
                    TOPIC.FRAC_CONTAINING_QM:
                    contains_qm / total,
                    TOPIC.FRAC_CONTAINING_EM:
                    contains_em / total,
                    TOPIC.FRAC_CONTAINING_MULTIPLE_MARKS:
                    contains_multiple_marks / total,
                    TOPIC.FRAC_CONTAINING_HAPPY_EMOTICON:
                    contains_happy_emoticon / total,
                    TOPIC.FRAC_CONTAINING_SAD_EMOTICON:
                    contains_sad_emoticon / total,
                    TOPIC.FRAC_CONTAINING_HAPPY_EMOJI:
                    contains_happy_emoji / total,
                    TOPIC.FRAC_CONTAINING_SAD_EMOJI:
                    contains_sad_emoji / total,
                    TOPIC.FRAC_CONTAINING_PRONOUNS:
                    contains_pronouns / total,
                    TOPIC.FRAC_CONTAINING_FIGURES:
                    contains_figures / total,
                    TOPIC.FRAC_CONTAINING_UPPERCASE:
                    contains_uppercase / total,
                    TOPIC.FRAC_CONTAINING_URL:
                    contains_url / total,
                    TOPIC.FRAC_CONTAINING_USER_MENTION:
                    contains_user_mention / total,
                    TOPIC.FRAC_CONTAINING_HASHTAGS:
                    contains_hashtag / total,
                    TOPIC.FRAC_CONTAINING_STOCK_SYMBOLS:
                    contains_stock_symbols / total,
                    TOPIC.AVERAGE_SENTIMENT_SCORE:
                    sentiment_score / total,
                    TOPIC.FRAC_CONTAINING_POSITIVE_SENTIMENT:
                    positive_sentiment / total,
                    TOPIC.FRAC_CONTAINING_NEGATIVE_SENTIMENT:
                    negative_sentiment / total,
                    TOPIC.FRAC_CONTAINING_DOMAIN10:
                    top10 / total,
                    TOPIC.FRAC_CONTAINING_DOMAIN30:
                    top30 / total,
                    TOPIC.FRAC_CONTAINING_DOMAIN50:
                    top50 / total,
                    TOPIC.DISTINCT_URLS_COUNT:
                    distinct_urls_count,
                    TOPIC.DISTINCT_HASHTAG_COUNT:
                    distinct_hashtag_count,
                    TOPIC.DISTINCT_USER_MENTION_COUNT:
                    distinct_user_mention_count,
                    TOPIC.DISTINCT_TWEET_AUTHOR_COUNT:
                    distinct_tweet_author_count,
                    TOPIC.AVERAGE_AUTHOR_TWITTER_LIFE:
                    author_twitter_life / float(distinct_tweet_author_count),
                    TOPIC.AVERAGE_AUTHOR_TWEET_COUNT:
                    author_tweet_count / float(distinct_tweet_author_count),
                    TOPIC.AVERAGE_AUTHOR_FOLLOWER_COUNT:
                    author_follower_count / float(distinct_tweet_author_count),
                    TOPIC.AVERAGE_AUTHOR_FRIEND_COUNT:
                    author_friend_count / float(distinct_tweet_author_count),
                    TOPIC.FRAC_FROM_VERIFIED:
                    verified / float(distinct_tweet_author_count),
                    TOPIC.AVERAGE_DAY_RELEVANCE:
                    day_relevance / total,
                    TOPIC.AVERAGE_WEEK_RELEVANCE:
                    week_relevance / total,
                    TOPIC.AVERAGE_2WEEK_RELEVANCE:
                    two_week_relevance / total,
                    TOPIC.AVERAGE_WORDS_NOT_IN_DICT:
                    words_not_in_dict / total,
                }

                if distinct_urls_count > 0:
                    doc[TOPIC.
                        FRAC_CONTAINING_MOST_VISITED_URL] = distinct_urls.get(
                            top_url) / total

                else:
                    doc[TOPIC.FRAC_CONTAINING_MOST_VISITED_URL] = 0

                if distinct_hashtag_count > 0:
                    doc[TOPIC.
                        FRAC_CONTAINING_MOST_USED_HASHTAG] = distinct_hashtags.get(
                            top_hashtag) / total

                else:
                    doc[TOPIC.FRAC_CONTAINING_MOST_USED_HASHTAG] = 0

                if distinct_user_mention_count > 0:
                    doc[TOPIC.
                        FRAC_CONTAINING_MOST_MENTIONED_USER] = distinct_user_mentions.get(
                            top_user_mention) / total

                else:
                    doc[TOPIC.FRAC_CONTAINING_MOST_MENTIONED_USER] = 0

                if distinct_tweet_author_count > 0:
                    doc[TOPIC.
                        FRAC_CONTAINING_TOP_AUTHOR] = distinct_authors.get(
                            top_author) / total

                else:
                    doc[TOPIC.FRAC_CONTAINING_TOP_AUTHOR] = 0

                # self.db_connection.update_many(collection=DB.RELEVANT_TWEET_COLLECTION,
                #                                query={"$in": tweet_id_list},
                #                                update={"$push": {TWEET.TOPICS: {"_id": topic["_id"],
                #                                         TOPIC.IDENTIFIED_AS_TOPIC: topic[TOPIC.IDENTIFIED_AS_TOPIC]}}})
                self.db_connection.find_and_update(
                    collection=DB.RELEVANT_TOPICS,
                    query={"_id": topic["_id"]},
                    update={"$set": doc})

                self.db_connection.end_bulk_upsert(bulk_op=tweet_bulk_op)

    def get_topics_for_lost_tweets(self):
        tweets = self.db_connection.find_document(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            filter={
                "$and": [{
                    "aggregate_label": {
                        "$exists": True
                    }
                }, {
                    "topics": {
                        "$exists": False
                    }
                }]
            })

        for tweet in tweets:
            print tweet['text']
            topic = raw_input("topic:\n")
            possible_topic = self.db_connection.find_document(
                collection=DB.RELEVANT_TOPICS, filter={"name": topic.lower()})

            if possible_topic.count() > 0:
                found_topic = possible_topic.next()
                self.db_connection.find_and_update(
                    collection=DB.RELEVANT_TWEET_COLLECTION,
                    query={"_id": tweet["_id"]},
                    update={
                        "$set": {
                            "topics": [{
                                "_id":
                                found_topic["_id"],
                                TOPIC.IDENTIFIED_AS_TOPIC:
                                found_topic[TOPIC.IDENTIFIED_AS_TOPIC]
                            }]
                        }
                    })
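
The start_bulk_upsert / add_to_bulk_upsert_addtoset / end_bulk_upsert calls above are this project's own DBConnection wrappers; a minimal sketch of the kind of PyMongo bulk $addToSet upsert they presumably delegate to (the client URI, database and collection names here are assumptions) might look like:

from pymongo import MongoClient, UpdateOne

client = MongoClient("mongodb://localhost:27017")           # assumed URI
tweet_collection = client["fact_check"]["relevant_tweets"]  # assumed names


def flush_topic_links(tweet_topic_pairs):
    # One UpdateOne per (tweet_id, topic sub-document); $addToSet keeps the
    # tweet's "topics" array free of duplicate topic references.
    requests = [UpdateOne({"_id": tweet_id},
                          {"$addToSet": {"topics": topic_doc}},
                          upsert=True)
                for tweet_id, topic_doc in tweet_topic_pairs]
    if requests:
        tweet_collection.bulk_write(requests, ordered=False)
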
Exemplo n.º 22
0
class Human(object):
    def __init__(self):
        self.db_connection = DBConnection()

    def label(self, label=False, fact_checking=False):
        '''
        Manually label tweets from the console.

        :param label: if True, mark each tweet as worth fact-checking or not (TWEET.SET_TO_FACTCHECK)
        :param fact_checking: if True, assign a truth label to tweets already marked for fact-checking
        :return: None
        '''
        start_epoch = 1520812800
        # tweet_test = list(self.db_connection.get_random_sample(collection=DB.RELEVANT_TWEET_COLLECTION,
        #                                                   query={"$and":[{"crowdsourced": {"$exists": False}},
        #                                                                  {"created_at_epoch": {"$gt": start_epoch}}]},
        #                                                   size=100))
        #
        # tweets = self.db_connection.find_document(collection=DB.RELEVANT_TWEET_COLLECTION,
        #                                           filter={"$and":[{"crowdsourced": {"$exists": False}},
        #                                                           {"created_at_epoch": {"$gt": start_epoch}}]},
        #                                           projection={"text": 1})

        # print tweet_test.count()
        # print tweets.count()
        # print tweet_test[0]['text']
        # # for tweet in tweet_test:
        # #     print tweet['text']
        # #     break
        # print tweets[0]['text']
        #     # print tweet['text']

        bulk_op = self.db_connection.start_bulk_upsert(
            collection=DB.RELEVANT_TWEET_COLLECTION)
        bulk_count = 0
        if label:
            # tweets = list(self.db_connection.get_random_sample(collection=DB.RELEVANT_TWEET_COLLECTION,
            #                                                    query={"$and": [{"crowdsourced": {"$exists": False}},
            #                                                                   {TWEET.SET_TO_FACTCHECK:
            #                                                                        {"$exists": False}}]},
            #                                                    size=500))

            tweets = self.db_connection.find_document(
                collection=DB.RELEVANT_TWEET_COLLECTION,
                filter={
                    "$and": [{
                        "crowdsourced": {
                            "$exists": False
                        }
                    }, {
                        TWEET.SET_TO_FACTCHECK: {
                            "$exists": False
                        }
                    }]
                },
                #                                                                   {TWEET.SET_TO_FACTCHECK,
                projection={"text": 1},
                sort=True,
                sort_field="retweet_count",
                limit=500)

            for tweet in tweets:
                print tweet['text']
                worth = raw_input()
                if worth == "y":
                    self.db_connection.add_to_bulk_upsert(
                        query={"_id": tweet["_id"]},
                        data={TWEET.SET_TO_FACTCHECK: True},
                        bulk_op=bulk_op)

                else:
                    self.db_connection.add_to_bulk_upsert(
                        query={"_id": tweet["_id"]},
                        data={TWEET.SET_TO_FACTCHECK: False},
                        bulk_op=bulk_op)

                bulk_count += 1
                print "\n"

                if bulk_count != 0 and bulk_count % 100 == 0:
                    self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
                    bulk_op = self.db_connection.start_bulk_upsert(
                        collection=DB.RELEVANT_TWEET_COLLECTION)

        if fact_checking:
            tweets = list(
                self.db_connection.get_random_sample(
                    collection=DB.RELEVANT_TWEET_COLLECTION,
                    query={
                        "$and": [{
                            "crowdsourced": {
                                "$exists": False
                            }
                        }, {
                            TWEET.SET_TO_FACTCHECK: True
                        }]
                    },
                    size=100))

            for tweet in tweets:
                print tweet['text']
                rating = raw_input()
                self.db_connection.add_to_bulk_upsert(
                    query={"_id": tweet["_id"]},
                    data={TWEET.LABEL: rating == " "},
                    bulk_op=bulk_op)
                print "---\n"
                bulk_count += 1

                if bulk_count % 100 == 0:
                    self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
                    bulk_op = self.db_connection.start_bulk_upsert(
                        collection=DB.RELEVANT_TWEET_COLLECTION)

        if bulk_count % 100 != 0:
            self.db_connection.end_bulk_upsert(bulk_op=bulk_op)

    def entity_measure(self):
        tp = 0
        tn = 0
        fp = 0
        fn = 0
        count = 0
        try:
            tweets = self.db_connection.get_random_sample(
                collection=DB.RELEVANT_TWEET_COLLECTION,
                query={
                    "$and": [{
                        TWEET.SET_TO_FACTCHECK: True
                    }, {
                        TWEET.ENTITIES_COUNT: {
                            "$eq": 1
                        }
                    }]
                },
                size=1)

            total = 5
            for tweet in tweets:

                print tweet['text']
                print '----ENTITIES----'
                entities = [x['entity'] for x in tweet['entities']]
                print entities
                print '----INPUT-------'

                tp_input = int(
                    raw_input("Is an entity and API says it's an entity\n"))
                if tp_input == 0:
                    fp += 1

                tn_input = int(
                    raw_input(
                        "Is not an entity, API says it's not an entity\n"))
                fn_input = int(raw_input("Is an entity, API says it's not \n"))
                print "\n\n\n"

                tp += tp_input
                tn += tn_input
                fn += fn_input
                count += 1
                total -= 1
                print "total: %s" % total

            print "tp: %s" % tp
            print "tn: %s" % tn
            print "fp: %s" % fp
            print "fn: %s" % fn

        except Exception as e:
            print e
            print "count: %s" % count
            print "tp: %s" % tp
            print "tn: %s" % tn
            print "fp: %s" % fp
            print "fn: %s" % fn

    def mp_evaluation(self):
        total_rank = 0
        topic_count = 0
        mp_topics = self.db_connection.find_document(
            collection=DB.MP_COLLECTION,
            filter={"topics": {
                "$exists": True
            }},
            projection={"topics": 1})
        for topics in mp_topics:
            average_rank = 0
            for topic in topics["topics"]:
                rank = self.db_connection.find_document(
                    collection=DB.RELEVANT_TOPICS,
                    filter={"name": topic},
                    projection={"identified_as_topic": 1})

                for item in rank:
                    average_rank += item["identified_as_topic"]

            total_rank += average_rank / float(len(topics['topics']))
            # print "b"

            # topic_count += len(topics["topics"])
        count = mp_topics.count()
        print total_rank / count
        print topic_count
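
The entity_measure and mp_evaluation routines above only print raw confusion counts; a small helper (not part of the original module, sketched here for convenience) that turns the tp/tn/fp/fn tallies into precision, recall, F1 and accuracy could look like:

def summarise_confusion(tp, tn, fp, fn):
    # Guard each denominator so that empty categories do not raise ZeroDivisionError.
    precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
    recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    accuracy = float(tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) else 0.0
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}
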
Exemplo n.º 23
0
 def __init__(self):
     self.db_connection = DBConnection()
Exemplo n.º 24
0
class HumanComputeGUI(tk.Frame):
    def __init__(self, parent):
        self.db_connection = DBConnection()
        self.bulk_count = 0

        tk.Frame.__init__(self, parent)
        # create a prompt, an input box, an output label,
        # and a button to do the computation
        self.prompt = tk.Label(self,
                               text="Enter a number:",
                               anchor="w",
                               wraplength=500)
        # self.entry = tk.Entry(self)
        self.relevant = tk.Button(self,
                                  text="Relevant",
                                  command=self.calculate1)
        self.not_relevant = tk.Button(self,
                                      text="Not Relevant",
                                      command=self.calculate2)
        self.output = tk.Label(self, text="")

        # lay the widgets out on the screen.
        self.prompt.pack(side="top", fill="x")
        # self.entry.pack(side="top", fill="x", padx=20)
        self.output.pack(side="top", fill="x", expand=True)
        self.not_relevant.pack(side="bottom")
        self.relevant.pack(side="bottom")

        self.tweets = self.db_connection.find_document(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            filter={
                "$and": [{
                    "crowdsourced": {
                        "$exists": False
                    }
                }, {
                    TWEET.SET_TO_FACTCHECK: {
                        "$exists": False
                    }
                }, {
                    TWEET.TOPICS: {
                        "$exists": True
                    }
                }]
            },
            #                                                                   {TWEET.SET_TO_FACTCHECK,
            projection={"text": 1},
            sort=True,
            sort_field="retweet_count",
            limit=500)
        self.current = self.tweets.next()
        self.bulk_op = self.db_connection.start_bulk_upsert(
            collection=DB.RELEVANT_TWEET_COLLECTION)
        self.bulk_count = 0
        self.prompt.configure(text=self.current["text"])

        # for tweet in tweets:

    def calculate1(self):
        try:

            self.db_connection.add_to_bulk_upsert(
                query={"_id": self.current["_id"]},
                data={TWEET.SET_TO_FACTCHECK: True},
                bulk_op=self.bulk_op)
            self.bulk_count += 1
            if self.bulk_count != 0 and self.bulk_count % 100 == 0:
                self.db_connection.end_bulk_upsert(bulk_op=self.bulk_op)
                self.bulk_op = self.db_connection.start_bulk_upsert(
                    collection=DB.RELEVANT_TWEET_COLLECTION)

            self.current = self.tweets.next()
            if self.current:
                self.prompt.configure(
                    text=self.current['text'].encode('ascii', 'ignore'))

            else:
                if self.bulk_count % 100 != 0:
                    self.db_connection.end_bulk_upsert(bulk_op=self.bulk_op)
            # result = self.not_relevant.getboolean(False)

        except Exception as e:
            print e

        # set the output widget to have our result
        # self.output.configure(text=result)

    def calculate2(self):
        try:

            result = self.relevant.getboolean(True)
            self.db_connection.add_to_bulk_upsert(
                query={"_id": self.current["_id"]},
                data={TWEET.SET_TO_FACTCHECK: False},
                bulk_op=self.bulk_op)
            self.bulk_count += 1
            if self.bulk_count != 0 and self.bulk_count % 100 == 0:
                self.db_connection.end_bulk_upsert(bulk_op=self.bulk_op)
                self.bulk_op = self.db_connection.start_bulk_upsert(
                    collection=DB.RELEVANT_TWEET_COLLECTION)

            self.current = self.tweets.next()
            if self.current:
                self.prompt.configure(
                    text=self.current['text'].encode('ascii', 'ignore'))

            else:
                if self.bulk_count % 100 != 0:
                    self.db_connection.end_bulk_upsert(bulk_op=self.bulk_op)

            # result = self.not_relevant.getboolean(False)

        except Exception as e:
            print e

        # set the output widget to have our result
        # self.output.configure(text=result)

    def label(self, label=False, fact_checking=False):
        '''

        :param label: Determine whether worth fact-checking
        :param fact_checking: Determine the truth of it
        :return:
        '''
        start_epoch = 1520812800

        bulk_op = self.db_connection.start_bulk_upsert(
            collection=DB.RELEVANT_TWEET_COLLECTION)
        bulk_count = 0
        if label:
            # tweets = list(self.db_connection.get_random_sample(collection=DB.RELEVANT_TWEET_COLLECTION,
            #                                                    query={"$and": [{"crowdsourced": {"$exists": False}},
            #                                                                   {TWEET.SET_TO_FACTCHECK:
            #                                                                        {"$exists": False}}]},
            #                                                    size=500))

            tweets = self.db_connection.find_document(
                collection=DB.RELEVANT_TWEET_COLLECTION,
                filter={
                    "$and": [{
                        "crowdsourced": {
                            "$exists": False
                        }
                    }, {
                        TWEET.SET_TO_FACTCHECK: {
                            "$exists": False
                        }
                    }]
                },
                #                                                                   {TWEET.SET_TO_FACTCHECK,
                projection={"text": 1},
                sort=True,
                sort_field="retweet_count",
                limit=500)

            for tweet in tweets:
                print tweet['text']
                worth = raw_input()
                if worth == "y":
                    self.db_connection.add_to_bulk_upsert(
                        query={"_id": tweet["_id"]},
                        data={TWEET.SET_TO_FACTCHECK: True},
                        bulk_op=bulk_op)

                else:
                    self.db_connection.add_to_bulk_upsert(
                        query={"_id": tweet["_id"]},
                        data={TWEET.SET_TO_FACTCHECK: False},
                        bulk_op=bulk_op)

                self.bulk_count += 1
                print "\n"

                if self.bulk_count != 0 and self.bulk_count % 100 == 0:
                    self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
                    bulk_op = self.db_connection.start_bulk_upsert(
                        collection=DB.RELEVANT_TWEET_COLLECTION)

        # if fact_checking:
        #     tweets = list(self.db_connection.get_random_sample(collection=DB.RELEVANT_TWEET_COLLECTION,
        #                                                        query={"$and": [{"crowdsourced": {"$exists": False}},
        #                                                                        {TWEET.SET_TO_FACTCHECK: True}]},
        #                                                        size=100))
        #
        #     for tweet in tweets:
        #         print tweet['text']
        #         rating = raw_input()
        #         self.db_connection.add_to_bulk_upsert(query={"_id": tweet["_id"]},
        #                                               data={TWEET.LABEL: rating == " "},
        #                                               bulk_op=bulk_op)
        #         print "---\n"
        #         bulk_count += 1
        #
        #         if bulk_count % 100 == 0:
        #             self.db_connection.end_bulk_upsert()
        #             bulk_op = self.db_connection.start_bulk_upsert(collection=DB.RELEVANT_TWEET_COLLECTION)
        #
        if self.bulk_count % 100 != 0:
            self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
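
HumanComputeGUI is a plain tk.Frame, so it needs a root window and an event loop to run; a minimal launch sketch, assuming Python 2's Tkinter module and that the class is importable as shown, would be:

import Tkinter as tk  # Python 2 Tkinter, matching the print/raw_input style above

if __name__ == "__main__":
    root = tk.Tk()
    root.title("Tweet labelling")  # assumed window title
    HumanComputeGUI(root).pack(fill="both", expand=True)
    root.mainloop()  # blocks until the labelling window is closed
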
Exemplo n.º 25
0
 def setUp(self):
     self.db_connection = DBConnection()
Exemplo n.º 26
0
class HumanComputeGUIEntity(tk.Frame):
    def __init__(self, parent):
        self.db_connection = DBConnection()
        self.bulk_count = 0

        tk.Frame.__init__(self, parent)
        # create a prompt, an input box, an output label,
        # and a button to do the computation
        self.prompt = tk.Label(self,
                               text="Enter a number:",
                               anchor="w",
                               wraplength=500)
        self.entities_prompt = tk.Label(self,
                                        text="entities",
                                        anchor="w",
                                        wraplength=500)
        # self.entry = tk.Entry(self)
        # Keep the button widgets under *_button names so they are not clobbered
        # by the tp/tn/fp/fn integer counters initialised further down.
        self.tp_button = tk.Button(self,
                                   text="Is an entity and API says it's an entity",
                                   command=self.calculate1)
        self.tn_button = tk.Button(self,
                                   text="Is not an entity, API does not include it",
                                   command=self.calculate2)
        self.fp_button = tk.Button(self,
                                   text='Is not an entity, API includes it',
                                   command=self.calculate3)
        self.fn_button = tk.Button(self,
                                   text='Is an entity, API does not include it',
                                   command=self.calculate4)
        self.output = tk.Label(self, text="")

        # lay the widgets out on the screen.
        self.prompt.pack(side="top", fill="x")
        self.entities_prompt.pack(side="bottom")
        # self.entry.pack(side="top", fill="x", padx=20)
        self.output.pack(side="top", fill="x", expand=True)
        self.fn_button.pack(side="bottom")
        self.fp_button.pack(side="bottom")
        self.tn_button.pack(side="bottom")
        self.tp_button.pack(side="bottom")

        self.tweets = self.db_connection.get_random_sample(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            query={
                "$and": [{
                    TWEET.SET_TO_FACTCHECK: True
                }, {
                    TWEET.ENTITIES_COUNT: {
                        "$eq": 1
                    }
                }]
            },
            size=200)

        self.current = self.tweets.next()
        self.prompt.configure(text=self.current["text"])
        self.entities_prompt.configure(
            text="Entities: %s" %
            [x['entity'] for x in self.current["entities"]])
        self.tp = 0
        self.tn = 0
        self.fp = 0
        self.fn = 0

        # for tweet in tweets:

    def calculate1(self):
        try:

            self.tp += 1
            self.current = self.tweets.next()
            if self.current:
                self.prompt.configure(
                    text=self.current['text'].encode('ascii', 'ignore'))
                self.entities_prompt.configure(
                    text="Entities: %s" %
                    [x['entity'] for x in self.current["entities"]])

            else:
                print "tp: %s" % self.tp
                print "tn: %s" % self.tn
                print "fp: %s" % self.fp
                print "fn: %s" % self.fn

        except Exception as e:
            print e
            print "tp: %s" % self.tp
            print "tn: %s" % self.tn
            print "fp: %s" % self.fp
            print "fn: %s" % self.fn

    def calculate2(self):
        try:

            self.tn += 1
            self.current = self.tweets.next()
            if self.current:
                self.prompt.configure(
                    text=self.current['text'].encode('ascii', 'ignore'))
                self.entities_prompt.configure(
                    text="Entities: %s" %
                    [x['entity'] for x in self.current["entities"]])

            else:
                print "tp: %s" % self.tp
                print "tn: %s" % self.tn
                print "fp: %s" % self.fp
                print "fn: %s" % self.fn
        except Exception as e:
            print e
            print "tp: %s" % self.tp
            print "tn: %s" % self.tn
            print "fp: %s" % self.fp
            print "fn: %s" % self.fn

    def calculate3(self):
        try:

            self.fp += 1
            self.current = self.tweets.next()
            if self.current:
                self.prompt.configure(
                    text=self.current['text'].encode('ascii', 'ignore'))
                self.entities_prompt.configure(
                    text="Entities: %s" %
                    [x['entity'] for x in self.current["entities"]])

            else:
                print "tp: %s" % self.tp
                print "tn: %s" % self.tn
                print "fp: %s" % self.fp
                print "fn: %s" % self.fn
        except Exception as e:
            print e
            print "tp: %s" % self.tp
            print "tn: %s" % self.tn
            print "fp: %s" % self.fp
            print "fn: %s" % self.fn

    def calculate4(self):
        try:

            self.fn += 1
            self.current = self.tweets.next()
            if self.current:
                self.prompt.configure(
                    text=self.current['text'].encode('ascii', 'ignore'))
                self.entities_prompt.configure(
                    text="Entities: %s" %
                    [x['entity'] for x in self.current["entities"]])

            else:
                print "tp: %s" % self.tp
                print "tn: %s" % self.tn
                print "fp: %s" % self.fp
                print "fn: %s" % self.fn
        except Exception as e:
            print e
            print "tp: %s" % self.tp
            print "tn: %s" % self.tn
            print "fp: %s" % self.fp
            print "fn: %s" % self.fn

    def label(self, label=False, fact_checking=False):
        '''

        :param label: Determine whether worth fact-checking
        :param fact_checking: Determine the truth of it
        :return:
        '''
        start_epoch = 1520812800

        bulk_op = self.db_connection.start_bulk_upsert(
            collection=DB.RELEVANT_TWEET_COLLECTION)
        bulk_count = 0
        if label:
            # tweets = list(self.db_connection.get_random_sample(collection=DB.RELEVANT_TWEET_COLLECTION,
            #                                                    query={"$and": [{"crowdsourced": {"$exists": False}},
            #                                                                   {TWEET.SET_TO_FACTCHECK:
            #                                                                        {"$exists": False}}]},
            #                                                    size=500))

            tweets = self.db_connection.find_document(
                collection=DB.RELEVANT_TWEET_COLLECTION,
                filter={
                    "$and": [{
                        "crowdsourced": {
                            "$exists": False
                        }
                    }, {
                        TWEET.SET_TO_FACTCHECK: {
                            "$exists": False
                        }
                    }]
                },
                #                                                                   {TWEET.SET_TO_FACTCHECK,
                projection={"text": 1},
                sort=True,
                sort_field="retweet_count",
                limit=500)

            for tweet in tweets:
                print tweet['text']
                worth = raw_input()
                if worth == "y":
                    self.db_connection.add_to_bulk_upsert(
                        query={"_id": tweet["_id"]},
                        data={TWEET.SET_TO_FACTCHECK: True},
                        bulk_op=bulk_op)

                else:
                    self.db_connection.add_to_bulk_upsert(
                        query={"_id": tweet["_id"]},
                        data={TWEET.SET_TO_FACTCHECK: False},
                        bulk_op=bulk_op)

                self.bulk_count += 1
                print "\n"

                if self.bulk_count != 0 and self.bulk_count % 100 == 0:
                    self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
                    bulk_op = self.db_connection.start_bulk_upsert(
                        collection=DB.RELEVANT_TWEET_COLLECTION)

        # if fact_checking:
        #     tweets = list(self.db_connection.get_random_sample(collection=DB.RELEVANT_TWEET_COLLECTION,
        #                                                        query={"$and": [{"crowdsourced": {"$exists": False}},
        #                                                                        {TWEET.SET_TO_FACTCHECK: True}]},
        #                                                        size=100))
        #
        #     for tweet in tweets:
        #         print tweet['text']
        #         rating = raw_input()
        #         self.db_connection.add_to_bulk_upsert(query={"_id": tweet["_id"]},
        #                                               data={TWEET.LABEL: rating == " "},
        #                                               bulk_op=bulk_op)
        #         print "---\n"
        #         bulk_count += 1
        #
        #         if bulk_count % 100 == 0:
        #             self.db_connection.end_bulk_upsert()
        #             bulk_op = self.db_connection.start_bulk_upsert(collection=DB.RELEVANT_TWEET_COLLECTION)
        #
        if self.bulk_count % 100 != 0:
            self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
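
As with HumanComputeGUI, the entity-evaluation frame needs a Tk root and mainloop; a sketch (again assuming Python 2's Tkinter) that reads the confusion counters back off the frame once the labelling window is closed:

import Tkinter as tk  # assumed Python 2 import, as above

if __name__ == "__main__":
    root = tk.Tk()
    gui = HumanComputeGUIEntity(root)
    gui.pack(fill="both", expand=True)
    root.mainloop()
    # After the session, the counters hold the manual entity-evaluation tallies.
    print "tp=%s tn=%s fp=%s fn=%s" % (gui.tp, gui.tn, gui.fp, gui.fn)
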
Exemplo n.º 27
0
class Relevancy(object):
    def __init__(self):
        self.db_connection = DBConnection()
        self.twitter_api = Twitter(os.environ.get(CREDS.TWITTER_KEY),
                                   os.environ.get(CREDS.TWITTER_SECRET),
                                   os.environ.get(CREDS.TWITTER_TOKEN),
                                   os.environ.get(CREDS.TWITTER_TOKEN_SECRET),
                                   self.db_connection)

    def clean_tweet(self, tweet):
        # Strip mentions, punctuation, URLs, "RT" markers and HTML ampersand entities.
        regex_remove = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|&amp;|amp|(\w+:\/\/\S+)|^RT|http.+?"
        tweet_text = re.sub(regex_remove, '', tweet["text"]).strip()

        # Drop stopwords from both the project's stopword file and NLTK's English list.
        stopword_list = []
        with open('stopwords.txt', 'r') as stopword_file:
            for line in stopword_file:
                stopword_list.append(line.strip())
        stop_words = set(stopword_list + stopwords.words('english'))
        tweet["text"] = " ".join(word for word in tweet_text.split()
                                 if word not in stop_words)
        return tweet

    def cleaner(self, tweets):
        '''
        Remove tweets that are too insignificant to score for relevance:
        replies, non-English tweets, very short tweets, and tweets that are
        mostly mentions or hashtags.
        :param tweets: list of tweets to clean
        :return: None; insignificant tweets are deleted from the collection
        '''
        for tweet in tweets:
            try:
                if tweet['text']:
                    tweet_data = self.twitter_api.get_status(
                        tweet_id=tweet["_id"])
                    lang = detect(tweet['text'])
                    if tweet_data.in_reply_to_status_id:  # It's a reply, not worth fact-checking
                        self.db_connection.delete_tweet(tweet_id=tweet["_id"])

                    elif lang != 'en':
                        self.db_connection.delete_tweet(tweet_id=tweet["_id"])

                    elif len(re.findall(r'\w+', tweet['text'])) <= 10:
                        self.db_connection.delete_tweet(tweet_id=tweet["_id"])

                    elif tweet['text'].count('@') > 4:
                        self.db_connection.delete_tweet(tweet_id=tweet["_id"])

                    elif tweet['text'].count('#') > 4:
                        self.db_connection.delete_tweet(tweet_id=tweet["_id"])

            except LangDetectException:
                # Language detection failed (e.g. empty or emoji-only text); delete by id.
                self.db_connection.delete_tweet(tweet_id=tweet["_id"])

    def get_prediction_model(self, timestamp, time_interval):
        '''
        Given a timestamp, gathers the news articles published in the preceding
        time interval and builds their contents into a TF-IDF similarity model.
        :param timestamp: timestamp to analyse up to
        :param time_interval: length of the window (ending at timestamp) to build the model from
        :return: [similarity index, dictionary, tf-idf model] to query tweets against, or None if no articles were found
        '''

        start_timestamp = timestamp - time_interval

        articles = []
        articles_ingest = self.db_connection.find_document(
            collection=DB.NEWS_ARTICLES,
            filter={
                "$and": [{
                    "timestamp": {
                        "$gt": start_timestamp
                    }
                }, {
                    "timestamp": {
                        "$lt": timestamp
                    }
                }]
            },
            projection={
                "title": 1,
                "description": 1
            })

        if articles_ingest.count() > 0:
            for article in articles_ingest:
                if 'description' in article:
                    if article['description']:
                        articles.append(article['description'])

                if 'title' in article:
                    if article['title']:
                        articles.append(article['title'])

            gen_docs = [[w.lower() for w in word_tokenize(text)]
                        for text in articles]
            dictionary = gensim.corpora.Dictionary(gen_docs)
            corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
            tf_idf = gensim.models.TfidfModel(corpus)
            # sims = gensim.similarities.Similarity('gensim', tf_idf[corpus], num_features=len(dictionary))

            index = gensim.similarities.MatrixSimilarity(
                tf_idf[corpus], num_features=len(dictionary))

            return [index, dictionary, tf_idf]

        else:
            return None

    def calculate_relevance(self, tweets, timestamp, time_interval):
        start_timestamp = timestamp - time_interval

        model = self.get_prediction_model(timestamp=timestamp,
                                          time_interval=time_interval)
        if model:
            twitter_trends_ingest = self.db_connection.find_document(
                collection=DB.TWITTER_TRENDS,
                filter={
                    "$and": [{
                        "timestamp_epoch": {
                            "$gt": start_timestamp
                        }
                    }, {
                        "timestamp_epoch": {
                            "$lt": timestamp
                        }
                    }]
                },
                projection={"name": 1})

            wiki_trends_ingest = self.db_connection.find_document(
                collection=DB.WIKI_TRENDS,
                filter={
                    "$and": [{
                        "epoch_timestamp": {
                            "$gt": start_timestamp
                        }
                    }, {
                        "epoch_timestamp": {
                            "$lt": timestamp
                        }
                    }]
                },
                projection={
                    "name": 1,
                    "rank": 1
                })

            # Materialise the trend cursors once, outside the per-tweet loop, so
            # they are not exhausted after the first tweet.
            twitter_trends = [trend['name'] for trend in twitter_trends_ingest]
            wiki_trends = list(wiki_trends_ingest)

            bulk_op = self.db_connection.start_bulk_upsert(
                collection=DB.RELEVANT_TWEET_COLLECTION)
            bulk_count = 0
            for tweet in tweets:
                tweet = self.clean_tweet(tweet)
                query_doc = [w.lower() for w in word_tokenize(tweet['text'])]
                query_doc_bow = model[1].doc2bow(query_doc)
                query_doc_tf_idf = model[2][query_doc_bow]

                sims = model[0][query_doc_tf_idf]
                relevance = sims[sims != 0].mean()

                if not math.isnan(relevance):
                    for trend in twitter_trends:
                        if trend in tweet['text']:
                            relevance += (
                                0.1 * relevance
                            )  # 10% relevance booster for each matching Twitter trend

                    for trend in wiki_trends:
                        if trend['name'] in tweet['text']:
                            relevance += (1 - ((trend["rank"] - 1) / 1000.0)) * (
                                0.1 * relevance)  # Booster scaled by Wikipedia trend rank

                    relevance = float(relevance)

                    if time_interval == TIME_INTERVAL.DAY:
                        self.db_connection.add_to_bulk_upsert(
                            query={"_id": tweet["_id"]},
                            data={RELEVANCY_INTERVAL.DAY: relevance},
                            bulk_op=bulk_op)
                        bulk_count += 1
                        # self.db_connection.update_tweet(tweet_id=tweet["_id"], update={RELEVANCY_INTERVAL.DAY: relevance})

                    elif time_interval == TIME_INTERVAL.WEEK:
                        self.db_connection.add_to_bulk_upsert(
                            query={"_id": tweet["_id"]},
                            data={RELEVANCY_INTERVAL.WEEK: relevance},
                            bulk_op=bulk_op)
                        bulk_count += 1
                        # self.db_connection.update_tweet(tweet_id=tweet["_id"], update={RELEVANCY_INTERVAL.WEEK: relevance})

                    elif time_interval == TIME_INTERVAL.WEEK * 2:
                        self.db_connection.add_to_bulk_upsert(
                            query={"_id": tweet["_id"]},
                            data={RELEVANCY_INTERVAL.TWO_WEEKS: relevance},
                            bulk_op=bulk_op)
                        bulk_count += 1
                        # self.db_connection.update_tweet(tweet_id=tweet["_id"], update={RELEVANCY_INTERVAL.TWO_WEEKS: relevance})

                    elif time_interval == TIME_INTERVAL.MONTH:
                        self.db_connection.add_to_bulk_upsert(
                            query={"_id": tweet["_id"]},
                            data={RELEVANCY_INTERVAL.MONTH: relevance},
                            bulk_op=bulk_op)
                        bulk_count += 1
                        # self.db_connection.update_tweet(tweet_id=tweet["_id"], update={RELEVANCY_INTERVAL.MONTH: relevance})

                    if bulk_count % 100 == 0:
                        logger.info("Insert bulk data for relevancy: %s" %
                                    bulk_count)
                        self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
                        bulk_op = self.db_connection.start_bulk_upsert(
                            collection=DB.RELEVANT_TWEET_COLLECTION)
                else:
                    continue

            if bulk_count % 100 != 0:
                self.db_connection.end_bulk_upsert(bulk_op=bulk_op)

            logger.info("Inserted final bulk data %s" % bulk_count)