def __init__(self):
    self.db_connection = DBConnection()
    self.twitter_api = Twitter(os.environ.get(CREDS.TWITTER_KEY),
                               os.environ.get(CREDS.TWITTER_SECRET),
                               os.environ.get(CREDS.TWITTER_TOKEN),
                               os.environ.get(CREDS.TWITTER_TOKEN_SECRET),
                               self.db_connection)
class DBFindTest(unittest.TestCase):
    def setUp(self):
        self.db_connection = DBConnection()

    def tearDown(self):
        self.db_connection.close()

    def test_find_document(self):
        result = self.db_connection.find_document(
            collection=DB.MP_COLLECTION,
            filter={"twitter_handle": "@theresa_may"},
            projection={"name": 1, "_id": 0})
        self.assertEqual(result[0]["name"], "Theresa May")

    def test_validate_twitter(self):
        twitter_api = Twitter(os.getenv(CREDS.TWITTER_KEY),
                              os.getenv(CREDS.TWITTER_SECRET),
                              os.getenv(CREDS.TWITTER_TOKEN),
                              os.getenv(CREDS.TWITTER_TOKEN_SECRET),
                              self.db_connection)
        self.assertTrue(expr=twitter_api.verify_credentials(),
                        msg="Could not validate Twitter credentials.")
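# A minimal sketch for running the suite above from the command line; it assumes the
# tests live in the current module and that the CREDS.* environment variables and a
# reachable database are available.
if __name__ == "__main__":
    unittest.main(verbosity=2)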
def __init__(self, parent):
    self.db_connection = DBConnection()
    self.bulk_count = 0
    tk.Frame.__init__(self, parent)
    # create a prompt, an input box, an output label,
    # and a button to do the computation
    self.prompt = tk.Label(self, text="Enter a number:", anchor="w", wraplength=500)
    # self.entry = tk.Entry(self)
    self.relevant = tk.Button(self, text="Relevant", command=self.calculate1)
    self.not_relevant = tk.Button(self, text="Not Relevant", command=self.calculate2)
    self.output = tk.Label(self, text="")

    # lay the widgets out on the screen.
    self.prompt.pack(side="top", fill="x")
    # self.entry.pack(side="top", fill="x", padx=20)
    self.output.pack(side="top", fill="x", expand=True)
    self.not_relevant.pack(side="bottom")
    self.relevant.pack(side="bottom")

    self.tweets = self.db_connection.find_document(
        collection=DB.RELEVANT_TWEET_COLLECTION,
        filter={"$and": [{"crowdsourced": {"$exists": False}},
                         {TWEET.SET_TO_FACTCHECK: {"$exists": False}},
                         {TWEET.TOPICS: {"$exists": True}}]},
        projection={"text": 1},
        sort=True,
        sort_field="retweet_count",
        limit=500)
    self.current = self.tweets.next()
    self.bulk_op = self.db_connection.start_bulk_upsert(
        collection=DB.RELEVANT_TWEET_COLLECTION)
    self.bulk_count = 0
    self.prompt.configure(text=self.current["text"])
class TweetHandler(object):
    def __init__(self):
        self.db_connection = DBConnection()

    def get_clean(self, filter):
        """
        Get tweets for a specific MP and clean the tweet text
        :param filter: Filter for selecting tweets to clean
        :return: Clean tweets for a given MP
        """
        tweets = self.db_connection.find_document(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            filter=filter,
            projection={"text": 1})
        stopword_list = []
        stopword_file = open('stopwords.txt', 'r')
        for line in stopword_file:
            stopword_list.append(line.strip())
        stopword_list = stopword_list + stopwords.words('english')
        stop_words = set(stopword_list)
        tweets = map(lambda x: x["text"].lower(), tweets)  # Combine list into just text content
        regex_remove = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|&|amp|(\w+:\/\/\S+)|^RT|http.+?"
        tweets = [re.sub(regex_remove, '', tweet).strip() for tweet in tweets]
        clean_tweets = []
        # Stop word removal from tweet
        for tweet in tweets:
            clean_tweets.append(" ".join(word for word in tweet.split()
                                         if word not in stop_words))
        return clean_tweets
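# Hedged usage sketch for TweetHandler.get_clean; the author_id value below is a
# hypothetical Twitter user id, and any filter accepted by find_document works here.
if __name__ == "__main__":
    handler = TweetHandler()
    for text in handler.get_clean(filter={"author_id": 747399})[:5]:
        print(text)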
def main():
    db_connection = DBConnection()
    twitter_api = Twitter(os.environ.get(CREDS.TWITTER_KEY),
                          os.environ.get(CREDS.TWITTER_SECRET),
                          os.environ.get(CREDS.TWITTER_TOKEN),
                          os.environ.get(CREDS.TWITTER_TOKEN_SECRET),
                          db_connection)
    if "trends" in sys.argv:
        if "historic" in sys.argv:
            date = datetime.today()
            day_end = date.day - 1
            month_end = date.month
            month = 1
            day = 2
            while month != month_end or day != day_end:
                twitter_api.get_historic_trends(month=month, day=day)
                time.sleep(3)
                day += 1
                if day % 30 == 0:
                    month += 1
                    day = 1
        globally = "global" in sys.argv
        is_uk = "UK" in sys.argv
        location = WOEIDS.UK
        if not is_uk and len(sys.argv) > 2:  # Check that no location has been supplied
            location = WOEIDS.USA
        while True:
            twitter_api.get_trends(location=location, globally=globally)
            time.sleep(60 * 60 * 2)  # Run every 2 hours
    elif "tweets" in sys.argv:
        historic = "historical" in sys.argv
        while True:
            twitter_api.update_all_tweets(historic=historic)
            if historic:
                break
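# Hedged invocation sketch: the entry point above is driven by sys.argv, e.g.
#   python ingest_main.py trends UK
#   python ingest_main.py tweets historical
# (the module name is an assumption; the flags come from the checks in main()).
if __name__ == "__main__":
    main()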
class TopicModel(object):
    def __init__(self):
        self.db_connection = DBConnection()

    def clean_tweet(self, tweet):
        regex_remove = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|&|amp|(\w+:\/\/\S+)|^RT|http.+?"
        tweet_text = re.sub(regex_remove, '', tweet["text"]).strip()
        tweet_id = tweet["_id"]
        stopword_list = []
        stopword_file = open('stopwords.txt', 'r')
        for line in stopword_file:
            stopword_list.append(line.strip())
        stopword_list = stopword_list + stopwords.words('english')
        stop_words = set(stopword_list)
        tweet_text = " ".join(word.lower() for word in tweet_text.split()
                              if word.lower() not in stop_words)
        tweet["text"] = tweet_text
        return tweet

    def get_final_topics(self, topics):
        kw_list = []
        intact_kw_list = []
        for topic_kws in topics:
            topic_kws = re.findall('"([^"]*)"', topic_kws[1])
            kw_list = kw_list + topic_kws
            intact_kw_list.append(topic_kws)
        top_kws = [kw for kw, kw_count in Counter(kw_list).most_common(30)]
        return (top_kws, intact_kw_list)

    def model(self, mp_id):
        '''
        Topic model by MP
        :return:
        '''
        tweet_docs = []
        tweets = self.db_connection.find_document(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            filter={"author_id": mp_id},
            projection={"text": 1})
        if tweets.count() > 0:
            for tweet in tweets:
                tweet_docs.append(self.clean_tweet(tweet))
            gen_docs = [[w.lower() for w in word_tokenize(tweet['text'].lower())]
                        for tweet in tweet_docs]
            dictionary = corpora.Dictionary(gen_docs)
            corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
            tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
            corpus_tfidf = tfidf[corpus]
            total_topics = 5
            total_topic_aggregation = 2
            i = 0
            possible_topics = []
            while i < total_topic_aggregation:
                possible_topics = possible_topics + models.LdaModel(
                    corpus, id2word=dictionary,
                    num_topics=total_topics).show_topics(total_topics, 5)
                i += 1
            topic_data = self.get_final_topics(topics=possible_topics)
            final_topics = []
            print "Top keywords: %s " % topic_data[0]
            for batch in topic_data[1]:
                print batch
                print "----"
            decision = None
            while decision != "":
                decision = raw_input()
                if decision:
                    if decision.lower() not in final_topics:
                        final_topics.append(decision.lower())
            if final_topics:
                self.db_connection.update_mp(user_id=mp_id,
                                             update={MP.TOPICS: final_topics})
                for final_topic in final_topics:
                    self.db_connection.increment_field(
                        collection=DB.RELEVANT_TOPICS,
                        query={"name": final_topic},
                        field=TOPIC.IDENTIFIED_AS_TOPIC)

    def evaluate(self, mp_id):
        '''
        Evaluate LDA perplexity (model bound) over a range of topic counts for one MP
        :return:
        '''
        tweet_docs = []
        tweets = self.db_connection.find_document(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            filter={"author_id": mp_id["_id"]},
            projection={"text": 1})
        tweet_count = tweets.count()
        if tweets.count() > 0:
            for tweet in tweets:
                tweet_docs.append(self.clean_tweet(tweet))
            gen_docs = [[w.lower() for w in word_tokenize(tweet['text'].lower())]
                        for tweet in tweet_docs]
            dictionary = corpora.Dictionary(gen_docs)
            corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
            tfidf = models.TfidfModel(corpus)
            corpus_tfidf = tfidf[corpus]
            # One LDA model per candidate topic count, scored with the variational bound
            topic_counts = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
            perplexities = []
            for num_topics in topic_counts:
                lda = models.LdaModel(corpus, id2word=dictionary,
                                      num_topics=num_topics)
                perplexities.append(lda.bound(corpus=corpus,
                                              subsample_ratio=tweet_count / 61152))
            return [perplexities, topic_counts, mp_id["name"]]
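# Hedged sketch of how the output of TopicModel.evaluate could be inspected: plot the
# model bound against the number of topics for one MP. `mp_doc` is assumed to be a
# document from DB.MP_COLLECTION carrying "_id" and "name"; the plot itself is not
# part of the original module.
import matplotlib.pyplot as plt

def plot_perplexity(mp_doc):
    perplexities, topic_counts, mp_name = TopicModel().evaluate(mp_id=mp_doc)
    plt.plot(topic_counts, perplexities, marker="o")
    plt.xlabel("Number of topics")
    plt.ylabel("LDA bound")
    plt.title(mp_name)
    plt.show()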
class NewsClient(object):
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.api = NewsApiClient(api_key="0d0fe7063a414d63ad34d037d87ca92f")
        self.db_connection = DBConnection()

    def get_sources(self):
        '''
        Get all the sources used by NewsAPI and insert them into the database
        :return:
        '''
        sources = self.api.get_sources()
        sources = sources[NEWS_API_PARAMS.SOURCE]
        sources_to_insert = []
        for source in sources:
            if source[NEWS_SOURCE.COUNTRY] in [NEWS_COUNTRIES.UK, NEWS_COUNTRIES.USA]:
                sources_to_insert.append({
                    NEWS_SOURCE.DESCRIPTION: source[NEWS_SOURCE.DESCRIPTION],
                    NEWS_SOURCE.CATEGORY: source[NEWS_SOURCE.CATEGORY],
                    NEWS_SOURCE.COUNTRY: source[NEWS_SOURCE.COUNTRY],
                    NEWS_SOURCE.LANGUAGE: source[NEWS_SOURCE.LANGUAGE],
                    NEWS_SOURCE.NAME: source[NEWS_SOURCE.NAME],
                    NEWS_SOURCE.URL: source[NEWS_SOURCE.URL],
                    NEWS_SOURCE.NEWS_API_ID: source["id"],
                    NEWS_SOURCE.NEWS_API_FRIENDLY: True
                })
        self.db_connection.bulk_insert(data=sources_to_insert,
                                       collection=DB.SOURCES_COLLECTION)

    def get_timestamps(self):
        news = self.db_connection.find_document(
            collection=DB.NEWS_ARTICLES,
            filter={},
            projection={NEWS_ARTICLE.PUBLISH_DATE})
        for piece in news:
            timestamp = calendar.timegm(piece['published_at'].timetuple())
            result_piece = self.db_connection.find_and_update(
                collection=DB.NEWS_ARTICLES,
                query={"_id": piece["_id"]},
                update={"$set": {"timestamp": timestamp}})

    def get_articles(self, query=None, since=None):
        """
        :param query: Query for specific articles
        :param since: Datetime of the earliest date the articles can be published
        :return:
        """
        articles_to_insert = []
        batch_size = 300
        article_count = 0
        page_no = 1
        # Words, categories etc. that are not important to collect
        stop_words = re.compile("|".join(["sport", "entertainment"]))
        sort_by = NEWS_API_PARAMS.SORT_BY_NEWEST
        sources = list(self.db_connection.find_document(
            collection=DB.SOURCES_COLLECTION,
            filter={NEWS_SOURCE.COUNTRY: NEWS_COUNTRIES.UK},
            projection={NEWS_SOURCE.NEWS_API_ID: 1, "_id": 0}))
        sources = map(lambda x: x[NEWS_SOURCE.NEWS_API_ID], sources)
        sources = ','.join(sources)
        if query:  # Sort by relevancy instead of newest if a query is given
            sort_by = NEWS_API_PARAMS.SORT_BY_RELEVANCY
        if not since:
            since = datetime.now() - timedelta(days=30)
        count = 0
        while True:
            news_payload = self.api.get_everything(
                q=query,
                language='en',
                sources=sources,
                from_parameter=since,
                to='2018-01-15',
                sort_by=sort_by,
                page=page_no,
                page_size=NEWS_API_PARAMS.PAGE_SIZE)
            count += 1
            if 'articles' not in news_payload:
                self.logger.info("hit API limit, stopping")
                break
            total_articles = None
            if "totalResults" in news_payload:
                total_articles = news_payload["totalResults"]
            raw_articles = None
            if "articles" in news_payload:
                article_count += len(news_payload["articles"])
                raw_articles = news_payload["articles"]
            if raw_articles:
                for article in raw_articles:
                    # Avoid URLs with the given stop words in them
                    if not stop_words.search(article["url"]):
                        date = datetime.strptime(article["publishedAt"],
                                                 '%Y-%m-%dT%H:%M:%SZ')
                        doc = {
                            NEWS_ARTICLE.DESCRIPTION: article["description"],
                            NEWS_ARTICLE.TITLE: article["title"],
                            NEWS_ARTICLE.URL: article["url"],
                            NEWS_ARTICLE.SOURCE: article["source"]["name"],
                            NEWS_ARTICLE.PUBLISH_DATE: date,
                            NEWS_ARTICLE.TIMESTAMP: calendar.timegm(date.timetuple())
                        }
                        self.db_connection.insert_news_article(article=doc)
            page_no += 1
            if not raw_articles:
                break
class Classifier(object):
    def __init__(self):
        self.db_connection = DBConnection()
        self.classifier = svm.SVC(probability=True, kernel='linear', C=1, gamma=1)
        self.clean_train_data = []
        self.classifier_predictions = None
        self.gold_results = None
        Cs = [0.001, 0.01, 0.1, 1, 10]
        gammas = [0.001, 0.01, 0.1, 1]
        self.coef = None
        self.raw_tweets = []
        # The field lists below drive both the feature names and the per-document
        # lookups in train(); their order must stay in sync.
        self.tweet_feature_fields = [
            TWEET.CHARACTER_COUNT, TWEET.WORD_COUNT, TWEET.CONTAINS_QM, TWEET.CONTAINS_EM,
            TWEET.CONTAINS_MULTIPLE_MARKS, TWEET.FRACTION_CAPITALISED,
            TWEET.CONTAINS_HAPPY_EMOJI, TWEET.CONTAINS_SAD_EMOJI,
            TWEET.CONTAINS_HAPPY_EMOTICON, TWEET.CONTAINS_SAD_EMOTICON,
            TWEET.CONTAINS_PRONOUNS, TWEET.CONTAINS_DOMAIN_TOP10,
            TWEET.CONTAINS_DOMAIN_TOP30, TWEET.CONTAINS_DOMAIN_TOP50,
            TWEET.MENTIONS_USER, TWEET.CONTAINS_STOCK_SYMBOL, TWEET.PUBLISH_WEEKDAY,
            TWEET.POSITIVE_WORD_COUNT, TWEET.NEGATIVE_WORD_COUNT, TWEET.SENTIMENT_SCORE,
            TWEET.AVERAGE_ENTITY_CERTAINTY, TWEET.AVERAGE_KEYWORD_CERTAINTY,
            TWEET.ENTITIES_COUNT, TWEET.KEYWORDS_COUNT, TWEET.RELEVANCY_DAY,
            TWEET.RELEVANCY_WEEK, TWEET.RELEVANCY_TWO_WEEKS, TWEET.CONTAINS_FIGURES,
            TWEET.FRAC_NOT_IN_DICT
        ]
        self.mp_feature_fields = [
            MP.FOLLOWERS_COUNT, MP.FRIENDS_COUNT, MP.TWEET_COUNT, MP.IS_VERIFIED,
            MP.AVERAGE_NO_RETWEETS, MP.AVERAGE_NO_FAVOURITES, MP.ACCOUNT_DAYS
        ]
        self.topic_feature_fields = [
            TOPIC.TWEET_COUNT, TOPIC.TWEET_AVERAGE_LENGTH, TOPIC.FRAC_CONTAINING_QM,
            TOPIC.FRAC_CONTAINING_EM, TOPIC.FRAC_CONTAINING_MULTIPLE_MARKS,
            TOPIC.FRAC_CONTAINING_HAPPY_EMOTICON, TOPIC.FRAC_CONTAINING_SAD_EMOTICON,
            TOPIC.FRAC_CONTAINING_HAPPY_EMOJI, TOPIC.FRAC_CONTAINING_SAD_EMOJI,
            TOPIC.FRAC_CONTAINING_PRONOUNS, TOPIC.FRAC_CONTAINING_FIGURES,
            TOPIC.FRAC_CONTAINING_UPPERCASE, TOPIC.FRAC_CONTAINING_URL,
            TOPIC.FRAC_CONTAINING_USER_MENTION, TOPIC.FRAC_CONTAINING_HASHTAGS,
            TOPIC.FRAC_CONTAINING_STOCK_SYMBOLS, TOPIC.AVERAGE_SENTIMENT_SCORE,
            TOPIC.FRAC_CONTAINING_POSITIVE_SENTIMENT,
            TOPIC.FRAC_CONTAINING_NEGATIVE_SENTIMENT, TOPIC.FRAC_CONTAINING_DOMAIN10,
            TOPIC.FRAC_CONTAINING_DOMAIN30, TOPIC.FRAC_CONTAINING_DOMAIN50,
            TOPIC.DISTINCT_URLS_COUNT, TOPIC.FRAC_CONTAINING_MOST_VISITED_URL,
            TOPIC.DISTINCT_HASHTAG_COUNT, TOPIC.FRAC_CONTAINING_MOST_USED_HASHTAG,
            TOPIC.DISTINCT_USER_MENTION_COUNT, TOPIC.FRAC_CONTAINING_MOST_MENTIONED_USER,
            TOPIC.DISTINCT_TWEET_AUTHOR_COUNT, TOPIC.FRAC_CONTAINING_TOP_AUTHOR,
            TOPIC.AVERAGE_AUTHOR_TWITTER_LIFE, TOPIC.AVERAGE_AUTHOR_TWEET_COUNT,
            TOPIC.AVERAGE_AUTHOR_FOLLOWER_COUNT, TOPIC.AVERAGE_AUTHOR_FRIEND_COUNT,
            TOPIC.FRAC_FROM_VERIFIED, TOPIC.AVERAGE_DAY_RELEVANCE,
            TOPIC.AVERAGE_WEEK_RELEVANCE, TOPIC.AVERAGE_2WEEK_RELEVANCE,
            TOPIC.AVERAGE_WORDS_NOT_IN_DICT
        ]
        self.features_names = (
            ["tweet_%s" % f for f in self.tweet_feature_fields] +
            ["mp_%s" % f for f in self.mp_feature_fields] +
            ["topic_%s" % f for f in self.topic_feature_fields])

    def train(self, train_data, train_target):
        '''
        Trains the SVM classifier on the feature set acquired in feature_extractor.
        Normalises the data for optimal results, then fits the model so that class
        decisions and probabilistic results can be produced.
        :return:
        '''
        # Cleaning of data in preparation for training
        for tweet in train_data:
            tweet_block = [
                tweet[TWEET.CHARACTER_COUNT], tweet[TWEET.WORD_COUNT],
                int(tweet[TWEET.CONTAINS_QM]), int(tweet[TWEET.CONTAINS_EM]),
                int(tweet[TWEET.CONTAINS_MULTIPLE_MARKS]),
                tweet[TWEET.FRACTION_CAPITALISED],
                int(tweet[TWEET.CONTAINS_HAPPY_EMOJI]),
                int(tweet[TWEET.CONTAINS_SAD_EMOJI]),
                int(tweet[TWEET.CONTAINS_HAPPY_EMOTICON]),
                int(tweet[TWEET.CONTAINS_SAD_EMOTICON]),
                int(tweet[TWEET.CONTAINS_PRONOUNS]),
                int(tweet[TWEET.CONTAINS_DOMAIN_TOP10]),
                int(tweet[TWEET.CONTAINS_DOMAIN_TOP30]),
                int(tweet[TWEET.CONTAINS_DOMAIN_TOP50]),
                int(tweet[TWEET.MENTIONS_USER]),
                int(tweet[TWEET.CONTAINS_STOCK_SYMBOL]),
                tweet[TWEET.PUBLISH_WEEKDAY], tweet[TWEET.POSITIVE_WORD_COUNT],
                tweet[TWEET.NEGATIVE_WORD_COUNT], tweet[TWEET.SENTIMENT_SCORE],
                tweet[TWEET.AVERAGE_ENTITY_CERTAINTY],
                tweet[TWEET.AVERAGE_KEYWORD_CERTAINTY],
                tweet[TWEET.ENTITIES_COUNT], tweet[TWEET.KEYWORDS_COUNT],
                tweet[TWEET.RELEVANCY_DAY], tweet[TWEET.RELEVANCY_WEEK],
                tweet[TWEET.RELEVANCY_TWO_WEEKS],
                int(tweet[TWEET.CONTAINS_FIGURES]), tweet[TWEET.FRAC_NOT_IN_DICT]
            ]
            mp_data = self.db_connection.find_document(
                collection=DB.MP_COLLECTION,
                filter={"_id": tweet[TWEET.AUTHOR_ID]},
                projection={MP.FOLLOWERS_COUNT: 1, MP.FRIENDS_COUNT: 1,
                            MP.TWEET_COUNT: 1, MP.IS_VERIFIED: 1,
                            MP.AVERAGE_NO_RETWEETS: 1, MP.AVERAGE_NO_FAVOURITES: 1,
                            MP.ACCOUNT_DAYS: 1})
            for mp in mp_data:
                mp_block = [
                    mp[MP.FOLLOWERS_COUNT], mp[MP.FRIENDS_COUNT], mp[MP.TWEET_COUNT],
                    int(mp[MP.IS_VERIFIED]), mp[MP.AVERAGE_NO_RETWEETS],
                    mp[MP.AVERAGE_NO_FAVOURITES], mp[MP.ACCOUNT_DAYS]
                ]
                break
            top_topic = max(tweet[TWEET.TOPICS],
                            key=lambda x: x[TOPIC.IDENTIFIED_AS_TOPIC])
            topics = self.db_connection.find_document(
                collection=DB.RELEVANT_TOPICS, filter={"_id": top_topic["_id"]})
            for topic in topics:
                # Lookups follow the order of self.topic_feature_fields / features_names
                topic_block = [topic[field] for field in self.topic_feature_fields]
                break
            data_block = tweet_block + mp_block + topic_block
            self.clean_train_data.append(data_block)
        X = np.array(self.clean_train_data)
        X = preprocessing.scale(X)
        # self.get_best_hyperparameters(X=X[:-60], y=train_target)
        self.classifier.fit(X=X[:-150], y=train_target)
        self.coef = self.classifier.coef_  # the weights of the features are stored here
        # self.get_feature_importance(self.classifier, feature_names=self.features_names, top_features=10)
        return X

    def predict(self, target_data):
        '''
        Get predictions for the target data
        :param target_data: array of feature sets for each tweet
        :return:
        '''
        predictions = self.classifier.predict(target_data)
        self.classifier_predictions = predictions.tolist()
        class_probabilities = self.classifier.predict_proba(target_data)
        print class_probabilities
        print len(class_probabilities)
        for pos, tweet in enumerate(self.raw_tweets):
            prediction = predictions[pos]
            confidence_score = class_probabilities[pos, prediction]
            verdict = bool(prediction == 1)
            self.db_connection.find_and_update(
                collection=DB.RELEVANT_TWEET_COLLECTION,
                query={"_id": tweet[TWEET.ID]},
                update={"$set": {TWEET.CONFIDENCE_SCORE: confidence_score,
                                 TWEET.PREDICTED_VERDICT: verdict}})
            print pos
            print tweet
            print "----"

    def evaluate_classifier(self):
        target_names = ["false", "true"]
        print "predictions - true: %s" % self.classifier_predictions.count(1)
        print "predictions - false: %s" % self.classifier_predictions.count(0)
        print "ground truth - true: %s" % self.gold_results.count(1)
        print "ground truth - false: %s" % self.gold_results.count(0)
        kappa_score = cohen_kappa_score(self.classifier_predictions, self.gold_results)
        print("kappa score: %s" % kappa_score)
        print(classification_report(self.gold_results, self.classifier_predictions,
                                    target_names=target_names))
        tn, fp, fn, tp = confusion_matrix(self.gold_results,
                                          self.classifier_predictions).ravel()
        print(tn, fp, fn, tp)

    def get_best_hyperparameters(self, X, y):
        Cs = [0.001, 0.01, 0.1, 1, 10]
        gammas = [0.001, 0.01, 0.1, 1]
        param_grid = {'C': Cs, 'gamma': gammas}
        grid_search = GridSearchCV(svm.SVC(kernel='linear'), param_grid, cv=25)
        grid_search.fit(X, y)
        print grid_search.best_params_

    def get_feature_importance(self, classifier, feature_names, top_features=20):
        rcParams.update({'figure.autolayout': True})
        coef = classifier.coef_.ravel()
        top_positive_coefficients = np.argsort(coef)[-top_features:]
        top_negative_coefficients = np.argsort(coef)[:top_features]
        top_coefficients = np.hstack([top_negative_coefficients,
                                      top_positive_coefficients])
        # create plot
        plt.figure(figsize=(10, 10))
        colors = ["red" if c < 0 else "green" for c in coef[top_coefficients]]
        plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
        feature_names = np.array(feature_names)
        plt.xticks(np.arange(0, 2 * top_features), feature_names[top_coefficients],
                   rotation=60, ha="right")
        plt.ylabel("Feature coefficient")
        plt.xlabel("Feature name")
        plt.show()

    def get_ground_truth_set(self):
        tweets = self.db_connection.find_document(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            filter={"$and": [{TWEET.AGGREGATE_LABEL: {"$exists": False}},
                             {TWEET.TOPICS: {"$exists": True}},
                             {TWEET.ENTITIES_COUNT: {"$gt": 0}},
                             {TWEET.SET_TO_FACTCHECK: True}]},
            projection={"text": 1})
        print tweets.count()
        total_count = 0
        for tweet in tweets:
            print tweet
            print "-------------"
            verdict = raw_input("Is the tweet true?\n")
            verdict = int("y" in verdict)
            self.db_connection.find_and_update(
                collection=DB.RELEVANT_TWEET_COLLECTION,
                query={"_id": tweet["_id"]},
                update={"$set": {TWEET.AGGREGATE_LABEL: verdict,
                                 TWEET.GOLDEN: True}})
            total_count += 1

    def populate_authors(self):
        valid_tweets = self.db_connection.find_document(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            filter={TWEET.CONFIDENCE_SCORE: {"$exists": True}},
            projection={TWEET.AUTHOR_ID: 1, TWEET.CONFIDENCE_SCORE: 1})
        for tweet in valid_tweets:
            self.db_connection.find_and_update(
                collection=DB.MP_COLLECTION,
                query={MP.ID: tweet[TWEET.AUTHOR_ID]},
                update={"$inc": {MP.FACTUAL_SCORE: tweet[TWEET.CONFIDENCE_SCORE],
                                 MP.NO_FACT_CHECKED_TWEETS: 1}})
class Reddit(object):
    def __init__(self):
        self.api = praw.Reddit(client_id='DiI57R025MBQLQ',
                               client_secret='4IaDtRqQrX4jIEDZeYqh_y4cJCA',
                               user_agent='script')
        self.db_connection = DBConnection()
        self.subreddit_list = []
        self.submission_list = []

    def get_subreddit(self, name):
        # assume you have a Reddit instance bound to variable `reddit`
        subreddit = self.api.subreddit(name)
        self.subreddit_list.append(subreddit)

    def get_top_comments(self, subreddit):
        count = 0
        for submission in subreddit.new():
            count += 1
            if "spoiler" in submission.title.lower() or submission.spoiler:
                comments = submission.comments.list()
                for comment in comments:
                    print(submission.title)
                    print(submission.selftext)
                    if submission.selftext != "":
                        doc = {
                            SPOILER.ID: submission.id,
                            SPOILER.TITLE: submission.title,
                            SPOILER.CONTENT: submission.selftext,
                            SPOILER.SHOW: 'Breaking Bad',
                        }
                        try:
                            self.db_connection.insert(doc)
                        except Exception:
                            pass
        print(count)


if __name__ == "__main__":
    r = Reddit()
    r.get_subreddit(name="breakingbad")
    r.get_top_comments(subreddit=r.subreddit_list[0])
from __future__ import unicode_literals
from __future__ import division

import json
import os
import sys

from bson.objectid import ObjectId
from django.http import HttpResponse
from django.shortcuts import render

from ingest_engine.twitter_ingest import Twitter
from cons import CREDS, DB, TWEET, MP
from db_engine import DBConnection
from .models import MemberParliament, Tweet

sys.path.append(os.path.dirname(os.path.dirname(__file__)))

db_connection = DBConnection()
twitter_api = Twitter(os.environ.get(CREDS.TWITTER_KEY),
                      os.environ.get(CREDS.TWITTER_SECRET),
                      os.environ.get(CREDS.TWITTER_TOKEN),
                      os.environ.get(CREDS.TWITTER_TOKEN_SECRET),
                      db_connection)


def index(request):
    if 'mp_search' in request.GET:
        mp_name = request.GET['mp_search']
        mp_list = db_connection.find_document(
            collection=DB.MP_COLLECTION,
            filter={"name": {"$regex": mp_name.title()}})
        # if 'mp_search' in request.GET:
        #     mp_name = request.GET['mp_search']
        #     mp_list = MemberParliament.objects.filter(name__contains=mp_name.title())
def __init__(self, parent):
    self.db_connection = DBConnection()
    self.bulk_count = 0
    tk.Frame.__init__(self, parent)
    # create a prompt, an input box, an output label,
    # and a button to do the computation
    self.prompt = tk.Label(self, text="Enter a number:", anchor="w", wraplength=500)
    self.entities_prompt = tk.Label(self, text="entities", anchor="w", wraplength=500)
    # self.entry = tk.Entry(self)
    self.tp = tk.Button(self, text="Is an entity and API says it's an entity",
                        command=self.calculate1)
    self.tn = tk.Button(self, text="Is not an entity, API does not include it",
                        command=self.calculate2)
    self.fp = tk.Button(self, text='Is not an entity, API includes it',
                        command=self.calculate3)
    self.fn = tk.Button(self, text='Is an entity, API does not include it',
                        command=self.calculate4)
    self.output = tk.Label(self, text="")

    # lay the widgets out on the screen.
    self.prompt.pack(side="top", fill="x")
    self.entities_prompt.pack(side="bottom")
    # self.entry.pack(side="top", fill="x", padx=20)
    self.output.pack(side="top", fill="x", expand=True)
    self.fn.pack(side="bottom")
    self.fp.pack(side="bottom")
    self.tn.pack(side="bottom")
    self.tp.pack(side="bottom")

    self.tweets = self.db_connection.get_random_sample(
        collection=DB.RELEVANT_TWEET_COLLECTION,
        query={"$and": [{TWEET.SET_TO_FACTCHECK: True},
                        {TWEET.ENTITIES_COUNT: {"$eq": 1}}]},
        size=200)
    self.current = self.tweets.next()
    self.prompt.configure(text=self.current["text"])
    self.entities_prompt.configure(
        text="Entities: %s" % [x['entity'] for x in self.current["entities"]])
    # Note: these counters reuse the names of the Button widgets assigned above;
    # the buttons have already been packed, so the UI still renders.
    self.tp = 0
    self.tn = 0
    self.fp = 0
    self.fn = 0
class CrowdFlower(object):
    def __init__(self):
        self.client = crowdflower.client.Client(os.getenv("CROWDFLOWER_API_KEY"))
        self.db_connection = DBConnection()
        self.api_key = os.getenv("CROWDFLOWER_API_KEY")
        self.judgements_session = requests.session()
        self.nlu = NaturalLanguageUnderstandingV1(version='2017-02-27',
                                                  username="******",
                                                  password="******")

    def chunks(self, l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def get_jobs(self):
        job = self.client.get_job(1239688)
        job.cml = """
<div class="html-element-wrapper">
  <h2>The tweet you are evaluating is:</h2>
  <p>{{tweet_content}}</p>
  <h2><strong>This tweet's entities are:</strong></h2>
  <ul>
    {% for entity in entity_list %}
    <li>{{ entity_list[forloop.index0] }}</li>
    {% endfor %}
  </ul>
</div>
<cml:radios label="Do you understand the tweet?" validates="required" gold="true">
  <cml:radio label="Yes" value="yes" />
  <cml:radio label="No" value="no" />
</cml:radios>
<cml:select label="Please indicate the first entity of your relation: NOTE: THIS MUST BE DIFFERENT FROM THE SECOND ENTITY." validates="required" gold="true">
  {% for entity in entity_list %}
  <cml:option label="{{ entity_list[forloop.index0] }}" value="{{ entity_list[forloop.index0] }}" />
  {% endfor %}
</cml:select>
<cml:select label="Please indicate the second entity of your relation: NOTE: THIS MUST BE DIFFERENT FROM THE FIRST ENTITY." validates="required" gold="true">
  {% for entity in entity_list %}
  <cml:option label="{{ entity_list[forloop.index0] }}" value="{{ entity_list[forloop.index0] }}" />
  {% endfor %}
</cml:select>
<cml:text label="What is the SIMPLE relationship between the entities you have chosen" validates="required" gold="true" />
<cml:radios label="Do you think the topic of the tweet is politically important?" validates="required" gold="true">
  <cml:radio label="Yes" value="yes" />
  <cml:radio label="No" value="no" />
</cml:radios>
<cml:text label="What is the first word in the tweet?" validates="required" gold="true" />
"""
        job.update()

    def get_judgements(self, job_id):
        page_no = 1
        index_resolver = {
            "tweet1": 0, "tweet2": 1, "tweet3": 2, "tweet4": 3, "tweet5": 4,
            "tweet6": 5, "tweet7": 6, "tweet8": 7, "tweet9": 8, "tweet10": 9
        }
        results = self.judgements_session.get(
            url="https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
            % (job_id, self.api_key, page_no))
        content = json.loads(results.content)
        no_count = 0
        yes_count = 0
        for key, result in content.iteritems():
            answers = result[CF.FACTCHECKABLE_ANSWERS]
            answers = answers['res']
            tweets_to_check = {}
            for answer in answers:
                if len(answer) != 10:
                    for tweet in answer:
                        if tweet not in tweets_to_check:
                            tweets_to_check[tweet] = 1
                        else:
                            tweets_to_check[tweet] = tweets_to_check[tweet] + 1
            tweet_list = result[CF.TWEET_LIST]
            for tweet, occurrence in tweets_to_check.iteritems():
                text = tweet_list[index_resolver.get(tweet)]
                if occurrence > 1:
                    yes_count += 1
                else:
                    self.db_connection.find_and_update(
                        collection=DB.RELEVANT_TWEET_COLLECTION,
                        query={"text": text,
                               TWEET.SET_TO_FACTCHECK: {"$exists": False}},
                        update={"$set": {TWEET.SET_TO_FACTCHECK: False}})
        print yes_count
        print no_count

    def get_fact_opinion(self, job_id):
        crowd_data = []
        tweet_list = []
        job = self.client.get_job(job_id)
        tweets = self.db_connection.find_document(
            collection=DB.TWEET_COLLECTION,
            filter={"$and": [{"created_at_epoch": {"$gt": 1520812800}},
                             {"created_at_epoch": {"$lt": 1523491200}},
                             {"entities": {"$exists": True}},
                             {"keywords": {"$exists": True}}]},
            projection={"text": 1, "entities": 1, "keywords": 1},
            limit=2000,
            sort=True,
            sort_field="retweet_count")
        for tweet in tweets:
            if len(tweet['entities']) > 2 and len(tweet['keywords']) > 2:
                tweet_list.append(tweet['text'])
        data_list = list(self.chunks(tweet_list, 10))  # Chunk data
        for data in data_list:
            if len(data) == 10:
                crowd_data.append({"tweet_list": data})
        job.upload(data=crowd_data, force=True)

    def process_job(self):
        data_list = []
        job = self.client.get_job(1256982)
        tweets = self.db_connection.find_document(
            collection=DB.TWEET_COLLECTION,
            filter={"$and": [{"created_at_epoch": {"$gt": 1520812800}},
                             {"created_at_epoch": {"$lt": 1523491200}},
                             {"entities": {"$exists": True}},
                             {"keywords": {"$exists": True}}]},
            projection={"text": 1, "entities": 1, "keywords": 1})
        for tweet in tweets:
            if len(tweet['entities']) > 2 and len(tweet['keywords']) > 2:
                entities = []
                for entity_data in tweet['entities']:
                    if entity_data['entity'] not in entities:
                        entities.append(entity_data['entity'])
                data_list.append({
                    "tweet_content": tweet["text"],
                    "entity_list": entities,
                    "keyword_list": tweet["keywords"],
                    "full_list": entities + tweet["keywords"]
                })
        job.upload(data=data_list, force=True)

    def update_data(self, tweet_content, entity_list):
        job = self.client.get_job(1239688)
        data_list = []
        entity_amount = 0
        data = {CF.TWEET_CONTENT: tweet_content, "entity_list": entity_list}
        for index, entity in enumerate(entity_list):
            entity_no = index + 1
            data['entity%s' % entity_no] = entity
            data['dropdown%s' % entity_no] = entity
            entity_amount += 1
        data[CF.ENTITY_AMOUNT] = entity_amount
        if len(entity_list) < 10:
            for i in range(len(entity_list) + 1, 11):
                data['entity%s' % i] = ""
        data_list.append(data)
        job.upload(data=data_list, force=True)

    def fact_checking_processing(self, job_id):
        data_list = []
        job = self.client.get_job(job_id)
        tweets = list(self.db_connection.get_random_sample(
            collection=DB.RELEVANT_TWEET_COLLECTION,
            query={"$and": [{"set_to_factcheck": True},
                            {"crowdsourced": {"$exists": False}}]},
            size=300))
        bulk_op = self.db_connection.start_bulk_upsert(
            collection=DB.RELEVANT_TWEET_COLLECTION)
        for tweet in tweets:
            data_list.append({"tweet": tweet["text"]})
            self.db_connection.add_to_bulk_upsert(query={"_id": tweet["_id"]},
                                                  data={"crowdsourced": True},
                                                  bulk_op=bulk_op)
        self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
        job.upload(data=data_list, force=True)

    def get_old_judgements(self, job_id):
        page_no = 1
        crowd_results = {}
        results = self.judgements_session.get(
            url="https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
            % (job_id, self.api_key, page_no))
        content = json.loads(results.content)
        for key, result in content.iteritems():
            crowd_results[result['tweet_content']] = {
                "first_entity": result[
                    "please_indicate_the_first_entity_of_your_link_note_this_must_be_different_from_the_second_entity"]["res"],
                "second_entity": result[
                    "please_indicate_the_second_entity_of_your_link_note_this_must_be_different_from_the_first_entity"]["res"],
                "simple_relation": result[
                    "what_is_the_simple_link_between_the_entities_you_have_chosen"],
                "entity_list": result["entity_list"]
            }
        total_tp = 0
        total_fn = 0
        correct_relations = 0
        incorrect_relations = 0
        wrong_instructions = 0
        for key, value in crowd_results.iteritems():
            print key
            print "---------------------------------------"
            print "entity list: %s" % value["entity_list"]
            print "first_entities: %s" % value['first_entity']
            print "Second entities: %s" % value['second_entity']
            print "Simple relation: %s" % value['simple_relation']
            print "---------------------------------------"
            tp = int(raw_input("tp?\n"))
            total_tp += tp
            fn = int(raw_input("fn?\n"))
            total_fn += fn
            corr_r = int(raw_input("correct relations (small verb)?\n"))
            correct_relations += corr_r
            incc_r = int(raw_input("incorrect relations (small verb)?\n"))
            incorrect_relations += incc_r
            wrong_ins = int(raw_input("wrong instructions (long phrase)?\n"))
            wrong_instructions += wrong_ins
            print "---------------------------------------\n\n\n\n\n\n\n\n\n\n\n"
        print "tp: %d" % total_tp
        print "fn: %d" % total_fn
        print "correct relations: %d" % correct_relations
        print "incorrect relations: %d" % incorrect_relations
        print "wrong instructions: %d" % wrong_instructions

    def check_relations(self, job_id):
        page_no = 1
        tweet_list = []
        results = self.judgements_session.get(
            url="https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
            % (job_id, self.api_key, page_no))
        content = json.loads(results.content)
        for key, result in content.iteritems():
            tweet_list.append(result['tweet_content'])
        total_relations = len(tweet_list)
        valid_relations = 0
        for tweet in tweet_list:
            relations = self.nlu.analyze(
                text=tweet,
                features=Features(semantic_roles=SemanticRolesVerb()))
            print tweet
            semantic_roles = relations["semantic_roles"]
            for entry in semantic_roles:
                print "subject: %s" % entry["subject"]["text"]
                print "verb: %s" % entry["action"]["text"]
                if "object" in entry:
                    print "object: %s" % entry["object"]["text"]
            print "------------------------------------------"
            valid = raw_input("valid?\n")
            if valid == "y":
                valid_relations += 1
        print valid_relations

    def get_factchecking_judgements(self, job_id):
        index_resolver = {
            'almost_definitely_true': 1,
            'likely_to_be_false': 0,
            'almost_definitely_false': 0,
            'very_ambiguous__i_really_cant_decide': -1
        }
        page_no = 2
        results = self.judgements_session.get(
            url="https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
            % (job_id, self.api_key, page_no))
        content = json.loads(results.content)
        for key, result in content.iteritems():
            almost_definitely_true_count = 0
            likely_to_be_false_count = 0
            almost_definitely_false_count = 0
            ambiguous_count = 0
            tweet = result['tweet']
            evidence = result['evidence']['res']
            source_list = result['source']
            author_list = result['author']
            aggregate_rating = index_resolver.get(result['rating']['agg'])
            for value in result['rating']['res']:
                if value == 'almost_definitely_true':
                    almost_definitely_true_count += 1
                elif value == 'likely_to_be_false':
                    likely_to_be_false_count += 1
                elif value == 'almost_definitely_false':
                    almost_definitely_false_count += 1
                elif value == 'very_ambiguous__i_really_cant_decide':
                    ambiguous_count += 1
            doc = {
                TWEET.ALMOST_DEFINITELY_TRUE_COUNT: almost_definitely_true_count,
                TWEET.LIKELY_TO_BE_FALSE_COUNT: likely_to_be_false_count,
                TWEET.ALMOST_DEFINITELY_FALSE_COUNT: almost_definitely_false_count,
                TWEET.AMBIGUOUS_COUNT: ambiguous_count,
                TWEET.AGGREGATE_LABEL: aggregate_rating,
                TWEET.TOTAL_CROWDSOURCING_COUNT: (almost_definitely_true_count +
                                                  likely_to_be_false_count +
                                                  almost_definitely_false_count +
                                                  ambiguous_count),
                TWEET.EVIDENCE: evidence,
                TWEET.CROWDSOURCING_SOURCE_LIST: source_list,
                TWEET.CROWDSOURCING_AUTHOR_LIST: author_list
            }
            self.db_connection.find_and_update(
                collection=DB.RELEVANT_TWEET_COLLECTION,
                query={"text": tweet},
                update={"$set": doc})

    def evaluate_interesting_statements(self, job_id):
        page_no = 2
        tp = 0
        tn = 0
        fp = 0
        fn = 0
        results = self.judgements_session.get(
            url="https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
            % (job_id, self.api_key, page_no))
        content = json.loads(results.content)
        total_judgements = 0
        for key, result in content.iteritems():
            labels = result[
                'tick_the_box_of_the_tweets_that_are_politically_important_andor_worth_factchecking']['res']
            for entry in labels:
                all_tweets = ["tweet1", "tweet2", "tweet3", "tweet4", "tweet5",
                              "tweet6", "tweet7", "tweet8", "tweet9", "tweet10"]
                for tweet_value in entry:
                    tweet = self.index_resolver(list_to_check=result['tweet_list'],
                                                value=tweet_value)
                    verdict = self.db_connection.find_document(
                        collection=DB.RELEVANT_TWEET_COLLECTION,
                        filter={"text": tweet},
                        projection={TWEET.SET_TO_FACTCHECK: 1}).next()
                    if TWEET.SET_TO_FACTCHECK not in verdict:
                        verdict = False
                    else:
                        verdict = verdict[TWEET.SET_TO_FACTCHECK]
                    if verdict:
                        tp += 1
                    else:
                        fp += 1
                    all_tweets.remove(tweet_value)
                    total_judgements += 1
                for tweet_value in all_tweets:
                    tweet = self.index_resolver(list_to_check=result['tweet_list'],
                                                value=tweet_value)
                    verdict = self.db_connection.find_document(
                        collection=DB.RELEVANT_TWEET_COLLECTION,
                        filter={"text": tweet},
                        projection={TWEET.SET_TO_FACTCHECK: 1}).next()
                    if TWEET.SET_TO_FACTCHECK not in verdict:
                        verdict = False
                    else:
                        verdict = verdict[TWEET.SET_TO_FACTCHECK]
                    if verdict:
                        fn += 1
                    else:
                        tn += 1
        print "tp: %s" % tp
        print "tn: %s" % tn
        print "fp: %s" % fp
        print "fn: %s" % fn
        print "total judgements: %s" % total_judgements

    def evaluate_factchecking(self, job_id):
        page_no = 4
        tp = 0
        tn = 0
        fp = 0
        fn = 0
        index_resolver = {
            'almost_definitely_true': 1,
            'likely_to_be_false': 0,
            'almost_definitely_false': 0,
            'very_ambiguous__i_really_cant_decide': -1
        }
        results = self.judgements_session.get(
            url="https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
            % (job_id, self.api_key, page_no))
        content = json.loads(results.content)
        total_judgements = 0
        for key, result in content.iteritems():
            tweet = result['tweet']
            print tweet
            ratings = result['rating']['res']
            print ratings
            verdict = raw_input("Is the tweet true?\n")
            verdict = verdict == "y"
            for rating in ratings:
                judgement = index_resolver.get(rating)
                if verdict:
                    if judgement == 1:
                        tp += 1
                    elif judgement == 0:
                        fn += 1
                else:
                    if judgement == 1:
                        fp += 1
                    elif judgement == 0:
                        tn += 1
                total_judgements += 1
        print "tp: %s" % tp
        print "tn: %s" % tn
        print "fp: %s" % fp
        print "fn: %s" % fn
        print "total judgements: %s" % total_judgements

    def index_resolver(self, list_to_check, value):
        resolver = {
            "tweet1": list_to_check[0], "tweet2": list_to_check[1],
            "tweet3": list_to_check[2], "tweet4": list_to_check[3],
            "tweet5": list_to_check[4], "tweet6": list_to_check[5],
            "tweet7": list_to_check[6], "tweet8": list_to_check[7],
            "tweet9": list_to_check[8], "tweet10": list_to_check[9],
        }
        return resolver.get(value)

    def evaluate_worker_background(self):
        countries = {}
        total = 0
        with open("background.txt") as f:
            for line in f:
                total += 1
                if line not in countries:
                    countries[line] = 1
                else:
                    countries[line] = countries[line] + 1
        print countries
        print "bnt"

    def evalute_factchecking_info(self, job_id1, job_id2):
        almost_definitely_true_count = 0
        likely_to_be_false_count = 0
        almost_definitely_false_count = 0
        ambiguous_count = 0
        final_source_dict = {}
        final_author_dict = {}
        # Pages 1-3 of both jobs are fetched and their rating counts aggregated
        contents = []
        for page_no in [1, 2, 3]:
            for job_id in [job_id1, job_id2]:
                results = self.judgements_session.get(
                    url="https://api.figure-eight.com/v1/jobs/%s/judgments.json?key=%s&page=%s"
                    % (job_id, self.api_key, page_no))
                contents.append(json.loads(results.content))
        for content in contents:
            for key, result in content.iteritems():
                source_list = result['source']
                author_list = result['author']
                for value in result['rating']['res']:
                    if value == 'almost_definitely_true':
                        almost_definitely_true_count += 1
                    elif value == 'likely_to_be_false':
                        likely_to_be_false_count += 1
                    elif value == 'almost_definitely_false':
                        almost_definitely_false_count += 1
                    elif value == 'very_ambiguous__i_really_cant_decide':
                        ambiguous_count += 1
        print "bant"
class WikiIngest(object):
    def __init__(self):
        self.db_connection = DBConnection()
        self.logger = logging.getLogger(__name__)
        self.api = PageviewsClient(
            "Mozilla/5.0 (X11; Linux x86_64)"
            " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36")

    def get_top_articles(self, time_collect=None, historic=False):
        if not historic:
            time_collect = datetime.now() - timedelta(days=1)
        results = self.api.top_articles(project=WIKI_SOURCES.ENGLISH_WIKIPEDIA,
                                        year=time_collect.year,
                                        month=time_collect.month,
                                        day=time_collect.day)
        timestamp = calendar.timegm(time_collect.timetuple())
        articles_to_insert = []
        bulk_op = None
        if historic:
            bulk_op = self.db_connection.start_bulk_upsert(collection=DB.WIKI_TRENDS)
        for result in results:
            name = result["article"]
            if "_" in name:
                name = name.replace("_", " ")
            doc = {
                WIKI_TREND.NAME: name,
                WIKI_TREND.RANK: int(result["rank"]),
                WIKI_TREND.VIEWS: int(result["views"]),
                WIKI_TREND.TIMESTAMP: timestamp,
                WIKI_TREND.DATE_OBJECT: time_collect,
                WIKI_TREND.DATE_STRING: time_collect.strftime("%A %B %d %Y"),
                WIKI_TREND.MONTH: time_collect.strftime("%B").lower(),
                WIKI_TREND.WEEKDAY: time_collect.strftime("%A").lower(),
                WIKI_TREND.MONTH_DAY: int(time_collect.strftime("%d")),
                WIKI_TREND.YEAR: time_collect.strftime("%Y")
            }
            if historic:
                self.db_connection.add_to_bulk_upsert(
                    query={"$and": [{WIKI_TREND.NAME: name},
                                    {WIKI_TREND.DATE_STRING:
                                         time_collect.strftime("%A %B %d %Y")}]},
                    data=doc,
                    bulk_op=bulk_op)
            else:
                articles_to_insert.append(doc)
        if historic:
            self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
        else:
            self.db_connection.bulk_insert(data=articles_to_insert,
                                           collection=DB.WIKI_TRENDS)
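# Minimal usage sketch for WikiIngest: collect yesterday's top English-Wikipedia
# articles, then backfill one historic day (the date below is illustrative; datetime
# is assumed to be imported in this module as it is used above).
if __name__ == "__main__":
    wiki = WikiIngest()
    wiki.get_top_articles()
    wiki.get_top_articles(time_collect=datetime(2018, 1, 15), historic=True)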
class FeatureExtractor(object): def __init__(self): self.db_connection = DBConnection() self.sid = SentimentIntensityAnalyzer() self.nlu = NaturalLanguageUnderstandingV1( version='2017-02-27', username="******", password="******") self.twitter = Twitter(os.environ.get(CREDS.TWITTER_KEY), os.environ.get(CREDS.TWITTER_SECRET), os.environ.get(CREDS.TWITTER_TOKEN), os.environ.get(CREDS.TWITTER_TOKEN_SECRET), self.db_connection) self.session = requests.session() self.resolved_urls = [] # self.session = requests.session() def get_extra_features(self, tweets): ''' Gets extra features such as whether tweet contains figures and percentage of words not in dictionary :param tweets: :return: ''' english_dict = enchant.Dict("en_GB") bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) bulk_count = 0 for tweet in tweets: not_english = 0 text = re.sub(r"http\S+", "", tweet['text']) figures = re.findall("-?\d+", text) no_words = len(re.findall(r'\w+', text)) has_figures = len(figures) > 0 clean_text = ''.join([i for i in text if not i.isdigit()]) clean_text = re.sub(r'[^\w]', ' ', clean_text) for word in clean_text.split(): if not english_dict.check(word): not_english += 1 doc = { TWEET.CONTAINS_FIGURES: has_figures, TWEET.FRAC_NOT_IN_DICT: not_english / no_words } self.db_connection.add_to_bulk_upsert(query={"_id": tweet["_id"]}, data=doc, bulk_op=bulk_op) bulk_count += 1 if bulk_count % 100 == 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op) bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) logger.info("Pushing 100 extra featured tweets to DB") if bulk_count > 0 and bulk_count % 100 != 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op) logger.info("Final DB push for tweets with extra features") def chunks(self, l, n): """Yield successive n-sized chunks from l.""" for i in range(0, len(l), n): yield l[i:i + n] def resolve_url(self, urls): db_connection = DBConnection() url_list = [] try: r = requests.get(urls[1]) if r.status_code != 200: longurl = None else: longurl = r.url self.resolved_urls.append((urls[0], longurl)) r.close() except requests.exceptions.RequestException: return None def fetch_url(self, url): # urlHandler = urllib2.urlopen(url[1]) # print urlHandler # session = requests.Session() # so connections are recycled resp = requests.head(url[1], allow_redirects=True, timeout=3) # if resp.status_code == 200 or resp.status_code == 302: self.resolved_urls.append((url[0], resp.url)) resp.close() # print "appended" def convert_weekday(self, weekday): week_dict = { "Monday": WEEKDAY.MONDAY, "Tuesday": WEEKDAY.TUESDAY, "Wednesday": WEEKDAY.WEDNESDAY, "Thursday": WEEKDAY.THURSDAY, "Friday": WEEKDAY.FRIDAY, "Saturday": WEEKDAY.SATURDAY, "Sunday": WEEKDAY.SUNDAY } return week_dict.get(weekday) def get_top_websites(self): domains_to_insert = [] rank = 0 with open("top_news_domains", "rb") as f: for line in f: line = line.decode("utf8").strip() if "Website" in line: rank += 1 domain_info = { DOMAIN.URL: line.split(" ")[1], DOMAIN.RANK: rank } domains_to_insert.append(domain_info) f.close() self.db_connection.bulk_insert(data=domains_to_insert, collection=DB.TOP_NEWS_DOMAINS) def aggregate_urls(self, tweets): urls_list = [] resolved_urls = [] bulk_count = 0 bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) # pool = ThreadPool(100) for tweet in tweets: # urls = re.findall(r'(https?://[^\s]+)', tweet["text"]) urls = re.findall( 
'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tweet["text"]) if len(urls) > 0: for url in urls: urls_list.append((tweet["_id"], url)) url_chunks = self.chunks(urls_list, 100) for chunk in url_chunks: pool = ThreadPool(100) pool.imap_unordered(self.fetch_url, chunk) pool.close() pool.join() pool.terminate() for tweet_id, long_url in self.resolved_urls: self.db_connection.add_to_bulk_upsert_push( query={"_id": tweet_id}, field=TWEET.RESOLVED_URLS, value=long_url, bulk_op=bulk_op) bulk_count += 1 try: if bulk_count != 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op) bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) logger.info("pushing %d updates to database" % bulk_count) bulk_count = 0 except InvalidOperation as e: logger.warn(e) urls_list = [] # resolved_urls = [] self.resolved_urls = [] if bulk_count != 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op) def get_tweet_urls(self, tweets): urls_list = [] resolved_urls = [] bulk_count = 0 bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) # pool = ThreadPool(100) for tweet in tweets: # urls = re.findall(r'(https?://[^\s]+)', tweet["text"]) urls = re.findall( 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tweet["text"]) if len(urls) > 0: for url in urls: urls_list.append((tweet["_id"], url)) url_chunks = self.chunks(urls_list, 100) for chunk in url_chunks: # if len(urls_list) != 0 and len(urls_list) % 100 == 0: # threads = [threading.Thread(target=self.fetch_url, args=(url,)) for url in urls_list] # for thread in threads: # thread.start() # for thread in threads: # thread.join() pool = ThreadPool(100) pool.imap_unordered(self.fetch_url, chunk) pool.close() pool.join() pool.terminate() # rs = (grequests.head(u[1], timeout=2) for u in urls_list) # resolved = grequests.map(rs, exception_handler=exception_handler) # for index, long_url in enumerate(self.resolved_urls): # for tweet_id, long_url in self.resolved_urls: # if long_url: # long_url = long_url.url # tweet_id = urls_list[index][0] # for tweet_id, long_url in pool.map(self.resolve_url, urls_list): # resolved_urls.append((tweet_id, long_url)) for tweet_id, long_url in self.resolved_urls: top10 = False top30 = False top50 = False doc = {TWEET.VERIFIED_URLS: True} url = long_url.split("://")[1] if re.match(r'^www.', url): try: url = url.split("www.")[1] except IndexError: continue if "/" in url: url = url.split("/")[0] if len(url.split('.')[0]) > 1: # regexp = re.compile("/.*%s.*/" % url, re.IGNORECASE) regexp = "/.*%s.*/" % url # match = self.db_connection.find_document(collection=DB.TOP_NEWS_DOMAINS, # filter={"url": {"$regex": url}}) match = self.db_connection.find_document( collection=DB.TOP_NEWS_DOMAINS, filter={"url": url}) for domain in match: rank = domain["rank"] if not top10: top10 = rank <= 10 if not top30: top30 = 11 <= rank <= 30 if not top50: top50 = 31 <= rank <= 50 if top10: doc[TWEET.CONTAINS_DOMAIN_TOP10] = top10 if top30: doc[TWEET.CONTAINS_DOMAIN_TOP30] = top30 if top50: doc[TWEET.CONTAINS_DOMAIN_TOP50] = top50 self.db_connection.add_to_bulk_upsert(query={"_id": tweet_id}, data=doc, bulk_op=bulk_op) bulk_count += 1 try: if bulk_count != 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op) bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) logger.info("pushing %d updates to database" % bulk_count) bulk_count = 0 except InvalidOperation as e: logger.warn(e) urls_list = [] # 
resolved_urls = [] self.resolved_urls = [] if bulk_count != 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op) def get_tweet_features(self, tweets): ''' Given a list of tweets, extracts the necessary features for this tweet for the classifier This includes a tweet's: - Number of characters - Number of words - Contains a question mark - Contains an exclamation mark - Fraction of capital letters - Are there multiple exclamation marks or question marks - Contains happy emoji(s) - Contains unhappy emoji(s) - Contains happy emoticon - Contains unhappy emoticon - Contains pronouns - No.of URLS - Contains popular domain top 10 - Contains popular domain top 30 - Contains popular domain top 50 - Mentions user - Contains hashtag - Contains stock symbol e.g. $GOOGL - Day of the week in which tweet was made: - Monday = 1 ...... Sunday = 7 - No.of positive words - No.of negative words - Total final sentiment score - Relevance score from news: day, week, 2weeks - No.of entities extracted - No.of keywords extracted - Average certainty of entities extracted - Average relevance of keywords extracted :param tweets: List of tweets to perform feature extraction :return: ''' bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) bulk_count = 0 for tweet in tweets: text = re.sub(r'http\S+', '', tweet['text']) # Remove links capitalised = sum(1 for c in text if c.isupper()) text = text.lower() timestamp = tweet['created_at_epoch'] no_chars = len(re.sub(r"\s+", "", text)) no_words = len(re.findall(r'\w+', text)) capitalised = capitalised / no_chars contains_qm = "?" in text contains_em = "!" in text multiple_marks = text.count("?") > 1 or text.count("!") > 1 # happy_emoji = [] # Pronoun extraction tokens = nltk.word_tokenize(text) pos_tags = nltk.pos_tag(tokens) has_personal_pronoun = False for tag in pos_tags: has_personal_pronoun = tag[0] in ['PRP', 'PRP$'] if has_personal_pronoun: break # Extracting user mentions user_mentions = re.findall("(^|[^@\w])@(\w{1,15})", text) user_mentions = [mention[1] for mention in user_mentions] # Extracting stock symbols stock_result = re.findall("$([a-zA-Z0-9]{1,15})", text) day_of_week = datetime.fromtimestamp(timestamp).strftime("%A") # Extracting emoticons happy_emoticons = """ :‑) :) :-] :] :-3 :3 :-> :> 8-) 8) :-} :} :o) :c) :^) =] =) :‑D :D 8‑D 8D x‑D xD X‑D XD =D =3 B^D :-)) :'‑) :') :‑P :P :‑p :p =p >:P """.split() sad_emoticons = """ :‑( :( :‑c :c :‑< :< :‑[ :[ :-|| >:[ :{ :@ >:( :'‑( :'( D‑': D:< D: D8 D; D= DX :‑/ :/ :‑. 
>:\ >:/ :\ =/ =\ :L =L :S """.split() happy_emoticon_pattern = "|".join(map(re.escape, happy_emoticons)) sad_emoticon_pattern = "|".join(map(re.escape, sad_emoticons)) happy_emoticon_count = re.findall(happy_emoticon_pattern, text) sad_emoticon_count = re.findall(sad_emoticon_pattern, text) # Extracting emojis happy_emoji_count = len( [c for c in text.split() if c in EMOJI_HAPPY]) sad_emoji_count = len( [c for c in text.split() if c in EMOJI_UNHAPPY]) # Extracting sentiment score and its components sentiment_score = 0 pos_word_count = 0 neg_word_count = 0 for word in text.split(): with open('positive_words.txt') as positive_file: if word in positive_file.read().split(): pos_word_count += 1 else: positive_file.close() with open('negative_words.txt') as negative_file: if word in negative_file.read().split(): neg_word_count += 1 # Domain extraction top10 = False top30 = False top50 = False if TWEET.LINKS in tweet: for url in tweet[TWEET.LINKS]: try: url = requests.head(url, allow_redirects=True).url url = url.split("://")[1] if re.match(r'^www.', url): try: url = url.split("www.")[1] except IndexError: url = url.split("www3.")[1] if "/" in url: url = url.split("/")[0] if len(url.split('.')[0]) > 1: # regexp = re.compile("/.*%s.*/" % url, re.IGNORECASE) regexp = "/.*%s.*/" % url match = self.db_connection.find_document( collection=DB.TOP_NEWS_DOMAINS, filter={"url": { "$regex": url }}) for domain in match: rank = domain["rank"] top10 = rank <= 10 top30 = 11 <= rank <= 30 top50 = 31 <= rank <= 50 except ConnectionError as e: logger.warn(e) # Certainty extraction entity_certainty = 0 keyword_certainty = 0 for entity in tweet[TWEET.ENTITIES]: entity_certainty += entity['certainty'] for keyword in tweet[TWEET.KEYWORDS]: keyword_certainty += keyword['certainty'] # Sentiment extraction try: sentiment_response = self.nlu.analyze( text=text, features=Features(sentiment=SentimentOptions())) sentiment_score += sentiment_response['sentiment']['document'][ 'score'] except WatsonApiException as e: logger.warn(e.message) sentiment_score = 0 doc = { TWEET.CHARACTER_COUNT: no_chars, TWEET.WORD_COUNT: no_words, TWEET.CONTAINS_QM: contains_qm, TWEET.CONTAINS_EM: contains_em, TWEET.CONTAINS_MULTIPLE_MARKS: multiple_marks, TWEET.FRACTION_CAPITALISED: capitalised, TWEET.CONTAINS_HAPPY_EMOJI: happy_emoji_count > 0, TWEET.CONTAINS_SAD_EMOJI: sad_emoji_count > 0, TWEET.CONTAINS_HAPPY_EMOTICON: len(happy_emoticon_count) > 0, TWEET.CONTAINS_SAD_EMOTICON: len(sad_emoticon_count) > 0, TWEET.CONTAINS_PRONOUNS: has_personal_pronoun, TWEET.MENTIONED_USERS: user_mentions, TWEET.MENTIONS_USER: len(user_mentions) > 0, TWEET.CONTAINS_STOCK_SYMBOL: len(stock_result) > 0, TWEET.PUBLISH_WEEKDAY: self.convert_weekday(day_of_week), TWEET.POSITIVE_WORD_COUNT: pos_word_count, TWEET.NEGATIVE_WORD_COUNT: neg_word_count, TWEET.SENTIMENT_SCORE: sentiment_score, TWEET.ENTITIES_COUNT: len(tweet[TWEET.ENTITIES]), TWEET.KEYWORDS_COUNT: len(tweet[TWEET.KEYWORDS]), TWEET.CONTAINS_DOMAIN_TOP10: top10, TWEET.CONTAINS_DOMAIN_TOP30: top30, TWEET.CONTAINS_DOMAIN_TOP50: top50 } if len(tweet[TWEET.ENTITIES]) == 0: doc[TWEET.AVERAGE_ENTITY_CERTAINTY] = 0 else: doc[TWEET.AVERAGE_ENTITY_CERTAINTY] = entity_certainty / len( tweet[TWEET.ENTITIES]) if len(tweet[TWEET.KEYWORDS]) == 0: doc[TWEET.AVERAGE_KEYWORD_CERTAINTY] = 0 else: doc[TWEET.AVERAGE_KEYWORD_CERTAINTY] = keyword_certainty / len( tweet[TWEET.KEYWORDS]) # TWEET.AVERAGE_ENTITY_CERTAINTY: entity_certainty / len(tweet[TWEET.ENTITIES]), # TWEET.AVERAGE_KEYWORD_CERTAINTY: keyword_certainty / 
len(tweet[TWEET.KEYWORDS]), self.db_connection.add_to_bulk_upsert(query={"_id": tweet["_id"]}, data=doc, bulk_op=bulk_op) bulk_count += 1 if bulk_count % 100 == 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op) bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) if bulk_count % 100 != 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op) def get_user_features(self, users): ''' Given a list of users, extracts the necessary features for this user for the classifier The feature list includes: - Amount of days until now since user created account - Number of tweets - Number of followers - Number of followees - Is verified (1 if verified) - Has non empty description - Average number of retweets - Average number of favourites :param users: :return: ''' for user in users: tweet_info = self.db_connection.find_document( collection=DB.RELEVANT_TWEET_COLLECTION, filter={"author_handle": user["twitter_handle"]}, projection={ "retweet_count": 1, "favourites_count": 1 }) cursor_count = tweet_info.count() total_retweets = 0 total_favourites = 0 if cursor_count > 0: for tweet in tweet_info: total_favourites += tweet["favourites_count"] total_retweets += tweet["retweet_count"] total_retweets = total_retweets / cursor_count total_favourites = total_favourites / cursor_count user_data = self.twitter.api.GetUser(user_id=user["_id"]) created_at = datetime.strptime(user_data.created_at, '%a %b %d %H:%M:%S +0000 %Y') final_date = datetime(year=2018, month=4, day=15) days_since = (final_date - created_at).days timestamp = calendar.timegm(created_at.timetuple()) if user_data.status: doc = { MP.IS_VERIFIED: user_data.verified, MP.FRIENDS_COUNT: user_data.friends_count, MP.AVERAGE_NO_FAVOURITES: total_favourites, MP.AVERAGE_NO_RETWEETS: total_retweets, MP.NON_EMPTY_DESCRIPTION: len(user_data.description) > 0, MP.ACCOUNT_DAYS: days_since, MP.CREATED_AT: created_at, MP.CREATED_AT_EPOCH: timestamp } self.db_connection.find_and_update(collection=DB.MP_COLLECTION, query={"_id": user["_id"]}, update={"$set": doc}) def get_topic_features(self, topics): ''' Extract features for a given topic, including: - amount of tweets - Average length - Fraction containing questioning mark - Fraction containing exclamation mark - Fraction containing multiple question marks/multiple exclamation marks - Fraction containing happy emoticon, sad emoticon, happy emoji, sad emoji - Fraction containing pronouns - Fraction containing 30% of characters uppercased - Fraction containing a URL - Fraction containing a user mention - Fraction containing hashtags - Fraction containing stock symbols - Average sentiment score - Fraction containing positive sentiment score - Fraction containing negative sentiment score - Fraction containing popular domain top 10 - Fraction containing popular domain top 30 - Fraction containing popular domain top 50 - Number of distinct URLs - Fraction containing most visited URL - Number of distinct short URLs - Number of distinct hashtags - Fraction containing most used hashtag - Number of distinct users mentioned - Fraction containing most mentioned user - Number of distinct tweet authors - Fraction of tweets by most frequent author - Author average twitter life - Author average amount of tweets - Author average amount of followers - Author average amount of friends - Fraction of tweets from verified users - Fraction with authors with description :param topics: :return: ''' for topic in topics: tweet_bulk_op = self.db_connection.start_bulk_upsert( 
collection=DB.RELEVANT_TWEET_COLLECTION) # matching_tweets = self.db_connection.find_document(collection=DB.RELEVANT_TWEET_COLLECTION, # filter={"$and":[{"text": {"$regex": " %s | #%s " % topic["name"], # "$options": "-i"}}, # {"text": { # "$regex": " #%s " % topic["name"], # "$options": "-i"}}]}) matching_tweets = self.db_connection.find_document( collection=DB.RELEVANT_TWEET_COLLECTION, filter={ "text": { "$regex": " %s | #%s |%s | %s|#%s | #%s" % (topic["name"], topic["name"], topic["name"], topic["name"], topic["name"], topic["name"]), "$options": "-i" } }) # matching_tweets = self.db_connection.find_document(collection=DB.RELEVANT_TWEET_COLLECTION, # filter={"text": {"$regex": " %s | #" % topic["name"], # "$options": "-i"}}) # matching_tweets1 = list(matching_tweets1) # # matching_tweets2 = self.db_connection.find_document(collection=DB.RELEVANT_TWEET_COLLECTION, # filter={"text": {"$regex": " #%s " % topic["name"], # "$options": "-i"}}) total = matching_tweets.count() print total tweet_length = 0 contains_qm = 0 contains_em = 0 contains_multiple_marks = 0 contains_happy_emoticon = 0 contains_sad_emoticon = 0 contains_happy_emoji = 0 contains_sad_emoji = 0 contains_pronouns = 0 contains_uppercase = 0 contains_figures = 0 contains_url = 0 contains_user_mention = 0 contains_hashtag = 0 contains_stock_symbols = 0 sentiment_score = 0 positive_sentiment = 0 negative_sentiment = 0 top10 = 0 top30 = 0 top50 = 0 distinct_urls_count = 0 most_visited_url_count = 0 distinct_hashtag_count = 0 most_used_hashtag_count = 0 distinct_user_mention_count = 0 most_mentioned_user_count = 0 distinct_tweet_author_count = 0 top_author_tweets_count = 0 author_twitter_life = 0 author_follower_count = 0 author_friend_count = 0 author_tweet_count = 0 verified = 0 day_relevance = 0 week_relevance = 0 two_week_relevance = 0 words_not_in_dict = 0 # Distinctions distinct_urls = {} distinct_hashtags = {} distinct_user_mentions = {} distinct_authors = {} # total_tweets = list(matching_tweets1) + list(matching_tweets2) if total > 0: for tweet in matching_tweets: self.db_connection.add_to_bulk_upsert_addtoset( query={TWEET.ID: tweet["_id"]}, field=TWEET.TOPICS, value={ "_id": topic["_id"], TOPIC.IDENTIFIED_AS_TOPIC: topic[TOPIC.IDENTIFIED_AS_TOPIC] }, bulk_op=tweet_bulk_op) # {"_id": topic["_id"], # TOPIC.IDENTIFIED_AS_TOPIC: topic[TOPIC.IDENTIFIED_AS_TOPIC]}}}, # bulk_op=tweet_bulk_op) tweet_length += tweet[TWEET.CHARACTER_COUNT] if tweet[TWEET.CONTAINS_QM]: contains_qm += 1 if tweet[TWEET.CONTAINS_EM]: contains_em += 1 if tweet[TWEET.CONTAINS_MULTIPLE_MARKS]: contains_multiple_marks += 1 if tweet[TWEET.CONTAINS_HAPPY_EMOTICON]: contains_happy_emoticon += 1 if tweet[TWEET.CONTAINS_SAD_EMOTICON]: contains_sad_emoticon += 1 if tweet[TWEET.CONTAINS_HAPPY_EMOJI]: contains_happy_emoji += 1 if tweet[TWEET.CONTAINS_SAD_EMOJI]: contains_sad_emoji += 1 if tweet[TWEET.CONTAINS_PRONOUNS]: contains_pronouns += 1 if tweet[TWEET.CONTAINS_FIGURES]: contains_figures += 1 if tweet[TWEET.FRACTION_CAPITALISED] >= 0.3: contains_uppercase += 1 urls = re.findall(r'(https?://[^\s]+)', tweet[TWEET.TEXT]) if len(urls) > 0: contains_url += 1 if TWEET.RESOLVED_URLS in tweet: for c, url in enumerate( tweet[TWEET.RESOLVED_URLS]): if url not in distinct_urls: if url.split("//")[1].split( "/" )[0] != "twitter.com": # Ignore twitter domain URLs distinct_urls[url] = 1 else: distinct_urls[url] = distinct_urls[url] + 1 if tweet[TWEET.MENTIONS_USER]: contains_user_mention += 1 if TWEET.MENTIONED_USERS in tweet: if len(tweet[TWEET.MENTIONED_USERS]) > 0: 
for mentioned_user in tweet[TWEET.MENTIONED_USERS]: if mentioned_user not in distinct_user_mentions: distinct_user_mentions[mentioned_user] = 1 else: distinct_user_mentions[ mentioned_user] = distinct_user_mentions[ mentioned_user] + 1 if TWEET.HASHTAGS in tweet: if len(tweet[TWEET.HASHTAGS]) > 0: contains_hashtag += 1 for hashtag in tweet[TWEET.HASHTAGS]: if hashtag not in distinct_hashtags: distinct_hashtags[hashtag] = 1 else: distinct_hashtags[ hashtag] = distinct_hashtags[ hashtag] + 1 if tweet[TWEET.CONTAINS_STOCK_SYMBOL]: contains_stock_symbols += 1 sentiment_score += tweet[TWEET.SENTIMENT_SCORE] if tweet[TWEET.SENTIMENT_SCORE] >= 0: positive_sentiment += 1 if tweet[TWEET.SENTIMENT_SCORE] < 0: negative_sentiment += 1 if tweet[TWEET.CONTAINS_DOMAIN_TOP10]: top10 += 1 if tweet[TWEET.CONTAINS_DOMAIN_TOP30]: top30 += 1 if tweet[TWEET.CONTAINS_DOMAIN_TOP50]: top50 += 1 author_info = self.db_connection.find_document( collection=DB.MP_COLLECTION, filter={"_id": tweet[TWEET.AUTHOR_ID]})[0] if author_info[MP.TWITTER_HANDLE] not in distinct_authors: distinct_authors[author_info[MP.TWITTER_HANDLE]] = 1 if author_info[MP.IS_VERIFIED]: verified += 1 author_twitter_life += author_info[MP.ACCOUNT_DAYS] author_follower_count += author_info[ MP.FOLLOWERS_COUNT] author_friend_count += author_info[MP.FRIENDS_COUNT] author_tweet_count += author_info[MP.TWEET_COUNT] else: distinct_authors[author_info[ MP.TWITTER_HANDLE]] = distinct_authors[author_info[ MP.TWITTER_HANDLE]] + 1 # if author_info[MP.IS_VERIFIED]: # verified += 1 day_relevance += tweet[TWEET.RELEVANCY_DAY] week_relevance += tweet[TWEET.RELEVANCY_WEEK] two_week_relevance += tweet[TWEET.RELEVANCY_TWO_WEEKS] words_not_in_dict += tweet[TWEET.FRAC_NOT_IN_DICT] distinct_urls_count += len(distinct_urls) if distinct_urls_count > 0: top_url = max(distinct_urls.iteritems(), key=operator.itemgetter(1))[0] distinct_hashtag_count += len(distinct_hashtags) if distinct_hashtag_count > 0: top_hashtag = max(distinct_hashtags.iteritems(), key=operator.itemgetter(1))[0] distinct_user_mention_count += len(distinct_user_mentions) if distinct_user_mention_count > 0: top_user_mention = max(distinct_user_mentions.iteritems(), key=operator.itemgetter(1))[0] distinct_tweet_author_count += len(distinct_authors) if distinct_tweet_author_count > 0: top_author = max(distinct_authors.iteritems(), key=operator.itemgetter(1))[0] # for tweet in matching_tweets: # if top_url in tweet[TWEET.RESOLVED_URLS]: # most_visited_url_count += 1 # # if top_hashtag in tweet[TWEET.HASHTAGS]: # most_used_hashtag_count += 1 # # if top_user_mention in tweet[TWEET.MENTIONED_USERS]: # most_mentioned_user_count += 1 # # if tweet[TWEET.AUTHOR_HANDLE] == top_author: # top_author_tweets_count += 1 doc = { TOPIC.TWEET_COUNT: total, TOPIC.TWEET_AVERAGE_LENGTH: tweet_length / total, TOPIC.FRAC_CONTAINING_QM: contains_qm / total, TOPIC.FRAC_CONTAINING_EM: contains_em / total, TOPIC.FRAC_CONTAINING_MULTIPLE_MARKS: contains_multiple_marks / total, TOPIC.FRAC_CONTAINING_HAPPY_EMOTICON: contains_happy_emoticon / total, TOPIC.FRAC_CONTAINING_SAD_EMOTICON: contains_sad_emoticon / total, TOPIC.FRAC_CONTAINING_HAPPY_EMOJI: contains_happy_emoji / total, TOPIC.FRAC_CONTAINING_SAD_EMOJI: contains_sad_emoji / total, TOPIC.FRAC_CONTAINING_PRONOUNS: contains_pronouns / total, TOPIC.FRAC_CONTAINING_FIGURES: contains_figures / total, TOPIC.FRAC_CONTAINING_UPPERCASE: contains_uppercase / total, TOPIC.FRAC_CONTAINING_URL: contains_url / total, TOPIC.FRAC_CONTAINING_USER_MENTION: contains_user_mention / total, 
TOPIC.FRAC_CONTAINING_HASHTAGS: contains_hashtag / total, TOPIC.FRAC_CONTAINING_STOCK_SYMBOLS: contains_stock_symbols / total, TOPIC.AVERAGE_SENTIMENT_SCORE: sentiment_score / total, TOPIC.FRAC_CONTAINING_POSITIVE_SENTIMENT: positive_sentiment / total, TOPIC.FRAC_CONTAINING_NEGATIVE_SENTIMENT: negative_sentiment / total, TOPIC.FRAC_CONTAINING_DOMAIN10: top10 / total, TOPIC.FRAC_CONTAINING_DOMAIN30: top30 / total, TOPIC.FRAC_CONTAINING_DOMAIN50: top50 / total, TOPIC.DISTINCT_URLS_COUNT: distinct_urls_count, TOPIC.DISTINCT_HASHTAG_COUNT: distinct_hashtag_count, TOPIC.DISTINCT_USER_MENTION_COUNT: distinct_user_mention_count, TOPIC.DISTINCT_TWEET_AUTHOR_COUNT: distinct_tweet_author_count, TOPIC.AVERAGE_AUTHOR_TWITTER_LIFE: author_twitter_life / distinct_tweet_author_count, TOPIC.AVERAGE_AUTHOR_TWEET_COUNT: author_tweet_count / distinct_tweet_author_count, TOPIC.AVERAGE_AUTHOR_FOLLOWER_COUNT: author_follower_count / distinct_tweet_author_count, TOPIC.AVERAGE_AUTHOR_FRIEND_COUNT: author_friend_count / distinct_tweet_author_count, TOPIC.FRAC_FROM_VERIFIED: verified / distinct_tweet_author_count, TOPIC.AVERAGE_DAY_RELEVANCE: day_relevance / total, TOPIC.AVERAGE_WEEK_RELEVANCE: week_relevance / total, TOPIC.AVERAGE_2WEEK_RELEVANCE: two_week_relevance / total, TOPIC.AVERAGE_WORDS_NOT_IN_DICT: words_not_in_dict / total, } if distinct_urls_count > 0: doc[TOPIC. FRAC_CONTAINING_MOST_VISITED_URL] = distinct_urls.get( top_url) / total else: doc[TOPIC.FRAC_CONTAINING_MOST_VISITED_URL] = 0 if distinct_hashtag_count > 0: doc[TOPIC. FRAC_CONTAINING_MOST_USED_HASHTAG] = distinct_hashtags.get( top_hashtag) / total else: doc[TOPIC.FRAC_CONTAINING_MOST_USED_HASHTAG] = 0 if distinct_user_mention_count > 0: doc[TOPIC. FRAC_CONTAINING_MOST_MENTIONED_USER] = distinct_user_mentions.get( top_user_mention) / total else: doc[TOPIC.FRAC_CONTAINING_MOST_MENTIONED_USER] = 0 if distinct_tweet_author_count > 0: doc[TOPIC. FRAC_CONTAINING_TOP_AUTHOR] = distinct_authors.get( top_author) / total else: doc[TOPIC.FRAC_CONTAINING_TOP_AUTHOR] = 0 # self.db_connection.update_many(collection=DB.RELEVANT_TWEET_COLLECTION, # query={"$in": tweet_id_list}, # update={"$push": {TWEET.TOPICS: {"_id": topic["_id"], # TOPIC.IDENTIFIED_AS_TOPIC: topic[TOPIC.IDENTIFIED_AS_TOPIC]}}}) self.db_connection.find_and_update( collection=DB.RELEVANT_TOPICS, query={"_id": topic["_id"]}, update={"$set": doc}) self.db_connection.end_bulk_upsert(bulk_op=tweet_bulk_op) def get_topics_for_lost_tweets(self): tweets = self.db_connection.find_document( collection=DB.RELEVANT_TWEET_COLLECTION, filter={ "$and": [{ "aggregate_label": { "$exists": True } }, { "topics": { "$exists": False } }] }) for tweet in tweets: print tweet['text'] topic = raw_input("topic:\n") possible_topic = self.db_connection.find_document( collection=DB.RELEVANT_TOPICS, filter={"name": topic.lower()}) if possible_topic.count() > 0: found_topic = possible_topic.next() self.db_connection.find_and_update( collection=DB.RELEVANT_TWEET_COLLECTION, query={"_id": tweet["_id"]}, update={ "$set": { "topics": [{ "_id": found_topic["_id"], TOPIC.IDENTIFIED_AS_TOPIC: found_topic[TOPIC.IDENTIFIED_AS_TOPIC] }] } })
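One caveat worth flagging for the fraction features above (not_english / no_words in get_extra_features, capitalised / no_chars, and the contains_qm / total, top10 / total, verified / distinct_tweet_author_count ratios in get_topic_features): under Python 2 these are integer divisions and mostly truncate to 0. A hedged sketch of the safer form, keeping the same counter names; alternatively, from __future__ import division at the top of the module has the same effect.

def frac(count, total):
    """Fraction as a float, tolerating a zero denominator (Python 2 safe)."""
    return float(count) / total if total else 0.0

# e.g. TOPIC.FRAC_CONTAINING_QM: frac(contains_qm, total),
#      TOPIC.FRAC_FROM_VERIFIED: frac(verified, distinct_tweet_author_count),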
class Human(object): def __init__(self): self.db_connection = DBConnection() def label(self, label=False, fact_checking=False): ''' :param label: Determine whether worth fact-checking :param fact_checking: Determine the truth of it :return: ''' start_epoch = 1520812800 # tweet_test = list(self.db_connection.get_random_sample(collection=DB.RELEVANT_TWEET_COLLECTION, # query={"$and":[{"crowdsourced": {"$exists": False}}, # {"created_at_epoch": {"$gt": start_epoch}}]}, # size=100)) # # tweets = self.db_connection.find_document(collection=DB.RELEVANT_TWEET_COLLECTION, # filter={"$and":[{"crowdsourced": {"$exists": False}}, # {"created_at_epoch": {"$gt": start_epoch}}]}, # projection={"text": 1}) # print tweet_test.count() # print tweets.count() # print tweet_test[0]['text'] # # for tweet in tweet_test: # # print tweet['text'] # # break # print tweets[0]['text'] # # print tweet['text'] bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) bulk_count = 0 if label: # tweets = list(self.db_connection.get_random_sample(collection=DB.RELEVANT_TWEET_COLLECTION, # query={"$and": [{"crowdsourced": {"$exists": False}}, # {TWEET.SET_TO_FACTCHECK: # {"$exists": False}}]}, # size=500)) tweets = self.db_connection.find_document( collection=DB.RELEVANT_TWEET_COLLECTION, filter={ "$and": [{ "crowdsourced": { "$exists": False } }, { TWEET.SET_TO_FACTCHECK: { "$exists": False } }] }, # {TWEET.SET_TO_FACTCHECK, projection={"text": 1}, sort=True, sort_field="retweet_count", limit=500) for tweet in tweets: print tweet['text'] worth = raw_input() if worth == "y": self.db_connection.add_to_bulk_upsert( query={"_id": tweet["_id"]}, data={TWEET.SET_TO_FACTCHECK: True}, bulk_op=bulk_op) else: self.db_connection.add_to_bulk_upsert( query={"_id": tweet["_id"]}, data={TWEET.SET_TO_FACTCHECK: False}, bulk_op=bulk_op) bulk_count += 1 print "\n" if bulk_count != 0 and bulk_count % 100 == 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op) bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) if fact_checking: tweets = list( self.db_connection.get_random_sample( collection=DB.RELEVANT_TWEET_COLLECTION, query={ "$and": [{ "crowdsourced": { "$exists": False } }, { TWEET.SET_TO_FACTCHECK: True }] }, size=100)) for tweet in tweets: print tweet['text'] rating = raw_input() self.db_connection.add_to_bulk_upsert( query={"_id": tweet["_id"]}, data={TWEET.LABEL: rating == " "}, bulk_op=bulk_op) print "---\n" bulk_count += 1 if bulk_count % 100 == 0: self.db_connection.end_bulk_upsert() bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) if bulk_count != 0 and bulk_count % 100 == 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op) def entity_measure(self): tp = 0 tn = 0 fp = 0 fn = 0 count = 0 try: tweets = self.db_connection.get_random_sample( collection=DB.RELEVANT_TWEET_COLLECTION, query={ "$and": [{ TWEET.SET_TO_FACTCHECK: True }, { TWEET.ENTITIES_COUNT: { "$eq": 1 } }] }, size=1) total = 5 for tweet in tweets: print tweet['text'] print '----ENTITIES----' entities = [x['entity'] for x in tweet['entities']] print entities print '----INPUT-------' tp_input = int( raw_input("Is an entity and API says it's an entity\n")) if tp_input == 0: fp += 1 tn_input = int( raw_input( "Is not an entity, API says it's not an entity\n")) fn_input = int(raw_input("Is an entity, API says it's not \n")) print "\n\n\n" tp += tp_input tn += tn_input fn += fn_input count += 1 total -= 1 print "total: %s" % total print "tp: %s" % tp print "tn: 
%s" % tn print "fp: %s" % fp print "fn: %s" % fn except Exception as e: print e print "count: %s" % count print "tp: %s" % tp print "tn: %s" % tn print "fp: %s" % fp print "fn: %s" % fn def mp_evaluation(self): total_rank = 0 topic_count = 0 mp_topics = self.db_connection.find_document( collection=DB.MP_COLLECTION, filter={"topics": { "$exists": True }}, projection={"topics": 1}) for topics in mp_topics: for topic in topics["topics"]: average_rank = 0 rank = self.db_connection.find_document( collection=DB.RELEVANT_TOPICS, filter={"name": topic}, projection={"identified_as_topic": 1}) for item in rank: average_rank += item["identified_as_topic"] # for item in rank: # print item # average_rank += rank total_rank += average_rank / len(topics['topics']) # print "b" # topic_count += len(topics["topics"]) count = mp_topics.count() print total_rank / count print topic_count
def __init__(self): self.db_connection = DBConnection()
class HumanComputeGUI(tk.Frame): def __init__(self, parent): self.db_connection = DBConnection() self.bulk_count = 0 tk.Frame.__init__(self, parent) # create a prompt, an input box, an output label, # and a button to do the computation self.prompt = tk.Label(self, text="Enter a number:", anchor="w", wraplength=500) # self.entry = tk.Entry(self) self.relevant = tk.Button(self, text="Relevant", command=self.calculate1) self.not_relevant = tk.Button(self, text="Not Relevant", command=self.calculate2) self.output = tk.Label(self, text="") # lay the widgets out on the screen. self.prompt.pack(side="top", fill="x") # self.entry.pack(side="top", fill="x", padx=20) self.output.pack(side="top", fill="x", expand=True) self.not_relevant.pack(side="bottom") self.relevant.pack(side="bottom") self.tweets = self.db_connection.find_document( collection=DB.RELEVANT_TWEET_COLLECTION, filter={ "$and": [{ "crowdsourced": { "$exists": False } }, { TWEET.SET_TO_FACTCHECK: { "$exists": False } }, { TWEET.TOPICS: { "$exists": True } }] }, # {TWEET.SET_TO_FACTCHECK, projection={"text": 1}, sort=True, sort_field="retweet_count", limit=500) self.current = self.tweets.next() self.bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) self.bulk_count = 0 self.prompt.configure(text=self.current["text"]) # for tweet in tweets: def calculate1(self): try: self.db_connection.add_to_bulk_upsert( query={"_id": self.current["_id"]}, data={TWEET.SET_TO_FACTCHECK: True}, bulk_op=self.bulk_op) self.bulk_count += 1 if self.bulk_count != 0 and self.bulk_count % 100 == 0: self.db_connection.end_bulk_upsert(bulk_op=self.bulk_op) self.bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) self.current = self.tweets.next() if self.current: self.prompt.configure( text=self.current['text'].encode('ascii', 'ignore')) else: if self.bulk_count != 0 and self.bulk_count % 100 == 0: self.db_connection.end_bulk_upsert(bulk_op=self.bulk_op) # result = self.not_relevant.getboolean(False) except Exception as e: print e # set the output widget to have our result # self.output.configure(text=result) def calculate2(self): try: result = self.relevant.getboolean(True) self.db_connection.add_to_bulk_upsert( query={"_id": self.current["_id"]}, data={TWEET.SET_TO_FACTCHECK: False}, bulk_op=self.bulk_op) self.bulk_count += 1 if self.bulk_count != 0 and self.bulk_count % 100 == 0: self.db_connection.end_bulk_upsert(bulk_op=self.bulk_op) self.bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) self.current = self.tweets.next() if self.current: self.prompt.configure( text=self.current['text'].encode('ascii', 'ignore')) else: if self.bulk_count != 0 and self.bulk_count % 100 == 0: self.db_connection.end_bulk_upsert(bulk_op=self.bulk_op) # result = self.not_relevant.getboolean(False) except Exception as e: print e # set the output widget to have our result # self.output.configure(text=result) def label(self, label=False, fact_checking=False): ''' :param label: Determine whether worth fact-checking :param fact_checking: Determine the truth of it :return: ''' start_epoch = 1520812800 bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) bulk_count = 0 if label: # tweets = list(self.db_connection.get_random_sample(collection=DB.RELEVANT_TWEET_COLLECTION, # query={"$and": [{"crowdsourced": {"$exists": False}}, # {TWEET.SET_TO_FACTCHECK: # {"$exists": False}}]}, # size=500)) tweets = self.db_connection.find_document( 
collection=DB.RELEVANT_TWEET_COLLECTION, filter={ "$and": [{ "crowdsourced": { "$exists": False } }, { TWEET.SET_TO_FACTCHECK: { "$exists": False } }] }, # {TWEET.SET_TO_FACTCHECK, projection={"text": 1}, sort=True, sort_field="retweet_count", limit=500) for tweet in tweets: print tweet['text'] worth = raw_input() if worth == "y": self.db_connection.add_to_bulk_upsert( query={"_id": tweet["_id"]}, data={TWEET.SET_TO_FACTCHECK: True}, bulk_op=bulk_op) else: self.db_connection.add_to_bulk_upsert( query={"_id": tweet["_id"]}, data={TWEET.SET_TO_FACTCHECK: False}, bulk_op=bulk_op) self.bulk_count += 1 print "\n" if self.bulk_count != 0 and self.bulk_count % 100 == 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op) bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) # if fact_checking: # tweets = list(self.db_connection.get_random_sample(collection=DB.RELEVANT_TWEET_COLLECTION, # query={"$and": [{"crowdsourced": {"$exists": False}}, # {TWEET.SET_TO_FACTCHECK: True}]}, # size=100)) # # for tweet in tweets: # print tweet['text'] # rating = raw_input() # self.db_connection.add_to_bulk_upsert(query={"_id": tweet["_id"]}, # data={TWEET.LABEL: rating == " "}, # bulk_op=bulk_op) # print "---\n" # bulk_count += 1 # # if bulk_count % 100 == 0: # self.db_connection.end_bulk_upsert() # bulk_op = self.db_connection.start_bulk_upsert(collection=DB.RELEVANT_TWEET_COLLECTION) # if self.bulk_count != 0 and self.bulk_count % 100 == 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
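The label() and calculate*() methods above all repeat the same bookkeeping: flush the bulk op every 100 updates, restart it, then do a final flush at the end. A sketch of a thin wrapper around the DBConnection bulk API used throughout this file (start_bulk_upsert / add_to_bulk_upsert / end_bulk_upsert); the class name and batch size are illustrative, not part of the original code.

class BulkUpserter(object):
    """Batches upserts and flushes every `batch_size` additions."""

    def __init__(self, db_connection, collection, batch_size=100):
        self.db = db_connection
        self.collection = collection
        self.batch_size = batch_size
        self.count = 0
        self.bulk_op = db_connection.start_bulk_upsert(collection=collection)

    def add(self, query, data):
        self.db.add_to_bulk_upsert(query=query, data=data, bulk_op=self.bulk_op)
        self.count += 1
        if self.count % self.batch_size == 0:
            self.db.end_bulk_upsert(bulk_op=self.bulk_op)
            self.bulk_op = self.db.start_bulk_upsert(collection=self.collection)

    def flush(self):
        # Push any remainder that did not land on a batch boundary.
        if self.count % self.batch_size != 0:
            self.db.end_bulk_upsert(bulk_op=self.bulk_op)

# updates = BulkUpserter(self.db_connection, DB.RELEVANT_TWEET_COLLECTION)
# updates.add(query={"_id": tweet["_id"]}, data={TWEET.SET_TO_FACTCHECK: True})
# updates.flush()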
def setUp(self): self.db_connection = DBConnection()
class HumanComputeGUIEntity(tk.Frame): def __init__(self, parent): self.db_connection = DBConnection() self.bulk_count = 0 tk.Frame.__init__(self, parent) # create a prompt, an input box, an output label, # and a button to do the computation self.prompt = tk.Label(self, text="Enter a number:", anchor="w", wraplength=500) self.entities_prompt = tk.Label(self, text="entities", anchor="w", wraplength=500) # self.entry = tk.Entry(self) self.tp = tk.Button(self, text="Is an entity and API says it's an entity", command=self.calculate1) self.tn = tk.Button(self, text="Is not an entity, API does not include it", command=self.calculate2) self.fp = tk.Button(self, text='Is not an entity, API includes it', command=self.calculate3) self.fn = tk.Button(self, text='Is an entity, API does not include it', command=self.calculate4) self.output = tk.Label(self, text="") # lay the widgets out on the screen. self.prompt.pack(side="top", fill="x") self.entities_prompt.pack(side="bottom") # self.entry.pack(side="top", fill="x", padx=20) self.output.pack(side="top", fill="x", expand=True) self.fn.pack(side="bottom") self.fp.pack(side="bottom") self.tn.pack(side="bottom") self.tp.pack(side="bottom") self.tweets = self.db_connection.get_random_sample( collection=DB.RELEVANT_TWEET_COLLECTION, query={ "$and": [{ TWEET.SET_TO_FACTCHECK: True }, { TWEET.ENTITIES_COUNT: { "$eq": 1 } }] }, size=200) self.current = self.tweets.next() self.prompt.configure(text=self.current["text"]) self.entities_prompt.configure( text="Entities: %s" % [x['entity'] for x in self.current["entities"]]) self.tp = 0 self.tn = 0 self.fp = 0 self.fn = 0 # for tweet in tweets: def calculate1(self): try: self.tp += 1 self.current = self.tweets.next() if self.current: self.prompt.configure( text=self.current['text'].encode('ascii', 'ignore')) self.entities_prompt.configure( text="Entities: %s" % [x['entity'] for x in self.current["entities"]]) else: print "tp: %s" % self.tp print "tn: %s" % self.tn print "fp: %s" % self.fp print "fn: %s" % self.fn except Exception as e: print e print "tp: %s" % self.tp print "tn: %s" % self.tn print "fp: %s" % self.fp print "fn: %s" % self.fn def calculate2(self): try: self.tn += 1 self.current = self.tweets.next() if self.current: self.prompt.configure( text=self.current['text'].encode('ascii', 'ignore')) self.entities_prompt.configure( text="Entities: %s" % [x['entity'] for x in self.current["entities"]]) else: print "tp: %s" % self.tp print "tn: %s" % self.tn print "fp: %s" % self.fp print "fn: %s" % self.fn except Exception as e: print e print "tp: %s" % self.tp print "tn: %s" % self.tn print "fp: %s" % self.fp print "fn: %s" % self.fn def calculate3(self): try: self.fp += 1 self.current = self.tweets.next() if self.current: self.prompt.configure( text=self.current['text'].encode('ascii', 'ignore')) self.entities_prompt.configure( text="Entities: %s" % [x['entity'] for x in self.current["entities"]]) else: print "tp: %s" % self.tp print "tn: %s" % self.tn print "fp: %s" % self.fp print "fn: %s" % self.fn except Exception as e: print e print "tp: %s" % self.tp print "tn: %s" % self.tn print "fp: %s" % self.fp print "fn: %s" % self.fn def calculate4(self): try: self.fn += 1 self.current = self.tweets.next() if self.current: self.prompt.configure( text=self.current['text'].encode('ascii', 'ignore')) self.entities_prompt.configure( text=[x['entity'] for x in self.current["entities"]]) else: print "tp: %s" % self.tp print "tn: %s" % self.tn print "fp: %s" % self.fp print "fn: %s" % self.fn except Exception as e: 
print e print "tp: %s" % self.tp print "tn: %s" % self.tn print "fp: %s" % self.fp print "fn: %s" % self.fn def label(self, label=False, fact_checking=False): ''' :param label: Determine whether worth fact-checking :param fact_checking: Determine the truth of it :return: ''' start_epoch = 1520812800 bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) bulk_count = 0 if label: # tweets = list(self.db_connection.get_random_sample(collection=DB.RELEVANT_TWEET_COLLECTION, # query={"$and": [{"crowdsourced": {"$exists": False}}, # {TWEET.SET_TO_FACTCHECK: # {"$exists": False}}]}, # size=500)) tweets = self.db_connection.find_document( collection=DB.RELEVANT_TWEET_COLLECTION, filter={ "$and": [{ "crowdsourced": { "$exists": False } }, { TWEET.SET_TO_FACTCHECK: { "$exists": False } }] }, # {TWEET.SET_TO_FACTCHECK, projection={"text": 1}, sort=True, sort_field="retweet_count", limit=500) for tweet in tweets: print tweet['text'] worth = raw_input() if worth == "y": self.db_connection.add_to_bulk_upsert( query={"_id": tweet["_id"]}, data={TWEET.SET_TO_FACTCHECK: True}, bulk_op=bulk_op) else: self.db_connection.add_to_bulk_upsert( query={"_id": tweet["_id"]}, data={TWEET.SET_TO_FACTCHECK: False}, bulk_op=bulk_op) self.bulk_count += 1 print "\n" if self.bulk_count != 0 and self.bulk_count % 100 == 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op) bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) # if fact_checking: # tweets = list(self.db_connection.get_random_sample(collection=DB.RELEVANT_TWEET_COLLECTION, # query={"$and": [{"crowdsourced": {"$exists": False}}, # {TWEET.SET_TO_FACTCHECK: True}]}, # size=100)) # # for tweet in tweets: # print tweet['text'] # rating = raw_input() # self.db_connection.add_to_bulk_upsert(query={"_id": tweet["_id"]}, # data={TWEET.LABEL: rating == " "}, # bulk_op=bulk_op) # print "---\n" # bulk_count += 1 # # if bulk_count % 100 == 0: # self.db_connection.end_bulk_upsert() # bulk_op = self.db_connection.start_bulk_upsert(collection=DB.RELEVANT_TWEET_COLLECTION) # if self.bulk_count != 0 and self.bulk_count % 100 == 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
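The four calculate*() callbacks in HumanComputeGUIEntity differ only in which counter they bump, and __init__ rebinds self.tp/self.tn/self.fp/self.fn from Button widgets to integers right after packing them. A hedged re-sketch of the same wiring with separate names and one callback factory; the class name and layout are illustrative, not the original implementation.

import Tkinter as tk  # "import tkinter as tk" on Python 3

class EntityJudgeFrame(tk.Frame):
    def __init__(self, parent, tweets):
        tk.Frame.__init__(self, parent)
        self.tweets = iter(tweets)
        self.counts = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}
        self.prompt = tk.Label(self, wraplength=500, anchor="w")
        self.prompt.pack(side="top", fill="x")
        for key, caption in [("tp", "Is an entity and API says it's an entity"),
                             ("tn", "Is not an entity, API does not include it"),
                             ("fp", "Is not an entity, API includes it"),
                             ("fn", "Is an entity, API does not include it")]:
            tk.Button(self, text=caption,
                      command=lambda k=key: self.record(k)).pack(side="bottom")
        self.advance()  # show the first tweet

    def record(self, key):
        self.counts[key] += 1
        self.advance()

    def advance(self):
        try:
            current = next(self.tweets)
            self.prompt.configure(text=current["text"])
        except StopIteration:
            print(self.counts)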
class Relevancy(object): def __init__(self): self.db_connection = DBConnection() self.twitter_api = Twitter(os.environ.get(CREDS.TWITTER_KEY), os.environ.get(CREDS.TWITTER_SECRET), os.environ.get(CREDS.TWITTER_TOKEN), os.environ.get(CREDS.TWITTER_TOKEN_SECRET), self.db_connection) def clean_tweet(self, tweet): regex_remove = "(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|&|amp|(\w+:\/\/\S+)|^RT|http.+?" tweet_text = re.sub(regex_remove, '', tweet["text"]).strip() tweet_id = tweet["_id"] stopword_list = [] stopword_file = open('stopwords.txt', 'r') for line in stopword_file: stopword_list.append(line.strip()) stopword_list = stopword_list + stopwords.words('english') stop_words = set(stopword_list) tweet_text = " ".join(word for word in tweet_text.split() if word not in stop_words) tweet["text"] = tweet_text return tweet def cleaner(self, tweets): ''' Remove tweets that are too insignificant to classify for relevance score e.g. tweets with one word :param tweets: list of tweets to clean :return: ''' for tweet in tweets: try: if tweet['text']: tweet_data = self.twitter_api.get_status( tweet_id=tweet["_id"]) lang = detect(tweet['text']) if tweet_data.in_reply_to_status_id: # It's a reply, not worth fact-checking self.db_connection.delete_tweet(tweet_id=tweet["_id"]) elif lang != 'en': self.db_connection.delete_tweet(tweet_id=tweet["_id"]) elif len(re.findall(r'\w+', tweet['text'])) <= 10: self.db_connection.delete_tweet(tweet_id=tweet["_id"]) elif tweet['text'].count('@') > 4: self.db_connection.delete_tweet(tweet_id=tweet["_id"]) elif tweet['text'].count('#') > 4: self.db_connection.delete_tweet(tweet_id=tweet["_id"]) except LangDetectException as e: self.db_connection.delete_tweet(tweet['text']) def get_prediction_model(self, timestamp, time_interval): ''' Given a timestamp, gets relevant: - News articles - Trends Builds the contents of these into similarity measure object :param tweet: Tweet to analyse :param timestamp: Timestamp to analyse from :param time_interval: Interval of time for which to create the similarity :return: similarity measure object to query tweets against ''' start_timestamp = timestamp - time_interval articles = [] articles_ingest = self.db_connection.find_document( collection=DB.NEWS_ARTICLES, filter={ "$and": [{ "timestamp": { "$gt": start_timestamp } }, { "timestamp": { "$lt": timestamp } }] }, projection={ "title": 1, "description": 1 }) if articles_ingest.count() > 0: for article in articles_ingest: if 'description' in article: if article['description']: articles.append(article['description']) if 'title' in article: if article['title']: articles.append(article['title']) gen_docs = [[w.lower() for w in word_tokenize(text)] for text in articles] dictionary = gensim.corpora.Dictionary(gen_docs) corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs] tf_idf = gensim.models.TfidfModel(corpus) # sims = gensim.similarities.Similarity('gensim', tf_idf[corpus], num_features=len(dictionary)) index = gensim.similarities.MatrixSimilarity( tf_idf[corpus], num_features=len(dictionary)) return [index, dictionary, tf_idf] else: return None def calculate_relevance(self, tweets, timestamp, time_interval): start_timestamp = timestamp - time_interval model = self.get_prediction_model(timestamp=timestamp, time_interval=time_interval) if model: twitter_trends_ingest = self.db_connection.find_document( collection=DB.TWITTER_TRENDS, filter={ "$and": [{ "timestamp_epoch": { "$gt": start_timestamp } }, { "timestamp_epoch": { "$lt": timestamp } }] }, projection={"name": 1}) wiki_trends_ingest = 
self.db_connection.find_document( collection=DB.WIKI_TRENDS, filter={ "$and": [{ "epoch_timestamp": { "$gt": start_timestamp } }, { "epoch_timestamp": { "$lt": timestamp } }] }, projection={ "name": 1, "rank": 1 }) bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) bulk_count = 0 for tweet in tweets: tweet = self.clean_tweet(tweet) query_doc = [w.lower() for w in word_tokenize(tweet['text'])] query_doc_bow = model[1].doc2bow(query_doc) query_doc_tf_idf = model[2][query_doc_bow] sims = model[0][query_doc_tf_idf] relevance = sims[sims != 0].mean() if not math.isnan(relevance): twitter_trends = [] wiki_trends = [] for trend in twitter_trends_ingest: twitter_trends.append(trend['name']) for trend in wiki_trends_ingest: wiki_trends.append(trend['name']) for trend in twitter_trends: if trend in tweet: relevance += ( 0.1 * relevance ) # 10% relevance booster for each trend for trend in wiki_trends: if trend in tweet: relevance += (1 - ((trend["rank"] - 1) / 1000)) * ( 0.1 * relevance) # Scaled booster relevance = float(relevance) if time_interval == TIME_INTERVAL.DAY: self.db_connection.add_to_bulk_upsert( query={"_id": tweet["_id"]}, data={RELEVANCY_INTERVAL.DAY: relevance}, bulk_op=bulk_op) bulk_count += 1 # self.db_connection.update_tweet(tweet_id=tweet["_id"], update={RELEVANCY_INTERVAL.DAY: relevance}) elif time_interval == TIME_INTERVAL.WEEK: self.db_connection.add_to_bulk_upsert( query={"_id": tweet["_id"]}, data={RELEVANCY_INTERVAL.WEEK: relevance}, bulk_op=bulk_op) bulk_count += 1 # self.db_connection.update_tweet(tweet_id=tweet["_id"], update={RELEVANCY_INTERVAL.WEEK: relevance}) elif time_interval == TIME_INTERVAL.WEEK * 2: self.db_connection.add_to_bulk_upsert( query={"_id": tweet["_id"]}, data={RELEVANCY_INTERVAL.TWO_WEEKS: relevance}, bulk_op=bulk_op) bulk_count += 1 # self.db_connection.update_tweet(tweet_id=tweet["_id"], update={RELEVANCY_INTERVAL.TWO_WEEKS: relevance}) elif time_interval == TIME_INTERVAL.MONTH: self.db_connection.add_to_bulk_upsert( query={"_id": tweet["_id"]}, data={RELEVANCY_INTERVAL.MONTH: relevance}, bulk_op=bulk_op) bulk_count += 1 # self.db_connection.update_tweet(tweet_id=tweet["_id"], update={RELEVANCY_INTERVAL.MONTH: relevance}) if bulk_count % 100 == 0: logger.info("Insert bulk data for relevancy: %s" % bulk_count) self.db_connection.end_bulk_upsert(bulk_op=bulk_op) bulk_op = self.db_connection.start_bulk_upsert( collection=DB.RELEVANT_TWEET_COLLECTION) else: continue if bulk_count % 100 != 0: self.db_connection.end_bulk_upsert(bulk_op=bulk_op) logger.info("Inserted final bulk data %s" % bulk_count)
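A condensed, standalone view of the gensim TF-IDF pipeline that get_prediction_model and calculate_relevance build, scoring a tweet as the mean similarity over its non-zero matches; the trend boosting, error handling and DB plumbing above are left out of this sketch.

import gensim
from nltk.tokenize import word_tokenize

def build_index(texts):
    """Index a list of article titles/descriptions for similarity queries."""
    docs = [[w.lower() for w in word_tokenize(text)] for text in texts]
    dictionary = gensim.corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    tf_idf = gensim.models.TfidfModel(corpus)
    index = gensim.similarities.MatrixSimilarity(tf_idf[corpus],
                                                 num_features=len(dictionary))
    return index, dictionary, tf_idf

def relevance(index, dictionary, tf_idf, tweet_text):
    """Mean non-zero cosine similarity between the tweet and the indexed news."""
    query_bow = dictionary.doc2bow([w.lower() for w in word_tokenize(tweet_text)])
    sims = index[tf_idf[query_bow]]
    nonzero = sims[sims != 0]
    return float(nonzero.mean()) if len(nonzero) else 0.0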