def test_create_exist_twitter_warc(self):
    """ Crawling a tweet whose warc already exists should replace it with a new one """
    wc.create_twitter_warc(TWITTER_HTML)
    os.chdir("..")
    self.assertTrue(os.path.isfile(
        WARC_TWITTER_DIRECTORY +
        "/https:__twitter.com_wesbos_status_" +
        "519123918422958081.warc.gz"))
def test_create_wrong_url_twitter_warc(self):
    """ Crawling a wrong twitter article should still succeed because Twitter returns a 404 page """
    try:
        os.chdir("..")
        os.remove(WARC_TWITTER_DIRECTORY +
                  "/https:__twitter.com_wesbos_statu.warc.gz")
    except OSError:
        pass
    self.setUp()
    wc.create_twitter_warc(WRONG_TWITTER_HTML)
    os.chdir("..")
    time.sleep(1)
    self.assertTrue(os.path.isfile(
        WARC_TWITTER_DIRECTORY +
        "/https:__twitter.com_wesbos_statu" +
        ".warc.gz"))
def test_create_twitter_warc(self):
    """ Creating a warc for a real twitter url should work """
    try:
        os.chdir("..")
        os.remove(WARC_TWITTER_DIRECTORY +
                  "/https:__twitter.com_wesbos_status_" +
                  "519123918422958081.warc.gz")
    except OSError:
        pass
    self.setUp()
    wc.create_twitter_warc(TWITTER_HTML)
    os.chdir("..")
    time.sleep(1)
    self.assertTrue(os.path.isfile(
        WARC_TWITTER_DIRECTORY +
        "/https:__twitter.com_wesbos_status_" +
        "519123918422958081.warc.gz"))
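# The assertions above rely on the warc file being named after the tweet URL
# with "/" replaced by "_". The helper below is a minimal sketch of that
# convention for reference only; it is hypothetical and not part of the
# warc_creator API.
def _expected_warc_name(url):
    """Return the warc file name the tests expect for a given tweet URL."""
    return url.replace("/", "_") + ".warc.gz"

# e.g. _expected_warc_name("https://twitter.com/wesbos/status/519123918422958081")
# -> "https:__twitter.com_wesbos_status_519123918422958081.warc.gz"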
def parse_tweets(twitter_users, keywords, foreign_sites, tweet_number):
    """ (list of str, list of str, list of str, str) -> None
    Parses through tweets of users, looking for keywords and foreign sites.
    Relevant tweets will be sent to a database.

    Keyword arguments:
    twitter_users  -- List of strings as twitter handles
    keywords       -- List of strings as keywords to search for
    foreign_sites  -- List of strings as sources to search for
    tweet_number   -- Number of tweets to process per user
    """
    config = configuration()['storage']
    django.setup()
    added, updated, no_match = 0, 0, 0
    start = time.time()

    for user in twitter_users:
        # Check for any new command on communication stream
        check_command()
        processed = 0
        tweets = get_tweets(user, tweet_number)
        tweet_followers = get_follower_count(user)
        tweet_count = len(tweets)

        for tweet in tweets:
            # Check for any new command on communication stream
            check_command()

            # Setting correct data for each field
            tweet_id = tweet.id
            tweet_date = timezone.localtime(
                timezone.make_aware(tweet.created_at,
                                    timezone=timezone.get_fixed_timezone(180)))
            tweet_user = tweet.user.screen_name
            tweet_store_date = timezone.localtime(timezone.now())
            tweet_keywords = get_keywords(tweet, keywords)
            tweet_sources = get_sources(tweet, foreign_sites)
            tweet_text = tweet.text

            if not (tweet_keywords == [] and tweet_sources == []):
                tweet_list = Tweet.objects.filter(tweet_id=tweet_id)
                if not tweet_list:
                    # Creating new entry in collection
                    tweet = Tweet(tweet_id=tweet_id, user=tweet_user,
                                  date_added=tweet_store_date,
                                  date_published=tweet_date,
                                  followers=tweet_followers,
                                  text=tweet_text)
                    tweet.save()
                    tweet = Tweet.objects.get(tweet_id=tweet_id)
                    for key in tweet_keywords:
                        tweet.keyword_set.create(keyword=key)
                    for source in tweet_sources:
                        tweet.source_set.create(url=source[0],
                                                url_origin=source[1])
                    added += 1
                else:
                    # Updating the existing entry
                    tweet = tweet_list[0]
                    tweet.text = tweet_text
                    tweet.tweet_id = tweet_id
                    tweet.user = tweet_user
                    # tweet.date_added = tweet_store_date
                    tweet.date_published = tweet_date
                    tweet.followers = tweet_followers
                    tweet.save()
                    for key in tweet_keywords:
                        if not T_keyword.objects.filter(keyword=key):
                            tweet.keyword_set.create(keyword=key)
                    for source in tweet_sources:
                        if not Source.objects.filter(url=source[0]):
                            tweet.source_set.create(url=source[0],
                                                    url_origin=source[1])
                    updated += 1
                warc_creator.create_twitter_warc(
                    'https://twitter.com/' + tweet.user +
                    '/status/' + str(tweet_id))
            else:
                no_match += 1
            processed += 1
            sys.stdout.write("%s (Twitter|%s) %i/%i \r" %
                             (str(timezone.localtime(timezone.now()))[:-13],
                              user, processed, tweet_count))
            sys.stdout.flush()
        print("%s (Twitter|%s) %i/%i " %
              (str(timezone.localtime(timezone.now()))[:-13],
               user, processed, tweet_count))
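# A minimal usage sketch of parse_tweets above. The handle, keywords and
# site list are hypothetical examples, and it assumes Django settings are
# already configured so django.setup() succeeds.
def _parse_tweets_example():
    parse_tweets(twitter_users=["wesbos"],
                 keywords=["javascript", "css"],
                 foreign_sites=["example.com"],
                 tweet_number=100)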
def parse_tweets(twitter_users, keywords, source_sites, tweet_number,
                 source_twitter_list):
    """ (list of str, list of str, list of str, str, list of str) -> None
    Parses through tweets of users, looking for keywords, source sites and
    source twitter accounts. Relevant tweets will be sent to a database.

    Keyword arguments:
    twitter_users        -- List of strings as twitter handles
    keywords             -- List of strings as keywords to search for
    source_sites         -- List of strings as sources to search for
    tweet_number         -- Number of tweets to process per user
    source_twitter_list  -- List of strings as twitter accounts to search for
    """
    config = configuration()["storage"]
    django.setup()
    added, updated, no_match = 0, 0, 0
    start = time.time()

    for user in twitter_users:
        # Check for any new command on communication stream
        check_command()
        processed = 0
        tweets = get_tweets(user, tweet_number)
        tweet_followers = get_follower_count(user)
        tweet_count = len(tweets)

        for i in range(tweet_count):
            tweet = tweets[i]
            try:
                # Check for any new command on communication stream
                check_command()
            except (KeyboardInterrupt, SystemExit):
                raise

            # Setting correct data for each field
            tweet_id = tweet.id
            tweet_date = timezone.localtime(
                timezone.make_aware(tweet.created_at,
                                    timezone=timezone.get_fixed_timezone(180)))
            tweet_user = tweet.user.screen_name
            tweet_store_date = timezone.localtime(timezone.now())
            tweet_keywords = get_keywords(tweet.text, keywords)
            tweet_sources = get_source_sites(tweet.entities["urls"],
                                             source_sites)
            twitter_accounts = get_sources_twitter(tweet.text,
                                                   source_twitter_list)
            tweet_text = tweet.text

            if not (tweet_keywords == [] and tweet_sources[0] == [] and
                    twitter_accounts[0] == []):
                retweet_count = tweet.retweet_count
                favorite_count = tweet.favorite_count
                tweet_list = Tweet.objects.filter(tweet_id=tweet_id)
                if not tweet_list:
                    # Creating new entry in collection
                    tweet = Tweet(tweet_id=tweet_id, name=tweet_user,
                                  date_added=tweet_store_date,
                                  date_published=tweet_date,
                                  text=tweet_text)
                    tweet.save()
                    tweet = Tweet.objects.get(tweet_id=tweet_id)
                    tweet.countlog_set.create(retweet_count=retweet_count,
                                              favorite_count=favorite_count,
                                              date=tweet_store_date)
                    for account in twitter_accounts[0]:
                        tweet.sourcetwitter_set.create(name=account,
                                                       matched=True)
                    for account in twitter_accounts[1]:
                        tweet.sourcetwitter_set.create(name=account,
                                                       matched=False)
                    for key in tweet_keywords:
                        tweet.keyword_set.create(name=key)
                    for source in tweet_sources[0]:
                        tweet.sourcesite_set.create(url=source[0],
                                                    domain=source[1],
                                                    matched=True)
                    for source in tweet_sources[1]:
                        tweet.sourcesite_set.create(url=source[0],
                                                    domain=source[1],
                                                    matched=False)
                    added += 1
                else:
                    # Updating the existing entry
                    tweet = tweet_list[0]
                    tweet.text = tweet_text
                    tweet.tweet_id = tweet_id
                    tweet.name = tweet_user
                    # tweet.date_added = tweet_store_date
                    tweet.date_published = tweet_date
                    tweet.save()
                    if not CountLog.objects.filter(
                            retweet_count=retweet_count,
                            favorite_count=favorite_count):
                        tweet.countlog_set.create(
                            retweet_count=retweet_count,
                            favorite_count=favorite_count,
                            date=tweet_store_date)
                    for key in tweet_keywords:
                        if not TwitterKeyword.objects.filter(name=key):
                            tweet.keyword_set.create(name=key)
                    for source in tweet_sources[0]:
                        if not TwitterSourceSite.objects.filter(url=source[0]):
                            tweet.sourcesite_set.create(url=source[0],
                                                        domain=source[1],
                                                        matched=True)
                    for source in tweet_sources[1]:
                        if not TwitterSourceSite.objects.filter(url=source[0]):
                            tweet.sourcesite_set.create(url=source[0],
                                                        domain=source[1],
                                                        matched=False)
                    for account in twitter_accounts[0]:
                        if not TwitterSourceTwitter.objects.filter(name=account):
                            tweet.sourcetwitter_set.create(name=account,
                                                           matched=True)
                    for account in twitter_accounts[1]:
                        if not TwitterSourceTwitter.objects.filter(name=account):
                            tweet.sourcetwitter_set.create(name=account,
                                                           matched=False)
                    updated += 1
                warc_creator.create_twitter_warc(
                    "https://twitter.com/" + tweet.name +
                    "/status/" + str(tweet_id))
            else:
                no_match += 1
            processed += 1
            print("%s (Twitter|%s) %i/%i \r" %
                  (str(timezone.localtime(timezone.now()))[:-13],
                   user, processed, tweet_count))
            # Release the processed tweet to reduce memory use
            tweets[i] = None
        print("%s (Twitter|%s) %i/%i " %
              (str(timezone.localtime(timezone.now()))[:-13],
               user, processed, tweet_count))
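# The loops above index tweet_sources[0]/[1] and twitter_accounts[0]/[1],
# which assumes the matching helpers return a pair of lists
# (matched, unmatched). The sketch below illustrates only that shape; it is
# hypothetical and not the actual get_source_sites / get_sources_twitter
# implementation (those also carry the url's domain alongside each match).
def _split_matches_example(candidates, watch_list):
    """Illustrative only: split candidates into ([matched], [unmatched])."""
    matched = [c for c in candidates if c in watch_list]
    unmatched = [c for c in candidates if c not in watch_list]
    return matched, unmatched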
def process_tweet(tweet, keywords, source_sites, source_accounts):
    """ Checks whether the given tweet matches the scope and stores it. """
    user, tweet_text, tweet_id, tweet_date = \
        tweet.user, tweet.text, tweet.tweet_id, tweet.date
    tweet_store_date = timezone.localtime(timezone.now())
    tweet_keywords = get_keywords(tweet_text, keywords)
    tweet_sources = get_source_sites(tweet.urls, source_sites)
    twitter_accounts = get_source_twitter(tweet.mentions, source_accounts)
    retweet_count, favorite_count = tweet.retweet_count, tweet.favorite_count

    # Truncate overly long tweet text so it fits the database field
    if len(tweet_text) > 450:
        try:
            tweet_text = tweet_text[:450]
        except:
            return NO_MATCH

    # Finds match
    if tweet_keywords or tweet_sources[0] or twitter_accounts[0]:
        existing_tweets = Tweet.objects.filter(tweet_id=tweet_id)
        if not existing_tweets:
            tweet = Tweet(tweet_id=tweet_id, name=user,
                          date_added=tweet_store_date,
                          date_published=tweet_date,
                          text=tweet_text)
            tweet.save()
            tweet = Tweet.objects.get(tweet_id=tweet_id)
            tweet.countlog_set.create(retweet_count=retweet_count,
                                      favorite_count=favorite_count,
                                      date=tweet_store_date)
            for account in twitter_accounts[0]:
                tweet.sourcetwitter_set.create(name=account, matched=True)
            for account in twitter_accounts[1]:
                tweet.sourcetwitter_set.create(name=account, matched=False)
            for key in tweet_keywords:
                tweet.keyword_set.create(name=key)
            for source in tweet_sources[0]:
                tweet.sourcesite_set.create(url=source[0], domain=source[1],
                                            matched=True)
            for source in tweet_sources[1]:
                tweet.sourcesite_set.create(url=source[0], domain=source[1],
                                            matched=False)
            try:
                warc_creator.create_twitter_warc(
                    'https://twitter.com/' + tweet.name +
                    '/status/' + str(tweet_id))
                # Adjustable: gives time for warc creation and avoids using
                # too many resources
                time.sleep(3)
            except:
                print("Warc error at {}.{}".format(user, tweet_id))
                logging.error("Warc error at {}.{}".format(user, tweet_id))
            return ADDED
        else:
            tweet = existing_tweets[0]
            if not tweet.countlog_set.filter(retweet_count=retweet_count,
                                             favorite_count=favorite_count):
                tweet.countlog_set.create(retweet_count=retweet_count,
                                          favorite_count=favorite_count,
                                          date=tweet_store_date)
            for key in tweet_keywords:
                if not tweet.keyword_set.filter(name=key):
                    tweet.keyword_set.create(name=key)
            for source in tweet_sources[0]:
                if not tweet.sourcesite_set.filter(url=source[0]):
                    tweet.sourcesite_set.create(url=source[0],
                                                domain=source[1],
                                                matched=True)
            for source in tweet_sources[1]:
                if not tweet.sourcesite_set.filter(url=source[0]):
                    tweet.sourcesite_set.create(url=source[0],
                                                domain=source[1],
                                                matched=False)
            for account in twitter_accounts[0]:
                if not tweet.sourcetwitter_set.filter(name=account):
                    tweet.sourcetwitter_set.create(name=account, matched=True)
            for account in twitter_accounts[1]:
                if not tweet.sourcetwitter_set.filter(name=account):
                    tweet.sourcetwitter_set.create(name=account, matched=False)
            return UPDATED
    return NO_MATCH
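# A minimal sketch of how process_tweet is typically driven. The driver
# function and its tallying are hypothetical illustrations, not part of this
# module; only process_tweet and the ADDED / UPDATED / NO_MATCH constants
# come from the code above.
def _process_user_example(tweets, keywords, source_sites, source_accounts):
    """Illustrative only: run process_tweet over fetched tweets and tally
    the results it returns."""
    added = updated = no_match = 0
    for tweet in tweets:
        result = process_tweet(tweet, keywords, source_sites, source_accounts)
        if result == ADDED:
            added += 1
        elif result == UPDATED:
            updated += 1
        else:
            no_match += 1
    return added, updated, no_match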