class TestRawTweetDAO(CustomTestCase):
    """Tests for RawTweetDAO flag-setting helpers, backed by mongomock."""

    def setUp(self) -> None:
        super(TestRawTweetDAO, self).setUp()
        # Point the shared Mongo wrapper at an in-memory database so no
        # real server is touched.
        Mongo().db = mongomock.database.Database(mongomock.MongoClient(), 'elections', _store=None)
        self.target = RawTweetDAO()

    def tearDown(self) -> None:
        # This has to be done because we are testing a Singleton: drop the
        # cached instance so the next test builds a fresh DAO.
        RawTweetDAO._instances.clear()

    def _assert_flag_is_set(self, flag: str) -> None:
        # Insert a tweet, invoke the flag-setter named `flag`, and verify
        # the stored document carries a truthy value under that same key.
        tweet = {'_id': 'some_id'}
        self.target.insert_tweet(tweet)
        getattr(self.target, flag)(tweet)
        stored = self.target.get_first({'_id': 'some_id'})
        assert stored is not None
        assert stored.get(flag, None) is not None
        assert stored[flag]

    def test_cooccurrence_checked(self):
        self._assert_flag_is_set('cooccurrence_checked')

    def test_hashtag_origin_checked(self):
        self._assert_flag_is_set('hashtag_origin_checked')
def insert_hashtags(cls):
    """
    Mirror hashtags of every unprocessed raw tweet into the user-hashtag
    collection.

    Fetches all tweets whose 'in_user_hashtag_collection' flag is absent,
    inserts each tweet's hashtags via insert_hashtags_of_one_tweet, and
    flags the tweet so it is skipped on subsequent runs.
    """
    cls.get_logger().info("Starting User Hashtag process.")
    # Absence of the flag means the tweet was never seen by this service.
    tweets_cursor = RawTweetDAO().get_all(
        {"in_user_hashtag_collection": {'$exists': False}})
    for tweet in tweets_cursor:
        cls.insert_hashtags_of_one_tweet(tweet)
        # Mark as processed so the tweet is not picked up again.
        RawTweetDAO().update_first({'_id': tweet['_id']},
                                   {'in_user_hashtag_collection': True})
    cls.get_logger().info("User Hashtag Service finished.")
def store_new_tweets(cls, follower_download_tweets, min_tweet_date):
    """
    Store new follower's tweets since last update.

    Iterates tweets (assumed newest-first) and, for each tweet at or after
    `min_tweet_date`, stores a normalized copy and feeds it to the hashtag
    services. Stops at the first tweet older than `min_tweet_date` or the
    first duplicate, since everything beyond it is already stored.
    """
    for tweet in follower_download_tweets:
        tweet_date = cls.get_formatted_date(tweet['created_at'])
        if tweet_date < min_tweet_date:
            # Tweets are assumed ordered newest-first — TODO confirm;
            # once one is too old, the rest are too.
            return
        try:
            tweet_copy = tweet.copy()
            # BUG FIX: the original popped from `tweet` (the source dict)
            # after copying, so the stale keys 'id_str', 'full_text' and
            # 'user' remained in the stored copy. Pop from the copy so the
            # persisted document only carries the normalized fields, and
            # the caller's dict stays untouched.
            tweet_copy["_id"] = tweet_copy.pop('id_str', None)
            tweet_copy.pop('id', None)
            tweet_copy["text"] = tweet_copy.pop('full_text', None)
            tweet_copy['created_at'] = tweet_date
            tweet_copy['user_id'] = tweet_copy.pop('user')['id_str']
            tweet_copy['in_user_hashtag_collection'] = True
            RawTweetDAO().insert_tweet(tweet_copy)
            HashtagOriginService().process_tweet(tweet_copy)
            HashtagCooccurrenceService().process_tweet(tweet_copy)
            UserHashtagService().insert_hashtags_of_one_tweet(tweet_copy)
        except DuplicatedTweetError:
            # Tweet already stored: all older tweets are stored as well.
            return
def process_tweet(cls, tweet):
    """
    Update the hashtag-origin collection for every distinct hashtag in
    *tweet*, then flag the tweet as origin-checked.
    """
    distinct_hashtags = {h['text'] for h in tweet['entities']['hashtags']}
    for hashtag in distinct_hashtags:
        # Lowercased text is the collection key.
        key = hashtag.lower()
        # Per-hashtag lock: avoids concurrent workers overwriting each
        # other's origin record.
        ConcurrencyUtils().create_lock(key)
        ConcurrencyUtils().acquire_lock(key)
        document = HashtagDAO().find(key)
        # The tweet only replaces the stored origin data when it predates
        # the currently recorded tweet; otherwise passing None just bumps
        # the hashtag's appearance count.
        tweet_is_older = (document is None or
                          document['created_at'].timestamp() > tweet['created_at'].timestamp())
        HashtagDAO().put(key, tweet if tweet_is_older else None, hashtag)
        # Wrap releasing to avoid exploding if the lock was never held.
        try:
            ConcurrencyUtils().release_lock(key)
        except RuntimeError:
            cls.get_logger().error(
                f'Tried to release a lock that was never acquired with id {key}.'
            )
            SlackHelper.post_message_to_channel(
                cls.SLACK_MESSAGE_FORMAT % key, '#errors')
    # Mark tweet as already checked.
    RawTweetDAO().hashtag_origin_checked(tweet)
def get_necessary_data(cls):
    """ Retrieve db data and create candidates list. """
    # Required candidates come as (index-by-name, group) from the DAO.
    index, group = CandidateDAO().get_required_candidates()
    names = list(index)
    rt_cursor = RawTweetDAO().get_rt_to_candidates_cursor(names)
    return index, names, group, rt_cursor
def send_server_status(cls):
    """
    Post a daily status summary (total tweets downloaded, followers
    updated since yesterday) to the configured channel. No-op outside
    the production environment.
    """
    # Only production posts status messages.
    if not EnvironmentUtils.is_prod(cls.__env):
        return
    yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
    followers_updated = RawFollowerDAO().get_users_updated_since_date(yesterday)
    tweets_updated = RawTweetDAO().get_count()
    # new_followers = CandidatesFollowersDAO().get()
    message = (f'Cantidad de tweets descargados hasta el momento: {tweets_updated} \n '
               f'Usuarios actualizados durante el día de ayer: {followers_updated} \n')
    cls.post_message_to_channel(message)
def process_tweet(cls, tweet):
    """
    Process tweet for hashtag cooccurrence detection.

    Takes every unordered pair of distinct (lowercased) hashtags in the
    tweet and stores it, unless the same user already produced that pair
    on the same day. Finally flags the tweet as cooccurrence-checked.
    """
    # Local import: the enclosing file's import block is not in view.
    from itertools import combinations

    if not cls.__is_processable(tweet):
        return
    # Flatten list of hashtags and keep distinct values only.
    hashtags = list({h['text'].lower() for h in tweet['entities']['hashtags']})
    # itertools.combinations yields each unordered pair exactly once,
    # replacing the original hand-rolled double index loop.
    for first, second in combinations(hashtags, 2):
        pair = sorted([first, second])
        # Store only if the same user didn't use that pair of hashtags
        # in the same day.
        if not CooccurrenceDAO().exists_in_tweet_day(tweet, pair):
            CooccurrenceDAO().store(tweet, pair)
    # Mark tweet as already used.
    RawTweetDAO().cooccurrence_checked(tweet)
def load_tweets(cls):
    """
    Load pre-downloaded follower tweets from per-candidate pickle files
    into the raw-tweet collection.

    For each candidate file, normalizes every tweet newer than 2019-01-01
    (Buenos Aires time) and inserts it; a duplicate aborts that follower's
    remaining (older) tweets.
    """
    cls.get_logger().info('Inserting in DB pre download tweets ')
    candidates = ["cfk", "macri"]
    # Fixed since-date; removed the original no-op `- datetime.timedelta()`.
    min_tweet_date = datetime.datetime(2019, 1, 1).astimezone(
        pytz.timezone('America/Argentina/Buenos_Aires'))
    tweets_updated = 0
    for candidate in candidates:
        path = cls.FOLLOWERS_PATH_FORMAT + candidate + ".pickle"
        download_tweets = {}
        try:
            # `with` closes the file; the original's explicit frb.close()
            # inside the block was redundant and has been dropped.
            with open(path, 'rb') as frb:
                download_tweets = pickle.load(frb)
        except IOError:
            # Best-effort: a missing file only skips this candidate.
            cls.get_logger().error('Error opening the file')
        cls.get_logger().info(f'Inserting in db {candidate}\'s followers tweets.')
        cls.get_logger().info(str(download_tweets.keys()))
        for follower, follower_tweets in download_tweets.items():
            if len(follower_tweets) != 0:
                cls.update_follower_with_first_tweet(follower, follower_tweets[0])
            for tweet in follower_tweets:
                tweet_date = cls.get_formatted_date(tweet['created_at'])
                if tweet_date >= min_tweet_date:
                    # Clean tweet's information: rename Twitter API fields
                    # to the collection's schema, in place.
                    tweet["_id"] = tweet['id_str']
                    tweet.pop('id')
                    tweet.pop('id_str')
                    tweet['text'] = tweet['full_text']
                    tweet.pop('full_text')
                    tweet['created_at'] = tweet_date
                    tweet['user_id'] = tweet['user']['id']
                    tweet.pop('user')
                    try:
                        RawTweetDAO().insert_tweet(tweet)
                    except DuplicatedTweetError:
                        # Already stored; remaining tweets are older.
                        break
                    tweets_updated += 1
            cls.get_logger().info(f'{follower} updated')
    cls.get_logger().info(f'Tweets updated: {str(tweets_updated)}')
def setUp(self) -> None:
    """Route the DAO at an in-memory mongomock 'elections' database."""
    super(TestRawTweetDAO, self).setUp()
    mock_db = mongomock.database.Database(mongomock.MongoClient(), 'elections', _store=None)
    Mongo().db = mock_db
    self.target = RawTweetDAO()