class TestRawTweetDAO(CustomTestCase):
    """Tests for RawTweetDAO flag-setting helpers, backed by mongomock."""

    def setUp(self) -> None:
        super(TestRawTweetDAO, self).setUp()
        # Point the shared Mongo wrapper at an in-memory database so no
        # real server is touched.
        Mongo().db = mongomock.database.Database(mongomock.MongoClient(), 'elections', _store=None)
        self.target = RawTweetDAO()

    def tearDown(self) -> None:
        # This has to be done because we are testing a Singleton: drop the
        # cached instance so the next test builds a fresh DAO.
        RawTweetDAO._instances.clear()

    def _assert_flag_is_set(self, flag: str) -> None:
        # Insert a tweet, invoke the flag-setter named `flag`, and verify
        # the stored document carries a truthy value under that same key.
        tweet = {'_id': 'some_id'}
        self.target.insert_tweet(tweet)
        getattr(self.target, flag)(tweet)
        stored = self.target.get_first({'_id': 'some_id'})
        assert stored is not None
        assert stored.get(flag, None) is not None
        assert stored[flag]

    def test_cooccurrence_checked(self):
        self._assert_flag_is_set('cooccurrence_checked')

    def test_hashtag_origin_checked(self):
        self._assert_flag_is_set('hashtag_origin_checked')
def insert_hashtags(cls):
    """
    Mirror hashtags of every unprocessed raw tweet into the user-hashtag
    collection.

    Fetches all tweets whose 'in_user_hashtag_collection' flag is absent,
    inserts each tweet's hashtags via insert_hashtags_of_one_tweet, and
    flags the tweet so it is skipped on subsequent runs.
    """
    cls.get_logger().info("Starting User Hashtag process.")
    # Absence of the flag means the tweet was never seen by this service.
    tweets_cursor = RawTweetDAO().get_all(
        {"in_user_hashtag_collection": {'$exists': False}})
    for tweet in tweets_cursor:
        cls.insert_hashtags_of_one_tweet(tweet)
        # Mark as processed so the tweet is not picked up again.
        RawTweetDAO().update_first({'_id': tweet['_id']},
                                   {'in_user_hashtag_collection': True})
    cls.get_logger().info("User Hashtag Service finished.")
def store_new_tweets(cls, follower_download_tweets, min_tweet_date):
    """
    Store new follower's tweets since last update.

    Iterates tweets (assumed newest-first) and, for each tweet at or after
    `min_tweet_date`, stores a normalized copy and feeds it to the hashtag
    services. Stops at the first tweet older than `min_tweet_date` or the
    first duplicate, since everything beyond it is already stored.
    """
    for tweet in follower_download_tweets:
        tweet_date = cls.get_formatted_date(tweet['created_at'])
        if tweet_date < min_tweet_date:
            # Tweets are assumed ordered newest-first — TODO confirm;
            # once one is too old, the rest are too.
            return
        try:
            tweet_copy = tweet.copy()
            # BUG FIX: the original popped from `tweet` (the source dict)
            # after copying, so the stale keys 'id_str', 'full_text' and
            # 'user' remained in the stored copy. Pop from the copy so the
            # persisted document only carries the normalized fields, and
            # the caller's dict stays untouched.
            tweet_copy["_id"] = tweet_copy.pop('id_str', None)
            tweet_copy.pop('id', None)
            tweet_copy["text"] = tweet_copy.pop('full_text', None)
            tweet_copy['created_at'] = tweet_date
            tweet_copy['user_id'] = tweet_copy.pop('user')['id_str']
            tweet_copy['in_user_hashtag_collection'] = True
            RawTweetDAO().insert_tweet(tweet_copy)
            HashtagOriginService().process_tweet(tweet_copy)
            HashtagCooccurrenceService().process_tweet(tweet_copy)
            UserHashtagService().insert_hashtags_of_one_tweet(tweet_copy)
        except DuplicatedTweetError:
            # Tweet already stored: all older tweets are stored as well.
            return
def process_tweet(cls, tweet):
    """
    Update the hashtag-origin collection for every distinct hashtag in
    *tweet*, then flag the tweet as origin-checked.
    """
    distinct_hashtags = {h['text'] for h in tweet['entities']['hashtags']}
    for hashtag in distinct_hashtags:
        # Lowercased text is the collection key.
        key = hashtag.lower()
        # Per-hashtag lock: avoids concurrent workers overwriting each
        # other's origin record.
        ConcurrencyUtils().create_lock(key)
        ConcurrencyUtils().acquire_lock(key)
        document = HashtagDAO().find(key)
        # The tweet only replaces the stored origin data when it predates
        # the currently recorded tweet; otherwise passing None just bumps
        # the hashtag's appearance count.
        tweet_is_older = (document is None or
                          document['created_at'].timestamp() > tweet['created_at'].timestamp())
        HashtagDAO().put(key, tweet if tweet_is_older else None, hashtag)
        # Wrap releasing to avoid exploding if the lock was never held.
        try:
            ConcurrencyUtils().release_lock(key)
        except RuntimeError:
            cls.get_logger().error(
                f'Tried to release a lock that was never acquired with id {key}.'
            )
            SlackHelper.post_message_to_channel(
                cls.SLACK_MESSAGE_FORMAT % key, '#errors')
    # Mark tweet as already checked.
    RawTweetDAO().hashtag_origin_checked(tweet)
def get_necessary_data(cls):
    """ Retrieve db data and create candidates list. """
    # Required candidates come as (index-by-name, group) from the DAO.
    index, group = CandidateDAO().get_required_candidates()
    names = list(index)
    rt_cursor = RawTweetDAO().get_rt_to_candidates_cursor(names)
    return index, names, group, rt_cursor
def send_server_status(cls):
    """
    Post a daily status summary (total tweets downloaded, followers
    updated since yesterday) to the configured channel. No-op outside
    the production environment.
    """
    # Only production posts status messages.
    if not EnvironmentUtils.is_prod(cls.__env):
        return
    yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
    followers_updated = RawFollowerDAO().get_users_updated_since_date(yesterday)
    tweets_updated = RawTweetDAO().get_count()
    # new_followers = CandidatesFollowersDAO().get()
    message = (f'Cantidad de tweets descargados hasta el momento: {tweets_updated} \n '
               f'Usuarios actualizados durante el día de ayer: {followers_updated} \n')
    cls.post_message_to_channel(message)
def process_tweet(cls, tweet):
    """
    Process tweet for hashtag cooccurrence detection.

    Takes every unordered pair of distinct (lowercased) hashtags in the
    tweet and stores it, unless the same user already produced that pair
    on the same day. Finally flags the tweet as cooccurrence-checked.
    """
    # Local import: the enclosing file's import block is not in view.
    from itertools import combinations

    if not cls.__is_processable(tweet):
        return
    # Flatten list of hashtags and keep distinct values only.
    hashtags = list({h['text'].lower() for h in tweet['entities']['hashtags']})
    # itertools.combinations yields each unordered pair exactly once,
    # replacing the original hand-rolled double index loop.
    for first, second in combinations(hashtags, 2):
        pair = sorted([first, second])
        # Store only if the same user didn't use that pair of hashtags
        # in the same day.
        if not CooccurrenceDAO().exists_in_tweet_day(tweet, pair):
            CooccurrenceDAO().store(tweet, pair)
    # Mark tweet as already used.
    RawTweetDAO().cooccurrence_checked(tweet)
def load_tweets(cls):
    """
    Load pre-downloaded follower tweets from per-candidate pickle files
    into the raw-tweet collection.

    For each candidate file, normalizes every tweet newer than 2019-01-01
    (Buenos Aires time) and inserts it; a duplicate aborts that follower's
    remaining (older) tweets.
    """
    cls.get_logger().info('Inserting in DB pre download tweets ')
    candidates = ["cfk", "macri"]
    # Fixed since-date; removed the original no-op `- datetime.timedelta()`.
    min_tweet_date = datetime.datetime(2019, 1, 1).astimezone(
        pytz.timezone('America/Argentina/Buenos_Aires'))
    tweets_updated = 0
    for candidate in candidates:
        path = cls.FOLLOWERS_PATH_FORMAT + candidate + ".pickle"
        download_tweets = {}
        try:
            # `with` closes the file; the original's explicit frb.close()
            # inside the block was redundant and has been dropped.
            with open(path, 'rb') as frb:
                download_tweets = pickle.load(frb)
        except IOError:
            # Best-effort: a missing file only skips this candidate.
            cls.get_logger().error('Error opening the file')
        cls.get_logger().info(f'Inserting in db {candidate}\'s followers tweets.')
        cls.get_logger().info(str(download_tweets.keys()))
        for follower, follower_tweets in download_tweets.items():
            if len(follower_tweets) != 0:
                cls.update_follower_with_first_tweet(follower, follower_tweets[0])
            for tweet in follower_tweets:
                tweet_date = cls.get_formatted_date(tweet['created_at'])
                if tweet_date >= min_tweet_date:
                    # Clean tweet's information: rename Twitter API fields
                    # to the collection's schema, in place.
                    tweet["_id"] = tweet['id_str']
                    tweet.pop('id')
                    tweet.pop('id_str')
                    tweet['text'] = tweet['full_text']
                    tweet.pop('full_text')
                    tweet['created_at'] = tweet_date
                    tweet['user_id'] = tweet['user']['id']
                    tweet.pop('user')
                    try:
                        RawTweetDAO().insert_tweet(tweet)
                    except DuplicatedTweetError:
                        # Already stored; remaining tweets are older.
                        break
                    tweets_updated += 1
            cls.get_logger().info(f'{follower} updated')
    cls.get_logger().info(f'Tweets updated: {str(tweets_updated)}')
def setUp(self) -> None:
    """Route the DAO at an in-memory mongomock 'elections' database."""
    super(TestRawTweetDAO, self).setUp()
    mock_db = mongomock.database.Database(mongomock.MongoClient(), 'elections', _store=None)
    Mongo().db = mock_db
    self.target = RawTweetDAO()