Exemplo n.º 1
0
def backfill(path=None, db_name='slpng_giants', collection_name='tweets'):
    """
    Loads tweets previously saved to disk into a database collection.

    Every file matching ``*.t*`` found recursively under the save directory,
    and whose parent directory is named ``stream``, is read line by line. Each
    line is parsed as JSON, passed through ``utils.stream_to_search`` and
    ``utils.timestamp_to_datetime`` and upserted into the collection, keyed on
    the record's ``id``.

    Lines that are not valid JSON are logged and skipped. Records carrying a
    truthy ``delete`` entry are skipped silently. Any other error aborts the
    file being processed, is logged, and processing moves on to the next file.

    Args:
        path (str): Directory to scan. Falls back to the ``output_dir`` config
            value when not given
        db_name (str): Database name handed to ``create_collection``
        collection_name (str): Collection name handed to ``create_collection``

    """
    tweets = create_collection(db_name, collection_name)

    config = Config.get()
    save_dir = os.path.expanduser(path or config['output_dir'])

    pattern = os.path.join(save_dir, '**', '*.t*')
    # Only files sitting directly inside a "stream" directory are back-filled.
    # Filtering up front keeps the progress counter and the remaining-time
    # estimate from being skewed by files that are never processed.
    stream_files = [
        file_path for file_path in glob.glob(pattern, recursive=True)
        if os.path.basename(os.path.dirname(file_path)) == 'stream'
    ]
    total = len(stream_files)
    t0 = datetime.now()
    for idx, file_path in enumerate(stream_files, start=1):
        try:
            for lidx, line in enumerate(utils.readlines(file_path), start=1):
                try:
                    data = json.loads(line)
                except Exception:
                    TwiLogger.exception(
                        f'Backfill: Unable to read line {file_path}:{lidx}')
                    continue
                if data.get('delete'):
                    continue
                data = utils.stream_to_search(data)
                data = utils.timestamp_to_datetime(data)
                tweets.replace_one({'id': data['id']}, data, upsert=True)

            # Estimate the time left from the average time spent per file.
            average = (datetime.now() - t0) / idx
            # Drop the microseconds from the printed timedelta.
            remaining = str((total - idx) * average).split('.')[0]

            TwiLogger.info(f'{idx}/{total} '
                           f'{remaining} '
                           f'{os.sep.join(file_path.split(os.sep)[-3:])}')
        except Exception:
            TwiLogger.exception(f'Backfill: Unable to read file: {file_path}')
Exemplo n.º 2
0
 def id_to_screenname(self):
     """
     Maps every followed user ID to its screen name, prefixed with "@".

     The mapping is cached and reused for 15 minutes. Once the cache is
     empty or expired, every ID in ``self.follow`` is looked up again through
     ``self.api.get_user``.

     Returns:
         dict: Followed user IDs mapped to "@screen_name" strings

     """
     expiry = timedelta(minutes=15)
     last_lookup = self._id_to_screenname_time
     # A cache without a timestamp has never been completed and is not fresh.
     cache_is_fresh = (
         last_lookup is not None
         and datetime.now() - last_lookup <= expiry
     )
     if self._id_to_screenname and cache_is_fresh:
         return self._id_to_screenname

     # Build the new mapping on the side and swap it in once complete, so a
     # failing lookup never leaves a half-filled cache behind and IDs that
     # are no longer followed are dropped.
     refreshed = {}
     for follow_id in self.follow:
         user = self.api.get_user(follow_id)
         refreshed[follow_id] = '@{}'.format(user.screen_name)
     self._id_to_screenname = refreshed
     self._id_to_screenname_time = datetime.now()
     TwiLogger.info(self._id_to_screenname)
     return self._id_to_screenname
Exemplo n.º 3
0
    def on_data(self, json_data):
        """
        Defines the actions to take on data capture.

        Resets the rate limit retry counter and decodes the message. For
        messages carrying a ``created_at`` entry, the user objects returned by
        ``utils.collect_key_values`` are cached in ``self.users``, mentions
        are updated if ``full_user_mentions`` is enabled in the config, and a
        converted copy of the message is upserted into MongoDB if enabled and
        a collection is available. Every message is appended to
        ``self._data`` and written to the output file as one JSON line.
        Messages for which ``get_full_text`` returns text are logged.

        Args:
            json_data (str): String containing tweet data on JSON format

        Returns:
            bool: Always True

        """
        self._rate_limit_retry_count = 0
        os.makedirs(self.output_dir, exist_ok=True)
        file_path = os.path.join(self.output_dir, self.file_name)
        data = json.loads(json_data)
        if data.get('created_at'):
            users = utils.collect_key_values('user', data)
            for user in users:
                user['recorded_at'] = data['created_at']
                self.users[user['id_str']] = user
            if self.config.get('full_user_mentions', False):
                self.update_mentions(data)

            # Add tweet to MongoDB.
            # The collection is compared with None rather than truth-tested:
            # PyMongo 4 collections raise NotImplementedError on bool().
            use_mongo = self.config.get('use_mongo', True)
            if use_mongo and self.mongo_collection is not None:
                try:
                    # Convert a copy so that the data written to disk below
                    # is left untouched.
                    mongo_data = copy.deepcopy(data)
                    mongo_data = utils.timestamp_to_datetime(mongo_data)
                    mongo_data = utils.stream_to_search(mongo_data)
                    self.mongo_collection.replace_one(
                        {'id': mongo_data['id']},
                        mongo_data,
                        upsert=True
                    )
                except Exception:
                    # Best effort: a failing database must not stop the
                    # capture to disk.
                    TwiLogger.exception(
                        'Twicorder Listener: Unable to connect to MongoDB: '
                    )

        self._data.append(data)
        utils.write(json.dumps(data) + '\n', file_path)
        timestamp = '{:%d %b %Y %H:%M:%S}'.format(datetime.now())
        tweet = self.get_full_text(data)
        if not tweet:
            return True
        user = data.get('user', {}).get('screen_name', '-')
        # Keep each logged tweet on a single line.
        oneline_tweet = tweet.replace('\n', ' ')
        TwiLogger.info(f'{timestamp}, @{user}: {oneline_tweet}')
        return True
Exemplo n.º 4
0
 def run(self):
     """
     Fetches queries from the queue and executes them.

     Each query is run repeatedly until it reports itself as done or raises,
     logging the result of ``fetch_log`` after every successful run. Receiving
     ``None`` from the queue terminates the loop.
     """
     while True:
         self._query = self.queue.get()
         # NOTE(review): self.query is presumably a read-only view of
         # self._query defined elsewhere on the class - confirm.
         if self.query is None:
             TwiLogger.info(f'Terminating thread "{self.name}"')
             # The sentinel was fetched with get() like any other item, so it
             # has to be acknowledged too or queue.join() would never return.
             self.queue.task_done()
             break
         while not self.query.done:
             try:
                 self.query.run()
             except Exception:
                 # exception() already appends the active traceback, so a
                 # plain message avoids logging the traceback twice.
                 TwiLogger.exception(
                     f'Query failed in thread "{self.name}"')
                 break
             TwiLogger.info(self.query.fetch_log())
             time.sleep(.2)
         time.sleep(.5)
         self.queue.task_done()
Exemplo n.º 5
0
    def add(self, query):
        """
        Queues the given query on the queue belonging to its end point.

        Nothing is queued if an equal query is already waiting in that queue
        or is the one currently held by the end point's thread.

        Args:
            query (BaseQuery): Query object

        """
        pending = self.get_queue(query.endpoint)
        if query in pending.queue:
            TwiLogger.info(f'Query with ID {query.uid} is already in the queue.')
            return

        worker = self.threads.get(query.endpoint)
        if not (worker and worker.query == query):
            # Neither waiting nor running: accept the query.
            pending.put(query)
            TwiLogger.info(query)
            return

        TwiLogger.info(f'Query with ID {query.uid} is already running.')
Exemplo n.º 6
0
 def track(self):
     """
     Builds the list of terms to track.

     Empty entries are dropped from the ``track`` config value. If any terms
     remain and ``self.follow_also_tracks`` is set, the values of
     ``self.id_to_screenname`` are appended.

     Returns:
         list: Terms to track, or None if no terms are configured

     """
     track_list = [t for t in self.config.get('track') or [] if t] or None
     if track_list and self.follow_also_tracks:
         track_list.extend(self.id_to_screenname.values())
     # Format the list into the message itself; passing it as an extra
     # argument without a placeholder would leave it out of the log.
     TwiLogger.info(f'Tracking: {track_list}')
     return track_list