Example #1
    def _user_timeline(self, user_id=None, since_id=None):
        max_id = int(since_id) if since_id else 0
        for count, tweet in enumerate(self.twarc.timeline(user_id=user_id, since_id=since_id)):
            if not count % 100:
                log.debug("Collected %s tweets for %s", count, user_id)
            self.writer.write(tweet)
            # Track the highest tweet id seen so it can be returned as the next since_id.
            max_id = max(max_id, tweet['id'])
            if self.stop_event.is_set():
                break
        return str(max_id) if max_id else None
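The returned max_id is meant to be fed back in as since_id on the next harvest, so each run only collects tweets newer than the last. A minimal standalone sketch of that round trip, assuming twarc 1.x credentials are already configured (e.g. via ~/.twarc); collect_timeline and write_tweet are illustrative names, not from the project:

from twarc import Twarc

def collect_timeline(client, user_id, since_id=None, write_tweet=print):
    # Same pattern as above: remember the highest tweet id seen.
    max_id = int(since_id) if since_id else 0
    for tweet in client.timeline(user_id=user_id, since_id=since_id):
        write_tweet(tweet)
        max_id = max(max_id, tweet['id'])
    return str(max_id) if max_id else None

client = Twarc()
since_id = None
for _ in range(2):  # two consecutive harvests; the second only fetches newer tweets
    since_id = collect_timeline(client, user_id='12345', since_id=since_id) or since_id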
Example #2
    def _move(self, src_file):
        dest_filepath = src_file.filepath.replace(self.collections_path,
                                                  DEFAULT_COLLECTIONS_PATH)
        if isinstance(src_file, AddFile):
            log.debug('Copying %s to s3://%s/%s', src_file.filepath,
                      self.bucket, dest_filepath)
            aws_client('s3').upload_file(src_file.filepath, self.bucket,
                                         dest_filepath)
            if src_file.delete:
                os.remove(src_file.filepath)
        else:
            log.debug('Deleting s3://%s/%s', self.bucket, dest_filepath)
            aws_client('s3').delete_object(Bucket=self.bucket,
                                           Key=dest_filepath)
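aws_client('s3') is a project helper not shown on this page; the two branches correspond to standard boto3 calls. A sketch with made-up bucket and key names:

import boto3

s3 = boto3.client('s3')
# Upload branch (AddFile): copy the local file into the bucket.
s3.upload_file('/tmp/collections/abc/tweets.jsonl.gz', 'my-bucket',
               'collections/abc/tweets.jsonl.gz')
# Delete branch: remove the corresponding object from the bucket.
s3.delete_object(Bucket='my-bucket', Key='collections/abc/tweets.jsonl.gz')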
Example #3
def download_all(bucket, path, local_path):
    paginator = aws_resource('s3').meta.client.get_paginator('list_objects')
    for result in paginator.paginate(Bucket=bucket, Prefix=_prefix(path)):
        for obj in result['Contents']:
            object_key = obj['Key']
            if object_key.endswith('/'):
                continue

            dest_filepath = os.path.join(local_path, _remove_prefix(object_key, path))
            if os.path.isfile(dest_filepath) and os.path.getsize(dest_filepath) == obj['Size']:
                log.debug('Skipping downloading s3://%s/%s to %s', bucket, object_key, dest_filepath)
            else:
                log.debug('Downloading s3://%s/%s to %s', bucket, object_key, dest_filepath)
                os.makedirs(os.path.dirname(dest_filepath), exist_ok=True)
                aws_resource('s3').Bucket(bucket).download_file(object_key, dest_filepath)
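_prefix and _remove_prefix are module helpers that are not shown on this page. A plausible reading, offered only as an assumption about their behavior, is that they normalise the S3 key prefix and strip it when building the local path:

def _prefix(path):
    # Assumed behavior: ensure the prefix ends with a slash so listing
    # 'collections/abc' does not also match 'collections/abcd'.
    return path if path.endswith('/') else path + '/'

def _remove_prefix(key, path):
    # Assumed behavior: drop the prefix so the remainder can be joined onto local_path.
    return key[len(_prefix(path)):]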
Example #4
    def user_timelines(self):
        assert 'users' in self.config
        user_ids = self.config['users'].keys()
        user_changes = []
        with FileQueueingWriter(
                get_users_filepath(self.config['id'], self.harvest_timestamp, collections_path=self.collections_path),
                self.file_queue, delete=True) as users_writer:
            for count, user_id in enumerate(user_ids):
                user_details = self.config['users'][user_id]
                screen_name = user_details.get('screen_name')
                result, user = self._lookup_user(user_id)
                if result != 'OK':
                    # The account could not be fetched (e.g. suspended or deleted);
                    # record the change and, if configured, drop the user.
                    change_details = {
                        'user_id': user_id,
                        'change': result
                    }
                    if 'screen_name' in user_details:
                        change_details['screen_name'] = user_details['screen_name']
                    user_changes.append(change_details)
                    if result in self.config.get('delete_users_for', []):
                        self.changeset.delete_user(user_id)
                    continue
                users_writer.write_json(user)
                if 'screen_name' not in user_details:
                    user_changes.append({
                        'user_id': user_id,
                        'change': 'screen name found',
                        'screen_name': user['screen_name']
                    })
                    self.changeset.update_user('screen_name', user['screen_name'], user_id)
                elif user_details['screen_name'] != user['screen_name']:
                    user_changes.append({
                        'user_id': user_id,
                        'change': 'screen name changed',
                        'screen_name': user['screen_name']
                    })
                    self.changeset.update_user('screen_name', user['screen_name'], user_id)
                log.debug("Collecting timeline of %s (%s of %s)", screen_name or user_id, count + 1, len(user_ids))
                new_max_id = self._user_timeline(user_id=user_id, since_id=user_details.get('since_id'))
                if new_max_id and (new_max_id != user_details.get('since_id')):
                    self.changeset.update_user('since_id', new_max_id, user_id)
                if self.stop_event.is_set():
                    break
        with FileQueueingWriter(get_user_changes_filepath(self.config['id'], self.harvest_timestamp,
                                                          collections_path=self.collections_path),
                                self.file_queue, delete=True) as user_changes_writer:
            user_changes_writer.write_json(user_changes, indent=2)
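The shape of the collection config this method expects can be read off the lookups above; the values below are purely illustrative, and the strings accepted by delete_users_for depend on what _lookup_user actually returns:

config = {
    'id': 'example-collection',
    'users': {
        '12345': {'screen_name': 'example_user', 'since_id': '1050118621198921728'},
        '67890': {},  # no screen_name yet: it is looked up and recorded as a change
    },
    'delete_users_for': ['suspended', 'not_found'],  # hypothetical result strings
}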
Example #5
    def filter(self):
        filter_config = self.config.get('filter')
        track = filter_config.get('track')
        follow = filter_config.get('follow')
        locations = filter_config.get('locations')

        assert track or follow or locations

        max_records = int(self.config['filter'].get('max_records', 0))
        for count, tweet in enumerate(
                self.twarc.filter(track=track, follow=follow, locations=locations, event=self.stop_event)):
            if not count % 1000:
                log.debug("Collected %s tweets", count)
            self.writer.write(tweet)
            if max_records and max_records-1 == count:
                log.debug("Reached max records of %s", max_records)
                self.stop_event.set()
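A filter collection needs at least one of track, follow, or locations in its config. The values below are illustrative only; the formats are those accepted by the Twitter filter stream as passed through twarc:

config = {
    'id': 'example-collection',
    'type': 'filter',
    'filter': {
        'track': 'python,twarc',                # comma-separated keywords
        'follow': '12345,67890',                # comma-separated user ids
        'locations': '-74.0,40.0,-73.0,41.0',   # bounding box: SW lon/lat, NE lon/lat
        'max_records': 10000,                   # optional cap; 0 or missing means unlimited
    },
}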
Example #6
    def search(self):
        assert 'query' in self.config.get('search', {})
        query = self.config['search']['query']
        since_id = self.config['search'].get('since_id')
        max_records = int(self.config['search'].get('max_records', 0))
        max_id = int(since_id) if since_id else 0
        for count, tweet in enumerate(self.twarc.search(q=query, since_id=since_id)):
            if not count % 1000:
                log.debug("Collected %s tweets", count)
            self.writer.write(tweet)
            max_id = max(max_id, tweet['id'])
            if self.stop_event.is_set():
                break
            if max_records and max_records - 1 == count:
                log.debug("Reached max records of %s", max_records)
                break
        # Set since_id on changeset
        self.changeset.update_search(max_id)
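As with the other methods, the search section of the config drives this, and the max_id recorded on the changeset becomes the next run's since_id so only newer tweets are collected. An illustrative search config:

config = {
    'id': 'example-collection',
    'type': 'search',
    'search': {
        'query': 'python OR twarc',
        'since_id': '1050118621198921728',  # highest id from the previous harvest
        'max_records': 5000,
    },
}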
Example #7
    def run(self):
        try:
            log.debug('Starting file processor thread')
            # Keep draining the queue even after a stop is requested so that
            # files queued just before shutdown are still moved.
            while not self.stop_event.is_set() or not self.queue.empty():
                try:
                    src_file = self.queue.get_nowait()
                    if self.bucket:
                        self._move(src_file)
                    else:
                        log.debug('Skipping moving %s since local',
                                  src_file.filepath)
                    self.queue.task_done()
                except Empty:
                    sleep(.5)
            log.debug('Ending file processor thread')
        # pylint: disable=broad-except
        except Exception as exception:
            self.exception = exception
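The loop condition is the important detail: the thread keeps consuming work until a stop has been requested and the queue is empty. The same drain-then-exit pattern reduced to the standard library (names here are illustrative):

import queue
import threading
from time import sleep

stop_event = threading.Event()
work_queue = queue.Queue()

def worker():
    # Run while more work may still arrive OR items remain queued.
    while not stop_event.is_set() or not work_queue.empty():
        try:
            item = work_queue.get_nowait()
            # ... process item ...
            work_queue.task_done()
        except queue.Empty:
            sleep(.5)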
Example #8
    def run(self):
        try:
            log.debug('Starting twarc thread')
            api_method_type = self.config.get('type')
            log.debug("API method type is %s", api_method_type)
            with TweetWriterThread(self.collections_path, self.config['id'], self.harvest_timestamp, self.file_queue,
                                   self.harvest_info, self.tweets_per_file) as self.writer:
                if api_method_type == 'user_timeline':
                    self.user_timelines()
                elif api_method_type == 'filter':
                    self.filter()
                elif api_method_type == 'search':
                    self.search()
                else:
                    raise KeyError('Unknown API method type: {}'.format(api_method_type))
            self.harvest_info.end()
            log.debug('Ending twarc thread')
        # pylint: disable=broad-except
        except Exception as exception:
            self.exception = exception
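Because the collecting happens on a thread, any exception is stashed on self.exception rather than raised; the caller re-raises it after join(), as Example #10 does:

twarc_thread.start()
twarc_thread.join()
if twarc_thread.exception:
    raise twarc_thread.exception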
Example #9
    def lock(self):
        log.debug('Locking')
        lock = {'harvest_id': self.harvest_timestamp.isoformat()}
        with FileQueueingWriter(self.lock_filepath,
                                self.file_queue) as lock_writer:
            lock_writer.write_json(lock, indent=2)
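The lock file itself is tiny: a JSON document containing only the harvest timestamp. Written with plain json for illustration (FileQueueingWriter presumably also puts the file on the queue so it gets synced, as in Examples #2 and #7; the filename and value below are illustrative):

import json
from datetime import datetime, timezone

with open('lock.json', 'w') as lock_file:
    json.dump({'harvest_id': datetime.now(timezone.utc).isoformat()}, lock_file, indent=2)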
Example #10
    def harvest(self):
        log.info('Starting harvester')
        # Sync
        if self.bucket:
            sync_collection_config(self.collections_path, self.collection_id,
                                   self.bucket)

        # Check if collection is locked
        assert_locked(
            get_lock_file(self.collection_id,
                          collections_path=self.collections_path))

        # Start the server
        ServerThread(self.stop_event, self.stopped_event, self.shutdown_event,
                     self.harvest_info, self.port).start()

        # Start the monitor
        if self.monitor:
            MonitoringThread().start()

        # Load the collection config
        collection_config = self._load_collection_config()

        with S3FileMoverThread(self.file_queue, self.collections_path,
                               self.bucket), CollectionLock(
                                   self.collections_path,
                                   self.collection_id,
                                   self.file_queue,
                                   harvest_timestamp=self.harvest_timestamp):
            # Write the collection config file to harvester
            self._write_harvest_collection_config(collection_config)

            # Start collecting
            twarc_thread = TwarcThread(collection_config,
                                       self.collections_path,
                                       self.harvest_timestamp, self.file_queue,
                                       self.changeset, self.stop_event,
                                       self.harvest_info, self.tweets_per_file)
            twarc_thread.start()

            # Wait for collection to stop
            twarc_thread.join()
            if twarc_thread.exception:
                raise twarc_thread.exception

            # Save harvester info
            with FileQueueingWriter(
                    get_harvest_info_file(
                        self.collection_id,
                        self.harvest_timestamp,
                        collections_path=self.collections_path),
                    self.file_queue) as harvest_info_writer:
                harvest_info_writer.write_json(self.harvest_info.to_dict(),
                                               indent=2)
            if self.changeset.has_changes():
                # Sync again
                if self.bucket:
                    sync_collection_config_file(self.collections_path,
                                                self.collection_id,
                                                self.bucket)
                latest_collection_config = self._load_collection_config()
                if latest_collection_config.get('timestamp',
                                                1) != collection_config.get(
                                                    'timestamp', 2):
                    # If it has changed, then delete any updates from changeset for users that no longer exist.
                    log.debug('Cleaning changeset')
                    self.changeset.clean_changeset(latest_collection_config)
                # Merge changes into latest config
                latest_collection_config.merge_changeset(self.changeset)
                # Write config
                with FileQueueingWriter(
                        get_collection_config_filepath(
                            self.collection_id,
                            collections_path=self.collections_path),
                        self.file_queue) as changeset_writer:
                    changeset_writer.write_json(latest_collection_config,
                                                indent=2)

                # Write changeset
                change_timestamp = dateutil.parser.parse(
                    self.changeset['change_timestamp'])
                with FileQueueingWriter(
                        get_changeset_file(
                            self.collection_id,
                            change_timestamp,
                            collections_path=self.collections_path),
                        self.file_queue) as changeset_writer:
                    changeset_writer.write_json(self.changeset, indent=2)

        log.info('Harvesting stopped')
        # All done
        self.stopped_event.set()

        log.debug('Waiting to shut down')
        while not self.shutdown_event.is_set():
            sleep(.5)
        log.info('Shut down')
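The three events give a controller a clean way to stop a running harvest; a sketch of that sequence using the same attributes harvest() relies on (the harvester variable is illustrative, and in practice the ServerThread is what receives the stop request):

harvester.stop_event.set()        # ask the twarc thread to stop collecting
harvester.stopped_event.wait()    # harvest() sets this once files have been handed off
harvester.shutdown_event.set()    # releases the final wait loop so harvest() returns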