def _user_timeline(self, user_id=None, since_id=None):
    max_id = int(since_id) if since_id else 0
    for count, tweet in enumerate(self.twarc.timeline(user_id=user_id, since_id=since_id)):
        if not count % 100:
            log.debug("Collected %s tweets for %s", count, user_id)
        self.writer.write(tweet)
        max_id = max(max_id, tweet['id'])
        if self.stop_event.is_set():
            break
    return str(max_id) if max_id else None
def _move(self, src_file):
    dest_filepath = src_file.filepath.replace(self.collections_path, DEFAULT_COLLECTIONS_PATH)
    if isinstance(src_file, AddFile):
        log.debug('Copying %s to s3://%s/%s', src_file.filepath, self.bucket, dest_filepath)
        aws_client('s3').upload_file(src_file.filepath, self.bucket, dest_filepath)
        if src_file.delete:
            os.remove(src_file.filepath)
    else:
        log.debug('Deleting s3://%s/%s', self.bucket, dest_filepath)
        aws_client('s3').delete_object(Bucket=self.bucket, Key=dest_filepath)
def download_all(bucket, path, local_path):
    paginator = aws_resource('s3').meta.client.get_paginator('list_objects')
    for result in paginator.paginate(Bucket=bucket, Prefix=_prefix(path)):
        # Pages with no matching keys omit 'Contents' entirely, so guard the lookup.
        for obj in result.get('Contents', []):
            object_key = obj['Key']
            if object_key.endswith('/'):
                continue
            dest_filepath = os.path.join(local_path, _remove_prefix(object_key, path))
            if os.path.isfile(dest_filepath) and os.path.getsize(dest_filepath) == obj['Size']:
                log.debug('Skipping downloading s3://%s/%s to %s', bucket, object_key, dest_filepath)
            else:
                log.debug('Downloading s3://%s/%s to %s', bucket, object_key, dest_filepath)
                os.makedirs(os.path.dirname(dest_filepath), exist_ok=True)
                aws_resource('s3').Bucket(bucket).download_file(object_key, dest_filepath)
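A minimal usage sketch, assuming a hypothetical bucket name, prefix, and local directory (the _prefix and _remove_prefix helpers are the module's own):

# Mirror every object under collections/my-collection into /tmp/collections,
# skipping files that already exist locally with a matching size.
download_all('my-harvest-bucket', 'collections/my-collection', '/tmp/collections')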
def user_timelines(self):
    assert 'users' in self.config
    user_ids = self.config['users'].keys()
    user_changes = []
    with FileQueueingWriter(
            get_users_filepath(self.config['id'], self.harvest_timestamp,
                               collections_path=self.collections_path),
            self.file_queue, delete=True) as users_writer:
        for count, user_id in enumerate(user_ids):
            user_details = self.config['users'][user_id]
            screen_name = user_details.get('screen_name')
            result, user = self._lookup_user(user_id)
            if result != 'OK':
                change_details = {
                    'user_id': user_id,
                    'change': result
                }
                if 'screen_name' in user_details:
                    change_details['screen_name'] = user_details['screen_name']
                user_changes.append(change_details)
                if result in self.config.get('delete_users_for', []):
                    self.changeset.delete_user(user_id)
                continue
            users_writer.write_json(user)
            if 'screen_name' not in user_details:
                user_changes.append({
                    'user_id': user_id,
                    'change': 'screen name found',
                    'screen_name': user['screen_name']
                })
                self.changeset.update_user('screen_name', user['screen_name'], user_id)
            elif user_details['screen_name'] != user['screen_name']:
                user_changes.append({
                    'user_id': user_id,
                    'change': 'screen name changed',
                    'screen_name': user['screen_name']
                })
                self.changeset.update_user('screen_name', user['screen_name'], user_id)
            log.debug("Collecting timeline of %s (%s of %s)", screen_name or user_id, count + 1,
                      len(user_ids))
            new_max_id = self._user_timeline(user_id=user_id,
                                             since_id=user_details.get('since_id'))
            if new_max_id and (new_max_id != user_details.get('since_id')):
                self.changeset.update_user('since_id', new_max_id, user_id)
            if self.stop_event.is_set():
                break
    with FileQueueingWriter(get_user_changes_filepath(self.config['id'], self.harvest_timestamp,
                                                      collections_path=self.collections_path),
                            self.file_queue, delete=True) as user_changes_writer:
        user_changes_writer.write_json(user_changes, indent=2)
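For reference, the user changes file written above is a JSON list of small records. A sketch of plausible contents, based on the dicts appended to user_changes (the IDs are hypothetical, and the exact non-'OK' result strings depend on what _lookup_user returns):

[
    {"user_id": "12345", "change": "screen name found", "screen_name": "example_user"},
    {"user_id": "67890", "change": "screen name changed", "screen_name": "new_name"},
    {"user_id": "13579", "change": "not_found"}
]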
def filter(self):
    filter_config = self.config.get('filter')
    track = filter_config.get('track')
    follow = filter_config.get('follow')
    locations = filter_config.get('locations')
    assert track or follow or locations
    max_records = int(filter_config.get('max_records', 0))
    for count, tweet in enumerate(
            self.twarc.filter(track=track, follow=follow, locations=locations,
                              event=self.stop_event)):
        if not count % 1000:
            log.debug("Collected %s tweets", count)
        self.writer.write(tweet)
        if max_records and max_records - 1 == count:
            log.debug("Reached max records of %s", max_records)
            self.stop_event.set()
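A sketch of the 'filter' section this method expects in the collection config, inferred from the .get() calls above; the values are illustrative, and at least one of track, follow, or locations must be set:

config = {
    'filter': {
        'track': 'climate,#climatechange',  # terms passed through to twarc.filter
        'follow': '12345,67890',            # optional user IDs
        'locations': None,                  # optional bounding boxes
        'max_records': 10000                # 0 or absent means no cap
    }
}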
def search(self):
    assert 'query' in self.config.get('search', {})
    query = self.config['search']['query']
    since_id = self.config['search'].get('since_id')
    max_records = int(self.config['search'].get('max_records', 0))
    max_id = int(since_id) if since_id else 0
    for count, tweet in enumerate(self.twarc.search(q=query, since_id=since_id)):
        if not count % 1000:
            log.debug("Collected %s tweets", count)
        self.writer.write(tweet)
        max_id = max(max_id, tweet['id'])
        if self.stop_event.is_set():
            break
        if max_records and max_records - 1 == count:
            log.debug("Reached max records of %s", max_records)
            break
    # Set since_id on changeset
    self.changeset.update_search(max_id)
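Likewise, a sketch of the 'search' section, again inferred from the keys read above (values illustrative):

config = {
    'search': {
        'query': 'twarc OR "social feed manager"',  # required
        'since_id': '1050000000000000000',          # optional; only newer tweets are collected
        'max_records': 5000                         # optional cap
    }
}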
def run(self):
    try:
        log.debug('Starting file processor thread')
        while not self.stop_event.is_set() or not self.queue.empty():
            try:
                src_file = self.queue.get_nowait()
                if self.bucket:
                    self._move(src_file)
                else:
                    log.debug('Skipping moving %s since local', src_file.filepath)
                self.queue.task_done()
            except Empty:
                sleep(.5)
        log.debug('Ending file processor thread')
    # pylint: disable=broad-except
    except Exception as exception:
        self.exception = exception
def run(self):
    try:
        log.debug('Starting twarc thread')
        api_method_type = self.config.get('type')
        log.debug("API method type is %s", api_method_type)
        with TweetWriterThread(self.collections_path, self.config['id'], self.harvest_timestamp,
                               self.file_queue, self.harvest_info,
                               self.tweets_per_file) as self.writer:
            if api_method_type == 'user_timeline':
                self.user_timelines()
            elif api_method_type == 'filter':
                self.filter()
            elif api_method_type == 'search':
                self.search()
            else:
                raise KeyError('Unknown API method type: {}'.format(api_method_type))
        self.harvest_info.end()
        log.debug('Ending twarc thread')
    # pylint: disable=broad-except
    except Exception as exception:
        self.exception = exception
def lock(self):
    log.debug('Locking')
    lock = {'harvest_id': self.harvest_timestamp.isoformat()}
    with FileQueueingWriter(self.lock_filepath, self.file_queue) as lock_writer:
        lock_writer.write_json(lock, indent=2)
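The lock file itself is just the harvest timestamp; its contents look like this (timestamp illustrative):

{
    "harvest_id": "2019-03-01T12:00:00"
}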
def harvest(self):
    log.info('Starting harvester')
    # Sync
    if self.bucket:
        sync_collection_config(self.collections_path, self.collection_id, self.bucket)
    # Check if collection is locked
    assert_locked(
        get_lock_file(self.collection_id, collections_path=self.collections_path))
    # Start the server
    ServerThread(self.stop_event, self.stopped_event, self.shutdown_event, self.harvest_info,
                 self.port).start()
    # Start the monitor
    if self.monitor:
        MonitoringThread().start()
    # Load the collection config
    collection_config = self._load_collection_config()
    with S3FileMoverThread(self.file_queue, self.collections_path, self.bucket), CollectionLock(
            self.collections_path, self.collection_id, self.file_queue,
            harvest_timestamp=self.harvest_timestamp):
        # Write the collection config file to harvester
        self._write_harvest_collection_config(collection_config)
        # Start collecting
        twarc_thread = TwarcThread(collection_config, self.collections_path,
                                   self.harvest_timestamp, self.file_queue, self.changeset,
                                   self.stop_event, self.harvest_info, self.tweets_per_file)
        twarc_thread.start()
        # Wait for collection to stop
        twarc_thread.join()
        if twarc_thread.exception:
            raise twarc_thread.exception
        # Save harvester info
        with FileQueueingWriter(
                get_harvest_info_file(
                    self.collection_id, self.harvest_timestamp,
                    collections_path=self.collections_path),
                self.file_queue) as harvest_info_writer:
            harvest_info_writer.write_json(self.harvest_info.to_dict(), indent=2)
        if self.changeset.has_changes():
            # Sync again
            if self.bucket:
                sync_collection_config_file(self.collections_path, self.collection_id,
                                            self.bucket)
            latest_collection_config = self._load_collection_config()
            if latest_collection_config.get('timestamp', 1) != collection_config.get(
                    'timestamp', 2):
                # If it has changed, then delete any updates from the changeset for users that
                # no longer exist.
                log.debug('Cleaning changeset')
                self.changeset.clean_changeset(latest_collection_config)
            # Merge changes into latest config
            latest_collection_config.merge_changeset(self.changeset)
            # Write config
            with FileQueueingWriter(
                    get_collection_config_filepath(
                        self.collection_id, collections_path=self.collections_path),
                    self.file_queue) as changeset_writer:
                changeset_writer.write_json(latest_collection_config, indent=2)
            # Write changeset
            change_timestamp = dateutil.parser.parse(
                self.changeset['change_timestamp'])
            with FileQueueingWriter(
                    get_changeset_file(
                        self.collection_id, change_timestamp,
                        collections_path=self.collections_path),
                    self.file_queue) as changeset_writer:
                changeset_writer.write_json(self.changeset, indent=2)
    log.info('Harvesting stopped')
    # All done
    self.stopped_event.set()
    log.debug('Waiting to shut down')
    while not self.shutdown_event.is_set():
        sleep(.5)
    log.info('Shut down')