def mediacloud_search_action(cli_args):
    writer = csv.writer(cli_args.output)
    writer.writerow(MEDIACLOUD_STORIES_CSV_HEADER)

    client = MediacloudAPIClient(cli_args.token)

    kwargs = {
        'collections': cli_args.collections,
        'medias': cli_args.medias,
        'publish_day': cli_args.publish_day,
        'publish_month': cli_args.publish_month,
        'publish_year': cli_args.publish_year,
        'filter_query': cli_args.filter_query
    }

    loading_bar = LoadingBar(
        'Searching stories',
        unit='story',
        unit_plural='stories'
    )

    try:
        if not cli_args.skip_count:
            count = client.count(cli_args.query, **kwargs)
            loading_bar.update_total(count)

        iterator = client.search(cli_args.query, **kwargs)

        for story in iterator:
            writer.writerow(story.as_csv_row())
            loading_bar.update()

    except MediacloudServerError as e:
        loading_bar.die([
            'Aborted due to a mediacloud server error:',
            e.server_error
        ])

def extract_action(cli_args):
    if cli_args.glob is None and cli_args.input_dir is None:
        cli_args.input_dir = DEFAULT_CONTENT_FOLDER

    input_data = cli_args.report

    if cli_args.glob is not None:
        input_data = dummy_csv_file_from_glob(cli_args.glob, cli_args.input_dir)

    enricher = casanova.enricher(
        input_data,
        cli_args.output,
        keep=cli_args.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=cli_args.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row, i):
        loading_bar.update()
        loading_bar.print('Row n°{n} could not be processed: {reason}'.format(n=i + 1, reason=reason))
        enricher.writerow(row, format_error(reason))

    if (
        cli_args.glob is None and
        'raw_contents' not in enricher.headers and
        not isdir(cli_args.input_dir)
    ):
        loading_bar.die([
            'Could not find the "%s" directory!' % cli_args.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    files = create_report_iterator(
        cli_args,
        enricher,
        on_irrelevant_row=on_irrelevant_row
    )

    pool = LazyPool(cli_args.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-result'))
                continue

            enricher.writerow(row, result)

def mediacloud_search_action(namespace, output_file):
    writer = csv.writer(output_file)
    writer.writerow(MEDIACLOUD_STORIES_CSV_HEADER)

    client = MediacloudAPIClient(namespace.token)

    kwargs = {
        'collections': namespace.collections,
        'medias': namespace.medias,
        'publish_day': namespace.publish_day,
        'publish_month': namespace.publish_month,
        'publish_year': namespace.publish_year
    }

    loading_bar = LoadingBar(
        'Searching stories',
        unit='story',
        unit_plural='stories'
    )

    try:
        if not namespace.skip_count:
            count = client.count(namespace.query, **kwargs)
            loading_bar.update_total(count)

        iterator = client.search(
            namespace.query,
            format='csv_row',
            **kwargs
        )

        for story in iterator:
            writer.writerow(story)
            loading_bar.update()

    except MediacloudServerError as e:
        loading_bar.die([
            'Aborted due to a mediacloud server error:',
            e.server_error
        ])

def twitter_scrape_action(cli_args):
    scraper = TwitterAPIScraper()

    # Stats
    loading_bar = LoadingBar(
        'Collecting tweets',
        total=cli_args.limit,
        unit='tweet',
        stats={'tokens': 1, 'queries': 0}
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
        keep=cli_args.select
    )

    def before_sleep(retry_state):
        exc = retry_state.outcome.exception()

        if isinstance(exc, TwitterPublicAPIRateLimitError):
            loading_bar.inc('tokens')
        else:
            loading_bar.inc('failures')
            loading_bar.print(
                'Failed to call Twitter search. Will retry in %s' % prettyprint_seconds(retry_state.idle_for)
            )

    for row, query in enricher.cells(cli_args.query, with_rows=True):

        # Templating?
        if cli_args.query_template is not None:
            query = CUSTOM_FORMATTER.format(
                cli_args.query_template,
                value=query
            )

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        iterator = scraper.search(
            query,
            limit=cli_args.limit,
            before_sleep=before_sleep,
            include_referenced_tweets=cli_args.include_refs,
            with_meta=True
        )

        try:
            for tweet, meta in iterator:
                loading_bar.update()

                tweet_row = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, tweet_row + format_meta_row(meta))
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')

def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.report,
        output_file,
        keep=namespace.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=namespace.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row):
        loading_bar.update()
        enricher.writerow(row, format_error(reason))

    try:
        files = create_report_iterator(
            namespace,
            enricher,
            on_irrelevant_row=on_irrelevant_row
        )
    except NotADirectoryError:
        loading_bar.die([
            'Could not find the "%s" directory!' % namespace.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    pool = LazyPool(namespace.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-content'))
                continue

            enricher.writerow(row, result)

    loading_bar.close()
    output_file.close()

def mediacloud_medias_action(cli_args):
    added_headers = MEDIACLOUD_MEDIA_CSV_HEADER[1:]

    feeds_writer = None

    if cli_args.feeds:
        added_headers.append('feeds')
        feeds_writer = csv.writer(cli_args.feeds)
        feeds_writer.writerow(MEDIACLOUD_FEED_CSV_HEADER)

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=added_headers
    )

    loading_bar = LoadingBar(
        desc='Fetching medias',
        unit='media',
        total=cli_args.total
    )

    client = MediacloudAPIClient(cli_args.token)

    for row, media_id in enricher.cells(cli_args.column, with_rows=True):
        try:
            result = client.media(media_id)
            result = result.as_csv_row()[1:]

            if cli_args.feeds:
                feeds = client.feeds(media_id)

                enricher.writerow(row, result + [len(feeds)])

                for feed in feeds:
                    feeds_writer.writerow(feed.as_csv_row())
            else:
                enricher.writerow(row, result)
        except MediacloudServerError as e:
            loading_bar.die([
                'Aborted due to a mediacloud server error:',
                e.server_error
            ])

        loading_bar.update()

def twitter_scrape_action(cli_args):
    scraper = TwitterAPIScraper()

    # Stats
    loading_bar = LoadingBar(
        'Collecting tweets',
        total=cli_args.limit,
        unit='tweet',
        stats={'tokens': 1, 'queries': 0}
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
        keep=cli_args.select
    )

    for row, query in enricher.cells(cli_args.query, with_rows=True):

        # Templating?
        if cli_args.query_template is not None:
            query = CUSTOM_FORMATTER.format(
                cli_args.query_template,
                value=query
            )

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        iterator = scraper.search(
            query,
            limit=cli_args.limit,
            include_referenced_tweets=cli_args.include_refs,
            with_meta=True
        )

        try:
            for tweet, meta in iterator:
                loading_bar.update()

                tweet_row = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, tweet_row + format_meta_row(meta))
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')

def facebook_url_likes_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=REPORT_HEADERS,
        total=cli_args.total,
        prebuffer_bytes=DEFAULT_PREBUFFER_BYTES
    )

    if cli_args.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % cli_args.column
        ])

    loading_bar = LoadingBar(
        desc='Retrieving likes',
        unit='url',
        total=enricher.total
    )

    for row, url in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        url = url.strip()

        if not url or not is_url(url, require_protocol=False):
            enricher.writerow(row)
            continue

        err, html = make_request(url)

        if err is not None:
            loading_bar.die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)

def scrape_action(cli_args):

    # Parsing scraper definition
    try:
        scraper = Scraper(cli_args.scraper, strain=cli_args.strain)
    except DefinitionInvalidFormatError:
        die([
            'Unknown scraper format!',
            'It should be a JSON or YAML file.'
        ])
    except FileNotFoundError:
        die('Could not find scraper file!')
    except InvalidScraperError as error:
        print('Your scraper is invalid! You need to fix the following errors:', file=sys.stderr)
        print(file=sys.stderr)
        sys.stderr.write(report_scraper_validation_errors(error.validation_errors))
        die()
    except CSSSelectorTooComplex:
        die([
            'Your strainer\'s CSS selector %s is too complex.' % colored(cli_args.strain, 'blue'),
            'You cannot use relations to create a strainer.',
            'Try to simplify the selector you passed to --strain.'
        ])

    if cli_args.validate:
        print('Your scraper is valid.', file=sys.stderr)
        sys.exit(0)

    if scraper.headers is None and cli_args.format == 'csv':
        die([
            'Your scraper does not yield tabular data.',
            'Try changing it or setting --format to "jsonl".'
        ])

    loading_bar = LoadingBar(
        desc='Scraping pages',
        total=cli_args.total,
        unit='page'
    )

    worker_args = (cli_args.format, cli_args.separator)

    def on_irrelevant_row(reason, row):
        loading_bar.update()

    if cli_args.glob is not None:
        files = create_glob_iterator(cli_args, worker_args)
    else:
        reader = casanova.reader(cli_args.report)

        try:
            files = create_report_iterator(
                cli_args,
                reader,
                worker_args=worker_args,
                on_irrelevant_row=on_irrelevant_row
            )
        except NotADirectoryError:
            loading_bar.die([
                'Could not find the "%s" directory!' % cli_args.input_dir,
                'Did you forget to specify it with -i/--input-dir?'
            ])

    if cli_args.format == 'csv':
        output_writer = csv.DictWriter(cli_args.output, fieldnames=scraper.headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(cli_args.output)

    pool = LazyPool(
        cli_args.processes,
        initializer=init_process,
        initargs=(scraper.definition, cli_args.strain)
    )

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                if isinstance(error, (ScraperEvalError, ScraperEvalTypeError, ScraperEvalNoneError)):
                    loading_bar.print(report_scraper_evaluation_error(error), end='')
                loading_bar.inc('errors')
                continue

            for item in items:
                output_writer.writerow(item)

def action(cli_args):
    resume = getattr(cli_args, 'resume', False)

    # Validation
    if resume:
        if cli_args.sort_by != 'date':
            die('Cannot --resume if --sort_by is not `date`.')

        if cli_args.format != 'csv':
            die('Cannot --resume jsonl format yet.')

    if cli_args.format == 'csv':
        fieldnames = csv_headers(cli_args) if callable(csv_headers) else csv_headers
        writer = casanova.writer(cli_args.output, fieldnames)
    else:
        writer = ndjson.writer(cli_args.output)

    # Acquiring state from resumer
    if getattr(cli_args, 'resume', False):
        last_date = cli_args.output.pop_state()

        if last_date is not None:
            cli_args.end_date = last_date.replace(' ', 'T')
            print_err('Resuming from: %s' % cli_args.end_date)

    if callable(announce):
        print_err(announce(cli_args))

    # Loading bar
    loading_bar = LoadingBar(
        desc='Fetching %s' % item_name,
        unit=item_name[:-1],
        total=cli_args.limit
    )

    args = []

    if callable(get_args):
        args = get_args(cli_args)

    client = CrowdTangleAPIClient(cli_args.token, rate_limit=cli_args.rate_limit)

    create_iterator = getattr(client, method_name)
    iterator = create_iterator(
        *args,
        limit=cli_args.limit,
        raw=cli_args.format != 'csv',
        per_call=True,
        detailed=True,
        namespace=cli_args
    )

    try:
        for details, items in iterator:
            loading_bar.update(len(items))

            if details is not None:
                loading_bar.update_stats(**details)

            for item in items:
                if cli_args.format == 'csv':
                    item = item.as_csv_row()

                writer.writerow(item)
    except CrowdTangleInvalidTokenError:
        loading_bar.die([
            'Your API token is invalid.',
            'Check that you indicated a valid one using the `--token` argument.'
        ])

def action(cli_args):
    resume = getattr(cli_args, 'resume', False)

    # Validation
    if resume:
        if cli_args.sort_by != 'date':
            die('Cannot --resume if --sort_by is not `date`.')

        if cli_args.format != 'csv':
            die('Cannot --resume jsonl format yet.')

    if cli_args.format == 'csv':
        fieldnames = csv_headers(cli_args) if callable(csv_headers) else csv_headers
        writer = casanova.writer(cli_args.output, fieldnames)
    else:
        writer = ndjson.writer(cli_args.output)

    # Acquiring state from resumer
    if getattr(cli_args, 'resume', False):
        last_date = cli_args.output.pop_state()

        if last_date is not None:
            cli_args.end_date = last_date.replace(' ', 'T')
            print_err('Resuming from: %s' % cli_args.end_date)

    if callable(announce):
        print_err(announce(cli_args))

    # Loading bar
    loading_bar = LoadingBar(
        desc='Fetching %s' % item_name,
        unit=item_name[:-1],
        total=cli_args.limit
    )

    client = CrowdTangleAPIClient(cli_args.token, rate_limit=cli_args.rate_limit)

    args = []

    if callable(get_args):
        args = get_args(cli_args)

    def before_sleep(retry_state):
        exc = retry_state.outcome.exception()

        if isinstance(exc, CrowdTangleRateLimitExceeded):
            reason = 'Call failed because of rate limit!'
        elif isinstance(exc, CrowdTangleInvalidJSONError):
            reason = 'Call failed because of invalid JSON payload!'
        else:
            reason = 'Call failed because of server timeout!'

        loading_bar.print(
            '%s\nWill wait for %s before attempting again.' % (
                reason,
                prettyprint_seconds(retry_state.idle_for, granularity=2)
            )
        )

    create_iterator = getattr(client, method_name)
    iterator = create_iterator(
        *args,
        limit=cli_args.limit,
        raw=cli_args.format != 'csv',
        per_call=True,
        detailed=True,
        namespace=cli_args,
        before_sleep=before_sleep
    )

    try:
        for details, items in iterator:
            loading_bar.update(len(items))

            if details is not None:
                loading_bar.update_stats(**details)

            for item in items:
                if cli_args.format == 'csv':
                    item = item.as_csv_row()

                writer.writerow(item)
    except CrowdTangleInvalidTokenError:
        loading_bar.die([
            'Your API token is invalid.',
            'Check that you indicated a valid one using the `--token` argument.'
        ])