def extract_action(cli_args):
    # Default to the standard content folder when neither a glob pattern nor
    # an input directory was given.
    if cli_args.glob is None and cli_args.input_dir is None:
        cli_args.input_dir = DEFAULT_CONTENT_FOLDER

    input_data = cli_args.report

    # When given a glob pattern, build a dummy CSV input from the matching files.
    if cli_args.glob is not None:
        input_data = dummy_csv_file_from_glob(cli_args.glob, cli_args.input_dir)

    enricher = casanova.enricher(
        input_data,
        cli_args.output,
        keep=cli_args.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=cli_args.total,
        unit='doc'
    )

    # Callback used to report rows that cannot be processed.
    def on_irrelevant_row(reason, row, i):
        loading_bar.update()
        loading_bar.print('Row n°{n} could not be processed: {reason}'.format(n=i + 1, reason=reason))
        enricher.writerow(row, format_error(reason))

    # Bail out early if the input directory is actually needed but cannot be found.
    if (
        cli_args.glob is None and
        'raw_contents' not in enricher.headers and
        not isdir(cli_args.input_dir)
    ):
        loading_bar.die([
            'Could not find the "%s" directory!' % cli_args.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    files = create_report_iterator(
        cli_args,
        enricher,
        on_irrelevant_row=on_irrelevant_row
    )

    pool = LazyPool(cli_args.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-result'))
                continue

            enricher.writerow(row, result)

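# The `format_error` helper used above is imported from elsewhere and is not
# shown in this file. A minimal sketch, assuming it simply maps an error label
# to an addendum row matching OUTPUT_ADDITIONAL_HEADERS (error label in the
# first added column, remaining columns left blank). The name
# `format_error_sketch` is hypothetical and only meant as an illustration:

def format_error_sketch(error):
    # One cell per added header: the error label, then empty padding cells.
    return [error] + [''] * (len(OUTPUT_ADDITIONAL_HEADERS) - 1)
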
def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.report,
        output_file,
        keep=namespace.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = tqdm(
        desc='Extracting content',
        total=namespace.total,
        dynamic_ncols=True,
        unit=' docs'
    )

    files = create_report_iterator(namespace, enricher, loading_bar=loading_bar)

    with Pool(namespace.processes) as pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, [report_error(error)] + PADDING)
                continue

            if result is None:
                enricher.writerow(row, ['no-content'] + PADDING)
                continue

            enricher.writerow(row, result)

    output_file.close()

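# Assumption: `PADDING` used in the error branches above is a module-level
# constant of empty strings filling the added columns that carry no value on
# error rows. A plausible, purely illustrative definition would be:

PADDING_SKETCH = [''] * (len(OUTPUT_ADDITIONAL_HEADERS) - 1)
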
def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.report,
        output_file,
        keep=namespace.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=namespace.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row):
        loading_bar.update()
        enricher.writerow(row, format_error(reason))

    try:
        files = create_report_iterator(
            namespace,
            enricher,
            on_irrelevant_row=on_irrelevant_row
        )
    except NotADirectoryError:
        loading_bar.die([
            'Could not find the "%s" directory!' % namespace.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    pool = LazyPool(namespace.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-content'))
                continue

            enricher.writerow(row, result)

    loading_bar.close()
    output_file.close()

def scrape_action(namespace):
    output_file = open_output_file(namespace.output)

    # Parsing scraper definition
    try:
        scraper = load_definition(namespace.scraper)
    except TypeError:
        die(['Unknown scraper format.', 'Expecting a JSON or YAML file.'])
    except:
        die('Invalid scraper file.')

    if namespace.format == 'csv':
        output_headers = headers_from_definition(scraper)
        output_writer = csv.DictWriter(output_file, fieldnames=output_headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(output_file)

    loading_bar = tqdm(
        desc='Scraping pages',
        total=namespace.total,
        dynamic_ncols=True,
        unit=' pages'
    )

    loading_bar.set_postfix(p=namespace.processes)

    if namespace.glob is not None:
        files = create_glob_iterator(namespace, scraper)
    else:
        reader = casanova.reader(namespace.report)
        files = create_report_iterator(namespace, reader, scraper, loading_bar)

    with Pool(namespace.processes) as pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if not isinstance(items, list):
                items = [items]

            for item in items:
                if not isinstance(item, dict):
                    item = {'value': item}

                output_writer.writerow(item)

    output_file.close()

def extract_action(namespace):
    input_headers, pos, reader = custom_reader(
        namespace.report,
        ('status', 'filename', 'encoding')
    )

    selected_fields = namespace.select.split(',') if namespace.select else None
    selected_pos = [input_headers.index(h) for h in selected_fields] if selected_fields else None

    output_headers = (list(input_headers) if not selected_pos else [input_headers[i] for i in selected_pos])
    output_headers += OUTPUT_ADDITIONAL_HEADERS

    output_file = open_output_file(namespace.output)
    output_writer = csv.writer(output_file)
    output_writer.writerow(output_headers)

    loading_bar = tqdm(
        desc='Extracting content',
        total=namespace.total,
        dynamic_ncols=True,
        unit=' docs'
    )

    # Re-open the report so it can be read again from the beginning by the iterator.
    namespace.report.close()
    namespace.report = open(namespace.report.name)

    files = create_report_iterator(namespace, loading_bar=loading_bar)

    with Pool(namespace.processes) as pool:
        for error, line, content in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                message = report_error(error)
                line.extend([message, ''])
                output_writer.writerow(line)
                continue

            line.extend(['', content])
            output_writer.writerow(line)

    output_file.close()

def scrape_action(cli_args):

    # Parsing scraper definition
    try:
        scraper = Scraper(cli_args.scraper, strain=cli_args.strain)
    except DefinitionInvalidFormatError:
        die([
            'Unknown scraper format!',
            'It should be a JSON or YAML file.'
        ])
    except FileNotFoundError:
        die('Could not find scraper file!')
    except InvalidScraperError as error:
        print('Your scraper is invalid! You need to fix the following errors:', file=sys.stderr)
        print(file=sys.stderr)
        sys.stderr.write(report_scraper_validation_errors(error.validation_errors))
        die()
    except CSSSelectorTooComplex:
        die([
            'Your strainer\'s CSS selector %s is too complex.' % colored(cli_args.strain, 'blue'),
            'You cannot use relations to create a strainer.',
            'Try to simplify the selector you passed to --strain.'
        ])

    if cli_args.validate:
        print('Your scraper is valid.', file=sys.stderr)
        sys.exit(0)

    if scraper.headers is None and cli_args.format == 'csv':
        die([
            'Your scraper does not yield tabular data.',
            'Try changing it or setting --format to "jsonl".'
        ])

    loading_bar = LoadingBar(
        desc='Scraping pages',
        total=cli_args.total,
        unit='page'
    )

    worker_args = (cli_args.format, cli_args.separator)

    def on_irrelevant_row(reason, row):
        loading_bar.update()

    # Building the file iterator, either from a glob pattern or from a fetch report.
    if cli_args.glob is not None:
        files = create_glob_iterator(cli_args, worker_args)
    else:
        reader = casanova.reader(cli_args.report)

        try:
            files = create_report_iterator(
                cli_args,
                reader,
                worker_args=worker_args,
                on_irrelevant_row=on_irrelevant_row
            )
        except NotADirectoryError:
            loading_bar.die([
                'Could not find the "%s" directory!' % cli_args.input_dir,
                'Did you forget to specify it with -i/--input-dir?'
            ])

    if cli_args.format == 'csv':
        output_writer = csv.DictWriter(cli_args.output, fieldnames=scraper.headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(cli_args.output)

    # Worker processes are initialized with the scraper definition and the strainer selector.
    pool = LazyPool(
        cli_args.processes,
        initializer=init_process,
        initargs=(scraper.definition, cli_args.strain)
    )

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                if isinstance(error, (ScraperEvalError, ScraperEvalTypeError, ScraperEvalNoneError)):
                    loading_bar.print(report_scraper_evaluation_error(error), end='')

                loading_bar.inc('errors')
                continue

            for item in items:
                output_writer.writerow(item)

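# `init_process` and `worker` are defined elsewhere in the module. The usual
# multiprocessing pattern implied by `initializer=init_process,
# initargs=(scraper.definition, cli_args.strain)` is to rebuild the scraper
# once per worker process and keep it in a process-local global, so it does
# not have to be pickled for every task. A hedged sketch (names are
# hypothetical, not the actual implementation):

SCRAPER_SKETCH = None

def init_process_sketch(definition, strain):
    # Runs once in every pool process: rebuild the scraper from its raw definition.
    global SCRAPER_SKETCH
    SCRAPER_SKETCH = Scraper(definition, strain=strain)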