Example #1
File: extract.py Project: medialab/minet
def extract_action(cli_args):
    if cli_args.glob is None and cli_args.input_dir is None:
        cli_args.input_dir = DEFAULT_CONTENT_FOLDER

    input_data = cli_args.report

    if cli_args.glob is not None:
        input_data = dummy_csv_file_from_glob(cli_args.glob, cli_args.input_dir)

    enricher = casanova.enricher(
        input_data,
        cli_args.output,
        keep=cli_args.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=cli_args.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row, i):
        loading_bar.update()
        loading_bar.print('Row n°{n} could not be processed: {reason}'.format(n=i + 1, reason=reason))
        enricher.writerow(row, format_error(reason))

    if (
        cli_args.glob is None and
        'raw_contents' not in enricher.headers and
        not isdir(cli_args.input_dir)
    ):
        loading_bar.die([
            'Could not find the "%s" directory!' % cli_args.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    files = create_report_iterator(
        cli_args,
        enricher,
        on_irrelevant_row=on_irrelevant_row
    )

    pool = LazyPool(cli_args.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-result'))
                continue

            enricher.writerow(row, result)
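
Every variant in this set follows the same enrichment pattern: iterate over the rows of a CSV report, hand each row to a multiprocessing pool, and write the row back out with extra columns appended. A minimal, self-contained sketch of that pattern with casanova (the worker, the file names and the added column are illustrative placeholders, not minet's actual code):

import casanova
from multiprocessing import Pool

# Hypothetical worker: it must live at module level so it can be pickled.
# The row is passed through because imap_unordered yields results in
# completion order, not input order.
def work(row):
    return row, len(row[0])

if __name__ == '__main__':
    with open('report.csv') as f, open('enriched.csv', 'w', newline='') as of:
        # The enricher mirrors rows from f to of, appending the
        # columns declared in `add`.
        enricher = casanova.enricher(f, of, add=['length'])

        with Pool(4) as pool:
            for row, result in pool.imap_unordered(work, enricher):
                enricher.writerow(row, [result])
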
Example #2
def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(namespace.report,
                                 output_file,
                                 keep=namespace.select,
                                 add=OUTPUT_ADDITIONAL_HEADERS)

    loading_bar = tqdm(desc='Extracting content',
                       total=namespace.total,
                       dynamic_ncols=True,
                       unit=' docs')

    files = create_report_iterator(namespace,
                                   enricher,
                                   loading_bar=loading_bar)

    with Pool(namespace.processes) as pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, [report_error(error)] + PADDING)
                continue

            if result is None:
                enricher.writerow(row, ['no-content'] + PADDING)
                continue

            enricher.writerow(row, result)

    output_file.close()
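
This earlier version manages by hand everything the later variants delegate: the output file is opened and closed explicitly, the progress bar is a raw tqdm, and error rows are padded manually with [report_error(error)] + PADDING where the newer code uses a format_error() helper.
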
Example #3
def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.report,
        output_file,
        keep=namespace.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=namespace.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row):
        loading_bar.update()
        enricher.writerow(row, format_error(reason))

    try:
        files = create_report_iterator(
            namespace,
            enricher,
            on_irrelevant_row=on_irrelevant_row
        )
    except NotADirectoryError:
        loading_bar.die([
            'Could not find the "%s" directory!' % namespace.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    pool = LazyPool(namespace.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-content'))
                continue

            enricher.writerow(row, result)

    loading_bar.close()
    output_file.close()
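
LoadingBar is minet's progress-bar wrapper; the calls visible across these examples (update, print, die, update_stats, inc, close) suggest a thin layer over tqdm. A rough sketch of just that subset, for illustration only, not minet's actual implementation:

import sys
from tqdm import tqdm

class LoadingBar(tqdm):
    def __init__(self, desc, total=None, unit='item'):
        super().__init__(desc=desc, total=total, unit=' ' + unit,
                         dynamic_ncols=True)
        self._stats = {}

    def print(self, msg, end='\n'):
        tqdm.write(str(msg), end=end)  # prints above the bar without breaking it

    def die(self, msg):
        # Close the bar, report the error(s) and exit with a failure code.
        self.close()
        for line in [msg] if isinstance(msg, str) else msg:
            print(line, file=sys.stderr)
        sys.exit(1)

    def update_stats(self, **stats):
        self._stats.update(stats)
        self.set_postfix(**self._stats)

    def inc(self, name):
        self._stats[name] = self._stats.get(name, 0) + 1
        self.set_postfix(**self._stats)
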
Example #4
File: scrape.py Project: rangsutu88/minet
def scrape_action(namespace):

    output_file = open_output_file(namespace.output)

    # Parsing scraper definition
    try:
        scraper = load_definition(namespace.scraper)
    except TypeError:
        die(['Unknown scraper format.', 'Expecting a JSON or YAML file.'])
    except Exception:
        die('Invalid scraper file.')

    if namespace.format == 'csv':
        output_headers = headers_from_definition(scraper)
        output_writer = csv.DictWriter(output_file, fieldnames=output_headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(output_file)

    loading_bar = tqdm(desc='Scraping pages',
                       total=namespace.total,
                       dynamic_ncols=True,
                       unit=' pages')

    loading_bar.set_postfix(p=namespace.processes)

    if namespace.glob is not None:
        files = create_glob_iterator(namespace, scraper)
    else:
        reader = casanova.reader(namespace.report)
        files = create_report_iterator(namespace, reader, scraper, loading_bar)

    with Pool(namespace.processes) as pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if not isinstance(items, list):
                items = [items]

            for item in items:
                if not isinstance(item, dict):
                    item = {'value': item}

                output_writer.writerow(item)

    output_file.close()
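
The scrape actions hide two output formats behind a single writerow() interface: csv.DictWriter for tabular data and ndjson.writer for newline-delimited JSON. A small sketch of that dispatch (the helper name is illustrative):

import csv
import ndjson  # newline-delimited JSON (pip install ndjson)

def make_writer(fp, fmt, fieldnames=None):
    # CSV needs a fixed header written up front; JSON Lines accepts
    # arbitrary dicts, one per line.
    if fmt == 'csv':
        writer = csv.DictWriter(fp, fieldnames=fieldnames)
        writer.writeheader()
        return writer
    return ndjson.writer(fp)

Both writers expose writerow(), which is why the scraping loop itself can stay format-agnostic.
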
Example #5
def extract_action(namespace):
    input_headers, pos, reader = custom_reader(
        namespace.report, ('status', 'filename', 'encoding'))

    selected_fields = namespace.select.split(',') if namespace.select else None
    selected_pos = [input_headers.index(h)
                    for h in selected_fields] if selected_fields else None

    output_headers = (list(input_headers) if not selected_pos else
                      [input_headers[i] for i in selected_pos])
    output_headers += OUTPUT_ADDITIONAL_HEADERS

    output_file = open_output_file(namespace.output)

    output_writer = csv.writer(output_file)
    output_writer.writerow(output_headers)

    loading_bar = tqdm(desc='Extracting content',
                       total=namespace.total,
                       dynamic_ncols=True,
                       unit=' docs')

    # Reopen the report so its rows can be read again from the start
    namespace.report.close()
    namespace.report = open(namespace.report.name)
    files = create_report_iterator(namespace, loading_bar=loading_bar)

    with Pool(namespace.processes) as pool:
        for error, line, content in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                message = report_error(error)
                line.extend([message, ''])
                output_writer.writerow(line)
                continue

            line.extend(['', content])
            output_writer.writerow(line)

    output_file.close()
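
This is the oldest variant: column selection and the output header are assembled by hand, and the report file is closed and reopened so its rows can be read a second time. The later versions delegate that bookkeeping to casanova.enricher through its keep/select and add arguments.
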
Example #6
File: scrape.py Project: zanachka/minet
def scrape_action(cli_args):

    # Parsing scraper definition
    try:
        scraper = Scraper(cli_args.scraper, strain=cli_args.strain)
    except DefinitionInvalidFormatError:
        die(['Unknown scraper format!', 'It should be a JSON or YAML file.'])
    except FileNotFoundError:
        die('Could not find scraper file!')
    except InvalidScraperError as error:
        print('Your scraper is invalid! You need to fix the following errors:',
              file=sys.stderr)
        print(file=sys.stderr)
        sys.stderr.write(
            report_scraper_validation_errors(error.validation_errors))
        die()
    except CSSSelectorTooComplex:
        die([
            'Your strainer\'s CSS selector %s is too complex.' %
            colored(cli_args.strain, 'blue'),
            'You cannot use relations to create a strainer.',
            'Try to simplify the selector you passed to --strain.'
        ])

    if cli_args.validate:
        print('Your scraper is valid.', file=sys.stderr)
        sys.exit(0)

    if scraper.headers is None and cli_args.format == 'csv':
        die([
            'Your scraper does not yield tabular data.',
            'Try changing it or setting --format to "jsonl".'
        ])

    loading_bar = LoadingBar(desc='Scraping pages',
                             total=cli_args.total,
                             unit='page')

    worker_args = (cli_args.format, cli_args.separator)

    def on_irrelevant_row(reason, row):
        loading_bar.update()

    if cli_args.glob is not None:
        files = create_glob_iterator(cli_args, worker_args)
    else:
        reader = casanova.reader(cli_args.report)

        try:
            files = create_report_iterator(cli_args,
                                           reader,
                                           worker_args=worker_args,
                                           on_irrelevant_row=on_irrelevant_row)
        except NotADirectoryError:
            loading_bar.die([
                'Could not find the "%s" directory!' % cli_args.input_dir,
                'Did you forget to specify it with -i/--input-dir?'
            ])

    if cli_args.format == 'csv':
        output_writer = csv.DictWriter(cli_args.output,
                                       fieldnames=scraper.headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(cli_args.output)

    pool = LazyPool(cli_args.processes,
                    initializer=init_process,
                    initargs=(scraper.definition, cli_args.strain))

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                if isinstance(error, (ScraperEvalError, ScraperEvalTypeError,
                                      ScraperEvalNoneError)):
                    loading_bar.print(report_scraper_evaluation_error(error),
                                      end='')
                loading_bar.inc('errors')
                continue

            for item in items:
                output_writer.writerow(item)
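
Unlike the earlier versions, this one ships the scraper to the workers through Pool's initializer/initargs: the definition is sent once per process and stored in a module-level global, instead of being pickled along with every task. A minimal sketch of the idiom (init_process, SCRAPER and the worker body are illustrative stand-ins, not minet's internals):

from multiprocessing import Pool

SCRAPER = None  # per-process global, populated once by the initializer

def init_process(definition):
    # Runs once in each worker process, so individual tasks do not
    # have to unpickle or re-parse the scraper definition.
    global SCRAPER
    SCRAPER = definition

def worker(page):
    # Stand-in for applying the compiled scraper to a page.
    return (SCRAPER, page)

if __name__ == '__main__':
    with Pool(2, initializer=init_process, initargs=('definition',)) as pool:
        for result in pool.imap_unordered(worker, ['page-1', 'page-2']):
            print(result)
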