예제 #1
0
def fetch_action(namespace):

    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    if namespace.file is sys.stdin and is_url(namespace.column):
        namespace.file = StringIO('url\n%s' % namespace.column)
        namespace.column = 'url'

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    input_headers, pos, reader = custom_reader(namespace.file,
                                               namespace.column)
    filename_pos = input_headers.index(
        namespace.filename) if namespace.filename else None
    indexed_input_headers = {h: p for p, h in enumerate(input_headers)}

    selected_fields = namespace.select.split(',') if namespace.select else None
    selected_pos = [input_headers.index(h)
                    for h in selected_fields] if selected_fields else None

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers = v

    # Reading output
    output_headers = (list(input_headers) if not selected_pos else
                      [input_headers[i] for i in selected_pos])
    output_headers += OUTPUT_ADDITIONAL_HEADERS

    if namespace.contents_in_report:
        output_headers.append('raw_content')

    flag = 'w'

    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    output_writer = csv.writer(output_file)

    if not resuming:
        output_writer.writerow(output_headers)
    else:

        # Reading report to know what need to be done
        _, rpos, resuming_reader = custom_reader(output_file, 'line')

        resuming_reader_loading = tqdm(resuming_reader,
                                       desc='Resuming',
                                       dynamic_ncols=True,
                                       unit=' lines')

        already_done = ContiguousRangeSet()

        for line in resuming_reader_loading:
            index = line[rpos]

            already_done.add(int(index))

    # Loading bar
    total = namespace.total

    if total is not None and resuming:
        total -= len(already_done)

    loading_bar = tqdm(desc='Fetching pages',
                       total=total,
                       dynamic_ncols=True,
                       unit=' urls')

    def url_key(item):
        line = item[1]
        url = line[pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index,
                     line,
                     resolved=None,
                     status=None,
                     error=None,
                     filename=None,
                     encoding=None,
                     data=None):

        if selected_pos:
            line = [line[p] for p in selected_pos]

        line.extend([
            index, resolved or '', status or '', error or '', filename or '',
            encoding or ''
        ])

        if namespace.contents_in_report:
            line.append(data or '')

        output_writer.writerow(line)

    errors = 0
    status_codes = Counter()

    target_iterator = enumerate(reader)

    if resuming:
        target_iterator = (pair for pair in target_iterator
                           if not already_done.stateful_contains(pair[0]))

    multithreaded_iterator = multithreaded_fetch(target_iterator,
                                                 key=url_key,
                                                 request_args=request_args,
                                                 threads=namespace.threads,
                                                 throttle=namespace.throttle)

    for result in multithreaded_iterator:
        line_index, line = result.item

        if not result.url:

            write_output(line_index, line)

            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=line[filename_pos]
                            if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, line))
                    else:
                        filename = line[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and namespace.standardize_encoding or namespace.contents_in_report:
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(
                        encoding if encoding is not None else 'utf-8',
                        errors='replace')
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(
                        gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                line_index,
                line,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data)

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(line_index, line, error=error_code)

    # Closing files
    if namespace.output is not None:
        output_file.close()
예제 #2
0
def fetch_action(cli_args, resolve=False, defer=None):

    # If we are hitting a single url we enable contents_in_report by default
    if not resolve and isinstance(cli_args.file, StringIO) and cli_args.contents_in_report is None:
        cli_args.contents_in_report = True

    if not resolve and cli_args.contents_in_report and cli_args.compress:
        raise InvalidArgumentsError('Cannot both --compress and output --contents-in-report!')

    # HTTP method
    http_method = cli_args.method

    # Cookie grabber
    get_cookie = None
    if cli_args.grab_cookies:
        get_cookie = grab_cookies(cli_args.grab_cookies)

    # Global headers
    global_headers = None
    if cli_args.headers:
        global_headers = {}

        for header in cli_args.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    # Resume listener
    skipped_rows = 0
    resuming_reader_loading = None

    if cli_args.resume and cli_args.output.can_resume():
        resuming_reader_loading = LoadingBar(
            desc='Resuming',
            unit='line'
        )

        def output_read_listener(event, row):
            nonlocal skipped_rows

            if event != 'output.row':
                return

            skipped_rows += 1
            resuming_reader_loading.update()

        cli_args.output.listener = output_read_listener

    if resolve:
        additional_headers = RESOLVE_ADDITIONAL_HEADERS
    else:
        additional_headers = FETCH_ADDITIONAL_HEADERS

        if cli_args.contents_in_report:
            additional_headers = additional_headers + ['raw_contents']

    # Enricher
    multiplex = None

    if cli_args.separator is not None:
        multiplex = (cli_args.column, cli_args.separator)

    enricher = casanova.threadsafe_enricher(
        cli_args.file,
        cli_args.output,
        add=additional_headers,
        keep=cli_args.select,
        total=cli_args.total,
        prebuffer_bytes=DEFAULT_PREBUFFER_BYTES,
        multiplex=multiplex
    )

    if resuming_reader_loading is not None:
        resuming_reader_loading.close()

    if cli_args.column not in enricher.headers:
        raise InvalidArgumentsError('Could not find the "%s" column containing the urls in the given CSV file.' % cli_args.column)

    url_pos = enricher.headers[cli_args.column]

    filename_pos = None

    if not resolve and cli_args.filename is not None:
        if cli_args.filename not in enricher.headers:
            raise InvalidArgumentsError('Could not find the "%s" column containing the filenames in the given CSV file.' % cli_args.filename)

        filename_pos = enricher.headers[cli_args.filename]

    # Loading bar
    loading_bar = LoadingBar(
        desc='Fetching pages',
        total=enricher.total,
        unit='url',
        initial=skipped_rows
    )
    defer(loading_bar.close)  # NOTE: it could be dangerous with multithreaded execution, not to close it ourselves

    def update_loading_bar(result):
        nonlocal errors

        if result.error is not None:
            errors += 1
        else:
            if resolve:
                status = result.stack[-1].status
            else:
                status = result.response.status

            if status >= 400:
                status_codes[status] += 1

        stats = {'errors': errors}

        for code, count in status_codes.most_common(1):
            stats[str(code)] = count

        loading_bar.update_stats(**stats)
        loading_bar.update()

    only_shortened = getattr(cli_args, 'only_shortened', False)

    def url_key(item):
        url = item[1][url_pos].strip()

        if not url:
            return

        if only_shortened and not is_shortened_url(url):
            return

        # Url templating
        if cli_args.url_template:
            return cli_args.url_template.format(value=url)

        return url

    def request_args(domain, url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {
            'method': http_method,
            'cookie': cookie,
            'headers': headers
        }

    # Worker callback internals
    filename_builder = None
    files_writer = None

    if not resolve:
        try:
            filename_builder = FilenameBuilder(
                folder_strategy=cli_args.folder_strategy,
                template=cli_args.filename_template
            )
        except TypeError:
            die([
                'Invalid "%s" --folder-strategy!' % cli_args.folder_strategy,
                'Check the list at the end of the command help:',
                '  $ minet fetch -h'
            ])

        files_writer = ThreadSafeFilesWriter(cli_args.output_dir)

    def worker_callback(result):
        # NOTE: at this point the callback is only fired on success
        row = result.item[1]
        response = result.response
        meta = result.meta

        if cli_args.keep_failed_contents and response.status != 200:
            return

        # First we need to build a filename
        filename_cell = row[filename_pos] if filename_pos else None

        formatter_kwargs = {}

        if cli_args.filename_template and 'line' in cli_args.filename_template:
            formatter_kwargs['line'] = enricher.wrap(row)

        try:
            filename = filename_builder(
                result.resolved,
                filename=filename_cell,
                ext=meta.get('ext'),
                formatter_kwargs=formatter_kwargs,
                compressed=cli_args.compress
            )
        except FilenameFormattingError as e:
            result.error = e
            return

        meta['filename'] = filename

        # Decoding the response data?
        is_text = meta.get('is_text', False)
        original_encoding = meta.get('encoding', 'utf-8')

        data = response.data
        binary = True

        if is_text and (cli_args.standardize_encoding or cli_args.contents_in_report):
            data = data.decode(original_encoding, errors='replace')
            binary = False

            if cli_args.contents_in_report:
                meta['decoded_contents'] = data

        # Writing the file?
        # TODO: specify what should happen when contents are empty (e.g. POST queries)
        if data and not cli_args.contents_in_report:
            files_writer.write(
                filename,
                data,
                binary=binary,
                compress=cli_args.compress
            )

    def write_fetch_output(index, row, resolved=None, status=None, error=None,
                           filename=None, encoding=None, mimetype=None, data=None):

        addendum = [
            resolved or '',
            status or '',
            error or '',
            filename or '',
            mimetype or '',
            encoding or ''
        ]

        if cli_args.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    def write_resolve_output(index, row, resolved=None, status=None, error=None,
                             redirects=None, chain=None):
        addendum = [
            resolved or '',
            status or '',
            error or '',
            redirects or '',
            chain or ''
        ]

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    common_kwargs = {
        'key': url_key,
        'insecure': cli_args.insecure,
        'threads': cli_args.threads,
        'throttle': cli_args.throttle,
        'domain_parallelism': cli_args.domain_parallelism,
        'max_redirects': cli_args.max_redirects,
        'wait': False,
        'daemonic': True
    }

    if cli_args.timeout is not None:
        common_kwargs['timeout'] = cli_args.timeout

    # Normal fetch
    if not resolve:

        multithreaded_iterator = multithreaded_fetch(
            enricher,
            request_args=request_args,
            callback=worker_callback,
            **common_kwargs
        )

        for result in multithreaded_iterator:
            index, row = result.item

            if not result.url:

                write_fetch_output(
                    index,
                    row
                )

                loading_bar.update()
                continue

            # Updating stats
            update_loading_bar(result)

            # No error
            if result.error is None:
                meta = result.meta

                # Final url target
                resolved_url = result.resolved

                if resolved_url == result.url:
                    resolved_url = None

                # Reporting in output
                write_fetch_output(
                    index,
                    row,
                    resolved=resolved_url,
                    status=result.response.status,
                    filename=meta.get('filename'),
                    encoding=meta.get('encoding'),
                    mimetype=meta.get('mimetype'),
                    data=meta.get('decoded_contents')
                )

            # Handling potential errors
            else:
                error_code = report_error(result.error)

                resolved = None

                if isinstance(result.error, InvalidURLError):
                    resolved = result.error.url

                if isinstance(result.error, FilenameFormattingError):
                    loading_bar.print(report_filename_formatting_error(result.error))

                write_fetch_output(
                    index,
                    row,
                    error=error_code,
                    resolved=resolved
                )

    # Resolve
    else:

        multithreaded_iterator = multithreaded_resolve(
            enricher,
            resolve_args=request_args,
            follow_meta_refresh=cli_args.follow_meta_refresh,
            follow_js_relocation=cli_args.follow_js_relocation,
            infer_redirection=cli_args.infer_redirection,
            **common_kwargs
        )

        for result in multithreaded_iterator:
            index, row = result.item

            if not result.url:

                write_resolve_output(
                    index,
                    row
                )

                loading_bar.update()
                continue

            # Updating stats
            update_loading_bar(result)

            # No error
            if result.error is None:

                # Reporting in output
                last = result.stack[-1]

                write_resolve_output(
                    index,
                    row,
                    resolved=last.url,
                    status=last.status,
                    redirects=len(result.stack) - 1,
                    chain='|'.join(step.type for step in result.stack)
                )

            # Handling potential errors
            else:
                error_code = report_error(result.error)

                write_resolve_output(
                    index,
                    row,
                    error=error_code,
                    redirects=(len(result.stack) - 1) if result.stack else None,
                    chain='|'.join(step.type for step in result.stack) if result.stack else None
                )
예제 #3
0
def fetch_action(namespace):

    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    single_url = namespace.file is sys.stdin and is_url(namespace.column)

    if single_url:
        edit_namespace_with_csv_io(namespace, 'url')

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers = v

    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    # Resume listener
    listener = None
    resuming_reader_loading = None
    skipped = 0

    if resuming:
        resuming_reader_loading = tqdm(desc='Resuming',
                                       dynamic_ncols=True,
                                       unit=' lines')

        def listener(event, row):
            nonlocal skipped

            if event == 'resume.output':
                resuming_reader_loading.update()

            if event == 'resume.input':
                skipped += 1
                loading_bar.set_postfix(skipped=skipped)
                loading_bar.update()

    # Enricher
    enricher = casanova.threadsafe_enricher(
        namespace.file,
        output_file,
        resumable=resuming,
        auto_resume=False,
        add=OUTPUT_ADDITIONAL_HEADERS +
        (['raw_contents'] if namespace.contents_in_report else []),
        keep=namespace.select,
        listener=listener)

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.'
            % namespace.column
        ])

    url_pos = enricher.pos[namespace.column]

    filename_pos = None

    if namespace.filename is not None:
        if namespace.filename not in enricher.pos:
            die([
                'Could not find the "%s" column containing the filenames in the given CSV file.'
                % namespace.filename
            ])

        filename_pos = enricher.pos[namespace.filename]

    indexed_input_headers = {h: i for i, h in enumerate(enricher.fieldnames)}

    if resuming:
        enricher.resume()
        resuming_reader_loading.close()

    # Loading bar
    total = namespace.total

    loading_bar = tqdm(desc='Fetching pages',
                       total=total,
                       dynamic_ncols=True,
                       unit=' urls')

    def url_key(item):
        url = item[1][url_pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index,
                     row,
                     resolved=None,
                     status=None,
                     error=None,
                     filename=None,
                     encoding=None,
                     data=None):

        addendum = [
            resolved or '', status or '', error or '', filename or '', encoding
            or ''
        ]

        if namespace.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    fetch_kwargs = {
        'threads': namespace.threads,
        'throttle': namespace.throttle,
        'domain_parallelism': namespace.domain_parallelism
    }

    if namespace.timeout is not None:
        fetch_kwargs['timeout'] = namespace.timeout

    multithreaded_iterator = multithreaded_fetch(enricher,
                                                 key=url_key,
                                                 request_args=request_args,
                                                 **fetch_kwargs)

    for result in multithreaded_iterator:
        index, row = result.item

        if not result.url:

            write_output(index, row)

            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=row[filename_pos]
                            if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, row))
                    else:
                        filename = row[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and namespace.standardize_encoding or namespace.contents_in_report:
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(
                        encoding if encoding is not None else 'utf-8',
                        errors='replace')
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(
                        gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                index,
                row,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data)

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(index, row, error=error_code)

    # Closing files
    output_file.close()