示例#1
0
    def __init__(self, cookie='firefox'):
        self.pool = create_pool(timeout=INSTAGRAM_PUBLIC_API_DEFAULT_TIMEOUT)

        if cookie in COOKIE_BROWSERS:
            get_cookie_for_url = grab_cookies(cookie)
            cookie = get_cookie_for_url(INSTAGRAM_URL)

        if not cookie:
            raise InstagramInvalidCookieError

        self.cookie = cookie
示例#2
0
def grab_facebook_cookie(source):
    if source in COOKIE_BROWSERS:
        get_cookie_for_url = grab_cookies(source)

        if get_cookie_for_url is None:
            return None

        cookie = get_cookie_for_url(FACEBOOK_URL + '/')

    else:
        cookie = source.strip()

    if not cookie or 'c_user=' not in cookie.lower():
        return None

    return fix_cookie(cookie)
示例#3
0
def fetch_action(cli_args, resolve=False, defer=None):

    # If we are hitting a single url we enable contents_in_report by default
    if not resolve and isinstance(cli_args.file, StringIO) and cli_args.contents_in_report is None:
        cli_args.contents_in_report = True

    if not resolve and cli_args.contents_in_report and cli_args.compress:
        raise InvalidArgumentsError('Cannot both --compress and output --contents-in-report!')

    # HTTP method
    http_method = cli_args.method

    # Cookie grabber
    get_cookie = None
    if cli_args.grab_cookies:
        get_cookie = grab_cookies(cli_args.grab_cookies)

    # Global headers
    global_headers = None
    if cli_args.headers:
        global_headers = {}

        for header in cli_args.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    # Resume listener
    skipped_rows = 0
    resuming_reader_loading = None

    if cli_args.resume and cli_args.output.can_resume():
        resuming_reader_loading = LoadingBar(
            desc='Resuming',
            unit='line'
        )

        def output_read_listener(event, row):
            nonlocal skipped_rows

            if event != 'output.row':
                return

            skipped_rows += 1
            resuming_reader_loading.update()

        cli_args.output.listener = output_read_listener

    if resolve:
        additional_headers = RESOLVE_ADDITIONAL_HEADERS
    else:
        additional_headers = FETCH_ADDITIONAL_HEADERS

        if cli_args.contents_in_report:
            additional_headers = additional_headers + ['raw_contents']

    # Enricher
    multiplex = None

    if cli_args.separator is not None:
        multiplex = (cli_args.column, cli_args.separator)

    enricher = casanova.threadsafe_enricher(
        cli_args.file,
        cli_args.output,
        add=additional_headers,
        keep=cli_args.select,
        total=cli_args.total,
        prebuffer_bytes=DEFAULT_PREBUFFER_BYTES,
        multiplex=multiplex
    )

    if resuming_reader_loading is not None:
        resuming_reader_loading.close()

    if cli_args.column not in enricher.headers:
        raise InvalidArgumentsError('Could not find the "%s" column containing the urls in the given CSV file.' % cli_args.column)

    url_pos = enricher.headers[cli_args.column]

    filename_pos = None

    if not resolve and cli_args.filename is not None:
        if cli_args.filename not in enricher.headers:
            raise InvalidArgumentsError('Could not find the "%s" column containing the filenames in the given CSV file.' % cli_args.filename)

        filename_pos = enricher.headers[cli_args.filename]

    # Loading bar
    loading_bar = LoadingBar(
        desc='Fetching pages',
        total=enricher.total,
        unit='url',
        initial=skipped_rows
    )
    defer(loading_bar.close)  # NOTE: it could be dangerous with multithreaded execution, not to close it ourselves

    def update_loading_bar(result):
        nonlocal errors

        if result.error is not None:
            errors += 1
        else:
            if resolve:
                status = result.stack[-1].status
            else:
                status = result.response.status

            if status >= 400:
                status_codes[status] += 1

        stats = {'errors': errors}

        for code, count in status_codes.most_common(1):
            stats[str(code)] = count

        loading_bar.update_stats(**stats)
        loading_bar.update()

    only_shortened = getattr(cli_args, 'only_shortened', False)

    def url_key(item):
        url = item[1][url_pos].strip()

        if not url:
            return

        if only_shortened and not is_shortened_url(url):
            return

        # Url templating
        if cli_args.url_template:
            return cli_args.url_template.format(value=url)

        return url

    def request_args(domain, url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {
            'method': http_method,
            'cookie': cookie,
            'headers': headers
        }

    # Worker callback internals
    filename_builder = None
    files_writer = None

    if not resolve:
        try:
            filename_builder = FilenameBuilder(
                folder_strategy=cli_args.folder_strategy,
                template=cli_args.filename_template
            )
        except TypeError:
            die([
                'Invalid "%s" --folder-strategy!' % cli_args.folder_strategy,
                'Check the list at the end of the command help:',
                '  $ minet fetch -h'
            ])

        files_writer = ThreadSafeFilesWriter(cli_args.output_dir)

    def worker_callback(result):
        # NOTE: at this point the callback is only fired on success
        row = result.item[1]
        response = result.response
        meta = result.meta

        if cli_args.keep_failed_contents and response.status != 200:
            return

        # First we need to build a filename
        filename_cell = row[filename_pos] if filename_pos else None

        formatter_kwargs = {}

        if cli_args.filename_template and 'line' in cli_args.filename_template:
            formatter_kwargs['line'] = enricher.wrap(row)

        try:
            filename = filename_builder(
                result.resolved,
                filename=filename_cell,
                ext=meta.get('ext'),
                formatter_kwargs=formatter_kwargs,
                compressed=cli_args.compress
            )
        except FilenameFormattingError as e:
            result.error = e
            return

        meta['filename'] = filename

        # Decoding the response data?
        is_text = meta.get('is_text', False)
        original_encoding = meta.get('encoding', 'utf-8')

        data = response.data
        binary = True

        if is_text and (cli_args.standardize_encoding or cli_args.contents_in_report):
            data = data.decode(original_encoding, errors='replace')
            binary = False

            if cli_args.contents_in_report:
                meta['decoded_contents'] = data

        # Writing the file?
        # TODO: specify what should happen when contents are empty (e.g. POST queries)
        if data and not cli_args.contents_in_report:
            files_writer.write(
                filename,
                data,
                binary=binary,
                compress=cli_args.compress
            )

    def write_fetch_output(index, row, resolved=None, status=None, error=None,
                           filename=None, encoding=None, mimetype=None, data=None):

        addendum = [
            resolved or '',
            status or '',
            error or '',
            filename or '',
            mimetype or '',
            encoding or ''
        ]

        if cli_args.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    def write_resolve_output(index, row, resolved=None, status=None, error=None,
                             redirects=None, chain=None):
        addendum = [
            resolved or '',
            status or '',
            error or '',
            redirects or '',
            chain or ''
        ]

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    common_kwargs = {
        'key': url_key,
        'insecure': cli_args.insecure,
        'threads': cli_args.threads,
        'throttle': cli_args.throttle,
        'domain_parallelism': cli_args.domain_parallelism,
        'max_redirects': cli_args.max_redirects,
        'wait': False,
        'daemonic': True
    }

    if cli_args.timeout is not None:
        common_kwargs['timeout'] = cli_args.timeout

    # Normal fetch
    if not resolve:

        multithreaded_iterator = multithreaded_fetch(
            enricher,
            request_args=request_args,
            callback=worker_callback,
            **common_kwargs
        )

        for result in multithreaded_iterator:
            index, row = result.item

            if not result.url:

                write_fetch_output(
                    index,
                    row
                )

                loading_bar.update()
                continue

            # Updating stats
            update_loading_bar(result)

            # No error
            if result.error is None:
                meta = result.meta

                # Final url target
                resolved_url = result.resolved

                if resolved_url == result.url:
                    resolved_url = None

                # Reporting in output
                write_fetch_output(
                    index,
                    row,
                    resolved=resolved_url,
                    status=result.response.status,
                    filename=meta.get('filename'),
                    encoding=meta.get('encoding'),
                    mimetype=meta.get('mimetype'),
                    data=meta.get('decoded_contents')
                )

            # Handling potential errors
            else:
                error_code = report_error(result.error)

                resolved = None

                if isinstance(result.error, InvalidURLError):
                    resolved = result.error.url

                if isinstance(result.error, FilenameFormattingError):
                    loading_bar.print(report_filename_formatting_error(result.error))

                write_fetch_output(
                    index,
                    row,
                    error=error_code,
                    resolved=resolved
                )

    # Resolve
    else:

        multithreaded_iterator = multithreaded_resolve(
            enricher,
            resolve_args=request_args,
            follow_meta_refresh=cli_args.follow_meta_refresh,
            follow_js_relocation=cli_args.follow_js_relocation,
            infer_redirection=cli_args.infer_redirection,
            **common_kwargs
        )

        for result in multithreaded_iterator:
            index, row = result.item

            if not result.url:

                write_resolve_output(
                    index,
                    row
                )

                loading_bar.update()
                continue

            # Updating stats
            update_loading_bar(result)

            # No error
            if result.error is None:

                # Reporting in output
                last = result.stack[-1]

                write_resolve_output(
                    index,
                    row,
                    resolved=last.url,
                    status=last.status,
                    redirects=len(result.stack) - 1,
                    chain='|'.join(step.type for step in result.stack)
                )

            # Handling potential errors
            else:
                error_code = report_error(result.error)

                write_resolve_output(
                    index,
                    row,
                    error=error_code,
                    redirects=(len(result.stack) - 1) if result.stack else None,
                    chain='|'.join(step.type for step in result.stack) if result.stack else None
                )