Example #1
def grab_facebook_cookie(source):
    if source == 'firefox' or source == 'chrome':
        get_cookie_for_url = grab_cookies(source)

        if get_cookie_for_url is None:
            return None

        cookie = get_cookie_for_url(FACEBOOK_URL + '/')

    else:
        cookie = source.strip()

    if not cookie:
        return None

    return fix_cookie(cookie)
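
A minimal usage sketch for the helper above (the raw cookie string below is hypothetical; grab_cookies, fix_cookie and FACEBOOK_URL come from the surrounding module):

# Extract the cookie from a locally installed browser.
cookie = grab_facebook_cookie('firefox')

if cookie is None:
    print('No relevant cookie found.')

# Or normalize a cookie string given directly (value is made up).
cookie = grab_facebook_cookie('c_user=123; xs=abc')
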
Example #2
def grab_facebook_cookie(namespace):
    if namespace.cookie == 'firefox' or namespace.cookie == 'chrome':
        get_cookie_for_url = grab_cookies(namespace.cookie)

        if get_cookie_for_url is None:
            die('Could not extract cookies from %s.' % namespace.cookie)

        cookie = get_cookie_for_url(FACEBOOK_URL + '/')

    else:
        cookie = namespace.cookie.strip()

    if not cookie:
        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook pages.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    return fix_cookie(cookie)
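
This variant takes an argparse-style namespace and aborts through die() rather than returning None. A hedged wiring sketch (the --cookie flag name is taken from the error message above; the default value is an assumption):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--cookie', default='firefox',
                    help='Browser to extract the cookie from, or a raw cookie string.')

namespace = parser.parse_args(['--cookie', 'chrome'])
cookie = grab_facebook_cookie(namespace)  # calls die() with an explanation on failure
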
Example #3
def fetch_action(namespace):

    # Are we resuming?
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    if namespace.file is sys.stdin and is_url(namespace.column):
        namespace.file = StringIO('url\n%s' % namespace.column)
        namespace.column = 'url'

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    input_headers, pos, reader = custom_reader(namespace.file,
                                               namespace.column)
    filename_pos = (input_headers.index(namespace.filename)
                    if namespace.filename else None)
    indexed_input_headers = {h: p for p, h in enumerate(input_headers)}

    selected_fields = namespace.select.split(',') if namespace.select else None
    selected_pos = [input_headers.index(h)
                    for h in selected_fields] if selected_fields else None

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    # Reading output
    output_headers = (list(input_headers) if not selected_pos else
                      [input_headers[i] for i in selected_pos])
    output_headers += OUTPUT_ADDITIONAL_HEADERS

    if namespace.contents_in_report:
        output_headers.append('raw_content')

    flag = 'w'

    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    output_writer = csv.writer(output_file)

    if not resuming:
        output_writer.writerow(output_headers)
    else:

        # Reading the report to know what needs to be done
        _, rpos, resuming_reader = custom_reader(output_file, 'line')

        resuming_reader_loading = tqdm(resuming_reader,
                                       desc='Resuming',
                                       dynamic_ncols=True,
                                       unit=' lines')

        already_done = ContiguousRangeSet()

        for line in resuming_reader_loading:
            index = line[rpos]

            already_done.add(int(index))

    # Loading bar
    total = namespace.total

    if total is not None and resuming:
        total -= len(already_done)

    loading_bar = tqdm(desc='Fetching pages',
                       total=total,
                       dynamic_ncols=True,
                       unit=' urls')

    def url_key(item):
        line = item[1]
        url = line[pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index,
                     line,
                     resolved=None,
                     status=None,
                     error=None,
                     filename=None,
                     encoding=None,
                     data=None):

        if selected_pos:
            line = [line[p] for p in selected_pos]

        line.extend([
            index, resolved or '', status or '', error or '', filename or '',
            encoding or ''
        ])

        if namespace.contents_in_report:
            line.append(data or '')

        output_writer.writerow(line)

    errors = 0
    status_codes = Counter()

    target_iterator = enumerate(reader)

    if resuming:
        target_iterator = (pair for pair in target_iterator
                           if not already_done.stateful_contains(pair[0]))

    multithreaded_iterator = multithreaded_fetch(target_iterator,
                                                 key=url_key,
                                                 request_args=request_args,
                                                 threads=namespace.threads,
                                                 throttle=namespace.throttle)

    for result in multithreaded_iterator:
        line_index, line = result.item

        if not result.url:

            write_output(line_index, line)

            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=line[filename_pos]
                            if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, line))
                    else:
                        filename = line[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and (namespace.standardize_encoding
                         or namespace.contents_in_report):
                if encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(
                        encoding if encoding is not None else 'utf-8',
                        errors='replace')
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(
                        gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                line_index,
                line,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data)

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(line_index, line, error=error_code)

    # Closing files
    if namespace.output is not None:
        output_file.close()
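
The resuming branch above boils down to: collect the row indices already written to the previous report, then skip those rows when re-reading the input. A simplified, self-contained sketch of that pattern, using a plain set as a stand-in for ContiguousRangeSet (which presumably stores contiguous index ranges more compactly):

# Hypothetical indices recovered from an existing report.
already_done = {0, 1, 2, 5}

rows = ['a', 'b', 'c', 'd', 'e', 'f', 'g']

# Only rows whose index is absent from the set get fetched again.
target_iterator = ((i, row) for i, row in enumerate(rows)
                   if i not in already_done)

print(list(target_iterator))  # [(3, 'd'), (4, 'e'), (6, 'g')]
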
Example #4
def fetch_action(namespace):

    # Are we resuming?
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    single_url = namespace.file is sys.stdin and is_url(namespace.column)

    if single_url:
        edit_namespace_with_csv_io(namespace, 'url')

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    # Resume listener
    listener = None
    resuming_reader_loading = None
    skipped = 0

    if resuming:
        resuming_reader_loading = tqdm(desc='Resuming',
                                       dynamic_ncols=True,
                                       unit=' lines')

        def listener(event, row):
            nonlocal skipped

            if event == 'resume.output':
                resuming_reader_loading.update()

            if event == 'resume.input':
                skipped += 1
                loading_bar.set_postfix(skipped=skipped)
                loading_bar.update()

    # Enricher
    enricher = casanova.threadsafe_enricher(
        namespace.file,
        output_file,
        resumable=resuming,
        auto_resume=False,
        add=OUTPUT_ADDITIONAL_HEADERS +
        (['raw_contents'] if namespace.contents_in_report else []),
        keep=namespace.select,
        listener=listener)

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.'
            % namespace.column
        ])

    url_pos = enricher.pos[namespace.column]

    filename_pos = None

    if namespace.filename is not None:
        if namespace.filename not in enricher.pos:
            die([
                'Could not find the "%s" column containing the filenames in the given CSV file.'
                % namespace.filename
            ])

        filename_pos = enricher.pos[namespace.filename]

    indexed_input_headers = {h: i for i, h in enumerate(enricher.fieldnames)}

    if resuming:
        enricher.resume()
        resuming_reader_loading.close()

    # Loading bar
    total = namespace.total

    loading_bar = tqdm(desc='Fetching pages',
                       total=total,
                       dynamic_ncols=True,
                       unit=' urls')

    def url_key(item):
        url = item[1][url_pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index,
                     row,
                     resolved=None,
                     status=None,
                     error=None,
                     filename=None,
                     encoding=None,
                     data=None):

        addendum = [
            resolved or '', status or '', error or '', filename or '',
            encoding or ''
        ]

        if namespace.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    fetch_kwargs = {
        'threads': namespace.threads,
        'throttle': namespace.throttle,
        'domain_parallelism': namespace.domain_parallelism
    }

    if namespace.timeout is not None:
        fetch_kwargs['timeout'] = namespace.timeout

    multithreaded_iterator = multithreaded_fetch(enricher,
                                                 key=url_key,
                                                 request_args=request_args,
                                                 **fetch_kwargs)

    for result in multithreaded_iterator:
        index, row = result.item

        if not result.url:

            write_output(index, row)

            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=row[filename_pos]
                            if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, row))
                    else:
                        filename = row[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and (namespace.standardize_encoding
                         or namespace.contents_in_report):
                if encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(
                        encoding if encoding is not None else 'utf-8',
                        errors='replace')
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(
                        gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                index,
                row,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data)

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(index, row, error=error_code)

    # Closing files
    output_file.close()
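
The filename_template mechanism above behaves like str.format with value, ext and line exposed as fields; CUSTOM_FORMATTER presumably layers extra conveniences on top. A minimal sketch with the standard formatter (template and values are made up):

# Unused fields are simply ignored by str.format, as in the real call.
template = '{value}{ext}'

filename = template.format(value='some-page',
                           ext='.html',
                           line={'url': 'https://example.com'})

print(filename)  # some-page.html
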