Example #1
    def test_basics(self):

        for url, result in DEFAULT_TESTS:
            assert is_url(url) == result

        for url, result in NO_PROTOCOL_TESTS:
            assert is_url(url, require_protocol=False) == result

        for url, result in TLD_AWARE_TESTS:
            assert is_url(url, require_protocol=False, tld_aware=True) == result

        for url, result in RELAXED_TESTS:
            assert is_url(url, require_protocol=False, allow_spaces_in_path=True) == result

        for url, result in ONLY_HTTP_HTTPS_TESTS:
            assert is_url(url, only_http_https=True) == result
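
For context, the flags exercised by these tests belong to ural's is_url helper. A minimal standalone sketch, assuming ural is installed; the sample URLs below are illustrative and not taken from the test fixtures:

from ural import is_url

print(is_url('https://www.example.com'))              # True
print(is_url('example.com'))                          # False: a protocol is required by default
print(is_url('example.com', require_protocol=False))  # True once the protocol requirement is relaxed
print(is_url('example.com/some path',
             require_protocol=False,
             allow_spaces_in_path=True))              # spaces in the path are tolerated with this flag
print(is_url('ftp://example.com/file.txt',
             only_http_https=True))                   # False: scheme restricted to http/https
print(is_url('example.invalidsuffix',
             require_protocol=False,
             tld_aware=True))                         # False when the suffix is not a known TLD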
Example #2
def raw_request(http, url, method='GET', headers=None,
                preload_content=True, release_conn=True, timeout=None,
                body=None):
    """
    Generic request helpers using a urllib3 pool to access some resource.
    """

    # Validating URL
    if not ural.is_url(url, require_protocol=True, tld_aware=True, allow_spaces_in_path=True):
        return InvalidURLError(url=url), None

    # Performing request
    request_kwargs = {
        'headers': headers,
        'body': body,
        'preload_content': preload_content,
        'release_conn': release_conn,
        'redirect': False,
        'retries': False
    }

    if timeout is not None:
        request_kwargs['timeout'] = timeout

    try:
        response = http.request(
            method,
            url,
            **request_kwargs
        )
    except Exception as e:
        return e, None

    return None, response
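
One possible way to call this helper, sketched under the assumption that http is a plain urllib3 pool; the target URL is only a placeholder:

import urllib3

http = urllib3.PoolManager()

err, response = raw_request(http, 'https://www.example.com', timeout=5)

if err is not None:
    print('request failed:', err)
else:
    # preload_content is True by default, so the body is already read
    print(response.status, len(response.data))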
Example #3
File: summary.py Project: lebelgique/minet
def crowdtangle_summary_action(namespace, output_file):
    if not namespace.start_date:
        die('Missing --start-date!')

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select.split(',') if namespace.select else None,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS)

    posts_writer = None

    if namespace.posts is not None:
        posts_writer = csv.writer(namespace.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = tqdm(desc='Collecting data',
                       dynamic_ncols=True,
                       total=namespace.total,
                       unit=' urls')

    client = CrowdTangleAPIClient(namespace.token,
                                  rate_limit=namespace.rate_limit)

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        try:
            stats = client.summary(url,
                                   start_date=namespace.start_date,
                                   with_top_posts=namespace.posts is not None,
                                   sort_by=namespace.sort_by,
                                   format='csv_row',
                                   platforms=namespace.platforms)

        except CrowdTangleInvalidTokenError:
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        except Exception as err:
            raise err

        if namespace.posts is not None:
            stats, posts = stats

            if posts is not None:
                for post in posts:
                    posts_writer.writerow([url] + post)

        enricher.writerow(row, stats)

        loading_bar.update()
Example #4
    def create(self, request):

        # check whether the url field is blank
        if request.data.get('url') is None:
            return Response({"message": "URL field cannot be blank"}, status=status.HTTP_400_BAD_REQUEST)

        # process a possibly malformed url, rejecting values that do not form a valid url
        schemed_url = ensure_protocol(
            request.data.get('url'), protocol='https')
        if not is_url(schemed_url):
            return Response({"message": "Enter a valid url"}, status=status.HTTP_400_BAD_REQUEST)

        # check that the name input contains only letters, numbers, underscores and hyphens
        pattern = "^[A-Za-z0-9_-]*$"
        name = request.data.get('name')
        if name is not None and not re.match(pattern, name):
            return Response({"message": "Name can contain only letters, numbers, underscores and hyphens"}, status=status.HTTP_400_BAD_REQUEST)

        # maps request data to serializer class to get an object
        serializer = serializers.MemeSerializer(data={"name": request.data.get(
            'name'), "url": schemed_url, "caption": request.data.get("caption")})

        # check that the serializer object is valid, i.e. all required fields are present
        if serializer.is_valid():

            # extract the various parameters sent in request data
            creator = serializer.data.get('name')
            caption = serializer.data.get('caption')
            url = serializer.data.get('url')

            # set creationDateTime, creationDate and lastUpdate to the current date and time
            creationDateTime = timezone.now()
            creationDate = date.today()
            updatedDateTime = timezone.now()

            # create a meme object with data extracted
            obj = Meme(caption=caption, url=schemed_url, name=creator, creationDateTime=creationDateTime,
                       creationDate=creationDate, lastUpdate=updatedDateTime)

            # check if this meme object already exists
            query_obj2 = Meme.objects.filter(url=schemed_url).filter(
                name=creator).filter(caption=caption)
            if len(query_obj2) >= 1:
                return Response({'message': 'This meme already exists'}, status=status.HTTP_409_CONFLICT)

            # if the meme object does not exist, create a new meme by saving it to the database
            obj.save()
            # get the id of the meme object created
            postCounter = obj.id
            # return the id of the meme object created with accepted status code
            return Response({'id': str(postCounter)}, status=status.HTTP_201_CREATED)
        else:
            """if any required data was missing or if serializer object could
            not be created,return the exact serialization error that occured
            with bad request status code"""
            return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
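
The scheme-then-validate pattern used in this view can be reproduced in isolation with ural's helpers; a brief sketch, where the inputs are made-up examples:

from ural import ensure_protocol, is_url

raw = 'knowyourmeme.com/memes/doge'   # hypothetical user input lacking a scheme
schemed = ensure_protocol(raw, protocol='https')
print(schemed)                        # 'https://knowyourmeme.com/memes/doge'
print(is_url(schemed))                # True

# a string that cannot form a valid url is rejected even after prepending a scheme
print(is_url(ensure_protocol('not a url at all', protocol='https')))  # False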
Example #5
def facebook_comments_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookCommentScraper(namespace.cookie)
    except FacebookInvalidCookieError:
        if namespace.cookie in ['firefox', 'chrome']:
            die('Could not extract cookies from %s.' % namespace.cookie)

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=FACEBOOK_COMMENT_CSV_HEADERS)

    # Loading bar
    loading_bar = tqdm(desc='Scraping comments',
                       dynamic_ncols=True,
                       unit=' comments')

    for i, (row,
            url) in enumerate(enricher.cells(namespace.column,
                                             with_rows=True)):

        if not is_facebook_post_url(url):
            loading_bar.close()
            die('Given url (line %i) is not a Facebook post url: %s' %
                (i + 1, url))

        batches = scraper(url, per_call=True, detailed=True, format='csv_row')

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment)

            loading_bar.update(len(batch))
            loading_bar.set_postfix(calls=details['calls'],
                                    replies=details['replies'],
                                    q=details['queue_size'],
                                    posts=i + 1)

    loading_bar.close()
Example #6
def url_parse_action(namespace):

    output_file = open_output_file(namespace.output)

    headers = REPORT_HEADERS

    if namespace.facebook:
        headers = FACEBOOK_REPORT_HEADERS
    elif namespace.youtube:
        headers = YOUTUBE_REPORT_HEADERS

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=headers,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' rows',
        total=namespace.total
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True, require_protocol=False):
                enricher.writerow(row)
                continue

            if namespace.facebook:
                addendum = extract_facebook_addendum(url)
            elif namespace.youtube:
                addendum = extract_youtube_addendum(url)
            else:
                addendum = extract_standard_addendum(namespace, url)

            if addendum is None:
                enricher.writerow(row)
                continue

            enricher.writerow(row, addendum)

    output_file.close()
Example #7
def facebook_url_likes_action(namespace):
    output_file = open_output_file(namespace.output)

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % namespace.column
        ])

    loading_bar = tqdm(
        desc='Retrieving likes',
        dynamic_ncols=True,
        unit=' urls',
        total=namespace.total
    )

    http = create_pool()

    for row, url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        url = url.strip()

        err, html = make_request(http, url)

        if err is not None:
            loading_bar.close()
            die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.close()
            die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)
Example #8
File: url_parse.py Project: zanachka/minet
def url_parse_action(cli_args):
    headers = REPORT_HEADERS

    if cli_args.facebook:
        headers = FACEBOOK_REPORT_HEADERS
    elif cli_args.youtube:
        headers = YOUTUBE_REPORT_HEADERS

    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 add=headers,
                                 keep=cli_args.select)

    loading_bar = LoadingBar(desc='Parsing', unit='row', total=cli_args.total)

    for row, cell in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        if cli_args.separator:
            urls = cell.split(cli_args.separator)
        else:
            urls = [cell]

        for url in urls:
            url = url.strip()

            if not is_url(
                    url, allow_spaces_in_path=True, require_protocol=False):
                enricher.writerow(row)
                continue

            if cli_args.facebook:
                addendum = extract_facebook_addendum(url)
            elif cli_args.youtube:
                addendum = extract_youtube_addendum(url)
            else:
                addendum = extract_standard_addendum(cli_args, url)

            if addendum is None:
                enricher.writerow(row)
                continue

            enricher.writerow(row, addendum)
Example #9
def facebook_url_likes_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=REPORT_HEADERS,
        total=cli_args.total,
        prebuffer_bytes=DEFAULT_PREBUFFER_BYTES
    )

    if cli_args.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % cli_args.column
        ])

    loading_bar = LoadingBar(
        desc='Retrieving likes',
        unit='url',
        total=enricher.total
    )

    for row, url in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        url = url.strip()

        if not url or not is_url(url, require_protocol=False):
            enricher.writerow(row)
            continue

        err, html = make_request(url)

        if err is not None:
            loading_bar.die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)
Example #10
def url_parse_action(namespace):

    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=REPORT_HEADERS,
                                 keep=namespace.select)

    loading_bar = tqdm(desc='Parsing',
                       dynamic_ncols=True,
                       unit=' rows',
                       total=namespace.total)

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True):
                enricher.writerow(row)
                continue

            enricher.writerow(row, [
                normalize_url(url,
                              strip_protocol=namespace.strip_protocol,
                              strip_trailing_slash=True),
                get_domain_name(url),
                get_hostname(url),
                get_normalized_hostname(url)
            ])

    output_file.close()
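
For reference, the ural helpers composing the addendum above can be tried on their own; a small sketch with an illustrative URL (the exact normalized form depends on ural's version and rules):

from ural import (
    normalize_url,
    get_domain_name,
    get_hostname,
    get_normalized_hostname
)

url = 'https://www.example.com/path/?utm_source=newsletter'

print(normalize_url(url, strip_trailing_slash=True))  # normalized form, e.g. with tracking params dropped
print(get_domain_name(url))                           # e.g. 'example.com'
print(get_hostname(url))                              # e.g. 'www.example.com'
print(get_normalized_hostname(url))                   # e.g. 'example.com'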
Example #11
    def partial_update(self, request, pk=None):
        # if the id input is anything other than an integer, return status bad request
        try:
            int(pk)
        except ValueError:
            return Response({"message": "Enter positive number"}, status=status.HTTP_400_BAD_REQUEST)

        # if the id input is a negative integer, return status bad request
        if int(pk) < 0:
            return Response({"message": "Enter positive number"}, status=status.HTTP_400_BAD_REQUEST)
        # get meme object by its id
        queryset = Meme.objects.filter(id=pk)

        # check if the meme object exists; if it does not, return http status not found
        if len(queryset) == 0:
            return Response(status=status.HTTP_404_NOT_FOUND)

        if request.data.get('name') is not None:
            return Response({"message": "Creator name cannot be changed!!"}, status=status.HTTP_400_BAD_REQUEST)

        if request.data.get('url') is None and request.data.get('caption') is None:
            return Response({"message": "Both url and caption cannot be none"}, status=status.HTTP_400_BAD_REQUEST)

        url = request.data.get('url')
        caption = request.data.get('caption')

        # if only the caption is supplied, only the caption is updated
        if url is None and caption is not None:
            obj = queryset[0]
            obj.caption = caption
            obj.lastUpdate = timezone.now()
            obj.save()
            # return response no content if successfully updated
            return Response(status=status.HTTP_204_NO_CONTENT)

        # if the url entered has no scheme, add one and check that a valid url is formed
        schemed_url = ensure_protocol(
            request.data.get('url'), protocol='https')
        if not is_url(schemed_url):
            return Response({"message": "Enter a valid url"}, status=status.HTTP_400_BAD_REQUEST)

        # if a meme with that id exists, map request data to the serializer to extract attributes based on the data supplied,
        # i.e. either url or caption or both
        if url is not None and caption is not None:
            serializer = serializers.MemeUpdateSerializer(
                data={"url": schemed_url, "caption": request.data.get("caption")}, partial=True)
        else:
            serializer = serializers.MemeUpdateSerializer(
                data={"url": schemed_url}, partial=True)

        # check that the serializer object is valid, i.e. all required fields are present and no extra fields are present
        if serializer.is_valid():
            obj = queryset[0]

            # extract the caption and url of the meme object
            oldCaption = obj.caption
            oldUrl = obj.url

            # set caption and url sent as request to new caption and new url
            newCaption = serializer.data.get('caption')
            newUrl = serializer.data.get('url')

            # if the new caption differs from the existing caption, update the caption field of the meme object
            if newCaption is not None and newCaption != oldCaption:
                obj.caption = newCaption
            # if the new url differs from the existing url, update the url field of the meme object
            if newUrl is not None and newUrl != oldUrl:
                obj.url = newUrl
            # if any field was updated, set the lastUpdate field to the current date and time
            if newUrl != oldUrl or newCaption != oldCaption:
                obj.lastUpdate = timezone.now()

            # save the meme object
            obj.save()
            # return response no content if successfully updated
            return Response(status=status.HTTP_204_NO_CONTENT)
        else:
            return Response(serializer.errors, status=status.HTTP_404_NOT_FOUND)
Example #12
def facebook_comments_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookMobileScraper(namespace.cookie,
                                        throttle=namespace.throttle)
    except FacebookInvalidCookieError:
        if namespace.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' %
                namespace.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=FACEBOOK_COMMENT_CSV_HEADERS)

    # Loading bar
    loading_bar = tqdm(desc='Scraping comments',
                       dynamic_ncols=True,
                       unit=' comments')

    for i, (row,
            url) in enumerate(enricher.cells(namespace.column,
                                             with_rows=True)):

        if not has_facebook_comments(url):
            tqdm.write(
                'Given url (line %i) probably cannot have Facebook comments: %s'
                % (i + 1, url),
                file=sys.stderr)
            continue

        batches = scraper.comments(url, per_call=True, detailed=True)

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment.as_csv_row())

            loading_bar.update(len(batch))
            loading_bar.set_postfix(calls=details['calls'],
                                    replies=details['replies'],
                                    q=details['queue_size'],
                                    posts=i + 1)

    loading_bar.close()
Example #13
File: is_url_test.py Project: oubine/ural
    def test_basics(self):
        for url, result in DEFAULT_TESTS:
            assert is_url(url) == result
        for url, result in NO_PROTOCOL_TESTS:
            assert is_url(url, require_protocol=False) == result
Example #14
def fetch_action(namespace):

    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    if namespace.file is sys.stdin and is_url(namespace.column):
        namespace.file = StringIO('url\n%s' % namespace.column)
        namespace.column = 'url'

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    input_headers, pos, reader = custom_reader(namespace.file,
                                               namespace.column)
    filename_pos = input_headers.index(
        namespace.filename) if namespace.filename else None
    indexed_input_headers = {h: p for p, h in enumerate(input_headers)}

    selected_fields = namespace.select.split(',') if namespace.select else None
    selected_pos = [input_headers.index(h)
                    for h in selected_fields] if selected_fields else None

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    # Reading output
    output_headers = (list(input_headers) if not selected_pos else
                      [input_headers[i] for i in selected_pos])
    output_headers += OUTPUT_ADDITIONAL_HEADERS

    if namespace.contents_in_report:
        output_headers.append('raw_content')

    flag = 'w'

    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    output_writer = csv.writer(output_file)

    if not resuming:
        output_writer.writerow(output_headers)
    else:

        # Reading the report to know what still needs to be done
        _, rpos, resuming_reader = custom_reader(output_file, 'line')

        resuming_reader_loading = tqdm(resuming_reader,
                                       desc='Resuming',
                                       dynamic_ncols=True,
                                       unit=' lines')

        already_done = ContiguousRangeSet()

        for line in resuming_reader_loading:
            index = line[rpos]

            already_done.add(int(index))

    # Loading bar
    total = namespace.total

    if total is not None and resuming:
        total -= len(already_done)

    loading_bar = tqdm(desc='Fetching pages',
                       total=total,
                       dynamic_ncols=True,
                       unit=' urls')

    def url_key(item):
        line = item[1]
        url = line[pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index,
                     line,
                     resolved=None,
                     status=None,
                     error=None,
                     filename=None,
                     encoding=None,
                     data=None):

        if selected_pos:
            line = [line[p] for p in selected_pos]

        line.extend([
            index, resolved or '', status or '', error or '', filename or '',
            encoding or ''
        ])

        if namespace.contents_in_report:
            line.append(data or '')

        output_writer.writerow(line)

    errors = 0
    status_codes = Counter()

    target_iterator = enumerate(reader)

    if resuming:
        target_iterator = (pair for pair in target_iterator
                           if not already_done.stateful_contains(pair[0]))

    multithreaded_iterator = multithreaded_fetch(target_iterator,
                                                 key=url_key,
                                                 request_args=request_args,
                                                 threads=namespace.threads,
                                                 throttle=namespace.throttle)

    for result in multithreaded_iterator:
        line_index, line = result.item

        if not result.url:

            write_output(line_index, line)

            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=line[filename_pos]
                            if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, line))
                    else:
                        filename = line[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and namespace.standardize_encoding or namespace.contents_in_report:
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(
                        encoding if encoding is not None else 'utf-8',
                        errors='replace')
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(
                        gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                line_index,
                line,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data)

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(line_index, line, error=error_code)

    # Closing files
    if namespace.output is not None:
        output_file.close()
Example #15
def export_google_sheets_as_csv(url,
                                cookie=None,
                                authuser=None,
                                max_authuser_attempts=4):
    if is_url(url):
        parsed = parse_google_drive_url(url)

        if parsed is None or parsed.type != 'spreadsheets':
            raise GoogleSheetsInvalidTargetError
    else:
        parsed = GoogleDriveFile('spreadsheets', url)

    base_export_url = parsed.get_export_url()
    export_url = base_export_url

    if authuser is not None:
        if not isinstance(authuser, int) or authuser < 0:
            raise TypeError('authuser should be an int >= 0')

        export_url = append_authuser(export_url, authuser)
        max_authuser_attempts = 1
    else:
        authuser = 0

    if cookie is not None and cookie in COOKIE_BROWSERS:
        jar = getattr(browser_cookie3, cookie)()
        resolver = CookieResolver(jar)
        cookie = resolver(export_url)

        if cookie is None:
            raise GoogleSheetsMissingCookieError

    attempts = max_authuser_attempts

    while True:
        attempts -= 1

        err, response = request(export_url, cookie=cookie)

        if err:
            raise err

        if response.status == 404:
            raise GoogleSheetsNotFoundError

        if response.status == 401:
            raise GoogleSheetsUnauthorizedError

        if response.status == 403:
            authuser += 1

            if attempts != 0:
                export_url = append_authuser(base_export_url, authuser)
                continue

            raise GoogleSheetsMaxAttemptsExceeded

        if 'csv' not in response.headers.get('Content-Type', '').lower():
            raise GoogleSheetsInvalidContentTypeError

        break

    return response.data.decode('utf-8')
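
A possible call, assuming the function is importable from minet (the import path and spreadsheet URL below are placeholders for illustration):

import csv
import io

# hypothetical import path, adjust to where the helper actually lives
from minet.google import export_google_sheets_as_csv

data = export_google_sheets_as_csv(
    'https://docs.google.com/spreadsheets/d/SPREADSHEET_ID/edit',
    cookie='firefox'  # resolve the relevant cookie from a local browser, as handled above
)

for row in csv.reader(io.StringIO(data)):
    print(row)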
Example #16
def fetch_action(namespace):

    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    single_url = namespace.file is sys.stdin and is_url(namespace.column)

    if single_url:
        edit_namespace_with_csv_io(namespace, 'url')

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    # Resume listener
    listener = None
    resuming_reader_loading = None
    skipped = 0

    if resuming:
        resuming_reader_loading = tqdm(desc='Resuming',
                                       dynamic_ncols=True,
                                       unit=' lines')

        def listener(event, row):
            nonlocal skipped

            if event == 'resume.output':
                resuming_reader_loading.update()

            if event == 'resume.input':
                skipped += 1
                loading_bar.set_postfix(skipped=skipped)
                loading_bar.update()

    # Enricher
    enricher = casanova.threadsafe_enricher(
        namespace.file,
        output_file,
        resumable=resuming,
        auto_resume=False,
        add=OUTPUT_ADDITIONAL_HEADERS +
        (['raw_contents'] if namespace.contents_in_report else []),
        keep=namespace.select,
        listener=listener)

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.'
            % namespace.column
        ])

    url_pos = enricher.pos[namespace.column]

    filename_pos = None

    if namespace.filename is not None:
        if namespace.filename not in enricher.pos:
            die([
                'Could not find the "%s" column containing the filenames in the given CSV file.'
                % namespace.filename
            ])

        filename_pos = enricher.pos[namespace.filename]

    indexed_input_headers = {h: i for i, h in enumerate(enricher.fieldnames)}

    if resuming:
        enricher.resume()
        resuming_reader_loading.close()

    # Loading bar
    total = namespace.total

    loading_bar = tqdm(desc='Fetching pages',
                       total=total,
                       dynamic_ncols=True,
                       unit=' urls')

    def url_key(item):
        url = item[1][url_pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index,
                     row,
                     resolved=None,
                     status=None,
                     error=None,
                     filename=None,
                     encoding=None,
                     data=None):

        addendum = [
            resolved or '', status or '', error or '', filename or '', encoding
            or ''
        ]

        if namespace.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    fetch_kwargs = {
        'threads': namespace.threads,
        'throttle': namespace.throttle,
        'domain_parallelism': namespace.domain_parallelism
    }

    if namespace.timeout is not None:
        fetch_kwargs['timeout'] = namespace.timeout

    multithreaded_iterator = multithreaded_fetch(enricher,
                                                 key=url_key,
                                                 request_args=request_args,
                                                 **fetch_kwargs)

    for result in multithreaded_iterator:
        index, row = result.item

        if not result.url:

            write_output(index, row)

            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=row[filename_pos]
                            if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, row))
                    else:
                        filename = row[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and namespace.standardize_encoding or namespace.contents_in_report:
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(
                        encoding if encoding is not None else 'utf-8',
                        errors='replace')
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(
                        gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                index,
                row,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data)

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(index, row, error=error_code)

    # Closing files
    output_file.close()