Пример #1
0
def captions_action(namespace, output_file):
    """Write one enriched CSV row per caption line of each input video.

    Videos are read from the `namespace.column` column of `namespace.file`.
    When stdin is the input and is an interactive terminal,
    `edit_namespace_with_csv_io` is first called with the 'video' column
    name. Videos for which `get_video_captions` returns None produce no
    output row.

    Args:
        namespace: parsed CLI arguments (reads `file`, `column`, `select`
            and `lang`).
        output_file: file handed to the casanova enricher.
    """
    if namespace.file is sys.stdin and sys.stdin.isatty():
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_CAPTIONS_CSV_HEADERS,
        keep=namespace.select
    )

    progress = LoadingBar('Retrieving captions', unit='video')

    for row, video in enricher.cells(namespace.column, with_rows=True):
        captions = get_video_captions(video, langs=namespace.lang)

        # The bar counts processed videos, whether or not captions were found
        progress.update()

        if captions is not None:
            track, lines = captions

            # Cells shared by every caption line of this track
            leading_cells = [track.lang, '1' if track.generated else '']

            for line in lines:
                enricher.writerow(row, [*leading_cells, *line])

    progress.close()
Пример #2
0
def comments_action(namespace, output_file):
    """Write one enriched CSV row per comment of each input video.

    Videos are read from the `namespace.column` column of `namespace.file`.
    When stdin is the input and is an interactive terminal,
    `edit_namespace_with_csv_io` is first called with the 'video' column
    name.

    Args:
        namespace: parsed CLI arguments (reads `file`, `column`, `select`
            and `key`).
        output_file: file handed to the casanova enricher.
    """
    if namespace.file is sys.stdin and sys.stdin.isatty():
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_COMMENT_CSV_HEADERS,
        keep=namespace.select
    )

    progress = LoadingBar(
        'Retrieving comments',
        unit='comment',
        stats={'videos': 0}
    )

    def notify_quota_exhausted(seconds):
        # Callback given to the client as `before_sleep_until_midnight`
        progress.print(
            'API limits reached. Will now wait until midnight Pacific time!'
        )

    client = YouTubeAPIClient(
        namespace.key,
        before_sleep_until_midnight=notify_quota_exhausted
    )

    for row, video in enricher.cells(namespace.column, with_rows=True):
        for comment in client.comments(video):
            progress.update()
            enricher.writerow(row, comment.as_csv_row())

        # One more video fully processed
        progress.inc('videos')

    progress.close()
Пример #3
0
def crowdtangle_summary_action(namespace, output_file):
    """Enrich each input url with its CrowdTangle summary statistics.

    Urls are read from the `namespace.column` column of `namespace.file`.
    When `namespace.posts` is given, the posts returned alongside the
    summary are written to it as CSV, each row prefixed with the url.

    Args:
        namespace: parsed CLI arguments (reads `start_date`, `column`,
            `file`, `select`, `posts`, `total`, `token`, `rate_limit`,
            `sort_by` and `platforms`).
        output_file: file handed to the casanova enricher.
    """
    if not namespace.start_date:
        die('Missing --start-date!')

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select.split(',') if namespace.select else None,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS)

    with_posts = namespace.posts is not None
    posts_writer = None

    if with_posts:
        posts_writer = csv.writer(namespace.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = tqdm(desc='Collecting data',
                       dynamic_ncols=True,
                       total=namespace.total,
                       unit=' urls')

    client = CrowdTangleAPIClient(namespace.token,
                                  rate_limit=namespace.rate_limit)

    try:
        for row, url in enricher.cells(namespace.column, with_rows=True):
            url = url.strip()

            # Any other exception propagates untouched to the caller
            try:
                stats = client.summary(url,
                                       start_date=namespace.start_date,
                                       with_top_posts=with_posts,
                                       sort_by=namespace.sort_by,
                                       format='csv_row',
                                       platforms=namespace.platforms)

            except CrowdTangleInvalidTokenError:
                # Close the bar first so it does not mix with the message
                loading_bar.close()
                die([
                    'Your API token is invalid.',
                    'Check that you indicated a valid one using the `--token` argument.'
                ])

            if with_posts:
                # With top posts requested, the client returns a pair
                stats, posts = stats

                if posts is not None:
                    for post in posts:
                        posts_writer.writerow([url] + post)

            enricher.writerow(row, stats)

            loading_bar.update()
    finally:
        # Always release the progress bar, even when an error bubbles up
        loading_bar.close()
Пример #4
0
def facebook_comments_action(namespace):
    """Scrape the comments of each Facebook post url into an enriched CSV.

    Post urls are read from the `namespace.column` column of
    `namespace.file`; every url must satisfy `is_facebook_post_url`,
    otherwise `die` is called with the offending line number.

    Args:
        namespace: parsed CLI arguments (reads `output`, `column`, `cookie`,
            `file` and `select`).
    """
    output_file = open_output_file(namespace.output)

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookCommentScraper(namespace.cookie)
    except FacebookInvalidCookieError:
        if namespace.cookie in ['firefox', 'chrome']:
            die('Could not extract cookies from %s.' % namespace.cookie)

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=FACEBOOK_COMMENT_CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Scraping comments',
        dynamic_ncols=True,
        unit=' comments'
    )

    # Lines are counted from 1 for both the error message and the stats
    for line_number, (row, url) in enumerate(
            enricher.cells(namespace.column, with_rows=True), 1):

        if not is_facebook_post_url(url):
            loading_bar.close()
            die('Given url (line %i) is not a Facebook post url: %s' %
                (line_number, url))

        for details, batch in scraper(url,
                                      per_call=True,
                                      detailed=True,
                                      format='csv_row'):
            for comment in batch:
                enricher.writerow(row, comment)

            loading_bar.update(len(batch))

            postfix = {
                'calls': details['calls'],
                'replies': details['replies'],
                'q': details['queue_size'],
                'posts': line_number
            }
            loading_bar.set_postfix(**postfix)

    loading_bar.close()
Пример #5
0
def search_action(namespace, output_file):
    """Search videos for each input keyword and write them as enriched rows.

    Result pages are requested one after the other, following the next page
    token returned by `get_data`. `namespace.limit`, when not None, caps the
    total number of rows written across all keywords.

    Args:
        namespace: parsed CLI arguments (reads `output`, `file`, `select`,
            `column`, `key` and `limit`).
        output_file: NOTE(review): immediately replaced by the file opened
            from `namespace.output`; kept only for interface compatibility.

    Returns:
        True when the limit stops the search early, None otherwise.
    """

    # Handling output
    output_file = open_output_file(namespace.output)

    edit_namespace_with_csv_io(namespace, 'keyword')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit='videos',
    )
    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)
    limit = namespace.limit

    try:
        for row, keyword in enricher.cells(namespace.column, with_rows=True):
            url = URL_template_accurate % {'subject': keyword, 'key': namespace.key}

            # `True` means "first page"; afterwards it holds the page token
            next_page = True

            while next_page:
                if next_page is True:
                    page_url = url
                else:
                    page_url = url + '&pageToken=' + next_page

                err, response, result = request_json(http, page_url)

                if err:
                    loading_bar.close()
                    die(err)
                elif response.status == 403:
                    error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                    time.sleep(seconds_to_midnight_pacific_time())

                    # Same page is requested again once the quota is back
                    continue
                elif response.status >= 400:
                    loading_bar.close()
                    die(response.status)

                next_page, data_l = get_data(result)

                for data in data_l:
                    if limit is not None:
                        if limit == 0:
                            return True

                        limit -= 1

                    enricher.writerow(row, data)
                    loading_bar.update()
    finally:
        # Also runs on the early `return True` above
        loading_bar.close()
Пример #6
0
def facebook_url_likes_action(namespace):
    """Enrich each input url with the data scraped from its like button.

    Urls are read from the `namespace.column` column of `namespace.file`.
    A failed request or an unscrapable page closes the progress bar and
    calls `die` with a message naming the url.

    Args:
        namespace: parsed CLI arguments (reads `output`, `column`, `file`,
            `select` and `total`).
    """
    output_file = open_output_file(namespace.output)

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % namespace.column
        ])

    loading_bar = tqdm(
        desc='Retrieving likes',
        dynamic_ncols=True,
        unit=' urls',
        total=namespace.total
    )

    http = create_pool()

    for row, url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        url = url.strip()

        err, html = make_request(http, url)

        if err is not None:
            loading_bar.close()
            die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.close()
            die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)

    # The bar was previously only closed on the error paths
    loading_bar.close()
Пример #7
0
def search_action(namespace, output_file):
    """Search videos for each input query and write them as enriched rows.

    Queries are read from the `namespace.column` column of `namespace.file`.
    When stdin is the input and is an interactive terminal,
    `edit_namespace_with_csv_io` is first called with the 'query' column
    name. A truthy `namespace.limit` caps the number of videos kept per
    query.

    Args:
        namespace: parsed CLI arguments (reads `file`, `column`, `select`,
            `key`, `order` and `limit`).
        output_file: file handed to the casanova enricher.
    """
    if namespace.file is sys.stdin and sys.stdin.isatty():
        edit_namespace_with_csv_io(namespace, 'query')

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=YOUTUBE_VIDEO_SNIPPET_CSV_HEADERS,
                                 keep=namespace.select)

    progress = LoadingBar('Searching videos', unit='video')

    def notify_quota_exhausted(seconds):
        # Callback given to the client as `before_sleep_until_midnight`
        progress.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(namespace.key,
                              before_sleep_until_midnight=notify_quota_exhausted)

    for row, query in enricher.cells(namespace.column, with_rows=True):
        progress.print('Searching for "%s"' % query)

        videos = client.search(query, order=namespace.order)

        # Only truncate the stream of results when a limit was requested
        if namespace.limit:
            videos = islice(videos, namespace.limit)

        for video in videos:
            progress.update()
            enricher.writerow(row, video.as_csv_row())

    progress.close()
Пример #8
0
def facebook_comments_action(namespace):
    """Scrape the comments of each Facebook url into an enriched CSV.

    Urls are read from the `namespace.column` column of `namespace.file`.
    Urls rejected by `has_facebook_comments` are reported on stderr and
    skipped.

    Args:
        namespace: parsed CLI arguments (reads `output`, `column`, `cookie`,
            `throttle`, `file` and `select`).
    """
    output_file = open_output_file(namespace.output)

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookMobileScraper(
            namespace.cookie,
            throttle=namespace.throttle
        )
    except FacebookInvalidCookieError:
        if namespace.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' %
                namespace.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=FACEBOOK_COMMENT_CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Scraping comments',
        dynamic_ncols=True,
        unit=' comments'
    )

    # Lines are counted from 1 for both the warning and the stats
    for line_number, (row, url) in enumerate(
            enricher.cells(namespace.column, with_rows=True), 1):

        if not has_facebook_comments(url):
            tqdm.write(
                'Given url (line %i) probably cannot have Facebook comments: %s'
                % (line_number, url),
                file=sys.stderr)
            continue

        for details, batch in scraper.comments(url,
                                               per_call=True,
                                               detailed=True):
            for comment in batch:
                enricher.writerow(row, comment.as_csv_row())

            loading_bar.update(len(batch))

            postfix = {
                'calls': details['calls'],
                'replies': details['replies'],
                'q': details['queue_size'],
                'posts': line_number
            }
            loading_bar.set_postfix(**postfix)

    loading_bar.close()
Пример #9
0
def comments_action(namespace, output_file):
    """Fetch the comments of each input video and write them as enriched rows.

    Each cell of the `namespace.column` column may hold a YouTube url or a
    video id; other values are skipped. Pages are fetched level by level:
    every url queued while processing one batch of responses is requested,
    concurrently, in the next batch.

    Args:
        namespace: parsed CLI arguments (reads `output`, `column`, `file`,
            `select`, `key` and `full`).
        output_file: NOTE(review): immediately replaced by the file opened
            from `namespace.output`; kept only for interface compatibility.
    """

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_youtube_video_id(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_id')
    elif is_youtube_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_url')

    # Enricher
    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    def make_requests(current_url, http=http):
        return (request_json(http, current_url), current_url)

    def error_reason(result):
        # Reason of the first reported API error, None if the payload
        # does not have the expected shape
        try:
            return result['error']['errors'][0]['reason']
        except (KeyError, IndexError, TypeError):
            return None

    def write_comment(row, data):
        enricher.writerow(row, data)
        loading_bar.update()

    for (row, url_id) in enricher.cells(namespace.column, with_rows=True):

        if is_youtube_url(url_id):
            yt_id = extract_video_id_from_youtube_url(url_id)

            if not yt_id:
                # Without this guard `url` would be unbound, or still hold
                # the previous row's url
                continue

            url = URL_TEMPLATE % {'id': yt_id, 'key': namespace.key}
        elif is_youtube_video_id(url_id):
            url = URL_TEMPLATE % {'id': url_id, 'key': namespace.key}
        else:
            continue

        url_queue = deque([url])

        while url_queue:
            with ThreadPoolExecutor(max_workers=25) as executor:
                time.sleep(0.01)
                couche = executor.map(make_requests, url_queue)

            # Urls discovered below form the next level to fetch
            url_queue = deque()

            for (err, response, result), current_url in couche:
                if err:
                    error_file.write('{} for {}'.format(err, current_url))
                    continue
                elif response.status == 403 and error_reason(
                        result) == 'commentsDisabled':
                    error_file.write(
                        'Comments are disabled for {}'.format(current_url))
                    continue
                elif response.status == 403:
                    error_file.write(
                        'Running out of API points. You will have to wait until midnight, Pacific time!'
                    )
                    time.sleep(seconds_to_midnight_pacific_time())

                    # NOTE(review): the url is not queued again after the
                    # wait, so this page is lost — confirm this is intended
                    continue
                elif response.status >= 400:
                    error_file.write('Error {} for {}'.format(
                        response.status, current_url))
                    continue

                kind = result.get('kind', None)
                next_page = result.get('nextPageToken', None)

                if next_page:
                    # NOTE(review): `current_url` may already carry a
                    # pageToken parameter from a previous page — confirm
                    # the API accepts the repeated parameter
                    url_next = current_url + '&pageToken=' + next_page
                    url_queue.append(url_next)

                # A missing or null 'items' key is treated as an empty page
                items = result.get('items') or []

                if kind == 'youtube#commentThreadListResponse':
                    # Handling comments pagination
                    for item in items:
                        snippet = item['snippet']
                        replies = item.get('replies')

                        # The top level comment is written in every case
                        write_comment(row, get_data_full(snippet, True))

                        if not replies:
                            continue

                        # Checking whether the API sent only a subset of
                        # the replies
                        if snippet['totalReplyCount'] != len(
                                replies['comments']) and namespace.full:
                            # All the replies are wanted but not all were
                            # given: queue the url listing the replies of
                            # this top level comment
                            new_url = URL_PARENTID_TEMPLATE % {
                                'id': snippet['topLevelComment']['id'],
                                'key': namespace.key
                            }
                            url_queue.append(new_url)
                        else:
                            for rep in replies['comments']:
                                write_comment(row, get_data_full(rep, False))
                else:
                    # Plain comment list: items are handled one by one
                    for item in items:
                        write_comment(row, get_data_full(item, False))

    loading_bar.close()
Пример #10
0
def twitter_scrape_action(namespace, output_file):
    """Search tweets for each input query and write them as enriched rows.

    Queries are read from the `namespace.query` column of `namespace.file`.
    When stdin is the input and is an interactive terminal,
    `edit_namespace_with_csv_io` is first called for the 'query' column.
    Each query may be rewritten through `namespace.query_template`.

    Args:
        namespace: parsed CLI arguments (reads `file`, `query`, `select`,
            `limit`, `query_template` and `include_refs`).
        output_file: file handed to the casanova enricher.
    """
    if namespace.file is sys.stdin and sys.stdin.isatty():
        edit_namespace_with_csv_io(namespace, 'query', attr_name='query')

    scraper = TwitterAPIScraper()

    # Stats
    initial_stats = {'tokens': 1, 'queries': 0}
    loading_bar = LoadingBar(
        'Collecting tweets',
        total=namespace.limit,
        unit='tweet',
        stats=initial_stats
    )

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
        keep=namespace.select
    )

    def before_sleep(retry_state):
        exc = retry_state.outcome.exception()

        # A rate limit only bumps the token counter, nothing is printed
        if isinstance(exc, TwitterPublicAPIRateLimitError):
            loading_bar.inc('tokens')
            return

        # NOTE(review): 'failures' is not part of the initial stats given to
        # the loading bar — confirm `inc` accepts unknown keys
        loading_bar.inc('failures')
        loading_bar.print(
            'Failed to call Twitter search. Will retry in %s' %
            prettyprint_seconds(retry_state.idle_for))

    for row, query in enricher.cells(namespace.query, with_rows=True):

        # Templating?
        template = namespace.query_template

        if template is not None:
            query = CUSTOM_FORMATTER.format(template, value=query)

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        tweets = scraper.search(
            query,
            limit=namespace.limit,
            before_sleep=before_sleep,
            include_referenced_tweets=namespace.include_refs,
            with_meta=True
        )

        try:
            for tweet, meta in tweets:
                loading_bar.update()

                tweet_cells = format_tweet_as_csv_row(tweet)
                meta_cells = format_meta_row(meta)
                enricher.writerow(row, tweet_cells + meta_cells)
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')

    loading_bar.close()
Пример #11
0
def fetch_action(namespace):
    """Fetch every url of the input CSV and report the outcome of each fetch.

    For each input row, the report receives the resolved url (when it
    differs from the requested one), the HTTP status, an error code, the
    name of the file written on disk and the encoding. With
    `namespace.contents_in_report`, the decoded contents are added to the
    report instead of being written under `namespace.output_dir`.

    Args:
        namespace: parsed CLI arguments (reads, among others, `resume`,
            `output`, `file`, `column`, `headers`, `grab_cookies`, `method`,
            `filename`, `filename_template`, `url_template`, `select`,
            `total`, `threads`, `throttle`, `domain_parallelism`, `timeout`,
            `standardize_encoding`, `contents_in_report`, `compress` and
            `output_dir`).
    """

    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    single_url = namespace.file is sys.stdin and is_url(namespace.column)

    if single_url:
        edit_namespace_with_csv_io(namespace, 'url')

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)

            # Each header is stored under its own name (the dict used to be
            # overwritten by the value of the last header)
            global_headers[k] = v

    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    # Resume listener
    listener = None
    resuming_reader_loading = None
    skipped = 0

    if resuming:
        resuming_reader_loading = tqdm(desc='Resuming',
                                       dynamic_ncols=True,
                                       unit=' lines')

        def listener(event, row):
            nonlocal skipped

            if event == 'resume.output':
                resuming_reader_loading.update()

            # NOTE(review): `loading_bar` is only assigned further down;
            # this assumes 'resume.input' events fire after that point —
            # confirm against the enricher
            if event == 'resume.input':
                skipped += 1
                loading_bar.set_postfix(skipped=skipped)
                loading_bar.update()

    # Enricher
    enricher = casanova.threadsafe_enricher(
        namespace.file,
        output_file,
        resumable=resuming,
        auto_resume=False,
        add=OUTPUT_ADDITIONAL_HEADERS +
        (['raw_contents'] if namespace.contents_in_report else []),
        keep=namespace.select,
        listener=listener)

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.'
            % namespace.column
        ])

    url_pos = enricher.pos[namespace.column]

    filename_pos = None

    if namespace.filename is not None:
        if namespace.filename not in enricher.pos:
            die([
                'Could not find the "%s" column containing the filenames in the given CSV file.'
                % namespace.filename
            ])

        filename_pos = enricher.pos[namespace.filename]

    indexed_input_headers = {h: i for i, h in enumerate(enricher.fieldnames)}

    if resuming:
        enricher.resume()
        resuming_reader_loading.close()

    # Loading bar
    total = namespace.total

    loading_bar = tqdm(desc='Fetching pages',
                       total=total,
                       dynamic_ncols=True,
                       unit=' urls')

    def url_key(item):
        # `item` is an (index, row) pair; returns None for an empty url cell
        url = item[1][url_pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index,
                     row,
                     resolved=None,
                     status=None,
                     error=None,
                     filename=None,
                     encoding=None,
                     data=None):

        # Missing values are reported as empty cells
        addendum = [
            resolved or '', status or '', error or '', filename or '', encoding
            or ''
        ]

        if namespace.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    fetch_kwargs = {
        'threads': namespace.threads,
        'throttle': namespace.throttle,
        'domain_parallelism': namespace.domain_parallelism
    }

    if namespace.timeout is not None:
        fetch_kwargs['timeout'] = namespace.timeout

    multithreaded_iterator = multithreaded_fetch(enricher,
                                                 key=url_key,
                                                 request_args=request_args,
                                                 **fetch_kwargs)

    for result in multithreaded_iterator:
        index, row = result.item

        # Nothing was fetched for this row: report it with empty cells
        if not result.url:

            write_output(index, row)

            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        # Updating stats
        if result.error is not None:
            errors += 1
        elif response.status >= 400:
            status_codes[response.status] += 1

        postfix = {'errors': errors}

        # Only the most frequent failing status is displayed
        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=row[filename_pos]
                            if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, row))
                    else:
                        filename = row[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            # Parenthesized on purpose: without data there is nothing to
            # decode, even when contents go in the report
            if data and (namespace.standardize_encoding
                         or namespace.contents_in_report):
                if encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(
                        encoding if encoding is not None else 'utf-8',
                        errors='replace')
                    encoding = 'utf-8'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                # Decoded text is written back as utf-8 bytes so that the
                # file matches the reported encoding and can be compressed
                # (gzip only accepts bytes)
                payload = data.encode('utf-8') if isinstance(data, str) else data

                if namespace.compress:
                    payload = gzip.compress(payload)

                with open(resource_path, 'wb') as f:
                    f.write(payload)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                index,
                row,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data)

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(index, row, error=error_code)

    # Closing the progress bar and files
    loading_bar.close()
    output_file.close()