Example #1
File: comments.py Project: zanachka/minet
def comments_action(cli_args):
    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 add=YOUTUBE_COMMENT_CSV_HEADERS,
                                 keep=cli_args.select)

    loading_bar = LoadingBar('Retrieving comments',
                             unit='comment',
                             stats={'videos': 0})

    def before_sleep_until_midnight(seconds):
        loading_bar.print(
            'API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        cli_args.key, before_sleep_until_midnight=before_sleep_until_midnight)

    for row, video in enricher.cells(cli_args.column, with_rows=True):
        generator = client.comments(video)

        for comment in generator:
            loading_bar.update()
            enricher.writerow(row, comment.as_csv_row())

        loading_bar.inc('videos')
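All of these examples follow the same enrichment pattern: casanova.enricher reads a column from the input CSV, the action fetches data for each value, and the matching output rows are written next to the original row. The following is a minimal standalone sketch of that pattern, assuming the same casanova version used by these examples, a hypothetical fetch_comments helper standing in for client.comments, and a videos.csv input with a video_url column:

import casanova

# Hypothetical stand-in for client.comments(video); would yield objects
# exposing the attributes written below.
def fetch_comments(video_url):
    return []

with open('videos.csv') as input_file, \
     open('comments.csv', 'w', newline='') as output_file:
    enricher = casanova.enricher(input_file, output_file,
                                 add=['comment_id', 'comment_text'])

    # Read the "video_url" column, keeping the original row so it can be
    # repeated next to every comment written out.
    for row, video_url in enricher.cells('video_url', with_rows=True):
        for comment in fetch_comments(video_url):
            enricher.writerow(row, [comment.id, comment.text])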
Example #2
def comments_action(namespace, output_file):

    # Handling output
    single_video = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_video:
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=YOUTUBE_COMMENT_CSV_HEADERS,
                                 keep=namespace.select)

    loading_bar = LoadingBar('Retrieving comments',
                             unit='comment',
                             stats={'videos': 0})

    def before_sleep_until_midnight(seconds):
        loading_bar.print(
            'API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key, before_sleep_until_midnight=before_sleep_until_midnight)

    for row, video in enricher.cells(namespace.column, with_rows=True):
        generator = client.comments(video)

        for comment in generator:
            loading_bar.update()
            enricher.writerow(row, comment.as_csv_row())

        loading_bar.inc('videos')

    loading_bar.close()
Example #3
File: posts.py Project: zanachka/minet
def facebook_posts_action(cli_args):
    try:
        scraper = FacebookMobileScraper(cli_args.cookie, throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook groups.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=FACEBOOK_POST_CSV_HEADERS
    )

    # Loading bar
    loading_bar = LoadingBar(
        desc='Scraping posts',
        unit='post'
    )

    translated_langs = set()

    for i, (row, url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        loading_bar.inc('groups')

        try:
            posts = scraper.posts(url)
        except FacebookInvalidTargetError:
            loading_bar.print('Given url (line %i) is probably not a Facebook group: %s' % (i, url))
            continue

        for post in posts:
            if post.translated_text and post.translated_from not in translated_langs:
                translated_langs.add(post.translated_from)
                lines = [
                    'Found text translated from %s!' % post.translated_from,
                    'Since it means original text may not be entirely retrieved you might want',
                    'to edit your Facebook language settings to add "%s" to' % post.translated_from,
                    'the "Languages you don\'t want to be offered translations for" list here:',
                    'https://www.facebook.com/settings/?tab=language'
                ]

                for line in lines:
                    loading_bar.print(line)

                loading_bar.print()

            loading_bar.update()
            enricher.writerow(row, post.as_csv_row())
Example #4
def twitter_scrape_action(cli_args):
    scraper = TwitterAPIScraper()

    # Stats
    loading_bar = LoadingBar('Collecting tweets',
                             total=cli_args.limit,
                             unit='tweet',
                             stats={
                                 'tokens': 1,
                                 'queries': 0
                             })

    enricher = casanova.enricher(cli_args.file,
                                 cli_args.output,
                                 add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
                                 keep=cli_args.select)

    def before_sleep(retry_state):
        exc = retry_state.outcome.exception()

        if isinstance(exc, TwitterPublicAPIRateLimitError):
            loading_bar.inc('tokens')

        else:
            loading_bar.inc('failures')
            loading_bar.print(
                'Failed to call Twitter search. Will retry in %s' %
                prettyprint_seconds(retry_state.idle_for))

    for row, query in enricher.cells(cli_args.query, with_rows=True):

        # Templating?
        if cli_args.query_template is not None:
            query = CUSTOM_FORMATTER.format(cli_args.query_template,
                                            value=query)

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        iterator = scraper.search(
            query,
            limit=cli_args.limit,
            before_sleep=before_sleep,
            include_referenced_tweets=cli_args.include_refs,
            with_meta=True)

        try:
            for tweet, meta in iterator:
                loading_bar.update()

                tweet_row = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, tweet_row + format_meta_row(meta))
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')
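The before_sleep hook above follows the tenacity callback shape: it receives a retry state whose .outcome holds the failed attempt and whose .idle_for gives the upcoming sleep. A minimal sketch of that hook with tenacity itself (an assumption; minet may wire its own retryer on top), using a throwaway flaky_call function:

import random

from tenacity import (Retrying, retry_if_exception_type,
                      stop_after_attempt, wait_fixed)

def flaky_call():
    # Simulated transient failure, roughly half of the time.
    if random.random() < 0.5:
        raise RuntimeError('transient failure')
    return 'ok'

def before_sleep(retry_state):
    exc = retry_state.outcome.exception()
    print('Attempt failed with %r, sleeping %.0fs before retrying' %
          (exc, retry_state.idle_for))

for attempt in Retrying(retry=retry_if_exception_type(RuntimeError),
                        wait=wait_fixed(1),
                        stop=stop_after_attempt(3),
                        before_sleep=before_sleep,
                        reraise=True):
    with attempt:
        result = flaky_call()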
Example #5
File: scrape.py Project: medialab/minet
def twitter_scrape_action(cli_args):
    scraper = TwitterAPIScraper()

    # Stats
    loading_bar = LoadingBar(
        'Collecting tweets',
        total=cli_args.limit,
        unit='tweet',
        stats={'tokens': 1, 'queries': 0}
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
        keep=cli_args.select
    )

    for row, query in enricher.cells(cli_args.query, with_rows=True):

        # Templating?
        if cli_args.query_template is not None:
            query = CUSTOM_FORMATTER.format(
                cli_args.query_template,
                value=query
            )

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        iterator = scraper.search(
            query,
            limit=cli_args.limit,
            include_referenced_tweets=cli_args.include_refs,
            with_meta=True
        )

        try:
            for tweet, meta in iterator:
                loading_bar.update()

                tweet_row = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, tweet_row + format_meta_row(meta))
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')
Example #6
    def action(cli_args):
        enricher = casanova.batch_enricher(cli_args.file,
                                           cli_args.output,
                                           keep=cli_args.select,
                                           add=csv_headers)

        loading_bar = LoadingBar(desc='Retrieving ids',
                                 unit=method_name[:-1],
                                 stats={'users': 0})

        # TODO: this is temp debug
        def listener(event, data):
            loading_bar.print(event)
            loading_bar.print(repr(data))

        wrapper = TwitterWrapper(cli_args.access_token,
                                 cli_args.access_token_secret,
                                 cli_args.api_key,
                                 cli_args.api_secret_key,
                                 listener=listener)

        resuming_state = None

        if cli_args.resume:
            resuming_state = cli_args.output.pop_state()

        for row, user in enricher.cells(cli_args.column, with_rows=True):
            loading_bar.update_stats(user=user)

            all_ids = []
            next_cursor = -1
            result = None

            if resuming_state is not None and resuming_state.last_cursor:
                next_cursor = int(resuming_state.last_cursor)

            if cli_args.ids:
                wrapper_kwargs = {'user_id': user}
            else:
                wrapper_kwargs = {'screen_name': user}

            while next_cursor != 0:
                wrapper_kwargs['cursor'] = next_cursor

                skip_in_output = None

                if resuming_state:
                    skip_in_output = resuming_state.values_to_skip
                    resuming_state = None

                try:
                    result = wrapper.call([method_name, 'ids'],
                                          **wrapper_kwargs)
                except TwitterHTTPError as e:

                    # The user does not exist
                    loading_bar.inc('users_not_found')
                    break

                if result is not None:
                    all_ids = result.get('ids', [])
                    next_cursor = result.get('next_cursor', 0)

                    loading_bar.update(len(all_ids))

                    batch = []

                    for user_id in all_ids:
                        if skip_in_output and user_id in skip_in_output:
                            continue

                        batch.append([user_id])

                    enricher.writebatch(row, batch, next_cursor or None)
                else:
                    next_cursor = 0

            loading_bar.inc('users')
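Example #6 walks Twitter's cursored "ids" endpoints: a cursor of -1 starts the listing and a next_cursor of 0 marks the end. A standalone sketch of that loop, with fetch_page as a hypothetical stand-in for wrapper.call:

def iterate_cursored_ids(fetch_page):
    cursor = -1

    while cursor != 0:
        # Expected to return {'ids': [...], 'next_cursor': int} or None.
        result = fetch_page(cursor)

        if result is None:
            break

        yield from result.get('ids', [])
        cursor = result.get('next_cursor', 0)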
Example #7
File: scrape.py Project: zanachka/minet
def scrape_action(cli_args):

    # Parsing scraper definition
    try:
        scraper = Scraper(cli_args.scraper, strain=cli_args.strain)
    except DefinitionInvalidFormatError:
        die(['Unknown scraper format!', 'It should be a JSON or YAML file.'])
    except FileNotFoundError:
        die('Could not find scraper file!')
    except InvalidScraperError as error:
        print('Your scraper is invalid! You need to fix the following errors:',
              file=sys.stderr)
        print(file=sys.stderr)
        sys.stderr.write(
            report_scraper_validation_errors(error.validation_errors))
        die()
    except CSSSelectorTooComplex:
        die([
            'Your strainer\'s CSS selector %s is too complex.' %
            colored(cli_args.strain, 'blue'),
            'You cannot use relations to create a strainer.',
            'Try to simplify the selector you passed to --strain.'
        ])

    if cli_args.validate:
        print('Your scraper is valid.', file=sys.stderr)
        sys.exit(0)

    if scraper.headers is None and cli_args.format == 'csv':
        die([
            'Your scraper does not yield tabular data.',
            'Try changing it or setting --format to "jsonl".'
        ])

    loading_bar = LoadingBar(desc='Scraping pages',
                             total=cli_args.total,
                             unit='page')

    worker_args = (cli_args.format, cli_args.separator)

    def on_irrelevant_row(reason, row):
        loading_bar.update()

    if cli_args.glob is not None:
        files = create_glob_iterator(cli_args, worker_args)
    else:
        reader = casanova.reader(cli_args.report)

        try:
            files = create_report_iterator(cli_args,
                                           reader,
                                           worker_args=worker_args,
                                           on_irrelevant_row=on_irrelevant_row)
        except NotADirectoryError:
            loading_bar.die([
                'Could not find the "%s" directory!' % cli_args.input_dir,
                'Did you forget to specify it with -i/--input-dir?'
            ])

    if cli_args.format == 'csv':
        output_writer = csv.DictWriter(cli_args.output,
                                       fieldnames=scraper.headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(cli_args.output)

    pool = LazyPool(cli_args.processes,
                    initializer=init_process,
                    initargs=(scraper.definition, cli_args.strain))

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                if isinstance(error, (ScraperEvalError, ScraperEvalTypeError,
                                      ScraperEvalNoneError)):
                    loading_bar.print(report_scraper_evaluation_error(error),
                                      end='')
                loading_bar.inc('errors')
                continue

            for item in items:
                output_writer.writerow(item)
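Example #7 picks its output writer from --format: csv.DictWriter with the scraper's headers for tabular output, ndjson.writer otherwise. A minimal sketch of that switch, assuming the same ndjson package and a hypothetical make_output_writer helper:

import csv
import sys

import ndjson

def make_output_writer(fp, fmt, fieldnames=None):
    if fmt == 'csv':
        writer = csv.DictWriter(fp, fieldnames=fieldnames)
        writer.writeheader()
        return writer

    # Both branches return an object exposing .writerow().
    return ndjson.writer(fp)

writer = make_output_writer(sys.stdout, 'csv', fieldnames=['url', 'title'])
writer.writerow({'url': 'https://example.com', 'title': 'Example page'})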
Example #8
def twitter_user_tweets_action(namespace, output_file):

    wrapper = TwitterWrapper(namespace.access_token,
                             namespace.access_token_secret, namespace.api_key,
                             namespace.api_secret_key)

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=TWEET_FIELDS)

    loading_bar = LoadingBar('Retrieving tweets',
                             total=namespace.total,
                             unit='tweet')

    for row, user in enricher.cells(namespace.column, with_rows=True):
        max_id = None

        loading_bar.update_stats(user=user)

        while True:
            if namespace.ids:
                kwargs = {'user_id': user}
            else:
                kwargs = {'screen_name': user}

            kwargs['include_rts'] = not namespace.exclude_retweets
            kwargs['count'] = TWITTER_API_MAX_STATUSES_COUNT
            kwargs['tweet_mode'] = 'extended'

            if max_id is not None:
                kwargs['max_id'] = max_id

            loading_bar.inc('calls')

            try:
                tweets = wrapper.call(['statuses', 'user_timeline'], **kwargs)
            except TwitterHTTPError as e:
                loading_bar.inc('errors')

                if e.e.code == 404:
                    loading_bar.print('Could not find user "%s"' % user)
                else:
                    loading_bar.print(
                        'An error happened when attempting to retrieve tweets from "%s"'
                        % user)

                break

            if not tweets:
                break

            loading_bar.update(len(tweets))

            max_id = min(int(tweet['id_str']) for tweet in tweets) - 1

            for tweet in tweets:
                tweet = normalize_tweet(tweet, collection_source='api')
                addendum = format_tweet_as_csv_row(tweet)

                enricher.writerow(row, addendum)

        loading_bar.inc('done')

    loading_bar.close()
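Example #8 pages backwards through a user timeline with Twitter's max_id convention: the smallest id in each batch, minus one, becomes the ceiling for the next request. A standalone sketch of that loop, with fetch_timeline_page as a hypothetical stand-in for wrapper.call(['statuses', 'user_timeline'], ...):

def iterate_timeline(fetch_timeline_page):
    max_id = None

    while True:
        # Expected to return a list of dicts carrying an 'id_str' key.
        tweets = fetch_timeline_page(max_id)

        if not tweets:
            break

        yield from tweets
        max_id = min(int(tweet['id_str']) for tweet in tweets) - 1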