示例#1
0
文件: search.py 项目: zanachka/minet
def mediacloud_search_action(cli_args):
    """Search Mediacloud stories and dump them as CSV rows.

    The header row `MEDIACLOUD_STORIES_CSV_HEADER` is written first to
    `cli_args.output`, followed by one row per story yielded by the client.
    Unless `cli_args.skip_count` is set, the number of matching stories is
    fetched beforehand to feed the loading bar's total.

    A `MediacloudServerError` raised while counting or searching is reported
    through `loading_bar.die`.
    """
    output_writer = csv.writer(cli_args.output)
    output_writer.writerow(MEDIACLOUD_STORIES_CSV_HEADER)

    client = MediacloudAPIClient(cli_args.token)

    # Filters shared by both the count and the search calls.
    search_filters = dict(
        collections=cli_args.collections,
        medias=cli_args.medias,
        publish_day=cli_args.publish_day,
        publish_month=cli_args.publish_month,
        publish_year=cli_args.publish_year,
        filter_query=cli_args.filter_query
    )

    loading_bar = LoadingBar(
        'Searching stories',
        unit='story',
        unit_plural='stories'
    )

    try:
        if not cli_args.skip_count:
            loading_bar.update_total(
                client.count(cli_args.query, **search_filters)
            )

        for story in client.search(cli_args.query, **search_filters):
            output_writer.writerow(story.as_csv_row())
            loading_bar.update()

    except MediacloudServerError as error:
        loading_bar.die([
            'Aborted due to a mediacloud server error:',
            error.server_error
        ])
示例#2
0
文件: extract.py 项目: medialab/minet
def extract_action(cli_args):
    """Extract content from files described by a report or matched by a glob.

    Rows are read and written back through a casanova enricher. Each row is
    completed with the worker's result, or with a formatted error when the
    row could not be processed.
    """
    # Fall back to the default content folder when neither a glob pattern
    # nor an input directory was given.
    if cli_args.glob is None and cli_args.input_dir is None:
        cli_args.input_dir = DEFAULT_CONTENT_FOLDER

    input_data = cli_args.report

    # A glob pattern replaces the report with a dummy CSV file built from it.
    if cli_args.glob is not None:
        input_data = dummy_csv_file_from_glob(cli_args.glob, cli_args.input_dir)

    enricher = casanova.enricher(
        input_data,
        cli_args.output,
        keep=cli_args.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=cli_args.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row, i):
        # Rows skipped by the report iterator still count as processed and
        # still get an output line carrying the reason.
        message = 'Row n°{n} could not be processed: {reason}'.format(n=i + 1, reason=reason)

        loading_bar.update()
        loading_bar.print(message)
        enricher.writerow(row, format_error(reason))

    # The input directory must exist only when we are not globbing and the
    # report has no "raw_contents" column.
    relies_on_input_dir = (
        cli_args.glob is None and
        'raw_contents' not in enricher.headers
    )

    if relies_on_input_dir and not isdir(cli_args.input_dir):
        loading_bar.die([
            'Could not find the "%s" directory!' % cli_args.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    files = create_report_iterator(
        cli_args,
        enricher,
        on_irrelevant_row=on_irrelevant_row
    )

    pool = LazyPool(cli_args.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            # Exactly one output line per row: error, missing result or
            # the actual result.
            if error is not None:
                addendum = format_error(report_error(error))
            elif result is None:
                addendum = format_error('no-result')
            else:
                addendum = result

            enricher.writerow(row, addendum)
示例#3
0
def mediacloud_search_action(namespace, output_file):
    """Search Mediacloud stories and write them as CSV rows to `output_file`.

    The header row `MEDIACLOUD_STORIES_CSV_HEADER` is written first. Stories
    are requested from the client with `format='csv_row'` and written as
    yielded. Unless `namespace.skip_count` is set, the number of matching
    stories is fetched beforehand to feed the loading bar's total.

    A `MediacloudServerError` raised while counting or searching is reported
    through `loading_bar.die`.
    """
    output_writer = csv.writer(output_file)
    output_writer.writerow(MEDIACLOUD_STORIES_CSV_HEADER)

    client = MediacloudAPIClient(namespace.token)

    # Filters shared by both the count and the search calls.
    search_filters = dict(
        collections=namespace.collections,
        medias=namespace.medias,
        publish_day=namespace.publish_day,
        publish_month=namespace.publish_month,
        publish_year=namespace.publish_year
    )

    loading_bar = LoadingBar(
        'Searching stories',
        unit='story',
        unit_plural='stories'
    )

    try:
        if not namespace.skip_count:
            loading_bar.update_total(
                client.count(namespace.query, **search_filters)
            )

        rows = client.search(namespace.query, format='csv_row', **search_filters)

        for csv_row in rows:
            output_writer.writerow(csv_row)
            loading_bar.update()

    except MediacloudServerError as error:
        loading_bar.die([
            'Aborted due to a mediacloud server error:',
            error.server_error
        ])
示例#4
0
def twitter_scrape_action(cli_args):
    """Scrape tweets for every query found in the input CSV file.

    Each query cell (optionally passed through `cli_args.query_template`) is
    searched with a `TwitterAPIScraper`, and every tweet found is written as
    an enriched row made of the tweet fields followed by its meta fields.
    """
    scraper = TwitterAPIScraper()

    # Stats
    loading_bar = LoadingBar(
        'Collecting tweets',
        total=cli_args.limit,
        unit='tweet',
        stats={'tokens': 1, 'queries': 0}
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
        keep=cli_args.select
    )

    def before_sleep(retry_state):
        # Hook handed to the scraper, called with the retry state before
        # it waits for another attempt.
        error = retry_state.outcome.exception()

        if not isinstance(error, TwitterPublicAPIRateLimitError):
            # NOTE(review): 'failures' is not part of the initial stats
            # given to the loading bar -- confirm `inc` copes with that.
            loading_bar.inc('failures')
            loading_bar.print(
                'Failed to call Twitter search. Will retry in %s' %
                prettyprint_seconds(retry_state.idle_for))
            return

        # Rate limit errors are tracked under the 'tokens' stat.
        loading_bar.inc('tokens')

    for row, query in enricher.cells(cli_args.query, with_rows=True):

        # Templating?
        if cli_args.query_template is not None:
            query = CUSTOM_FORMATTER.format(
                cli_args.query_template,
                value=query
            )

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        tweets = scraper.search(
            query,
            limit=cli_args.limit,
            before_sleep=before_sleep,
            include_referenced_tweets=cli_args.include_refs,
            with_meta=True
        )

        try:
            for tweet, meta in tweets:
                loading_bar.update()

                addendum = format_tweet_as_csv_row(tweet) + format_meta_row(meta)
                enricher.writerow(row, addendum)
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')
示例#5
0
def extract_action(namespace):
    """Extract content from the files listed in a report.

    Rows are read from `namespace.report` and written to the output file
    through a casanova enricher. Each row is completed with the worker's
    result, or with a formatted error when the row could not be processed.

    The output file is opened here and is always closed before returning,
    including when an exception (or an exit triggered by `loading_bar.die`)
    interrupts the processing.
    """
    output_file = open_output_file(namespace.output)

    # Anything below may raise, so the file is released in a `finally`
    # clause instead of only on the success path.
    try:
        enricher = casanova.enricher(
            namespace.report,
            output_file,
            keep=namespace.select,
            add=OUTPUT_ADDITIONAL_HEADERS
        )

        loading_bar = LoadingBar(
            desc='Extracting content',
            total=namespace.total,
            unit='doc'
        )

        def on_irrelevant_row(reason, row):
            # Skipped rows still count as processed and still get an
            # output line carrying the reason.
            loading_bar.update()
            enricher.writerow(row, format_error(reason))

        try:
            files = create_report_iterator(
                namespace,
                enricher,
                on_irrelevant_row=on_irrelevant_row
            )
        except NotADirectoryError:
            # NOTE(review): `die` presumably terminates the program,
            # otherwise `files` would be unbound below -- confirm.
            loading_bar.die([
                'Could not find the "%s" directory!' % namespace.input_dir,
                'Did you forget to specify it with -i/--input-dir?'
            ])

        pool = LazyPool(namespace.processes)

        loading_bar.update_stats(p=pool.processes)

        with pool:
            for error, row, result in pool.imap_unordered(worker, files):
                loading_bar.update()

                if error is not None:
                    enricher.writerow(row, format_error(report_error(error)))
                    continue

                if result is None:
                    enricher.writerow(row, format_error('no-content'))
                    continue

                enricher.writerow(row, result)

        loading_bar.close()
    finally:
        output_file.close()
示例#6
0
def mediacloud_medias_action(cli_args):
    """Enrich a CSV file with Mediacloud information about each media id.

    Every row gets the media's CSV fields minus the first one. When
    `cli_args.feeds` is given, the number of feeds is appended to the row
    and each feed is written to that separate CSV output.

    A `MediacloudServerError` is reported through `loading_bar.die`.
    """
    feeds_writer = None

    # Slicing yields a new sequence; the first header is dropped, matching
    # the `[1:]` applied to each media row below.
    added_headers = MEDIACLOUD_MEDIA_CSV_HEADER[1:]

    if cli_args.feeds:
        added_headers.append('feeds')
        feeds_writer = csv.writer(cli_args.feeds)
        feeds_writer.writerow(MEDIACLOUD_FEED_CSV_HEADER)

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=added_headers
    )

    loading_bar = LoadingBar(
        desc='Fetching medias',
        unit='media',
        total=cli_args.total
    )

    client = MediacloudAPIClient(cli_args.token)

    for row, media_id in enricher.cells(cli_args.column, with_rows=True):

        try:
            media_row = client.media(media_id).as_csv_row()[1:]

            if not cli_args.feeds:
                enricher.writerow(row, media_row)
            else:
                feeds = client.feeds(media_id)

                enricher.writerow(row, media_row + [len(feeds)])

                for feed in feeds:
                    feeds_writer.writerow(feed.as_csv_row())
        except MediacloudServerError as error:
            loading_bar.die([
                'Aborted due to a mediacloud server error:',
                error.server_error
            ])

        loading_bar.update()
示例#7
0
文件: scrape.py 项目: medialab/minet
def twitter_scrape_action(cli_args):
    """Scrape tweets for every query found in the input CSV file.

    Each query cell (optionally passed through `cli_args.query_template`) is
    searched with a `TwitterAPIScraper`, and every tweet found is written as
    an enriched row made of the tweet fields followed by its meta fields.
    """
    scraper = TwitterAPIScraper()

    # Stats
    initial_stats = {'tokens': 1, 'queries': 0}
    loading_bar = LoadingBar(
        'Collecting tweets',
        total=cli_args.limit,
        unit='tweet',
        stats=initial_stats
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
        keep=cli_args.select
    )

    for row, query in enricher.cells(cli_args.query, with_rows=True):

        # Templating?
        if cli_args.query_template is not None:
            query = CUSTOM_FORMATTER.format(cli_args.query_template, value=query)

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        tweets = scraper.search(
            query,
            limit=cli_args.limit,
            include_referenced_tweets=cli_args.include_refs,
            with_meta=True
        )

        try:
            for tweet, meta in tweets:
                loading_bar.update()

                addendum = format_tweet_as_csv_row(tweet) + format_meta_row(meta)
                enricher.writerow(row, addendum)
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')
示例#8
0
def facebook_url_likes_action(cli_args):
    """Enrich a CSV file with data scraped from each url's like button.

    Rows whose url cell is empty or is not a url are written back without
    any addendum. A failed request or a failed scrape is reported through
    `loading_bar.die`.
    """
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=REPORT_HEADERS,
        total=cli_args.total,
        prebuffer_bytes=DEFAULT_PREBUFFER_BYTES
    )

    column_is_missing = cli_args.column not in enricher.pos

    if column_is_missing:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % cli_args.column
        ])

    loading_bar = LoadingBar(
        desc='Retrieving likes',
        unit='url',
        total=enricher.total
    )

    for row, cell in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        url = cell.strip()

        # Nothing to fetch: write the row back untouched.
        if not (url and is_url(url, require_protocol=False)):
            enricher.writerow(row)
            continue

        request_error, html = make_request(url)

        if request_error is not None:
            loading_bar.die('An error occurred while fetching like button for this url: %s' % url)

        addendum = scrape(html)

        if addendum is None:
            loading_bar.die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, addendum)
示例#9
0
文件: scrape.py 项目: zanachka/minet
def scrape_action(cli_args):
    """Apply a scraper definition to a set of files and write what it yields.

    The scraper is built from `cli_args.scraper`; definition problems are
    reported on stderr or through `die`. With `cli_args.validate`, the
    function only reports that the scraper is valid and exits with status 0.
    Otherwise files come either from a glob or from a report, are processed
    in a `LazyPool`, and the scraped items are written as CSV or through
    `ndjson.writer`, depending on `cli_args.format`.
    """

    # Parsing scraper definition
    try:
        scraper = Scraper(cli_args.scraper, strain=cli_args.strain)
    except DefinitionInvalidFormatError:
        die([
            'Unknown scraper format!',
            'It should be a JSON or YAML file.'
        ])
    except FileNotFoundError:
        die('Could not find scraper file!')
    except InvalidScraperError as error:
        print(
            'Your scraper is invalid! You need to fix the following errors:',
            file=sys.stderr
        )
        print(file=sys.stderr)
        sys.stderr.write(
            report_scraper_validation_errors(error.validation_errors)
        )
        die()
    except CSSSelectorTooComplex:
        highlighted_selector = colored(cli_args.strain, 'blue')
        die([
            'Your strainer\'s CSS selector %s is too complex.' % highlighted_selector,
            'You cannot use relations to create a strainer.',
            'Try to simplify the selector you passed to --strain.'
        ])

    if cli_args.validate:
        print('Your scraper is valid.', file=sys.stderr)
        sys.exit(0)

    # CSV output requires the scraper to expose headers.
    if scraper.headers is None and cli_args.format == 'csv':
        die([
            'Your scraper does not yield tabular data.',
            'Try changing it or setting --format to "jsonl".'
        ])

    loading_bar = LoadingBar(
        desc='Scraping pages',
        total=cli_args.total,
        unit='page'
    )

    worker_args = (cli_args.format, cli_args.separator)

    def on_irrelevant_row(reason, row):
        # Skipped rows still count as processed pages.
        loading_bar.update()

    if cli_args.glob is None:
        reader = casanova.reader(cli_args.report)

        try:
            files = create_report_iterator(
                cli_args,
                reader,
                worker_args=worker_args,
                on_irrelevant_row=on_irrelevant_row
            )
        except NotADirectoryError:
            loading_bar.die([
                'Could not find the "%s" directory!' % cli_args.input_dir,
                'Did you forget to specify it with -i/--input-dir?'
            ])
    else:
        files = create_glob_iterator(cli_args, worker_args)

    if cli_args.format == 'csv':
        output_writer = csv.DictWriter(
            cli_args.output,
            fieldnames=scraper.headers
        )
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(cli_args.output)

    pool = LazyPool(
        cli_args.processes,
        initializer=init_process,
        initargs=(scraper.definition, cli_args.strain)
    )

    loading_bar.update_stats(p=pool.processes)

    # Only these error types get a detailed report; any error is counted.
    reportable_errors = (
        ScraperEvalError,
        ScraperEvalTypeError,
        ScraperEvalNoneError
    )

    with pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is None:
                for item in items:
                    output_writer.writerow(item)
                continue

            if isinstance(error, reportable_errors):
                loading_bar.print(
                    report_scraper_evaluation_error(error),
                    end=''
                )

            loading_bar.inc('errors')
示例#10
0
文件: utils.py 项目: medialab/minet
    def action(cli_args):
        """Fetch items from the CrowdTangle API and write them to the output.

        Items are written through a casanova CSV writer when
        `cli_args.format` is 'csv', and through `ndjson.writer` otherwise.
        When resuming, the state popped from the output is used as the new
        `cli_args.end_date`.
        """

        resume = getattr(cli_args, 'resume', False)

        # Validation
        if resume:
            if cli_args.sort_by != 'date':
                die('Cannot --resume if --sort_by is not `date`.')

            if cli_args.format != 'csv':
                die('Cannot --resume jsonl format yet.')

        as_csv = cli_args.format == 'csv'

        if as_csv:
            # Headers may be given as-is or as a function of the CLI args.
            if callable(csv_headers):
                fieldnames = csv_headers(cli_args)
            else:
                fieldnames = csv_headers

            writer = casanova.writer(cli_args.output, fieldnames)
        else:
            writer = ndjson.writer(cli_args.output)

        # Acquiring state from resumer
        if resume:
            last_date = cli_args.output.pop_state()

            if last_date is not None:
                cli_args.end_date = last_date.replace(' ', 'T')
                print_err('Resuming from: %s' % cli_args.end_date)

        if callable(announce):
            print_err(announce(cli_args))

        # Loading bar
        loading_bar = LoadingBar(
            desc='Fetching %s' % item_name,
            unit=item_name[:-1],
            total=cli_args.limit
        )

        args = get_args(cli_args) if callable(get_args) else []

        client = CrowdTangleAPIClient(
            cli_args.token,
            rate_limit=cli_args.rate_limit
        )

        # The client method to call is looked up by name.
        create_iterator = getattr(client, method_name)
        iterator = create_iterator(
            *args,
            limit=cli_args.limit,
            raw=not as_csv,
            per_call=True,
            detailed=True,
            namespace=cli_args
        )

        try:
            for details, items in iterator:
                loading_bar.update(len(items))

                if details is not None:
                    loading_bar.update_stats(**details)

                for item in items:
                    writer.writerow(item.as_csv_row() if as_csv else item)

        except CrowdTangleInvalidTokenError:
            loading_bar.die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])
示例#11
0
    def action(cli_args):
        """Fetch items from the CrowdTangle API and write them to the output.

        Items are written through a casanova CSV writer when
        `cli_args.format` is 'csv', and through `ndjson.writer` otherwise.
        When resuming, the state popped from the output is used as the new
        `cli_args.end_date`. A `before_sleep` hook is handed to the client
        to print why a call failed and how long the wait will be.
        """

        resume = getattr(cli_args, 'resume', False)

        # Validation
        if resume:
            if cli_args.sort_by != 'date':
                die('Cannot --resume if --sort_by is not `date`.')

            if cli_args.format != 'csv':
                die('Cannot --resume jsonl format yet.')

        as_csv = cli_args.format == 'csv'

        if as_csv:
            # Headers may be given as-is or as a function of the CLI args.
            if callable(csv_headers):
                fieldnames = csv_headers(cli_args)
            else:
                fieldnames = csv_headers

            writer = casanova.writer(cli_args.output, fieldnames)
        else:
            writer = ndjson.writer(cli_args.output)

        # Acquiring state from resumer
        if resume:
            last_date = cli_args.output.pop_state()

            if last_date is not None:
                cli_args.end_date = last_date.replace(' ', 'T')
                print_err('Resuming from: %s' % cli_args.end_date)

        if callable(announce):
            print_err(announce(cli_args))

        # Loading bar
        loading_bar = LoadingBar(
            desc='Fetching %s' % item_name,
            unit=item_name[:-1],
            total=cli_args.limit
        )

        client = CrowdTangleAPIClient(
            cli_args.token,
            rate_limit=cli_args.rate_limit
        )

        args = get_args(cli_args) if callable(get_args) else []

        def before_sleep(retry_state):
            error = retry_state.outcome.exception()

            # Anything that is neither a rate limit nor an invalid JSON
            # payload is reported as a server timeout.
            if isinstance(error, CrowdTangleRateLimitExceeded):
                reason = 'Call failed because of rate limit!'
            elif isinstance(error, CrowdTangleInvalidJSONError):
                reason = 'Call failed because of invalid JSON payload!'
            else:
                reason = 'Call failed because of server timeout!'

            delay = prettyprint_seconds(retry_state.idle_for, granularity=2)

            loading_bar.print(
                '%s\nWill wait for %s before attempting again.' % (reason, delay)
            )

        # The client method to call is looked up by name.
        create_iterator = getattr(client, method_name)
        iterator = create_iterator(
            *args,
            limit=cli_args.limit,
            raw=not as_csv,
            per_call=True,
            detailed=True,
            namespace=cli_args,
            before_sleep=before_sleep
        )

        try:
            for details, items in iterator:
                loading_bar.update(len(items))

                if details is not None:
                    loading_bar.update_stats(**details)

                for item in items:
                    writer.writerow(item.as_csv_row() if as_csv else item)

        except CrowdTangleInvalidTokenError:
            loading_bar.die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])