Example #1
def search_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    edit_namespace_with_csv_io(namespace, 'keyword')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit='videos',
    )
    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)
    limit = namespace.limit

    for (row, keyword) in enricher.cells(namespace.column, with_rows=True):
        url = URL_template_accurate % {'subject': keyword, 'key': namespace.key}
        # `next_page` starts as True for the first request, then holds the
        # API's `pageToken` for every subsequent page
        next_page = True

        while next_page:
            if next_page is True:
                err, response, result = request_json(http, url)
            else:
                url_next = url + '&pageToken=' + next_page
                err, response, result = request_json(http, url_next)
            if err:
                die(err)
            elif response.status == 403:
                error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                time.sleep(seconds_to_midnight_pacific_time())
                continue
            elif response.status >= 400:
                die(response.status)
            next_page, data_l = get_data(result)
            for data in data_l:
                # Stop once the user-provided limit is exhausted
                if limit is not None:
                    if limit == 0:
                        return True
                    limit -= 1

                loading_bar.update()
                enricher.writerow(row, data)
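The YouTube snippets in this collection all repeat the same `pageToken` idiom: `next_page` starts as `True`, then carries the token of the next page. A minimal sketch of how that loop could be factored into a reusable generator, assuming `request_json` returns an `(err, response, data)` triple and `get_data` returns a `(next_page_token, items)` pair as in this example:

# Hypothetical helper factoring out the pagination idiom above. Assumes
# request_json(http, url) -> (err, response, data) and
# get_data(result) -> (next_page_token, items), as in Example #1.
def paginated_items(http, url):
    next_page = True

    while next_page:
        current_url = url if next_page is True else url + '&pageToken=' + next_page
        err, response, result = request_json(http, current_url)

        if err:
            raise err

        next_page, items = get_data(result)
        yield from items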
Example #2
def crowdtangle_post(http, post_id, token=None, format='csv_dict_row'):

    if token is None:
        raise CrowdTangleMissingTokenError

    if format not in CROWDTANGLE_OUTPUT_FORMATS:
        raise TypeError('minet.crowdtangle.post: unknown `format`.')

    # Fetching
    api_url = URL_TEMPLATE % (post_id, token)

    err, response, data = request_json(http, api_url)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    post = nested_get(['result', 'posts', 0], data)

    if post is None:
        return

    if format == 'csv_dict_row':
        return format_post(post, as_dict=True)
    elif format == 'csv_row':
        return format_post(post)

    return post
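Note that `crowdtangle_post` returns `None` when no post matches the id, so callers should guard for it. A hypothetical call site, assuming `create_pool` from the other examples and placeholder values for the post id and token:

http = create_pool()

# '12345_67890' and 'MY_TOKEN' are placeholders, not real values
post = crowdtangle_post(http, '12345_67890', token='MY_TOKEN')

if post is not None:
    print(post)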
Example #3
def videos_action(namespace, output_file):

    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 add=REPORT_HEADERS,
                                 keep=namespace.select)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()
    column = namespace.column

    def rows_with_videos_id():
        for row, ytb_data in enricher.cells(column, with_rows=True):
            video_id = None

            if is_youtube_video_id(ytb_data):
                video_id = ytb_data
            elif is_youtube_url(ytb_data):
                video_id = extract_video_id_from_youtube_url(ytb_data)

            yield row, video_id

    # Query the API in batches of 50 ids (the maximum the videos endpoint accepts per call)
    for chunk in chunks_iter(rows_with_videos_id(), 50):

        all_ids = [video_id for _, video_id in chunk if video_id]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for row, video_id in chunk:
            if video_id is None or video_id in not_available:
                enricher.writerow(row)
            else:
                enricher.writerow(row, data[video_id])
Example #4
def crowdtangle_lists(http, token=None, format='csv_dict_row'):

    if token is None:
        raise CrowdTangleMissingTokenError

    if format not in CROWDTANGLE_OUTPUT_FORMATS:
        raise TypeError('minet.crowdtangle.lists: unknown `format`.')

    # Fetching
    api_url = URL_TEMPLATE % token

    err, response, data = request_json(http, api_url)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    lists = nested_get(['result', 'lists'], data)

    if format == 'csv_dict_row':
        return [format_list(l, as_dict=True) for l in lists]
    elif format == 'csv_row':
        return [format_list(l) for l in lists]

    return lists
Example #5
def crowdtangle_summary(http,
                        link,
                        token=None,
                        start_date=None,
                        with_top_posts=False,
                        sort_by=CROWDTANGLE_SUMMARY_DEFAULT_SORT_TYPE,
                        format='csv_dict_row',
                        platforms=None):

    if token is None:
        raise CrowdTangleMissingTokenError

    if format not in CROWDTANGLE_OUTPUT_FORMATS:
        raise TypeError('minet.crowdtangle.summary: unknown `format`.')

    if not isinstance(start_date, str):
        raise TypeError(
            'minet.crowdtangle.summary: expecting a `start_date` kwarg.')

    if sort_by not in CROWDTANGLE_SUMMARY_SORT_TYPES:
        raise TypeError('minet.crowdtangle.summary: unknown `sort_by`.')

    # Fetching
    api_url = url_forge(link, token, start_date, sort_by, platforms,
                        with_top_posts)

    err, response, data = request_json(http, api_url)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    stats = nested_get(['result', 'summary', 'facebook'], data)
    posts = nested_get(['result', 'posts'], data) if with_top_posts else None

    if stats is not None:
        if format == 'csv_dict_row':
            stats = format_summary(stats, as_dict=True)
        elif format == 'csv_row':
            stats = format_summary(stats)

    if not with_top_posts:
        return stats

    if posts is not None:
        if format == 'csv_dict_row':
            posts = [format_post(post, as_dict=True) for post in posts]
        elif format == 'csv_row':
            posts = [format_post(post) for post in posts]

    return stats, posts
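Since the return shape depends on `with_top_posts` (a lone `stats` value versus a `(stats, posts)` pair), call sites must unpack accordingly. A hypothetical call, with placeholder link and token:

http = create_pool()  # assumed, as in the other examples

stats, posts = crowdtangle_summary(
    http,
    'https://example.com/some-article',  # placeholder link
    token='MY_TOKEN',                    # placeholder token
    start_date='2020-01-01',
    with_top_posts=True
)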
Example #6
def videos_action(namespace, output_file):

    enricher = CSVEnricher(
        namespace.file,
        namespace.column,
        output_file,
        report_headers=REPORT_HEADERS,
        select=namespace.select.split(',') if namespace.select else None
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    for chunk in gen_chunks(enricher):

        all_ids = [row[0] for row in chunk if row[0]]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for item in chunk:
            video_id, line = item

            if video_id is None:
                enricher.write_empty(line)

            elif video_id in not_available:
                line_empty = [video_id] + [''] * (len(REPORT_HEADERS) - 1)
                enricher.write(line, line_empty)

            else:
                enricher.write(line, data[video_id])
Example #7
def comments_action(namespace, output_file):

    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()

    url = URL_TEMPLATE % {'id': namespace.id, 'key': namespace.key}
    next_page = True

    while next_page:

        if next_page is True:
            err, response, result = request_json(http, url)
        else:
            url_next = url + '&pageToken=' + next_page
            err, response, result = request_json(http, url_next)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        next_page, data = get_data(result)

        for comment in data:
            loading_bar.update()
            writer.writerow(comment)
Example #8
def mediacloud_topic_stories(http,
                             token,
                             topic_id,
                             link_id=None,
                             media_id=None,
                             from_media_id=None,
                             format='csv_dict_row'):

    while True:
        url = url_forge(
            token,
            topic_id=topic_id,
            link_id=link_id,
            media_id=media_id,
            from_media_id=from_media_id,
        )

        err, _, data = request_json(http, url)

        if err:
            raise err

        if 'stories' not in data or len(data['stories']) == 0:
            return

        next_link_id = get_next_link_id(data)

        for story in data['stories']:
            if format == 'csv_dict_row':
                yield format_topic_story(story, next_link_id, as_dict=True)
            elif format == 'csv_row':
                yield format_topic_story(story, next_link_id)
            else:
                yield story

        if next_link_id is None:
            return

        link_id = next_link_id
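`mediacloud_topic_stories` is a generator that transparently follows `link_id` pagination until `get_next_link_id` yields nothing, so it can be consumed lazily. A hypothetical call site, with placeholder token and topic id:

http = create_pool()  # assumed, as in the other examples

for story in mediacloud_topic_stories(http, 'MY_TOKEN', topic_id=42):
    # with the default 'csv_dict_row' format, each item is a formatted dict
    print(story)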
Example #9
    def generator():
        last_processed_stories_id = None

        while True:
            url = url_forge(
                token,
                query,
                collections=collections,
                count=count,
                last_processed_stories_id=last_processed_stories_id
            )

            err, response, data = request_json(http, url)

            if err:
                raise err

            if response.status >= 500:
                raise MediacloudServerError(server_error=data.get('error'))

            if count:
                yield data['count']
                return

            for story in data:
                if format == 'csv_dict_row':
                    yield format_story(story, as_dict=True)
                elif format == 'csv_row':
                    yield format_story(story)
                else:
                    yield story

            last_processed_stories_id = get_last_processed_stories_id(data)

            if last_processed_stories_id is None:
                return
Example #10
def make_requests(current_url, http=http):
    return (request_json(http, current_url), current_url)
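`make_requests` wraps the `(err, response, data)` triple together with the URL that produced it, which is useful when the calls are dispatched through a thread pool and results come back out of order. A hypothetical consumer, assuming `http` is in scope:

(err, response, data), requested_url = make_requests('https://example.com/api')  # placeholder URL

if err is None and response.status == 200:
    print(requested_url, data)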
Example #11
def comments_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_youtube_video_id(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_id')
    elif is_youtube_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_url')

    # Enricher
    enricher = casanova.enricher(namespace.file,
                                 output_file,
                                 keep=namespace.select,
                                 add=CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    for (row, url_id) in enricher.cells(namespace.column, with_rows=True):

        if is_youtube_url(url_id):
            yt_id = extract_video_id_from_youtube_url(url_id)
            # Skip the row if no id can be extracted, instead of silently
            # reusing the previous iteration's `url`
            if not yt_id:
                continue
            url = URL_TEMPLATE % {'id': yt_id, 'key': namespace.key}
        elif is_youtube_video_id(url_id):
            url = URL_TEMPLATE % {'id': url_id, 'key': namespace.key}
        else:
            continue
        # BFS over the comment pages, starting from the video's comment threads
        url_queue = deque([url])

        while url_queue:
            current_url = url_queue.popleft()
            err, response, result = request_json(http, current_url)
            if err:
                error_file.write('{} for {}'.format(err, current_url))
                continue
            elif response.status == 403 and result.get('error').get(
                    'errors')[0].get('reason') == 'commentsDisabled':
                error_file.write(
                    'Comments are disabled for {}'.format(current_url))
                continue
            elif response.status == 403:
                error_file.write(
                    'Running out of API points. You will have to wait until midnight, Pacific time!'
                )
                time.sleep(seconds_to_midnight_pacific_time())
                continue
            elif response.status >= 400:
                error_file.write('Error {} for {}'.format(
                    response.status, current_url))
                continue
            kind = result.get('kind', None)
            next_page = result.get('nextPageToken', None)
            if next_page:
                url_next = current_url + '&pageToken=' + next_page
                url_queue.append(url_next)
            if kind == 'youtube#commentThreadListResponse':
                # Handling comments pagination
                items = result.get('items', None)
                for item in items:
                    loading_bar.update()
                    snippet = item['snippet']
                    replies = item.get('replies')
                    if replies:
                        # Checking whether YouTube's API sent only a subset of the replies
                        if snippet['totalReplyCount'] != len(
                                replies['comments']) and namespace.full:
                            # If we want all the replies and the API did not
                            # return them all, queue the URL specific to this
                            # topLevelComment, then handle the topLevelComment itself
                            new_url = URL_PARENTID_TEMPLATE % {
                                'id': snippet['topLevelComment']['id'],
                                'key': namespace.key
                            }
                            url_queue.append(new_url)
                            data = get_data_full(snippet, True)
                            enricher.writerow(row, data)
                        else:
                            dataTop = get_data_full(snippet, True)
                            enricher.writerow(row, dataTop)
                            for rep in replies['comments']:
                                enricher.writerow(row,
                                                  get_data_full(rep, False))
                    else:
                        # If there is no 'replies' key, the fetched comment is only a topLevelComment
                        top_comment = get_data_full(snippet, True)
                        enricher.writerow(row, top_comment)
            else:
                # Handling a plain commentList response: processing comments one by one
                items = result.get('items', None)
                for item in items:
                    loading_bar.update()
                    data = get_data_full(item, False)
                    enricher.writerow(row, data)
Example #12
def request_json(self, url, headers=None):
    return request_json(self.http, url, spoof_ua=True, headers=headers)
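This method simply binds the instance's pool and forces user-agent spoofing. A hypothetical call site, assuming the enclosing class is instantiated as `client`:

client = SomeScraperClient()  # hypothetical enclosing class
err, response, data = client.request_json('https://example.com/api')  # placeholder URL

if err is None:
    print(data)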