Example #1
def extract_cursor_from_payload(payload):
    found_cursor = nested_get(CURSOR_FIRST_POSSIBLE_PATH, payload)

    if found_cursor is None:
        found_cursor = nested_get(CURSOR_SECOND_POSSIBLE_PATH, payload)

    return found_cursor
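
The two-step fallback works because nested_get returns None when the given path is missing from the payload. A hypothetical generalization of the same pattern (the CURSOR_*_POSSIBLE_PATH constants are the ones referenced above; the loop itself is illustrative, not minet's actual code) could try any number of candidate paths:

# Illustrative sketch only: try candidate paths in order and return the
# first cursor found, or None if no path matches.
CURSOR_CANDIDATE_PATHS = [
    CURSOR_FIRST_POSSIBLE_PATH,
    CURSOR_SECOND_POSSIBLE_PATH
]

def extract_cursor_from_payload_generalized(payload):
    for path in CURSOR_CANDIDATE_PATHS:
        found_cursor = nested_get(path, payload)

        if found_cursor is not None:
            return found_cursor

    return None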
Example #2
def payload_tweets_iter(payload):
    tweet_index = payload['globalObjects']['tweets']
    user_index = payload['globalObjects']['users']

    for instruction in payload['timeline']['instructions']:
        if 'addEntries' in instruction:
            entries = instruction['addEntries']['entries']
        elif 'replaceEntry' in instruction:
            entries = [instruction['replaceEntry']['entry']]
        else:
            continue

        for entry in entries:
            entry_id = entry['entryId']

            # Filtering tweets
            if (not entry_id.startswith('sq-I-t-')
                    and not entry_id.startswith('tweet-')):
                continue

            tweet_meta = nested_get(['content', 'item', 'content', 'tweet'],
                                    entry)

            if tweet_meta is None:
                tweet_meta = nested_get(
                    ['content', 'item', 'content', 'tombstone', 'tweet'],
                    entry)

            # Parsing error?
            if tweet_meta is None:
                raise TwitterPublicAPIParsingError

            # Skipping ads
            if 'promotedMetadata' in tweet_meta:
                continue

            tweet = process_single_tweet(tweet_meta['id'], tweet_index,
                                         user_index)

            # Additional metadata
            meta = None

            if tweet is not None:

                if 'forwardPivot' in tweet_meta:
                    pivot = tweet_meta['forwardPivot']

                    meta = {
                        'intervention_text': nested_get(['text', 'text'], pivot),
                        'intervention_type': pivot.get('displayType'),
                        'intervention_url': nested_get(['landingUrl', 'url'], pivot)
                    }

                yield tweet, meta
Example #3
        def generator():
            starting_url = forge_comments_url(
                self.key,
                video_id
            )

            queue = deque([(False, video_id, starting_url)])

            while len(queue) != 0:
                is_reply, item_id, url = queue.popleft()

                result = self.request_json(url)

                for item in result['items']:
                    comment_id = item['id']
                    replies = nested_get(['replies', 'comments'], item, [])
                    total_reply_count = nested_get(['snippet', 'totalReplyCount'], item, 0)

                    if not raw:
                        item = format_comment(item) if not is_reply else format_reply(item, video_id=video_id)

                    yield item

                    if is_reply:
                        continue

                    # Getting replies
                    if not full_replies or len(replies) >= total_reply_count:
                        for reply in replies:
                            if not raw:
                                reply = format_reply(reply)

                            yield reply
                    elif total_reply_count > 0:
                        replies_url = forge_replies_url(
                            self.key,
                            comment_id
                        )

                        queue.append((True, comment_id, replies_url))

                if len(result['items']) == 0:
                    break

                # Next page
                token = result.get('nextPageToken')

                if token is not None:
                    forge = forge_replies_url if is_reply else forge_comments_url

                    next_url = forge(
                        self.key,
                        item_id,
                        token=token
                    )

                    queue.append((is_reply, item_id, next_url))
Example #4
def crowdtangle_summary(http,
                        link,
                        token=None,
                        start_date=None,
                        with_top_posts=False,
                        sort_by=CROWDTANGLE_SUMMARY_DEFAULT_SORT_TYPE,
                        format='csv_dict_row',
                        platforms=None):

    if token is None:
        raise CrowdTangleMissingTokenError

    if format not in CROWDTANGLE_OUTPUT_FORMATS:
        raise TypeError('minet.crowdtangle.summary: unknown `format`.')

    if not isinstance(start_date, str):
        raise TypeError(
            'minet.crowdtangle.summary: expecting a `start_date` kwarg.')

    if sort_by not in CROWDTANGLE_SUMMARY_SORT_TYPES:
        raise TypeError('minet.crowdtangle.summary: unknown `sort_by`.')

    # Fetching
    api_url = url_forge(link, token, start_date, sort_by, platforms,
                        with_top_posts)

    err, response, data = request_json(http, api_url)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    stats = nested_get(['result', 'summary', 'facebook'], data)
    posts = nested_get(['result', 'posts'], data) if with_top_posts else None

    if stats is not None:
        if format == 'csv_dict_row':
            stats = format_summary(stats, as_dict=True)
        elif format == 'csv_row':
            stats = format_summary(stats)

    if not with_top_posts:
        return stats

    else:
        if posts is not None:
            if format == 'csv_dict_row':
                posts = [format_post(post, as_dict=True) for post in posts]
            elif format == 'csv_row':
                posts = [format_post(post) for post in posts]

        return stats, posts
Example #5
def format_comment(item):
    meta = item['snippet']
    snippet = nested_get(['snippet', 'topLevelComment', 'snippet'], item)

    row = YouTubeComment(
        meta['videoId'],
        item['id'],
        snippet['authorDisplayName'],
        nested_get(['authorChannelId', 'value'], snippet),
        snippet['textOriginal'],
        int(snippet['likeCount']),
        snippet['publishedAt'],
        snippet['updatedAt'],
        int(meta['totalReplyCount']),
        None
    )

    return row
Example #6
File: lists.py Project: lebelgique/minet
def crowdtangle_lists(pool, token=None, format='csv_dict_row'):

    if token is None:
        raise CrowdTangleMissingTokenError

    if format not in CROWDTANGLE_OUTPUT_FORMATS:
        raise TypeError('minet.crowdtangle.lists: unknown `format`.')

    # Fetching
    api_url = URL_TEMPLATE % token

    err, response, data = request_json(api_url, pool=pool)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    lists = nested_get(['result', 'lists'], data)

    if format == 'csv_dict_row':
        return [format_list(l, as_dict=True) for l in lists]
    elif format == 'csv_row':
        return [format_list(l) for l in lists]

    return lists
Example #7
File: post.py Project: lebelgique/minet
def crowdtangle_post(pool, post_id, token=None, format='csv_dict_row'):

    if token is None:
        raise CrowdTangleMissingTokenError

    if format not in CROWDTANGLE_OUTPUT_FORMATS:
        raise TypeError('minet.crowdtangle.post: unknown `format`.')

    # Fetching
    api_url = URL_TEMPLATE % (post_id, token)

    err, response, data = request_json(api_url, pool=pool)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    post = nested_get(['result', 'posts', 0], data)

    if post is None:
        return

    if format == 'csv_dict_row':
        return format_post(post, as_dict=True)
    elif format == 'csv_row':
        return format_post(post)

    return post
Example #8
    def request_search(self, query, cursor=None, refs=None, dump=False):
        params = forge_search_params(query, cursor=cursor)
        url = '%s?%s' % (TWITTER_PUBLIC_SEARCH_ENDPOINT, params)

        headers = {
            'Authorization': TWITTER_PUBLIC_API_AUTH_HEADER,
            'X-Guest-Token': self.guest_token,
            'Cookie': self.cookie,
            'Accept-Language': 'en'
        }

        err, response, data = self.request_json(url, headers=headers)

        if err:
            raise err

        if response.status == 429:
            self.reset()
            raise TwitterPublicAPIRateLimitError

        if response.status >= 400:
            error = nested_get(['errors', 0], data)

            if error is not None and error.get('code') == 130:
                raise TwitterPublicAPIOverCapacityError

            raise TwitterPublicAPIInvalidResponseError

        cursor = extract_cursor_from_payload(data)
        tweets = []

        if dump:
            return data

        for tweet, meta in payload_tweets_iter(data):
            result = normalize_tweet(tweet,
                                     extract_referenced_tweets=refs is not None,
                                     collection_source='scraping')

            if refs is not None:
                for is_first, extracted_tweet in with_is_first(result):

                    # Casting to int64 to save memory
                    id_int64 = int(extracted_tweet['id'])

                    if id_int64 in refs:
                        continue

                    if is_first:
                        tweets.append((extracted_tweet, meta))
                    else:
                        tweets.append((extracted_tweet, None))

                    refs.add(id_int64)
            else:
                tweets.append((result, meta))

        return cursor, tweets
Example #9
    def resolve(self, config):

        # Attempting to resolve env variable
        env_var = rc_key_to_env_var(self.key)
        env_value = os.environ.get(env_var, '').strip()

        if env_value:
            return self.type(env_value)

        return nested_get(self.key, config, self.default)
Example #10
def payload_tweets_iter(payload):
    tweet_index = payload['globalObjects']['tweets']
    user_index = payload['globalObjects']['users']

    for instruction in payload['timeline']['instructions']:
        if 'addEntries' in instruction:
            entries = instruction['addEntries']['entries']
        elif 'replaceEntry' in instruction:
            entries = [instruction['replaceEntry']['entry']]
        else:
            continue

        for entry in entries:
            entry_id = entry['entryId']

            # Filtering tweets
            if (not entry_id.startswith('sq-I-t-')
                    and not entry_id.startswith('tweet-')):
                continue

            tweet_meta = nested_get(['content', 'item', 'content', 'tweet'],
                                    entry)

            if tweet_meta is None:
                tweet_meta = nested_get(
                    ['content', 'item', 'content', 'tombstone', 'tweet'],
                    entry)

            # Parsing error?
            if tweet_meta is None:
                raise TwitterPublicAPIParsingError

            # Skipping ads
            if 'promotedMetadata' in tweet_meta:
                continue

            tweet = process_single_tweet(tweet_meta['id'], tweet_index,
                                         user_index)

            if tweet is not None:
                yield tweet
Example #11
def format_reply(item, video_id=None):
    snippet = item['snippet']

    row = YouTubeComment(
        video_id if video_id is not None else snippet['videoId'],
        item['id'],
        snippet['authorDisplayName'],
        nested_get(['authorChannelId', 'value'], snippet),
        snippet['textOriginal'],
        int(snippet['likeCount']),
        snippet['publishedAt'],
        snippet['updatedAt'],
        None,
        snippet['parentId']
    )

    return row
Example #12
    def search_hashtag(self, name):
        name = name.lstrip('#')
        cursor = None

        while True:
            url = forge_hashtag_search_url(name, cursor=cursor)
            print(url, cursor)

            data = self.request_json(url)

            data = nested_get(['data', 'hashtag', 'edge_hashtag_to_media'], data)
            edges = data.get('edges')

            for edge in edges:
                yield edge['node']['shortcode']

            print('Found %i posts' % len(edges))

            has_next_page = nested_get(['page_info', 'has_next_page'], data)

            if not has_next_page:
                break

            cursor = nested_get(['page_info', 'end_cursor'], data)
Example #13
def collect_top_reactions(data):
    edges = nested_get(['top_reactions', 'edges'], data)

    if edges is None:
        return

    index = {}

    for edge in edges:
        emotion = REACTION_KEYS.get(edge['node']['key'])

        if emotion is None:
            print_err('Found unknown emotion %s' % edge)
            continue

        index[emotion] = edge['reaction_count'] or 0

    return index
Example #14
    def request_json(self, url):
        err, response, data = request_json(url, pool=self.pool)

        if err:
            raise err

        if response.status == 403:
            sleep_time = seconds_to_midnight_pacific_time() + 10

            if callable(self.before_sleep):
                self.before_sleep(sleep_time)

            time.sleep(sleep_time)

            return self.request_json(url)

        if response.status >= 400:
            if data is not None and 'API key not valid' in nested_get(['error', 'message'], data, ''):
                raise YouTubeInvalidAPIKeyError

            raise YouTubeInvalidAPICall(url, response.status, data)

        return data
Example #15
def apply_scraper(scraper, element, root=None, html=None, context=None):

    # Is this a tail call of item
    if isinstance(scraper, str):
        if scraper in EXTRACTOR_NAMES:
            return extract(element, scraper)

        return element.get(scraper)

    sel = get_aliases(scraper, ['sel', '$'])
    iterator = get_aliases(scraper, ['iterator', 'it', '$$'])

    # First we need to solve local selection
    if sel is not None:
        element = element.select_one(sel)
    elif 'sel_eval' in scraper:

        # TODO: validate
        element = eval_expression(scraper['sel_eval'],
                                  element=element,
                                  elements=[],
                                  context=context,
                                  html=html,
                                  root=root)

    # Then we need to solve iterator
    single_value = True

    if iterator is not None:
        elements = element.select(iterator)
        single_value = False
    elif 'iterator_eval' in scraper:
        elements = eval_expression(scraper['iterator_eval'],
                                   element=element,
                                   elements=[],
                                   context=context,
                                   html=html,
                                   root=root)
        single_value = False
    else:
        elements = [element]

    # Handling local context
    if 'context' in scraper:
        local_context = {}

        for k, field_scraper in scraper['context'].items():
            local_context[k] = apply_scraper(field_scraper,
                                             element,
                                             root=root,
                                             html=html,
                                             context=context)

        context = merge_contexts(context, local_context)

    # Actual iteration
    acc = None if single_value else []

    already_seen = set() if 'uniq' in scraper and not single_value else None

    for element in elements:
        value = None

        # Do we have fields?
        if 'fields' in scraper:
            value = {}

            for k, field_scraper in scraper['fields'].items():
                value[k] = apply_scraper(field_scraper,
                                         element,
                                         root=root,
                                         html=html,
                                         context=context)

        # Do we have a scalar?
        elif 'item' in scraper:

            # Default value is text
            value = apply_scraper(scraper['item'],
                                  element,
                                  root=root,
                                  html=html,
                                  context=context)

        else:

            try:
                if 'attr' in scraper:
                    value = element.get(scraper['attr'])
                elif 'extract' in scraper:
                    value = extract(element, scraper['extract'])
                elif 'get' in scraper:
                    value = nested_get(scraper['get'], context)
                elif 'constant' in scraper:
                    value = scraper['constant']
                else:

                    # Default value is text
                    value = extract(element, 'text')

                # Format?
                if 'format' in scraper:
                    value = FORMATTER.format(scraper['format'],
                                             value=value,
                                             context=context)

                # Eval?
                if 'eval' in scraper:
                    value = eval_expression(scraper['eval'],
                                            element=element,
                                            elements=elements,
                                            value=value,
                                            context=context,
                                            html=html,
                                            root=root)
            except:
                value = None

        # Transform
        if 'transform' in scraper and value is not None:
            value = apply_transform_chain(scraper['transform'], value)

        # Default value?
        if 'default' in scraper and value is None:
            value = scraper['default']

        if single_value:
            acc = value
        else:

            # Filtering?
            if 'filter_eval' in scraper:
                passed_filter = eval_expression(scraper['filter_eval'],
                                                element=element,
                                                elements=elements,
                                                value=value,
                                                context=context,
                                                html=html,
                                                root=root)

                if not passed_filter:
                    continue

            if 'filter' in scraper:
                filtering_clause = scraper['filter']

                if filtering_clause is True and not value:
                    continue

                if isinstance(filtering_clause,
                              str) and not value.get(filtering_clause):
                    continue

            if 'uniq' in scraper:
                uniq_clause = scraper['uniq']
                k = value

                if uniq_clause is True and value in already_seen:
                    continue

                if isinstance(uniq_clause, str):
                    k = value.get(uniq_clause)

                    if k in already_seen:
                        continue

                already_seen.add(k)

            acc.append(value)

    # NOTE: this opens a way for reducers
    if not single_value and 'join' in scraper:
        acc = scraper['join'].join(acc)

    return acc
Example #16
def interpret_scraper(scraper, element, root=None, context=None, path=[], scope=None):
    if scope is None:
        scope = EvaluationScope()

    # Is this a tail call of item?
    if isinstance(scraper, str):
        if scraper in EXTRACTOR_NAMES:
            return extract(element, scraper)

        return element.get(scraper)

    sel = get_sel(scraper)
    iterator = get_iterator(scraper)

    # First we need to solve local selection
    if sel is not None:
        element = soupsieve.select_one(sel, element)
    elif 'sel_eval' in scraper:

        evaluated_sel = eval_expression(
            scraper['sel_eval'],
            element=element,
            elements=[],
            context=context,
            root=root,
            path=path + ['sel_eval'],
            expect=(Tag, str),
            allow_none=True,
            scope=scope
        )

        if isinstance(evaluated_sel, str):
            element = soupsieve.select_one(evaluated_sel, element)
        else:
            element = evaluated_sel

    if element is None:
        return None

    # Then we need to solve iterator
    single_value = True

    if iterator is not None:
        single_value = False
        elements = soupsieve.select(iterator, element)
    elif 'iterator_eval' in scraper:
        single_value = False
        evaluated_elements = eval_expression(
            scraper['iterator_eval'],
            element=element,
            elements=[],
            context=context,
            root=root,
            path=path + ['iterator_eval'],
            check=is_valid_iterator_eval_output,
            scope=scope
        )

        if isinstance(evaluated_elements, str):
            elements = soupsieve.select(evaluated_elements, element)
        else:
            elements = evaluated_elements
    else:
        elements = [element]

    # Handling local context
    if 'set_context' in scraper:
        local_context = {}

        for k, field_scraper in scraper['set_context'].items():
            local_context[k] = interpret_scraper(
                field_scraper,
                element,
                root=root,
                context=context,
                path=path + ['set_context', k],
                scope=scope
            )

        context = merge_contexts(context, local_context)

    # Actual iteration
    acc = None if single_value else []

    already_seen = set() if 'uniq' in scraper and not single_value else None

    for element in elements:
        value = None

        # Do we have fields?
        if 'fields' in scraper:
            value = {}

            for k, field_scraper in scraper['fields'].items():
                value[k] = interpret_scraper(
                    field_scraper,
                    element,
                    root=root,
                    context=context,
                    path=path + ['fields', k],
                    scope=scope
                )

        # Do we have a scalar?
        elif 'item' in scraper:

            # Default value is text
            value = interpret_scraper(
                scraper['item'],
                element,
                root=root,
                context=context,
                path=path + ['item'],
                scope=scope
            )

        else:

            if 'attr' in scraper:
                value = element.get(scraper['attr'])
            elif 'extract' in scraper:
                value = extract(element, scraper['extract'])
            elif 'get_context' in scraper:
                value = nested_get(scraper['get_context'], context)
            elif 'default' not in scraper:

                # Default value is text
                value = extract(element, 'text')

            # Eval?
            if 'eval' in scraper:
                value = eval_expression(
                    scraper['eval'],
                    element=element,
                    elements=elements,
                    value=value,
                    context=context,
                    root=root,
                    path=path + ['eval'],
                    expect=DATA_TYPES,
                    allow_none=True,
                    scope=scope
                )

        # Default value after all?
        if 'default' in scraper and value is None:
            value = scraper['default']

        if single_value:
            acc = value
        else:

            # Filtering?
            if 'filter_eval' in scraper:
                passed_filter = eval_expression(
                    scraper['filter_eval'],
                    element=element,
                    elements=elements,
                    value=value,
                    context=context,
                    root=root,
                    path=path + ['filter_eval'],
                    expect=bool,
                    allow_none=True,
                    scope=scope
                )

                if not passed_filter:
                    continue

            if 'filter' in scraper:
                filtering_clause = scraper['filter']

                if filtering_clause is True and not value:
                    continue

                if isinstance(filtering_clause, str) and not nested_get(filtering_clause, value):
                    continue

            if 'uniq' in scraper:
                uniq_clause = scraper['uniq']
                k = value

                if uniq_clause is True and value in already_seen:
                    continue

                if isinstance(uniq_clause, str):
                    k = nested_get(uniq_clause, value)

                    if k in already_seen:
                        continue

                already_seen.add(k)

            acc.append(value)

    return acc
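
To make the keys handled by interpret_scraper concrete, here is a small hypothetical scraper declaration. The recognized keys ('iterator', 'fields', 'attr', 'get_context', 'uniq') are taken from the branches above; the CSS selectors and the 'source' context key are invented purely for illustration:

# Hypothetical declaration only, showing the kind of dict interpret_scraper reads.
EXAMPLE_SCRAPER = {
    'iterator': 'ul li a',                    # iterate over every matching element
    'fields': {                               # each yielded item becomes a dict of fields
        'title': 'text',                      # tail call: a bare extractor name
        'url': {'attr': 'href'},              # read an attribute of the current element
        'source': {'get_context': 'source'}   # read a value from the local context
    },
    'uniq': 'url'                             # deduplicate items on their 'url' field
}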
Example #17
    def resolve(self, config):
        return nested_get(self.key, config, self.default)
Example #18
    def test_nested_get(self):
        assert nested_get('a.d.e', NESTED_OBJECT) == 5
        assert nested_get('b.d.a.a', NESTED_OBJECT) is None
        assert nested_get(['a', 'b', 0, 'c'], NESTED_OBJECT) == 4
        assert nested_get(['a', 'b', 1, 'c', 2], NESTED_OBJECT) is None
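
The assertions above pin down the expected interface: a path may be a dot-separated string or a list mixing keys and integer indices, and a missing path yields None (or a caller-provided default, as used in the other examples). A minimal sketch consistent with that behavior, and not necessarily minet's actual implementation, could look like this:

def nested_get_sketch(path, target, default=None):
    # Accept either a dotted string ('a.d.e') or a list of keys/indices.
    if isinstance(path, str):
        path = path.split('.')

    for step in path:
        if isinstance(target, dict):
            if step not in target:
                return default
            target = target[step]
        elif isinstance(target, (list, tuple)):
            if not isinstance(step, int) or not -len(target) <= step < len(target):
                return default
            target = target[step]
        else:
            return default

    return target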
Example #19
    def fetch_facebook_page_stats(url):
        err, response = request(http, url, cookie='locale=en_US')

        if err:
            return 'http-error', None

        if response.status == 404:
            return 'not-found', None

        if response.status >= 400:
            return 'http-error', None

        html = response.data

        if CAPTCHA in html:
            die(['Rate limit reached!', 'Last url: %s' % url])

        if (CURRENT_AVAILABILITY_DISCLAIMER in html
                or AVAILABILITY_DISCLAIMER in html):
            return 'unavailable', None

        if LOGIN_DISCLAIMER in html:
            return 'private-or-unavailable', None

        # TODO: integrate into ural
        bpost_id = url.rsplit('/', 1)[-1].encode()

        # Extracting metadata
        meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

        match = meta_extractor.search(html)

        if match is None:
            return 'extraction-failed', None

        data = json5.loads(match.group(1).decode())
        data = nested_get([
            'jsmods', 'pre_display_requires', 0, 3, 1, '__bbox', 'result',
            'data', 'feedback'
        ], data)

        if data is None:
            return 'extraction-failed', None

        # TODO: remove, this is here as a test
        # TODO: try to find a post where comments are disabled
        if get_count(data['seen_by_count']):
            print_err('Found seen_by_count: %i for %s' %
                      (get_count(data['seen_by_count']), url))

        if 'political_figure_data' in data and data['political_figure_data']:
            print_err('Found political_figure_data:')
            print_err(data['political_figure_data'])

        if get_count(data['reaction_count']) != get_count(data['reactors']):
            print_err('Found different reactions/reactors for %s' % url)

        # Extracting data from hidden html
        hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
        match = hidden_html_extractor.search(html)

        if match is not None:
            hidden_html = match.group(1).decode()
            soup = BeautifulSoup(hidden_html, 'lxml')

            # Sometimes fetching a post behaves weirdly
            if soup.select_one('h5 a') is None:
                return 'extraction-failed', None

            data['scraped'] = {}

            timestamp_elem = soup.select_one('[data-utime]')
            timestamp = int(timestamp_elem.get('data-utime'))

            data['scraped']['account_name'] = soup.select_one(
                'h5 a').get_text().strip()
            data['scraped']['timestamp'] = timestamp
            data['scraped']['time'] = datetime.fromtimestamp(
                timestamp).isoformat()

            # TODO: use a context manager
            try:
                data['scraped']['aria_label'] = timestamp_elem.parent.get(
                    'aria-label')
            except:
                pass

            try:
                data['scraped']['text'] = soup.select_one(
                    '[data-testid="post_message"]').get_text()
            except:
                pass

            # try:
            #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
            # except:
            #     pass

        return None, data