def payload_tweets_iter(payload):
    tweet_index = payload['globalObjects']['tweets']
    user_index = payload['globalObjects']['users']

    for instruction in payload['timeline']['instructions']:
        if 'addEntries' in instruction:
            entries = instruction['addEntries']['entries']
        elif 'replaceEntry' in instruction:
            entries = [instruction['replaceEntry']['entry']]
        else:
            continue

        for entry in entries:
            entry_id = entry['entryId']

            # Filtering tweets
            if (
                not entry_id.startswith('sq-I-t-') and
                not entry_id.startswith('tweet-')
            ):
                continue

            tweet_meta = getpath(entry, ['content', 'item', 'content', 'tweet'])

            if tweet_meta is None:
                tweet_meta = getpath(
                    entry,
                    ['content', 'item', 'content', 'tombstone', 'tweet']
                )

            # Parsing error?
            if tweet_meta is None:
                raise TwitterPublicAPIParsingError

            # Skipping ads
            if 'promotedMetadata' in tweet_meta:
                continue

            tweet = process_single_tweet(tweet_meta['id'], tweet_index, user_index)

            # Additional metadata
            meta = None

            if tweet is not None:
                if 'forwardPivot' in tweet_meta:
                    pivot = tweet_meta['forwardPivot']

                    meta = {
                        'intervention_text': getpath(pivot, ['text', 'text']),
                        'intervention_type': pivot.get('displayType'),
                        'intervention_url': getpath(pivot, ['landingUrl', 'url'])
                    }

                yield tweet, meta

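# A minimal consumption sketch for payload_tweets_iter, assuming a `payload`
# dict already fetched from Twitter's public search endpoint. The printed
# fields are illustrative: per the function above, `meta` is only non-None
# when the tweet carries a "forward pivot" (a contextual interstitial).
def print_interventions(payload):
    for tweet, meta in payload_tweets_iter(payload):
        if meta is not None:
            print(meta['intervention_type'], meta['intervention_text'])
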
def generator():
    starting_url = forge_comments_url(self.key, video_id)

    queue = deque([(False, video_id, starting_url)])

    while len(queue) != 0:
        is_reply, item_id, url = queue.popleft()

        result = self.request_json(url)

        for item in result['items']:
            comment_id = item['id']
            replies = getpath(item, ['replies', 'comments'], [])
            total_reply_count = getpath(item, ['snippet', 'totalReplyCount'], 0)

            if not raw:
                item = (
                    format_comment(item)
                    if not is_reply
                    else format_reply(item, video_id=video_id)
                )

            yield item

            if is_reply:
                continue

            # Getting replies
            if not full_replies or len(replies) >= total_reply_count:
                for reply in replies:
                    if not raw:
                        reply = format_reply(reply)

                    yield reply
            elif total_reply_count > 0:
                replies_url = forge_replies_url(self.key, comment_id)
                queue.append((True, comment_id, replies_url))

        if len(result['items']) == 0:
            break

        # Next page
        token = result.get('nextPageToken')

        if token is not None:
            forge = forge_replies_url if is_reply else forge_comments_url
            next_url = forge(self.key, item_id, token=token)
            queue.append((is_reply, item_id, next_url))

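# The generator above interleaves two request kinds (comment pages and reply
# pages) in a single deque, tagging each queued url with an `is_reply` flag
# so the right forge_*/format_* pair is applied. A self-contained toy version
# of that pagination pattern, with a made-up `pages` mapping standing in for
# the remote API:
from collections import deque

def paginate(pages, start):
    # `pages` maps a key to (items, next_key_or_None)
    queue = deque([start])

    while queue:
        key = queue.popleft()
        items, next_key = pages[key]
        yield from items

        if next_key is not None:
            queue.append(next_key)
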
def crowdtangle_summary(pool, link, token=None, start_date=None,
                        with_top_posts=False,
                        sort_by=CROWDTANGLE_SUMMARY_DEFAULT_SORT_TYPE,
                        raw=False, platforms=None):
    if token is None:
        raise CrowdTangleMissingTokenError

    if not isinstance(start_date, str):
        raise TypeError('minet.crowdtangle.summary: expecting a `start_date` kwarg.')

    if sort_by not in CROWDTANGLE_SUMMARY_SORT_TYPES:
        raise TypeError('minet.crowdtangle.summary: unknown `sort_by`.')

    # Fetching
    api_url = url_forge(link, token, start_date, sort_by, platforms, with_top_posts)

    err, response, data = request_json(api_url, pool=pool)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    stats = getpath(data, ['result', 'summary', 'facebook'])
    posts = getpath(data, ['result', 'posts']) if with_top_posts else None

    if stats is not None:
        if not raw:
            stats = format_summary(stats)

    if not with_top_posts:
        return stats
    else:
        if not raw:
            posts = [format_post(post, link=link) for post in posts]

        return stats, posts

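# Hypothetical call sites for crowdtangle_summary, showing its two return
# shapes; the pool and token values are placeholders, only the keyword
# contract is taken from the function above:
#
#   stats = crowdtangle_summary(pool, link, token=token,
#                               start_date='2021-01-01')
#   stats, posts = crowdtangle_summary(pool, link, token=token,
#                                      start_date='2021-01-01',
#                                      with_top_posts=True)
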
def crowdtangle_post(pool, post_id, token=None, raw=False):
    if token is None:
        raise CrowdTangleMissingTokenError

    # Fetching
    api_url = URL_TEMPLATE % (post_id, token)

    err, response, data = request_json(api_url, pool=pool)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    post = getpath(data, ['result', 'posts', 0])

    if post is None:
        return

    if not raw:
        return format_post(post)

    return post

def crowdtangle_lists(pool, token=None, raw=False):
    if token is None:
        raise CrowdTangleMissingTokenError

    # Fetching
    api_url = URL_TEMPLATE % token

    err, response, data = request_json(api_url, pool=pool)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    lists = getpath(data, ['result', 'lists'])

    if not raw:
        return [format_list(l) for l in lists]

    return lists

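# crowdtangle_post and crowdtangle_lists repeat the same status handling;
# a sketch of that logic factored into one helper (the helper name is
# hypothetical, not part of the library):
def validate_crowdtangle_response(response, api_url):
    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)
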
def format_comment(item):
    meta = item['snippet']
    snippet = getpath(item, ['snippet', 'topLevelComment', 'snippet'])

    row = YouTubeComment(
        meta['videoId'],
        item['id'],
        snippet['authorDisplayName'],
        getpath(snippet, ['authorChannelId', 'value']),
        snippet['textOriginal'],
        int(snippet['likeCount']),
        snippet['publishedAt'],
        snippet['updatedAt'],
        int(meta['totalReplyCount']),
        None
    )

    return row

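# YouTubeComment is defined elsewhere; a sketch of the record, with field
# names inferred from the positional arguments passed by format_comment and
# format_reply (the actual field names are assumptions):
from collections import namedtuple

YouTubeComment = namedtuple('YouTubeComment', [
    'video_id',
    'comment_id',
    'author_name',
    'author_channel_id',
    'text',
    'like_count',
    'published_at',
    'updated_at',
    'reply_count',        # None for replies
    'parent_comment_id'   # None for top-level comments
])
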
def resolve(self, config):
    # Attempting to resolve env variable
    env_var = rc_key_to_env_var(self.key)
    env_value = os.environ.get(env_var, '').strip()

    if env_value:
        return self.type(env_value)

    return getpath(config, self.key, self.default)

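# rc_key_to_env_var is defined elsewhere. Since `self.key` is a path list
# (it is passed straight to getpath above), a plausible sketch joins the
# segments into an upper-cased, prefixed variable name; the MINET_ prefix
# is an assumption:
def rc_key_to_env_var(key):
    return 'MINET_%s' % '_'.join(segment.upper() for segment in key)
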
def search_hashtag(self, name):
    name = name.lstrip('#')
    cursor = None

    while True:
        url = forge_hashtag_search_url(name, cursor=cursor)

        data = self.request_json(url)
        data = getpath(data, ['data', 'hashtag', 'edge_hashtag_to_media'])

        edges = data.get('edges')

        for edge in edges:
            yield edge['node']['shortcode']

        has_next_page = getpath(data, ['page_info', 'has_next_page'])

        if not has_next_page:
            break

        cursor = getpath(data, ['page_info', 'end_cursor'])

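# search_hashtag yields until Instagram's pagination runs out, so callers
# typically cap it; a usage sketch (the `scraper` object is illustrative):
from itertools import islice

def first_shortcodes(scraper, name, n=50):
    return list(islice(scraper.search_hashtag(name), n))
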
def crowdtangle_post(request, post_id, token=None, raw=False):
    if token is None:
        raise CrowdTangleMissingTokenError

    # Fetching
    api_url = URL_TEMPLATE % (post_id, token)

    data = request(api_url)

    post = getpath(data, ['posts', 0])

    if post is None:
        return

    if not raw:
        return format_post(post)

    return post

def format_reply(item, video_id=None):
    snippet = item['snippet']

    row = YouTubeComment(
        video_id if video_id is not None else snippet['videoId'],
        item['id'],
        snippet['authorDisplayName'],
        getpath(snippet, ['authorChannelId', 'value']),
        snippet['textOriginal'],
        int(snippet['likeCount']),
        snippet['publishedAt'],
        snippet['updatedAt'],
        None,
        snippet['parentId']
    )

    return row

def collect_top_reactions(data):
    edges = getpath(data, ['top_reactions', 'edges'])

    if edges is None:
        return

    index = {}

    for edge in edges:
        emotion = FACEBOOK_REACTION_KEYS.get(edge['node']['key'])

        if emotion is None:
            print_err('Found unknown emotion %s' % edge)
            continue

        index[emotion] = edge['reaction_count'] or 0

    return index

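# A minimal input shape accepted by collect_top_reactions, reconstructed
# from its accesses; the real Facebook GraphQL payload has more fields and
# the key/count values below are made up:
EXAMPLE_FEEDBACK = {
    'top_reactions': {
        'edges': [
            {'node': {'key': 1}, 'reaction_count': 42}
        ]
    }
}
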
def request_json(self, url):
    err, response, data = request_json(url, pool=self.pool)

    if err:
        raise err

    if response.status == 403:
        sleep_time = seconds_to_midnight_pacific_time() + 10

        if callable(self.before_sleep):
            self.before_sleep(sleep_time)

        time.sleep(sleep_time)

        return self.request_json(url)

    if response.status >= 400:
        if data is not None and 'API key not valid' in getpath(data, ['error', 'message'], ''):
            raise YouTubeInvalidAPIKeyError

        raise YouTubeInvalidAPICall(url, response.status, data)

    return data

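# seconds_to_midnight_pacific_time is defined elsewhere; the long sleep on
# 403 reflects the YouTube API's daily quota resetting at midnight Pacific.
# A sketch under that assumption, using the standard zoneinfo module:
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

def seconds_to_midnight_pacific_time():
    now = datetime.now(ZoneInfo('America/Los_Angeles'))
    midnight = (now + timedelta(days=1)).replace(
        hour=0, minute=0, second=0, microsecond=0
    )

    return int((midnight - now).total_seconds())
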
def test_getpath(self):
    with pytest.raises(TypeError):
        getpath(NESTED_OBJECT, 'test')

    assert getpath(NESTED_OBJECT, ['a', 'd', 'e']) == 5
    assert getpath(NESTED_OBJECT, ['a', 'd', 'e'], items=None) is None
    assert getpath(NESTED_OBJECT, ['a', 'c']) is None
    assert getpath(NESTED_OBJECT, ['a', 'c'], 67) == 67
    assert getpath(NESTED_OBJECT, ['a', 'b', 1]) == 45
    assert getpath(NESTED_OBJECT, ['a', 'b', -1, 'f', -1]) == 3
    assert getpath(NESTED_OBJECT, ['a', 'b', 0, 'c']) == 4
    assert getpath(NESTED_OBJECT, ['a', 'd', 'g', 'numbers', 1]) is None
    assert getpath(NESTED_OBJECT, ['a', 'd', 'g', 'numbers', 1], attributes=True) == 5
    assert getpath(NESTED_OBJECT, ['a', 'd', 'g', 3], attributes=True) is None
    assert getpath(NESTED_OBJECT, ['a', 'd', 'g', 'recursion', 'numbers'], attributes=True) == [4, 5, 6]
    assert getpath(NESTED_OBJECT, 'a.d.e', split_char='.') == 5
    assert getpath(NESTED_OBJECT, 'a§d§e', split_char='§') == 5
    assert getpath(NESTED_OBJECT, 'a.b.1', split_char='.', parse_indices=True) == 45
    assert getpath(NESTED_OBJECT, 'a.b.-1.f.-1', split_char='.', parse_indices=True) == 3
    assert getpath([[1, 2]], [3, 4, 17]) is None

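# NESTED_OBJECT is a fixture defined elsewhere; a reconstruction consistent
# with every assertion above (the attribute-bearing object at a.d.g is
# sketched as a tiny class whose `recursion` attribute points back to the
# instance itself; the class name is an assumption):
class AttributeHolder:
    def __init__(self):
        self.numbers = [4, 5, 6]
        self.recursion = self

NESTED_OBJECT = {
    'a': {
        'b': [{'c': 4}, 45, {'f': [2, 3]}],
        'd': {
            'e': 5,
            'g': AttributeHolder()
        }
    }
}
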
def fetch_facebook_page_stats(url):
    err, response = request(url, cookie='locale=en_US')

    if err:
        return 'http-error', None

    if response.status == 404:
        return 'not-found', None

    if response.status >= 400:
        return 'http-error', None

    html = response.data

    if CAPTCHA in html:
        die(['Rate limit reached!', 'Last url: %s' % url])

    if (
        CURRENT_AVAILABILITY_DISCLAIMER in html or
        AVAILABILITY_DISCLAIMER in html
    ):
        return 'unavailable', None

    if LOGIN_DISCLAIMER in html:
        return 'private-or-unavailable', None

    # TODO: integrate into ural
    bpost_id = url.rsplit('/', 1)[-1].encode()

    # Extracting metadata
    meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

    match = meta_extractor.search(html)

    if match is None:
        return 'extraction-failed', None

    data = json5.loads(match.group(1).decode())
    data = getpath(data, [
        'jsmods', 'pre_display_requires', 0, 3, 1,
        '__bbox', 'result', 'data', 'feedback'
    ])

    if data is None:
        return 'extraction-failed', None

    # TODO: remove, this is here as a test
    # TODO: try to find a post where comments are disabled
    if get_count(data['seen_by_count']):
        print_err(
            'Found seen_by_count: %i for %s' %
            (get_count(data['seen_by_count']), url)
        )

    if 'political_figure_data' in data and data['political_figure_data']:
        print_err('Found political_figure_data:')
        print_err(data['political_figure_data'])

    if get_count(data['reaction_count']) != get_count(data['reactors']):
        print_err('Found different reactions/reactors for %s' % url)

    # Extracting data from hidden html
    hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
    match = hidden_html_extractor.search(html)

    if match is not None:
        hidden_html = match.group(1).decode()
        soup = BeautifulSoup(hidden_html, 'lxml')

        # Sometimes fetching a post behaves weirdly
        if soup.select_one('h5 a') is None:
            return 'extraction-failed', None

        data['scraped'] = {}

        timestamp_elem = soup.select_one('[data-utime]')
        timestamp = int(timestamp_elem.get('data-utime'))

        data['scraped']['account_name'] = soup.select_one('h5 a').get_text().strip()
        data['scraped']['timestamp'] = timestamp
        data['scraped']['time'] = datetime.fromtimestamp(timestamp).isoformat()

        # TODO: use a context manager
        try:
            data['scraped']['aria_label'] = timestamp_elem.parent.get('aria-label')
        except Exception:
            pass

        try:
            data['scraped']['text'] = soup.select_one('[data-testid="post_message"]').get_text()
        except Exception:
            pass

        # try:
        #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
        # except:
        #     pass

    return None, data

def request_search(self, query, cursor=None, refs=None, dump=False):
    params = forge_search_params(query, cursor=cursor)
    url = '%s?%s' % (TWITTER_PUBLIC_SEARCH_ENDPOINT, params)

    headers = {
        'Authorization': TWITTER_PUBLIC_API_AUTH_HEADER,
        'X-Guest-Token': self.guest_token,
        'Cookie': self.cookie,
        'Accept-Language': 'en'
    }

    err, response, data = self.request_json(url, headers=headers)

    if err:
        raise err

    if response.status == 429:
        self.reset()
        raise TwitterPublicAPIRateLimitError

    if response.status >= 400:
        error = getpath(data, ['errors', 0])

        if error is not None and response.status == 400 and error.get('code') == 47:
            raise TwitterPublicAPIBadRequest

        if error is not None and error.get('code') == 130:
            raise TwitterPublicAPIOverCapacityError

        raise TwitterPublicAPIInvalidResponseError

    cursor = extract_cursor_from_payload(data)

    tweets = []

    if dump:
        return data

    for tweet, meta in payload_tweets_iter(data):
        result = normalize_tweet(
            tweet,
            extract_referenced_tweets=refs is not None,
            collection_source='scraping'
        )

        if refs is not None:
            for is_first, extracted_tweet in with_is_first(result):

                # Casting to int64 to save up memory
                id_int64 = int(extracted_tweet['id'])

                if id_int64 in refs:
                    continue

                if is_first:
                    tweets.append((extracted_tweet, meta))
                else:
                    tweets.append((extracted_tweet, None))

                refs.add(id_int64)
        else:
            tweets.append((result, meta))

    return cursor, tweets

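# with_is_first is a small iteration helper defined elsewhere; a sketch
# matching how it is used above, pairing each item with a flag that is True
# only for the first one (here, the original tweet before its referenced
# tweets):
def with_is_first(iterable):
    iterator = iter(iterable)

    try:
        first_item = next(iterator)
    except StopIteration:
        return

    yield True, first_item

    for item in iterator:
        yield False, item
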