def __init__(self, access_token, access_token_secret, api_key, api_secret_key, before_sleep=None):
    self.wrapper = TwitterWrapper(access_token, access_token_secret, api_key, api_secret_key)
    self.retryer = create_request_retryer(before_sleep=before_sleep)
def comments(self, url, detailed=False, per_call=False):

    # Reformatting url to hit mobile website
    url = convert_url_to_mobile(url)

    url_queue = deque([(url, None, None)])

    calls = 0
    replies = 0

    retryer = create_request_retryer()

    while len(url_queue) != 0:
        current_url, direction, in_reply_to = url_queue.popleft()

        html = retryer(self.request_page, current_url)

        try:
            data = scrape_comments(html, direction, in_reply_to)
        except TypeError:
            # with open('./dump.html', 'w') as f:
            #     f.write(html)
            print('Could not process comment in %s' % current_url, file=sys.stderr)
            return

        calls += 1

        for reply_url, commented_id in data['replies']:
            url_queue.append((reply_url, None, commented_id))

        if data['next'] is not None:
            url_queue.append((data['next'], data['direction'], in_reply_to))

        comments = data['comments']

        for comment in data['comments']:
            if in_reply_to is not None:
                replies += 1

            if not per_call:
                yield comment

        if per_call and len(comments) > 0:
            if detailed:
                details = {
                    'calls': calls,
                    'replies': replies,
                    'queue_size': len(url_queue)
                }

                yield details, comments
            else:
                yield comments
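# Usage sketch (hedged): this generator is written as a method of a scraper class
# whose cookie-based constructor appears just below; the class name used here is a
# placeholder, not the confirmed API. It also assumes the module-level imports
# `from collections import deque` and `import sys`.
#
#     scraper = FacebookCommentScraper(cookie='firefox')  # hypothetical name
#
#     # Stream comments one by one:
#     for comment in scraper.comments(post_url):
#         process(comment)
#
#     # Or yield one batch per scraped page, along with crawl details:
#     for details, batch in scraper.comments(post_url, per_call=True, detailed=True):
#         print(details['calls'], details['replies'], details['queue_size'], len(batch))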
def __init__(self, cookie, throttle=FACEBOOK_MOBILE_DEFAULT_THROTTLE):

    # Grabbing cookie
    cookie = grab_facebook_cookie(cookie)

    if cookie is None:
        raise FacebookInvalidCookieError

    self.cookie = cookie
    self.pool = create_pool()
    self.rate_limiter_state = RateLimiterState(1, throttle)
    self.retryer = create_request_retryer()
def __init__(self, token, rate_limit=None, before_sleep=None):
    if rate_limit is None:
        rate_limit = CROWDTANGLE_DEFAULT_RATE_LIMIT
        summary_rate_limit = CROWDTANGLE_LINKS_DEFAULT_RATE_LIMIT
    else:
        summary_rate_limit = rate_limit

    self.token = token
    self.rate_limiter_state = RateLimiterState(rate_limit, period=60)
    self.summary_rate_limiter_state = RateLimiterState(summary_rate_limit, period=60)
    self.pool = create_pool(timeout=CROWDTANGLE_DEFAULT_TIMEOUT)
    self.retryer = create_request_retryer(
        additional_exceptions=[
            CrowdTangleInvalidJSONError,
            CrowdTangleServerError
        ],
        before_sleep=before_sleep
    )
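# Configuration sketch (hedged): the client name and argument values below are
# illustrative only. With an explicit `rate_limit`, the same value is reused for
# the summary (links) endpoint; otherwise each endpoint falls back to its own
# default constant.
#
#     client = CrowdTangleClient(token='...', rate_limit=6)  # 6 calls per minute
#     client = CrowdTangleClient(token='...')                # library defaults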
def search(self, query, limit=None, before_sleep=None, include_referenced_tweets=False, with_meta=False):
    if len(query) > MAXIMUM_QUERY_LENGTH:
        raise TwitterPublicAPIQueryTooLongError

    cursor = None
    i = 0

    retryer = create_request_retryer(
        min=1,
        additional_exceptions=[
            TwitterPublicAPIRateLimitError,
            TwitterPublicAPIInvalidResponseError
        ],
        before_sleep=before_sleep
    )

    refs = set() if include_referenced_tweets else None

    while True:
        new_cursor, tweets = retryer(self.request_search, query, cursor, refs=refs)

        for tweet, meta in tweets:
            if with_meta:
                yield tweet, meta
            else:
                yield tweet

            i += 1

            if limit is not None and i >= limit:
                return

        if new_cursor is None or len(tweets) == 0:
            return

        cursor = new_cursor
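# Usage sketch (hedged): the enclosing scraper class is not shown here, so the
# name below is a placeholder; the keyword arguments mirror the signature above.
#
#     scraper = TwitterPublicScraper()  # hypothetical name
#
#     for tweet in scraper.search('from:nasa', limit=100):
#         print(tweet)
#
#     # Yield (tweet, meta) pairs and collect referenced tweets as well:
#     for tweet, meta in scraper.search('#python', with_meta=True,
#                                       include_referenced_tweets=True):
#         ...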
def create_iterator(pool, token, rate_limiter_state, limit=None, format='csv_dict_row',
                    per_call=False, detailed=False, namespace=None, before_sleep=None, **kwargs):

    if format not in CROWDTANGLE_OUTPUT_FORMATS:
        raise TypeError('minet.crowdtangle: unknown `format`.')

    if namespace is not None:
        kwargs = vars(namespace)
    else:
        kwargs['token'] = token

    # Checking we have the necessary dates
    if kwargs.get('sort_by', 'date') == 'date':
        if kwargs.get('start_date') is None:
            raise CrowdTangleMissingStartDateError

        # Inferring end date to be now, this will be important later
        if kwargs.get('end_date') is None:
            kwargs['end_date'] = infer_end_date()

    # Complementing dates
    if kwargs.get('start_date') is not None:
        kwargs['start_date'] = complement_date(kwargs['start_date'], 'start')

    if kwargs.get('end_date') is not None:
        kwargs['end_date'] = complement_date(kwargs['end_date'], 'end')

    N = 0
    C = 0
    last_url = None
    last_items = set()

    has_limit = limit is not None

    rate_limited_step = rate_limited_from_state(rate_limiter_state)(step)

    retryer = create_request_retryer(
        additional_exceptions=[
            CrowdTangleRateLimitExceeded,
            CrowdTangleInvalidJSONError
        ],
        before_sleep=before_sleep
    )

    # Chunking
    need_to_chunk = kwargs.get('sort_by', 'date') == 'date'
    chunk_size = kwargs.get('chunk_size', 500)
    current_chunk_size = 0
    shifts = 0
    years = years_iter(kwargs['start_date'], kwargs['end_date']) if need_to_chunk else None

    def rotate_year():
        try:
            start_date, end_date = next(years)
            kwargs['start_date'] = start_date
            kwargs['end_date'] = end_date
        except StopIteration:
            return False

        return True

    if need_to_chunk:
        rotate_year()

    # Starting url
    url = url_forge(**kwargs)

    while True:
        C += 1

        items, next_url = retryer(rate_limited_step, pool, url, item_key)

        # We have exhausted the available data
        if items is None:
            if need_to_chunk:
                could_rotate = rotate_year()

                if could_rotate:
                    url = url_forge(**kwargs)
                    continue

            break

        enough_to_stop = False
        n = 0

        last_url = url

        acc = []

        for item in items:

            # Avoiding duplicating items due to race conditions
            if item_id_getter(item) in last_items:
                continue

            current_date = item['date']

            n += 1
            N += 1
            current_chunk_size += 1

            if format == 'csv_dict_row':
                item = formatter(item, as_dict=True)
            elif format == 'csv_row':
                item = formatter(item)

            acc.append(item)

            if has_limit and N >= limit:
                enough_to_stop = True
                break

        if per_call:
            if detailed:
                details = None

                if need_to_chunk:
                    details = {'date': current_date, 'shifts': shifts}

                yield details, acc
            else:
                yield acc
        else:
            yield from acc

        if enough_to_stop:
            break

        # We need to track last items to avoid registering the same one twice
        last_items = set(item_id_getter(item) for item in items)

        # Paginating
        if next_url is None:
            if need_to_chunk:
                could_rotate = rotate_year()

                if could_rotate:
                    url = url_forge(**kwargs)
                    continue

            break

        # Handling chunking
        if current_chunk_size >= chunk_size:
            current_chunk_size = 0
            shifts += 1
            kwargs['end_date'] = items[-1]['date'].replace(' ', 'T')
            url = url_forge(**kwargs)
            continue

        url = next_url
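# Usage sketch (hedged): `create_iterator` is generic over the endpoint through
# the `step`, `url_forge`, `formatter`, `item_key`, `item_id_getter` and
# `years_iter` helpers referenced above; the call below is illustrative and the
# surrounding import paths are not confirmed.
#
#     pool = create_pool(timeout=CROWDTANGLE_DEFAULT_TIMEOUT)
#     state = RateLimiterState(CROWDTANGLE_DEFAULT_RATE_LIMIT, period=60)
#
#     posts = create_iterator(
#         pool, token, state,
#         limit=1000,
#         format='csv_dict_row',
#         sort_by='date',
#         start_date='2021-01-01',  # required when sorting by date
#         chunk_size=500
#     )
#
#     for post in posts:
#         ...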