Example #1
    def __init__(self,
                 access_token,
                 access_token_secret,
                 api_key,
                 api_secret_key,
                 before_sleep=None):
        self.wrapper = TwitterWrapper(access_token, access_token_secret,
                                      api_key, api_secret_key)
        self.retryer = create_request_retryer(before_sleep=before_sleep)
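For reference, the retryer returned by create_request_retryer is itself a callable taking the function to retry followed by its arguments (see Examples #2 and #5 below). A minimal sketch of how this client might route requests through it; the `call` method and the `self.wrapper.call` signature are assumptions, not part of the excerpt:

    def call(self, *args, **kwargs):
        # Hypothetical: forward the wrapper's call through the retryer so
        # transient request failures are retried with backoff.
        return self.retryer(self.wrapper.call, *args, **kwargs)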
Example #2
    def comments(self, url, detailed=False, per_call=False):

        # Reformatting url to hit mobile website
        url = convert_url_to_mobile(url)

        url_queue = deque([(url, None, None)])

        calls = 0
        replies = 0

        retryer = create_request_retryer()

        while len(url_queue) != 0:
            current_url, direction, in_reply_to = url_queue.popleft()

            html = retryer(self.request_page, current_url)

            try:
                data = scrape_comments(html, direction, in_reply_to)
            except TypeError:
                print('Could not process comment in %s' % current_url,
                      file=sys.stderr)
                return

            calls += 1

            for reply_url, commented_id in data['replies']:
                url_queue.append((reply_url, None, commented_id))

            if data['next'] is not None:
                url_queue.append(
                    (data['next'], data['direction'], in_reply_to))

            comments = data['comments']

            for comment in comments:
                if in_reply_to is not None:
                    replies += 1

                if not per_call:
                    yield comment

            if per_call and len(comments) > 0:
                if detailed:
                    details = {
                        'calls': calls,
                        'replies': replies,
                        'queue_size': len(url_queue)
                    }

                    yield details, comments
                else:
                    yield comments
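By default the generator yields comments one at a time; with per_call=True it instead yields one batch per HTTP call, and detailed=True pairs each batch with a small stats dict. A hedged usage sketch, where `scraper` stands for an instance of the enclosing (unnamed) class and `post_url` for a Facebook post url:

    for details, batch in scraper.comments(post_url, detailed=True, per_call=True):
        print('%i comments after %i calls, %i urls still queued' %
              (len(batch), details['calls'], details['queue_size']))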
Example #3
    def __init__(self, cookie, throttle=FACEBOOK_MOBILE_DEFAULT_THROTTLE):

        # Grabbing cookie
        cookie = grab_facebook_cookie(cookie)

        if cookie is None:
            raise FacebookInvalidCookieError

        self.cookie = cookie
        self.pool = create_pool()

        self.rate_limiter_state = RateLimiterState(1, throttle)
        self.retryer = create_request_retryer()
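A hedged instantiation sketch: the class name is not shown in the excerpt, and the idea that grab_facebook_cookie also accepts a browser name ('firefox' or 'chrome') rather than only a raw cookie string is an assumption drawn from minet's CLI conventions:

    # Both the class name and the browser-name shortcut are assumptions.
    scraper = FacebookMobileScraper('firefox')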
Example #4
File: client.py Project: medialab/minet
    def __init__(self, token, rate_limit=None, before_sleep=None):
        if rate_limit is None:
            rate_limit = CROWDTANGLE_DEFAULT_RATE_LIMIT
            summary_rate_limit = CROWDTANGLE_LINKS_DEFAULT_RATE_LIMIT
        else:
            summary_rate_limit = rate_limit

        self.token = token
        self.rate_limiter_state = RateLimiterState(rate_limit, period=60)
        self.summary_rate_limiter_state = RateLimiterState(summary_rate_limit,
                                                           period=60)
        self.pool = create_pool(timeout=CROWDTANGLE_DEFAULT_TIMEOUT)
        self.retryer = create_request_retryer(
            additional_exceptions=[
                CrowdTangleInvalidJSONError,
                CrowdTangleServerError
            ],
            before_sleep=before_sleep
        )
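When a custom rate_limit is given it is shared with the links/summary endpoint; otherwise each endpoint falls back to its own default. A hedged instantiation sketch (the CrowdTangleAPIClient name is inferred from the file name, not confirmed by the excerpt):

    client = CrowdTangleAPIClient(token='<token>')                # separate defaults
    client = CrowdTangleAPIClient(token='<token>', rate_limit=6)  # 6 calls/min for both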
Example #5
    def search(self,
               query,
               limit=None,
               before_sleep=None,
               include_referenced_tweets=False,
               with_meta=False):

        if len(query) > MAXIMUM_QUERY_LENGTH:
            raise TwitterPublicAPIQueryTooLongError

        cursor = None
        i = 0

        retryer = create_request_retryer(
            min=1,
            additional_exceptions=[
                TwitterPublicAPIRateLimitError,
                TwitterPublicAPIInvalidResponseError
            ],
            before_sleep=before_sleep)

        refs = set() if include_referenced_tweets else None

        while True:
            new_cursor, tweets = retryer(self.request_search,
                                         query,
                                         cursor,
                                         refs=refs)

            for tweet, meta in tweets:
                if with_meta:
                    yield tweet, meta
                else:
                    yield tweet

                i += 1

                if limit is not None and i >= limit:
                    return

            if new_cursor is None or len(tweets) == 0:
                return

            cursor = new_cursor
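A hedged usage sketch of the search generator; the enclosing class name is assumed, and with_meta=True would yield (tweet, meta) pairs instead:

    scraper = TwitterAPIScraper()  # class name assumed
    for tweet in scraper.search('some query', limit=100):
        print(tweet)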
Example #6
    def create_iterator(pool,
                        token,
                        rate_limiter_state,
                        limit=None,
                        format='csv_dict_row',
                        per_call=False,
                        detailed=False,
                        namespace=None,
                        before_sleep=None,
                        **kwargs):

        if format not in CROWDTANGLE_OUTPUT_FORMATS:
            raise TypeError('minet.crowdtangle: unknown `format`.')

        if namespace is not None:
            kwargs = vars(namespace)
        else:
            kwargs['token'] = token

        # Checking we have the necessary dates
        if kwargs.get('sort_by', 'date') == 'date':
            if kwargs.get('start_date') is None:
                raise CrowdTangleMissingStartDateError

            # Inferring end date to be now, this will be important later
            if kwargs.get('end_date') is None:
                kwargs['end_date'] = infer_end_date()

        # Complementing dates
        if kwargs.get('start_date') is not None:
            kwargs['start_date'] = complement_date(kwargs['start_date'],
                                                   'start')

        if kwargs.get('end_date') is not None:
            kwargs['end_date'] = complement_date(kwargs['end_date'], 'end')

        N = 0
        C = 0
        last_url = None
        last_items = set()

        has_limit = limit is not None

        rate_limited_step = rate_limited_from_state(rate_limiter_state)(step)

        retryer = create_request_retryer(
            additional_exceptions=[
                CrowdTangleRateLimitExceeded,
                CrowdTangleInvalidJSONError
            ],
            before_sleep=before_sleep
        )

        # Chunking
        need_to_chunk = kwargs.get('sort_by', 'date') == 'date'
        chunk_size = kwargs.get('chunk_size', 500)
        current_chunk_size = 0
        shifts = 0
        years = (
            years_iter(kwargs['start_date'], kwargs['end_date'])
            if need_to_chunk
            else None
        )

        def rotate_year():
            try:
                start_date, end_date = next(years)
                kwargs['start_date'] = start_date
                kwargs['end_date'] = end_date
            except StopIteration:
                return False

            return True

        if need_to_chunk:
            rotate_year()

        # Starting url
        url = url_forge(**kwargs)

        while True:
            C += 1

            items, next_url = retryer(rate_limited_step, pool, url, item_key)

            # We have exhausted the available data
            if items is None:

                if need_to_chunk:
                    could_rotate = rotate_year()

                    if could_rotate:
                        url = url_forge(**kwargs)
                        continue

                break

            enough_to_stop = False
            n = 0

            # Guards the detailed yield below against an empty `items` page
            current_date = None

            last_url = url

            acc = []

            for item in items:

                # Avoiding duplicating items due to race conditions
                if item_id_getter(item) in last_items:
                    continue

                current_date = item['date']

                n += 1
                N += 1
                current_chunk_size += 1

                if format == 'csv_dict_row':
                    item = formatter(item, as_dict=True)
                elif format == 'csv_row':
                    item = formatter(item)

                acc.append(item)

                if has_limit and N >= limit:
                    enough_to_stop = True
                    break

            if per_call:
                if detailed:
                    details = None

                    if need_to_chunk:
                        details = {'date': current_date, 'shifts': shifts}

                    yield details, acc
                else:
                    yield acc
            else:
                yield from acc

            if enough_to_stop:
                break

            # We need to track last items to avoid registering the same one twice
            last_items = set(item_id_getter(item) for item in items)

            # Paginating
            if next_url is None:
                if need_to_chunk:
                    could_rotate = rotate_year()

                    if could_rotate:
                        url = url_forge(**kwargs)
                        continue

                break

            # Handling chunking
            if current_chunk_size >= chunk_size:
                current_chunk_size = 0
                shifts += 1
                kwargs['end_date'] = items[-1]['date'].replace(' ', 'T')
                url = url_forge(**kwargs)
                continue

            url = next_url
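Overall flow: the iterator pages through results via next_url, deduplicates against the previous page's ids, and, when sorting by date, walks the requested range year by year while shifting end_date backwards whenever a chunk exceeds chunk_size. A hedged invocation sketch (pool, token and rate_limiter_state as built in Example #4; url_forge, item_key, formatter and item_id_getter are assumed to be bound in the enclosing module):

    posts = create_iterator(
        pool,
        token,
        rate_limiter_state,
        limit=1000,
        format='csv_dict_row',
        start_date='2021-01-01'
    )

    for post in posts:
        print(post)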