Example #1
import json
import logging
import random

import requests

# Tweet parsing class (assumed to live in twitterscraper.tweet).
from twitterscraper.tweet import Tweet

# HEADERS_LIST, INIT_URL and RELOAD_URL are module-level constants;
# see the sketch after Example #1.
logger = logging.getLogger(__name__)


def query_single_page(url, html_response=True, retry=10):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from.
    :param html_response: False if the HTML is embedded in a JSON response.
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, and the ``pos`` argument for fetching the
             next page.
    """
    headers = {'User-Agent': random.choice(HEADERS_LIST)}

    try:
        # A timeout keeps a hung connection from blocking forever and lets
        # the Timeout handler below actually fire; raise_for_status() routes
        # 4xx/5xx responses to the HTTPError handler.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        if html_response:
            html = response.text or ''
        else:
            html = ''
            # The timeline endpoint wraps the rendered tweet HTML in a JSON
            # envelope under 'items_html'.
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            # The JSON endpoint returns the next-page cursor directly.
            return tweets, json_resp['min_position']

        # For the HTML page, build the next-page cursor from the last and
        # first tweet IDs on this page.
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)
    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('Timeout {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        return query_single_page(url, html_response, retry - 1)

    logger.error("Giving up.")
    return [], None
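
Both examples rely on a few module-level names the snippets do not define: HEADERS_LIST, INIT_URL, and RELOAD_URL. Below is a minimal sketch of what they could look like, modeled on Twitter's legacy search endpoints; the exact URLs and User-Agent strings are assumptions, not values taken from the examples.

# Sketches of the assumed module-level constants -- not necessarily the
# library's exact values.

# Pool of User-Agent strings; one is picked at random per request.
HEADERS_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
]

# First page: the regular HTML search page ({q} = query, {lang} = language).
INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q={q}&l={lang}'

# Later pages: the JSON timeline endpoint, paged via the {pos} cursor.
RELOAD_URL = ('https://twitter.com/i/search/timeline?f=tweets'
              '&vertical=default&src=typd&max_position={pos}'
              '&q={q}&l={lang}')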
Example #2
def query_tweets_once(query, limit=None, lang=''):
    """
    Queries twitter for all the tweets you want! It will load all pages it gets
    from twitter. However, twitter might out of a sudden stop serving new pages,
    in that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param num_tweets: Number of tweets fetched outside this function.
    :return:      A list of twitterscraper.Tweet objects. You will get at least
                  ``limit`` number of items.
    """
    logger.info("Querying {}".format(query))
    # Percent-encode the characters that matter for Twitter's search URL.
    query = query.replace(' ', '%20').replace("#", "%23").replace(":", "%3A")
    pos = None
    tweets = []
    try:
        while True:
            # The first request fetches the HTML search page; subsequent
            # requests page through the JSON timeline endpoint via `pos`.
            new_tweets, pos = query_single_page(
                INIT_URL.format(q=query, lang=lang) if pos is None else
                RELOAD_URL.format(q=query, pos=pos, lang=lang), pos is None)
            if not new_tweets:
                logger.info("Got {} tweets for {}.".format(len(tweets), query))
                return tweets

            tweets += new_tweets

            if limit and len(tweets) >= limit:
                logger.info("Got {} tweets for {}.".format(len(tweets), query))
                return tweets
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                    "so far...")
    except Exception:
        logger.exception("An unknown error occurred! Returning tweets "
                         "gathered so far.")
    logger.info("Got {} tweets for {}.".format(len(tweets), query))
    return tweets
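
A short usage sketch, assuming the constants sketched above and a Tweet class exposing an `id` attribute; the `text` attribute name is an assumption as well.

if __name__ == '__main__':
    # Fetch at least 50 English-language tweets for an advanced query.
    results = query_tweets_once('#python since:2019-01-01', limit=50,
                                lang='en')
    for tweet in results[:5]:
        # `text` is assumed; fall back to an empty string if absent.
        print(tweet.id, getattr(tweet, 'text', ''))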