Exemplo n.º 1
0
def query_tweets_from_user(user, limit=None):
    pos = None
    tweets = []
    try:
        while True:
           new_tweets, pos = query_single_page(query, lang='', pos=pos, from_user=True)
           if len(new_tweets) == 0:
               logger.info("Got {} tweets from username {}".format(len(tweets), user))
               return tweets

           tweets += new_tweets

           if limit and len(tweets) >= limit:
               logger.info("Got {} tweets from username {}".format(len(tweets), user))
               return tweets

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                     "so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning tweets "
                          "gathered so far.")
    logger.info("Got {} tweets from username {}.".format(
        len(tweets), user))
    return tweets
Exemplo n.º 2
0
def query_tweets_from_user(user, limit=None):
    pos = None
    tweets = []
    try:
        while True:
            new_tweets, pos = query_single_page(user,
                                                lang="",
                                                pos=pos,
                                                from_user=True)
            if len(new_tweets) == 0:
                logger.info("Got {} tweets from username {}".format(
                    len(tweets), user))
                return tweets

            tweets += new_tweets

            if limit and len(tweets) >= limit:
                logger.info("Got {} tweets from username {}".format(
                    len(tweets), user))
                return tweets

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                    "so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning tweets "
                         "gathered so far.")
    logger.info("Got {} tweets from username {}.".format(len(tweets), user))
    return tweets
Exemplo n.º 3
0
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # html response
            html = response.text or ""
            json_resp = None
        else:
            html = ""
            try:
                json_resp = json.loads(response.text)
                html = json_resp["items_html"] or ""
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                pos = urllib.parse.quote(json_resp["min_position"])
            else:
                pos = None
            return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp["min_position"])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        return query_single_page(query, lang, pos, retry - 1)

    logger.error("Giving up.")
    return [], None
Exemplo n.º 4
0
def query_single_page(url, html_response=True, retry=10, from_user=False):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """

    try:
        response = requests.get(url, headers=HEADER)
        if html_response:
            html = response.text or ''
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, urllib.parse.quote(json_resp['min_position'])

        if from_user:
            return tweets, tweets[-1].id
        else:
            return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_single_page(url, html_response, retry - 1)

    logger.error('Giving up.')
    return [], None
Exemplo n.º 5
0
def query_tweets_once_generator(query,
                                limit=None,
                                lang='',
                                pos=None,
                                dl_imgs=False):
    """
    Queries twitter for all the tweets you want! It will load all pages it gets
    from twitter. However, twitter might out of a sudden stop serving new pages,
    in that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param pos: Field used as a "checkpoint" to continue where you left off in iteration
    :return:      A list of twitterscraper.Tweet objects. You will get at least
                  ``limit`` number of items.
    """
    logger.info('Querying {}'.format(query))
    query = query.replace(' ', '%20').replace('#', '%23').replace(
        ':', '%3A').replace('&', '%26')
    num_tweets = 0
    try:
        while True:
            new_tweets, new_pos = query_single_page(query, lang, pos, dl_imgs)
            if len(new_tweets) == 0:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

            for t in new_tweets:
                yield t, pos

            # use new_pos only once you have iterated through all old tweets
            pos = new_pos

            num_tweets += len(new_tweets)

            if limit and num_tweets >= limit:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning tweets gathered '
                    'so far...')
    except BaseException:
        logger.exception('An unknown error occurred! Returning tweets '
                         'gathered so far.')
    logger.info('Got {} tweets for {}.'.format(num_tweets, query))
Exemplo n.º 6
0
def query_tweets_once_generator(query, limit=None, lang='', db=None):
    """
    Queries twitter for all the tweets you want! It will load all pages it gets
    from twitter. However, twitter might out of a sudden stop serving new pages,
    in that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param num_tweets: Number of tweets fetched outside this function.
    :return:      A list of twitterscraper.Tweet objects. You will get at least
                  ``limit`` number of items.
    """
    logger.info('Querying {}'.format(query))
    query = query.replace(' ', '%20').replace('#', '%23').replace(':', '%3A')
    pos = None
    num_tweets = 0
    try:
        while True:
            new_tweets, pos = query_single_page(
                INIT_URL.format(q=query, lang=lang) if pos is None else
                RELOAD_URL.format(q=query, pos=pos, lang=lang), pos is None)
            if len(new_tweets) == 0:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

            if db:
                db(new_tweets)
                return new_tweets[-1], pos
            else:
                for t in new_tweets:
                    yield t, pos

            num_tweets += len(new_tweets)

            if limit and num_tweets >= limit:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning tweets gathered '
                    'so far...')
    except BaseException:
        logger.exception('An unknown error occurred! Returning tweets '
                         'gathered so far.')
    logger.info('Got {} tweets for {}.'.format(num_tweets, query))
Exemplo n.º 7
0
def query_tweets_once_generator(query, limit=None, lang='', pos=None):
    """
    Queries twitter for all the tweets you want! It will load all pages it gets
    from twitter. However, twitter might out of a sudden stop serving new pages,
    in that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param pos: Field used as a "checkpoint" to continue where you left off in iteration
    :return:      A list of twitterscraper.Tweet objects. You will get at least
                  ``limit`` number of items.
    """
    logger.info('Querying {}'.format(query))
    query = query.replace(' ', '%20').replace('#', '%23').replace(':', '%3A')
    num_tweets = 0
    try:
        while True:
            new_tweets, new_pos = query_single_page(query, lang, pos)
            if len(new_tweets) == 0:
                logger.info('Got {} tweets for {}.'.format(
                    num_tweets, query))
                return

            for t in new_tweets:
                yield t, pos

            # use new_pos only once you have iterated through all old tweets
            pos = new_pos

            num_tweets += len(new_tweets)

            if limit and num_tweets >= limit:
                logger.info('Got {} tweets for {}.'.format(
                    num_tweets, query))
                return

    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning tweets gathered '
                     'so far...')
    except BaseException:
        logger.exception('An unknown error occurred! Returning tweets '
                          'gathered so far.')
    logger.info('Got {} tweets for {}.'.format(
        num_tweets, query))
Exemplo n.º 8
0
def query_user_info(user):
    """
    Returns the scraped user data from a twitter user page.

    :param user: the twitter user to web scrape its twitter page info
    """

    try:
        user_info = query_user_page(INIT_URL_USER.format(u=user))
        if user_info:
            logger.info("Got user information from username {}".format(user))
            return user_info

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning user information gathered so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning user information gathered so far...")

    logger.info("Got user information from username {}".format(user))
    return user_info
Exemplo n.º 9
0
def query_tweet_page(user, status_id):
    tweets = []
    try:
        new_tweets, pos = query_single_page(user,
                                            lang='',
                                            pos=None,
                                            from_user=True,
                                            status_id=status_id)
        if len(new_tweets) == 0:
            logger.info("Got {} tweets from user {} and status {}".format(
                len(tweets), user, status_id))
        tweets += new_tweets
        return tweets
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                    "so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning tweets "
                         "gathered so far.")
    logger.info("Got {} tweets from username {}.".format(len(tweets), user))
    return tweets
Exemplo n.º 10
0
def query_user_info(user):
    """
    Returns the scraped user data from a twitter user page.

    :param user: the twitter user to web scrape its twitter page info 
    """


    try:
        user_info = query_user_page(INIT_URL_USER.format(u=user))
        if user_info:
            logger.info(f"Got user information from username {user}")
            return user_info

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning user information gathered so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning user information gathered so far...")

    logger.info(f"Got user information from username {user}")
    return user_info             
Exemplo n.º 11
0
def query_user_page(url, retry=10):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the user page)
    :param retry: Number of retries if something goes wrong.
    :return: Returns the scraped user data from a twitter user page.
    """

    try:
        response = requests.get(url, headers=HEADER)
        html = response.text or ""

        user = User()
        user_info = user.from_html(html)
        if not user_info:
            return None

        return user_info

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        return query_user_page(url, retry - 1)

    logger.error("Giving up.")
    return None
Exemplo n.º 12
0
def query_user_page(url, retry=10, timeout=60):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the user page)
    :param retry: Number of retries if something goes wrong.
    :return: Returns the scraped user data from a twitter user page.
    """

    try:
        proxy = next(proxy_pool)
        logger.info('Using proxy {}'.format(proxy))
        response = requests.get(url, headers=HEADER, proxies={"http": proxy})
        html = response.text or ''

        user_info = User.from_html(html)
        if not user_info:
            return None

        return user_info

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_user_page(url, retry - 1)

    logger.error('Giving up.')
    return None
Exemplo n.º 13
0
def query_user_page(url, retry=10):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the user page)
    :param retry: Number of retries if something goes wrong.
    :return: Returns the scraped user data from a twitter user page.
    """

    try:
        response = requests.get(url, headers=HEADER)
        html = response.text or ''

        user = User()
        user_info = user.from_html(html)
        if not user_info:
            return None

        return user_info

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(
            e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_user_page(url, retry-1)

    logger.error('Giving up.')
    return None
Exemplo n.º 14
0
def query_single_page(query, lang, pos, retry=50, from_user=False, timeout=60):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)
    logger.info('Scraping tweets from {}'.format(url))

    try:
        proxy = next(proxy_pool)
        logger.info('Using proxy {}'.format(proxy))
        response = requests.get(url,
                                headers=HEADER,
                                proxies={"http": proxy},
                                timeout=timeout)
        if pos is None:  # html response
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            try:
                if json_resp:
                    pos = json_resp['min_position']
                    has_more_items = json_resp['has_more_items']
                    if not has_more_items:
                        logger.info("Twitter returned : 'has_more_items' ")
                        return [], None
                else:
                    pos = None
            except:
                pass
            if retry > 0:
                logger.info('Retrying... (Attempts left: {})'.format(retry))
                return query_single_page(query, lang, pos, retry - 1,
                                         from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].tweet_id
        return tweets, "TWEET-{}-{}".format(tweets[-1].tweet_id,
                                            tweets[0].tweet_id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_single_page(query, lang, pos, retry - 1)

    logger.error('Giving up.')
    return [], None
Exemplo n.º 15
0
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # html response
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception('Failed to parse JSON "{}" while requesting "{}"'.format(e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                pos = json_resp['min_position']
            else:
                pos = None
            if retry > 0:
                return query_single_page(query, lang, pos, retry - 1, from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(
            e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception('Failed to parse JSON "{}" while requesting "{}".'.format(
            e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_single_page(query, lang, pos, retry - 1)

    logger.error('Giving up.')
    return [], None