Example #1
def query_user_page(url, retry=10, timeout=60):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the user page)
    :param retry: Number of retries if something goes wrong.
    :param timeout: Timeout (in seconds) passed to requests.get.
    :return: Returns the scraped user data from a twitter user page.
    """

    try:
        proxy = next(proxy_pool)
        logger.info('Using proxy {}'.format(proxy))
        response = requests.get(url, headers=HEADER, proxies={"http": proxy},
                                timeout=timeout)
        html = response.text or ''

        user_info = User.from_html(html)
        if not user_info:
            return None

        return user_info

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_user_page(url, retry - 1, timeout)

    logger.error('Giving up.')
    return None
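
A minimal usage sketch for the query_user_page variant above (not part of the project); the import path is an assumption, and the function relies on the module-level HEADER, proxy_pool and logger shown in Example #20.

# Hypothetical usage sketch for the function above.
from twitterscraper.query import query_user_page  # assumed import path

info = query_user_page('https://twitter.com/twitter', retry=3)
if info is None:
    print('Scraping failed after all retries.')
else:
    print(info)  # the scraped User object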
Example #2
def query_tweets_from_user(user, limit=None):
    pos = None
    tweets = []
    try:
        while True:
            new_tweets, pos = query_single_page(user,
                                                lang='',
                                                pos=pos,
                                                from_user=True)
            if len(new_tweets) == 0:
                #logger.info("Got {} tweets from username {}".format(len(tweets), user))
                return tweets

            tweets += new_tweets

            if limit and len(tweets) >= limit:
                #logger.info("Got {} tweets from username {}".format(len(tweets), user))
                return tweets

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                    "so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning tweets "
                         "gathered so far.")
    # logger.info("Got {} tweets from username {}.".format(
    #     len(tweets), user))
    return tweets
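
A short, hypothetical usage sketch for query_tweets_from_user above; the import path and tweet attribute names are assumptions (the attributes match the ones read in Example #22).

# Hypothetical usage sketch: fetch roughly the first 50 tweets of one account.
from twitterscraper.query import query_tweets_from_user  # assumed import path

tweets = query_tweets_from_user('twitter', limit=50)
print('collected', len(tweets), 'tweets')
for t in tweets[:5]:
    print(t.timestamp, t.text)  # attribute names assumed from Example #22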
Example #3
def query_user_page(url, retry=10):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the user page)
    :param retry: Number of retries if something goes wrong.
    :return: Returns the scraped user data from a twitter user page.
    """

    try:
        response = requests.get(url, headers=HEADER)
        html = response.text or ""

        user = User()
        user_info = user.from_html(html)
        if not user_info:
            return None

        return user_info

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        return query_user_page(url, retry - 1)

    logger.error("Giving up.")
    return None
Example #4
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # html response
            html = response.text or ""
            json_resp = None
        else:
            html = ""
            try:
                json_resp = json.loads(response.text)
                html = json_resp["items_html"] or ""
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                pos = urllib.parse.quote(json_resp["min_position"])
            else:
                pos = None
            return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp["min_position"])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error("Giving up.")
    return [], None
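
The (tweets, pos) pair returned above is the whole pagination contract: feed pos back in to get the next page. A hedged sketch of a manual paging loop, assuming query_single_page above is in scope:

# Hypothetical paging sketch built on the (tweets, pos) return value above.
def collect_pages(query, lang='', max_pages=5):
    pos = None
    collected = []
    for _ in range(max_pages):
        tweets, pos = query_single_page(query, lang, pos)
        if not tweets:
            break
        collected.extend(tweets)
    return collected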
Example #5
File: query.py  Project: arkomz/POSEApp
def query_single_page(url, html_response=True, retry=10, from_user=False):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """

    try:
        response = requests.get(url, headers=HEADER)
        if html_response:
            html = response.text or ''
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, urllib.parse.quote(json_resp['min_position'])

        if from_user:
            return tweets, tweets[-1].id
        else:
            return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_single_page(url, html_response, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
Example #6
def query_tweets_once_generator(query,
                                limit=None,
                                lang='',
                                pos=None,
                                dl_imgs=False):
    """
    Queries twitter for all the tweets you want! It will load all pages it gets
    from twitter. However, twitter might suddenly stop serving new pages;
    in that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param pos: Field used as a "checkpoint" to continue where you left off in iteration
    :return:      A list of twitterscraper.Tweet objects. You will get at least
                  ``limit`` number of items.
    """
    logger.info('Querying {}'.format(query))
    query = query.replace(' ', '%20').replace('#', '%23').replace(
        ':', '%3A').replace('&', '%26')
    num_tweets = 0
    try:
        while True:
            new_tweets, new_pos = query_single_page(query, lang, pos, dl_imgs)
            if len(new_tweets) == 0:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

            for t in new_tweets:
                yield t, pos

            # use new_pos only once you have iterated through all old tweets
            pos = new_pos

            num_tweets += len(new_tweets)

            if limit and num_tweets >= limit:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning tweets gathered '
                    'so far...')
    except BaseException:
        logger.exception('An unknown error occurred! Returning tweets '
                         'gathered so far.')
    logger.info('Got {} tweets for {}.'.format(num_tweets, query))
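
Because each item is yielded together with pos, a caller can persist pos as a checkpoint and later resume by passing it back as the pos argument. A hedged sketch, assuming the generator above is in scope:

# Hypothetical checkpointing sketch for the generator above.
last_pos = None
collected = []
for tweet, pos in query_tweets_once_generator('python since:2019-01-01', limit=100):
    collected.append(tweet)
    last_pos = pos  # persist this value to resume later with pos=last_pos

print('got', len(collected), 'tweets; resume checkpoint:', last_pos)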
Example #7
def query_tweets(query,
                 limit=None,
                 begindate=dt.date(2006, 3, 21),
                 enddate=dt.date.today(),
                 poolsize=20,
                 lang='',
                 logging=False):
    no_days = (enddate - begindate).days

    if (no_days < 0):
        sys.exit('Begin date must occur before end date.')

    if poolsize > no_days:
        # Since we are assigning each pool a range of dates to query,
        # the number of pools should not exceed the number of dates.
        poolsize = no_days
    dateranges = [
        begindate + dt.timedelta(days=elem)
        for elem in linspace(0, no_days, poolsize + 1)
    ]

    if limit and poolsize:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    queries = [
        '{} since:{} until:{}'.format(query, since, until)
        for since, until in zip(dateranges[:-1], dateranges[1:])
    ]

    all_tweets = []
    try:
        pool = Pool(poolsize)
        #logger.info('queries: {}'.format(queries))
        try:
            for new_tweets in pool.imap_unordered(
                    partial(query_tweets_once, limit=limit_per_pool,
                            lang=lang), queries):
                all_tweets.extend(new_tweets)
                #logger.info('Got {} tweets ({} new).'.format(
                #    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logger.info('Program interrupted by user. Returning all tweets '
                        'gathered so far.')
    finally:
        pool.close()
        pool.join()

    return all_tweets
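
The per-pool queries come from an evenly spaced split of the date window. A self-contained sketch of that splitting; the linspace below is a stand-in for the module's own helper (its import is not shown in the excerpt):

# Self-contained sketch of the date-range splitting used above.
import datetime as dt

def linspace(start, stop, n):
    # n evenly spaced integer day offsets from start to stop, inclusive
    if n == 1:
        return [stop]
    step = (stop - start) / (n - 1)
    return [int(round(start + step * i)) for i in range(n)]

begindate = dt.date(2020, 1, 1)
enddate = dt.date(2020, 1, 31)
poolsize = 5
no_days = (enddate - begindate).days

dateranges = [begindate + dt.timedelta(days=d)
              for d in linspace(0, no_days, poolsize + 1)]
queries = ['{} since:{} until:{}'.format('python', since, until)
           for since, until in zip(dateranges[:-1], dateranges[1:])]
print(queries)  # five 'python since:... until:...' query strings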
Example #8
def query_tweets_from_user(user, limit=None):
    pos = None
    tweets = []
    try:
        while True:
            new_tweets, pos = query_single_page(user, lang='', pos=pos, from_user=True)
            if len(new_tweets) == 0:
                logger.info("Got {} tweets from username {}".format(len(tweets), user))
                return tweets

            tweets += new_tweets

            if limit and len(tweets) >= limit:
                logger.info("Got {} tweets from username {}".format(len(tweets), user))
                return tweets

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                     "so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning tweets "
                          "gathered so far.")
    logger.info("Got {} tweets from username {}.".format(
        len(tweets), user))
    return tweets
Example #9
def query_tweets_once_generator(query, limit=None, lang='', db=None):
    """
    Queries twitter for all the tweets you want! It will load all pages it gets
    from twitter. However, twitter might suddenly stop serving new pages;
    in that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param db: Optional callable that stores scraped tweets; when given, only the first batch is fetched.
    :return:      A list of twitterscraper.Tweet objects. You will get at least
                  ``limit`` number of items.
    """
    logger.info('Querying {}'.format(query))
    query = query.replace(' ', '%20').replace('#', '%23').replace(':', '%3A')
    pos = None
    num_tweets = 0
    try:
        while True:
            new_tweets, pos = query_single_page(
                INIT_URL.format(q=query, lang=lang) if pos is None else
                RELOAD_URL.format(q=query, pos=pos, lang=lang), pos is None)
            if len(new_tweets) == 0:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

            if db:
                db(new_tweets)
                return new_tweets[-1], pos
            else:
                for t in new_tweets:
                    yield t, pos

            num_tweets += len(new_tweets)

            if limit and num_tweets >= limit:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning tweets gathered '
                    'so far...')
    except BaseException:
        logger.exception('An unknown error occurred! Returning tweets '
                         'gathered so far.')
    logger.info('Got {} tweets for {}.'.format(num_tweets, query))
Example #10
def query_tweets_once_generator(query, limit=None, lang='', pos=None):
    """
    Queries twitter for all the tweets you want! It will load all pages it gets
    from twitter. However, twitter might suddenly stop serving new pages;
    in that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param pos: Field used as a "checkpoint" to continue where you left off in iteration
    :return:      A list of twitterscraper.Tweet objects. You will get at least
                  ``limit`` number of items.
    """
    logger.info('Querying {}'.format(query))
    query = query.replace(' ', '%20').replace('#', '%23').replace(':', '%3A')
    num_tweets = 0
    try:
        while True:
            new_tweets, new_pos = query_single_page(query, lang, pos)
            if len(new_tweets) == 0:
                logger.info('Got {} tweets for {}.'.format(
                    num_tweets, query))
                return

            for t in new_tweets:
                yield t, pos

            # use new_pos only once you have iterated through all old tweets
            pos = new_pos

            num_tweets += len(new_tweets)

            if limit and num_tweets >= limit:
                logger.info('Got {} tweets for {}.'.format(
                    num_tweets, query))
                return

    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning tweets gathered '
                     'so far...')
    except BaseException:
        logger.exception('An unknown error occurred! Returning tweets '
                          'gathered so far.')
    logger.info('Got {} tweets for {}.'.format(
        num_tweets, query))
Example #11
def download_tw(tweet: Tweet, user_dir: str):
    img_url: str
    for img_url in tweet.img_urls:
        img_file = user_dir + img_url[img_url.rindex('/'):]
        if not os.path.exists(img_file):
            retry = 5
            while retry > 0:
                try:
                    logger.info("download " + img_url + ", retry = " +
                                str(retry))
                    request.urlretrieve(img_url, img_file)
                    break
                except URLError:
                    retry = retry - 1
                    pass
Example #12
def download_all_images(tweets, output_path, username=None, size="orig"):
    if username:
        root_dir = os.path.join(output_path or '', username)
        create_directory(root_dir)

    for t in tweets:
        for img_url in t.img_urls:
            date = dt.datetime.fromtimestamp(t.timestamp_epochs)

            if not username:
                root_dir = os.path.join(output_path or '', t.screen_name)
                create_directory(root_dir)

            final_path = root_dir
            # Create Subfolders for Years and Months
            final_path = os.path.join(final_path, date.strftime("%Y"))
            create_directory(final_path)
            final_path = os.path.join(final_path, date.strftime("%m"))
            create_directory(final_path)

            #if is_retweet:
            # Create Subfolder to separate any Retweets
            #final_path = os.path.join(final_path, "Retweets")
            #create_directory(final_path)

            timestamp = date.strftime("%Y-%m-%d_") + size + "_"

            #if is_retweet:
            #timestamp = "RT_" + timestamp

            r = requests.get(img_url + ':' + size, stream=True)
            base_name = timestamp + os.path.basename(img_url)
            filename = os.path.join(final_path or '', base_name)

            with open(filename, 'wb') as fd:
                for chunk in r.iter_content(chunk_size=1024):
                    fd.write(chunk)
            logger.info(filename)
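
The year/month folder layout above reduces to strftime-based path building. A self-contained sketch of how a single image path is derived (values are examples only):

# Self-contained sketch of the filename/subfolder scheme used above.
import datetime as dt
import os

timestamp_epochs = 1577880000  # example epoch (2020-01-01 12:00 UTC)
img_url = 'https://pbs.twimg.com/media/ABCDEF.jpg'
size = 'orig'

date = dt.datetime.fromtimestamp(timestamp_epochs)
final_path = os.path.join('output', 'some_user',
                          date.strftime('%Y'), date.strftime('%m'))
base_name = date.strftime('%Y-%m-%d_') + size + '_' + os.path.basename(img_url)
print(os.path.join(final_path, base_name))
# e.g. output/some_user/2020/01/2020-01-01_orig_ABCDEF.jpg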
Example #13
def query_user_info(user):
    """
    Returns the scraped user data from a twitter user page.

    :param user: the twitter user whose profile page should be scraped
    """

    try:
        user_info = query_user_page(INIT_URL_USER.format(u=user))
        if user_info:
            logger.info(f"Got user information from username {user}")
            return user_info

    except KeyboardInterrupt:
        logger.info(
            "Program interrupted by user. Returning user information gathered so far..."
        )
    except BaseException:
        logger.exception(
            "An unknown error occurred! Returning user information gathered so far..."
        )
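
A hypothetical usage sketch for query_user_info above; the import path is assumed, and the attribute names mirror those read in the --profiles branch of Example #22.

# Hypothetical usage sketch for the function above.
from twitterscraper.query import query_user_info  # assumed import path

profile = query_user_info('twitter')
if profile is not None:
    print(profile.user, profile.followers, profile.date_joined)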
Example #14
def query_tweets(query, limit=None, begindate=dt.date(2006, 3, 21), enddate=dt.date.today(), poolsize=20, lang=''):
    no_days = (enddate - begindate).days
    if poolsize > no_days:
        # Since we are assigning each pool a range of dates to query,
        # the number of pools should not exceed the number of dates.
        poolsize = no_days
    dateranges = [begindate + dt.timedelta(days=elem) for elem in linspace(0, no_days, poolsize+1)]

    if limit:
        limit_per_pool = (limit // poolsize)+1
    else:
        limit_per_pool = None

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    try:
        pool = Pool(poolsize)
        logger.info('queries: {}'.format(queries))
        try:
            for new_tweets in pool.imap_unordered(partial(query_tweets_once, limit=limit_per_pool, lang=lang), queries):
                all_tweets.extend(new_tweets)
                logger.info('Got {} tweets ({} new).'.format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logger.info('Program interrupted by user. Returning all tweets '
                         'gathered so far.')
    finally:
        pool.close()
        pool.join()

    return all_tweets
Example #15
def main():
    logger.info({'Hello world': '1'})
    try:
        config_path = os.path.split(
            os.path.realpath(__file__))[0] + os.sep + 'config.json'
        if not os.path.isfile(config_path):
            sys.exit(u'Config file config.json does not exist in current path: %s' %
                     (os.path.split(os.path.realpath(__file__))[0] + os.sep))
        with open(config_path) as f:
            config = json.loads(f.read())
        validate_config(config)
        print('hello world')
        user_id_list = config['user_id_list']
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            user_id_list = get_user_list(user_id_list)
        for user in user_id_list:
            print(user)
            list_of_tweets = query_tweets_from_user(user, 10)
            outPutFileName = get_filepath(user, 'data') + '.csv'
            with open(outPutFileName, "w", encoding="utf-8") as output:
                writer = csv.writer(output)
                writer.writerow(["text_html", "img_url", "video_url", "links"])
                for t in list_of_tweets:
                    writer.writerow(
                        [t.text_html, t.img_urls, t.video_url, t.links])
                    for imgUrl in t.img_urls:
                        download_one_file(user, 'img', imgUrl)
                    for videoUrl in t.video_url:
                        download_one_file(user, 'video', videoUrl)
    except ValueError:
        print('config.json is not formatted correctly')
    except Exception as e:
        print('Error: ', e)
        traceback.print_exc()
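
Only the user_id_list key of config.json is visible in this excerpt (an inline list of usernames or a path to a text file); validate_config may require more. A hedged sketch that writes the simplest assumed form:

# Hypothetical minimal config.json for the main() above; additional keys that
# validate_config might require are not shown here.
import json

config = {"user_id_list": ["twitter", "github"]}  # or a path string to a username list file
with open("config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, ensure_ascii=False, indent=4)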
Example #16
def query_user_page(url, retry=10):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the user page)
    :param retry: Number of retries if something goes wrong.
    :return: Returns the scraped user data from a twitter user page.
    """

    try:
        response = requests.get(url, headers=HEADER)
        html = response.text or ''

        user = User()
        user_info = user.from_html(html)
        if not user_info:
            return None

        return user_info

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(
            e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_user_page(url, retry-1)

    logger.error('Giving up.')
    return None
Example #17
def query_tweet_page(user, status_id):
    tweets = []
    try:
        new_tweets, pos = query_single_page(user,
                                            lang='',
                                            pos=None,
                                            from_user=True,
                                            status_id=status_id)
        if len(new_tweets) == 0:
            logger.info("Got {} tweets from user {} and status {}".format(
                len(tweets), user, status_id))
        tweets += new_tweets
        return tweets
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                    "so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning tweets "
                         "gathered so far.")
    logger.info("Got {} tweets from username {}.".format(len(tweets), user))
    return tweets
Example #18
def query_user_info(user):
    """
    Returns the scraped user data from a twitter user page.

    :param user: the twitter user whose profile page should be scraped
    """


    user_info = None

    try:
        user_info = query_user_page(INIT_URL_USER.format(u=user))
        if user_info:
            logger.info(f"Got user information from username {user}")
            return user_info

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning user information gathered so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning user information gathered so far...")

    logger.info(f"Got user information from username {user}")
    return user_info
Example #19
def query_single_page(query, lang, pos, retry=50, from_user=False, timeout=60):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)
    logger.info('Scraping tweets from {}'.format(url))

    try:
        proxy = next(proxy_pool)
        logger.info('Using proxy {}'.format(proxy))
        response = requests.get(url,
                                headers=HEADER,
                                proxies={"http": proxy},
                                timeout=timeout)
        if pos is None:  # html response
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(
                        e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            try:
                if json_resp:
                    pos = json_resp['min_position']
                    has_more_items = json_resp['has_more_items']
                    if not has_more_items:
                        logger.info("Twitter returned : 'has_more_items' ")
                        return [], None
                else:
                    pos = None
            except:
                pass
            if retry > 0:
                logger.info('Retrying... (Attempts left: {})'.format(retry))
                return query_single_page(query, lang, pos, retry - 1,
                                         from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].tweet_id
        return tweets, "TWEET-{}-{}".format(tweets[-1].tweet_id,
                                            tweets[0].tweet_id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
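
The JSON branch above only reads three keys of the timeline payload: items_html, min_position and has_more_items. A self-contained sketch of that consumption against a mock payload (not a real response):

# Self-contained sketch of the JSON keys consumed above, using a mock payload.
import json
import urllib.parse

mock_response_text = json.dumps({
    'items_html': '<li class="stream-item">...</li>',
    'min_position': 'TWEET-123-456',
    'has_more_items': True,
})

json_resp = json.loads(mock_response_text)
html = json_resp['items_html'] or ''
next_pos = urllib.parse.quote(json_resp['min_position'])
print(bool(html), next_pos, json_resp['has_more_items'])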
Example #20
from twitterscraper.ts_logger import logger
from twitterscraper.user import User

#from fake_useragent import UserAgent
#ua = UserAgent()
#HEADER = {'User-Agent': ua.random}
HEADERS_LIST = [
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
    'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
    'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
    'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre'
]

HEADER = {'User-Agent': random.choice(HEADERS_LIST)}
logger.info(HEADER)

INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q={q}&l={lang}'
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' \
             'default&include_available_features=1&include_entities=1&' \
             'reset_error_state=false&src=typd&max_position={pos}&q={q}&l={lang}'
INIT_URL_USER = '******'
RELOAD_URL_USER = '******' \
                  'include_available_features=1&include_entities=1&' \
                  'max_position={pos}&reset_error_state=false'
PROXY_URL = 'https://free-proxy-list.net/'


def get_proxies():
    response = requests.get(PROXY_URL)
    soup = BeautifulSoup(response.text, 'lxml')
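
The proxy_pool consumed with next(proxy_pool) in Examples #1 and #19 is most naturally an endless iterator over the scraped proxies. A hedged sketch using itertools.cycle; the parsing inside get_proxies is not reproduced because the excerpt is cut off, and the addresses below are placeholders:

# Hypothetical sketch of building proxy_pool so that next(proxy_pool) never runs out.
from itertools import cycle

proxies = ['1.2.3.4:8080', '5.6.7.8:3128']  # e.g. the result of get_proxies()
proxy_pool = cycle(proxies)

for _ in range(5):
    print(next(proxy_pool))  # cycles 1.2.3.4:8080, 5.6.7.8:3128, 1.2.3.4:8080, ...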
Example #21
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # html response
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception('Failed to parse JSON "{}" while requesting "{}"'.format(e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                pos = json_resp['min_position']
            else:
                pos = None
            if retry > 0:
                return query_single_page(query, lang, pos, retry - 1, from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(
            e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(
            e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception('Failed to parse JSON "{}" while requesting "{}".'.format(
            e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
Example #22
def main():
    try:
        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description=__doc__
        )

        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o", "--output", type=str, default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                                 "tweets to.")
        parser.add_argument("-l", "--limit", type=int, default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument("-a", "--all", action='store_true',
                            help="Set this flag if you want to get all tweets "
                                 "in the history of twitter. Begindate is set to 2006-03-01."
                                 "This may take a while. You can increase the number of parallel"
                                 "processes depending on the computational power you have.")
        parser.add_argument("-c", "--csv", action='store_true',
                                help="Set this flag if you want to save the results to a CSV format.")
        parser.add_argument("-u", "--user", action='store_true',
                            help="Set this flag to if you want to scrape tweets from a specific user"
                                 "The query should then consist of the profilename (user) you want to scrape without @")
        parser.add_argument("--profiles", action='store_true',
                            help="Set this flag to if you want to scrape profile info of all the users where you" 
                            "have previously scraped from. After all of the tweets have been scraped it will start"
                            "a new process of scraping profile pages.")
        parser.add_argument("--lang", type=str, default=None,
                            help="Set this flag if you want to query tweets in \na specific language. You can choose from:\n"
                                 "en (English)\nar (Arabic)\nbn (Bengali)\n"
                                 "cs (Czech)\nda (Danish)\nde (German)\nel (Greek)\nes (Spanish)\n"
                                 "fa (Persian)\nfi (Finnish)\nfil (Filipino)\nfr (French)\n"
                                 "he (Hebrew)\nhi (Hindi)\nhu (Hungarian)\n"
                                 "id (Indonesian)\nit (Italian)\nja (Japanese)\n"
                                 "ko (Korean)\nmsa (Malay)\nnl (Dutch)\n"
                                 "no (Norwegian)\npl (Polish)\npt (Portuguese)\n"
                                 "ro (Romanian)\nru (Russian)\nsv (Swedish)\n"
                                 "th (Thai)\ntr (Turkish)\nuk (Ukranian)\n"
                                 "ur (Urdu)\nvi (Vietnamese)\n"
                                 "zh-cn (Chinese Simplified)\n"
                                 "zh-tw (Chinese Traditional)"
                                 )
        parser.add_argument("-d", "--dump", action="store_true",
                            help="Set this flag if you want to dump the tweets \nto the console rather than outputting to a file")
        parser.add_argument("-bd", "--begindate", type=valid_date, default="2006-03-21",
                            help="Scrape for tweets starting from this date. Format YYYY-MM-DD. \nDefault value is 2006-03-21", metavar='\b')
        parser.add_argument("-ed", "--enddate", type=valid_date, default=dt.date.today(),
                            help="Scrape for tweets until this date. Format YYYY-MM-DD. \nDefault value is the date of today.", metavar='\b')
        parser.add_argument("-p", "--poolsize", type=int, default=20, help="Specify the number of parallel process you want to run. \n"
                            "Default value is set to 20. \nYou can change this number if you have more computing power available. \n"
                            "Set to 1 if you dont want to run any parallel processes.", metavar='\b')
        args = parser.parse_args()

        if isfile(args.output) and not args.dump:
            logger.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            args.begindate = dt.date(2006,3,1)
            args.enddate = dt.date.today()

        if args.user:
            tweets = query_tweets_from_user(user = args.query, limit = args.limit)
        else:
            tweets = query_tweets(query = args.query, limit = args.limit,
                              begindate = args.begindate, enddate = args.enddate,
                              poolsize = args.poolsize, lang = args.lang)

        if args.dump:
            print(json.dumps(tweets, cls=JSONEncoder))
        else:
            if tweets:
                with open(args.output, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output)
                        f.writerow(["user", "fullname", "tweet-id", "timestamp", "url", "likes", "replies", "retweets", "text", "html"])
                        for x in tweets:
                            f.writerow([x.user, x.fullname, x.id, x.timestamp, x.url,
                                        x.likes, x.replies, x.retweets,
                                        x.text, x.html])
                    else:
                        json.dump(tweets, output, cls=JSONEncoder)

            if args.profiles and tweets:
                list_users = list(set([tweet.user for tweet in tweets]))
                # list_users_info = [query_user_info(elem) for elem in list_users]
                filename = 'userprofiles_' + args.output

                with open(filename, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output)
                        f.writerow(["user","fullname","location","blog","date_joined","id","num_tweets","following","followers","likes","lists"])
                        for elem in list_users:
                            u = query_user_info(elem)
                            if u is None:
                                continue
                            else:
                                f.writerow([u.user, u.full_name, u.location, u.blog, u.date_joined, u.id, u.tweets, u.following,
                                u.followers, u.likes, u.lists])

                    else:
                        for elem in list_users:
                            u = query_user_info(elem)
                            if u is None:
                                continue
                            else:
                                json.dump(u, output, cls=JSONEncoder, indent=2)

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Quitting...")
Example #23
def main():
    try:
        parser = argparse.ArgumentParser(
            formatter_class=argparse.RawTextHelpFormatter, description=__doc__)

        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o",
                            "--output",
                            type=str,
                            default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                            "tweets to.")
        parser.add_argument("-l",
                            "--limit",
                            type=int,
                            default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument(
            "-a",
            "--all",
            action='store_true',
            help="Set this flag if you want to get all tweets "
            "in the history of twitter. Begindate is set to 2006-03-01."
            "This may take a while. You can increase the number of parallel"
            "processes depending on the computational power you have.")
        parser.add_argument(
            "-c",
            "--csv",
            action='store_true',
            help=
            "Set this flag if you want to save the results to a CSV format.")
        parser.add_argument(
            "-u",
            "--user",
            action='store_true',
            help=
            "Set this flag to if you want to scrape tweets from a specific user"
            "The query should then consist of the profilename you want to scrape without @"
        )
        parser.add_argument(
            "--profiles",
            action='store_true',
            help=
            "Set this flag to if you want to scrape profile info of all the users where you"
            "have previously scraped from. After all of the tweets have been scraped it will start"
            "a new process of scraping profile pages.")
        parser.add_argument(
            "--lang",
            type=str,
            default=None,
            help=
            "Set this flag if you want to query tweets in \na specific language. You can choose from:\n"
            "en (English)\nar (Arabic)\nbn (Bengali)\n"
            "cs (Czech)\nda (Danish)\nde (German)\nel (Greek)\nes (Spanish)\n"
            "fa (Persian)\nfi (Finnish)\nfil (Filipino)\nfr (French)\n"
            "he (Hebrew)\nhi (Hindi)\nhu (Hungarian)\n"
            "id (Indonesian)\nit (Italian)\nja (Japanese)\n"
            "ko (Korean)\nmsa (Malay)\nnl (Dutch)\n"
            "no (Norwegian)\npl (Polish)\npt (Portuguese)\n"
            "ro (Romanian)\nru (Russian)\nsv (Swedish)\n"
            "th (Thai)\ntr (Turkish)\nuk (Ukranian)\n"
            "ur (Urdu)\nvi (Vietnamese)\n"
            "zh-cn (Chinese Simplified)\n"
            "zh-tw (Chinese Traditional)")
        parser.add_argument(
            "-d",
            "--dump",
            action="store_true",
            help=
            "Set this flag if you want to dump the tweets \nto the console rather than outputting to a file"
        )
        parser.add_argument(
            "-ow",
            "--overwrite",
            action="store_true",
            help=
            "Set this flag if you want to overwrite the existing output file.")
        parser.add_argument(
            "-bd",
            "--begindate",
            type=valid_date,
            default="2006-03-21",
            help=
            "Scrape for tweets starting from this date. Format YYYY-MM-DD. \nDefault value is 2006-03-21",
            metavar='\b')
        parser.add_argument(
            "-ed",
            "--enddate",
            type=valid_date,
            default=dt.date.today(),
            help=
            "Scrape for tweets until this date. Format YYYY-MM-DD. \nDefault value is the date of today.",
            metavar='\b')
        parser.add_argument(
            "-p",
            "--poolsize",
            type=int,
            default=20,
            help="Specify the number of parallel process you want to run. \n"
            "Default value is set to 20. \nYou can change this number if you have more computing power available. \n"
            "Set to 1 if you dont want to run any parallel processes.",
            metavar='\b')
        parser.add_argument(
            "-i",
            "--images",
            action="store_true",
            help=
            "Set this flag if you want to download all images from the query.")
        parser.add_argument(
            "-io",
            "--imagesoutput",
            type=str,
            default="./",
            help=
            "The path to the folder to download all of the images to, using -i."
        )
        parser.add_argument(
            "-ex",
            "--onlymedia",
            action="store_true",
            help="Set this flag if you want exclude tweets without media.")
        args = parser.parse_args()

        if isfile(args.output) and not args.dump and not args.overwrite:
            logger.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            args.begindate = dt.date(2006, 3, 1)

        if args.user:
            tweets = query_tweets_from_user(user=args.query,
                                            limit=args.limit,
                                            dl_imgs=args.onlymedia)
        else:
            tweets = query_tweets(query=args.query,
                                  limit=args.limit,
                                  begindate=args.begindate,
                                  enddate=args.enddate,
                                  poolsize=args.poolsize,
                                  lang=args.lang,
                                  dl_imgs=args.onlymedia)

        if args.dump:
            pprint([tweet.__dict__ for tweet in tweets])
        else:
            if tweets:
                with open(args.output, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output,
                                       delimiter=";",
                                       quoting=csv.QUOTE_NONNUMERIC)
                        f.writerow([
                            "screen_name", "username", "user_id", "tweet_id",
                            "tweet_url", "timestamp", "timestamp_epochs",
                            "text", "text_html", "links", "hashtags",
                            "has_media", "img_urls", "video_url", "likes",
                            "retweets", "replies", "is_replied", "is_reply_to",
                            "parent_tweet_id", "reply_to_users"
                        ])
                        for t in tweets:
                            f.writerow([
                                t.screen_name, t.username, t.user_id,
                                t.tweet_id, t.tweet_url, t.timestamp,
                                t.timestamp_epochs, t.text, t.text_html,
                                t.links, t.hashtags, t.has_media, t.img_urls,
                                t.video_url, t.likes, t.retweets, t.replies,
                                t.is_replied, t.is_reply_to, t.parent_tweet_id,
                                t.reply_to_users
                            ])

                        if args.images:
                            if args.user:
                                download_all_images(tweets,
                                                    args.imagesoutput,
                                                    username=args.query)
                            else:
                                download_all_images(tweets, args.imagesoutput)
                    else:
                        if args.images:
                            if args.user:
                                download_all_images(tweets,
                                                    args.imagesoutput,
                                                    username=args.query)
                            else:
                                download_all_images(tweets, args.imagesoutput)
                        json.dump(tweets, output, cls=JSONEncoder)
            if args.profiles and tweets:
                list_users = list(set([tweet.username for tweet in tweets]))
                list_users_info = [
                    query_user_info(elem) for elem in list_users
                ]
                filename = 'userprofiles_' + args.output

                if args.images:
                    download_all_images(list_users_info, args.imagesoutput)

                with open(filename, "w", encoding="utf-8") as output:
                    json.dump(list_users_info, output, cls=JSONEncoder)
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Quitting...")
Example #24
def main():
    try:
        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
            description=__doc__
        )

        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o", "--output", type=str, default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                                 "tweets to.")
        parser.add_argument("-l", "--limit", type=int, default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument("-a", "--all", action='store_true',
                            help="Set this flag if you want to get all tweets "
                                 "in the history of twitter. Begindate is set to 2006-03-01."
                                 "This may take a while. You can increase the number of parallel"
                                 "processes depending on the computational power you have.")
        parser.add_argument("-c", "--csv", action='store_true',
                                help="Set this flag if you want to save the results to a CSV format.")
        parser.add_argument("-u", "--user", action='store_true',
                            help="Set this flag to if you want to scrape tweets from a specific user"
                                 "The query should then consist of the profilename you want to scrape without @")
        parser.add_argument("--lang", type=str, default=None,
                            help="Set this flag if you want to query tweets in \na specific language. You can choose from:\n"
                                 "en (English)\nar (Arabic)\nbn (Bengali)\n"
                                 "cs (Czech)\nda (Danish)\nde (German)\nel (Greek)\nes (Spanish)\n"
                                 "fa (Persian)\nfi (Finnish)\nfil (Filipino)\nfr (French)\n"
                                 "he (Hebrew)\nhi (Hindi)\nhu (Hungarian)\n"
                                 "id (Indonesian)\nit (Italian)\nja (Japanese)\n"
                                 "ko (Korean)\nmsa (Malay)\nnl (Dutch)\n"
                                 "no (Norwegian)\npl (Polish)\npt (Portuguese)\n"
                                 "ro (Romanian)\nru (Russian)\nsv (Swedish)\n"
                                 "th (Thai)\ntr (Turkish)\nuk (Ukranian)\n"
                                 "ur (Urdu)\nvi (Vietnamese)\n"
                                 "zh-cn (Chinese Simplified)\n"
                                 "zh-tw (Chinese Traditional)"
                                 )
        parser.add_argument("-d", "--dump", action="store_true",
                            help="Set this flag if you want to dump the tweets \nto the console rather than outputting to a file")
        parser.add_argument("-bd", "--begindate", type=valid_date, default="2006-03-21",
                            help="Scrape for tweets starting from this date. Format YYYY-MM-DD. \nDefault value is 2006-03-21", metavar='\b')
        parser.add_argument("-ed", "--enddate", type=valid_date, default=dt.date.today(),
                            help="Scrape for tweets until this date. Format YYYY-MM-DD. \nDefault value is the date of today.", metavar='\b')
        parser.add_argument("-p", "--poolsize", type=int, default=20, help="Specify the number of parallel process you want to run. \n"
                            "Default value is set to 20. \nYou can change this number if you have more computing power available. \n"
                            "Set to 1 if you dont want to run any parallel processes.", metavar='\b')
        args = parser.parse_args()

        if isfile(args.output) and not args.dump:
            logger.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            args.begindate = dt.date(2006,3,1)

        if args.user:
            tweets = query_tweets_from_user(user = args.query, limit = args.limit)
        else:
            tweets = query_tweets(query = args.query, limit = args.limit,
                              begindate = args.begindate, enddate = args.enddate,
                              poolsize = args.poolsize, lang = args.lang)

        if args.dump:
            print(json.dumps(tweets, cls=JSONEncoder))
        else:
            if tweets:
                with open(args.output, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output)
                        f.writerow(["user", "fullname", "tweet-id", "timestamp", "url", "likes", "replies", "retweets", "text", "html"])
                        for x in tweets:
                            f.writerow([x.user, x.fullname, x.id, x.timestamp, x.url,
                                        x.likes, x.replies, x.retweets,
                                        x.text, x.html])
                    else:
                        json.dump(tweets, output, cls=JSONEncoder)
    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Quitting...")