def query_user_page(url, retry=10, timeout=60):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the user page)
    :param retry: Number of retries if something goes wrong.
    :param timeout: Request timeout in seconds.
    :return: Returns the scraped user data from a twitter user page.
    """
    try:
        proxy = next(proxy_pool)
        logger.info('Using proxy {}'.format(proxy))
        response = requests.get(url, headers=HEADER, proxies={"http": proxy},
                                timeout=timeout)
        html = response.text or ''

        user_info = User.from_html(html)
        if not user_info:
            return None

        return user_info

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        # propagate the timeout so retries behave like the first attempt
        return query_user_page(url, retry - 1, timeout)

    logger.error('Giving up.')
    return None
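# Sketch: how the proxy_pool consumed above (via next(proxy_pool)) could be
# built. It assumes get_proxies(), defined in the excerpt alongside PROXY_URL
# further below, finishes by returning a list of "ip:port" strings scraped
# from the proxy listing page; itertools.cycle turns that list into an
# endless round-robin iterator so every request can pick a fresh proxy.
from itertools import cycle

proxies = get_proxies()
logger.info('Using {} proxies'.format(len(proxies)))
proxy_pool = cycle(proxies)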
def query_tweets_from_user(user, limit=None):
    pos = None
    tweets = []
    try:
        while True:
            # pass the username as the query; `query` was an undefined name here
            new_tweets, pos = query_single_page(user, lang='', pos=pos, from_user=True)
            if len(new_tweets) == 0:
                #logger.info("Got {} tweets from username {}".format(len(tweets), user))
                return tweets

            tweets += new_tweets

            if limit and len(tweets) >= limit:
                #logger.info("Got {} tweets from username {}".format(len(tweets), user))
                return tweets

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                    "so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning tweets "
                         "gathered so far.")

    # logger.info("Got {} tweets from username {}.".format(
    #     len(tweets), user))
    return tweets
def query_user_page(url, retry=10):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the user page)
    :param retry: Number of retries if something goes wrong.
    :return: Returns the scraped user data from a twitter user page.
    """
    try:
        response = requests.get(url, headers=HEADER)
        html = response.text or ""

        user = User()
        user_info = user.from_html(html)
        if not user_info:
            return None

        return user_info

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        return query_user_page(url, retry - 1)

    logger.error("Giving up.")
    return None
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # html response
            html = response.text or ""
            json_resp = None
        else:
            html = ""
            try:
                json_resp = json.loads(response.text)
                html = json_resp["items_html"] or ""
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                pos = urllib.parse.quote(json_resp["min_position"])
            else:
                pos = None
            return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp["min_position"])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info("Retrying... (Attempts left: {})".format(retry))
        # keep from_user on retries so the returned pos format stays consistent
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error("Giving up.")
    return [], None
def query_single_page(url, html_response=True, retry=10, from_user=False):
    """
    Returns tweets from the given URL.

    :param url: The URL to get the tweets from
    :param html_response: False, if the HTML is embedded in a JSON
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    try:
        response = requests.get(url, headers=HEADER)
        if html_response:
            html = response.text or ''
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            return [], None

        if not html_response:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].id
        else:
            return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        # keep from_user on retries so the returned pos format stays consistent
        return query_single_page(url, html_response, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
def query_tweets_once_generator(query, limit=None, lang='', pos=None, dl_imgs=False):
    """
    Queries twitter for all the tweets you want! It will load all pages it gets
    from twitter. However, twitter might all of a sudden stop serving new pages;
    in that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param pos:   Field used as a "checkpoint" to continue where you left off in iteration
    :return:      A list of twitterscraper.Tweet objects. You will get at least
                  ``limit`` number of items.
    """
    logger.info('Querying {}'.format(query))
    query = query.replace(' ', '%20').replace('#', '%23').replace(
        ':', '%3A').replace('&', '%26')
    num_tweets = 0
    try:
        while True:
            new_tweets, new_pos = query_single_page(query, lang, pos, dl_imgs)
            if len(new_tweets) == 0:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

            for t in new_tweets:
                yield t, pos

            # use new_pos only once you have iterated through all old tweets
            pos = new_pos
            num_tweets += len(new_tweets)

            if limit and num_tweets >= limit:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning tweets gathered '
                    'so far...')
    except BaseException:
        logger.exception('An unknown error occurred! Returning tweets '
                         'gathered so far.')

    logger.info('Got {} tweets for {}.'.format(num_tweets, query))
def query_tweets(query, limit=None, begindate=dt.date(2006, 3, 21),
                 enddate=dt.date.today(), poolsize=20, lang='', logging=False):
    no_days = (enddate - begindate).days

    if (no_days < 0):
        sys.exit('Begin date must occur before end date.')

    if poolsize > no_days:
        # Since we are assigning each pool a range of dates to query,
        # the number of pools should not exceed the number of dates.
        poolsize = no_days
    dateranges = [
        begindate + dt.timedelta(days=elem)
        for elem in linspace(0, no_days, poolsize + 1)
    ]

    if limit and poolsize:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    queries = [
        '{} since:{} until:{}'.format(query, since, until)
        for since, until in zip(dateranges[:-1], dateranges[1:])
    ]

    all_tweets = []
    try:
        pool = Pool(poolsize)
        #logger.info('queries: {}'.format(queries))
        try:
            for new_tweets in pool.imap_unordered(
                    partial(query_tweets_once, limit=limit_per_pool, lang=lang),
                    queries):
                all_tweets.extend(new_tweets)
                #logger.info('Got {} tweets ({} new).'.format(
                #    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logger.info('Program interrupted by user. Returning all tweets '
                        'gathered so far.')
    finally:
        pool.close()
        pool.join()

    return all_tweets
def query_tweets_from_user(user, limit=None):
    pos = None
    tweets = []
    try:
        while True:
            # pass the username as the query; `query` was an undefined name here
            new_tweets, pos = query_single_page(user, lang='', pos=pos, from_user=True)
            if len(new_tweets) == 0:
                logger.info("Got {} tweets from username {}".format(len(tweets), user))
                return tweets

            tweets += new_tweets

            if limit and len(tweets) >= limit:
                logger.info("Got {} tweets from username {}".format(len(tweets), user))
                return tweets

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                    "so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning tweets "
                         "gathered so far.")

    logger.info("Got {} tweets from username {}.".format(
        len(tweets), user))
    return tweets
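# Example (sketch): gathering a user's timeline with query_tweets_from_user.
# The handle is a placeholder; the attributes printed (timestamp, text) are
# the same ones the CSV writers further below rely on.
def example_scrape_user(username='some_twitter_user'):
    tweets = query_tweets_from_user(username, limit=100)
    for tweet in tweets[:5]:
        print(tweet.timestamp, tweet.text[:80])
    return tweets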
def query_tweets_once_generator(query, limit=None, lang='', db=None):
    """
    Queries twitter for all the tweets you want! It will load all pages it gets
    from twitter. However, twitter might all of a sudden stop serving new pages;
    in that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param db:    Optional callback that is handed each batch of new tweets
                  (e.g. to persist them); when given, scraping stops after the
                  first page.
    :return:      A list of twitterscraper.Tweet objects. You will get at least
                  ``limit`` number of items.
    """
    logger.info('Querying {}'.format(query))
    query = query.replace(' ', '%20').replace('#', '%23').replace(':', '%3A')
    pos = None
    num_tweets = 0
    try:
        while True:
            new_tweets, pos = query_single_page(
                INIT_URL.format(q=query, lang=lang) if pos is None
                else RELOAD_URL.format(q=query, pos=pos, lang=lang),
                pos is None)
            if len(new_tweets) == 0:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

            if db:
                db(new_tweets)
                return new_tweets[-1], pos
            else:
                for t in new_tweets:
                    yield t, pos

            num_tweets += len(new_tweets)

            if limit and num_tweets >= limit:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning tweets gathered '
                    'so far...')
    except BaseException:
        logger.exception('An unknown error occurred! Returning tweets '
                         'gathered so far.')

    logger.info('Got {} tweets for {}.'.format(num_tweets, query))
def query_tweets_once_generator(query, limit=None, lang='', pos=None):
    """
    Queries twitter for all the tweets you want! It will load all pages it gets
    from twitter. However, twitter might all of a sudden stop serving new pages;
    in that case, use the `query_tweets` method.

    Note that this function catches the KeyboardInterrupt so it can return
    tweets on incomplete queries if the user decides to abort.

    :param query: Any advanced query you want to do! Compile it at
                  https://twitter.com/search-advanced and just copy the query!
    :param limit: Scraping will be stopped when at least ``limit`` number of
                  items are fetched.
    :param pos:   Field used as a "checkpoint" to continue where you left off in iteration
    :return:      A list of twitterscraper.Tweet objects. You will get at least
                  ``limit`` number of items.
    """
    logger.info('Querying {}'.format(query))
    query = query.replace(' ', '%20').replace('#', '%23').replace(':', '%3A')
    num_tweets = 0
    try:
        while True:
            new_tweets, new_pos = query_single_page(query, lang, pos)
            if len(new_tweets) == 0:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

            for t in new_tweets:
                yield t, pos

            # use new_pos only once you have iterated through all old tweets
            pos = new_pos
            num_tweets += len(new_tweets)

            if limit and num_tweets >= limit:
                logger.info('Got {} tweets for {}.'.format(num_tweets, query))
                return

    except KeyboardInterrupt:
        logger.info('Program interrupted by user. Returning tweets gathered '
                    'so far...')
    except BaseException:
        logger.exception('An unknown error occurred! Returning tweets '
                         'gathered so far.')

    logger.info('Got {} tweets for {}.'.format(num_tweets, query))
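# Example (sketch): resuming a query from a saved checkpoint with
# query_tweets_once_generator. The generator yields (tweet, pos) pairs, where
# pos is the position used to fetch the page that tweet came from, so
# persisting the last seen pos lets a later run pick up from that page. The
# checkpoint file name is illustrative.
def example_resume_query(query, checkpoint_file='checkpoint.txt'):
    # load the last saved position, if any
    try:
        with open(checkpoint_file) as f:
            pos = f.read().strip() or None
    except FileNotFoundError:
        pos = None

    for tweet, last_pos in query_tweets_once_generator(query, limit=50, pos=pos):
        print(tweet.timestamp, tweet.text[:80])
        if last_pos:
            with open(checkpoint_file, 'w') as f:
                f.write(last_pos)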
def download_tw(tweet: Tweet, user_dir: str):
    img_url: str
    for img_url in tweet.img_urls:
        img_file = user_dir + img_url[img_url.rindex('/'):]
        if not os.path.exists(img_file):
            retry = 5
            while retry > 0:
                try:
                    logger.info("download " + img_url + ", retry = " + str(retry))
                    request.urlretrieve(img_url, img_file)
                    break
                except URLError:
                    retry = retry - 1
def download_all_images(tweets, output_path, username=None, size="orig"): if username: root_dir = os.path.join(output_path or '', username) create_directory(root_dir) for t in tweets: for img_url in t.img_urls: date = dt.datetime.fromtimestamp(t.timestamp_epochs) if not username: root_dir = os.path.join(output_path or '', t.screen_name) create_directory(root_dir) final_path = root_dir # Create Subfolders for Years and Months final_path = os.path.join(final_path, date.strftime("%Y")) create_directory(final_path) final_path = os.path.join(final_path, date.strftime("%m")) create_directory(final_path) #if is_retweet: # Create Subfolder to separate any Retweets #final_path = os.path.join(final_path, "Retweets") #create_directory(final_path) timestamp = date.strftime("%Y-%m-%d_") + size + "_" #if is_retweet: #timestamp = "RT_" + timestamp filepath = os.path.join(final_path, timestamp + os.path.basename(img_url)) r = requests.get(img_url + ':' + size, stream=True) base_name = timestamp + os.path.basename(img_url) filename = os.path.join(final_path or '', base_name) with open(filename, 'wb') as fd: for chunk in r.iter_content(chunk_size=1024): fd.write(chunk) logger.info(filename)
def query_user_info(user):
    """
    Returns the scraped user data from a twitter user page.

    :param user: the twitter user to web scrape its twitter page info
    """
    try:
        user_info = query_user_page(INIT_URL_USER.format(u=user))
        if user_info:
            logger.info(f"Got user information from username {user}")
            return user_info

    except KeyboardInterrupt:
        logger.info(
            "Program interrupted by user. Returning user information gathered so far...")
    except BaseException:
        logger.exception(
            "An unknown error occurred! Returning user information gathered so far...")
def query_tweets(query, limit=None, begindate=dt.date(2006, 3, 21),
                 enddate=dt.date.today(), poolsize=20, lang=''):
    no_days = (enddate - begindate).days

    if poolsize > no_days:
        # Since we are assigning each pool a range of dates to query,
        # the number of pools should not exceed the number of dates.
        poolsize = no_days
    dateranges = [begindate + dt.timedelta(days=elem)
                  for elem in linspace(0, no_days, poolsize + 1)]

    if limit:
        limit_per_pool = (limit // poolsize) + 1
    else:
        limit_per_pool = None

    queries = ['{} since:{} until:{}'.format(query, since, until)
               for since, until in zip(dateranges[:-1], dateranges[1:])]

    all_tweets = []
    try:
        pool = Pool(poolsize)
        logger.info('queries: {}'.format(queries))
        try:
            for new_tweets in pool.imap_unordered(
                    partial(query_tweets_once, limit=limit_per_pool, lang=lang),
                    queries):
                all_tweets.extend(new_tweets)
                logger.info('Got {} tweets ({} new).'.format(
                    len(all_tweets), len(new_tweets)))
        except KeyboardInterrupt:
            logger.info('Program interrupted by user. Returning all tweets '
                        'gathered so far.')
    finally:
        pool.close()
        pool.join()

    return all_tweets
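# Example (sketch): a date-bounded search. query_tweets splits the
# begindate..enddate range into `poolsize` buckets and runs one
# "<query> since:<d1> until:<d2>" search per bucket in parallel, so `limit`
# is treated as a minimum spread across buckets. Query and dates are
# placeholders.
def example_query_tweets():
    tweets = query_tweets('#python',
                          limit=200,
                          begindate=dt.date(2019, 1, 1),
                          enddate=dt.date(2019, 2, 1),
                          poolsize=5,
                          lang='en')
    print('collected', len(tweets), 'tweets')
    return tweets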
def main():
    logger.info({'Hello world': '1'})
    try:
        config_path = os.path.split(
            os.path.realpath(__file__))[0] + os.sep + 'config.json'
        if not os.path.isfile(config_path):
            sys.exit(u'Config file config.json does not exist in the current path: %s' %
                     (os.path.split(os.path.realpath(__file__))[0] + os.sep))
        with open(config_path) as f:
            config = json.loads(f.read())
        validate_config(config)
        print('hello world')

        user_id_list = config['user_id_list']
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            user_id_list = get_user_list(user_id_list)

        for user in user_id_list:
            print(user)
            list_of_tweets = query_tweets_from_user(user, 10)
            outPutFileName = get_filepath(user, 'data') + '.csv'
            with open(outPutFileName, "w", encoding="utf-8") as output:
                writer = csv.writer(output)
                writer.writerow(["text_html", "img_url", "video_url", "links"])
                for t in list_of_tweets:
                    writer.writerow(
                        [t.text_html, t.img_urls, t.video_url, t.links])
                    for imgUrl in t.img_urls:
                        download_one_file(user, 'img', imgUrl)
                    for videoUrl in t.video_url:
                        download_one_file(user, 'video', videoUrl)

    except ValueError:
        print('config.json is not formatted correctly')
    except Exception as e:
        print('Error: ', e)
        traceback.print_exc()  # was traceback.print_exe(), which does not exist
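# Example (sketch): the only config key this main() clearly relies on is
# 'user_id_list', which may be either a list of usernames or a path to a file
# handled by get_user_list above. Whatever else validate_config checks is not
# shown in this excerpt, so this minimal config is an assumption.
def example_write_minimal_config(path='config.json'):
    config = {'user_id_list': ['some_twitter_user', 'another_user']}
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=4)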
def query_user_page(url, retry=10):
    """
    Returns the scraped user data from a twitter user page.

    :param url: The URL to get the twitter user info from (url contains the user page)
    :param retry: Number of retries if something goes wrong.
    :return: Returns the scraped user data from a twitter user page.
    """
    try:
        response = requests.get(url, headers=HEADER)
        html = response.text or ''

        user = User()
        user_info = user.from_html(html)
        if not user_info:
            return None

        return user_info

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        return query_user_page(url, retry - 1)

    logger.error('Giving up.')
    return None
def query_tweet_page(user, status_id):
    tweets = []
    try:
        new_tweets, pos = query_single_page(user, lang='', pos=None,
                                            from_user=True, status_id=status_id)
        if len(new_tweets) == 0:
            logger.info("Got {} tweets from user {} and status {}".format(
                len(tweets), user, status_id))

        tweets += new_tweets
        return tweets

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning tweets gathered "
                    "so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning tweets "
                         "gathered so far.")

    logger.info("Got {} tweets from username {}.".format(len(tweets), user))
    return tweets
def query_user_info(user):
    """
    Returns the scraped user data from a twitter user page.

    :param user: the twitter user to web scrape its twitter page info
    """
    user_info = None  # avoid an UnboundLocalError if the request is interrupted early
    try:
        user_info = query_user_page(INIT_URL_USER.format(u=user))
        if user_info:
            logger.info(f"Got user information from username {user}")
            return user_info

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Returning user information gathered so far...")
    except BaseException:
        logger.exception("An unknown error occurred! Returning user information gathered so far...")

    logger.info(f"Got user information from username {user}")
    return user_info
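# Example (sketch): fetching profile info for a handful of usernames. The
# attributes printed (user, followers, following) are the ones the --profiles
# CSV writer in main() below also uses; the usernames are placeholders.
def example_collect_profiles(usernames):
    profiles = []
    for name in usernames:
        info = query_user_info(name)
        if info is not None:
            profiles.append(info)
            print(info.user, info.followers, info.following)
    return profiles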
def query_single_page(query, lang, pos, retry=50, from_user=False, timeout=60):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)
    logger.info('Scraping tweets from {}'.format(url))

    try:
        proxy = next(proxy_pool)
        logger.info('Using proxy {}'.format(proxy))
        response = requests.get(url, headers=HEADER, proxies={"http": proxy},
                                timeout=timeout)
        if pos is None:  # html response
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            try:
                if json_resp:
                    pos = json_resp['min_position']
                    has_more_items = json_resp['has_more_items']
                    if not has_more_items:
                        logger.info("Twitter returned : 'has_more_items' ")
                        return [], None
                else:
                    pos = None
            except:
                pass

            if retry > 0:
                logger.info('Retrying... (Attempts left: {})'.format(retry))
                return query_single_page(query, lang, pos, retry - 1, from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].tweet_id
        return tweets, "TWEET-{}-{}".format(tweets[-1].tweet_id, tweets[0].tweet_id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        # keep from_user on retries so the returned pos format stays consistent
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
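# Example (sketch): driving query_single_page by hand. Each call returns a
# batch of tweets plus the pos token for the next page (an HTML search page is
# fetched when pos is None, a JSON timeline fragment afterwards); feeding pos
# back in pages through the results. Query and page count are placeholders.
def example_paginate(query, max_pages=3):
    pos = None
    collected = []
    for _ in range(max_pages):
        tweets, pos = query_single_page(query, lang='en', pos=pos)
        if not tweets:
            break
        collected.extend(tweets)
    return collected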
from twitterscraper.ts_logger import logger
from twitterscraper.user import User

#from fake_useragent import UserAgent
#ua = UserAgent()
#HEADER = {'User-Agent': ua.random}

HEADERS_LIST = [
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
    'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
    'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
    'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre'
]

HEADER = {'User-Agent': random.choice(HEADERS_LIST)}
logger.info(HEADER)

INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q={q}&l={lang}'
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' \
             'default&include_available_features=1&include_entities=1&' \
             'reset_error_state=false&src=typd&max_position={pos}&q={q}&l={lang}'
INIT_URL_USER = '******'
RELOAD_URL_USER = '******' \
                  'include_available_features=1&include_entities=1&' \
                  'max_position={pos}&reset_error_state=false'
PROXY_URL = 'https://free-proxy-list.net/'


def get_proxies():
    response = requests.get(PROXY_URL)
    soup = BeautifulSoup(response.text, 'lxml')
def query_single_page(query, lang, pos, retry=50, from_user=False):
    """
    Returns tweets from the given URL.

    :param query: The query parameter of the query url
    :param lang: The language parameter of the query url
    :param pos: The query url parameter that determines where to start looking
    :param retry: Number of retries if something goes wrong.
    :return: The list of tweets, the pos argument for getting the next page.
    """
    url = get_query_url(query, lang, pos, from_user)

    try:
        response = requests.get(url, headers=HEADER)
        if pos is None:  # html response
            html = response.text or ''
            json_resp = None
        else:
            html = ''
            try:
                json_resp = json.loads(response.text)
                html = json_resp['items_html'] or ''
            except ValueError as e:
                logger.exception(
                    'Failed to parse JSON "{}" while requesting "{}"'.format(e, url))

        tweets = list(Tweet.from_html(html))

        if not tweets:
            if json_resp:
                pos = json_resp['min_position']
            else:
                pos = None
            if retry > 0:
                return query_single_page(query, lang, pos, retry - 1, from_user)
            else:
                return [], pos

        if json_resp:
            return tweets, urllib.parse.quote(json_resp['min_position'])
        if from_user:
            return tweets, tweets[-1].id
        return tweets, "TWEET-{}-{}".format(tweets[-1].id, tweets[0].id)

    except requests.exceptions.HTTPError as e:
        logger.exception('HTTPError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.ConnectionError as e:
        logger.exception('ConnectionError {} while requesting "{}"'.format(e, url))
    except requests.exceptions.Timeout as e:
        logger.exception('TimeOut {} while requesting "{}"'.format(e, url))
    except json.decoder.JSONDecodeError as e:
        logger.exception(
            'Failed to parse JSON "{}" while requesting "{}".'.format(e, url))

    if retry > 0:
        logger.info('Retrying... (Attempts left: {})'.format(retry))
        # keep from_user on retries so the returned pos format stays consistent
        return query_single_page(query, lang, pos, retry - 1, from_user)

    logger.error('Giving up.')
    return [], None
def main():
    try:
        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         description=__doc__)
        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o", "--output", type=str, default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                                 "tweets to.")
        parser.add_argument("-l", "--limit", type=int, default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument("-a", "--all", action='store_true',
                            help="Set this flag if you want to get all tweets "
                                 "in the history of twitter. Begindate is set to 2006-03-01. "
                                 "This may take a while. You can increase the number of parallel "
                                 "processes depending on the computational power you have.")
        parser.add_argument("-c", "--csv", action='store_true',
                            help="Set this flag if you want to save the results to a CSV format.")
        parser.add_argument("-u", "--user", action='store_true',
                            help="Set this flag if you want to scrape tweets from a specific user. "
                                 "The query should then consist of the profilename (user) you want to scrape without @")
        parser.add_argument("--profiles", action='store_true',
                            help="Set this flag if you want to scrape profile info of all the users you "
                                 "have previously scraped from. After all of the tweets have been scraped it will start "
                                 "a new process of scraping profile pages.")
        parser.add_argument("--lang", type=str, default=None,
                            help="Set this flag if you want to query tweets in \na specific language. You can choose from:\n"
                                 "en (English)\nar (Arabic)\nbn (Bengali)\n"
                                 "cs (Czech)\nda (Danish)\nde (German)\nel (Greek)\nes (Spanish)\n"
                                 "fa (Persian)\nfi (Finnish)\nfil (Filipino)\nfr (French)\n"
                                 "he (Hebrew)\nhi (Hindi)\nhu (Hungarian)\n"
                                 "id (Indonesian)\nit (Italian)\nja (Japanese)\n"
                                 "ko (Korean)\nmsa (Malay)\nnl (Dutch)\n"
                                 "no (Norwegian)\npl (Polish)\npt (Portuguese)\n"
                                 "ro (Romanian)\nru (Russian)\nsv (Swedish)\n"
                                 "th (Thai)\ntr (Turkish)\nuk (Ukrainian)\n"
                                 "ur (Urdu)\nvi (Vietnamese)\n"
                                 "zh-cn (Chinese Simplified)\n"
                                 "zh-tw (Chinese Traditional)")
        parser.add_argument("-d", "--dump", action="store_true",
                            help="Set this flag if you want to dump the tweets \nto the console rather than outputting to a file")
        parser.add_argument("-bd", "--begindate", type=valid_date, default="2006-03-21",
                            help="Scrape for tweets starting from this date. Format YYYY-MM-DD. \nDefault value is 2006-03-21",
                            metavar='\b')
        parser.add_argument("-ed", "--enddate", type=valid_date, default=dt.date.today(),
                            help="Scrape for tweets until this date. Format YYYY-MM-DD. \nDefault value is the date of today.",
                            metavar='\b')
        parser.add_argument("-p", "--poolsize", type=int, default=20,
                            help="Specify the number of parallel process you want to run. \n"
                                 "Default value is set to 20. \nYou can change this number if you have more computing power available. \n"
                                 "Set to 1 if you don't want to run any parallel processes.",
                            metavar='\b')
        args = parser.parse_args()

        if isfile(args.output) and not args.dump:
            logger.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            args.begindate = dt.date(2006, 3, 1)
            args.enddate = dt.date.today()

        if args.user:
            tweets = query_tweets_from_user(user=args.query, limit=args.limit)
        else:
            tweets = query_tweets(query=args.query, limit=args.limit,
                                  begindate=args.begindate, enddate=args.enddate,
                                  poolsize=args.poolsize, lang=args.lang)

        if args.dump:
            print(json.dumps(tweets, cls=JSONEncoder))
        else:
            if tweets:
                with open(args.output, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output)
                        f.writerow(["user", "fullname", "tweet-id", "timestamp", "url",
                                    "likes", "replies", "retweets", "text", "html"])
                        for x in tweets:
                            f.writerow([x.user, x.fullname, x.id, x.timestamp, x.url,
                                        x.likes, x.replies, x.retweets, x.text, x.html])
                    else:
                        json.dump(tweets, output, cls=JSONEncoder)

        if args.profiles and tweets:
            list_users = list(set([tweet.user for tweet in tweets]))
            # list_users_info = [query_user_info(elem) for elem in list_users]
            filename = 'userprofiles_' + args.output
            with open(filename, "w", encoding="utf-8") as output:
                if args.csv:
                    f = csv.writer(output)
                    f.writerow(["user", "fullname", "location", "blog", "date_joined", "id",
                                "num_tweets", "following", "followers", "likes", "lists"])
                    for elem in list_users:
                        u = query_user_info(elem)
                        if u is None:
                            continue
                        else:
                            f.writerow([u.user, u.full_name, u.location, u.blog, u.date_joined,
                                        u.id, u.tweets, u.following, u.followers, u.likes, u.lists])
                else:
                    for elem in list_users:
                        u = query_user_info(elem)
                        if u is None:
                            continue
                        else:
                            json.dump(u, output, cls=JSONEncoder, indent=2)

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Quitting...")
def main():
    try:
        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         description=__doc__)
        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o", "--output", type=str, default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                                 "tweets to.")
        parser.add_argument("-l", "--limit", type=int, default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument("-a", "--all", action='store_true',
                            help="Set this flag if you want to get all tweets "
                                 "in the history of twitter. Begindate is set to 2006-03-01. "
                                 "This may take a while. You can increase the number of parallel "
                                 "processes depending on the computational power you have.")
        parser.add_argument("-c", "--csv", action='store_true',
                            help="Set this flag if you want to save the results to a CSV format.")
        parser.add_argument("-u", "--user", action='store_true',
                            help="Set this flag if you want to scrape tweets from a specific user. "
                                 "The query should then consist of the profilename you want to scrape without @")
        parser.add_argument("--profiles", action='store_true',
                            help="Set this flag if you want to scrape profile info of all the users you "
                                 "have previously scraped from. After all of the tweets have been scraped it will start "
                                 "a new process of scraping profile pages.")
        parser.add_argument("--lang", type=str, default=None,
                            help="Set this flag if you want to query tweets in \na specific language. You can choose from:\n"
                                 "en (English)\nar (Arabic)\nbn (Bengali)\n"
                                 "cs (Czech)\nda (Danish)\nde (German)\nel (Greek)\nes (Spanish)\n"
                                 "fa (Persian)\nfi (Finnish)\nfil (Filipino)\nfr (French)\n"
                                 "he (Hebrew)\nhi (Hindi)\nhu (Hungarian)\n"
                                 "id (Indonesian)\nit (Italian)\nja (Japanese)\n"
                                 "ko (Korean)\nmsa (Malay)\nnl (Dutch)\n"
                                 "no (Norwegian)\npl (Polish)\npt (Portuguese)\n"
                                 "ro (Romanian)\nru (Russian)\nsv (Swedish)\n"
                                 "th (Thai)\ntr (Turkish)\nuk (Ukrainian)\n"
                                 "ur (Urdu)\nvi (Vietnamese)\n"
                                 "zh-cn (Chinese Simplified)\n"
                                 "zh-tw (Chinese Traditional)")
        parser.add_argument("-d", "--dump", action="store_true",
                            help="Set this flag if you want to dump the tweets \nto the console rather than outputting to a file")
        parser.add_argument("-ow", "--overwrite", action="store_true",
                            help="Set this flag if you want to overwrite the existing output file.")
        parser.add_argument("-bd", "--begindate", type=valid_date, default="2006-03-21",
                            help="Scrape for tweets starting from this date. Format YYYY-MM-DD. \nDefault value is 2006-03-21",
                            metavar='\b')
        parser.add_argument("-ed", "--enddate", type=valid_date, default=dt.date.today(),
                            help="Scrape for tweets until this date. Format YYYY-MM-DD. \nDefault value is the date of today.",
                            metavar='\b')
        parser.add_argument("-p", "--poolsize", type=int, default=20,
                            help="Specify the number of parallel process you want to run. \n"
                                 "Default value is set to 20. \nYou can change this number if you have more computing power available. \n"
                                 "Set to 1 if you don't want to run any parallel processes.",
                            metavar='\b')
        parser.add_argument("-i", "--images", action="store_true",
                            help="Set this flag if you want to download all images from the query.")
        parser.add_argument("-io", "--imagesoutput", type=str, default="./",
                            help="The path to the folder to download all of the images to, using -i.")
        parser.add_argument("-ex", "--onlymedia", action="store_true",
                            help="Set this flag if you want to exclude tweets without media.")
        args = parser.parse_args()

        if isfile(args.output) and not args.dump and not args.overwrite:
            logger.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            args.begindate = dt.date(2006, 3, 1)

        if args.user:
            tweets = query_tweets_from_user(user=args.query, limit=args.limit,
                                            dl_imgs=args.onlymedia)
        else:
            tweets = query_tweets(query=args.query, limit=args.limit,
                                  begindate=args.begindate, enddate=args.enddate,
                                  poolsize=args.poolsize, lang=args.lang,
                                  dl_imgs=args.onlymedia)

        if args.dump:
            pprint([tweet.__dict__ for tweet in tweets])
        else:
            if tweets:
                with open(args.output, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output, delimiter=";",
                                       quoting=csv.QUOTE_NONNUMERIC)
                        f.writerow([
                            "screen_name", "username", "user_id", "tweet_id",
                            "tweet_url", "timestamp", "timestamp_epochs", "text",
                            "text_html", "links", "hashtags", "has_media",
                            "img_urls", "video_url", "likes", "retweets",
                            "replies", "is_replied", "is_reply_to",
                            "parent_tweet_id", "reply_to_users"
                        ])
                        for t in tweets:
                            f.writerow([
                                t.screen_name, t.username, t.user_id, t.tweet_id,
                                t.tweet_url, t.timestamp, t.timestamp_epochs,
                                t.text, t.text_html, t.links, t.hashtags,
                                t.has_media, t.img_urls, t.video_url, t.likes,
                                t.retweets, t.replies, t.is_replied,
                                t.is_reply_to, t.parent_tweet_id, t.reply_to_users
                            ])
                        if args.images:
                            if args.user:
                                download_all_images(tweets, args.imagesoutput,
                                                    username=args.query)
                            else:
                                download_all_images(tweets, args.imagesoutput)
                    else:
                        if args.images:
                            if args.user:
                                download_all_images(tweets, args.imagesoutput,
                                                    username=args.query)
                            else:
                                download_all_images(tweets, args.imagesoutput)
                        json.dump(tweets, output, cls=JSONEncoder)

        if args.profiles and tweets:
            list_users = list(set([tweet.username for tweet in tweets]))
            list_users_info = [query_user_info(elem) for elem in list_users]
            filename = 'userprofiles_' + args.output
            if args.images:
                download_all_images(list_users_info, args.imagesoutput)
            with open(filename, "w", encoding="utf-8") as output:
                json.dump(list_users_info, output, cls=JSONEncoder)

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Quitting...")
def main():
    try:
        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
                                         description=__doc__)
        parser.add_argument("query", type=str, help="Advanced twitter query")
        parser.add_argument("-o", "--output", type=str, default="tweets.json",
                            help="Path to a JSON file to store the gathered "
                                 "tweets to.")
        parser.add_argument("-l", "--limit", type=int, default=None,
                            help="Number of minimum tweets to gather.")
        parser.add_argument("-a", "--all", action='store_true',
                            help="Set this flag if you want to get all tweets "
                                 "in the history of twitter. Begindate is set to 2006-03-01. "
                                 "This may take a while. You can increase the number of parallel "
                                 "processes depending on the computational power you have.")
        parser.add_argument("-c", "--csv", action='store_true',
                            help="Set this flag if you want to save the results to a CSV format.")
        parser.add_argument("-u", "--user", action='store_true',
                            help="Set this flag if you want to scrape tweets from a specific user. "
                                 "The query should then consist of the profilename you want to scrape without @")
        parser.add_argument("--lang", type=str, default=None,
                            help="Set this flag if you want to query tweets in \na specific language. You can choose from:\n"
                                 "en (English)\nar (Arabic)\nbn (Bengali)\n"
                                 "cs (Czech)\nda (Danish)\nde (German)\nel (Greek)\nes (Spanish)\n"
                                 "fa (Persian)\nfi (Finnish)\nfil (Filipino)\nfr (French)\n"
                                 "he (Hebrew)\nhi (Hindi)\nhu (Hungarian)\n"
                                 "id (Indonesian)\nit (Italian)\nja (Japanese)\n"
                                 "ko (Korean)\nmsa (Malay)\nnl (Dutch)\n"
                                 "no (Norwegian)\npl (Polish)\npt (Portuguese)\n"
                                 "ro (Romanian)\nru (Russian)\nsv (Swedish)\n"
                                 "th (Thai)\ntr (Turkish)\nuk (Ukrainian)\n"
                                 "ur (Urdu)\nvi (Vietnamese)\n"
                                 "zh-cn (Chinese Simplified)\n"
                                 "zh-tw (Chinese Traditional)")
        parser.add_argument("-d", "--dump", action="store_true",
                            help="Set this flag if you want to dump the tweets \nto the console rather than outputting to a file")
        parser.add_argument("-bd", "--begindate", type=valid_date, default="2006-03-21",
                            help="Scrape for tweets starting from this date. Format YYYY-MM-DD. \nDefault value is 2006-03-21",
                            metavar='\b')
        parser.add_argument("-ed", "--enddate", type=valid_date, default=dt.date.today(),
                            help="Scrape for tweets until this date. Format YYYY-MM-DD. \nDefault value is the date of today.",
                            metavar='\b')
        parser.add_argument("-p", "--poolsize", type=int, default=20,
                            help="Specify the number of parallel process you want to run. \n"
                                 "Default value is set to 20. \nYou can change this number if you have more computing power available. \n"
                                 "Set to 1 if you don't want to run any parallel processes.",
                            metavar='\b')
        args = parser.parse_args()

        if isfile(args.output) and not args.dump:
            logger.error("Output file already exists! Aborting.")
            exit(-1)

        if args.all:
            args.begindate = dt.date(2006, 3, 1)

        if args.user:
            tweets = query_tweets_from_user(user=args.query, limit=args.limit)
        else:
            tweets = query_tweets(query=args.query, limit=args.limit,
                                  begindate=args.begindate, enddate=args.enddate,
                                  poolsize=args.poolsize, lang=args.lang)

        if args.dump:
            print(json.dumps(tweets, cls=JSONEncoder))
        else:
            if tweets:
                with open(args.output, "w", encoding="utf-8") as output:
                    if args.csv:
                        f = csv.writer(output)
                        f.writerow(["user", "fullname", "tweet-id", "timestamp", "url",
                                    "likes", "replies", "retweets", "text", "html"])
                        for x in tweets:
                            f.writerow([x.user, x.fullname, x.id, x.timestamp, x.url,
                                        x.likes, x.replies, x.retweets, x.text, x.html])
                    else:
                        json.dump(tweets, output, cls=JSONEncoder)

    except KeyboardInterrupt:
        logger.info("Program interrupted by user. Quitting...")