def main(): # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors sys.stdout = codecs.getwriter('utf8')(sys.stdout) parser = argparse.ArgumentParser(description="") parser.add_argument('screen_name_file') args = parser.parse_args() logger = get_console_info_logger() ACCESS_TOKEN = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token() twython = Twython(consumer_key, access_token=ACCESS_TOKEN) crawler = CrawlTwitterTimelines(twython, logger) screen_names = get_screen_names_from_file(args.screen_name_file) for screen_name in screen_names: tweet_filename = "%s.tweets" % screen_name if os.path.exists(tweet_filename): logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name)) else: try: tweets = crawler.get_all_timeline_tweets_for_screen_name(screen_name) except TwythonError as e: print "TwythonError: %s" % e if e.error_code == 404: logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name) elif e.error_code == 401: logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % screen_name) else: # Unhandled exception raise e else: save_tweets_to_json_file(tweets, "%s.tweets" % screen_name)
def main(): # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors sys.stdout = codecs.getwriter('utf8')(sys.stdout) # Parse and document command line options parser = argparse.ArgumentParser(description="") parser.add_argument('--input', dest='screen_name_file', default="example_screen_names.txt", help='A text file with one screen name per line.') parser.add_argument('--token', dest='token_file', default=os.path.expanduser("~") + "/.trawler/default.yaml", help='A configuration file with Twitter API access tokens. See example_token_file.yaml or twitter_oauth_settings.sample.py') parser.add_argument('--output', dest='output', default='./', help='Where to output the resulting data.') args = parser.parse_args() # Set up loggers and output directory logger = get_console_info_logger() output_directory = args.output try: if not os.path.exists(output_directory): os.makedirs(output_directory) except: print "Could not create directory:", directory exit(0) logger.info("Created directory: %s" % output_directory) # Set up API access if args.token_file.endswith('yaml'): #YAML file tokens = yaml.safe_load(open(args.token_file)) elif args.token_file.endswith('py'): #.py file -- surely there is a better way to do this tokens = {} for line in open(args.token_file): k,v = [x.strip() for x in line.split("=")] tokens[k] = v[1:-1] else: raise "Unrecognized token file type -- please use a .yaml or .py file following the examples" twython = get_connection( tokens['consumer_key'], tokens['consumer_secret']) crawler = get_timeline_crawler( twython, logger=logger) # Gather unique screen names screen_names = get_screen_names_from_file(args.screen_name_file) # Gather tweets for each of the unique screen names # NB: in production, one should use `id` as an identifier (which does not change) # rather than the `screen_name`, which can be changed at the users's whim. 
for screen_name in screen_names: tweet_filename = output_directory + screen_name + ".tweets.gz" if os.path.exists(tweet_filename): logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name)) else: tweets = crawler.get_all_timeline_tweets_for_screen_name( screen_name ) #Write them out as one-JSON-object-per-line in a gzipped file save_tweets_to_json_file(tweets, tweet_filename)
def main(): # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors sys.stdout = codecs.getwriter('utf8')(sys.stdout) parser = argparse.ArgumentParser(description="") parser.add_argument('screen_name_file') args = parser.parse_args() logger = get_console_info_logger() ACCESS_TOKEN = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token() twython = Twython(consumer_key, access_token=ACCESS_TOKEN) timeline_crawler = CrawlTwitterTimelines(twython, logger) ff_finder = FindFriendFollowers(twython, logger) screen_names = get_screen_names_from_file(args.screen_name_file) for screen_name in screen_names: ff_screen_names = ff_finder.get_ff_screen_names_for_screen_name( screen_name) save_screen_names_to_file(ff_screen_names, "%s.ff" % screen_name, logger) for ff_screen_name in ff_screen_names: tweet_filename = "%s.tweets" % ff_screen_name if os.path.exists(tweet_filename): logger.info( "File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, ff_screen_name)) else: try: tweets = timeline_crawler.get_all_timeline_tweets_for_screen_name( ff_screen_name) except TwythonError as e: print "TwythonError: %s" % e if e.error_code == 404: logger.warn( "HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % ff_screen_name) elif e.error_code == 401: logger.warn( "HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % ff_screen_name) else: # Unhandled exception raise e else: save_tweets_to_json_file(tweets, tweet_filename)
def main(): # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors sys.stdout = codecs.getwriter('utf8')(sys.stdout) parser = argparse.ArgumentParser(description="") parser.add_argument('screen_name_file') parser.add_argument('old_tweet_path') parser.add_argument('new_tweet_path') args = parser.parse_args() logger = get_console_info_logger() ACCESS_TOKEN = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token() twython = Twython(consumer_key, access_token=ACCESS_TOKEN) crawler = CrawlTwitterTimelines(twython, logger) screen_names = get_screen_names_from_file(args.screen_name_file) for screen_name in screen_names: old_tweet_filename = os.path.join(args.old_tweet_path, "%s.tweets" % screen_name) new_tweet_filename = os.path.join(args.new_tweet_path, "%s.tweets" % screen_name) if not os.path.exists(old_tweet_filename): logger.error("Older Tweet file '%s' does not exist - will not attempt to download Tweets for '%s'" % (old_tweet_filename, screen_name)) continue if os.path.exists(new_tweet_filename): logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (new_tweet_filename, screen_name)) continue most_recent_tweet_id = get_most_recent_tweet_id_from_json_tweet_file(old_tweet_filename) try: tweets = crawler.get_all_timeline_tweets_for_screen_name_since(screen_name, most_recent_tweet_id) except TwythonError as e: print "TwythonError: %s" % e if e.error_code == 404: logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name) elif e.error_code == 401: logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % screen_name) else: # Unhandled exception raise e else: save_tweets_to_json_file(tweets, new_tweet_filename)
def main(): # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors sys.stdout = codecs.getwriter('utf8')(sys.stdout) parser = argparse.ArgumentParser(description="") parser.add_argument('screen_name_file') args = parser.parse_args() logger = get_console_info_logger() ACCESS_TOKEN = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token() twython = Twython(consumer_key, access_token=ACCESS_TOKEN) crawler = RateLimitedTwitterEndpoint(twython, "statuses/user_timeline", logger) screen_names = get_screen_names_from_file(args.screen_name_file) for screen_name in screen_names: tweet_filename = "%s.tweets" % screen_name if os.path.exists(tweet_filename): logger.info( "File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name)) else: try: logger.info("Retrieving Tweets for user '%s'" % screen_name) tweets = crawler.get_data(screen_name=screen_name, count=200) except TwythonError as e: print "TwythonError: %s" % e if e.error_code == 404: logger.warn( "HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name) elif e.error_code == 401: logger.warn( "HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % screen_name) else: # Unhandled exception raise e else: save_tweets_to_json_file(tweets, "%s.tweets" % screen_name)
def filter(self, json_tweet_string):
    """Return True if this tweet's author should be kept, False otherwise.

    Given one tweet as a JSON string, looks up (or scrapes) the author's
    timeline file under ``self._download_path``:

    * If ``<screen_name>.tweets`` already exists: keep the user iff the file
      is non-empty (an empty file is a marker for a previously rejected user).
    * Otherwise scrape up to 200 tweets via ``self._crawler.get_data``; on
      HTTP 404/401 or when fewer than ``self._minimum_tweet_threshold``
      tweets come back, write an empty marker file and return False; else
      save the tweets and return True.

    Side effects: creates or writes the user's timeline file.
    NOTE(review): 'filter' shadows the builtin of the same name.
    # assumes self._crawler.get_data hits statuses/user_timeline -- TODO confirm
    """
    tweet = json.loads(json_tweet_string)
    screen_name = tweet['user']['screen_name']
    path_to_tweetfile = os.path.join(self._download_path, "%s.tweets" % screen_name)

    # If file already exists for user, don't try to rescrape their timeline
    if os.path.exists(path_to_tweetfile):
        self._logger.info("Timeline file for '%s' already exists - will not rescrape" % screen_name)
        # Empty file == earlier run marked this user as rejected/unavailable
        if os.path.getsize(path_to_tweetfile) > 0:
            return True
        else:
            return False

    try:
        self._logger.info("Retrieving Tweets for user '%s'" % screen_name)
        tweets = self._crawler.get_data(screen_name=screen_name, count=200)
    except TwythonError as e:
        print "TwythonError: %s" % e
        if e.error_code == 404:
            self._logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name)
            open(path_to_tweetfile, "w").close()  # Create empty file
            return False
        elif e.error_code == 401:
            self._logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % screen_name)
            open(path_to_tweetfile, "w").close()  # Create empty file
            return False
        else:
            # Unhandled exception
            raise e
    else:
        # Reject users with too few tweets, leaving an empty marker file so the
        # next run short-circuits in the existence check above
        if len(tweets) < self._minimum_tweet_threshold:
            self._logger.info("User '%s' has only %d Tweets, threshold is %d" % \
                (screen_name, len(tweets), self._minimum_tweet_threshold))
            open(path_to_tweetfile, "w").close()  # Create empty file
            return False
        else:
            save_tweets_to_json_file(tweets, path_to_tweetfile)
            return True
def main(): # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors sys.stdout = codecs.getwriter('utf8')(sys.stdout) parser = argparse.ArgumentParser(description="") parser.add_argument('id_file') args = parser.parse_args() logger = get_console_info_logger() #ACCESS_TOKEN = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token() twython = get_connection( consumer_key, consumer_secret) #crawler = RateLimitedTwitterEndpoint(twython, "statuses/user_timeline", logger) crawler = get_timeline_crawler(twython, logger) ids = get_ids_from_file(args.id_file) for user_id in ids: tweet_filename = "%s.tweets" % user_id if os.path.exists(tweet_filename): logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, user_id)) else: try: tweets = crawler.get_most_recent_tweets_by_id( user_id ) except TwythonError as e: print "TwythonError: %s" % e if e.error_code == 404: logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name) elif e.error_code == 401: logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % screen_name) else: # Unhandled exception raise e else: save_tweets_to_json_file(tweets, 'testdata/%s.json' % user_id)
def main(): # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors sys.stdout = codecs.getwriter('utf8')(sys.stdout) # Parse and document command line options parser = argparse.ArgumentParser(description="") parser.add_argument('--input', dest='screen_name_file', default="example_screen_names.txt", help='A text file with one screen name per line.') parser.add_argument( '--token', dest='token_file', default=os.path.expanduser("~") + "/.trawler/default.yaml", help= 'A configuration file with Twitter API access tokens. See example_token_file.yaml or twitter_oauth_settings.sample.py' ) parser.add_argument('--output', dest='output', default='./', help='Where to output the resulting data.') args = parser.parse_args() # Set up loggers and output directory logger = get_console_info_logger() output_directory = args.output try: if not os.path.exists(output_directory): os.makedirs(output_directory) except: print "Could not create directory:", directory exit(0) logger.info("Created directory: %s" % output_directory) # Set up API access if args.token_file.endswith('yaml'): #YAML file tokens = yaml.safe_load(open(args.token_file)) elif args.token_file.endswith('py'): #.py file -- surely there is a better way to do this tokens = {} for line in open(args.token_file): k, v = [x.strip() for x in line.split("=")] tokens[k] = v[1:-1] else: raise "Unrecognized token file type -- please use a .yaml or .py file following the examples" twython = get_connection(tokens['consumer_key'], tokens['consumer_secret']) crawler = get_timeline_crawler(twython, logger=logger) # Gather unique screen names screen_names = get_screen_names_from_file(args.screen_name_file) # Gather tweets for each of the unique screen names # NB: in production, one should use `id` as an identifier (which does not change) # rather than the `screen_name`, which can be changed at the users's whim. 
for screen_name in screen_names: tweet_filename = output_directory + screen_name + ".tweets.gz" if os.path.exists(tweet_filename): logger.info( "File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name)) else: tweets = crawler.get_all_timeline_tweets_for_screen_name( screen_name) #Write them out as one-JSON-object-per-line in a gzipped file save_tweets_to_json_file(tweets, tweet_filename)
def main(): # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors sys.stdout = codecs.getwriter('utf8')(sys.stdout) # Parse and document command line options parser = argparse.ArgumentParser(description="") parser.add_argument('-sn', dest='screen_name_file', default="example_screen_names.txt", help='A text file with one screen name per line.') parser.add_argument('-t', dest='token_file', default=os.path.expanduser("~") + "/.trawler/default.yaml", help='A configuration file with Twitter API access tokens. See example_token_file.yaml.') parser.add_argument('-d', dest='depth', default=0, help='Friend and follower depth. A value of 1 will gather all tweets for users \ in the file as well as all tweets from their friends and followers. Default is 0.') args = parser.parse_args() # Set up loggers and output directory logger = get_console_info_logger() output_directory = "data/" + datetime.datetime.now().isoformat() + "/" try: if not os.path.exists(output_directory): os.makedirs(output_directory) except: print "Could not create directory:", directory exit(0) logger.info("Created directory: %s" % output_directory) # Set up API access tokens = yaml.safe_load(open(args.token_file)) ACCESS_TOKEN = Twython(tokens['consumer_key'], tokens['consumer_secret'], oauth_version=2).obtain_access_token() twython = Twython(tokens['consumer_key'], access_token=ACCESS_TOKEN) crawler = RateLimitedTwitterEndpoint(twython, "statuses/user_timeline", logger) # Gather unique screen names screen_names = get_screen_names_from_file(args.screen_name_file) depth = int(args.depth) # todo, validate args.depth unique_screen_names = [] if depth > 0: # don't initiate ff_finder unless we have to ff_finder = FindFriendFollowers(twython, logger) ff_screen_names = get_ff(screen_names, depth, ff_finder, logger) unique_screen_names = set(ff_screen_names) else: unique_screen_names = set(screen_names) # assume the list has redundant names save_screen_names_to_file(unique_screen_names, 
output_directory + 'screen_names') # Gather tweets for each of the unique screen names for screen_name in unique_screen_names: tweet_filename = output_directory + screen_name + ".tweets" if os.path.exists(tweet_filename): logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name)) else: try: logger.info("Retrieving Tweets for user " + screen_name + " writing to file " + tweet_filename) tweets = crawler.get_data(screen_name=screen_name, count=200) except TwythonError as e: print "TwythonError: %s" % e if e.error_code == 404: logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name) elif e.error_code == 401: logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % screen_name) else: # Unhandled exception raise e else: save_tweets_to_json_file(tweets, tweet_filename)
def main(): # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors sys.stdout = codecs.getwriter('utf8')(sys.stdout) parser = argparse.ArgumentParser(description="") parser.add_argument('id_file') parser.add_argument('output_loc') parser.add_argument('--token_file',dest='token_file',default=None) args = parser.parse_args() logger = get_console_info_logger() #Optionally pass as a parameter #There has to be a more elegant way to combine this with the default behavior -- tomorrow's problem though oauth_settings_file_loc = args.token_file if oauth_settings_file_loc: print "Using tokens from:", oauth_settings_file_loc exec(open(oauth_settings_file_loc).read()) ACCESS_TOKEN = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token() twython = Twython(consumer_key, access_token=ACCESS_TOKEN) crawler = CrawlTwitterTimelines(twython, logger) twitter_ids = get_screen_names_from_file(args.id_file) twitter_ids.reverse() #HARDCODE output_loc = args.output_loc tempfile_loc = 'tmp/' os.system('mkdir -p '+tempfile_loc) #load previously broken ID files so we don't try to read them again broken_ids = set([]) #Defaults to an empty set try: broken_ids = set([long(x).strip() for x in open(tempfile_loc + '404d').readlines()]) except: pass try: broken_ids = broken_ids.union(set([long(x).strip() for x in open(tempfile_loc + '401d').readlines()])) except: pass for twitter_id in twitter_ids: if twitter_id in broken_ids: print '%s was previously inaccessible, not trying to download.' 
% twitter_id continue tweet_filename = output_loc + "%s.tweets.gz" % twitter_id if os.path.exists(tweet_filename): logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, twitter_id)) else: try: tweets = crawler.get_all_timeline_tweets_for_id(twitter_id) except TwythonError as e: print "TwythonError: %s" % e if e.error_code == 404: logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % twitter_id) with open(tempfile_loc + '404d','a') as OUT: OUT.write('%s\n' % twitter_id) elif e.error_code == 401: logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % twitter_id) with open(tempfile_loc + '401d','a') as OUT: OUT.write('%s\n' % twitter_id) else: # Unhandled exception print e #Reconnect and try again twython = Twython(consumer_key, access_token=ACCESS_TOKEN) crawler = CrawlTwitterTimelines(twython, logger) else: save_tweets_to_json_file(tweets, tweet_filename, gzip_out=True)
def main(): # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors sys.stdout = codecs.getwriter('utf8')(sys.stdout) parser = argparse.ArgumentParser(description="") parser.add_argument('id_file') parser.add_argument('output_loc') parser.add_argument('--token_file', dest='token_file', default=None) args = parser.parse_args() logger = get_console_info_logger() #Optionally pass as a parameter #There has to be a more elegant way to combine this with the default behavior -- tomorrow's problem though oauth_settings_file_loc = args.token_file if oauth_settings_file_loc: print "Using tokens from:", oauth_settings_file_loc exec(open(oauth_settings_file_loc).read()) ACCESS_TOKEN = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token() twython = Twython(consumer_key, access_token=ACCESS_TOKEN) crawler = CrawlTwitterTimelines(twython, logger) twitter_ids = get_screen_names_from_file(args.id_file) twitter_ids.reverse() #HARDCODE output_loc = args.output_loc tempfile_loc = 'tmp/' os.system('mkdir -p ' + tempfile_loc) #load previously broken ID files so we don't try to read them again broken_ids = set([]) #Defaults to an empty set try: broken_ids = set( [long(x).strip() for x in open(tempfile_loc + '404d').readlines()]) except: pass try: broken_ids = broken_ids.union( set([ long(x).strip() for x in open(tempfile_loc + '401d').readlines() ])) except: pass for twitter_id in twitter_ids: if twitter_id in broken_ids: print '%s was previously inaccessible, not trying to download.' 
% twitter_id continue tweet_filename = output_loc + "%s.tweets.gz" % twitter_id if os.path.exists(tweet_filename): logger.info( "File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, twitter_id)) else: try: tweets = crawler.get_all_timeline_tweets_for_id(twitter_id) except TwythonError as e: print "TwythonError: %s" % e if e.error_code == 404: logger.warn( "HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % twitter_id) with open(tempfile_loc + '404d', 'a') as OUT: OUT.write('%s\n' % twitter_id) elif e.error_code == 401: logger.warn( "HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % twitter_id) with open(tempfile_loc + '401d', 'a') as OUT: OUT.write('%s\n' % twitter_id) else: # Unhandled exception print e #Reconnect and try again twython = Twython(consumer_key, access_token=ACCESS_TOKEN) crawler = CrawlTwitterTimelines(twython, logger) else: save_tweets_to_json_file(tweets, tweet_filename, gzip_out=True)