def main():
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(description="")
    parser.add_argument('screen_name_file')
    args = parser.parse_args()

    logger = get_console_info_logger()

    ACCESS_TOKEN = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=ACCESS_TOKEN)

    crawler = CrawlTwitterTimelines(twython, logger)

    screen_names = get_screen_names_from_file(args.screen_name_file)

    for screen_name in screen_names:
        tweet_filename = "%s.tweets" % screen_name
        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name))
        else:
            try:
                tweets = crawler.get_all_timeline_tweets_for_screen_name(screen_name)
            except TwythonError as e:
                print "TwythonError: %s" % e
                if e.error_code == 404:
                    logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name)
                elif e.error_code == 401:
                    logger.warn("HTTP 401 error - Most likely, Twitter user '%s' is no longer publicly accessible" % screen_name)
                else:
                    # Unhandled exception
                    raise e
            else:
                save_tweets_to_json_file(tweets, tweet_filename)
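
# The helpers used above (get_screen_names_from_file, save_tweets_to_json_file) are
# imported from the crawler's shared module and are not shown in this excerpt. The
# sketches below are assumptions inferred from how main() calls them, not the real
# implementations; they assume codecs, gzip, and json are imported at the top of
# the script. (gzip_out matches the keyword used by the by-id crawler further down.)

def get_screen_names_from_file(filename):
    # One screen name (or numeric id) per non-blank line, decoded as UTF-8
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

def save_tweets_to_json_file(tweets, filename, gzip_out=False):
    # One JSON-encoded Tweet per line, optionally gzip-compressed
    opener = gzip.open if gzip_out else open
    with opener(filename, 'w') as f:
        for tweet in tweets:
            f.write(json.dumps(tweet) + '\n')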
def main():
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(description="")
    parser.add_argument('screen_name_file')
    args = parser.parse_args()

    logger = get_console_info_logger()

    ACCESS_TOKEN = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=ACCESS_TOKEN)

    timeline_crawler = CrawlTwitterTimelines(twython, logger)
    ff_finder = FindFriendFollowers(twython, logger)

    screen_names = get_screen_names_from_file(args.screen_name_file)

    for screen_name in screen_names:
        ff_screen_names = ff_finder.get_ff_screen_names_for_screen_name(screen_name)
        save_screen_names_to_file(ff_screen_names, "%s.ff" % screen_name, logger)

        for ff_screen_name in ff_screen_names:
            tweet_filename = "%s.tweets" % ff_screen_name
            if os.path.exists(tweet_filename):
                logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, ff_screen_name))
            else:
                try:
                    tweets = timeline_crawler.get_all_timeline_tweets_for_screen_name(ff_screen_name)
                except TwythonError as e:
                    print "TwythonError: %s" % e
                    if e.error_code == 404:
                        logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % ff_screen_name)
                    elif e.error_code == 401:
                        logger.warn("HTTP 401 error - Most likely, Twitter user '%s' is no longer publicly accessible" % ff_screen_name)
                    else:
                        # Unhandled exception
                        raise e
                else:
                    save_tweets_to_json_file(tweets, tweet_filename)
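
# save_screen_names_to_file() also comes from the shared module. A minimal sketch
# consistent with the three-argument call above -- an assumption, not the actual code:

def save_screen_names_to_file(screen_names, filename, logger):
    # One screen name per line; log where the friend/follower list was written
    logger.info("Saving %d screen names to '%s'" % (len(screen_names), filename))
    with codecs.open(filename, 'w', encoding='utf-8') as f:
        for screen_name in screen_names:
            f.write(screen_name + '\n')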
def main():
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(description="")
    parser.add_argument('screen_name_file')
    parser.add_argument('old_tweet_path')
    parser.add_argument('new_tweet_path')
    args = parser.parse_args()

    logger = get_console_info_logger()

    ACCESS_TOKEN = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=ACCESS_TOKEN)

    crawler = CrawlTwitterTimelines(twython, logger)

    screen_names = get_screen_names_from_file(args.screen_name_file)

    for screen_name in screen_names:
        old_tweet_filename = os.path.join(args.old_tweet_path, "%s.tweets" % screen_name)
        new_tweet_filename = os.path.join(args.new_tweet_path, "%s.tweets" % screen_name)

        if not os.path.exists(old_tweet_filename):
            logger.error("Older Tweet file '%s' does not exist - will not attempt to download Tweets for '%s'" % (old_tweet_filename, screen_name))
            continue
        if os.path.exists(new_tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (new_tweet_filename, screen_name))
            continue

        most_recent_tweet_id = get_most_recent_tweet_id_from_json_tweet_file(old_tweet_filename)

        try:
            tweets = crawler.get_all_timeline_tweets_for_screen_name_since(screen_name, most_recent_tweet_id)
        except TwythonError as e:
            print "TwythonError: %s" % e
            if e.error_code == 404:
                logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name)
            elif e.error_code == 401:
                logger.warn("HTTP 401 error - Most likely, Twitter user '%s' is no longer publicly accessible" % screen_name)
            else:
                # Unhandled exception
                raise e
        else:
            save_tweets_to_json_file(tweets, new_tweet_filename)
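
# get_most_recent_tweet_id_from_json_tweet_file() is not shown in this excerpt.
# Assuming the .tweets files store one JSON Tweet per line (as in the save sketch
# above), a version that does not depend on the ordering of the file could be:

def get_most_recent_tweet_id_from_json_tweet_file(filename):
    # The highest 'id' field in the file belongs to the most recent Tweet
    most_recent_id = 0
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            most_recent_id = max(most_recent_id, json.loads(line)['id'])
    return most_recent_id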
def main():
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(description="")
    parser.add_argument('id_file')
    parser.add_argument('output_loc')
    parser.add_argument('--token_file', dest='token_file', default=None)
    args = parser.parse_args()

    logger = get_console_info_logger()

    # Optionally pass the OAuth tokens as a parameter
    # There has to be a more elegant way to combine this with the default behavior -- tomorrow's problem though
    oauth_settings_file_loc = args.token_file
    if oauth_settings_file_loc:
        print "Using tokens from:", oauth_settings_file_loc
        exec(open(oauth_settings_file_loc).read())

    ACCESS_TOKEN = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=ACCESS_TOKEN)

    crawler = CrawlTwitterTimelines(twython, logger)

    twitter_ids = get_screen_names_from_file(args.id_file)
    twitter_ids.reverse()

    # HARDCODE
    output_loc = args.output_loc
    tempfile_loc = 'tmp/'
    if not os.path.isdir(tempfile_loc):
        os.makedirs(tempfile_loc)

    # Load previously broken IDs so we don't try to download them again.
    # The IDs are kept as stripped strings so they compare correctly against the
    # (string) IDs read from the input file.
    broken_ids = set()
    for status_file in ('404d', '401d'):
        try:
            with open(tempfile_loc + status_file) as f:
                broken_ids.update(line.strip() for line in f)
        except IOError:
            pass  # No file yet - nothing has failed with this status code

    for twitter_id in twitter_ids:
        if twitter_id in broken_ids:
            print '%s was previously inaccessible, not trying to download.' % twitter_id
            continue
        tweet_filename = output_loc + "%s.tweets.gz" % twitter_id
        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, twitter_id))
        else:
            try:
                tweets = crawler.get_all_timeline_tweets_for_id(twitter_id)
            except TwythonError as e:
                print "TwythonError: %s" % e
                if e.error_code == 404:
                    logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % twitter_id)
                    with open(tempfile_loc + '404d', 'a') as OUT:
                        OUT.write('%s\n' % twitter_id)
                elif e.error_code == 401:
                    logger.warn("HTTP 401 error - Most likely, Twitter user '%s' is no longer publicly accessible" % twitter_id)
                    with open(tempfile_loc + '401d', 'a') as OUT:
                        OUT.write('%s\n' % twitter_id)
                else:
                    # Unhandled error: rebuild the client and move on to the
                    # next id (the current id is not retried)
                    print e
                    twython = Twython(consumer_key, access_token=ACCESS_TOKEN)
                    crawler = CrawlTwitterTimelines(twython, logger)
            else:
                save_tweets_to_json_file(tweets, tweet_filename, gzip_out=True)
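
# The --token_file argument is exec()'d above, so the file must be a Python
# fragment that binds the consumer_key and consumer_secret names this script
# reads. A hypothetical example file (placeholder values, not real credentials):
#
#     consumer_key = 'YOUR_CONSUMER_KEY'
#     consumer_secret = 'YOUR_CONSUMER_SECRET'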