def run(self):
    """Worker loop: pull user identifiers off the queue, collect each
    user's Twitter data through the API, and (optionally) pickle the
    resulting TwitterUser object to <out_dir>/obj/<identifier>.

    A queue item of None is the shutdown sentinel for this worker.
    """
    print('Worker started')
    # do some initialization here
    # Pickling is only worthwhile when we collected something beyond the
    # tweets themselves (the same 4-way test appeared twice in the loop).
    should_pickle = (self.to_pickle or self.populate_lists or
                     self.populate_friends or self.populate_followers)
    while True:
        data = self.queue.get(True)  # block until an item arrives
        try:
            if data is None:
                # Sentinel: no more work for this connection.
                print('ALL FINISHED!!!!', self.conn_number)
                break
            print('Starting: ', data)
            # The queue carries either user ids or screen names,
            # depending on how this worker was configured.
            if self.gets_user_id:
                user = TwitterUser(self.api_hook, user_id=data)
            else:
                user = TwitterUser(self.api_hook, screen_name=data)
            user.populate_tweets_from_api(
                json_output_directory=os.path.join(self.out_dir, "json"))
            if not user.tweets:
                # No tweets: pickle what we have (if requested) and move on.
                if should_pickle:
                    print('pickling and dumping: ', user.screen_name)
                    # 'with' closes the handle (the original leaked it).
                    with open(os.path.join(self.out_dir, "obj", data), "wb") as out_f:
                        pickle.dump(user, out_f)
                continue
            if self.populate_lists:
                user.populate_lists_member_of()
            if self.populate_friends:
                print('populating friends, ', user.screen_name)
                user.populate_friends()
            if self.populate_followers:
                print('populating followers, ', user.screen_name)
                user.populate_followers()
            if should_pickle:
                # Pickle and dump user; tweets are dropped first because
                # they were already written to disk as JSON above.
                print('pickling and dumping (no tweets): ', user.screen_name)
                user.tweets = []
                with open(os.path.join(self.out_dir, "obj", data), "wb") as out_f:
                    pickle.dump(user, out_f)
        except Exception:
            # Report and keep the worker alive for the next queue item.
            print('FAILED:: ', data)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print("*** print_tb:")
            traceback.print_tb(exc_traceback, limit=30, file=sys.stdout)
            print("*** print_exception:")
        print('finished collecting data for: ', data)
def get_user_network(self, this_user_network_dir_name, user_ids,
                     restrict_output_to_ids, stored_user_list):
    """For each id in ``user_ids``, load a cached TwitterUser pickle if
    one exists, otherwise fetch the user's tweets from the API and cache
    the pickle, then write that user's network edges.

    :param this_user_network_dir_name: directory the network files go in
    :param user_ids: iterable of user ids to process
    :param restrict_output_to_ids: passed straight to write_user_network
    :param stored_user_list: set of str ids already pickled in self.pickle_dir
    """
    total = len(user_ids)
    # enumerate replaces the hand-rolled counter_val += 1 counter.
    for counter_val, uid in enumerate(user_ids, start=1):
        if counter_val % 10 == 0:
            # Progress marker, e.g. "20 / 135 <dir relative to network_dir>"
            print(counter_val, " / ", total,
                  this_user_network_dir_name.replace(self.network_dir, ""))
        pickle_path = os.path.join(self.pickle_dir, str(uid))
        # try to find user in stored_users
        if str(uid) in stored_user_list:
            # 'with' closes the read handle (the original leaked it).
            with open(pickle_path, "rb") as in_fil:
                user = pickle.load(in_fil)
        else:
            user = TwitterUser(self.api_hook, user_id=uid)
            user.populate_tweets_from_api()
            # Cache for next time; closed even if pickle.dump raises.
            with open(pickle_path, "wb") as out_fil:
                pickle.dump(user, out_fil)
        self.write_user_network(this_user_network_dir_name, user, uid,
                                restrict_output_to_ids)
def run(self):
    """Snowball-sampling worker: take (user_id, depth) pairs from the
    queue, collect the user's tweets and friends (pickle-cached under
    <out_dir>/obj), then enqueue every user they mentioned at depth + 1
    while depth < self.step_count.
    """
    print("Worker started")
    while True:
        user_id, snow_sample_number = self.queue.get(True)
        print("Starting: ", user_id, snow_sample_number)
        # Re-scan every iteration: sibling workers may have added pickles.
        stored_user_list = set(
            os.path.basename(user_pickle)
            for user_pickle in glob.glob(os.path.join(self.out_dir, "obj", "*")))
        pickle_path = os.path.join(self.out_dir, "obj", user_id)
        # Get the ego
        if user_id in stored_user_list:
            print("\tgot pickled: ", user_id)
            with open(os.path.join(self.out_dir, "obj", str(user_id)), "rb") as in_f:
                user = pickle.load(in_f)
        else:
            user = TwitterUser(self.api_hook, user_id=user_id)
            user.populate_tweets_from_api(
                json_output_directory=os.path.join(self.out_dir, "json"))
            if len(user.tweets) == 0:
                # Nothing to expand from; cache the empty user and move on.
                print("pickling and dumping: ", user.screen_name)
                with open(pickle_path, "wb") as out_f:
                    pickle.dump(user, out_f)
                continue
            print("populating friends, ", user.screen_name)
            user.populate_friends()
            # Tweets are already on disk as JSON; drop them before pickling.
            print("pickling and dumping (no tweets): ", user.screen_name)
            user.tweets = []
            with open(pickle_path, "wb") as out_f:
                pickle.dump(user, out_f)
        # Add each mentioned user to the queue, but only while we are
        # still within the configured snowball depth (test hoisted out
        # of the loop; it does not depend on following_id).
        added = 0
        if snow_sample_number < self.step_count:
            for following_id in user.mentioned:
                added += 1
                self.queue.put([str(following_id), snow_sample_number + 1])
        print("finished collecting data for: ", user_id)
        print("added: ", added)
def run(self):
    """Snowball worker with a hard depth limit of 2: for each
    (user_id, depth) pair from the queue, fetch tweets and friends
    (pickle-cached in self.pickle_dir), write the user's following
    edges to <network_dir>/<user_id>, and enqueue each friend at
    depth + 1 while depth < 2.
    """
    print("Worker started")
    while True:
        user_id, snow_sample_number = self.queue.get(True)
        print("Starting: ", user_id, snow_sample_number)
        # Re-scan every iteration: sibling workers may have added pickles.
        stored_user_list = set(
            os.path.basename(user_pickle)
            for user_pickle in glob.glob(self.pickle_dir + "*"))
        # Get the ego
        if user_id in stored_user_list:
            print("\tgot pickled: ", user_id)
            with open(self.pickle_dir + "/" + str(user_id), "rb") as in_f:
                user = pickle.load(in_f)
        else:
            user = TwitterUser(self.api_hook, user_id=user_id)
            print("\tgetting tweets for: ", user_id)
            user.populate_tweets_from_api()
            print("\t num tweets received for: ", user_id, " ", len(user.tweets))
            print("\tgetting friends for: ", user_id)
            user.populate_friends()
            print("pickling: ", user_id)
            # 'with' closes the handle (the original leaked it).
            with open(self.pickle_dir + "/" + user_id, "wb") as out_f:
                pickle.dump(user, out_f)
        # Write out their following network and add each id to the queue.
        # NOTE: depth limit is hard-coded to 2 here, unlike the sibling
        # worker that reads self.step_count — TODO confirm intentional.
        added = 0
        with codecs.open(os.path.join(self.network_dir, user_id), "w", "utf-8") as network_fil:
            for following_id in user.friend_ids:
                if snow_sample_number < 2:
                    added += 1
                    self.queue.put([str(following_id), snow_sample_number + 1])
                network_fil.write(",".join([user_id, str(following_id)]) + "\n")
        print("finished collecting data for: ", user_id)
        print("added: ", added)
print 'n authed users: ', len(handles) out_dir = sys.argv[2] os.mkdir(out_dir) #user_sns = [line.strip() for line in open(sys.argv[3]).readlines()] user_sns = ['Neuro_Skeptic'] print 'num users: ', len(user_sns) of = codecs.open("output_fil.tsv", "w", "utf8") for i in range(len(user_sns)): #creates a Twitter User object to fill with information from the API user = TwitterUser(handles[i], screen_name=user_sns[i]) user.populate_tweets_from_api(json_output_filename=out_dir + user_sns[i] + ".json", sleep_var=False) user.populate_followers() rts = 0 gt = 0 for t in user.tweets: if t.retweeted is not None: rts += 1 if t.geocode_info is not None: gt += 1 of.write( tab_stringify_newline([ user.screen_name, gt, rts, len(user.tweets), user.earliest_tweet_time, user.latest_tweet_time, user.name, user.n_total_tweets, user.creation_date,
# --- Example script: collect one user's tweets -----------------------------
# The screen name whose timeline we want to pull.
username_to_collect_data_for = 'Jackie_Pooo'

# Fill these in with your own Twitter API credentials.
consumer_key = "YOUR_CONSUMER_KEY_HERE"
consumer_secret = "YOUR_CONSUMER_SECRET_HERE"
access_token = "YOUR_ACCESS_TOKEN_HERE"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET_HERE"

## get a "hook", or connection, to the API using your consumer key/secret and access token/secret
api_hook = TwitterAPIHook(consumer_key, consumer_secret,
                          access_token=access_token,
                          access_token_secret=access_token_secret)

# creates a Twitter User object to fill with information from the API
user = TwitterUser(api_hook, screen_name=username_to_collect_data_for)

# we call populate_tweets_from_api, which goes to the Twitter API
# and collects the user's data; it is output to the file username_you_put.json.
# the sleep_var param tells the function it shouldn't worry
# about rate limits (we're only collecting for one user, so it doesn't really matter).
# If you remove the is_gzip argument, the output file will be gzipped.
# All prints below take a single argument, so parenthesizing them keeps
# the output identical under Python 2 while being Python 3 compatible.
print('populating users tweets!')
user.populate_tweets_from_api(
    json_output_filename=username_to_collect_data_for + ".json",
    sleep_var=False,
    is_gzip=False,
    since_id=None)

# Show who each collected tweet mentions.
for t in user.tweets:
    print(t.mentions)

print('user had {n_tweets} tweets'.format(n_tweets=len(user.tweets)))
handles = get_handles(glob.glob(os.path.join(sys.argv[1],"*.txt"))) print 'n authed users: ', len(handles) out_dir = sys.argv[2] os.mkdir(out_dir) #user_sns = [line.strip() for line in open(sys.argv[3]).readlines()] user_sns = ['Neuro_Skeptic'] print 'num users: ', len(user_sns) of = codecs.open("output_fil.tsv","w","utf8") for i in range(len(user_sns)): #creates a Twitter User object to fill with information from the API user = TwitterUser(handles[i], screen_name=user_sns[i]) user.populate_tweets_from_api(json_output_filename=out_dir+user_sns[i]+".json", sleep_var=False) user.populate_followers() rts = 0 gt = 0 for t in user.tweets: if t.retweeted is not None: rts+=1 if t.geocode_info is not None: gt +=1 of.write(tab_stringify_newline([user.screen_name, gt, rts, len(user.tweets), user.earliest_tweet_time, user.latest_tweet_time,
# Fill these in with your own Twitter API credentials (consumer_key and
# consumer_secret are defined earlier in this script).
access_token = "YOUR_ACCESS_TOKEN_HERE"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET_HERE"

## get a "hook", or connection, to the API using your consumer key/secret and access token/secret
api_hook = TwitterAPIHook(consumer_key, consumer_secret,
                          access_token=access_token,
                          access_token_secret=access_token_secret)

# creates a Twitter User object to fill with information from the API
user = TwitterUser(api_hook, screen_name=username_to_collect_data_for)

# we call populate_tweets_from_api, which goes to the Twitter API
# and collects the user's data; it is output to the file username_you_put.json.
# the sleep_var param tells the function it shouldn't worry
# about rate limits (we're only collecting for one user, so it doesn't really matter).
# If you remove the is_gzip argument, the output file will be gzipped.
# All prints below take a single argument, so parenthesizing them keeps
# the output identical under Python 2 while being Python 3 compatible.
print('populating users tweets!')
user.populate_tweets_from_api(
    json_output_filename=username_to_collect_data_for + ".json",
    sleep_var=False,
    is_gzip=False)

print('user had {n_tweets} tweets'.format(n_tweets=len(user.tweets)))

# we now will collect the user's followers
print('populating user followers!')
user.populate_followers(sleep_var=False)

print('user had {n_followers} followers!'.format(n_followers=len(user.follower_ids)))
elif args.screen_name: print 'Running with screen name: ', args.screen_name args.json_file_or_folder = os.path.join(OUTPUT_DIR,args.screen_name+".json.gz") if os.path.exists(args.json_file_or_folder): print "User's tweets already in the system at: ", args.json_file_or_folder else: print "Getting user's tweets and saving to: ", args.json_file_or_folder if not args.path_to_twitter_credentials_file: print "Can't do anything with a screen name without some API credentials, see the help for this script " \ "and this parameter!" sys.exit(-1) app_handler = TwitterApplicationHandler(pathToConfigFile=args.path_to_twitter_credentials_file) user = TwitterUser(screen_name=args.screen_name, api_hook=app_handler.api_hooks[0]) user.populate_tweets_from_api(json_output_filename=args.json_file_or_folder,sleep_var=False) ######## # load the models and the files ######## print 'LOADING MODEL' identity_model,feature_names = get_identity_model_and_features() word_vector_model, all_dictionaries, ark_clusters, sets, names = get_init_data(GENSIM_MODEL_LOCATION, BROWN_CLUSTER_LOCATION) print 'MODEL HAS BEEN LOADED' def gen_json_for_tweets_of_interest(input_filename, output_filename,keep_only_tweets_with_terms=None): """ This function generates a cleaned json file so that the identity
def run(self):
    """Ego-network worker: for each (user_id, screen_name) pair from the
    queue, collect the ego's tweets and list memberships (pickle-cached
    in self.pickle_dir), write the ego's network edges, then collect
    every actor in their retweet/mention/reply ego network.

    A queue item of None is the shutdown sentinel for this worker.
    """
    print("Worker started")
    while True:
        # Pre-bind so the except handler and the trailing print can never
        # hit a NameError (the original could die if queue.get or the
        # unpacking raised before these names were assigned).
        data = None
        user_id = screen_name = None
        try:
            data = self.queue.get(True)
            if data is None:
                print("ALL DONE, EXITING!")
                return
            user_id, screen_name = data[0], data[1]
            print("Starting: ", screen_name, user_id)
            this_user_network_dir_name = os.path.join(self.network_dir, user_id)
            mkdir_no_err(this_user_network_dir_name)
            stored_user_list = set(
                os.path.basename(user_pickle)
                for user_pickle in glob.glob(self.pickle_dir + "*"))
            # Get the ego
            if user_id in stored_user_list:
                print("\tgot pickled: ", user_id)
                with open(self.pickle_dir + "/" + str(user_id), "rb") as in_f:
                    user = pickle.load(in_f)
            else:
                user = TwitterUser(self.api_hook, user_id=user_id)
                print("\tgetting tweets for: ", user_id)
                user.populate_tweets_from_api()
                print("\t num tweets received for: ", user_id, " (", screen_name, "): ", len(user.tweets))
                if len(user.tweets) > 0:
                    print("\tgetting lists, friends, followers for: ", user_id)
                    user.populate_lists_member_of()
                print("pickling: ", screen_name)
                # 'with' closes the handle (the original leaked it).
                with open(self.pickle_dir + "/" + user_id, "wb") as out_f:
                    pickle.dump(user, out_f)
            self.write_user_network(this_user_network_dir_name, user, user_id, None)
            if len(user.tweets) == 0:
                print("finished collecting data for: ", user_id, ", no tweets")
                continue
            # Find the ego network based on retweets, mentions and replies
            user_network_to_pull = user.get_ego_network_actors()
            print("Starting to get ", user.user_id, "'s network of ", len(user_network_to_pull), " actors")
            # Restrict edge output to actors inside the ego network (+ ego).
            restrict_to_users = list(user_network_to_pull)
            restrict_to_users.append(user_id)
            self.get_user_network(this_user_network_dir_name, user_network_to_pull,
                                  restrict_to_users, stored_user_list)
        except Exception:
            # Report and keep the worker alive for the next queue item.
            print("FAILED:: ", data)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print("*** print_tb:")
            traceback.print_tb(exc_traceback, limit=50, file=sys.stdout)
        print("finished collecting data for: ", screen_name)
args.json_file_or_folder = os.path.join(OUTPUT_DIR, args.screen_name + ".json.gz") if os.path.exists(args.json_file_or_folder): print "User's tweets already in the system at: ", args.json_file_or_folder else: print "Getting user's tweets and saving to: ", args.json_file_or_folder if not args.path_to_twitter_credentials_file: print "Can't do anything with a screen name without some API credentials, see the help for this script " \ "and this parameter!" sys.exit(-1) app_handler = TwitterApplicationHandler( pathToConfigFile=args.path_to_twitter_credentials_file) user = TwitterUser(screen_name=args.screen_name, api_hook=app_handler.api_hooks[0]) user.populate_tweets_from_api( json_output_filename=args.json_file_or_folder, sleep_var=False) ######## # load the models and the files ######## print 'LOADING MODEL' identity_model, feature_names = get_identity_model_and_features() word_vector_model, all_dictionaries, ark_clusters, sets, names = get_init_data( GENSIM_MODEL_LOCATION, BROWN_CLUSTER_LOCATION) print 'MODEL HAS BEEN LOADED' def gen_json_for_tweets_of_interest(input_filename, output_filename,
def run(self):
    """Configurable worker loop (Python 2: uses `unicode` and print
    statements).

    Queue items come in several shapes:
      * a bare identifier (str/unicode/int, or a 1-element sequence),
      * (identifier, snow_sample_number, since_tweet_id)  -- 3-tuple,
      * (identifier, snow_sample_number) when self.step_count is set,
      * (identifier, since_tweet_id) when self.gets_since_tweet_id is set.
    None is the shutdown sentinel.

    For each identifier the worker either reloads previously collected
    data from disk (pickle + gzipped JSON under self.out_dir) or hits the
    Twitter API per the populate_* flags, optionally re-pickles the user,
    optionally snowballs mentioned/related users back onto the queue, and
    finally runs an optional post-processing hook.
    """
    print('Worker started')
    # do some initialization here
    # NOTE(review): these are set once, outside the loop — a bare-id
    # queue item processed after a tuple item inherits the previous
    # item's depth/since_id. Looks unintentional; confirm before fixing.
    snow_sample_number = None
    since_tweet_id = None
    while True:
        data = self.queue.get(True)  # block until an item arrives
        try:
            if data is None:
                # Sentinel: shut this worker down.
                print 'ALL FINISHED!!!!'
                break
            # Decode the queue-item shape (see docstring).
            if len(data) == 1 or type(data) is str or type(
                    data) is unicode or type(data) is int:
                user_identifier = data
            elif len(data) == 3:
                user_identifier, snow_sample_number, since_tweet_id = data
            elif len(data) == 2:
                # Ambiguous 2-tuples are disambiguated by worker config.
                if self.step_count:
                    user_identifier, snow_sample_number = data
                elif self.gets_since_tweet_id:
                    user_identifier, since_tweet_id = data
            user_identifier = str(user_identifier)
            print 'Starting: ', data
            # On-disk cache locations for this user.
            pickle_filename = os.path.join(self.out_dir, "obj",
                                           user_identifier)
            json_filename = os.path.join(self.out_dir, "json",
                                         user_identifier + ".json.gz")
            # Get the user's data
            if os.path.exists(pickle_filename) and os.path.exists(
                    json_filename) and not self.add_to_file:
                # Cache hit: rebuild the user from pickle + stored tweets.
                print '\tgot existing data for: ', data
                # NOTE(review): file handle from open() is never closed.
                user = pickle.load(open(pickle_filename, "rb"))
                user.populate_tweets_from_file(json_filename)
            else:
                # Cache miss (or add_to_file): collect fresh via the API.
                if self.gets_user_id:
                    user = TwitterUser(self.api_hook,
                                       user_id=user_identifier)
                else:
                    user = TwitterUser(self.api_hook,
                                       screen_name=user_identifier)
                print 'populating tweets', user_identifier
                if self.populate_tweets:
                    # populate_object_with_tweets=False: tweets go to disk
                    # (or are just counted), not kept on the object.
                    if self.save_user_tweets:
                        print 'saving tweets to: ', json_filename
                        of_name, tweet_count = user.populate_tweets_from_api(
                            json_output_filename=json_filename,
                            since_id=since_tweet_id,
                            populate_object_with_tweets=False)
                    else:
                        of_name, tweet_count = user.populate_tweets_from_api(
                            since_id=since_tweet_id,
                            populate_object_with_tweets=False)
                    if self.tweet_count_file:
                        # Tab-separated "<identifier>\t<count>" audit line.
                        self.tweet_count_file.write(
                            str(user_identifier) + "\t" +
                            str(tweet_count) + "\n")
                if self.populate_lists:
                    print 'populating lists', user.screen_name
                    user.populate_lists_member_of()
                if self.populate_friends:
                    print 'populating friends, ', user.screen_name
                    user.populate_friends()
                if self.populate_followers:
                    print 'populating followers, ', user.screen_name
                    user.populate_followers()
                # Only pickle when asked to save and something beyond
                # tweets was collected (or always_pickle is set).
                if self.save_user_data and \
                        (self.always_pickle or self.populate_lists
                         or self.populate_friends
                         or self.populate_followers):
                    # Pickle and dump user
                    #print 'pickling and dumping (no tweets): ', user.screen_name
                    user.tweets = []
                    # NOTE(review): file handle from open() never closed.
                    pickle.dump(user, open(pickle_filename, "wb"))
            # now add to queue if necessary
            # Snowball: enqueue related users one depth level deeper.
            if snow_sample_number is not None and \
                    snow_sample_number < self.step_count:
                for user_identifier in self.add_users_to_queue_function(
                        user):
                    self.queue.put(
                        [str(user_identifier), snow_sample_number + 1])
            if self.post_process_function:
                self.post_process_function(user)
        except KeyboardInterrupt as e:
            # Manual interrupt: stop this worker entirely.
            print e
            break
        except Exception:
            # Any other failure: report and move on to the next item.
            print('FAILED:: ', data)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print("*** print_tb:")
            traceback.print_tb(exc_traceback, limit=30, file=sys.stdout)
            print("*** print_exception:")