#user_sns = [line.strip() for line in open(sys.argv[3]).readlines()]
user_sns = ['Neuro_Skeptic']
print 'num users: ', len(user_sns)

of = codecs.open("output_fil.tsv", "w", "utf8")

for i in range(len(user_sns)):
    # create a TwitterUser object and fill it with information from the API
    user = TwitterUser(handles[i], screen_name=user_sns[i])
    user.populate_tweets_from_api(json_output_filename=out_dir + user_sns[i] + ".json",
                                  sleep_var=False)
    user.populate_followers()

    # count retweets and geotagged tweets for this user
    rts = 0
    gt = 0
    for t in user.tweets:
        if t.retweeted is not None:
            rts += 1
        if t.geocode_info is not None:
            gt += 1

    # write one summary row per user
    of.write(tab_stringify_newline([user.screen_name,
                                    gt,
                                    rts,
                                    len(user.tweets),
                                    user.earliest_tweet_time,
                                    user.latest_tweet_time,
                                    user.name,
                                    user.n_total_tweets,
                                    user.creation_date,
                                    user.followers_count,
                                    user.following_count]))

of.close()
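# Every script here writes rows through tab_stringify_newline, which comes from the
# library's utility module. The function below is only a minimal sketch of what such
# a helper might do (coerce each field to unicode, join with tabs, append a newline);
# it uses an illustrative name and is not the library's actual implementation.
def tab_stringify_newline_sketch(fields):
    # hypothetical stand-in: one tab-separated line per call
    return u"\t".join(unicode(f) for f in fields) + u"\n"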
except:
    mkdir(dir2)

# output files for the networks extracted from the collected tweets
friend_file = codecs.open(dir1 + "friend_edgefile.tsv", "w", "utf8")
user_ht_file = codecs.open(dir1 + "user_ht_edgefile.tsv", "w", "utf8")
# hashtag by hashtag network edgelist where co-occurrence in a tweet defines an edge
ht_ht_file = codecs.open(dir1 + "ht_ht_edgefile.tsv", "w", "utf8")
# user by user network edgelist of mention ties
mention_file = codecs.open(dir1 + "mention_edgefile.tsv", "w", "utf8")
# bipartite network edgelist of users and urls
user_url_file = codecs.open(dir1 + "user_url_edgefile.tsv", "w", "utf8")
url_mention_file = codecs.open(dir1 + "url_mention_edgefile.tsv", "w", "utf8")
# node attribute table
attribute_file = codecs.open(dir1 + "attribute.tsv", "w", "utf8")
# a list of all geo-tagged tweets
geo_file = codecs.open(dir1 + "geofile.tsv", "w", "utf8")
# language frequencies at the user level
lang_file = codecs.open(dir1 + "langfile.tsv", "w", "utf8")
# user by user network where an edge is a retweet
retweet_file = codecs.open(dir1 + "retweet.tsv", "w", "utf8")

# header rows
friend_file.write(tab_stringify_newline(['Source', 'Target']))
user_ht_file.write(tab_stringify_newline(['Source', 'Target', 'tweetID', 'date']))
user_url_file.write(tab_stringify_newline(['Source', 'Target', 'tweetID', 'date', 'urls_tweet']))
ht_ht_file.write(tab_stringify_newline(['userID', 'hashtag_A', 'hashtag_B', 'tweetID', 'date']))
mention_file.write(tab_stringify_newline(['Source', 'Target', 'tweetID', 'date']))
url_mention_file.write(tab_stringify_newline(['Source', 'Target', 'url', 'tweetID', 'date']))
geo_file.write(tab_stringify_newline(['userID', 'lat', 'lon', 'date']))
lang_file.write(tab_stringify_newline(['userID', 'lang', 'count']))
retweet_file.write(tab_stringify_newline(['Source', 'retweet_sn', 'tweetID', 'date']))
attribute_file.write(tab_stringify_newline(['userID', 'ScreenName', 'followingCount', 'followerCount',
                                            'tweetCount', 'tweetsCollected', 'firstTweet', 'lastTweet',
                                            'creation_date', 'urlCount', 'mentionCount']))

for file in onlyfiles:
    try:
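# The ht_ht edgelist records one row per pair of hashtags that co-occur in a tweet.
# Below is a minimal sketch of how such rows could be produced for a single tweet;
# the tweet fields (t.hashtags, t.id, t.created_at) and the write_ht_pairs name are
# illustrative assumptions, not the library's actual parsing code.
from itertools import combinations

def write_ht_pairs(ht_ht_file, user_id, t):
    # emit every unordered pair of distinct hashtags appearing in tweet t
    for ht_a, ht_b in combinations(sorted(set(t.hashtags)), 2):
        ht_ht_file.write(tab_stringify_newline([user_id, ht_a, ht_b, t.id, t.created_at]))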
print "N TO FIND: ", len(user_sns) #user_ids = [u for u in user_ids] user_ids = [u.lower() for u in user_sns] out_fil = io.open(sys.argv[3], "w") i = 0 j = 0 print len(user_sns) out_fil.write( tab_stringify_newline([ "id", 'name', "screen_name", 'url', 'protected', 'location', 'description', "followers_count", "friends_count", "favourites_count", "created_at", "utc_offset", 'time_zone', "statuses_count", "lang", "status_created_at", "status_coordinates", "status_lang", "profile_image_url_https", "verified" ])) while i < len(user_sns): j += 1 print j api_hook = handles[random.randint(0, len(handles) - 1)] curr_ids = set(user_ids[i:(i + 100)]) user_data = api_hook.get_from_url("users/lookup.json", { "user_id": ",".join(curr_ids), "include_entities": "false" })
"processed_data/50mpaths2", "dictionaries/*/*", BOOTSTRAPPED_DICTIONARY_LOCATION) features_from_conll_file, dictionary_data = get_all_features( CONLL_FILE, all_dictionaries, ark_clusters, sets, names) # get ids of all random, put in a set, then iterate through random.seed(0) all_random_ids = get_test_ids(CONLL_FILE, 0, -1, -1) random.shuffle(all_random_ids) output_file = open("results/param_tuning_results.tsv", "w") for fold in range(5): output, models, preds = run_all_on_test_ids( fold, all_random_ids[(fold * 150):((fold + 1) * 150)], model, features_from_conll_file, dictionary_data, eval_params=[.4, .45, .5, .55, .6], cutoff_params=[.0001, .0005, .001], use_filtered_params=[True, False], datasets_to_use=['x', 'wv', 'x_wv', 'all_wv', 'x_wv_ls', 'full'], regularization_params=[.53, .58, .6, .63, .65]) for o in output: output_file.write(tab_stringify_newline(o)) output_file.close()
def run(self):
    print('Worker started')
    # do some initialization here
    while True:
        data = self.queue.get(True)
        try:
            if data is None:
                # None on the queue is the sentinel telling this worker to shut down
                print('ALL FINISHED!!!!', self.conn_number)
                self.out_file.close()
                self.user_info_out_file.close()
                break
            print 'collecting data'

            # look the batch up by numeric id or by screen name, depending on configuration
            if self.gets_user_id:
                user_data = self.api_hook.get_from_url("users/lookup.json",
                                                       {"user_id": ",".join(data),
                                                        "include_entities": "false"},
                                                       do_post=True)
            else:
                user_data = self.api_hook.get_from_url("users/lookup.json",
                                                       {"screen_name": ",".join(data),
                                                        "include_entities": "false"},
                                                       do_post=True)

            # record ids the API did not return (suspended or deleted accounts, etc.)
            user_ret_ids = [str(u['id']) for u in user_data]
            print len(data), len(user_ret_ids)
            not_there = set.difference(set(data), set(user_ret_ids))
            print len(not_there)
            for u in not_there:
                self.out_file.write(tab_stringify_newline([u]))

            print 'sleeping'
            for user in user_data:
                output_data = [user["id"],
                               user.get('name'),
                               user["screen_name"],
                               user.get('url', ''),
                               user['protected'],
                               user.get('location', ''),
                               user.get('description', ''),
                               user["followers_count"],
                               user["friends_count"],
                               user["created_at"],
                               user.get("utc_offset", ''),
                               user.get('time_zone', ''),
                               user["statuses_count"],
                               user["lang"]]
                if 'status' in user:
                    output_data += [user["status"]["created_at"],
                                    user["status"]["coordinates"] if user['status']['coordinates'] else '',
                                    user["status"]["lang"]]
                else:
                    output_data += ['', '', '']
                output_data += [user.get("profile_image_url_https", ""),
                                user.get("verified", "")]

                # strip newlines and tabs so each profile stays on one TSV row
                output_data = [(x.replace("\r\n", " ").replace("\n", " ")
                                 .replace("\r", " ").replace("\t", " "))
                               if type(x) is str else x for x in output_data]
                output_data = [(x.replace(u"\r\n", u" ").replace(u"\n", u" ")
                                 .replace(u"\r", u" ").replace(u"\t", u" "))
                               if type(x) is unicode else x for x in output_data]

                to_write = tab_stringify_newline(output_data)
                self.user_info_out_file.write(to_write)

            # pause between batches to stay under the rate limit
            sleep(15)
        except KeyboardInterrupt as e:
            print e
            break
        except Exception:
            print('FAILED:: ', data)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print("*** print_tb:")
            traceback.print_tb(exc_traceback, limit=50, file=sys.stdout)
            print("*** print_exception:")
        print('finished collecting data for: ', data)
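# The worker above consumes batches of ids from a shared queue and treats None as the
# shutdown signal. Below is a minimal sketch of how a driver might feed such workers,
# assuming a multiprocessing.Queue; feed_workers, n_workers, and batch_size are
# illustrative names, not part of the original script.
from multiprocessing import Queue

def feed_workers(queue, user_ids, n_workers, batch_size=100):
    # enqueue id batches, then one None sentinel per worker so each run() loop exits
    for start in range(0, len(user_ids), batch_size):
        queue.put(user_ids[start:start + batch_size])
    for _ in range(n_workers):
        queue.put(None)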
"processed_data/50mpaths2", "dictionaries/*/*", BOOTSTRAPPED_DICTIONARY_LOCATION) features_from_conll_file, dictionary_data = get_all_features(CONLL_FILE,all_dictionaries,ark_clusters,sets,names) # get ids of all random, put in a set, then iterate through random.seed(0) all_random_ids = get_test_ids(CONLL_FILE, 0, -1, -1) random.shuffle(all_random_ids) output_file = open("results/param_tuning_results.tsv","w") for fold in range(5): output, models, preds = run_all_on_test_ids(fold, all_random_ids[(fold*150):( (fold+1)*150)], model, features_from_conll_file, dictionary_data, eval_params = [.4,.45,.5,.55,.6], cutoff_params=[.0001,.0005,.001], use_filtered_params=[True,False], datasets_to_use = ['x','wv','x_wv','all_wv','x_wv_ls','full'], regularization_params = [.53,.58,.6,.63,.65]) for o in output: output_file.write(tab_stringify_newline(o)) output_file.close()