import glob
import os
import sys
from multiprocessing import Queue

from twitter_dm.multiprocess import multiprocess_setup
from twitter_dm.multiprocess.WorkerTweetData import TweetDataWorker
from twitter_dm.utility import general_utils
from twitter_dm.utility.general_utils import mkdir_no_err, collect_system_arguments, chunk_data

handles, output_dir, tweet_ids, is_ids = collect_system_arguments(sys.argv)

# Create the output directory
mkdir_no_err(output_dir)

# Chunk tweet ids into groups of 100 (the API accepts up to 100 ids per request)
tweets_chunked = chunk_data(tweet_ids)
print tweets_chunked[0]

# Initialize a sync manager
multiprocess_setup.init_good_sync_manager()

# Put data on the queue
request_queue = multiprocess_setup.load_request_queue(tweets_chunked, len(handles))

# Run!
processes = []
for i in range(len(handles)):
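# For reference, a minimal sketch of what a chunking helper like chunk_data
# presumably does (an illustrative guess, not twitter_dm's actual
# implementation): split a list into consecutive pieces of at most 100
# elements, matching the API's 100-ids-per-request limit.
def chunk_data_sketch(data, chunk_size=100):
    return [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]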
OUTPUT_DIRECTORY = sys.argv[2]

# Get all the handles we have to the API
handles = general_utils.get_handles(glob.glob(os.path.join(sys.argv[1], "*.txt")))
print 'n authed users: ', len(handles)

# User screen names we are interested in
user_screenname_id_pairs = [line.strip().split("\t") for line in open(sys.argv[3]).readlines()]
print user_screenname_id_pairs[0]

pickle_dir = OUTPUT_DIRECTORY + "/obj/"
network_dir = OUTPUT_DIRECTORY + "/net/"
general_utils.mkdir_no_err(OUTPUT_DIRECTORY)
general_utils.mkdir_no_err(pickle_dir)
general_utils.mkdir_no_err(network_dir)

multiprocess_setup.init_good_sync_manager()

# Put data on the queue
request_queue = multiprocess_setup.load_request_queue(user_screenname_id_pairs, len(handles))

processes = []
for i in range(len(handles)):
    p = TwitterEgoNetworkWorker(request_queue, handles[i], network_dir, pickle_dir)
    p.start()
    processes.append(p)
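# The workers are kept in `processes` so the script can wait for them to
# finish; a common continuation of this pattern (hypothetical here, since
# the script ends at the spawn loop):
#
# for p in processes:
#     p.join()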
if len(sys.argv) != 4:
    print 'usage: [known_user_dir] [screen_name_file] [out_dir]'
    sys.exit(-1)

handles = []
for fil in glob.glob(sys.argv[1] + "/*.txt"):
    print 'FIL: ', fil
    app_handler = TwitterApplicationHandler(pathToConfigFile=fil)
    handles += app_handler.api_hooks
print 'n authed users: ', len(handles)

user_ids = set([line.strip().lower() for line in open(sys.argv[2]).readlines()])

out_dir = sys.argv[3]
general_utils.mkdir_no_err(out_dir)

print "N TO FIND: ", len(user_ids)

user_ids = [u for u in user_ids]

# Chunk the user ids into groups of 100 for the API
user_data_chunked = []
i = 0
while i < len(user_ids):
    user_data_chunked.append(user_ids[i:(i + 100)])
    i += 100

print 'len chunked: ', len(user_data_chunked)
import os
import sys
from datetime import datetime

from twitter_dm.multiprocess import multiprocess_setup
from twitter_dm.multiprocess.WorkerUserData import UserDataWorker
from twitter_dm.utility.general_utils import mkdir_no_err, collect_system_arguments

(handles, out_dir, user_ids, is_ids,
 collect_friends, collect_followers,
 gen_tweet_counts_file) = collect_system_arguments(
    sys.argv,
    ['collect_friends (y/n)', 'collect_followers (y/n)', 'gen_tweet_counts_file (y/n)'])

# Restrict the run to the first two handles
handles = handles[:2]

print 'num users: ', len(user_ids)

mkdir_no_err(out_dir)
mkdir_no_err(os.path.join(out_dir, "obj"))
mkdir_no_err(os.path.join(out_dir, "json"))

multiprocess_setup.init_good_sync_manager()

# Put data on the queue
request_queue = multiprocess_setup.load_request_queue(user_ids, len(handles))

tweet_count_file_dir = None
if gen_tweet_counts_file == 'y':
    # Name the directory with today's date, e.g. tweet_count2016-01-01
    tweet_count_file_dir = "tweet_count" + str(datetime.now()).split(" ")[0]
    mkdir_no_err(os.path.join(out_dir, tweet_count_file_dir))

processes = []
for i in range(len(handles)):
]

user_screenname_id_pairs = get_user_ids_and_sn_data_from_list(user_sns, handles, True)
print 'got screen names, ', len(user_screenname_id_pairs)

pickle_dir = OUTPUT_DIRECTORY + "/obj/"
network_dir = OUTPUT_DIRECTORY + "/json/"
general_utils.mkdir_no_err(OUTPUT_DIRECTORY)
general_utils.mkdir_no_err(pickle_dir)
general_utils.mkdir_no_err(network_dir)

multiprocess_setup.init_good_sync_manager()

# Put data on the queue
request_queue = multiprocess_setup.load_request_queue(
    [(x[1], 0) for x in user_screenname_id_pairs], len(handles), add_nones=False)
freeze_support()

if len(sys.argv) != 4:
    print "usage: [known_user_dir] [output_dir] [tweet_id_file]"
    sys.exit(-1)

handles = general_utils.get_handles(glob.glob(os.path.join(sys.argv[1], "*.txt")))
print "n authed users: ", len(handles)

out_dir = sys.argv[2]

user_ids = [line.strip().split(",")[0] for line in open(sys.argv[3]).readlines()]
print "num users: ", len(user_ids)

general_utils.mkdir_no_err(out_dir)
general_utils.mkdir_no_err(os.path.join(out_dir, "obj"))
general_utils.mkdir_no_err(os.path.join(out_dir, "json"))

multiprocess_setup.init_good_sync_manager()

# already_done = set([os.path.basename(f) for f in glob.glob(out_dir + "/*")])
print "len already done:", 0
# user_screennames = [u for u in user_screennames if u not in already_done]

# Put data on the queue
request_queue = multiprocess_setup.load_request_queue(user_ids, len(handles))

processes = []
for i in range(len(handles)):
    p = UserDataWorker(
        request_queue,
def run(self):
    print("Worker started")
    while True:
        try:
            data = self.queue.get(True)
            if data is None:
                print "ALL DONE, EXITING!"
                return

            user_id, screen_name = data[0], data[1]
            print("Starting: ", screen_name, user_id)

            this_user_network_dir_name = os.path.join(self.network_dir, user_id)
            mkdir_no_err(this_user_network_dir_name)

            stored_user_list = set(
                [os.path.basename(user_pickle) for user_pickle in glob.glob(self.pickle_dir + "*")])

            # Get the ego
            if user_id in stored_user_list:
                print("\tgot pickled: ", user_id)
                user = pickle.load(open(self.pickle_dir + "/" + str(user_id), "rb"))
            else:
                user = TwitterUser(self.api_hook, user_id=user_id)
                print("\tgetting tweets for: ", user_id)
                user.populate_tweets_from_api()
                print("\t num tweets received for: ", user_id, " (", screen_name, "): ", len(user.tweets))
                if len(user.tweets) > 0:
                    print("\tgetting lists, friends, followers for: ", user_id)
                    user.populate_lists_member_of()
                    # user.populate_followers()
                    # user.populate_friends()
                print("pickling: ", screen_name)
                pickle.dump(user, open(self.pickle_dir + "/" + user_id, "wb"))

            self.write_user_network(this_user_network_dir_name, user, user_id, None)

            if len(user.tweets) == 0:
                print("finished collecting data for: ", user_id, ", no tweets")
                continue

            # Find the ego network based on retweets, mentions and replies
            user_network_to_pull = user.get_ego_network_actors()
            print("Starting to get ", user.user_id, "'s network of ", len(user_network_to_pull), " actors")

            restrict_to_users = [u for u in user_network_to_pull]
            restrict_to_users.append(user_id)

            self.get_user_network(
                this_user_network_dir_name, user_network_to_pull, restrict_to_users, stored_user_list)
        except Exception:
            print("FAILED:: ", data)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            print("*** print_tb:")
            traceback.print_tb(exc_traceback, limit=50, file=sys.stdout)

        print("finished collecting data for: ", screen_name)
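# The `data is None` check above is the usual poison-pill shutdown pattern.
# A minimal sketch of the producer side (an illustrative guess at what
# load_request_queue does when add_nones is enabled, not its actual
# implementation): enqueue every work item, then one None per worker so each
# worker unblocks from queue.get() exactly once and returns.
def load_request_queue_sketch(items, n_workers, add_nones=True):
    from multiprocessing import Queue
    q = Queue()
    for item in items:
        q.put(item)
    if add_nones:
        # One sentinel per worker terminates the while-True loop above
        for _ in range(n_workers):
            q.put(None)
    return q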
import cPickle as pickle
import os
import sys
from collections import Counter
from multiprocessing import Pool
from os import listdir, mkdir

from twitter_dm.utility.general_utils import tab_stringify_newline as tsn, mkdir_no_err

if len(sys.argv) != 4:
    print 'usage: [input_dir] [output dir] [# cores for execution]'
    sys.exit(-1)

INPUT_DIR = sys.argv[1]
OUTPUT_DIR = sys.argv[2]
mkdir_no_err(OUTPUT_DIR)


def get_user_info(d):
    i, uid = d
    # if i % 1000 == 0:
    #     print i
    u = pickle.load(open(os.path.join(INPUT_DIR, 'obj', uid), 'rb'))
    fname = os.path.join(INPUT_DIR, 'json', uid + '.json.gz')
    u.populate_tweets_from_file(fname, store_json=False, do_arabic_stemming=False,
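# A sketch of how get_user_info is presumably driven (hypothetical; the
# script above is truncated): size a Pool from the third command-line
# argument and map over (index, user_id) pairs from the obj/ directory,
# which is what the (i, uid) unpacking and progress print expect.
#
# pool = Pool(int(sys.argv[3]))
# results = pool.map(get_user_info,
#                    list(enumerate(listdir(os.path.join(INPUT_DIR, 'obj')))))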
Basically, we don't get information back from the API if these users have been
suspended/deleted, so we can learn from that information.
"""
__author__ = 'kjoseph'

import glob
import sys

from twitter_dm.utility.general_utils import collect_system_arguments, chunk_data
from twitter_dm.multiprocess import multiprocess_setup
from twitter_dm.multiprocess.WorkerSimpleUserLookup import SimpleUserLookupWorker
from twitter_dm.utility import general_utils

handles, out_dir, data_to_collect, is_ids = collect_system_arguments(sys.argv)

general_utils.mkdir_no_err(out_dir)

user_data_chunked = chunk_data(data_to_collect)
print 'len chunked: ', len(user_data_chunked)

# Initialize a better sync manager
multiprocess_setup.init_good_sync_manager()

# Put data on the queue
request_queue = multiprocess_setup.load_request_queue(
    [x for x in user_data_chunked], len(handles), add_nones=True)

processes = []
for i in range(len(handles)):
    p = SimpleUserLookupWorker(request_queue, handles[i],
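# A minimal sketch of the inference the docstring above describes
# (illustrative; assumes the lookup results are raw Twitter user objects
# with an 'id_str' field): any requested id that does not come back from
# the lookup endpoint belongs to a suspended or deleted account.
def find_missing_ids_sketch(requested_ids, returned_user_json):
    returned = set(u['id_str'] for u in returned_user_json)
    return [uid for uid in requested_ids if uid not in returned]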
                    tweet_has_identity = True
                tweet.append(x.get_conll_form() + "\t" + lab)
                i += 1
            if tweet_has_identity:
                outfil.write(("\n".join(tweet) + "\n\n").encode("utf8"))

        outfil.close()
        return ['success', final_out_filename]
    except:
        print 'UNKNOWN ERROR: ', json_file_name
        return ['no_dp_ptb', False, False]


mkdir_no_err(OUTPUT_DIR)

users_to_ignore = open("results/u_ignore.txt", "w")
users_no_tweets = open("results/u_notweets.txt", "w")
users_need_dp = open("results/u_needdp.txt", "w")
users_need_ptb = open("results/u_need_ptb.txt", "w")

word_vector_model, all_dictionaries, ark_clusters, sets, names = get_init_data(
    'gensim_model/glove_twitter_50_raw_model.txt.gz',
    "processed_data/50mpaths2",
    "dictionaries/*/*",
    BOOTSTRAPPED_DICTIONARY_LOCATION)

CONLL_FILE = "processed_data/all_conll_pub_and_nonpub.txt"

features_from_conll_file, dict_for_filter = get_all_features(CONLL_FILE, all_dictionaries,
import os
from glob import glob
from multiprocessing import Pool

from twitter_dm import TwitterUser
from twitter_dm import dependency_parse_tweets
from twitter_dm.utility.general_utils import mkdir_no_err

CPU_COUNT = 2
TWEEBOPARSER_LOC = 'PATH_TO_TWEEBO_PARSER'
DATA_DIR = "PATH_TO_DIRECTORY_OF_(GZIPPED)_JSON_FILES_WITH_TWEETS"


def do_dependency_parse(fil):
    u = TwitterUser()
    u.populate_tweets_from_file(fil, do_tokenize=False)
    out_file_name = fil.replace(".json", "").replace(".gz", "").replace("/json/", "/dep_parse/")
    print out_file_name
    if len(u.tweets) == 0:
        # No tweets to parse; just touch the output file and move on
        os.utime(out_file_name, None)
        return 'empty, success'
    data = dependency_parse_tweets(TWEEBOPARSER_LOC, u.tweets, out_file_name)
    return 'completed'


mkdir_no_err(DATA_DIR.replace("json", "dep_parse"))

pool = Pool(processes=CPU_COUNT)
# do_dependency_parse(glob(DATA_DIR + "/*")[0])
result = pool.map(do_dependency_parse, glob(DATA_DIR + "/*"))
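# Note: on platforms that spawn rather than fork (e.g. Windows),
# multiprocessing re-imports the main module in each child, so the Pool
# setup above belongs under a main-module guard (the same reason the tweet
# collection script calls freeze_support()). A sketch:
#
# if __name__ == '__main__':
#     mkdir_no_err(DATA_DIR.replace("json", "dep_parse"))
#     pool = Pool(processes=CPU_COUNT)
#     result = pool.map(do_dependency_parse, glob(DATA_DIR + "/*"))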