def gen_json_for_tweets_of_interest(data, identity_list): of_id, uid_list = data json_of_name = os.path.join(JSON_OUTPUT_DIRECTORY, str(of_id) + ".json.gz") print 'inp: ', json_of_name, len(uid_list), uid_list[0:2] tweets_to_write = [] if not os.path.exists(json_of_name): for i, uid in enumerate(uid_list): if i % 25 == 0: print i, len(tweets_to_write) try: u = TwitterUser() u.populate_tweets_from_file(os.path.join( JSON_INPUT_DIRECTORY, uid + ".json.gz"), store_json=True) tweets_to_keep = [] for t in u.tweets: if not t.retweeted and len(t.tokens) > 4: expanded_token_set = copy(t.tokens) for token in t.tokens: expanded_token_set += get_alternate_wordforms( token) if len(set(expanded_token_set) & identity_list): tweets_to_keep.append(t) tweets_to_write += tweets_to_keep except: print 'FAILED JSON FOR USER: '******'WRITING JSON' out_fil = gzip.open(json_of_name, "wb") for tweet in tweets_to_write: out_fil.write( json.dumps(tweet.raw_json).strip().encode("utf8") + "\n") out_fil.close()
def gen_output(data, json_data_dir): term, is_reply, tweets_needed = data dataset = [] # get all user files files = glob.glob(os.path.join(json_data_dir, "*")) random.shuffle(files) for f in files: user = TwitterUser() user.populate_tweets_from_file(f, store_json=True, do_arabic_stemming=False, lemmatize=False) if 50 <= user.n_total_tweets <= 10000 and\ user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE: tweet_set = [t for t in user.tweets if t.retweeted is None and\ len(t.urls) == 0 and 'http:' not in t.text and\ len(t.tokens) > 5 and\ t.created_at >= MIN_TWEET_DATE and\ (term == '' or term in t.tokens) and\ langid.classify(t.text)[0] == 'en'and\ sentiment(t.text)['compound'] != 0] if is_reply: tweet_set = [t for t in tweet_set if t.reply_to] else: tweet_set = [t for t in tweet_set if not t.reply_to] if len(tweet_set) == 0: print 'size 0', term, tweets_needed, is_reply continue tweet = random.sample(tweet_set, 1)[0] print user.screen_name, term, tweets_needed, is_reply, ":::: ", tweet.text dataset.append(tweet) tweets_needed -= 1 if tweets_needed == 0: name = term if term != '' else 'random' name += '_reply' if is_reply else '_non_reply' pickle.dump(dataset, open(name + ".p", 'wb')) print 'done with: ', name, is_reply return else: print 'failed user'
def gen_json_for_tweets_of_interest(input_filename, output_filename, keep_only_tweets_with_terms=None): """ This function generates a cleaned json file so that the identity extraction only happens on "interesting" tweets. Right now, interesting is defined as non-retweets that have >4 tokens. Feel free to redefine as you feel is suitable :param input_filename: input json file name (Can be gzipped) :param output_filename: cleaned output json filename :param keep_only_tweets_with_terms: If you only want to keep tweets containing a specific set of terms, you can use this argument and pass in a set of terms here :return: """ tweets_to_write = [] if not os.path.exists(output_filename): try: u = TwitterUser() u.populate_tweets_from_file(input_filename, store_json=True) tweets = [ t for t in u.tweets if not t.retweeted and len(t.tokens) > 4 ] tweets_to_keep = [] if keep_only_tweets_with_terms: expanded_token_set = copy(t.tokens) for t in tweets: for token in t.tokens: expanded_token_set += get_alternate_wordforms(token) if len( set(expanded_token_set) & keep_only_tweets_with_terms): tweets_to_keep.append(t) else: tweets_to_keep = tweets tweets_to_write += tweets_to_keep except: print 'FAILED TO PARSE JSON FILE: ', input_filename print 'WRITING JSON' out_fil = gzip.open(output_filename, "wb") for tweet in tweets_to_write: out_fil.write( json.dumps(tweet.raw_json).strip().encode("utf8") + "\n") out_fil.close()
def getTweets(twitterid):
    '''
    Function to get the twitter data for an individual twitter ID.
    This function is written to work with Kenny's github example here:
    https://github.com/kennyjoseph/twitter_dm

    Input: string of twitterID
    Output: list of the token lists of all tweets for twitterID
            (docstring fixed: the code returns t.tokens, not raw strings)
    '''
    from twitter_dm.TwitterUser import TwitterUser
    u = TwitterUser()
    # NOTE(review): need to figure out if we can use a numeric ID
    # (123456789.json) or a screen name (kenny_joseph.json) here
    u.populate_tweets_from_file(twitterid + '.json')
    # NOTE(review): t.tokens are tokenized terms, not the raw words --
    # confirm tokens are what downstream callers actually want
    return [t.tokens for t in u.tweets]
# Get the handles to the Twitter API handles = get_handles(glob.glob(os.path.join(sys.argv[1], "*.txt"))) print 'n authed users: ', len(handles) out_dir = sys.argv[2] os.mkdir(out_dir) #user_sns = [line.strip() for line in open(sys.argv[3]).readlines()] user_sns = ['Neuro_Skeptic'] print 'num users: ', len(user_sns) of = codecs.open("output_fil.tsv", "w", "utf8") for i in range(len(user_sns)): #creates a Twitter User object to fill with information from the API user = TwitterUser(handles[i], screen_name=user_sns[i]) user.populate_tweets_from_api(json_output_filename=out_dir + user_sns[i] + ".json", sleep_var=False) user.populate_followers() rts = 0 gt = 0 for t in user.tweets: if t.retweeted is not None: rts += 1 if t.geocode_info is not None: gt += 1 of.write( tab_stringify_newline([ user.screen_name, gt, rts,
from twitter_dm.TwitterAPIHook import TwitterAPIHook from twitter_dm.TwitterUser import TwitterUser username_to_collect_data_for = 'Jackie_Pooo' consumer_key = "YOUR_CONSUMER_KEY_HERE" consumer_secret = "YOUR_CONSUMER_SECRET_HERE" access_token = "YOUR_ACCESS_TOKEN_HERE" access_token_secret = "YOUR_ACCESS_TOKEN_SECRET_HERE" ## get a "hook", or connection, to the API using your consumer key/secret and access token/secret api_hook = TwitterAPIHook(consumer_key,consumer_secret, access_token=access_token,access_token_secret=access_token_secret) #creates a Twitter User object to fill with information from the API user = TwitterUser(api_hook,screen_name=username_to_collect_data_for) # we call populate_tweets_from_api,which goes to the Twitter API # and collects the user's data it is outputted to the file username_you_put.json # the sleep_var param tells the function it shouldn't worry # about rate limits (we're only collecting for one user, so it doesn't really matter # If you remove the is_gzip argument, the output file will be gzipped print 'populating users tweets!' user.populate_tweets_from_api(json_output_filename=username_to_collect_data_for+".json", sleep_var=False, is_gzip=False, since_id=None) for t in user.tweets: print t.mentions print 'user had {n_tweets} tweets'.format(n_tweets=len(user.tweets))
def gen_conll_file(fil, ptb_dir, dp_dir):
    """Build CoNLL-style data for one user by merging the user's
    dependency parses with their Penn-Treebank POS/feature file.

    :param fil: path to the user's tweet json file
    :param ptb_dir: directory of per-user <user_id>.txt.gz Penn Treebank files
    :param dp_dir: directory of per-user <user_id>.gz dependency-parse files
    :return: a 2-element list [status, payload] where status is one of
        'no_dp_ptb', 'notweets', 'success', 'baduser'
    """
    user = TwitterUser()
    user.populate_tweets_from_file(fil, do_tokenize=False)
    # keep "normal" accounts only: moderately active, not celebrity-sized,
    # and created before MIN_ACCOUNT_AGE
    if 50 <= user.n_total_tweets <= 15000 and\
            user.followers_count <= 25000 and user.creation_date <= MIN_ACCOUNT_AGE:
        dp_filename = os.path.join(dp_dir, str(user.user_id) + ".gz")
        ptb_filename = os.path.join(ptb_dir, str(user.user_id) + ".txt.gz")
        if not os.path.exists(dp_filename) or not os.path.exists(ptb_filename):
            # can't proceed without both parse files; report which ones exist
            return [
                'no_dp_ptb',
                [
                    user.user_id,
                    os.path.exists(dp_filename),
                    os.path.exists(ptb_filename)
                ]
            ]
        # penntreebank maps the group's first field (tweet id) to the
        # remaining per-token lines of that group
        penntreebank = {
            x[0]: x[1:]
            for x in read_grouped_by_newline_file(ptb_filename)
        }
        dependency_parse = read_grouped_by_newline_file(dp_filename)
        # original (non-RT), link-free English tweets, paired with their
        # index so they can be lined up with the dependency-parse groups
        tweet_set = [(i,t) for i,t in enumerate(user.tweets) if t.retweeted is None and\
                     len(t.urls) == 0 and 'http:' not in t.text and\
                     langid.classify(t.text)[0] == 'en']
        # non english speaker or spam
        if len(tweet_set) < 40:
            return ['notweets', user.user_id]
        data_to_return = []
        for twit_it, tweet in tweet_set:
            data_for_tweet = []
            ptb_for_tweet = penntreebank[str(tweet.id)]
            dp_for_tweet = dependency_parse[twit_it]
            # sanity check: the first PTB token (3rd tab field) must match the
            # first dependency-parse token; otherwise skip this tweet
            if ptb_for_tweet[0].split("\t")[2] != DependencyParseObject(
                    dp_for_tweet[0]).text:
                print 'ahhhhh, weird stuff'
                continue
            for i, p in enumerate(dp_for_tweet):
                # build a parse object tagged with tweet/user/date metadata
                d = DependencyParseObject(
                    tsn([
                        p, tweet.id, user.user_id,
                        tweet.created_at.strftime("%m-%d-%y")
                    ],
                        newline=False))
                # get java features
                spl_java = ptb_for_tweet[i].split("\t")
                java_id, penn_pos_tag, word = spl_java[:3]
                # 4th field (pipe-separated features) is optional
                java_features = '' if len(spl_java) == 3 else spl_java[3]
                d.features += [x for x in java_features.split("|") if x != '']
                d.features.append("penn_treebank_pos=" + penn_pos_tag)
                data_for_tweet.append(d)
            data_to_return.append(data_for_tweet)
        return ['success', [user.user_id, data_to_return]]
    else:
        return ['baduser', user.user_id]
# Caller supplied a screen name rather than an existing json file/folder:
# resolve it to (or fetch it into) a json.gz file under OUTPUT_DIR.
elif args.screen_name:
    print 'Running with screen name: ', args.screen_name
    args.json_file_or_folder = os.path.join(OUTPUT_DIR,
                                            args.screen_name + ".json.gz")
    if os.path.exists(args.json_file_or_folder):
        # tweets already downloaded on a previous run; reuse them
        print "User's tweets already in the system at: ", args.json_file_or_folder
    else:
        print "Getting user's tweets and saving to: ", args.json_file_or_folder
        # hitting the API requires credentials
        if not args.path_to_twitter_credentials_file:
            print "Can't do anything with a screen name without some API credentials, see the help for this script " \
                  "and this parameter!"
            sys.exit(-1)
        app_handler = TwitterApplicationHandler(
            pathToConfigFile=args.path_to_twitter_credentials_file)
        user = TwitterUser(screen_name=args.screen_name,
                           api_hook=app_handler.api_hooks[0])
        user.populate_tweets_from_api(
            json_output_filename=args.json_file_or_folder, sleep_var=False)

########
# load the models and the files
########
print 'LOADING MODEL'
identity_model, feature_names = get_identity_model_and_features()
word_vector_model, all_dictionaries, ark_clusters, sets, names = get_init_data(
    GENSIM_MODEL_LOCATION, BROWN_CLUSTER_LOCATION)
print 'MODEL HAS BEEN LOADED'
# Parse one "term=count" dataset descriptor; the special term 'random'
# means "no term filter" and is stored as the empty string.
spl = dataset_descrip.split("=")
if spl[0] == 'random':
    datasets_to_collect.append(['', int(spl[1]), []])
else:
    datasets_to_collect.append([spl[0], int(spl[1]), []])

# get all user files
files = glob.glob(os.path.join(json_data_dir, "*"))

# start with the first dataset spec
curr_dataset = datasets_to_collect[0]
print datasets_to_collect

for f in files:
    user = TwitterUser(filename_for_tweets=f)
    # restrict to "normal" accounts: moderately active, not
    # celebrity-sized, created before MIN_ACCOUNT_AGE
    if user.n_total_tweets < 10000 and user.n_total_tweets > 50 and\
            user.followers_count < 25000 and user.creation_date <= MIN_ACCOUNT_AGE:
        # original, link-free, non-trivial English tweets containing the
        # dataset's term, with non-neutral sentiment
        # NOTE(review): t.created_at <= MIN_TWEET_DATE keeps *older*
        # tweets; a similar sampler in this codebase uses >= -- confirm
        # which direction is intended
        tweet_set = [t for t in user.tweets if t.retweeted is None and\
                     len(t.urls) == 0 and\
                     len(t.tokens) > 5 and\
                     t.created_at <= MIN_TWEET_DATE and\
                     curr_dataset[0] in t.tokens and\
                     langid.classify(t.text)[0] == 'en'and\
                     sentiment(t.text)['compound'] != 0]
        if len(tweet_set) == 0:
            continue
        # pick one matching tweet at random
        # NOTE(review): this chunk appears truncated here in the source view
        tweet = random.sample(tweet_set, 1)[0]
    def run(self):
        """Worker loop: pull user identifiers off the queue and collect each
        user's Twitter data (tweets / lists / friends / followers) according
        to the worker's configuration flags, until a None sentinel arrives.
        """
        print('Worker started')
        # do some initialization here
        snow_sample_number = None
        since_tweet_id = None
        while True:
            data = self.queue.get(True)
            try:
                if data is None:
                    # sentinel: no more work for this worker
                    print 'ALL FINISHED!!!!'
                    break
                # queue items come in several shapes: a bare identifier,
                # (id, snowball step), (id, since tweet id), or
                # (id, snowball step, since tweet id)
                if len(data) == 1 or type(data) is str or type(
                        data) is unicode or type(data) is int:
                    user_identifier = data
                elif len(data) == 3:
                    user_identifier, snow_sample_number, since_tweet_id = data
                elif len(data) == 2:
                    # ambiguous 2-tuple: disambiguated by worker config
                    if self.step_count:
                        user_identifier, snow_sample_number = data
                    elif self.gets_since_tweet_id:
                        user_identifier, since_tweet_id = data
                user_identifier = str(user_identifier)
                print 'Starting: ', data
                pickle_filename = os.path.join(self.out_dir, "obj",
                                               user_identifier)
                json_filename = os.path.join(self.out_dir, "json",
                                             user_identifier + ".json.gz")
                # Get the user's data
                if os.path.exists(pickle_filename) and os.path.exists(
                        json_filename) and not self.add_to_file:
                    # already collected on a previous run; reload from disk
                    print '\tgot existing data for: ', data
                    user = pickle.load(open(pickle_filename, "rb"))
                    user.populate_tweets_from_file(json_filename)
                else:
                    # look the user up by numeric id or screen name
                    if self.gets_user_id:
                        user = TwitterUser(self.api_hook,
                                           user_id=user_identifier)
                    else:
                        user = TwitterUser(self.api_hook,
                                           screen_name=user_identifier)
                    print 'populating tweets', user_identifier
                    if self.populate_tweets:
                        if self.save_user_tweets:
                            print 'saving tweets to: ', json_filename
                            of_name, tweet_count = user.populate_tweets_from_api(
                                json_output_filename=json_filename,
                                since_id=since_tweet_id,
                                populate_object_with_tweets=False)
                        else:
                            of_name, tweet_count = user.populate_tweets_from_api(
                                since_id=since_tweet_id,
                                populate_object_with_tweets=False)
                        if self.tweet_count_file:
                            self.tweet_count_file.write(
                                str(user_identifier) + "\t" +
                                str(tweet_count) + "\n")
                    if self.populate_lists:
                        print 'populating lists', user.screen_name
                        user.populate_lists_member_of()
                    if self.populate_friends:
                        print 'populating friends, ', user.screen_name
                        user.populate_friends()
                    if self.populate_followers:
                        print 'populating followers, ', user.screen_name
                        user.populate_followers()
                    if self.save_user_data and \
                            (self.always_pickle or self.populate_lists or
                             self.populate_friends or self.populate_followers):
                        # Pickle and dump user
                        #print 'pickling and dumping (no tweets): ', user.screen_name
                        # tweets live in the json file, not the pickle
                        user.tweets = []
                        pickle.dump(user, open(pickle_filename, "wb"))
                # now add to queue if necessary
                # (snowball sampling: enqueue this user's neighbors one step
                # deeper, until self.step_count is reached)
                if snow_sample_number is not None and snow_sample_number < self.step_count:
                    for user_identifier in self.add_users_to_queue_function(
                            user):
                        self.queue.put(
                            [str(user_identifier), snow_sample_number + 1])
                if self.post_process_function:
                    self.post_process_function(user)
            except KeyboardInterrupt as e:
                print e
                break
            except Exception:
                # best-effort diagnostics; the worker keeps consuming items
                print('FAILED:: ', data)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                print("*** print_tb:")
                traceback.print_tb(exc_traceback, limit=30, file=sys.stdout)
                print("*** print_exception:")
                # NOTE(review): the source view appears truncated here; the
                # handler presumably continued (e.g. traceback.print_exception)