def ds_all_tweets(mode):
    """Write one CSV row per Twitter user built from all of that user's tweets.

    The output path is chosen from ``mode``. For every distinct
    ``user.id_str`` in ``maif_db.tweets`` the user's age is read from
    ``maif_db.age_labels`` and the user's full tweet list is handed to
    ``get_csv_row``. A companion file named ``<output path> + 'dist.txt'``
    receives one ``"<twitter_id> <tweet count>"`` line per user.

    Args:
        mode: One of ``TOKENS_MODE``, ``EXT_FEATURES_MODE`` or
            ``TWEETS_MODE``. It selects the output file and is also
            forwarded to ``get_csv_row``.

    Raises:
        ValueError: If ``mode`` is not one of the supported modes.
    """
    # Resolve the output path before touching the database so that an
    # unsupported mode fails immediately with a clear error instead of an
    # UnboundLocalError after the (potentially slow) distinct() query.
    if mode == TOKENS_MODE:
        all_tweets_path = ALL_TWEETS_TOKENS_PATH
    elif mode == EXT_FEATURES_MODE:
        all_tweets_path = ALL_TWEETS_EXT_FEATURES_PATH
    elif mode == TWEETS_MODE:
        all_tweets_path = ALL_TWEETS_COMBINED_PATH
    else:
        raise ValueError('Unsupported mode: {!r}'.format(mode))

    tweets_dist = {}
    count = 0

    mongo_client = helper.get_mongo_client()
    maif_db = mongo_client['maif_db']
    tweets_col = maif_db['tweets']
    age_labels_col = maif_db['age_labels']

    twitter_ids = tweets_col.distinct('user.id_str')

    with open(all_tweets_path, 'w', newline='', encoding='utf-8') as wf_csv:
        writer = csv.writer(wf_csv, delimiter=',')
        for twitter_id in twitter_ids:
            # NOTE: find_one returns None when the user has no age label, in
            # which case the subscript below raises TypeError.
            age = age_labels_col.find_one({'id_str': twitter_id})['age']
            tweet_obj_list = list(tweets_col.find({'user.id_str': twitter_id}))
            tweets_dist[twitter_id] = len(tweet_obj_list)
            if tweet_obj_list:
                writer.writerow(
                    get_csv_row(twitter_id, tweet_obj_list, 0,
                                len(tweet_obj_list) - 1, age, mode))
                count += 1
                print('count: {}'.format(count))

    # Per-user tweet counts, written next to the CSV.
    with open(all_tweets_path + 'dist.txt', 'w') as wf:
        for k, v in tweets_dist.items():
            wf.write('{} {}\n'.format(k, v))
def insert_ages_in_db(
        age_labels_path=r'D:\Data\Linkage\Other Datasets\Age\Zhang_ICWSM_2016\ageLabels.txt'):
    """Load age labels from a text file into ``maif_db.age_labels``.

    Each non-blank line of the file is split on single spaces; the first
    token is stored as ``id_str`` and the second as ``age`` (both kept as
    strings). All resulting documents are inserted with one ``insert_many``
    call.

    Args:
        age_labels_path: Path of the label file. Defaults to the location the
            function originally had hard-coded, so existing zero-argument
            calls behave as before.

    Raises:
        IndexError: If a non-blank line has fewer than two space-separated
            tokens.
    """
    mongo_client = helper.get_mongo_client()
    maif_db = mongo_client['maif_db']
    age_labels_col = maif_db['age_labels']

    age_label_objs = []
    with open(age_labels_path) as rf:
        for line in rf:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no label; without this guard tokens[1] raises IndexError.
                continue
            tokens = stripped.split(' ')
            age_label_objs.append({'id_str': tokens[0], 'age': tokens[1]})

    # insert_many rejects an empty document list, so only call it when the
    # file actually produced labels.
    if age_label_objs:
        age_labels_col.insert_many(age_label_objs)
def gen_ds_x_tweets():
    """Write CSV rows built from windows of 50 consecutive tweets per user.

    For every ``(twitter_id, voter_serial)`` pair in
    ``twitter.ground_truths`` the matching voter document is loaded, tagged
    with the Twitter id, and the user's non-retweet tweets are scanned with a
    sliding window. The window start is advanced while
    ``date_difference_days(current, start)`` exceeds 365. Whenever the window
    spans indexes ``begin_index..begin_index + 49`` a row is written through
    ``get_csv_row`` and the next window starts at the last tweet of the one
    just emitted, so consecutive windows share one tweet.

    The tweets are used in the order the query returns them; no explicit sort
    is applied here.
    """
    mongo_client = helper.get_mongo_client()
    twitter_db = mongo_client['twitter']
    tweets_col = twitter_db['tweets']
    voters_col = twitter_db['voters']
    ground_truths_col = twitter_db['ground_truths']

    tuples = [(x['twitter_id'], x['voter_serial'])
              for x in ground_truths_col.find({})]

    with open(X_TWEETS_PATH, 'w', newline='', encoding='utf-8') as wf_csv:
        writer = csv.writer(wf_csv, delimiter=',')
        writer.writerow(CSV_HEADER)

        for twitter_id, voter_serial in tuples:
            voter = voters_col.find_one({'serial': voter_serial})
            voter['twitter_id'] = twitter_id

            tweet_obj_list = list(tweets_col.find(
                {'user.id_str': twitter_id,
                 'retweeted_status': {'$exists': False}}))

            begin_index = None
            begin_datetime = None
            for curr_index, tweet_obj in enumerate(tweet_obj_list):
                if begin_index is None:
                    # First tweet of this user opens the first window.
                    begin_index = curr_index
                    begin_datetime = get_datetime(tweet_obj['created_at'])
                    continue

                curr_datetime = get_datetime(tweet_obj['created_at'])
                # Shrink the window from the left until it spans at most
                # 365 days. This stops at the latest when begin_index reaches
                # curr_index, provided the difference of a tweet with itself
                # is not above 365.
                while date_difference_days(curr_datetime, begin_datetime) > 365:
                    begin_index += 1
                    begin_datetime = get_datetime(
                        tweet_obj_list[begin_index]['created_at'])

                if curr_index - begin_index == 49:
                    writer.writerow(
                        get_csv_row(voter, tweet_obj_list, begin_index,
                                    curr_index))
                    # Start the next window at the current tweet. The start
                    # timestamp must move together with the start index;
                    # leaving it stale would make the next 365-day check
                    # compare against the previous window's first tweet.
                    begin_index = curr_index
                    begin_datetime = curr_datetime
def gen_ds_yearly_tweets():
    """Write CSV rows that group each user's tweets into spans of <= 365 days.

    For every ``(twitter_id, voter_serial)`` pair in
    ``twitter.ground_truths`` the voter document is loaded, tagged with the
    Twitter id, and the user's non-retweet tweets are fetched sorted by
    ascending ``id``. The list is walked from its last index towards index 0.
    A span ends at ``span_end``; as soon as a tweet is more than 365 days
    (per ``date_difference_days``) away from the span's end tweet, the span
    ``cursor + 1 .. span_end`` is written and a new span ends at that tweet.
    The final span, reaching index 0, is written when the walk gets there.
    """
    db = helper.get_mongo_client()['twitter']
    tweets_col = db['tweets']
    voters_col = db['voters']
    ground_truths_col = db['ground_truths']

    id_serial_pairs = [(gt['twitter_id'], gt['voter_serial'])
                       for gt in ground_truths_col.find({})]

    with open(YEARLY_TWEETS_PATH, 'w', newline='', encoding='utf-8') as wf_csv:
        writer = csv.writer(wf_csv, delimiter=',')
        writer.writerow(CSV_HEADER)

        for twitter_id, voter_serial in id_serial_pairs:
            voter = voters_col.find_one({'serial': voter_serial})
            voter['twitter_id'] = twitter_id

            query = {'user.id_str': twitter_id,
                     'retweeted_status': {'$exists': False}}
            tweets = list(tweets_col.find(query).sort([('id', 1)]))

            span_end = len(tweets) - 1
            cursor = span_end
            while cursor >= 0:
                span_end_dt = get_datetime(tweets[span_end]['created_at'])
                cursor_dt = get_datetime(tweets[cursor]['created_at'])
                within_year = date_difference_days(cursor_dt, span_end_dt) <= 365

                if not within_year:
                    # The tweet at `cursor` falls outside the current span:
                    # flush the span and start a new one ending here. The
                    # cursor is deliberately not moved; the next pass sees
                    # the tweet compared with itself.
                    writer.writerow(
                        get_csv_row(voter, tweets, cursor + 1, span_end))
                    span_end = cursor
                    continue

                if cursor == 0:
                    # Reached the first tweet: flush the last open span.
                    writer.writerow(
                        get_csv_row(voter, tweets, cursor, span_end))
                cursor -= 1
def gen_ds_all_tweets_chunked():
    """Write one chunked CSV row per ground-truth user with any tweets.

    For every ``(twitter_id, voter_serial)`` pair in
    ``twitter.ground_truths`` the voter document is loaded and tagged with
    the Twitter id. The user's non-retweet tweets are fetched and, when there
    is at least one, the whole list (indexes ``0 .. len - 1``) is passed to
    ``get_csv_row_chunked`` and the result written to
    ``ALL_TWEETS_CHUNKED_PATH``. The file starts with ``CSV_HEADER_CHUNKED``.
    """
    db = helper.get_mongo_client()['twitter']
    tweets_col = db['tweets']
    voters_col = db['voters']
    ground_truths_col = db['ground_truths']

    id_serial_pairs = [(gt['twitter_id'], gt['voter_serial'])
                       for gt in ground_truths_col.find({})]

    with open(ALL_TWEETS_CHUNKED_PATH, 'w', newline='',
              encoding='utf-8') as wf_csv:
        writer = csv.writer(wf_csv, delimiter=',')
        writer.writerow(CSV_HEADER_CHUNKED)

        for twitter_id, voter_serial in id_serial_pairs:
            voter = voters_col.find_one({'serial': voter_serial})
            voter['twitter_id'] = twitter_id

            query = {'user.id_str': twitter_id,
                     'retweeted_status': {'$exists': False}}
            tweets = list(tweets_col.find(query))
            if not tweets:
                # Users without any non-retweet tweets produce no row.
                continue

            last_index = len(tweets) - 1
            writer.writerow(
                get_csv_row_chunked(voter, tweets, 0, last_index))
include_rts=include_rts) error = False except tweepy.TweepError as te: print('api error: {}'.format(te.reason)) if 'Not authorized' in te.reason or 'page does not exist' in te.reason: error = False return tweet_objs if __name__ == '__main__': apis = helper.get_twitter_app_apis() mongo_client = helper.get_mongo_client() twitter_db = mongo_client['twitter'] tweets_col = twitter_db['new_tweets'] gt_col = twitter_db['ground_truths'] index_col = twitter_db['new_tweets_index'] user_ids = [] gts = gt_col.find({}) for gt in gts: user_ids.append(gt['twitter_id']) user_ids.sort() index = get_index(index_col) position = index['position']
# Column order of one tab-separated voter record; position N in this tuple
# names the value found in column N of each line.
VOTER_RECORD_FIELDS = (
    'serial', 'fname', 'mname', 'lname', 'sex', 'dob', 'race_code',
    'add1', 'add2', 'city', 'zip_code', 'county_code', 'party',
    'phone', 'email',
)

# Read the split files rec_0.txt .. rec_1999.txt from FL_REC_SPLITS_DIR and
# turn every line into a voter document.
voter_objs = []
for split_no in range(2000):
    split_path = os.path.join(FL_REC_SPLITS_DIR,
                              'rec_{}.txt'.format(split_no))
    with open(split_path, 'r') as rf:
        for record in rf:
            columns = [column.strip() for column in record.split('\t')]
            # Index explicitly (rather than zip) so a record with too few
            # columns still raises IndexError.
            voter_objs.append({field: columns[pos]
                               for pos, field in enumerate(VOTER_RECORD_FIELDS)})

# Hand all collected voter documents to store_voters in a single call.
store_voters(helper.get_mongo_client(), voter_objs)
raise Exception( 'Already visited index {} at file {}'.format( vt_index, i)) vt_indexes_visited[vt_index] = 1 twitter_info = line[line.index(':\t') + 2:] attributes = twitter_info.split('\t') if len(attributes) > 3: twitter_name = attributes[0] twitter_id = attributes[2] count += 1 print('{} : {} ----- {}'.format(count, twitter_name, flnames[index])) ground_truth_obj = { 'twitter_id': twitter_id, 'voter_serial': serials[vt_index] } ground_truth_objs.append(ground_truth_obj) # if twitter_name == '': # print(master_ground_truths_file, index) store_ground_truths(helper.get_mongo_client(), ground_truth_objs)