def get_pair_to_tids():
    """Map every pair of co-occurring users to the tweets that involve both.

    A tweet "involves" its poster (``user_post``) plus, when those keys are
    present in the record, the retweeted user (``user_retweet``) and every
    mentioned user (``user_mentions``). Tweets whose id belongs to the test
    split are skipped.

    Returns
    -------
    dict
        ``{'<u1>_<u2>': [tweet_id, ...]}`` where ``u1`` sorts before ``u2``,
        so each unordered pair is stored under exactly one key.
    """
    print('Initializing Data Loader...')
    dl = Data_loader()
    # A set keeps the per-record membership test O(1); the original list made
    # the whole pass quadratic in the number of tweets.
    test_ids = {tweet['tweet_id'] for tweet in dl.test_data()}
    pair2tids = {}
    for record in dl.all_data():
        tweet_id = record['tweet_id']
        if tweet_id in test_ids:
            continue
        involved = {record['user_post']}
        if 'user_retweet' in record:
            involved.add(record['user_retweet'])
        if 'user_mentions' in record:
            involved.update(record['user_mentions'])
        # Sorting makes the pair key independent of the role each user played.
        ordered = sorted(involved)
        for i, u1 in enumerate(ordered):
            for u2 in ordered[i + 1:]:
                pair_id = str(u1) + '_' + str(u2)
                pair2tids.setdefault(pair_id, []).append(tweet_id)
    return pair2tids
def main(args):
    """Train one kind of embedding on the tweets outside the test/ensemble splits.

    ``args`` is a dict that must provide ``'option'`` (forwarded to the data
    loader and to the embedding generator) and ``'mode'``, one of ``'w2v'``,
    ``'svd'`` or ``'d2v'``. Each tweet's ``int_arr`` is turned into a sequence
    of string tokens; the exact shape handed to the generator depends on the
    mode (token lists for w2v/d2v, space-joined strings for svd).
    """
    # params for data loader
    option = args['option']
    print('Initializing Data Loader')
    dl = Data_loader(option=option)
    all_data = dl.all_data()
    print('Len of all data:', len(all_data))
    test_ids = {tweet['tweet_id'] for tweet in dl.test_data()}
    print('Len of test data:', len(test_ids))
    ensemble_ids = get_ensemble_tids()
    print('Len of ensemble data:', len(ensemble_ids))

    mode = args['mode']
    assert mode in ('w2v', 'svd', 'd2v')

    # Only tweets that are in neither the test nor the ensemble split are used.
    train_tweets = [
        tweet for tweet in all_data
        if not (tweet['tweet_id'] in test_ids or tweet['tweet_id'] in ensemble_ids)
    ]
    token_lists = [[str(x) for x in tweet['int_arr']] for tweet in train_tweets]

    if mode == 'svd':
        # need indices joined
        sentences = [' '.join(tokens) for tokens in token_lists]
    else:
        # need indices split
        sentences = token_lists
    print('Num sentences:', len(sentences))
    print('Check sentence0:', sentences[0])

    if mode == 'w2v':
        generate_w2v_embs(sentences, option)
    elif mode == 'svd':
        generate_svd_embs(sentences, option)
    else:
        # mode == d2v: each sentence is tagged with its own tweet id
        tags = [[str(tweet['tweet_id'])] for tweet in train_tweets]
        print('Check tag0:', tags[0])
        generate_d2v_embs(sentences, tags, option)
def make_user_embeds(num_users):
    """Build and save a (num_users, 300) user-embedding matrix.

    Every row starts as uniform random noise. For each user index from 2
    upwards (0 is reserved for padding, 1 for the unknown user), the row is
    replaced by the mean of the 'avg' w2v representations of the non-test
    tweets written by that user, when at least one such tweet exists. The
    matrix is then standardised per column, row 0 is reset to zeros, and the
    result is written with ``np.savetxt`` to ``'<num_users>_user_emb.np'``.

    Parameters
    ----------
    num_users : int
        Number of rows in the embedding matrix, including the two reserved
        indices.
    """
    dim = 300
    embeds = np.random.rand(num_users, dim)

    print('Initializing Data Loader...')
    dl = Data_loader()
    tl = init_tl('w2v')
    # Set membership is O(1); the original list was scanned for every tweet.
    test_ids = {tweet['tweet_id'] for tweet in dl.test_data()}

    pretrained_count = 0
    # reserve 0 for padding (i.e. no user), 1 for unknown user
    for user_idx in range(2, num_users):
        # all tweets WRITTEN by this user
        tweet_dicts = dl.tweets_by_user(user_idx)
        if tweet_dicts is None or len(tweet_dicts) == 0:
            continue
        tweet_count = 0
        # np.float was removed in NumPy 1.24; float64 is what it aliased.
        all_tweets_sum = np.zeros(dim, dtype=np.float64)
        for tweet_dict in tweet_dicts:
            tid = tweet_dict['tweet_id']
            if tid not in test_ids:
                tweet_count += 1
                all_tweets_sum += tl.get_representation(tid, mode='avg')
        if tweet_count > 0:
            pretrained_count += 1
            embeds[user_idx] = all_tweets_sum / tweet_count

    print('Found tweets for {} out of {} users'.format(pretrained_count,
                                                       num_users - 2))

    embeds = StandardScaler().fit_transform(embeds)  # mean 0, variance 1
    embeds[0] = np.zeros(dim)  # make sure padding is all 0's

    save_file = str(num_users) + '_user_emb.np'
    np.savetxt(save_file, embeds)
    print('Saved embeddings in', save_file)
type=int, default=20, help='iterations for word2vec; ignored if svd') args = vars(parser.parse_args()) print(args) # main(args) option = args['option'] print('Initializing Data Loader') dl = Data_loader(option=option) all_data = dl.all_data() all_tids = set([str(tweet['tweet_id']) for tweet in all_data]) print(list(all_tids)[:10]) print('Len of all data:', len(all_data)) test_ids = set([tweet['tweet_id'] for tweet in dl.test_data()]) print('Len of test data:', len(test_ids)) ensemble_ids = get_ensemble_tids() print('Len of ensemble data:', len(ensemble_ids)) print(list(ensemble_ids)[:10]) assert (len(ensemble_ids.intersection(all_tids)) == 0) # w2v_file = '../data/w2v_word_s300_w5_mc5_ep20.bin' # svd_file = '../data/svd_word_s300.pkl' # sample_usage(w2v_file, svd_file) # test_sents = [['2', '254', '440', '192', '94', '57', '72', '77'], # ['2', '16', '60', '10', '219', '259', '16', '142', '538'], # ['6', '132', '130', '11646', '47', '6', '25', '4', '132', '130', '3934', '73', '12', '163', '3035', '545', '221', '545']] # test_tags = [['740043438788345856'], ['258662084089368576'], ['842801723001487360']] # generate_d2v_embs(test_sents, test_tags, 'word')
class Experiment:
    """Cross-validation driver that trains, predicts and stores per-fold results."""

    def __init__(self, experiment_dir, input_name2id2np=None, adapt_train_vocab=False,
                 comments='', epochs=100, patience=4, noise_function=None,
                 filter_function=None, fold=5, predict_ens_test=True, by_fold=False,
                 **kwargs):
        """
        an experiment class that runs cross validation
        designed to enable easy experiments with combinations of:
        1) context representation:
            handled by input_name2id2np
        2) pre-training methods:
            handled by pretrained_weight_dirs in the kwargs argument
            None if there is no pretraining weight available
        3) char vs. word:
            specified in "options"
            options = ['char', 'word'] if you want to include both

        NOTE: the directory ``'../experiments/<experiment_dir>/'`` is removed
        (``rm -rf``) and re-created when the object is constructed.

        Parameters
        ----------
        experiment_dir: name of the directory (under ../experiments/) in which the
                        experiment weights and results will be saved
        input_name2id2np: extra model inputs; defaults to an empty dict
        adapt_train_vocab: under supervised training without pretraining,
                            some vocab will not be seen (twice) in the training set.
                            if set to True, then vocab occurring less than twice will be removed.
        comments: the comments intended for the README; currently unused because
                    README header generation is disabled
        epochs: number of epochs of training during cross validation
        patience: number of epochs allowable for not having any improvement on the validation set
        noise_function: if given, applied to every char-level input row of the training
                        data before each training pass of the cascade models
        filter_function: if given, applied once to every char-level input row of all splits
        fold: number of cross validation folds
        predict_ens_test: whether to also predict on the ensemble and held-out test data
        by_fold: if True, kwargs['pretrained_weight_dirs'] is indexed by fold
        kwargs: arguments that are forwarded to the data loader and to the
                neural network model (shown below)

        ========== below is the parameters needed by the neural network model ==========

        options: an array containing all the options considered in the neural network model
                    ['char', 'word'] (probably splex in the future)
                    for each option, the input is mapped to a lower dimension,
                    then the lower dimension representation of each option is concatenated
                    and is followed by the final classification layer
        word_vocab_size: number of word level vocabs to be considered
        word_max_len: number of words in a tweet sentence
        char_vocab_size: number of char level vocabs to be considered
        char_max_len: number of chars in a tweet sentence
        drop_out: dropout rate for regularization
        filter: number of filters for each kernel size
        dense_size: the size of the dense layer following the max pooling layer
        embed_dim: embedding dimension for character and word level
        kernel_range: range of kernel sizes
        pretrained_weight_dirs: a dictionary containing the pretrained weight.
                    e.g. {'char': '../weights/char_ds.weights'} means that the pretrained weight
                    for character level model is in ../weights/char_ds.weights
        weight_in_keras: whether the weight is in Keras
        context_dim: the dimension of context representation
        context_dense_size: the dense layer size right before the context representation
        splex_dense_size: dense layer size right before the splex reps
        """
        # creating the experiment dir
        experiment_dir = self._resolve_experiment_dir(experiment_dir)
        self.experiment_dir, self.kwargs = experiment_dir, kwargs
        subprocess.call(['rm', '-rf', experiment_dir])
        subprocess.call(['mkdir', experiment_dir])
        self.adapt_train_vocab = adapt_train_vocab
        self.predict_ens_test = predict_ens_test

        if input_name2id2np is None:
            input_name2id2np = {}
        self.input_name2id2np = input_name2id2np
        self.fold = fold
        self.dl = Data_loader(option='both', labeled_only=True, **kwargs)
        self.epochs, self.patience = epochs, patience
        self.noise_function, self.filter_function = noise_function, filter_function
        self.pretrained_weight_dirs = self.kwargs.get('pretrained_weight_dirs')
        self.by_fold = by_fold

    @staticmethod
    def _resolve_experiment_dir(experiment_dir):
        """Return '../experiments/<experiment_dir>/' with exactly one trailing slash added."""
        # The original test `experiment_dir[:-1] != '/'` compared everything
        # *but* the last character, so a slash was appended unconditionally.
        if not experiment_dir.endswith('/'):
            experiment_dir += '/'
        return '../experiments/' + experiment_dir

    @staticmethod
    def _transform_char_inputs(X, fn):
        """Return a shallow copy of X where fn is applied row-wise to char-level inputs."""
        transformed = {}
        for key in X:
            if 'char' in key and 'input' in key:
                # x[:] hands fn a copy so the source data is never mutated
                transformed[key] = np.array([fn(x[:]) for x in X[key]])
            else:
                transformed[key] = X[key]
        return transformed

    @staticmethod
    def _tune_threshold(scores, truth, current_preds):
        """Pick the score threshold in [0.01, 0.99] that maximises F1 on still-undecided samples."""
        best_t, best_f = 0, -1
        for t in np.arange(0.01, 1, 0.01):
            candidate = [0] * len(current_preds)
            for idx, score in enumerate(scores):
                if current_preds[idx] is None and score >= t:
                    candidate[idx] = 1
            f = f1_score(truth, candidate)
            if f > best_f:
                best_f, best_t = f, t
        return best_t

    @staticmethod
    def _assign_label(preds, scores, threshold, label):
        """Give `label` to every still-undecided entry whose score reaches the threshold."""
        for idx in range(len(preds)):
            if preds[idx] is None and scores[idx] >= threshold:
                preds[idx] = label

    def _build_model(self, fold_idx):
        """Instantiate the network, selecting fold-specific pretrained weights if by_fold."""
        if self.by_fold:
            self.kwargs['pretrained_weight_dirs'] = self.pretrained_weight_dirs[fold_idx]
        return NN_architecture(**self.kwargs).model

    def _fit_with_noise(self, X_train, y_train, X_val, y_val, weight_dir):
        """Train self.model on freshly noised data each epoch with manual early stopping."""
        def noised():
            return self._transform_char_inputs(X_train, self.noise_function)

        # fit for at least 1 epoch
        self.model.fit(x=noised(), y=y_train)
        best_val_loss, best_epoch = float('inf'), 0
        for epoch_idx in range(1, self.epochs + 1):
            self.model.fit(x=noised(), y=y_train)
            val_loss = self.model.evaluate(x=X_val, y=y_val)
            print('validation loss for epoch %d: %.3f' % (epoch_idx, val_loss))
            if val_loss < best_val_loss:
                best_epoch = epoch_idx
                best_val_loss = val_loss
                self.model.save_weights(weight_dir)
            if epoch_idx - best_epoch >= self.patience:
                break

    def _run_ternary_fold(self, fold_idx, X_train, y_train, X_val, y_val,
                          X_test, y_test, class_weight):
        """OBSOLETE three-way softmax model; returns categorical test predictions."""
        self.model = self._build_model(fold_idx)
        self.model.compile(optimizer='adam', loss='categorical_crossentropy',
                           metrics=[macro_f1])

        # call backs
        es = EarlyStopping(patience=self.patience, monitor='val_loss', verbose=1)
        weight_dir = self.experiment_dir + str(fold_idx) + '.weight'
        mc = ModelCheckpoint(weight_dir, save_best_only=True, save_weights_only=True)

        # fit for at least 1 epoch
        self.model.fit(x=X_train, y=y_train, validation_data=(X_val, y_val),
                       class_weight=class_weight)
        # training
        self.model.fit(x=X_train, y=y_train, validation_data=(X_val, y_val),
                       callbacks=[es, mc], epochs=self.epochs,
                       class_weight=class_weight)
        self.model.load_weights(weight_dir)

        # prediction
        y_pred = self.model.predict(x=X_test)
        y_pred_val = self.model.predict(x=X_val)

        # saving predictions for ensembles
        np.savetxt(self.experiment_dir + 'pred_test' + str(fold_idx) + '.np', y_pred)
        np.savetxt(self.experiment_dir + 'pred_val' + str(fold_idx) + '.np', y_pred_val)
        np.savetxt(self.experiment_dir + 'truth_test' + str(fold_idx) + '.np', y_test)
        # the validation truth file previously received y_test by mistake
        np.savetxt(self.experiment_dir + 'truth_val' + str(fold_idx) + '.np', y_val)

        # make y categorical
        return np.argmax(y_pred, axis=-1)

    def _run_cascade_fold(self, fold_idx, X_train, y_train, X_val, y_val,
                          X_test, y_test, X_ensemble, X_held_out):
        """Train the aggression then loss binary models; returns categorical test predictions."""
        # initialize the predictions
        num_val, num_test = y_val.shape[0], y_test.shape[0]
        y_pred_val, y_pred_test = [None] * num_val, [None] * num_test

        for class_idx in range(2):
            # time the training (seconds)
            start = time.time()

            # create layer name that has prefix
            # since for each fold we train aggression and loss models separately
            self.kwargs['prefix'] = 'aggression' if class_idx == 0 else 'loss'

            # initialize a model
            self.model = self._build_model(fold_idx)
            self.model.compile(optimizer='adam', loss='binary_crossentropy')

            # create the label for this binary classification task
            _y_train_, _y_val_ = y_train[:, class_idx], y_val[:, class_idx]

            weight_dir = (self.experiment_dir + str(fold_idx) + '_'
                          + str(class_idx) + '.weight')

            # training
            if self.noise_function is None:
                es = EarlyStopping(patience=self.patience, monitor='val_loss', verbose=1)
                mc = ModelCheckpoint(weight_dir, save_best_only=True,
                                     save_weights_only=True)
                # fit for at least 1 epoch
                self.model.fit(x=X_train, y=_y_train_, validation_data=(X_val, _y_val_))
                self.model.fit(x=X_train, y=_y_train_, validation_data=(X_val, _y_val_),
                               callbacks=[es, mc], epochs=self.epochs)
            else:
                self._fit_with_noise(X_train, _y_train_, X_val, _y_val_, weight_dir)
            self.model.load_weights(weight_dir)

            _y_pred_val_score = self.model.predict(X_val).flatten()
            _y_pred_test_score = self.model.predict(X_test).flatten()
            prefix = self.experiment_dir + 'fold_%d_class_%d_' % (fold_idx, class_idx)
            if self.predict_ens_test:
                _y_pred_ensemble_score = self.model.predict(X_ensemble).flatten()
                _y_pred_held_out_score = self.model.predict(X_held_out).flatten()
            np.savetxt(prefix + 'pred_val.np', _y_pred_val_score)
            np.savetxt(prefix + 'pred_test.np', _y_pred_test_score)
            if self.predict_ens_test:
                np.savetxt(prefix + 'pred_ensemble.np', _y_pred_ensemble_score)
                np.savetxt(prefix + 'pred_held_out.np', _y_pred_held_out_score)

            # threshold tuning
            best_t = self._tune_threshold(_y_pred_val_score, _y_val_, y_pred_val)

            # predictions made only when predictions not made by the previous model
            # and larger than the best threshold
            # true for both val_pred and test_pred
            self._assign_label(y_pred_val, _y_pred_val_score, best_t, class_idx)
            self._assign_label(y_pred_test, _y_pred_test_score, best_t, class_idx)

            # write how much time a run takes into the readme; start and end
            # are both in seconds (the original mixed milliseconds and seconds)
            duration = int(round(time.time() - start))
            with open(self.experiment_dir + 'README', 'a') as readme:
                readme.write('fold %d class %d takes %d seconds\n'
                             % (fold_idx, class_idx, duration))

        # predict the rest as the "Other" class
        y_pred_test = [2 if p is None else p for p in y_pred_test]
        y_pred_val = [2 if p is None else p for p in y_pred_val]

        np.savetxt(self.experiment_dir + 'fold_%d_pred_val.np' % fold_idx, y_pred_val)
        np.savetxt(self.experiment_dir + 'fold_%d_pred_test.np' % fold_idx, y_pred_test)
        np.savetxt(self.experiment_dir + 'fold_%d_truth_val.np' % fold_idx, y_val)
        np.savetxt(self.experiment_dir + 'fold_%d_truth_test.np' % fold_idx, y_test)
        return y_pred_test

    # cross validation
    # write all results to the directory
    # see read_results for retrieving the performance
    def cv(self):
        """Run all folds, save predictions/weights per fold and aggregate the scores.

        Writes 'result_by_fold.np', 'result_averaged.np' and 'result_std.np' to
        the experiment directory and appends the macro F-score to its README.

        Raises
        ------
        ValueError
            If kwargs['mode'] is neither None, 'cascade' nor 'ternary'.
        """
        mode = self.kwargs.get('mode')
        if mode not in (None, 'cascade', 'ternary'):
            # previously this surfaced only after a fold had been prepared
            raise ValueError('unsupported mode: %r' % (mode,))

        results = []
        for fold_idx in range(self.fold):
            print('cross validation fold %d.' % (fold_idx + 1))

            # retrieving cross validation data
            fold_data = self.dl.cv_data(fold_idx)
            ((X_train, y_train), (X_val, y_val), (X_test, y_test)) = \
                create_clf_data(self.input_name2id2np, fold_data, return_generators=False)

            X_ensemble = X_held_out = None
            if self.predict_ens_test:
                # retrieving the ensemble data
                X_ensemble, _ = create_data(self.input_name2id2np, self.dl.ensemble_data())
                # retrieving the held-out test data
                X_held_out, _ = create_data(self.input_name2id2np, self.dl.test_data())

            if self.filter_function is not None:
                X_train = self._transform_char_inputs(X_train, self.filter_function)
                X_val = self._transform_char_inputs(X_val, self.filter_function)
                X_test = self._transform_char_inputs(X_test, self.filter_function)
                if self.predict_ens_test:
                    X_ensemble = self._transform_char_inputs(X_ensemble,
                                                             self.filter_function)
                    X_held_out = self._transform_char_inputs(X_held_out,
                                                             self.filter_function)

            # if no pretrained weights, adapting vocabulary so that those who appear in
            # X_train less than twice would not be counted
            if self.adapt_train_vocab:
                if self.predict_ens_test:
                    adapt_vocab(X_train, (X_val, X_test, X_ensemble, X_held_out))
                else:
                    adapt_vocab(X_train, (X_val, X_test))

            class_weight = calculate_class_weight(y_train)

            # initializing model, train and predict
            K.clear_session()
            self.kwargs['input_dim_map'] = extract_dim_input_name2id2np(self.input_name2id2np)

            # cross validation test data in categorical form
            y_test = np.argmax(y_test, axis=-1)

            if mode == 'ternary':
                y_pred_test = self._run_ternary_fold(fold_idx, X_train, y_train,
                                                     X_val, y_val, X_test, y_test,
                                                     class_weight)
            else:
                # only cascade model is used and it is the default
                y_pred_test = self._run_cascade_fold(fold_idx, X_train, y_train,
                                                     X_val, y_val, X_test, y_test,
                                                     X_ensemble, X_held_out)

            # append the result on this fold to results
            results.append(precision_recall_fscore_support(y_test, y_pred_test))

        # saving results
        results = np.array(results)
        np.savetxt(self.experiment_dir + 'result_by_fold.np', results.flatten())
        np.savetxt(self.experiment_dir + 'result_averaged.np', np.mean(results, axis=0))
        np.savetxt(self.experiment_dir + 'result_std.np', np.std(results, axis=0))
        # row 2 of the averaged scores is the per-class F-score
        avg_macro_f = np.mean(np.mean(results, axis=0)[2])
        with open(self.experiment_dir + 'README', 'a') as readme:
            readme.write('macro F-score: %.4f\n' % avg_macro_f)