def process_tbn(X_train, y_train, X_test, y_test):
    # divide into features and the corresponding handles (first column)
    X_train, handles_train = X_train[:, 1:].astype(float), X_train[:, :1].flatten()
    X_test, handles_test = X_test[:, 1:].astype(float), X_test[:, :1].flatten()
    numerical_train, embedding_bio_train, embedding_tweet_train, embedding_network_train = divide_data(
        X_train, bioLen, numLen, tweetLen)
    #print (embedding_bio_train.shape, embedding_tweet_train.shape)
    y_train = to_float_cuda(y_train)  #to_float_cuda(y_train.reshape(-1, 1)) if sourceType != SourceType.age else to_self_cuda(y_train)
    #embedding_train = cat_embeddings(numerical, embedding_bio, embedding_tweet)
    numerical_test, embedding_bio_test, embedding_tweet_test, embedding_network_test = divide_data(
        X_test, bioLen, numLen, tweetLen)
    y_test = to_self_cuda(y_test)  #to_float_cuda(y_test.reshape(-1, 1)) if sourceType == SourceType.age else to_self_cuda(y_test)
    #embedding_test = cat_embeddings(numerical, embedding_bio, embedding_tweet)
    bin_label = True  #False if sourceType == SourceType.age else True

    if processType == ProcessType.mlp:
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        # leaf model 1: samples with label > 1, re-thresholded at 2
        index = y_train > 1
        embedding_train_greater = embedding_train[index]
        y_train_greater = y_train[index]
        y_train_greater = transfer_y(y_train_greater, True, 2, 1, 0)
        y_train_greater = to_float_cuda(y_train_greater)
        counter = [0, 0]
        model1 = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model1)
        train_model(model1, embedding_train_greater, y_train_greater)
        print("--------1")
        # leaf model 2: samples with label <= 1
        index = y_train <= 1
        embedding_train_smaller = embedding_train[index]
        y_train_smaller = y_train[index]
        #y_train_smaller = transfer_y(y_train_smaller, False, )
        #print (y_train_smaller.shape)
        model2 = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model2)
        train_model(model2, embedding_train_smaller, y_train_smaller)
        #auc = eval_model(model2, embedding_train_smaller, y_train_smaller)
        #print (y_train_smaller)
        #print (auc)
        print("--------2")
        # root model: binary split at label 1
        y_train_root = transfer_y(y_train, True, 1, 1, 0)
        model = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model)
        train_model(model, embedding_train, y_train_root)
        y_test_root = transfer_y(y_test, True, 1, 1, 0)
        #auc = eval_model(model, embedding_test, y_test_root)
        #print (auc)
        print("--------3")
        auc = han_eval_model([model1, model2, model], embedding_test, y_test)
    elif processType == ProcessType.name_c_tbn:
        l_out = 8
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        #print (handles_train)
        train_names_idx, train_names_len = divide_name(handles_train, handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size, embedding_length, l_out)
        model = LstmAttentionEnsemble(
            embedding_train.shape[1] + hidden_size,
            (embedding_train.shape[1] + hidden_size) // 2,  # hidden dim: half the input width
            D_out, lstm_model, bin_label)
        to_cuda(model)
        train_model(model, embedding_train, train_names_idx, train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test, handles2names)
        auc = eval_model(model, embedding_test, test_names_idx, test_names_len, y_test)
    elif processType == ProcessType.tbn_att:
        l_out = 8
        embedding_train = torch.stack((embedding_bio_train, embedding_tweet_train), dim=1)
        embedding_test = torch.stack((embedding_bio_test, embedding_tweet_test), dim=1)
        # leaf model 1: samples with label > 1, re-thresholded at 2
        index = y_train > 1
        embedding_train_greater, numerical_train_greater = embedding_train[index], numerical_train[index]
        y_train_greater = y_train[index]
        y_train_greater = transfer_y(y_train_greater, True, 2, 1, 0)
        counter = [0, 0]
        lstm_model1 = LstmAttention(batch_size, hidden_size, embedding_length, l_out)  # unused by the Attention variant below
        #model1 = LstmAttentionEnsemble(numerical_train.shape[1]+l_out, int(numerical_train.shape[1]+l_out/2), D_out, lstm_model1, bin_label)
        model1 = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model1)
        to_cuda(model1)
        print(y_train_greater)
        train_model(model1, numerical_train_greater, embedding_train_greater, y_train_greater)
        print("--------1")
        # leaf model 2: samples with label <= 1
        index = y_train <= 1
        embedding_train_smaller, numerical_train_smaller = embedding_train[index], numerical_train[index]
        y_train_smaller = y_train[index]
        lstm_model2 = LstmAttention(batch_size, hidden_size, embedding_length, l_out)  # unused by the Attention variant below
        #model2 = LstmAttentionEnsemble(numerical_train.shape[1]+l_out, int(numerical_train.shape[1]+l_out/2), D_out, lstm_model2, bin_label)
        model2 = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model2)
        to_cuda(model2)
        train_model(model2, numerical_train_smaller, embedding_train_smaller, y_train_smaller)
        print("--------2")
        # root model: binary split at label 1
        y_train_root = transfer_y(y_train, True, 1, 1, 0)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length, l_out)  # unused by the Attention variant below
        #model = LstmAttentionEnsemble(numerical_train.shape[1]+l_out, int(numerical_train.shape[1]+l_out/2), D_out, lstm_model, bin_label)
        model = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train_root)
        y_test_root = transfer_y(y_test, True, 1, 1, 0)
        #auc = eval_model(model, embedding_test, y_test_root)
        #print (auc)
        print("--------3")
        auc = han_eval_model([model1, model2, model], numerical_test, embedding_test, y_test)
    elif processType == ProcessType.name:
        l_out = 8
        train_names_idx, train_names_len = divide_name(handles_train, handles2names)
        print(train_names_idx, train_names_len)
        lstm_model = NameLstmAttention(batch_size, hidden_size, embedding_length, l_out)
        model = LstmAttentionEnsemble(hidden_size, hidden_size // 2, D_out, lstm_model, bin_label)
        to_cuda(model)
        train_model(model, train_names_idx, train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test, handles2names)
        auc = eval_model(model, test_names_idx, test_names_len, y_test)
    elif processType == ProcessType.tbn_c_name_att:
        l_out = 8
        embedding_train = torch.stack((embedding_bio_train, embedding_tweet_train), dim=1)
        embedding_test = torch.stack((embedding_bio_test, embedding_tweet_test), dim=1)
        train_names_idx, train_names_len = divide_name(handles_train, handles2names)
        lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length, l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            (embedding_train.shape[1] + hidden_size) // 2,
            D_out, [lstm_sub_model], lstm_model, bin_label)
        to_cuda(lstm_sub_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_names_idx, embedding_train,
                    train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test, handles2names)
        auc = eval_model(model, numerical_test, test_names_idx, embedding_test,
                         test_names_len, y_test)
    elif processType == ProcessType.tbnn_att:
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train, embedding_network_train), dim=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test), dim=1)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length, l_out)
        model = LstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            (numerical_train.shape[1] + l_out) // 2,
            D_out, lstm_model, bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
    elif processType == ProcessType.tbnn_e_att:
        #emoji_embeddings, emoji_input_ids, dim = get_handle2idx_embeddings("/home/yaguang/db/wiki_sort_emoji_hashtag/")
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train, embedding_network_train), dim=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test), dim=1)
        # emoji features
        train_emoji_idx = divide_emojis(handles_train, emoji_input_ids)
        test_emoji_idx = divide_emojis(handles_test, emoji_input_ids)
        #print (emoji_embeddings)
        emoji_cnn_model = CNN_NLP(pretrained_embedding=emoji_embeddings, dropout=0.5)
        #emoji_cnn_model = CNN_NLP(vocab_size=dim)
        #lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length, l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            (embedding_train.shape[1] + hidden_size) // 2,
            D_out, [emoji_cnn_model], lstm_model, bin_label)
        to_cuda(emoji_cnn_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_emoji_idx, embedding_train, y_train)
        #test_names_idx, test_names_len = divide_name(handles_test, handles2names)  # name features are not used in this branch
        auc = eval_model(model, numerical_test, test_emoji_idx, embedding_test, y_test)
    elif processType == ProcessType.tbn_real_att:
        embedding_train = torch.stack((embedding_bio_train, embedding_tweet_train), dim=1)
        embedding_test = torch.stack((embedding_bio_test, embedding_tweet_test), dim=1)
        l_out = 8
        #model = Attention(numerical_train.shape[1]+l_out, int(numerical_train.shape[1]+l_out/2), D_out, 768, bin_label)
        model = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
    return auc
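# ---------------------------------------------------------------------------
# Hedged sketch (not from this file): process_tbn relies on several helpers
# defined elsewhere (to_cuda, to_float_cuda, to_self_cuda, divide_data,
# transfer_y). The minimal versions below illustrate one plausible reading of
# their behaviour, inferred from the call sites; the column layout assumed in
# _divide_data and the threshold semantics in _transfer_y are guesses, not the
# original implementations (hence the underscore names).
# ---------------------------------------------------------------------------
import numpy as np
import torch

_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def _to_cuda(model):
    # move a module to the GPU when one is available (assumed behaviour of to_cuda)
    return model.to(_device)

def _to_float_cuda(arr):
    # numpy array -> float32 tensor on the device (assumed behaviour of to_float_cuda)
    return torch.as_tensor(np.asarray(arr), dtype=torch.float32, device=_device)

def _to_self_cuda(arr):
    # keep the native dtype, e.g. integer class labels (assumed behaviour of to_self_cuda)
    return torch.as_tensor(np.asarray(arr), device=_device)

def _divide_data(X, bioLen, numLen, tweetLen):
    # assumed column layout: [numerical | bio embedding | tweet embedding | network embedding]
    numerical = _to_float_cuda(X[:, :numLen])
    bio = _to_float_cuda(X[:, numLen:numLen + bioLen])
    tweet = _to_float_cuda(X[:, numLen + bioLen:numLen + bioLen + tweetLen])
    network = _to_float_cuda(X[:, numLen + bioLen + tweetLen:])
    return numerical, bio, tweet, network

def _transfer_y(y, greater, threshold, pos, neg):
    # guessed semantics from calls like transfer_y(y, True, 1, 1, 0):
    # assign pos where y > threshold (or y <= threshold when greater is False);
    # assumes y is already a tensor
    mask = (y > threshold) if greater else (y <= threshold)
    return torch.where(mask, torch.full_like(y, pos), torch.full_like(y, neg))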
# NOTE: this six-argument definition shadows the four-argument process_tbn above.
def process_tbn(X_train, y_train, X_test, y_test, X_test_mic, y_test_mic):
    # divide into features and the corresponding handles (first column)
    X_train, handles_train = X_train[:, 1:].astype(float), X_train[:, :1].flatten()
    X_test, handles_test = X_test[:, 1:].astype(float), X_test[:, :1].flatten()
    numerical_train, embedding_bio_train, embedding_tweet_train, embedding_network_train = divide_data(
        X_train, bioLen, numLen, tweetLen)
    #print (embedding_bio_train.shape, embedding_tweet_train.shape)
    # float labels for binary tasks, raw labels for age (y_test mirrors y_train)
    y_train = to_float_cuda(y_train.reshape(-1, 1)) if sourceType != SourceType.age else to_self_cuda(y_train)
    #embedding_train = cat_embeddings(numerical, embedding_bio, embedding_tweet)
    numerical_test, embedding_bio_test, embedding_tweet_test, embedding_network_test = divide_data(
        X_test, bioLen, numLen, tweetLen)
    y_test = to_float_cuda(y_test.reshape(-1, 1)) if sourceType != SourceType.age else to_self_cuda(y_test)
    #embedding_test = cat_embeddings(numerical, embedding_bio, embedding_tweet)
    bin_label = False if sourceType == SourceType.age else True

    if processType == ProcessType.mlp:
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        model = MyMLP(embedding_train.shape[1], 20, D_out)
        to_cuda(model)
        train_model(model, embedding_train, y_train)
        auc = eval_model(model, embedding_test, y_test)
    elif processType == ProcessType.name_c_tbn:
        l_out = 8
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        print(handles_train)
        train_names_idx, train_names_len = divide_name(handles_train, handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size, embedding_length, l_out)
        model = LstmAttentionEnsemble(
            embedding_train.shape[1] + hidden_size,
            (embedding_train.shape[1] + hidden_size) // 2,  # hidden dim: half the input width
            D_out, lstm_model, bin_label)
        to_cuda(model)
        train_model(model, embedding_train, train_names_idx, train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test, handles2names)
        auc = eval_model(model, embedding_test, test_names_idx, test_names_len, y_test)
    elif processType == ProcessType.tbn_att:
        embedding_train = torch.stack((embedding_bio_train, embedding_tweet_train), dim=1)
        embedding_test = torch.stack((embedding_bio_test, embedding_tweet_test), dim=1)
        # build the micro test-set tensors the same way as X_test
        X_test_mic, handles_test_mic = X_test_mic[:, 1:].astype(float), X_test_mic[:, :1].flatten()
        numerical_test_mic, embedding_bio_test_mic, embedding_tweet_test_mic, embedding_network_test_mic = divide_data(
            X_test_mic, bioLen, numLen, tweetLen)
        embedding_test_mic = torch.stack((embedding_bio_test_mic, embedding_tweet_test_mic), dim=1)
        y_test_mic = to_float_cuda(y_test_mic.reshape(-1, 1)) if sourceType != SourceType.age else to_self_cuda(y_test_mic)
        l_out = 8
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length, l_out)
        model = LstmAttentionEnsemble(l_out, l_out // 2, D_out, lstm_model, bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
        print(auc)
        auc = eval_model(model, numerical_test_mic, embedding_test_mic, y_test_mic)
        print(auc)
    elif processType == ProcessType.name:
        l_out = 8
        train_names_idx, train_names_len = divide_name(handles_train, handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size, embedding_length, l_out)
        model = LstmAttentionEnsemble(hidden_size, hidden_size // 2, D_out, lstm_model, bin_label)
        to_cuda(model)
        train_model(model, train_names_idx, train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test, handles2names)
        auc = eval_model(model, test_names_idx, test_names_len, y_test)
    elif processType == ProcessType.tbn_c_name_att:
        l_out = 8
        embedding_train = torch.stack((embedding_bio_train, embedding_tweet_train), dim=1)
        embedding_test = torch.stack((embedding_bio_test, embedding_tweet_test), dim=1)
        train_names_idx, train_names_len = divide_name(handles_train, handles2names)
        lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length, l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            (embedding_train.shape[1] + hidden_size) // 2,
            D_out, [lstm_sub_model], lstm_model, bin_label)
        to_cuda(lstm_sub_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_names_idx, embedding_train,
                    train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test, handles2names)
        auc = eval_model(model, numerical_test, test_names_idx, embedding_test,
                         test_names_len, y_test)
    elif processType == ProcessType.tbnn_att:
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train, embedding_network_train), dim=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test), dim=1)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length, l_out)
        model = LstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            (numerical_train.shape[1] + l_out) // 2,
            D_out, lstm_model, bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
    elif processType == ProcessType.tbnn_e_att:
        emoji_embeddings, emoji_input_ids, dim = get_handle2idx_embeddings(
            "/home/yaguang/pattern/db/wiki_sort_emoji_hashtag/")
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train, embedding_network_train), dim=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test), dim=1)
        # emoji features
        train_emoji_idx = divide_emojis(handles_train, emoji_input_ids)
        test_emoji_idx = divide_emojis(handles_test, emoji_input_ids)
        #emoji_cnn_model = CNN_NLP(pretrained_embedding=emoji_embeddings, dropout=0.5)
        emoji_cnn_model = CNN_NLP(vocab_size=dim)
        #lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length, l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            (embedding_train.shape[1] + hidden_size) // 2,
            D_out, [emoji_cnn_model], lstm_model, bin_label)
        to_cuda(emoji_cnn_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_emoji_idx, embedding_train, y_train)
        #test_names_idx, test_names_len = divide_name(handles_test, handles2names)  # name features are not used in this branch
        auc = eval_model(model, numerical_test, test_emoji_idx, embedding_test, y_test)
    return auc
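# ---------------------------------------------------------------------------
# Hedged usage sketch (not from this file): one way the six-argument
# process_tbn above (which shadows the four-argument version) might be driven
# from a stratified K-fold loop, in the style of the loops further down.
# X_all, y_all, X_mic and y_mic are illustrative placeholders.
# ---------------------------------------------------------------------------
def _run_cv(X_all, y_all, X_mic, y_mic, n_splits=5):
    from sklearn.model_selection import StratifiedKFold
    skf_local = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    aucs = []
    for train_idx, test_idx in skf_local.split(X_all, y_all):
        # each fold trains from scratch and reports one AUC
        aucs.append(process_tbn(X_all[train_idx], y_all[train_idx],
                                X_all[test_idx], y_all[test_idx],
                                X_mic, y_mic))
    print(aucs, sum(aucs) / len(aucs))
    return aucs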
for train_index, test_index in skf.split(index, labels):
    X_train_index, X_test_index = index[train_index], index[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    #print (y_train.tolist())
    y_train = to_float_cuda(y_train)
    y_test = to_float_cuda(y_test)
    X_train_names = []
    X_test_names = []
    for idx in X_train_index:
        X_train_names.append(index_to_file_label[idx][0])
    for idx in X_test_index:
        X_test_names.append(index_to_file_label[idx][0])
    emoji_cnn_model = CNN(pretrained_embedding)
    to_cuda(emoji_cnn_model)
    model = EnsembleModelsAttention(D_in * 2, D_in // 2, D_out, D_in // 2,
                                    [emoji_cnn_model], bin_label)
    #model = SeqAttention(768, int(768/2/2), D_out, int(768/2), False, True)
    to_cuda(model)
    train_model(model, X_train_names, X_test_names)
    auc = eval_model(model, X_test_names)
    print(auc)
    w.write(str(auc) + "\n")
    rocs.append(auc)
    print("another epoch")
    break  # only the first fold is evaluated
print(rocs)
print(sum(rocs) / len(rocs))
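# ---------------------------------------------------------------------------
# Hedged sketch (not from this file): pretrained_embedding fed to CNN(...)
# above is assumed to be a (vocab_size, embed_dim) float tensor that the CNN
# wraps with nn.Embedding.from_pretrained; the sizes below are made up.
# ---------------------------------------------------------------------------
def _demo_embedding_layer():
    import torch
    import torch.nn as nn
    vocab_size, embed_dim = 5000, 300            # illustrative sizes only
    weights = torch.randn(vocab_size, embed_dim) # stand-in for real vectors
    return nn.Embedding.from_pretrained(weights, freeze=False)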
        counter[index_to_file_label[idx][1]] += 1
        sample_files.write(index_to_file_label[idx][0] + "\n")
sample_files.close()
print(counter)

for train_index, test_index in skf.split(index, labels):
    X_train_index, X_test_index = index[train_index], index[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    #print (y_train.tolist())
    y_train = to_float_cuda(y_train)
    y_test = to_float_cuda(y_test)
    X_train_names = []
    X_test_names = []
    for idx in X_train_index:
        X_train_names.append(index_to_file_label[idx][0])
    for idx in X_test_index:
        X_test_names.append(index_to_file_label[idx][0])
    model = SeqAttention(768, 768 // 4, D_out, 768 // 2, False, True)
    to_cuda(model)
    train_model(model, X_train_names, X_test_names)
    auc = eval_model(model, X_test_names)
    print(auc)
    w.write(str(auc) + "\n")
    rocs.append(auc)
    print("another epoch")
print(rocs)
print(sum(rocs) / len(rocs))
w.close()
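# ---------------------------------------------------------------------------
# Hedged sketch (not from this file): eval_model is defined elsewhere; the
# values collected in rocs are presumably ROC-AUC scores. A minimal version
# of that computation for a binary model, using sklearn; the signature and
# the assumption that the model outputs P(y=1) are illustrative guesses.
# ---------------------------------------------------------------------------
def _eval_auc(model, X_test, y_test):
    import torch
    from sklearn.metrics import roc_auc_score
    model.eval()
    with torch.no_grad():
        scores = model(X_test)  # assumed to return positive-class scores
    return roc_auc_score(y_test.cpu().numpy(), scores.cpu().numpy())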