def divide_name(handles, handles2names):
    """Resolve handles to names and return their packed index/length data.

    Args:
        handles: iterable of keys into ``handles2names``.
        handles2names: mapping from a handle to its name.

    Returns:
        A ``(names_idx, names_len)`` tuple: the first output of
        ``embedAndPack`` passed through ``to_self_cuda``, and the second
        output flattened.

    Raises:
        KeyError: if a handle is missing from ``handles2names`` (assuming a
            dict-like mapping).
    """
    names = []
    for handle in handles:
        names.append(handles2names[handle])

    packed_idx, packed_len = embedAndPack(names)
    # Debug output kept from the original implementation.
    print(packed_idx)
    return to_self_cuda(packed_idx), packed_len.flatten()
def eval_model(model, test_names):
    """Run ``model`` over ``test_names`` in batches and return the macro F1.

    For each name, the last four characters are stripped (presumably a file
    extension -- TODO confirm) and the lower-cased remainder is used as the
    key into the module-level ``txn`` store. Each stored record is unpickled
    into a list of rows whose first element is a date and whose remaining
    elements form the tweet embedding.

    Args:
        model: callable invoked as
            ``model(fix_seq_len, None, X, seq_lens, [(X_emoji, None)])``.
        test_names: sliceable sequence of name strings.

    Returns:
        ``f1_score(y_test, y_hat_test_class, average='macro')``.
    """
    emb_dim = 768  # width of the zero rows used to pad short sequences
    y_hat_test_class = []
    y_test = []
    model.eval()
    # Evaluation never back-propagates, so disable autograd for the forward
    # passes instead of building (and then discarding) a graph per batch.
    with torch.no_grad():
        for start in range(0, len(test_names), batch_size):
            batch_test_names = test_names[start:start + batch_size]
            X_batch_test = []
            X_batch_test_emoji = []
            seq_lens = []
            for name in batch_test_names:
                handle = name[:-4].lower()
                records = pickle.loads(txn.get(handle.encode()))
                # Dates are passed on in reversed order; embeddings keep the
                # stored order.
                dates = [rec[0] for rec in records][::-1]
                tweet_emb = [rec[1:] for rec in records]
                # Record the unpadded length before padding to fix_seq_len.
                seq_lens.append(len(tweet_emb))
                tweet_emb.extend(
                    [0] * emb_dim
                    for _ in range(fix_seq_len - len(tweet_emb)))
                emoji_idx = process_tweet(handle, dates, longest_emoji_len,
                                          word2idx, fix_seq_len)[::-1]
                X_batch_test_emoji.append(emoji_idx)
                X_batch_test.append(tweet_emb)
                y_test.append(map_attribute(handle))

            # Swap the first two axes before handing the batch to the model.
            X_batch_test = to_float_cuda(X_batch_test).permute(1, 0, 2)
            X_batch_test_emoji = to_self_cuda(X_batch_test_emoji)
            y_hat_test = model(fix_seq_len, None, X_batch_test, seq_lens,
                               [(X_batch_test_emoji, None)])
            scores = y_hat_test.cpu().detach().numpy()
            if inference_type == InferenceType.age:
                # Multi-class: pick the highest-scoring class per sample.
                y_hat_test_class.extend(np.argmax(scores, axis=1))
            else:
                # Binary: threshold the first output column at 0.5.
                y_hat_test_class.extend(
                    row[0] for row in np.where(scores < 0.5, 0, 1))

    print(y_test, y_hat_test_class)
    return f1_score(y_test, y_hat_test_class, average='macro')
def process_tbn(X_train, y_train, X_test, y_test):
    """Train and evaluate the pipeline selected by the global ``processType``.

    Column 0 of ``X_train``/``X_test`` is split off as the handle column;
    the remaining columns are cast to float and split by ``divide_data``
    into numerical, bio, tweet and network parts. The ``mlp`` and ``tbn_att``
    branches train three models (labels > 1, labels <= 1, and a "root"
    model on labels remapped by ``transfer_y``) and score them together
    with ``han_eval_model``; every other branch trains a single model.

    NOTE(review): the ``train_model``/``eval_model`` calls below pass a
    different argument list in each branch; confirm they match the helpers
    that are actually in scope where this function runs.

    Args:
        X_train: 2-D array; column 0 is the handle, the rest are features.
        y_train: training labels, converted with ``to_float_cuda``.
        X_test: 2-D array laid out like ``X_train``.
        y_test: test labels, converted with ``to_self_cuda``.

    Returns:
        Whatever the evaluation helper of the selected branch returns
        (bound to ``auc`` in the code).

    Raises:
        ValueError: if ``processType`` matches none of the handled branches.
    """
    # ``float`` replaces the ``np.float`` alias (removed in NumPy 1.24);
    # the alias was the builtin ``float``, so the resulting dtype is the same.
    handles_train = X_train[:, :1].flatten()
    X_train = X_train[:, 1:].astype(float)
    handles_test = X_test[:, :1].flatten()
    X_test = X_test[:, 1:].astype(float)

    (numerical_train, embedding_bio_train, embedding_tweet_train,
     embedding_network_train) = divide_data(X_train, bioLen, numLen, tweetLen)
    y_train = to_float_cuda(y_train)
    (numerical_test, embedding_bio_test, embedding_tweet_test,
     embedding_network_test) = divide_data(X_test, bioLen, numLen, tweetLen)
    y_test = to_self_cuda(y_test)
    bin_label = True

    if processType == ProcessType.mlp:
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)

        # Model 1: samples whose label is > 1, with remapped labels.
        index = y_train[:] > 1
        embedding_train_greater = embedding_train[index]
        y_train_greater = transfer_y(y_train[index], True, 2, 1, 0)
        y_train_greater = to_float_cuda(y_train_greater)
        model1 = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model1)
        train_model(model1, embedding_train_greater, y_train_greater)
        print("--------1")

        # Model 2: samples whose label is <= 1, labels used as-is.
        index = y_train[:] <= 1
        embedding_train_smaller = embedding_train[index]
        y_train_smaller = y_train[index]
        model2 = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model2)
        train_model(model2, embedding_train_smaller, y_train_smaller)
        print("--------2")

        # Root model: all samples, labels remapped by transfer_y.
        y_train_root = transfer_y(y_train, True, 1, 1, 0)
        model = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model)
        train_model(model, embedding_train, y_train_root)
        # Result unused; call kept in case transfer_y mutates its argument.
        transfer_y(y_test, True, 1, 1, 0)
        print("--------3")
        auc = han_eval_model([model1, model2, model], embedding_test, y_test)

    elif processType == ProcessType.name_c_tbn:
        l_out = 8
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        model = LstmAttentionEnsemble(
            embedding_train.shape[1] + hidden_size,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            lstm_model, bin_label)
        to_cuda(model)
        train_model(model, embedding_train, train_names_idx, train_names_len,
                    y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, embedding_test, test_names_idx,
                         test_names_len, y_test)

    elif processType == ProcessType.tbn_att:
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)

        # Model 1: samples whose label is > 1, with remapped labels.
        index = y_train[:] > 1
        embedding_train_greater = embedding_train[index]
        numerical_train_greater = numerical_train[index]
        y_train_greater = transfer_y(y_train[index], True, 2, 1, 0)
        # The LstmAttention instances in this branch are not attached to the
        # Attention models; they are still constructed so that any RNG use
        # during construction stays as before (presumed -- TODO confirm).
        lstm_model1 = LstmAttention(batch_size, hidden_size, embedding_length,
                                    l_out)
        model1 = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model1)
        to_cuda(model1)
        print(y_train_greater)
        train_model(model1, numerical_train_greater, embedding_train_greater,
                    y_train_greater)
        print("--------1")

        # Model 2: samples whose label is <= 1, labels used as-is.
        index = y_train[:] <= 1
        embedding_train_smaller = embedding_train[index]
        numerical_train_smaller = numerical_train[index]
        y_train_smaller = y_train[index]
        lstm_model2 = LstmAttention(batch_size, hidden_size, embedding_length,
                                    l_out)
        model2 = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model2)
        to_cuda(model2)
        train_model(model2, numerical_train_smaller, embedding_train_smaller,
                    y_train_smaller)
        print("--------2")

        # Root model: all samples, labels remapped by transfer_y.
        y_train_root = transfer_y(y_train, True, 1, 1, 0)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train_root)
        # Result unused; call kept in case transfer_y mutates its argument.
        transfer_y(y_test, True, 1, 1, 0)
        print("--------3")
        auc = han_eval_model([model1, model2, model], numerical_test,
                             embedding_test, y_test)

    elif processType == ProcessType.name:
        l_out = 8
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        print(train_names_idx, train_names_len)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        model = LstmAttentionEnsemble(hidden_size, int(hidden_size / 2),
                                      D_out, lstm_model, bin_label)
        to_cuda(model)
        train_model(model, train_names_idx, train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, test_names_idx, test_names_len, y_test)

    elif processType == ProcessType.tbn_c_name_att:
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [lstm_sub_model], lstm_model, bin_label)
        to_cuda(lstm_sub_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_names_idx, embedding_train,
                    train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, numerical_test, test_names_idx,
                         embedding_test, test_names_len, y_test)

    elif processType == ProcessType.tbnn_att:
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test,
             embedding_network_test), axis=1)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = LstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(numerical_train.shape[1] + l_out / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)

    elif processType == ProcessType.tbnn_e_att:
        # NOTE(review): emoji_embeddings and emoji_input_ids are not assigned
        # in this function; they must exist at module level for this branch
        # to run -- confirm.
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test,
             embedding_network_test), axis=1)
        train_emoji_idx = divide_emojis(handles_train, emoji_input_ids)
        test_emoji_idx = divide_emojis(handles_test, emoji_input_ids)
        emoji_cnn_model = CNN_NLP(pretrained_embedding=emoji_embeddings,
                                  dropout=0.5)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [emoji_cnn_model], lstm_model, bin_label)
        to_cuda(emoji_cnn_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_emoji_idx, embedding_train,
                    y_train)
        # Results are unused in this branch; the call is retained because
        # divide_name prints and its removal would change the output.
        divide_name(handles_test, handles2names)
        auc = eval_model(model, numerical_test, test_emoji_idx,
                         embedding_test, y_test)

    elif processType == ProcessType.tbn_real_att:
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        model = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)

    else:
        # Previously this fell through to ``return auc`` with ``auc`` unbound.
        raise ValueError(
            "unsupported processType: {!r}".format(processType))

    return auc
def train_model(model, train_names, test_names):
    """Train ``model`` on ``train_names`` and evaluate it after every epoch.

    Uses Adam with the module-level ``learning_rate``. The loss is
    ``nn.CrossEntropyLoss`` when ``inference_type`` is ``InferenceType.age``
    and ``nn.BCELoss`` otherwise. Each epoch iterates over a freshly shuffled
    copy of ``train_names`` in batches of ``batch_size``, then calls
    ``eval_model(model, test_names)`` and prints the result.

    For each name, the last four characters are stripped (presumably a file
    extension -- TODO confirm) and the lower-cased remainder is used as the
    key into the module-level ``txn`` store.

    Args:
        model: module invoked as
            ``model(fix_seq_len, None, X, seq_lens, [(X_emoji, None)])``.
        train_names: sequence of name strings used for training.
        test_names: sequence of name strings passed to ``eval_model``.

    Returns:
        The loss value of the last processed batch as a float, or ``None``
        when no batch was processed (``epochs`` is 0 or ``train_names`` is
        empty).
    """
    emb_dim = 768  # width of the zero rows used to pad short sequences
    is_age = inference_type == InferenceType.age
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss() if is_age else nn.BCELoss()
    last_loss = None

    for epoch in range(epochs):
        # eval_model() switches the model to eval mode, so re-enable
        # training mode at the start of every epoch.
        model.train()
        # Shuffled copy; the caller's sequence is left untouched.
        train_names = random.sample(train_names, len(train_names))

        for start in range(0, len(train_names), batch_size):
            batch_train_names = train_names[start:start + batch_size]
            X_batch_train = []
            y_batch_train = []
            X_batch_train_emoji = []
            seq_lens = []
            for name in batch_train_names:
                handle = name[:-4].lower()
                records = pickle.loads(txn.get(handle.encode()))
                # Dates are passed on in reversed order; embeddings keep the
                # stored order.
                dates = [rec[0] for rec in records][::-1]
                tweet_emb = [rec[1:] for rec in records]
                # Record the unpadded length before padding to fix_seq_len.
                seq_lens.append(len(tweet_emb))
                tweet_emb.extend(
                    [0] * emb_dim
                    for _ in range(fix_seq_len - len(tweet_emb)))
                emoji_idx = process_tweet(handle, dates, longest_emoji_len,
                                          word2idx, fix_seq_len)[::-1]
                X_batch_train.append(tweet_emb)
                X_batch_train_emoji.append(emoji_idx)
                y_batch_train.append(map_attribute(handle))

            X_batch_train = to_float_cuda(X_batch_train)
            X_batch_train_emoji = to_self_cuda(X_batch_train_emoji)
            # Swap the first two axes before handing the batch to the model.
            X_batch_train = X_batch_train.permute(1, 0, 2)
            if is_age:
                y_batch_train = to_self_cuda(y_batch_train)
            else:
                # Non-age targets become a float column vector.
                y_batch_train = to_float_cuda(y_batch_train).reshape(-1, 1)

            y_pred = model(fix_seq_len, None, X_batch_train, seq_lens,
                           [(X_batch_train_emoji, None)])
            loss = loss_fn(y_pred, y_batch_train)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            last_loss = loss.item()
            batch_no = start // batch_size + 1
            print("epoch " + str(epoch) + " with batch " + str(batch_no) +
                  " is " + str(last_loss))

        # Evaluate on the ``test_names`` argument; the previous code ignored
        # the parameter and read an undeclared ``X_test_names`` instead.
        auc = eval_model(model, test_names)
        print(auc)

    return last_loss
def process_tbn(X_train, y_train, X_test, y_test, X_test_mic, y_test_mic):
    """Train and evaluate the model selected by the global ``processType``.

    Column 0 of ``X_train``/``X_test`` is split off as the handle column;
    the remaining columns are cast to float and split by ``divide_data``
    into numerical, bio, tweet and network parts. One model is built,
    trained and evaluated per branch.

    NOTE(review): the ``train_model``/``eval_model`` calls below pass a
    different argument list in each branch; confirm they match the helpers
    that are actually in scope where this function runs.

    Args:
        X_train: 2-D array; column 0 is the handle, the rest are features.
        y_train: training labels.
        X_test: 2-D array laid out like ``X_train``.
        y_test: test labels.
        X_test_mic: accepted but not read anywhere in this function.
        y_test_mic: labels passed to the second evaluation of the
            ``tbn_att`` branch.

    Returns:
        Whatever ``eval_model`` returns for the selected branch (bound to
        ``auc`` in the code). For ``tbn_att`` this is the result of the
        second (``*_mic``) evaluation.

    Raises:
        ValueError: if ``processType`` matches none of the handled branches.
    """
    # ``float`` replaces the ``np.float`` alias (removed in NumPy 1.24);
    # the alias was the builtin ``float``, so the resulting dtype is the same.
    handles_train = X_train[:, :1].flatten()
    X_train = X_train[:, 1:].astype(float)
    handles_test = X_test[:, :1].flatten()
    X_test = X_test[:, 1:].astype(float)

    is_age = sourceType == SourceType.age

    (numerical_train, embedding_bio_train, embedding_tweet_train,
     embedding_network_train) = divide_data(X_train, bioLen, numLen, tweetLen)
    if is_age:
        y_train = to_self_cuda(y_train)
    else:
        y_train = to_float_cuda(y_train.reshape(-1, 1))

    (numerical_test, embedding_bio_test, embedding_tweet_test,
     embedding_network_test) = divide_data(X_test, bioLen, numLen, tweetLen)
    # NOTE(review): the test labels use the opposite condition from the
    # training labels (float column for age, to_self_cuda otherwise).
    # Kept as in the original -- confirm this is intentional.
    if is_age:
        y_test = to_float_cuda(y_test.reshape(-1, 1))
    else:
        y_test = to_self_cuda(y_test)

    bin_label = not is_age

    if processType == ProcessType.mlp:
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        model = MyMLP(embedding_train.shape[1], 20, D_out)
        to_cuda(model)
        train_model(model, embedding_train, y_train)
        auc = eval_model(model, embedding_test, y_test)

    elif processType == ProcessType.name_c_tbn:
        l_out = 8
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        print(handles_train)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        model = LstmAttentionEnsemble(
            embedding_train.shape[1] + hidden_size,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            lstm_model, bin_label)
        to_cuda(model)
        train_model(model, embedding_train, train_names_idx, train_names_len,
                    y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, embedding_test, test_names_idx,
                         test_names_len, y_test)

    elif processType == ProcessType.tbn_att:
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        l_out = 8
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = LstmAttentionEnsemble(l_out, int(l_out / 2), D_out,
                                      lstm_model, bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
        print(auc)
        # NOTE(review): numerical_test_mic and embedding_test_mic are never
        # assigned in this function and X_test_mic is never read. Unless
        # both names exist at module level this call raises NameError --
        # confirm, and derive them from X_test_mic if that was the intent.
        auc = eval_model(model, numerical_test_mic, embedding_test_mic,
                         y_test_mic)
        print(auc)

    elif processType == ProcessType.name:
        l_out = 8
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        model = LstmAttentionEnsemble(hidden_size, int(hidden_size / 2),
                                      D_out, lstm_model, bin_label)
        to_cuda(model)
        train_model(model, train_names_idx, train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, test_names_idx, test_names_len, y_test)

    elif processType == ProcessType.tbn_c_name_att:
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [lstm_sub_model], lstm_model, bin_label)
        to_cuda(lstm_sub_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_names_idx, embedding_train,
                    train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, numerical_test, test_names_idx,
                         embedding_test, test_names_len, y_test)

    elif processType == ProcessType.tbnn_att:
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test,
             embedding_network_test), axis=1)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = LstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(numerical_train.shape[1] + l_out / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)

    elif processType == ProcessType.tbnn_e_att:
        # Only emoji_input_ids and dim are used below; the CNN is built from
        # the vocabulary size rather than the pretrained embeddings.
        emoji_embeddings, emoji_input_ids, dim = get_handle2idx_embeddings(
            "/home/yaguang/pattern/db/wiki_sort_emoji_hashtag/")
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test,
             embedding_network_test), axis=1)
        train_emoji_idx = divide_emojis(handles_train, emoji_input_ids)
        test_emoji_idx = divide_emojis(handles_test, emoji_input_ids)
        emoji_cnn_model = CNN_NLP(vocab_size=dim)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [emoji_cnn_model], lstm_model, bin_label)
        to_cuda(emoji_cnn_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_emoji_idx, embedding_train,
                    y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, numerical_test, test_emoji_idx,
                         embedding_test, test_names_len, y_test)

    else:
        # Previously this fell through to ``return auc`` with ``auc`` unbound.
        raise ValueError(
            "unsupported processType: {!r}".format(processType))

    return auc
def divide_emojis(handles, input_ids):
    """Gather the id arrays of ``handles`` and join them along axis 0.

    Args:
        handles: iterable of keys into ``input_ids``.
        input_ids: mapping from a handle to its array of ids.

    Returns:
        The result of ``to_self_cuda`` applied to the axis-0 concatenation
        of the per-handle arrays, in the order of ``handles``.
    """
    per_handle = []
    for handle in handles:
        per_handle.append(input_ids[handle])
    joined = np.concatenate(per_handle, axis=0)
    return to_self_cuda(joined)