예제 #1
0
def process_tbn(X_train, y_train, X_test, y_test):
    """Train and evaluate the model selected by the global ``processType``.

    The function splits the handle column off the feature matrices, slices
    the features with ``divide_data``, converts the labels, and then builds,
    trains and evaluates one model (or a list of three models for the
    ``mlp`` and ``tbn_att`` branches).

    Besides its arguments, the function reads these module-level names:
    ``processType``, ``bioLen``, ``numLen``, ``tweetLen``, ``D_out``,
    ``batch_size``, ``hidden_size``, ``embedding_length``, ``handles2names``,
    ``emoji_input_ids`` and ``emoji_embeddings``.

    Args:
        X_train: 2-D NumPy array. Column 0 is used as the handle and every
            other column is cast to float.
        y_train: Training labels; converted with ``to_float_cuda``.
        X_test: 2-D NumPy array with the same layout as ``X_train``.
        y_test: Test labels; converted with ``to_self_cuda``.

    Returns:
        Whatever ``eval_model`` / ``han_eval_model`` returns for the selected
        branch (bound to ``auc`` here).

    Raises:
        ValueError: If ``processType`` matches none of the handled
            ``ProcessType`` members.
    """
    # Column 0 carries the handle; the remaining columns are the features.
    # The builtin ``float`` is used instead of ``np.float``: the latter was a
    # plain alias of the builtin and is no longer available in NumPy >= 1.24.
    X_train, handles_train = X_train[:, 1:].astype(
        float), X_train[:, :1].flatten()
    X_test, handles_test = X_test[:, 1:].astype(
        float), X_test[:, :1].flatten()

    numerical_train, embedding_bio_train, embedding_tweet_train, embedding_network_train = divide_data(
        X_train, bioLen, numLen, tweetLen)
    #print (embedding_bio_train.shape, embedding_tweet_train.shape)
    y_train = to_float_cuda(
        y_train
    )  #to_float_cuda(y_train.reshape(-1, 1)) if sourceType != SourceType.age else to_self_cuda(y_train)
    #embedding_train = cat_embeddings(numerical, embedding_bio, embedding_tweet)

    numerical_test, embedding_bio_test, embedding_tweet_test, embedding_network_test = divide_data(
        X_test, bioLen, numLen, tweetLen)
    y_test = to_self_cuda(
        y_test
    )  #to_float_cuda(y_test.reshape(-1, 1)) if sourceType == SourceType.age else to_self_cuda(y_test)
    #embedding_test = cat_embeddings(numerical, embedding_bio, embedding_tweet)

    bin_label = True  #False if sourceType == SourceType.age else True
    if processType == ProcessType.mlp:
        # Three MLPs over the concatenated features: model1 on samples with
        # label > 1, model2 on samples with label <= 1, and ``model`` on all
        # samples. All three are handed to han_eval_model together.
        # (presumably a two-level label hierarchy — TODO confirm)
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)

        index = y_train[:] > 1
        embedding_train_greater = embedding_train[index]
        y_train_greater = y_train[index]
        # Labels of the "> 1" subset are remapped by transfer_y before training.
        y_train_greater = transfer_y(y_train_greater, True, 2, 1, 0)
        y_train_greater = to_float_cuda(y_train_greater)
        model1 = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model1)
        train_model(model1, embedding_train_greater, y_train_greater)
        print("--------1")

        index = y_train[:] <= 1
        embedding_train_smaller = embedding_train[index]
        y_train_smaller = y_train[index]
        #y_train_smaller = transfer_y(y_train_smaller, False, )
        #print (y_train_smaller.shape)
        model2 = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model2)
        train_model(model2, embedding_train_smaller, y_train_smaller)
        #auc = eval_model(model2, embedding_train_smaller, y_train_smaller)
        #print (y_train_smaller)
        #print (auc)
        print("--------2")

        y_train_root = transfer_y(y_train, True, 1, 1, 0)
        model = MyMLP(embedding_train.shape[1], 20, D_out, bin_label)
        to_cuda(model)
        train_model(model, embedding_train, y_train_root)
        # NOTE(review): y_test_root is never read afterwards. The call is kept
        # because it is not visible here whether transfer_y modifies y_test in
        # place — TODO confirm and drop if it is pure.
        y_test_root = transfer_y(y_test, True, 1, 1, 0)
        #auc = eval_model(model, embedding_test, y_test_root)
        #print (auc)
        print("--------3")

        auc = han_eval_model([model1, model2, model], embedding_test, y_test)
    elif processType == ProcessType.name_c_tbn:
        # Concatenated features combined with a name model fed from the handles.
        l_out = 8
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        #print (handles_train)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        # NOTE(review): ``a + hidden_size / 2`` halves only hidden_size; if the
        # intent was ``(a + hidden_size) / 2`` the parentheses are missing —
        # left unchanged, confirm with the author.
        model = LstmAttentionEnsemble(
            embedding_train.shape[1] + hidden_size,
            int(embedding_train.shape[1] + hidden_size / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, embedding_train, train_names_idx, train_names_len,
                    y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, embedding_test, test_names_idx, test_names_len,
                         y_test)
    elif processType == ProcessType.tbn_att:
        # Same three-model scheme as the mlp branch, but with Attention models
        # over the stacked bio/tweet embeddings plus the numerical features.
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        index = y_train[:] > 1
        embedding_train_greater, numerical_train_greater = embedding_train[
            index], numerical_train[index]
        y_train_greater = y_train[index]
        y_train_greater = transfer_y(y_train_greater, True, 2, 1, 0)
        # NOTE(review): lstm_model1 / lstm_model2 / lstm_model below are built
        # and passed to to_cuda but never given to any model since the
        # LstmAttentionEnsemble lines were commented out. They are kept so that
        # construction order (and any RNG consumption) is unchanged.
        lstm_model1 = LstmAttention(batch_size, hidden_size, embedding_length,
                                    l_out)
        #model1 = LstmAttentionEnsemble(numerical_train.shape[1]+l_out, int(numerical_train.shape[1]+l_out/2), D_out, lstm_model1, bin_label)
        model1 = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model1)
        to_cuda(model1)
        print(y_train_greater)
        train_model(model1, numerical_train_greater, embedding_train_greater,
                    y_train_greater)
        print("--------1")

        index = y_train[:] <= 1
        embedding_train_smaller, numerical_train_smaller = embedding_train[
            index], numerical_train[index]
        y_train_smaller = y_train[index]
        lstm_model2 = LstmAttention(batch_size, hidden_size, embedding_length,
                                    l_out)
        #model2 = LstmAttentionEnsemble(numerical_train.shape[1]+l_out, int(numerical_train.shape[1]+l_out/2), D_out, lstm_model2, bin_label)
        model2 = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model2)
        to_cuda(model2)
        train_model(model2, numerical_train_smaller, embedding_train_smaller,
                    y_train_smaller)
        print("--------2")

        y_train_root = transfer_y(y_train, True, 1, 1, 0)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        #model = LstmAttentionEnsemble(numerical_train.shape[1]+l_out, int(numerical_train.shape[1]+l_out/2), D_out, lstm_model, bin_label)
        model = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train_root)
        # NOTE(review): result unused; kept in case transfer_y mutates y_test.
        y_test_root = transfer_y(y_test, True, 1, 1, 0)
        #auc = eval_model(model, embedding_test, y_test_root)
        #print (auc)
        print("--------3")

        auc = han_eval_model([model1, model2, model], numerical_test,
                             embedding_test, y_test)
    elif processType == ProcessType.name:
        # Name-only model: the inputs come solely from the handles.
        l_out = 8
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        print(train_names_idx, train_names_len)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        model = LstmAttentionEnsemble(hidden_size, int(hidden_size / 2), D_out,
                                      lstm_model, bin_label)
        to_cuda(model)
        train_model(model, train_names_idx, train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, test_names_idx, test_names_len, y_test)
    elif processType == ProcessType.tbn_c_name_att:
        # Stacked bio/tweet embeddings plus a name sub-model.
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [lstm_sub_model], lstm_model, bin_label)
        to_cuda(lstm_sub_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_names_idx, embedding_train,
                    train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, numerical_test, test_names_idx, embedding_test,
                         test_names_len, y_test)
    elif processType == ProcessType.tbnn_att:
        # Stacked bio/tweet/network embeddings with an LSTM-attention model.
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train),
            axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test),
            axis=1)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = LstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(numerical_train.shape[1] + l_out / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
    elif processType == ProcessType.tbnn_e_att:
        # Stacked bio/tweet/network embeddings plus an emoji CNN sub-model.
        # emoji_embeddings and emoji_input_ids are read as module-level names
        # here because the loading call below is commented out.
        #emoji_embeddings, emoji_input_ids, dim = get_handle2idx_embeddings("/home/yaguang/db/wiki_sort_emoji_hashtag/")
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train),
            axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test),
            axis=1)
        #emoji
        train_emoji_idx = divide_emojis(handles_train, emoji_input_ids)
        test_emoji_idx = divide_emojis(handles_test, emoji_input_ids)
        #print (emoji_embeddings)
        emoji_cnn_model = CNN_NLP(pretrained_embedding=emoji_embeddings,
                                  dropout=0.5)
        #emoji_cnn_model = CNN_NLP(vocab_size=dim)

        #lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [emoji_cnn_model], lstm_model, bin_label)

        to_cuda(emoji_cnn_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_emoji_idx, embedding_train,
                    y_train)
        # NOTE(review): the name indices/lengths are not used by the eval call
        # below; the call is kept because divide_name's side effects (if any)
        # are not visible from here.
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, numerical_test, test_emoji_idx, embedding_test,
                         y_test)
    elif processType == ProcessType.tbn_real_att:
        # Single Attention model over the stacked bio/tweet embeddings.
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        l_out = 8
        #model = Attention(numerical_train.shape[1]+l_out, int(numerical_train.shape[1]+l_out/2), D_out, 768, bin_label)
        model = Attention(768, 100, D_out, 768, bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
    else:
        # Without this guard ``auc`` would be unbound and the return below
        # would fail with an unhelpful UnboundLocalError.
        raise ValueError(
            "unsupported processType: {!r}".format(processType))
    return auc
예제 #2
0
def process_tbn(X_train, y_train, X_test, y_test, X_test_mic, y_test_mic):
    """Train and evaluate the model selected by the global ``processType``.

    The function splits the handle column off the feature matrices, slices
    the features with ``divide_data``, converts the labels depending on
    ``sourceType``, and then builds, trains and evaluates one model.

    Besides its arguments, the function reads these module-level names:
    ``processType``, ``sourceType``, ``bioLen``, ``numLen``, ``tweetLen``,
    ``D_out``, ``batch_size``, ``hidden_size``, ``embedding_length`` and
    ``handles2names``.

    Args:
        X_train: 2-D NumPy array. Column 0 is used as the handle and every
            other column is cast to float.
        y_train: Training labels.
        X_test: 2-D NumPy array with the same layout as ``X_train``.
        y_test: Test labels.
        X_test_mic: Not read anywhere in this function.
        y_test_mic: Passed unchanged to the second ``eval_model`` call of the
            ``tbn_att`` branch; unused by every other branch.

    Returns:
        Whatever the last ``eval_model`` call of the selected branch returns
        (bound to ``auc`` here).

    Raises:
        ValueError: If ``processType`` matches none of the handled
            ``ProcessType`` members.
    """
    # Column 0 carries the handle; the remaining columns are the features.
    # The builtin ``float`` is used instead of ``np.float``: the latter was a
    # plain alias of the builtin and is no longer available in NumPy >= 1.24.
    X_train, handles_train = X_train[:, 1:].astype(
        float), X_train[:, :1].flatten()
    X_test, handles_test = X_test[:, 1:].astype(
        float), X_test[:, :1].flatten()

    numerical_train, embedding_bio_train, embedding_tweet_train, embedding_network_train = divide_data(
        X_train, bioLen, numLen, tweetLen)
    #print (embedding_bio_train.shape, embedding_tweet_train.shape)
    # NOTE(review): the train labels are reshaped to a column when sourceType
    # is NOT age, while the test labels below are reshaped when it IS age.
    # The opposite conditions may be deliberate — confirm with the author.
    y_train = to_float_cuda(y_train.reshape(
        -1, 1)) if sourceType != SourceType.age else to_self_cuda(y_train)
    #embedding_train = cat_embeddings(numerical, embedding_bio, embedding_tweet)

    numerical_test, embedding_bio_test, embedding_tweet_test, embedding_network_test = divide_data(
        X_test, bioLen, numLen, tweetLen)
    y_test = to_float_cuda(y_test.reshape(
        -1, 1)) if sourceType == SourceType.age else to_self_cuda(y_test)
    #embedding_test = cat_embeddings(numerical, embedding_bio, embedding_tweet)

    bin_label = False if sourceType == SourceType.age else True

    if processType == ProcessType.mlp:
        # Plain MLP over the concatenated numerical/bio/tweet features.
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        model = MyMLP(embedding_train.shape[1], 20, D_out)
        to_cuda(model)
        train_model(model, embedding_train, y_train)
        auc = eval_model(model, embedding_test, y_test)
    elif processType == ProcessType.name_c_tbn:
        # Concatenated features combined with a name model fed from the handles.
        l_out = 8
        embedding_train = cat_embeddings(numerical_train, embedding_bio_train,
                                         embedding_tweet_train)
        embedding_test = cat_embeddings(numerical_test, embedding_bio_test,
                                        embedding_tweet_test)
        print(handles_train)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        # NOTE(review): ``a + hidden_size / 2`` halves only hidden_size; if the
        # intent was ``(a + hidden_size) / 2`` the parentheses are missing —
        # left unchanged, confirm with the author.
        model = LstmAttentionEnsemble(
            embedding_train.shape[1] + hidden_size,
            int(embedding_train.shape[1] + hidden_size / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, embedding_train, train_names_idx, train_names_len,
                    y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, embedding_test, test_names_idx, test_names_len,
                         y_test)
    elif processType == ProcessType.tbn_att:
        # LSTM-attention model over the stacked bio/tweet embeddings,
        # evaluated twice: on the regular test split and on the "mic" data.
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        #embedding_test_mic = torch.stack((embedding_bio_test_mic, embedding_tweet_test_mic), axis=1)
        l_out = 8
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = LstmAttentionEnsemble(l_out, int(l_out / 2), D_out, lstm_model,
                                      bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
        print(auc)
        # NOTE(review): numerical_test_mic and embedding_test_mic are never
        # assigned in this function (the stack that would build
        # embedding_test_mic is commented out above and X_test_mic is unused),
        # so this call raises NameError unless both exist as module-level
        # globals — confirm, and derive them from X_test_mic if that was the
        # intent. The value returned for this branch is this second score.
        auc = eval_model(model, numerical_test_mic, embedding_test_mic,
                         y_test_mic)
        print(auc)
    elif processType == ProcessType.name:
        # Name-only model: the inputs come solely from the handles.
        l_out = 8
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_model = NameLstmAttention(batch_size, hidden_size,
                                       embedding_length, l_out)
        model = LstmAttentionEnsemble(hidden_size, int(hidden_size / 2), D_out,
                                      lstm_model, bin_label)
        to_cuda(model)
        train_model(model, train_names_idx, train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, test_names_idx, test_names_len, y_test)
    elif processType == ProcessType.tbn_c_name_att:
        # Stacked bio/tweet embeddings plus a name sub-model.
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train), axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test), axis=1)
        train_names_idx, train_names_len = divide_name(handles_train,
                                                       handles2names)
        lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [lstm_sub_model], lstm_model, bin_label)
        to_cuda(lstm_sub_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_names_idx, embedding_train,
                    train_names_len, y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        auc = eval_model(model, numerical_test, test_names_idx, embedding_test,
                         test_names_len, y_test)
    elif processType == ProcessType.tbnn_att:
        # Stacked bio/tweet/network embeddings with an LSTM-attention model.
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train),
            axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test),
            axis=1)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = LstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(numerical_train.shape[1] + l_out / 2), D_out, lstm_model,
            bin_label)
        to_cuda(model)
        train_model(model, numerical_train, embedding_train, y_train)
        auc = eval_model(model, numerical_test, embedding_test, y_test)
    elif processType == ProcessType.tbnn_e_att:
        # Stacked bio/tweet/network embeddings plus an emoji CNN sub-model
        # built from the vocabulary size returned by the loader below.
        emoji_embeddings, emoji_input_ids, dim = get_handle2idx_embeddings(
            "/home/yaguang/pattern/db/wiki_sort_emoji_hashtag/")
        l_out = 8
        embedding_train = torch.stack(
            (embedding_bio_train, embedding_tweet_train,
             embedding_network_train),
            axis=1)
        embedding_test = torch.stack(
            (embedding_bio_test, embedding_tweet_test, embedding_network_test),
            axis=1)
        #emoji
        train_emoji_idx = divide_emojis(handles_train, emoji_input_ids)
        test_emoji_idx = divide_emojis(handles_test, emoji_input_ids)
        #emoji_cnn_model = CNN_NLP(pretrained_embedding=emoji_embeddings, dropout=0.5)
        emoji_cnn_model = CNN_NLP(vocab_size=dim)

        #lstm_sub_model = NameLstmAttention(batch_size, 768, 1000, l_out)
        lstm_model = LstmAttention(batch_size, hidden_size, embedding_length,
                                   l_out)
        model = MulLstmAttentionEnsemble(
            numerical_train.shape[1] + l_out,
            int(embedding_train.shape[1] + hidden_size / 2), D_out,
            [emoji_cnn_model], lstm_model, bin_label)

        to_cuda(emoji_cnn_model)
        to_cuda(lstm_model)
        to_cuda(model)
        train_model(model, numerical_train, train_emoji_idx, embedding_train,
                    y_train)
        test_names_idx, test_names_len = divide_name(handles_test,
                                                     handles2names)
        # NOTE(review): eval_model receives test_names_len here although the
        # matching train_model call above passes no name lengths — the two
        # argument lists differ in length; confirm which one is intended.
        auc = eval_model(model, numerical_test, test_emoji_idx, embedding_test,
                         test_names_len, y_test)
    else:
        # Without this guard ``auc`` would be unbound and the return below
        # would fail with an unhelpful UnboundLocalError.
        raise ValueError(
            "unsupported processType: {!r}".format(processType))
    return auc
# Cross-validation driver: for each split produced by ``skf`` build the file
# name lists, train an emoji-CNN attention ensemble and record its score.
for train_index, test_index in skf.split(index, labels):
    X_train_index = index[train_index]
    X_test_index = index[test_index]

    # Labels of the current split, converted with to_float_cuda.
    y_train = to_float_cuda(labels[train_index])
    y_test = to_float_cuda(labels[test_index])

    # Element 0 of each index_to_file_label entry is the file name.
    X_train_names = [index_to_file_label[i][0] for i in X_train_index]
    X_test_names = [index_to_file_label[i][0] for i in X_test_index]

    emoji_cnn_model = CNN(pretrained_embedding)
    to_cuda(emoji_cnn_model)
    model = EnsembleModelsAttention(
        D_in * 2,
        int(D_in * 2 // 2 / 2),
        D_out,
        int(D_in / 2),
        [emoji_cnn_model],
        bin_label,
    )
    to_cuda(model)
    train_model(model, X_train_names, X_test_names)

    auc = eval_model(model, X_test_names)
    print(auc)
    w.write(str(auc) + "\n")
    rocs.append(auc)
    print("another epoch")
    # Only the first split is processed.
    break

# Report every collected score followed by their mean.
print(rocs)
print(sum(rocs) / len(rocs))
예제 #4
0
    counter[index_to_file_label[idx][1]] += 1
    sample_files.write(index_to_file_label[idx][0]+"\n")
sample_files.close()
print(counter)

# Cross-validation driver: for each split produced by ``skf`` build the file
# name lists, train a SeqAttention model and record its score.
for train_index, test_index in skf.split(index, labels):
    X_train_index = index[train_index]
    X_test_index = index[test_index]

    # Labels of the current split, converted with to_float_cuda.
    y_train = to_float_cuda(labels[train_index])
    y_test = to_float_cuda(labels[test_index])

    # Element 0 of each index_to_file_label entry is the file name.
    X_train_names = [index_to_file_label[i][0] for i in X_train_index]
    X_test_names = [index_to_file_label[i][0] for i in X_test_index]

    model = SeqAttention(768, int(768 / 2 / 2), D_out, int(768 / 2), False,
                         True)
    to_cuda(model)
    train_model(model, X_train_names, X_test_names)

    auc = eval_model(model, X_test_names)
    print(auc)
    w.write(str(auc) + "\n")
    rocs.append(auc)
    print("another epoch")

# Report every collected score followed by their mean, then release the
# results file.
print(rocs)
print(sum(rocs) / len(rocs))
w.close()