Example #1
def get_features_for_prediction(features, i, use_pca=False):
    X_train, y_train, X_test, X_val, y_val = [],[],[],[],[]

    for item in features:
        ## distinguish twitter glove and common glove
        ## distinguish deepmoji sum and avg
        feature, ty, mode = featureAnalysis(item)

        if feature=="glove" and ty=="twitter":
            constant.emb_dim = 200
        elif feature == "emoji":
            pass  ## emoji embeddings keep the current emb_dim
        else:
            constant.emb_dim = 300

        print(feature)
        ## prepare data for feature-10 folders
        vocab = generate_vocab(include_test=True)
        train, val, dev_no_lab = read_data(is_shuffle=True, random_state=i, dev_with_label=False, include_test=True)
        ## Add labels to dev_no_lab for getting features
        ind = dev_no_lab[0]
        X_text = dev_no_lab[1]
        labels = ["others" for i in range(len(ind))]
        dev = (ind, X_text, labels)
        
        ## feature_list: glove emoji elmo bert deepmoji emo2vec
        ## to choose between Twitter GloVe and common GloVe, use ty='twitter' or ty='common'
        print(ty)
        Xi_train, yi_train = get_feature(train, vocab, feature_list=[feature], mode=[mode], split="final_train"+str(i), ty=[ty]) ## [29010, 3, emb_size]; 3 is the number of sentences
        # Xi_val, yi_val = get_feature(val, vocab, feature_list=[feature], mode=[mode],split="final_valid"+str(i),ty=[ty]) ## [1150,3,emb_size]        
        Xi_test, _ = get_feature(dev, vocab, feature_list=[feature], mode=[mode],split="final_test"+str(i),ty=[ty]) ## [2755,3,emb_size]

        # Xi_train = np.concatenate((Xi_train, Xi_val), axis = 0)
        # yi_train = np.concatenate((yi_train, yi_val), axis = 0)
        if use_pca:
            Xi_train, Xi_test, _ = pca(Xi_train, Xi_test)  ## Xi_val is not computed here, so only train/test are reduced

        # if feature == "bert":
        #     print(Xi_train.shape)
        #     Xi_train = np.squeeze(Xi_train,axis = 2)
        #     Xi_test = np.squeeze(Xi_test,axis = 2)
        #     Xi_val = np.squeeze(Xi_val,axis = 2)
        #     pass
        if len(X_train) == 0:
            X_train = Xi_train
            y_train = yi_train
            X_test = Xi_test
            # X_val = Xi_val
            # y_val = yi_val
        else:
            X_train = np.concatenate((X_train, Xi_train), axis = 2)
            X_test = np.concatenate((X_test, Xi_test), axis = 2)
            # X_val = np.concatenate((X_val, Xi_val), axis = 2)
    return X_train, y_train, X_val, y_val, X_test, ind, X_text
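
## Usage sketch (assumption, not part of the original repo): the strings in `features` are parsed
## by featureAnalysis(), so the identifiers below are hypothetical placeholders for whatever format
## that helper expects. X_val and y_val come back empty because the validation branches inside the
## function are commented out.
example_features = ["glove-twitter", "deepmoji-avg", "elmo"]  ## hypothetical feature identifiers
X_train, y_train, X_val, y_val, X_test, ind, X_text = get_features_for_prediction(
    example_features, i=0, use_pca=False)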
Example #2
def get_single_feature_for_svm(feature, ty, i):
    
    ## prepare data for feature-10 folders
    vocab = generate_vocab()
    train, val, dev_no_lab = read_data(is_shuffle=True, random_state=i, dev_with_label=constant.dev_with_label, include_test=constant.include_test)
    ## feature_list: glove emoji elmo bert deepmoji emo2vec
    ## to choose between Twitter GloVe and common GloVe, use ty='twitter' or ty='common'
    X_train, y_train = get_feature(train, vocab, feature_list=[feature], mode=['sum'], split="train", ty=ty) ## [29010, 3, emb_size]; 3 is the number of sentences
    X_test, y_test = get_feature(val, vocab, feature_list=[feature], mode=['sum'],split="valid",ty=ty) ## [1150,3,emb_size]
    
    X_train_reduced, X_test_reduced, _ = pca(X_train, X_test)

    return X_train_reduced, y_train, X_test_reduced, y_test
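
## Usage sketch (assumption, not part of the original repo): the reduced features feed a linear
## SVM; scikit-learn is used here purely for illustration, the feature/ty values are examples,
## and pca() is assumed to return 2-D [n_samples, n_components] arrays with 1-D label vectors.
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

X_tr, y_tr, X_te, y_te = get_single_feature_for_svm("glove", ty="twitter", i=0)
clf = LinearSVC(C=0.01)   ## C chosen arbitrarily for the example
clf.fit(X_tr, y_tr)
print("micro-F1:", f1_score(y_te, clf.predict(X_te), average="micro"))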
Example #3
def get_multi_features(features, i, emb_dim, use_pca=False):
    X_train, y_train, X_test, y_test = [],[],[],[]

    for item in features:
        ## distinguish twitter glove and common glove
        ## distinguish deepmoji sum and avg
        feature, ty, mode = featureAnalysis(item)
        
        if item == features[-2]:
            constant.emb_dim = emb_dim[0]
        elif item == features[-1]:
            constant.emb_dim = emb_dim[1]
        else:
            constant.emb_dim = 300

        print(feature)
        ## prepare data for feature-10 folders
        vocab = generate_vocab()
        train, val, dev_no_lab = read_data(is_shuffle=True, random_state=i, dev_with_label=constant.dev_with_label, include_test=constant.include_test)

        ## feature_list: glove emoji elmo bert deepmoji emo2vec
        ## to choose between Twitter GloVe and common GloVe, use ty='twitter' or ty='common'
        split_train = "merged_train"+str(i) if constant.include_test else "train"+str(i)
        split_val = "merged_val"+str(i) if constant.include_test else "valid"+str(i)

        print("Loading split", split_train)

        Xi_train, yi_train = get_feature(train, vocab, feature_list=[feature], mode=[mode], split=split_train, ty=ty) ## [29010, 3, emb_size]; 3 is the number of sentences
        Xi_test, yi_test = get_feature(val, vocab, feature_list=[feature], mode=[mode],split=split_val,ty=ty) ## [1150,3,emb_size]

        if use_pca:
            Xi_train, Xi_test, _ = pca(Xi_train, Xi_test)

        if feature == "bert":
            Xi_train = np.squeeze(Xi_train,axis = 2)
            Xi_test = np.squeeze(Xi_test,axis = 2)
            pass
        if len(X_train) == 0:
            X_train = Xi_train
            y_train = yi_train
            X_test = Xi_test
            y_test = yi_test
        else:
            X_train = np.concatenate((X_train, Xi_train), axis = 2)
            X_test = np.concatenate((X_test, Xi_test), axis = 2)
    return X_train, y_train, X_test, y_test
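
## Usage sketch (assumption, not part of the original repo): emb_dim supplies the embedding sizes
## of the last two features in the list; the identifiers and dimensions below are hypothetical
## placeholders for whatever format featureAnalysis() expects.
multi_features = ["glove-twitter", "deepmoji-avg", "emo2vec-sum"]  ## hypothetical identifiers
X_train, y_train, X_test, y_test = get_multi_features(
    multi_features, i=0, emb_dim=[2304, 100], use_pca=False)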

## Separate fragment from the same source file: a grid search over the regularisation strength C
## of the classifier built with get_classifier(ty='LR', c=c), run across the cross-validation
## splits. The fragment starts mid-function in the original, so the wrapper name and signature
## below are hypothetical.
def tune_lr_c(feature, ty):
    for j in range(1, 10):
        c = j / 1000
        model = get_classifier(ty='LR', c=c)

        microF1s = 0
        for i in range(constant.num_split):

            ## prepare data for feature-10 folders
            vocab = generate_vocab()
            train, val, dev_no_lab = read_data(is_shuffle=True, random_state=i)
            ## feature_list: glove emoji elmo bert deepmoji emo2vec
            ## to choose between Twitter GloVe and common GloVe, use ty='twitter' or ty='common'
            X_train, y_train = get_feature(
                train,
                vocab,
                feature_list=[feature],
                mode=['sum'],
                split="train",
                ty=ty)  ## [29010, 3, emb_size]; 3 is the number of sentences
            X_test, y_test = get_feature(val,
                                         vocab,
                                         feature_list=[feature],
                                         mode=['sum'],
                                         split="valid",
                                         ty=ty)  ## [1150,3,emb_size]

            print("###### EXPERIMENT %d when C equals to %f ######" %
                  ((i + 1), c))
            print("(EXPERIMENT %d) Create the model" % (i + 1))

            ## train, evaluate and predict