Пример #1
0
        # NOTE(review): scraped fragment — the enclosing function and the
        # definitions of `filepaths`, `tag`, `perc`, `split` are outside this view.
        filepaths.append(feature_set_path + 'bigramOnlyBinaryWordData' + tag +
                         '_train.npz')
        #         filepaths.append(feature_set_path + 'bigramOnlyTfidfWordData' + tag + '_train.npz')
        #         filepaths.append(feature_set_path + 'trigramOnlyBinaryWordData' + tag + '_train.npz')
        #         filepaths.append(feature_set_path + 'trigramOnlyTfidfWordData' + tag + '_train.npz')

        # Run chi2 feature selection + hyperparameter tuning on each cached
        # sparse feature matrix.
        for file in filepaths:
            print file  # Python 2 print statement
            print tag

            # Placeholder value; immediately replaced by load_sparse_csr below.
            Xn = csr_matrix(np.array((0, 0)))
            yn = load_numpy_matrix(feature_set_path + r'valueVector' + tag +
                                   '_train.npy')
            print Counter(yn)  # class distribution of the labels
            Xn = load_sparse_csr(file)
            # Keep the top `perc` percent of features by chi-squared score,
            # capped at 200,000 features.
            Xn = SelectKBest(score_func=chi2,
                             k=min(200000,
                                   int(Xn.shape[1] *
                                       (perc / 100.0)))).fit_transform(Xn, yn)

            if split:
                # Stratified subsample: keep 25% of the rows (test_size=0.75
                # is discarded).  Old pre-0.18 sklearn API: the splitter is
                # constructed with y and iterated directly.
                sss = StratifiedShuffleSplit(yn, 1, test_size=0.75)
                for train, test in sss:
                    Xn, yn = Xn[train], yn[train]

            parameter_tuning(Xn, yn, scale=-1)

    if sparse_2_tests:
        filepaths = list()
        #         filepaths.append(feature_set_path+ 'binaryCharacterData' + tag + '_train.npz')
Пример #2
0
# NOTE(review): scraped script fragment — `filepath`, `yn`, and the helper
# functions are defined outside this view.  The commented lines below are
# alternative cached feature files; uncomment exactly one to select it.
#filepath = feature_set_path+ 'trigramOnlyBinaryWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'trigramOnlyTfidfWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'quadgramOnlyBinaryWordData' + tag + '_train.npz'
#filepath = feature_set_path+ 'quadgramOnlyTfidfWordData' + tag + '_train.npz'

#filepath = feature_set_path+ 'binaryCharacterData_train.npz'
#filepath = feature_set_path+ 'tfidfCharacterData_train.npz'

#filepath = feature_set_path+ 'binaryCharacterSkipgramData_train.npz'
#filepath = feature_set_path+ 'tfidfCharacterSkipgramData_train.npz'

#filepath = feature_set_path+ 'binaryLexicalBigramsData_train.npz'
#filepath = feature_set_path+ 'tfidfLexicalBigramsData_train.npz'

# 'MANUAL' acts as a sentinel meaning "Xn was set by hand; skip loading".
if filepath != 'MANUAL':
    Xn = load_sparse_csr(filepath)

#filepath = feature_set_path+ 'Basic300_TfidfFeatures_train.npy'
#filepath = feature_set_path+ 'Basic300_BOCFeatures_train.npy'

#filepath = feature_set_path+ 'google_model_TfidfFeatures_train.npy'
#filepath = feature_set_path+ 'google_model_BOCFeatures_train.npy'
# The triple-quoted string below is code commented out via a string literal:
# the dense (.npy) loading path, disabled in favour of the sparse path above.
'''
if filepath != 'MANUAL':
    Xn = load_numpy_matrix(filepath)
'''
print filepath  # Python 2 print statement

# Stratified subsample: keep 10% of the rows (test_size=0.90 is discarded).
# Old pre-0.18 sklearn API: splitter constructed with y, iterated directly.
sss = StratifiedShuffleSplit(yn, 1, test_size=0.90, random_state=0)
for train, test in sss:
    Xn, yn = Xn[train], yn[train]
Пример #3
0
                                   '_test.npy')

        # NOTE(review): scraped fragment — cut mid-expression at both ends;
        # the enclosing loop over `featureV` is outside this view.
        # `featureV` selects which cached feature representation to load.
        if featureV == 0:
            # Hand-crafted feature array concatenated with social-graph
            # features, for both train and test splits.
            X_train = load_numpy_matrix(feature_set_path + r'featureArray' +
                                        tag + '_train.npy')
            sd = load_numpy_matrix(feature_set_path + r'socialVector' + tag +
                                   '_train.npy')
            X_train = np.hstack((X_train, sd))
            X_test = load_numpy_matrix(feature_set_path + r'featureArray' +
                                       tag + '_test.npy')
            sd2 = load_numpy_matrix(feature_set_path + r'socialVector' + tag +
                                    '_test.npy')
            X_test = np.hstack((X_test, sd2))
            perc = 80  # feature-selection percentage used downstream
        elif featureV == 1:
            # Binary bag-of-words (presence/absence).
            X_train = load_sparse_csr(feature_set_path + r'binaryWordData' +
                                      tag + '_train.npz')
            X_test = load_sparse_csr(feature_set_path + r'binaryWordData' +
                                     tag + '_test.npz')
        elif featureV == 2:
            # Term-frequency bag-of-words.
            X_train = load_sparse_csr(feature_set_path + r'freqWordData' +
                                      tag + '_train.npz')
            X_test = load_sparse_csr(feature_set_path + r'freqWordData' + tag +
                                     '_test.npz')
        elif featureV == 3:
            # TF-IDF weighted bag-of-words.
            X_train = load_sparse_csr(feature_set_path + r'tfidfWordData' +
                                      tag + '_train.npz')
            X_test = load_sparse_csr(feature_set_path + r'tfidfWordData' +
                                     tag + '_test.npz')
        elif featureV == 4:
            # Unigram + bigram binary features (statement cut off by scrape).
            X_train = load_sparse_csr(feature_set_path +
                                      r'bigramBinaryWordData' + tag +
Пример #4
0
# NOTE(review): scraped fragment — stitched together mid-chunk (the
# `featureV == 6` branch below breaks off into an unrelated filepaths loop);
# `tag`, `feature_set_path`, `perc`, `split` are defined outside this view.
if __name__ == '__main__':
    
    # Iterate over the feature-representation variants to evaluate.
    for featureV in [1,2,3,4,5,6,7,10,11,12,13]:          
        y_train = load_numpy_matrix(feature_set_path +  r'valueVector'+tag+'_train.npy')
        y_test = load_numpy_matrix(feature_set_path +  r'valueVector'+tag+'_test.npy')
        
        if featureV == 0:
            # Hand-crafted features + social features, concatenated.
            X_train = load_numpy_matrix(feature_set_path +  r'featureArray'+tag+'_train.npy')
            sd = load_numpy_matrix(feature_set_path +  r'socialVector'+tag+'_train.npy')
            X_train =  np.hstack((X_train,sd))
            X_test = load_numpy_matrix(feature_set_path +  r'featureArray'+tag+'_test.npy')
            sd2 = load_numpy_matrix(feature_set_path +  r'socialVector'+tag+'_test.npy')
            X_test =  np.hstack((X_test,sd2))
            perc = 50  # feature-selection percentage used downstream
        elif featureV == 1:
            X_train = load_sparse_csr(feature_set_path +  r'binaryWordData'+tag+'_train.npz')  
            X_test = load_sparse_csr(feature_set_path +  r'binaryWordData'+tag+'_test.npz')  
        elif featureV == 2:
            X_train = load_sparse_csr(feature_set_path +  r'freqWordData'+tag+'_train.npz')  
            X_test = load_sparse_csr(feature_set_path +  r'freqWordData'+tag+'_test.npz') 
        elif featureV == 3:
            X_train = load_sparse_csr(feature_set_path +  r'tfidfWordData'+tag+'_train.npz') 
            X_test = load_sparse_csr(feature_set_path +  r'tfidfWordData'+tag+'_test.npz') 
        elif featureV == 4:
            X_train = load_sparse_csr(feature_set_path +  r'bigramBinaryWordData'+tag+'_train.npz')  
            X_test = load_sparse_csr(feature_set_path +  r'bigramBinaryWordData'+tag+'_test.npz')  
        elif featureV == 5:
            X_train = load_sparse_csr(feature_set_path +  r'bigramTfidfWordData'+tag+'_train.npz')  
            X_test = load_sparse_csr(feature_set_path +  r'bigramTfidfWordData'+tag+'_test.npz')  
        elif featureV == 6:
            X_train = load_sparse_csr(feature_set_path +  r'trigramBinaryWordData'+tag+'_train.npz')  
# NOTE(review): scrape discontinuity — the lines below belong to a different
# part of the original file (the sparse-tests filepath loop).
#         filepaths.append(feature_set_path + 'trigramBinaryWordData' + tag + '_train.npz')
#         filepaths.append(feature_set_path + 'trigramTfidfWordData' + tag + '_train.npz')

        filepaths.append(feature_set_path + 'bigramOnlyBinaryWordData' + tag + '_train.npz')
#         filepaths.append(feature_set_path + 'bigramOnlyTfidfWordData' + tag + '_train.npz')
#         filepaths.append(feature_set_path + 'trigramOnlyBinaryWordData' + tag + '_train.npz')
#         filepaths.append(feature_set_path + 'trigramOnlyTfidfWordData' + tag + '_train.npz')

        # Run chi2 feature selection + hyperparameter tuning per cached matrix.
        for file in filepaths:
            print file  # Python 2 print statement
            print tag

            # Placeholder value; immediately replaced by load_sparse_csr below.
            Xn = csr_matrix(np.array((0,0)))
            yn = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '_train.npy')
            print Counter(yn)  # class distribution of the labels
            Xn = load_sparse_csr(file)
            # Top `perc` percent of features by chi2 score, capped at 200,000.
            Xn = SelectKBest(score_func=chi2, k=min(200000, int(Xn.shape[1]*(perc/100.0)))).fit_transform(Xn,yn)

            if split:
                # Stratified subsample keeping 25% of rows (old sklearn API).
                sss = StratifiedShuffleSplit(yn, 1, test_size=0.75)
                for train, test in sss:
                    Xn , yn = Xn[train], yn[train]

            parameter_tuning(Xn, yn, scale=-1)

    if sparse_2_tests:
        filepaths = list()
#         filepaths.append(feature_set_path+ 'binaryCharacterData' + tag + '_train.npz')
#         filepaths.append(feature_set_path+ 'tfidfCharacterData' + tag + '_train.npz')
# 
#         filepaths.append(feature_set_path+ 'binaryCharacterSkipgramData' + tag + '_train.npz')
# NOTE(review): scraped fragment — near-duplicate of the earlier script chunk;
# `filepath`, `yn`, and helper functions are defined outside this view.
# Commented lines are alternative cached feature files; uncomment one to use it.
#filepath = feature_set_path+ 'quadgramOnlyTfidfWordData' + tag + '_train.npz'

#filepath = feature_set_path+ 'binaryCharacterData_train.npz'
#filepath = feature_set_path+ 'tfidfCharacterData_train.npz'

#filepath = feature_set_path+ 'binaryCharacterSkipgramData_train.npz'
#filepath = feature_set_path+ 'tfidfCharacterSkipgramData_train.npz'

#filepath = feature_set_path+ 'binaryLexicalBigramsData_train.npz'
#filepath = feature_set_path+ 'tfidfLexicalBigramsData_train.npz'




# 'MANUAL' acts as a sentinel meaning "Xn was set by hand; skip loading".
if filepath != 'MANUAL':
    Xn = load_sparse_csr(filepath)


#filepath = feature_set_path+ 'Basic300_TfidfFeatures_train.npy'
#filepath = feature_set_path+ 'Basic300_BOCFeatures_train.npy'

#filepath = feature_set_path+ 'google_model_TfidfFeatures_train.npy'
#filepath = feature_set_path+ 'google_model_BOCFeatures_train.npy'

# Triple-quoted string below = disabled dense (.npy) loading path.
'''
if filepath != 'MANUAL':
    Xn = load_numpy_matrix(filepath)
'''
print filepath  # Python 2 print statement

# Stratified 10% subsample (old pre-0.18 sklearn API; loop body cut off here).
sss = StratifiedShuffleSplit(yn, 1, test_size=0.90, random_state=0)