def test_select_percentile_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the percentile heuristic
    """
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_classif, mode="percentile", param=25).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def test_select_percentile_regression_full():
    """
    Test whether the relative univariate feature selection
    selects all features when '100%' is asked.
    """
    X, Y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=100)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode="percentile", param=100).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.ones(20)
    assert_array_equal(support, gtruth)
def test_select_percentile_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the percentile heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode="percentile", param=25).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
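A minimal standalone sketch (not part of the original tests) confirming the percentile arithmetic the tests above rely on: with 20 features and percentile=25, 5 columns survive. All names here are illustrative.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectPercentile, f_regression

X, y = make_regression(n_samples=100, n_features=20, n_informative=5,
                       shuffle=False, random_state=0)
selector = SelectPercentile(f_regression, percentile=25)
X_reduced = selector.fit_transform(X, y)
print(X_reduced.shape)         # (100, 5): 25% of 20 features kept
print(selector.get_support())  # boolean mask over the original 20 columns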
def selected_features(pair_data, labels, features):
    # 1. Filter out low-variance features
    vt_sel = VarianceThreshold(threshold=(0.85 * (1 - 0.85)))
    vt_sel.fit(pair_data)
    # No features happen to be filtered out in this experiment;
    # the step is included only as an example.
    print('vt_sel.get_support()====', vt_sel.get_support())
    sel_features1 = features[vt_sel.get_support()]
    sel_pair_data1 = pair_data[:, vt_sel.get_support()]
    print('Low-variance filtering removed %d features'
          % (features.shape[0] - sel_features1.shape[0]))
    print('features.shape[0]====', features.shape[0], '======', features.shape)
    print('sel_features1.shape[0]====', sel_features1.shape[0],
          '=========', sel_features1.shape)

    # 2. Select features by univariate statistical tests,
    #    keeping the top 95% by score
    sp_sel = SelectPercentile(percentile=95)
    sp_sel.fit(sel_pair_data1, labels)
    sel_features2 = sel_features1[sp_sel.get_support()]
    sel_pair_data2 = sel_pair_data1[:, sp_sel.get_support()]
    print('Univariate selection removed %d features'
          % (sel_features1.shape[0] - sel_features2.shape[0]))

    # Plot a bar chart of the feature scores
    # (scores_ aligns with sel_features1, the columns surviving step 1)
    feat_ser = pd.Series(data=sp_sel.scores_, index=sel_features1)
    sort_feat_ser = feat_ser.sort_values(ascending=False)
    plt.figure(figsize=(18, 12))
    sort_feat_ser.plot(kind='bar')
    plt.savefig('../feat_importance.png')
    plt.show()

    return sel_pair_data2, sel_features2
def select_features(pair_data, labels, features):
    """ Perform feature selection """
    print('Selecting features...')

    # 1. Filter out "low-variance" feature columns
    vt_sel = VarianceThreshold(threshold=(0.85 * (1 - 0.85)))
    vt_sel.fit(pair_data)
    # No features happen to be filtered out in this experiment;
    # the step is included only as an example.
    sel_features1 = features[vt_sel.get_support()]
    sel_pair_data1 = pair_data[:, vt_sel.get_support()]
    print('"Low-variance" filtering removed %d features'
          % (features.shape[0] - sel_features1.shape[0]))

    # 2. Select features by "univariate statistical analysis",
    #    keeping the top 95% by score
    sp_sel = SelectPercentile(percentile=95)
    sp_sel.fit(sel_pair_data1, labels)
    sel_features2 = sel_features1[sp_sel.get_support()]
    sel_pair_data2 = sel_pair_data1[:, sp_sel.get_support()]
    print('"Univariate statistical analysis" removed %d features'
          % (sel_features1.shape[0] - sel_features2.shape[0]))

    # Plot a bar chart of the feature scores
    # (scores_ aligns with sel_features1, the columns surviving step 1)
    feat_ser = pd.Series(data=sp_sel.scores_, index=sel_features1)
    sorted_feat_ser = feat_ser.sort_values(ascending=False)
    plt.figure(figsize=(18, 12))
    sorted_feat_ser.plot(kind='bar')
    plt.savefig('./feat_importance.png')
    plt.show()

    return sel_pair_data2, sel_features2
def select_features(pair_data, labels, features):
    # 1. Filter out "low-variance" feature columns
    vt_sel = VarianceThreshold(threshold=(0.9 * (1 - 0.9)))
    vt_sel.fit(pair_data)
    # print(vt_sel.get_support())

    # Drop the noisy features
    features = features[vt_sel.get_support()]
    pair_data = pair_data[:, vt_sel.get_support()]
    # print(pair_data)

    # 2. Keep the top 95% most important features
    sp_sel = SelectPercentile(percentile=95)
    sp_sel.fit(pair_data, labels)
    features = features[sp_sel.get_support()]
    pair_data_1 = pair_data[:, sp_sel.get_support()]
    # print(pair_data_1)

    return pair_data_1, features
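A hedged sketch of an alternative arrangement (not from the snippet above): the same two-step filter expressed as a scikit-learn Pipeline, so the two masks are applied in sequence without manual indexing. `pair_data` and `labels` are assumed to be the arrays used above.

from sklearn.feature_selection import SelectPercentile, VarianceThreshold
from sklearn.pipeline import Pipeline

filter_pipe = Pipeline([
    ('variance', VarianceThreshold(threshold=0.9 * (1 - 0.9))),
    ('percentile', SelectPercentile(percentile=95)),
])
# reduced = filter_pipe.fit_transform(pair_data, labels)  # assumed inputs

The trade-off is that recovering the selected feature names then requires composing the two steps' get_support() masks rather than indexing once per step.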
Xn = csr_matrix(np.array((0, 0)))
yn = load_numpy_matrix(feature_set_path + 'valueVector' + tag + '_train.npy')
print(Counter(yn))

filepath = 'MANUAL'
print(load_numpy_matrix(feature_set_path + 'featureArray' + tag + '_train.npy').shape)
print(load_numpy_matrix(feature_set_path + 'socialVector' + tag + '_train.npy').shape)
Xn = np.hstack((load_numpy_matrix(feature_set_path + 'featureArray' + tag + '_train.npy'),
                load_numpy_matrix(feature_set_path + 'socialVector' + tag + '_train.npy')))
Xn = SelectPercentile(score_func=f_classif, percentile=perc).fit_transform(Xn, yn)

if split:
    sss = StratifiedShuffleSplit(yn, 1, test_size=0.85, random_state=42)
    for train, test in sss:
        Xn, yn = Xn[train], yn[train]

parameter_tuning(Xn, yn, scale=1)
print("DONE WITH MANUAL")

if sparse_tests:
    filepaths = list()
    # filepaths.append(feature_set_path + 'binaryWordData' + tag + '_train.npz')
from sklearn.externals import joblib

# Get the data from the DB
numDimensions = 22
numFolds = 5
X_train = uux_data.getUUXSentences(numDimensions)
y_train = uux_data.getUUXSentenceDimension(numDimensions)
y_train_binary = MultiLabelBinarizer().fit_transform(y_train)
target_names = uux_data.getUUXDimensions(numDimensions)

# Data preprocessing - tokenization, then keeping the top 16% of the best features
vectorizer = TfidfVectorizer(tokenizer=uux_preprocessing.tokenize)
X_train_features = vectorizer.fit_transform(X_train)
X_train_features_names = vectorizer.vocabulary_  # already fitted above; no refit needed
ch2 = SelectPercentile(chi2, percentile=16)
X_train_features = ch2.fit_transform(X_train_features, y_train_binary)
selected_features_names = np.asarray(vectorizer.get_feature_names())[ch2.get_support()]
print(len(selected_features_names))

classifier = Pipeline([
    ('tfidf', vectorizer),
    ('chi2', ch2),
    ('clf', OneVsRestClassifier(LinearSVC()))])
classifier.fit(X_train, y_train_binary)
joblib.dump(classifier, 'classifier/uux_classifier.pkl')
from sklearn.feature_extraction.text import TfidfTransformer

f = open('../sentistrength/data_txt/combined/truth_dataset_3_scale.txt')
lines = f.readlines()
f.close()

sentences = []
sentiments = []
for line in lines:
    elements = line.rstrip('\r\n').split('\t')
    sentences.append(elements[1])
    sentiments.append(int(elements[0]))

ch2 = SelectPercentile(chi2, percentile=96)

# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

parameters = {'tfidf__use_idf': (True, False)}

# K-fold cross-validation strategy (stratification needs the labels, not the text)
skf = cross_validation.StratifiedKFold(sentiments, n_folds=5)

mnb_grid = GridSearchCV(pipeline,
from sklearn.cross_validation import KFold
from sklearn.pipeline import Pipeline
import numpy as np

X_train = uux_data.getUUXSentences(22)
y_train = uux_data.getUUXSentenceDimension(22)
y_train_binary = MultiLabelBinarizer().fit_transform(y_train)
target_names = uux_data.getUUXDimensions(22)

###############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('feature_selection', SelectPercentile()),
    ('clf', OneVsRestClassifier(LinearSVC())),
])

# Uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2), (2, 2)),  # unigrams or bigrams
    'vect__lowercase': (True, False),
    'tfidf__use_idf': (True, False),
    'tfidf__smooth_idf': (True, False),
    'tfidf__sublinear_tf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'feature_selection__score_func': (chi2, f_classif),
    'feature_selection__percentile': (25, 90)
        'Lemma',
        # 'Form',
        # 'LemmaFormDiff_Back',  # class
        # 'LemmaFormDiff_Front',  # not used, yet
        'LemmaSuff_1', 'LemmaSuff_2', 'LemmaSuff_3', 'LemmaSuff_4',
        # 'LemmaSuff_5',
        'LemmaSuff_6',
        # 'LemmaSuff_7',
        'LemmaSuff_8',
        'Tag_POS', 'Tag_CPOS',
        'NEIGHBOR-1_Tag_POS', 'NEIGHBOR-1_Tag_CPOS', 'NEIGHBOR-1_Lemma'
    ],
    # 'filter_attr': lambda key, val: False if key.startswith('Tag') and val in ['.', '-'] else True,
    'vectorizer': DictVectorizer(),
    'feature_filter': SelectPercentile(percentile=20),
    'classifier_class': LogisticRegression,
    'classifier_params': {
        'penalty': ['l1'],
        'C': [1, 10, 100, 1000],
        'tol': [0.01, 0.001, 0.0001]
    },
    'unfold_pattern': '^(penalty|C|tol)$'
}
# Support Vector Machine
for perc in range(1, 100, 2):
    vectorizer = CountVectorizer(tokenizer=negation_handling.tokenize,
                                 ngram_range=(1, 1), max_df=0.5, lowercase=False)
    tfidfTans = TfidfTransformer(use_idf=True, sublinear_tf=True,
                                 smooth_idf=False, norm='l2')
    classifier = Pipeline([
        ('vect', vectorizer),
        ('tfidf', tfidfTans),
        ('feature_selection', SelectPercentile(chi2, percentile=perc)),
        ('clf', LinearSVC(C=0.1, multi_class='ovr')),
    ])
    scores = cross_validation.cross_val_score(classifier, sentences, sentiments,
                                              cv=5, scoring='precision')
    results = np.append(results, scores.mean())

skf = cross_validation.StratifiedKFold(sentiments, n_folds=10)

# Multinomial Naive Bayes
# for perc in range(1, 100, 2):
#     vectorizer = CountVectorizer(tokenizer=negation_handling.tokenize, ngram_range=(1, 2), max_df=0.5, lowercase=True)
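The manual percentile sweep above could also be phrased as a grid search over the pipeline's feature-selection step. A sketch under the same old-API assumptions (the deprecated sklearn.grid_search module matching the sklearn.cross_validation usage above, and the `classifier`, `sentences`, `sentiments` names from the loop):

from sklearn.grid_search import GridSearchCV

param_grid = {'feature_selection__percentile': list(range(1, 100, 2))}
search = GridSearchCV(classifier, param_grid, cv=5, scoring='precision')
# search.fit(sentences, sentiments)
# print(search.best_params_)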
y_train = uux_data.getUUXSentenceDimension(numDimensions)
y_train_binary = MultiLabelBinarizer().fit_transform(y_train)
target_names = uux_data.getUUXDimensions(numDimensions)

x_train_folds, x_test_folds, y_train_folds, y_test_folds = \
    uux_labelset_stratification.kFoldStratify(numFolds)
target_names = uux_data.getUUXDimensions(numDimensions)

percentiles = range(1, 100, 5)
results = []
for perc in range(1, 100, 5):
    p = np.empty([numFolds])
    ch2 = SelectPercentile(chi2, percentile=perc)
    # Perform k-fold cross-validation
    for i in range(0, numFolds):
        # Data preprocessing - tokenization, keeping the top `perc` percent
        # of the best features
        vectorizer = TfidfVectorizer(tokenizer=uux_preprocessing.tokenize)
        X_train_features = vectorizer.fit_transform(x_train_folds[i])
        X_train_features_names = vectorizer.vocabulary_
        X_train_features = ch2.fit_transform(X_train_features, y_train_folds[i])
        selected_features_names = np.asarray(
            vectorizer.get_feature_names())[ch2.get_support()]
        classifier = Pipeline([('tfidf', vectorizer), ('chi2', ch2),
    'RandomizedLogisticRegression': RandomizedLogisticRegression(),
    'RandomizedPCA': RandomizedPCA(),
    'Ridge': Ridge(),
    'RidgeCV': RidgeCV(),
    'RidgeClassifier': RidgeClassifier(),
    'RidgeClassifierCV': RidgeClassifierCV(),
    'RobustScaler': RobustScaler(),
    'SGDClassifier': SGDClassifier(),
    'SGDRegressor': SGDRegressor(),
    'SVC': SVC(),
    'SVR': SVR(),
    'SelectFdr': SelectFdr(),
    'SelectFpr': SelectFpr(),
    'SelectFwe': SelectFwe(),
    'SelectKBest': SelectKBest(),
    'SelectPercentile': SelectPercentile(),
    'ShrunkCovariance': ShrunkCovariance(),
    'SkewedChi2Sampler': SkewedChi2Sampler(),
    'SparsePCA': SparsePCA(),
    'SparseRandomProjection': SparseRandomProjection(),
    'SpectralBiclustering': SpectralBiclustering(),
    'SpectralClustering': SpectralClustering(),
    'SpectralCoclustering': SpectralCoclustering(),
    'SpectralEmbedding': SpectralEmbedding(),
    'StandardScaler': StandardScaler(),
    'TSNE': TSNE(),
    'TheilSenRegressor': TheilSenRegressor(),
    'VBGMM': VBGMM(),
    'VarianceThreshold': VarianceThreshold(),
}
# Y = labelProp.transduction_
# print('Shape of Y:', Y.shape)
# print('first row: ', Y[0])

# SCORER
scorer = make_scorer(score_func=singleLabelScore, greater_is_better=False)

# PREPROCESSING
# SCALING
minMaxScaler = MinMaxScaler(feature_range=(0.0, 1.0))
# normalizer = skprep.Normalizer()
columnDeleter = fs.FeatureDeleter()

# FEATURE SELECTION
varianceThresholdSelector = VarianceThreshold(threshold=(0))
percentileSelector = SelectPercentile(score_func=f_classif, percentile=20)
kBestSelector = SelectKBest(f_classif, 1000)

# FEATURE EXTRACTION
# rbmPipe = skpipe.Pipeline(steps=[('scaling', minMaxScaler), ('rbm', rbm)])
nmf = NMF(n_components=150)
pca = PCA(n_components=80)
sparse_pca = SparsePCA(n_components=700, max_iter=3, verbose=2)
kernel_pca = KernelPCA(n_components=150)  # Costs huge amounts of RAM
randomized_pca = RandomizedPCA(n_components=500)

# REGRESSORS
random_forest_regressor = RandomForestRegressor(n_estimators=256)
gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=60)
support_vector_regressor = svm.SVR()
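A hedged sketch of how the selectors above might be chained into one preprocessing pipeline (step names are illustrative; the `skpipe` alias is assumed from the commented-out rbmPipe line):

preprocessing_pipe = skpipe.Pipeline(steps=[
    ('scaling', minMaxScaler),
    ('variance', varianceThresholdSelector),
    ('percentile', percentileSelector),
])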
        'LemmaSuff_5', 'LemmaSuff_6', 'LemmaSuff_7', 'LemmaSuff_8',
        'Tag_POS', 'Tag_SubPOS', 'Tag_Gen', 'Tag_Num', 'Tag_Cas',
        'Tag_PGe', 'Tag_PNu', 'Tag_Per', 'Tag_Ten', 'Tag_Gra',
        'Tag_Neg', 'Tag_Voi', 'Tag_Var'
    ],
    # This filters out some feature values (here 'Tag_*' values equal to '.' or '-').
    # You can use an arbitrary lambda function here (or None if you don't want it).
    'filter_attr': lambda key, val: False if key.startswith('Tag') and val in ['.', '-'] else True,
    'vectorizer': DictVectorizer(),
    # Feature filtering using ANOVA (recommended)
    'feature_filter': SelectPercentile(percentile=10),
    # You can use any Scikit-Learn classifier here
    'classifier_class': LogisticRegression,
    # Classifier parameter settings (see the Scikit-Learn documentation for the list
    # of parameters). If you use lists instead of single values and specify the
    # unfold_pattern, all the values in the lists will be tried in parallel on a
    # cluster using qsub. Do not use lists of values and the unfold_pattern setting
    # if you don't have access to a cluster/qsub.
    'classifier_params': {
        'penalty': ['l1', 'l2'],
        'C': [1, 10, 100, 1000],
        'tol': [0.01, 0.001, 0.0001]
    },
labels_test_gold = data['labels']
labels_test_gold.shape = (labels_test_gold.shape[1], )
# with open('../feature_names.pickle', 'r') as pickled:
#     feature_names = pickle.load(pickled)
print("Loaded data; testing classifier...")

features_train, labels_train = ClassBalancingClassifierWrapper.rebalance(
    features_train, labels_train, ratio=2)

results = []
for i in range(15):
    print('Round', i)
    classifier = DecisionTreeClassifier()
    classifier = SKLPipeline([('feature_selection', SelectPercentile(f_classif, 1)),
                              ('classification', classifier)])
    classifier.fit(features_train, labels_train)
    labels_test_predicted = classifier.predict(features_test)
    results.append(diff_binary_vectors(labels_test_predicted, labels_test_gold))
    # support = classifier.steps[0][1].get_support(True)
    # print('Selected', len(support), 'features:')
    # for index in support:
    #     print('  ', feature_names[index])

print('Results:')
print(ClassificationMetrics.average(results, False))
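For reference, a minimal sketch of what the commented-out inspection code above does: pulling the fitted SelectPercentile step out of the pipeline and listing the surviving feature indices. Names follow the snippet; `feature_names` is the unpickled list it assumes.

fitted_selector = classifier.named_steps['feature_selection']
support_indices = fitted_selector.get_support(indices=True)
print('Selected %d features' % len(support_indices))
# for index in support_indices:
#     print('  ', feature_names[index])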