def feature_importance_with_forest(rforest_classifier, issues_train, priority_train, issues_test, priority_test): """ Assess feature importance using a Random Forest. :param rforest_classifier: An already fitted classifier. :param issues_train: Train features. :param priority_train: Train classes. :param issues_test: Test features. :param priority_test: Test classes. :return: None """ importances = rforest_classifier.feature_importances_ indices = np.argsort(importances)[::-1] for column_index in range(len(issues_train.columns)): print column_index + 1, ") ", issues_train.columns[column_index], " ", importances[indices[column_index]] figure, axes = plt.subplots(1, 1) plt.title('Feature importance') plt.bar(range(len(issues_train.columns)), importances[indices], color='lightblue', align='center') plt.xticks(range(len(issues_train.columns)), issues_train.columns, rotation=90) plt.xlim([-1, len(issues_train.columns)]) plt.tight_layout() plt.show() evaluate_performance("FOREST", rforest_classifier, issues_train, priority_train, issues_test, priority_test) print "Selecting important features ..." select = SelectFromModel(rforest_classifier, threshold=0.05, prefit=True) train_selected = select.transform(issues_train) test_selected = select.transform(issues_test) rforest_classifier.fit(train_selected, priority_train) evaluate_performance("FOREST-IMPORTANT", rforest_classifier, train_selected, priority_train, test_selected, priority_test)
def lassoCV_regression(data, target, alphas):
    """Select at most two features with LassoCV and report per-fold RMSE.

    A ``SelectFromModel`` wrapper around ``LassoCV`` is fitted once; its
    threshold is then raised in steps of 0.1 until no more than two features
    remain. A fresh fit of ``LassoCV`` is cross-validated on the reduced
    matrix with a shuffled 10-fold split, and the RMSE of each fold is
    plotted and returned.

    :param data: feature matrix, indexed by integer arrays per fold.
    :param target: target vector, indexed the same way.
    :param alphas: currently unused; kept for interface compatibility.
    :return: list of RMSE values, one per fold.
    """
    clf = LassoCV()
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(data, target)

    # Bind the transformed matrix before the loop: when the initial threshold
    # already leaves <= 2 features the loop body never runs.
    data_transform = sfm.transform(data)
    n_features = data_transform.shape[1]
    while n_features > 2:
        # The threshold can be changed without refitting the selector.
        sfm.threshold += 0.1
        data_transform = sfm.transform(data)
        n_features = data_transform.shape[1]

    rmses = []
    kf = KFold(len(target), 10, True, None)
    for train_index, test_index in kf:
        data_train, data_test = data_transform[train_index], data_transform[test_index]
        target_train, target_test = target[train_index], target[test_index]
        clf.fit(data_train, target_train)
        rmse = sqrt(np.mean((clf.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)

    # One x position per fold actually evaluated (1..10 for the split above).
    x0 = np.arange(1, len(rmses) + 1)
    plt.figure()
    plt.plot(x0, rmses, label='LassoCV')
    plt.legend()
    plt.show()
    return rmses
def selecttest():
    """Plot the two Boston features kept by a LassoCV-based SelectFromModel.

    The selector is fitted once with threshold 0.25; the threshold is then
    raised in steps of 0.1 until at most two features remain, and the first
    two remaining columns are plotted against each other.

    NOTE: ``load_boston`` was removed in scikit-learn 1.2, so this demo needs
    an older scikit-learn release.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LassoCV

    boston = load_boston()
    X, y = boston['data'], boston['target']

    clf = LassoCV()
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(X, y)

    # Bind X_transform up front so it exists even if the loop never executes.
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]
    while n_features > 2:
        sfm.threshold += 0.1
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    plt.title(
        "Features selected from Boston using SelectFromModel with "
        "threshold %0.3f." % sfm.threshold)
    feature1 = X_transform[:, 0]
    feature2 = X_transform[:, 1]
    plt.plot(feature1, feature2, 'r.')
    plt.xlabel("Feature number 1")
    plt.ylabel("Feature number 2")
    plt.ylim([np.min(feature2), np.max(feature2)])
    plt.show()
def forests(input_df, target_df): """This method implements two types of forest features selection. ExtraTreesClassifier & RandomForestClassifier. Features are ranked in order of importance.""" from sklearn.ensemble import ExtraTreesClassifier from sklearn.feature_selection import SelectFromModel clf = ExtraTreesClassifier(random_state = 0) clf = clf.fit(input_df, target_df) model = SelectFromModel(clf, prefit=True) input_df_new = model.transform(input_df) original_space = input_df.shape new_space_ETC = input_df_new.shape tuple_holder = [(j, i) for i, j in zip(feature_space, clf.feature_importances_)] tuple_holder.sort() tuple_holder.reverse() ################################################ ################################################ from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(random_state = 0) clf = clf.fit(input_df, target_df) model = SelectFromModel(clf, prefit=True) input_df_new = model.transform(input_df) new_space_RFC = input_df_new.shape tuple_holder_2 = [(j, i) for i, j in zip(feature_space, clf.feature_importances_)] tuple_holder_2.sort() tuple_holder_2.reverse() ################################################ ################################################ rank_number = 0 print 'ExtraTreesClassifier', '\t'*4, 'RandomForestClassifier' print 'Old Space: ', original_space, '\t'*4, 'Old Space:', original_space print 'New Space: ', new_space_ETC, '\t'*4, 'New Space:', new_space_RFC for i, j in zip(tuple_holder, tuple_holder_2): rank_number += 1 print rank_number, '|', i, '\t'*3, rank_number, '|', j
def lasso_reducer(X, y):
    """Reduce X to at most two features with LassoCV and scatter-plot them.

    :param X: feature matrix.
    :param y: target vector.
    :return: None; the result is shown as a matplotlib figure.
    """
    clf = LassoCV()

    # Start from a minimum threshold of 0.25.
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(X, y)

    # Bind X_transform before the loop so it is defined even when the initial
    # threshold already leaves two or fewer features.
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

    # Raise the threshold until at most two features remain. The attribute
    # can be set directly instead of repeatedly fitting the metatransformer.
    while n_features > 2:
        sfm.threshold += 0.1
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    # Plot the selected two features from X. The trailing space after "with"
    # keeps the two adjacent literals from running together.
    plt.title('features selected from boston using the SelectFromModel with '
              'threshold of %0.3f.' % sfm.threshold)
    feature1 = X_transform[:, 0]
    feature2 = X_transform[:, 1]
    plt.plot(feature1, feature2, 'r.')
    plt.xlabel("Value of Feature number 1")
    plt.ylabel("Value of Feature number 2")
    plt.ylim([np.min(feature2), np.max(feature2)])
    plt.show()
    return
def feature_selection(model, X_train, X_test, y_train, y_test, eval_metric='auc'): thresholds = [thres for thres in sorted(model.feature_importances_) if thres != 0] # Use feat. with >0 importance roc_scores = {} for thresh in thresholds: # select features using threshold selection = SelectFromModel(model, threshold=thresh, prefit=True) select_X_train = selection.transform(X_train) selection_model = XGBClassifier() # train model selection_model.fit(select_X_train, y_train, eval_metric=eval_metric) select_X_test = selection.transform(X_test) # eval model y_pred = selection_model.predict(select_X_test) roc = roc_auc_score(y_test, y_pred) roc_scores[selection.threshold] = roc best_thresh = max(roc_scores, key=roc_scores.get) fs = SelectFromModel(model, threshold=best_thresh, prefit=True) pickle_model(fs, 'feature.select') X_train_trans_ = fs.transform(X_train) X_test_trans_ = fs.transform(X_test) print 'total features kept: {}'.format(X_train_trans_.shape[1]) return X_train_trans_, X_test_trans_
def selectfeature(x, y, x_pre): x, x_pre = datscater(x, x_pre) clf = linear_model.LassoLars().fit(x, y) model = SelectFromModel(clf, prefit=True) x_new = model.transform(x) print 'x',x.shape print x_new.shape x_pre = model.transform(x_pre) return x_new, x_pre
def test_threshold_without_refitting():
    """Check that raising the threshold after fit reduces the feature count."""
    selector = SelectFromModel(
        SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0),
        threshold=0.1)
    selector.fit(data, y)
    n_features_before = selector.transform(data).shape[1]

    # A higher threshold must filter out more features, with no refit.
    selector.threshold = 1.0
    n_features_after = selector.transform(data).shape[1]

    assert_greater(n_features_before, n_features_after)
def train(self):
    """Fit a random forest, select features with it, and refit on them.

    Reads ``self.data`` and ``self.target``; replaces ``self.predict`` with
    its feature-selected version.

    :return: the regressor fitted on the selected features.
    """
    forest = RandomForestRegressor()
    forest.fit(self.data, self.target)

    selector = SelectFromModel(forest, prefit=True)
    selected_data = selector.transform(self.data)
    # Apply the same column selection to the data to be predicted later.
    self.predict = selector.transform(self.predict)

    forest.fit(selected_data, self.target)
    return forest
def test_partial_fit():
    """partial_fit must reuse the estimator and agree with a full fit."""
    classes = np.unique(y)
    transformer = SelectFromModel(
        estimator=PassiveAggressiveClassifier(random_state=0, shuffle=False))

    transformer.partial_fit(data, y, classes=classes)
    first_model = transformer.estimator_
    transformer.partial_fit(data, y, classes=classes)
    # The second call must update the same estimator object, not a clone.
    assert_true(first_model is transformer.estimator_)

    expected = transformer.transform(data)
    transformer.fit(np.vstack((data, data)), np.concatenate((y, y)))
    assert_array_equal(expected, transformer.transform(data))
def extra_trees_classifier():
    """Titanic pipeline: extra-trees feature selection + tuned random forest.

    Loads and normalises the data through project helpers, selects features
    with a fitted ``ExtraTreesClassifier``, grid-searches a
    ``RandomForestClassifier`` on the reduced training set with stratified
    5-fold cross-validation, prints the best score and parameters, and writes
    the test predictions to ``./extra_trees_classifier_output.csv``.
    """
    titanic = Titanic_Data('../input/train.csv', '../input/test.csv')
    combined_normalized_data = titanic.get_normalized_data()
    train, test, targets = recover_train_test_target('../input/train.csv', combined_normalized_data)

    clf = ExtraTreesClassifier(n_estimators=200)
    clf = clf.fit(train, targets)

    # Keep only the columns the fitted forest considers important.
    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    test_new = model.transform(test)

    forest = RandomForestClassifier(max_features='sqrt')
    parameter_grid = {
        'max_depth': [4, 5, 6, 7, 8],
        'n_estimators': [200, 210, 240, 250],
        'criterion': ['gini', 'entropy'],
    }
    cross_validation = StratifiedKFold(targets, n_folds=5)
    grid_search = GridSearchCV(forest, param_grid=parameter_grid, cv=cross_validation)
    grid_search.fit(train_new, targets)

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

    output = grid_search.predict(test_new).astype(int)
    df_output = pd.DataFrame()
    # Passenger ids come from the unreduced test frame.
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('./extra_trees_classifier_output.csv', index=False)
def run():
    """Select features with a random forest and score it on a hold-out set.

    Loads the data via ``loadData`` with label ``BTUEL``, fits a random
    forest on all features, keeps the columns chosen by ``SelectFromModel``,
    refits on a 75/25 split of the reduced matrix, and prints the selected
    feature names, the first 100 test targets, the test mean squared error
    and the selected features ranked by importance.
    """
    allKeys, X, y = loadData(
        "../../data/household_electricity_usage/recs2009_public.csv",
        label = "BTUEL",
        otherRemove = ["KWH", "KWHSPH", "KWHCOL", "KWHWTH", "KWHRFG", "KWHOTH", "BTUEL", "BTUELSPH", "BTUELCOL",
                       "BTUELWTH", "BTUELRFG","BTUELOTH", "DOLLAREL", "DOLELSPH", "DOLELCOL", "DOLELWTH", "DOLELRFG",
                       "DOLELOTH", "TOTALBTUOTH", "TOTALBTUCOL", 'TOTALBTU', 'TOTALBTUWTH', 'TOTALBTU', 'TOTALBTUSPH',
                       'TOTALBTURFG', 'TOTALDOL', 'TOTALDOLSPH', 'TOTALDOLCOL', 'TOTALDOLWTH', 'TOTALDOLRFG',
                       'TOTALDOLOTH'])
    #allKeys, X, y = loadData("../../data/household_electricity_usage/recs2009_public.csv", label = "BTUEL", otherRemove = [], forceUse =
    #    [
    #       'WGTP', 'NP', 'TYPE', 'ACR', 'BDSP', 'BATH', 'FS','MHP', 'RMSP', 'RNTP', 'REFR', 'RNTP', 'RWAT', 'STOV', 'TEN', 'VALP', 'YBL', 'FES', 'FINCP', 'HINCP', 'HHT', 'KIT', 'NOC', 'NPF', 'PLM', 'SRNT', 'SVAL', 'TAXP', 'WIF', 'WORKSTAT',
    #    ])

    clf = RandomForestRegressor(n_estimators = 100, n_jobs = 7)
    clf.fit(X, y)

    model = SelectFromModel(clf, prefit = True)
    # Read the mask once, through the public API, before clf is refitted.
    support = model.get_support()
    X = model.transform(X)
    relevantFeatures = [key for key, selected in zip(allKeys, support) if selected]
    print("Relevant Features", relevantFeatures)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)
    clf.fit(X_train, y_train)

    print(y_test[:100])
    print(metrics.mean_squared_error(clf.predict(X_test), y_test))

    # clf was refitted on the reduced matrix, so its importances line up with
    # the selected feature names, not with the full key list.
    features = sorted(zip(relevantFeatures, clf.feature_importances_), key = lambda x : x[1], reverse = True)
    print("Features", features)
def tree_based_selection(self, data_set, data_target, feature_names): """ :param data_set: :return: """ clf = ExtraTreesClassifier() clf = clf.fit(data_set, data_target) print clf.feature_importances_ model = SelectFromModel(clf, prefit=True) feature_set = model.transform(data_set) fea_index = [] for A_col in np.arange(data_set.shape[1]): for B_col in np.arange(feature_set.shape[1]): if (data_set[:, A_col] == feature_set[:, B_col]).all(): fea_index.append(A_col) check = {} for i in fea_index: check[feature_names[i]] = data_set[0][i] print np.array(check) return feature_set, fea_index
def feature_selction(_train_data, _valid_data, _test_data, _train_label, _valid_label, _test_label):
    """Select feature columns with an L1-penalised LinearSVC.

    The three data sets are concatenated and flattened to rows of 120 values;
    each label is repeated 100 times to match. After selection the rows are
    regrouped into blocks of 100 and split back into train/valid/test using
    the original first-axis sizes.

    :return: tuple ``(train, valid, test)`` of reduced 3-D arrays.
    """
    n_train = _train_data.shape[0]
    n_valid = _valid_data.shape[0]

    flat_data = numpy.concatenate((_train_data, _valid_data, _test_data)).reshape((-1, 120))
    image_labels = list(numpy.concatenate((_train_label, _valid_label, _test_label)))

    # One label per flattened row: every image label is repeated 100 times.
    row_labels = [label for label in image_labels for _ in range(100)]
    assert len(row_labels) == flat_data.shape[0]

    lsvc = LinearSVC(C=0.1, penalty="l1", dual=False).fit(flat_data, row_labels)
    selector = SelectFromModel(lsvc, prefit=True)
    reduced = selector.transform(flat_data)
    print ('After feature selection we have', reduced.shape[1], 'features.')

    reduced = reduced.reshape((-1, 100, reduced.shape[1]))
    valid_end = n_train + n_valid
    return reduced[:n_train, :, :], reduced[n_train:valid_end, :, :], reduced[valid_end:, :, :]
def test_feature_importances_2d_coef():
    """SelectFromModel must reduce 2-D coef_ with the requested norm order."""
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
        n_classes=4,
    )
    reference = LogisticRegression()

    cases = [
        (threshold, func, order)
        for threshold, func in (("mean", np.mean), ("median", np.median))
        for order in (1, 2, np.inf)
    ]
    for threshold, func, order in cases:
        # Fit SelectFromModel on a multi-class problem
        transformer = SelectFromModel(estimator=LogisticRegression(),
                                      threshold=threshold,
                                      norm_order=order)
        transformer.fit(X, y)
        assert_true(hasattr(transformer.estimator_, "coef_"))
        X_new = transformer.transform(X)
        assert_less(X_new.shape[1], X.shape[1])

        # Manually check that the norm is correctly performed
        reference.fit(X, y)
        importances = norm(reference.coef_, axis=0, ord=order)
        assert_array_equal(X_new, X[:, importances > func(importances)])
def rf_feat_reduction(rf_model, features): print " Reducing number of input features based on feature importance." subset_model = SelectFromModel(rf_model, prefit=True) feat_subset = subset_model.transform(features) feat_bool = subset_model.get_support() print " " + str(len(feat_subset[0])) + " features chosen after model selection." return feat_subset, feat_bool
def test_partial_fit():
    """partial_fit reuses the estimator and exists only when it supports it."""
    classes = np.unique(y)
    transformer = SelectFromModel(
        estimator=PassiveAggressiveClassifier(random_state=0, shuffle=False))

    transformer.partial_fit(data, y, classes=classes)
    first_model = transformer.estimator_
    transformer.partial_fit(data, y, classes=classes)
    # The second call must update the same estimator object, not a clone.
    assert_true(first_model is transformer.estimator_)

    expected = transformer.transform(data)
    transformer.fit(np.vstack((data, data)), np.concatenate((y, y)))
    assert_array_equal(expected, transformer.transform(data))

    # check that if est doesn't have partial_fit, neither does SelectFromModel
    forest_selector = SelectFromModel(estimator=RandomForestClassifier())
    assert_false(hasattr(forest_selector, "partial_fit"))
def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th): # SelectfromModel clf = LGBMClassifier(n_estimators=400) clf.fit(matrix_x_temp, label_y) sfm = SelectFromModel(clf, prefit=True, threshold=th) matrix_x = sfm.transform(matrix_x_temp) # 打印出有多少特征重要性非零的特征 feature_score_dict = {} for fn, s in zip(fe_name, clf.feature_importances_): feature_score_dict[fn] = s m = 0 for k in feature_score_dict: if feature_score_dict[k] == 0.0: m += 1 print 'number of not-zero features:' + str(len(feature_score_dict) - m) # 打印出特征重要性 feature_score_dict_sorted = sorted(feature_score_dict.items(), key=lambda d: d[1], reverse=True) print 'feature_importance:' for ii in range(len(feature_score_dict_sorted)): print feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1] print '\n' f = open('../eda/lgb_feature_importance.txt', 'w') f.write(th) f.write('\nRank\tFeature Name\tFeature Importance\n') for i in range(len(feature_score_dict_sorted)): f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n') f.close() # 打印具体使用了哪些字段 how_long = matrix_x.shape[1] # matrix_x 是 特征选择后的 输入矩阵 feature_used_dict_temp = feature_score_dict_sorted[:how_long] feature_used_name = [] for ii in range(len(feature_used_dict_temp)): feature_used_name.append(feature_used_dict_temp[ii][0]) print 'feature_chooesed:' for ii in range(len(feature_used_name)): print feature_used_name[ii] print '\n' f = open('../eda/lgb_feature_chose.txt', 'w') f.write('Feature Chose Name :\n') for i in range(len(feature_used_name)): f.write(str(feature_used_name[i]) + '\n') f.close() # 找到未被使用的字段名 feature_not_used_name = [] for i in range(len(fe_name)): if fe_name[i] not in feature_used_name: feature_not_used_name.append(fe_name[i]) return matrix_x, feature_not_used_name[:], len(feature_used_name)
def select_features_tree(X, y, feature_names = []): print X.shape #forest = RandomForestClassifier(n_estimators=1000, n_jobs=4) forest = ExtraTreesClassifier(n_estimators=1000, n_jobs=8) fo = forest.fit(X, y) sorted_feature_names = plot_feature_importance(fo, X, feature_names) model = SelectFromModel(fo, prefit=True, ) X_new = model.transform(X) print X_new.shape return X_new, sorted_feature_names[0:X_new.shape[1]]
def select_feature(clf,x_train,x_valid): clf.fit(x_train, y_train) model = SelectFromModel(clf, prefit=True, threshold="mean") print x_train.shape x_train = model.transform(x_train) x_valid = model.transform(x_valid) print x_train.shape return x_train,x_valid
def test_threshold_string():
    """A "0.5*mean" threshold must equal half the mean importance."""
    forest = RandomForestClassifier(n_estimators=50, random_state=0)
    selector = SelectFromModel(forest, threshold="0.5*mean")
    selector.fit(data, y)
    selected = selector.transform(data)

    # Calculate the threshold from the estimator directly.
    forest.fit(data, y)
    importances = forest.feature_importances_
    expected_mask = importances > 0.5 * np.mean(importances)

    assert_array_equal(selected, data[:, expected_mask])
def select_feature_from_model(X, y, max_features):
    """Print the features a linear SVC keeps, capped at ``max_features``.

    ``X`` is standardised, a ``SelectFromModel`` around a linear ``SVC`` is
    fitted with threshold 0.05, and the threshold is raised in steps of 0.05
    until no more than ``max_features`` columns remain.

    :param X: DataFrame of features (column names are read via ``keys()``).
    :param y: class labels.
    :param max_features: maximum number of features to keep.
    :return: None; the selected column names are printed.
    """
    from sklearn.feature_selection import SelectFromModel

    X_scaled = pd.DataFrame(preprocessing.scale(X), columns=X.keys())
    classifier = SVC(kernel='linear', class_weight='balanced', C=0.025)
    sfm = SelectFromModel(classifier, threshold=0.05)
    sfm.fit(X_scaled, y)

    n_features = sfm.transform(X_scaled).shape[1]
    while n_features > max_features:  # set the max number of features to select
        sfm.threshold += 0.05
        n_features = sfm.transform(X_scaled).shape[1]

    # Read the names straight from the support mask. This works even when the
    # loop above never ran, and does not confuse columns with equal values.
    features_selected = [column for column, kept in zip(X_scaled.keys(), sfm.get_support()) if kept]
    print('Features selection by SelectFromModel: {}'.format(features_selected))
def test_coef_default_threshold():
    """With Lasso the default threshold must behave like 1e-5."""
    X, y = datasets.make_classification(
        n_samples=100,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )

    # For the Lasso and related models, the threshold defaults to 1e-5
    selector = SelectFromModel(estimator=Lasso(alpha=0.1))
    selector.fit(X, y)
    selected = selector.transform(X)

    expected_mask = np.abs(selector.estimator_.coef_) > 1e-5
    assert_array_almost_equal(selected, X[:, expected_mask])
def predict_probabilities(X_train,X_test,y_train,threshold,component,m):
    """Run selection -> PCA -> model and return test probabilities.

    :param threshold: ``SelectFromModel`` threshold for the logistic
        regression selector.
    :param component: ``n_components`` for the PCA step.
    :param m: indexable whose element 1 is the model to fit.
    :return: tuple ``(predicted_probabilities, explained_variance_sum)``.
    """
    ## Selector phase
    selector = SelectFromModel(linear_model.LogisticRegression(), threshold=threshold)
    selector.fit(X_train, y_train)
    selected_train = selector.transform(X_train)

    ## PCA phase
    pca = PCA(n_components=component)
    pca.fit(selected_train)
    pca_variance = sum(pca.explained_variance_ratio_)
    reduced_train = pca.transform(selected_train)
    # The test set goes through the same two fitted transformers.
    reduced_test = pca.transform(selector.transform(X_test))

    ## Model phase
    model = m[1]
    model.fit(reduced_train, y_train)
    return model.predict_proba(reduced_test), pca_variance
def lasso_by_num(X_train, y_train, num): # if random_state not specified, each run gives different result X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0) print X_train # number of features = ycol-1 clf = linear_model.LassoCV() sfm = SelectFromModel(clf, threshold=0.00001) sfm.fit(X_train, y_train) # select 3 features using lasso X_train_trans = sfm.transform(X_train) n_features = X_train_trans.shape[1] while n_features > num: sfm.threshold += 0.01 #print sfm.threshold X_train_trans = sfm.transform(X_train) n_features = X_train_trans.shape[1] print X_train_trans
def select_features(inputs, label, threshold): print 'training ExtraTreesClassifier...' clf = ExtraTreesClassifier(criterion='entropy') clf.fit(inputs, label) threshold='%f*mean'%(threshold) print 'training SelectFromModel, threshold=%s...'%(threshold) sfm = SelectFromModel(clf, threshold=threshold, prefit=True) inputs_new = sfm.transform(inputs) #pdb.set_trace() print inputs_new.shape return sfm, inputs_new
def test_prefit():
    """
    Test all possible combinations of the prefit parameter.
    """
    sgd = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)

    # Reference result: the selector fits the (unfitted) estimator itself.
    reference = SelectFromModel(sgd)
    reference.fit(data, y)
    expected = reference.transform(data)

    # A prefit estimator must give the same selection.
    sgd.fit(data, y)
    assert_array_equal(SelectFromModel(sgd, prefit=True).transform(data), expected)

    # With prefit=False an already fitted estimator is fitted again.
    refit = SelectFromModel(sgd, prefit=False)
    refit.fit(data, y)
    assert_array_equal(refit.transform(data), expected)

    # prefit=True combined with fit() must raise a ValueError.
    prefit_selector = SelectFromModel(sgd, prefit=True)
    assert_raises(ValueError, prefit_selector.fit, data, y)
def test_feature_importances():
    """Check importance-based selection, sample weights and Lasso default."""
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )

    forest = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in (("mean", np.mean), ("median", np.median)):
        transformer = SelectFromModel(estimator=forest, threshold=threshold)
        transformer.fit(X, y)
        assert_true(hasattr(transformer.estimator_, "feature_importances_"))

        X_new = transformer.transform(X)
        assert_less(X_new.shape[1], X.shape[1])

        importances = transformer.estimator_.feature_importances_
        expected_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, expected_mask])

    # Check with sample weights: scaling all weights by a constant must not
    # change the importances.
    sample_weight = np.ones(y.shape)
    sample_weight[y == 1] *= 100

    forest = RandomForestClassifier(n_estimators=50, random_state=0)
    transformer = SelectFromModel(estimator=forest)
    transformer.fit(X, y, sample_weight=sample_weight)
    importances = transformer.estimator_.feature_importances_
    transformer.fit(X, y, sample_weight=3 * sample_weight)
    importances_bis = transformer.estimator_.feature_importances_
    assert_almost_equal(importances, importances_bis)

    # For the Lasso and related models, the threshold defaults to 1e-5
    transformer = SelectFromModel(estimator=Lasso(alpha=0.1))
    transformer.fit(X, y)
    X_new = transformer.transform(X)
    lasso_mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_equal(X_new, X[:, lasso_mask])
def execute(fdata):
    """Compare a full-feature forest with an SVM on selected features.

    Each line of ``fdata`` is comma-separated: the first field is the integer
    class label, the remaining fields are float features. The data is split
    75/25, an ``ExtraTreesClassifier`` is fitted on the training part and
    used to select features, and a linear ``SVC`` is trained on the reduced
    training set.

    :param fdata: iterable of comma-separated text lines.
    :return: tuple ``(forest_accuracy, svm_accuracy)`` on the test split.
    """
    data = []
    target = []
    for line in fdata:
        fields = line.split(",")
        target.append(int(fields[0]))
        data.append([float(value) for value in fields[1:]])
    data = np.array(data)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, target, test_size=0.25,
                                                                         random_state=0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(X_train, y_train)

    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X_train)
    clfNew = svm.SVC(kernel='linear', C=1).fit(X_new, y_train)

    # Apply the fitted selector to the test set. Recovering the selected
    # columns by matching float values breaks when a value repeats in a row.
    X_test_new = model.transform(X_test)

    return accuracy_score(y_test, clf.predict(X_test)), accuracy_score(y_test, clfNew.predict(X_test_new))
def fs_svm(X, y): # feature selection with SVM model lsvc = LinearSVC(C=0.001, penalty="l1", dual=False).fit(X, y) model = SelectFromModel(lsvc, prefit=True) X_new = model.transform(X) LIWC = iot.read_fields() print 'Original feature size', X.shape print 'New feature size', X_new.shape sample_X = X[0] sample_X_new = X_new[0] print 'Original feature length of sample', len(set(sample_X)) print 'New feature length of sample', len(set(sample_X_new)) for i in xrange(len(sample_X)): if sample_X[i] in sample_X_new: print i+1, LIWC[i]
class ExtraTreeBasedSelector(Transformer):
    """Feature selector backed by an ExtraTreesClassifier.

    On first use an extra-trees model is fitted on the target columns and
    wrapped in a prefit ``SelectFromModel`` with threshold ``'mean'``; the
    selected target columns are then combined with the untouched remaining
    columns.
    """

    def __init__(self, n_estimators=100, criterion='gini', min_samples_leaf=1,
                 min_samples_split=2, max_features=0.5, bootstrap='False',
                 max_leaf_nodes='None', max_depth='None',
                 min_weight_fraction_leaf=0., min_impurity_decrease=0.,
                 oob_score=False, n_jobs=-1, random_state=1, verbose=0,
                 class_weight=None):
        """Store the hyperparameters; several may arrive as strings.

        :raises ValueError: if ``criterion`` is not ``'gini'`` or
            ``'entropy'``.
        """
        super().__init__("extra_trees_based_selector", 7)
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'

        self.n_estimators = n_estimators
        self.estimator_increment = 10
        if criterion not in ("gini", "entropy"):
            raise ValueError("'criterion' is not in ('gini', 'entropy'): "
                             "%s" % criterion)
        self.criterion = criterion
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.min_impurity_decrease = min_impurity_decrease
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.class_weight = class_weight

    def operate(self, input_datanode, target_fields=None, sample_weight=None):
        """Select among ``target_fields`` and return a new data node.

        :param input_datanode: node exposing ``data`` as ``(X, y)``,
            ``feature_types``, ``task_type``, ``trans_hist``,
            ``enable_balance`` and ``data_balance``.
        :param target_fields: column indices to select from; when None they
            are collected from the feature types matching ``input_type``.
        :param sample_weight: forwarded to the estimator's ``fit``.
        :return: a ``DataNode`` whose columns are the selected target columns
            followed by all non-target columns.
        """
        from sklearn.feature_selection import SelectFromModel

        feature_types = input_datanode.feature_types
        X, y = input_datanode.data
        if target_fields is None:
            target_fields = collect_fields(feature_types, self.input_type)
        X_new = X[:, target_fields]

        n_fields = len(feature_types)
        irrevalent_fields = list(range(n_fields))
        for field_id in target_fields:
            irrevalent_fields.remove(field_id)

        # NOTE(review): self.model is not set in this class; presumably
        # Transformer.__init__ initialises it to None -- confirm.
        if self.model is None:
            from sklearn.ensemble import ExtraTreesClassifier

            # Hyperparameters may be given as strings ('None', 'False', ...);
            # normalise them before building the estimator.
            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)
            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)

            self.bootstrap = check_for_bool(self.bootstrap)
            self.n_jobs = int(self.n_jobs)
            self.min_impurity_decrease = float(self.min_impurity_decrease)
            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_samples_split = int(self.min_samples_split)
            self.verbose = int(self.verbose)

            # max_features is used as an exponent on the column count.
            max_features = int(X_new.shape[1] ** float(self.max_features))
            # NOTE: min_weight_fraction_leaf is stored on the instance but is
            # not passed to the estimator.
            estimator = ExtraTreesClassifier(
                n_estimators=self.n_estimators,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                class_weight=self.class_weight)
            estimator.fit(X_new, y, sample_weight=sample_weight)
            self.model = SelectFromModel(estimator=estimator,
                                         threshold='mean',
                                         prefit=True)

        _X = self.model.transform(X_new)
        is_selected = self.model.get_support()

        irrevalent_types = [feature_types[idx] for idx in irrevalent_fields]
        # The support mask is positional within X_new (i.e. within
        # target_fields), so index it by position, not by original column id.
        selected_types = [
            feature_types[idx]
            for pos, idx in enumerate(target_fields)
            if is_selected[pos]
        ]
        selected_types.extend(irrevalent_types)

        new_X = np.hstack((_X, X[:, irrevalent_fields]))
        new_feature_types = selected_types
        output_datanode = DataNode((new_X, y), new_feature_types,
                                   input_datanode.task_type)
        output_datanode.trans_hist = input_datanode.trans_hist.copy()
        output_datanode.trans_hist.append(self.type)
        output_datanode.enable_balance = input_datanode.enable_balance
        output_datanode.data_balance = input_datanode.data_balance
        self.target_fields = target_fields.copy()

        return output_datanode

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Return the ConfigurationSpace of tunable hyperparameters.

        :param dataset_properties: unused.
        """
        cs = ConfigurationSpace()

        n_estimators = Constant("n_estimators", 100)
        criterion = CategoricalHyperparameter("criterion",
                                              ["gini", "entropy"],
                                              default_value="gini")
        max_features = UniformFloatHyperparameter("max_features",
                                                  0,
                                                  1,
                                                  default_value=0.5,
                                                  q=0.05)

        max_depth = UnParametrizedHyperparameter(name="max_depth",
                                                 value="None")
        max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes", "None")

        min_samples_split = UniformIntegerHyperparameter("min_samples_split",
                                                         2,
                                                         20,
                                                         default_value=2)
        min_samples_leaf = UniformIntegerHyperparameter("min_samples_leaf",
                                                        1,
                                                        20,
                                                        default_value=1)
        min_weight_fraction_leaf = UnParametrizedHyperparameter(
            'min_weight_fraction_leaf', 0.)
        min_impurity_decrease = UnParametrizedHyperparameter(
            'min_impurity_decrease', 0.)

        bootstrap = CategoricalHyperparameter("bootstrap", ["True", "False"],
                                              default_value="False")

        cs.add_hyperparameters([
            n_estimators, criterion, max_features, max_depth, max_leaf_nodes,
            min_samples_split, min_samples_leaf, min_weight_fraction_leaf,
            min_impurity_decrease, bootstrap
        ])

        return cs
print('Random Forest CV score: {}'.format(np.mean(scores))) # Plot CV scores fig, ax = plt.subplots(figsize=(15, 7)) ax.plot(scores) ax.set_xlabel('Number of Trees (x10)') ax.set_ylabel('10-fold Cross-Validation Accuracy') ax.grid() plt.show() # Show feature importances rf.fit(X, Y) wine = load_wine() features = [wine['feature_names'][x] for x in np.argsort(rf.feature_importances_)][::-1] fig, ax = plt.subplots(figsize=(15, 8)) ax.bar([i for i in range(13)], np.sort(rf.feature_importances_)[::-1], align='center') ax.set_ylabel('Feature Importance') plt.xticks([i for i in range(13)], features, rotation=60) plt.show() # Select the most important features sfm = SelectFromModel(estimator=rf, prefit=True, threshold=0.02) X_sfm = sfm.transform(X) print('Feature selection shape: {}'.format(X_sfm.shape))
# Script fragment: `plt` is used below but imported outside this excerpt.
from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# Load the boston dataset.
# NOTE: load_boston was removed in scikit-learn 1.2; this needs an older
# release.
boston = load_boston()
X, y = boston.data, boston.target

# We use the base estimator LassoCV since the L1 norm promotes sparsity of
# features.
clf = LassoCV(cv=5)

# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)
n_features = sfm.transform(X).shape[1]

# Reset the threshold till the number of features equals two.
# Note that the attribute can be set directly instead of repeatedly
# fitting the metatransformer.
# NOTE(review): X_transform is only bound inside this loop; the plotting
# code below relies on the loop running at least once.
while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the selected two features from X.
plt.title("Features selected from Boston using SelectFromModel with "
          "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')
""" from sklearn.feature_selection import SelectFromModel sdt = SelectFromModel(dt, threshold=0.15) sdt.fit(x_train, y_train) #print name name of importtant variables for i in sdt.get_support(indices=True): print(var[i]) ''' Petal.Length Petal.Width ''' #creat a data subset with only most imp variables x_train = sdt.transform(x_train) x_test = sdt.transform(x_test) print(x_train.shape) #Train the new Random Forest Classifier Using only important Variables #----------model import------------------------------ #=====Random_forest_Classifier======u======= from sklearn.ensemble import RandomForestClassifier dt = RandomForestClassifier( n_estimators=100, random_state=101) #Accuracy 0.9666666666666667 using ginni index #--------fit model---------------------------------- dt.fit(x_train, y_train) #-------predict data test-----------------------------
#if len(z)!=1:
#    if z[1]<int(np.trunc(len(X)*0.1)):
#        from imblearn.over_sampling import RandomOverSampler
#        ros = RandomOverSampler(random_state=0)
#        X1, Label1 = ros.fit_sample(X[int(np.trunc(z[0]*0.85)):], Label[int(np.trunc(z[0]*0.85)):])
#        X = np.vstack((X[0:int(np.trunc(z[0]*0.85))],X1))
#        y_test = np.hstack((Label[0:int(np.trunc(z[0]*0.85))],Label1))

# Feature selection: fit a large extra-trees ensemble (with out-of-bag
# scoring) and keep the features SelectFromModel considers important.
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

clf = ExtraTreesClassifier(
    n_estimators=1000,
    n_jobs=-1,
    bootstrap=True,
    oob_score=True,
)
# NOTE(review): the labels used for fitting are held in `y_test`.
clf = clf.fit(X, y_test)
print(clf.oob_score_)

# Reduce X to the selected columns using the already fitted ensemble.
model = SelectFromModel(clf, prefit=True)
X = model.transform(X)

# Per-feature importance, its spread over the individual trees, and the
# feature indices ordered from most to least important.
importances = clf.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in clf.estimators_]
std = np.std(per_tree_importances, axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking, one line per retained column of X.
importanceindex = []
print("Feature ranking:")
for f, feature_idx in enumerate(indices[:X.shape[1]]):
    print("%d. feature %d (%f)" % (f + 1, feature_idx, importances[feature_idx]))

# Show how many samples each label value has.
from collections import Counter
z1 = Counter(y_test)
print(z1)
return train, test, targets train, test, targets = recover_train_test_target() clf = RandomForestClassifier(n_estimators=50, max_features='sqrt') clf = clf.fit(train, targets) features = pd.DataFrame() features['Feature'] = train.columns features['Importance'] = clf.feature_importances_ features.sort_values(by=['Importance'], ascending=False, inplace=True) features.set_index('Feature', inplace=True) features.plot(kind='bar', figsize=(20, 10)) plt.show() model = SelectFromModel(clf, prefit=True) train_reduced = model.transform(train) print(train_reduced.shape) test_reduced = model.transform(test) test_reduced.shape parameters = { 'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50, 'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6 } model = RandomForestClassifier(**parameters) print(model.fit(train, targets)) print(compute_score(model, train, targets, scoring='accuracy'))
# Fit the forest and rank the features from most to least important.
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the full ranking: position, label (padded to 30 chars), importance.
for f, feature_idx in enumerate(indices[:X_train.shape[1]]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[feature_idx],
                            importances[feature_idx]))

# Bar chart of the importances in ranked order.
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]),
        importances[indices],
        color='lightblue',
        align='center')
plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
#plt.savefig('./random_forest.png', dpi=300)
plt.show()

# Keep only the features whose importance reaches 0.15, reusing the
# already fitted forest.
from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(forest, threshold=0.15, prefit=True)
X_selected = sfm.transform(X_train)
print(X_selected.shape)

# Print the top of the ranking, one entry per selected column.
for f, feature_idx in enumerate(indices[:X_selected.shape[1]]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[feature_idx],
                            importances[feature_idx]))
'pay_amount', 'discount_amount', 'basket_cnt', 'phonecall_cnt', 'favourite_cnt', 'forward_cnt' ]] #X_test = stepone_tb[['pay_amount', 'discount_amount', 'basket_cnt', 'phonecall_cnt', 'favourite_cnt', 'forward_cnt']][11001:16597] y_train = stepone_tb['purchase_cnt'] #y_test = stepone_tb['purchase_cnt'][11001:16597] from sklearn import linear_model clf = linear_model.Lasso(alpha=0.1) clf.fit(X_train, X_test).predict(y_train) from sklearn.svm import LinearSVC from sklearn.feature_selection import SelectFromModel lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train) model = SelectFromModel(lsvc, prefit=True) X_new = model.transform(X_train) from sklearn.ensemble import ExtraTreesClassifier clf = ExtraTreesClassifier() clf = clf.fit(X_train, y_train) # Lasso import matplotlib.pyplot as plt from sklearn.metrics import r2_score from sklearn.linear_model import Lasso from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC alpha = 0.1 clf = LassoCV(cv=20) y_pred_lasso = clf.fit(X_train, y_train).predict(X_test) r2_score_lasso = r2_score(y_test, y_pred_lasso) print(lasso) print("r^2 on test data : %f" % r2_score_lasso)
class LibLinear_Preprocessor(AutoSklearnPreprocessingAlgorithm):
    """Feature selection driven by a fitted ``sklearn.svm.LinearSVC``.

    ``fit`` trains a LinearSVC with the configured hyperparameters and wraps
    it in a prefit ``SelectFromModel`` with ``threshold='mean'``;
    ``transform`` applies that selector.

    Hyperparameters may arrive as strings (e.g. ``"True"``, ``"None"``) and
    are converted to their Python types when ``fit`` is called.
    """

    # Liblinear is not deterministic as it uses a RNG inside
    def __init__(self, penalty, loss, dual, tol, C, multi_class,
                 fit_intercept, intercept_scaling, class_weight=None,
                 random_state=None):
        """Store the LinearSVC hyperparameters; no estimator is built yet."""
        self.penalty = penalty
        self.loss = loss
        self.dual = dual
        self.tol = tol
        self.C = C
        self.multi_class = multi_class
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.class_weight = class_weight
        self.random_state = random_state
        self.preprocessor = None

    @staticmethod
    def _as_bool(value):
        """Convert a ``"True"``/``"False"`` string (or a bool) to a bool.

        Real booleans are passed through unchanged so that calling ``fit``
        a second time does not turn an already converted ``True`` into
        ``False`` (``True == 'True'`` is ``False``).
        """
        if isinstance(value, bool):
            return value
        return value == 'True'

    def fit(self, X, Y):
        """Fit the LinearSVC on ``X``/``Y`` and build the feature selector.

        The stored hyperparameters are normalised in place to their Python
        types before the estimator is constructed.

        :return: ``self``.
        """
        import sklearn.svm
        from sklearn.feature_selection import SelectFromModel

        self.C = float(self.C)
        self.tol = float(self.tol)
        self.dual = self._as_bool(self.dual)
        self.fit_intercept = self._as_bool(self.fit_intercept)
        self.intercept_scaling = float(self.intercept_scaling)

        if self.class_weight == "None":
            self.class_weight = None

        estimator = sklearn.svm.LinearSVC(
            penalty=self.penalty,
            loss=self.loss,
            dual=self.dual,
            tol=self.tol,
            C=self.C,
            class_weight=self.class_weight,
            fit_intercept=self.fit_intercept,
            intercept_scaling=self.intercept_scaling,
            multi_class=self.multi_class,
            random_state=self.random_state)

        estimator.fit(X, Y)
        # The estimator is already fitted, hence prefit=True.
        self.preprocessor = SelectFromModel(estimator=estimator,
                                            threshold='mean',
                                            prefit=True)

        return self

    def transform(self, X):
        """Reduce ``X`` to the selected features.

        :raises NotImplementedError: if ``fit`` has not been called yet.
        """
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static description of this preprocessor."""
        return {
            'shortname': 'LinearSVC Preprocessor',
            'name': 'Liblinear Support Vector Classification Preprocessing',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'input': (SPARSE, DENSE, UNSIGNED_DATA),
            'output': (INPUT, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Build the configuration space of the tunable hyperparameters.

        The combination ``penalty="l1"`` with ``loss="hinge"`` is registered
        as forbidden.
        """
        cs = ConfigurationSpace()

        penalty = Constant("penalty", "l1")
        loss = CategoricalHyperparameter("loss", ["hinge", "squared_hinge"],
                                         default="squared_hinge")
        dual = Constant("dual", "False")
        # This is set ad-hoc
        tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, default=1e-4,
                                         log=True)
        C = UniformFloatHyperparameter("C", 0.03125, 32768, log=True,
                                       default=1.0)
        multi_class = Constant("multi_class", "ovr")
        # These are set ad-hoc
        fit_intercept = Constant("fit_intercept", "True")
        intercept_scaling = Constant("intercept_scaling", 1)

        cs.add_hyperparameters([
            penalty, loss, dual, tol, C, multi_class, fit_intercept,
            intercept_scaling
        ])

        penalty_and_loss = ForbiddenAndConjunction(
            ForbiddenEqualsClause(penalty, "l1"),
            ForbiddenEqualsClause(loss, "hinge"))
        cs.add_forbidden_clause(penalty_and_loss)
        return cs
def fit_and_score(X, y, scoring, train, test, parameters, fit_params=None,
                  return_train_score=True, return_n_test_samples=True,
                  return_times=True, return_parameters=False,
                  return_estimator=False, error_score='raise', verbose=True,
                  return_all=True):
    """Fit an estimator to a dataset and score the performance.

    The estimator is constructed from ``parameters``. The following methods
    can be applied as preprocessing before fitting, in the order in which
    they are executed here:

    0. Apply OneHotEncoder
    1. Apply feature imputation
    2. Select features based on feature type group (e.g. shape, histogram).
    3. Scale features with e.g. z-scoring.
    4. Apply feature selection based on variance of feature among patients.
    5. Use Relief feature selection.
    6. Select features based on a fit with a model (Lasso, LR or RF).
    7. Select features using PCA.
    8. Univariate statistical testing (e.g. t-test, Wilcoxon).
    9. Resampling
    10. If a SingleLabel classifier is used for a MultiLabel problem,
        a OneVsRestClassifier is employed around it.

    Every preprocessing object is fitted on the training split only and then
    applied to both the training and the test split.

    Parameters
    ----------
    X: array, mandatory
        Array containing for each object (rows) the feature values
        (1st Column) and the associated feature label (2nd Column).

    y: list(?), mandatory
        List containing the labels of the objects.

    scoring: sklearn scorer specification, mandatory
        Passed to ``check_multimetric_scoring`` to obtain the scorers used
        as optimization criterion.

    train: list, mandatory
        Indices of the objects to be used as training set.

    test: list, mandatory
        Indices of the objects to be used as testing set.

    parameters: dictionary, mandatory
        Contains the settings used for the above preprocessing functions
        and the fitting. A copy is consumed internally; the original is
        appended to the returned ``ret``. When a selection step is skipped
        because it would remove every feature or object class, the
        corresponding ``..._Use`` / ``SelectFromModel`` entry of this
        dictionary is set to ``'False'``.

    fit_params: dictionary, default None
        Parameters supplied to the estimator for fitting.

    return_train_score: boolean, default True
        Include the training score in ``ret``.

    return_n_test_samples: boolean, default True
        Include the number of test samples in ``ret``.

    return_times: boolean, default True
        Include the fit and score times in ``ret``.

    return_parameters: boolean, default False
        Include the estimator parameters in ``ret``.

    return_estimator: bool, default False
        Whether to include the fitted estimator in ``ret``.

    error_score: numeric or "raise" by default
        Value to assign to the score if an error occurs in estimator
        fitting. If set to "raise", the error is raised.

    verbose: boolean, default True
        If True, print intermediate progress to command line.

    return_all: boolean, default True
        If False, only the ret object containing the performance will be
        returned. If True, the ret object plus all fitted objects will be
        returned.

    Returns
    ----------
    Depending on the return_all input parameter, either only ret or the
    tuple ``(ret, GroupSel, VarSel, SelectModel, feature_labels, scaler,
    encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler)``.

    ret: list
        Contains optionally the train_scores and the test_scores,
        the number of test samples, fit_time, score_time, the estimator
        parameters and the estimator, followed by ``parameters``. When a
        step fails in a way that is handled here, a dummy ``ret`` filled
        with default scores and infinite times is returned instead.

    GroupSel, VarSel, SelectModel, scaler, encoder, imputer, pca,
    StatisticalSel, ReliefSel, Sampler: object or None
        The fitted preprocessing object of the respective step, or None if
        the step was not used or was skipped.

    feature_labels: list
        Labels of the features after selection. Only one list is returned,
        not one per object, as all samples are assumed to have the same
        feature names.

    Raises
    ------
    ae.WORCValueError
        If imputation changes the number of features.
    ae.WORCKeyError
        If the SelectFromModel estimator name is not Lasso, LR or RF.

    """
    # We copy the parameter object so we can alter it and keep the original
    if verbose:
        print("\n")
        print('#######################################')
        print('Starting fit and score of new workflow.')
    para_estimator = parameters.copy()
    estimator = cc.construct_classifier(para_estimator)

    # Check the scorer
    scorers, __ = check_multimetric_scoring(estimator, scoring=scoring)

    para_estimator = delete_cc_para(para_estimator)

    # Get random seed from parameters
    random_seed = para_estimator['random_seed']
    del para_estimator['random_seed']

    # X is a tuple: split in two arrays
    feature_values = np.asarray([x[0] for x in X])
    feature_labels = np.asarray([x[1] for x in X])

    # Split in train and testing
    X_train, y_train = _safe_split(estimator, feature_values, y, train)
    X_test, y_test = _safe_split(estimator, feature_values, y, test, train)
    # From here on the indices refer to the concatenation [train, test]
    train = np.arange(0, len(y_train))
    test = np.arange(len(y_train), len(y_train) + len(y_test))

    # Set some defaults for if a part fails and we return a dummy
    fit_time = np.inf
    score_time = np.inf
    Sampler = None
    encoder = None
    imputer = None
    scaler = None
    GroupSel = None
    SelectModel = None
    pca = None
    StatisticalSel = None
    VarSel = None
    ReliefSel = None
    if isinstance(scorers, dict):
        test_scores = {name: np.nan for name in scorers}
        if return_train_score:
            train_scores = test_scores.copy()
    else:
        test_scores = error_score
        if return_train_score:
            train_scores = error_score

    # Initiate dummy return object for when fit and scoring fails:
    # sklearn defaults
    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(para_estimator)
    if return_estimator:
        ret.append(estimator)

    # Additional to sklearn defaults: return all parameters
    ret.append(parameters)

    def _result():
        """Return ``ret`` alone, or with all fitted objects if return_all."""
        if not return_all:
            return ret
        return (ret, GroupSel, VarSel, SelectModel, feature_labels[0],
                scaler, encoder, imputer, pca, StatisticalSel, ReliefSel,
                Sampler)

    # ------------------------------------------------------------------------
    # OneHotEncoder
    if 'OneHotEncoding' in para_estimator.keys():
        if para_estimator['OneHotEncoding'] == 'True':
            if verbose:
                print('Applying OneHotEncoding, will ignore unknowns.')
            feature_labels_tofit =\
                para_estimator['OneHotEncoding_feature_labels_tofit']
            encoder =\
                OneHotEncoderWrapper(handle_unknown='ignore',
                                     feature_labels_tofit=feature_labels_tofit,
                                     verbose=verbose)
            encoder.fit(X_train, feature_labels)

            if encoder.encoder is not None:
                # Encoder is fitted
                feature_labels = encoder.encoder.encoded_feature_labels
                X_train = encoder.transform(X_train)
                X_test = encoder.transform(X_test)

        del para_estimator['OneHotEncoding']
        del para_estimator['OneHotEncoding_feature_labels_tofit']

    # Delete the object if we do not need to return it
    if not return_all:
        del encoder

    # ------------------------------------------------------------------------
    # Feature imputation
    if 'Imputation' in para_estimator.keys():
        if para_estimator['Imputation'] == 'True':
            imp_type = para_estimator['ImputationMethod']
            if verbose:
                print(f'Imputing NaN with {imp_type}.')
            imp_nn = para_estimator['ImputationNeighbours']

            imputer = Imputer(missing_values=np.nan, strategy=imp_type,
                              n_neighbors=imp_nn)
            imputer.fit(X_train)

            original_shape = X_train.shape
            X_train = imputer.transform(X_train)
            imputed_shape = X_train.shape
            X_test = imputer.transform(X_test)

            # A changed shape means the imputer dropped features
            if original_shape != imputed_shape:
                removed_features = original_shape[1] - imputed_shape[1]
                raise ae.WORCValueError(
                    f'Several features ({removed_features}) were np.NaN for all objects. Hence, imputation was not possible. Either make sure this is correct and turn of imputation, or correct the feature.'
                )

        del para_estimator['Imputation']
        del para_estimator['ImputationMethod']
        del para_estimator['ImputationNeighbours']

    # Delete the object if we do not need to return it
    if not return_all:
        del imputer

    # Remove any NaN feature values if these are still left after imputation
    X_train = replacenan(X_train, verbose=verbose,
                         feature_labels=feature_labels[0])
    X_test = replacenan(X_test, verbose=verbose,
                        feature_labels=feature_labels[0])

    # ------------------------------------------------------------------------
    # Groupwise feature selection
    if 'SelectGroups' in para_estimator:
        if verbose:
            print("Selecting groups of features.")
        del para_estimator['SelectGroups']
        # TODO: more elegant way to solve this
        feature_groups = [
            'shape_features', 'histogram_features', 'orientation_features',
            'texture_gabor_features', 'texture_glcm_features',
            'texture_gldm_features', 'texture_glcmms_features',
            'texture_glrlm_features', 'texture_glszm_features',
            'texture_gldzm_features', 'texture_ngtdm_features',
            'texture_ngldm_features', 'texture_lbp_features',
            'dicom_features', 'semantic_features', 'coliage_features',
            'vessel_features', 'phase_features', 'fractal_features',
            'location_features', 'rgrd_features', 'original_features',
            'wavelet_features', 'log_features'
        ]

        # First take out the toolbox selection, which is a list
        toolboxes = para_estimator['toolbox']
        del para_estimator['toolbox']

        # Check per feature group if the parameter is present
        parameters_featsel = dict()
        for group in feature_groups:
            if group not in para_estimator:
                # Default: do use the group, except for texture features
                if group == 'texture_features':
                    value = 'False'
                else:
                    value = 'True'
            else:
                value = para_estimator[group]
                del para_estimator[group]

            parameters_featsel[group] = value

        # Fit groupwise feature selection object
        GroupSel = SelectGroups(parameters=parameters_featsel,
                                toolboxes=toolboxes)
        GroupSel.fit(feature_labels[0])
        if verbose:
            print("\t Original Length: " + str(len(X_train[0])))

        # Transform all objects accordingly
        X_train = GroupSel.transform(X_train)
        X_test = GroupSel.transform(X_test)

        if verbose:
            print("\t New Length: " + str(len(X_train[0])))
        feature_labels = GroupSel.transform(feature_labels)

    # Delete the object if we do not need to return it
    if not return_all:
        del GroupSel

    # Check whether there are any features left
    if len(X_train[0]) == 0:
        # TODO: Make a specific WORC exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably all feature groups were set to False. Parameters:'
            )
            print(parameters)

        # Delete the non-used fields
        para_estimator = delete_nonestimator_parameters(para_estimator)
        return _result()

    # ------------------------------------------------------------------------
    # Feature scaling
    if verbose and para_estimator['FeatureScaling'] != 'None':
        print('Fitting scaler and transforming features, method ' +
              f'{para_estimator["FeatureScaling"]}.')

    scaling_method = para_estimator['FeatureScaling']
    if scaling_method == 'None':
        scaler = None
    else:
        skip_features = para_estimator['FeatureScaling_skip_features']
        # Count the features whose label contains one of the skip patterns
        n_skip_feat = len([
            i for i in feature_labels[0]
            if any(e in i for e in skip_features)
        ])
        if n_skip_feat == len(X_train[0]):
            # Don't need to scale any features
            if verbose:
                print(
                    '[WORC Warning] Skipping scaling, only skip features selected.'
                )
            scaler = None
        else:
            scaler = WORCScaler(method=scaling_method,
                                skip_features=skip_features)
            scaler.fit(X_train, feature_labels[0])

    if scaler is not None:
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    del para_estimator['FeatureScaling']

    # Delete the object if we do not need to return it
    if not return_all:
        del scaler

    # --------------------------------------------------------------------
    # Feature selection based on variance
    if para_estimator['Featsel_Variance'] == 'True':
        if verbose:
            print("Selecting features based on variance.")
            print("\t Original Length: " + str(len(X_train[0])))
        try:
            X_train, feature_labels, VarSel =\
                selfeat_variance(X_train, feature_labels)
            X_test = VarSel.transform(X_test)
        except ValueError:
            if verbose:
                print(
                    '[WARNING]: No features meet the selected Variance threshold! Skipping selection.'
                )
        if verbose:
            print("\t New Length: " + str(len(X_train[0])))

    del para_estimator['Featsel_Variance']

    # Delete the object if we do not need to return it
    if not return_all:
        del VarSel

    # Check whether there are any features left
    if len(X_train[0]) == 0:
        # TODO: Make a specific WORC exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably your features have too little variance. Parameters:'
            )
            print(parameters)
        para_estimator = delete_nonestimator_parameters(para_estimator)
        return _result()

    # --------------------------------------------------------------------
    # Relief feature selection, possibly multi class.
    # Needs to be done after scaling!
    if 'ReliefUse' in para_estimator.keys():
        if para_estimator['ReliefUse'] == 'True':
            if verbose:
                print("Selecting features using relief.")

            # Get parameters from para_estimator
            n_neighbours = para_estimator['ReliefNN']
            sample_size = para_estimator['ReliefSampleSize']
            distance_p = para_estimator['ReliefDistanceP']
            numf = para_estimator['ReliefNumFeatures']

            # Fit RELIEF object on the training split. The labels must be
            # y_train: X_train only holds the training rows, so the full
            # label array y would not line up with it.
            ReliefSel = SelectMulticlassRelief(n_neighbours=n_neighbours,
                                               sample_size=sample_size,
                                               distance_p=distance_p,
                                               numf=numf,
                                               random_state=random_seed)
            ReliefSel.fit(X_train, y_train)
            if verbose:
                print("\t Original Length: " + str(len(X_train[0])))

            # Transform all objects accordingly
            X_train = ReliefSel.transform(X_train)
            X_test = ReliefSel.transform(X_test)

            if verbose:
                print("\t New Length: " + str(len(X_train[0])))
            feature_labels = ReliefSel.transform(feature_labels)

        del para_estimator['ReliefUse']
        del para_estimator['ReliefNN']
        del para_estimator['ReliefSampleSize']
        del para_estimator['ReliefDistanceP']
        del para_estimator['ReliefNumFeatures']

    # Delete the object if we do not need to return it
    if not return_all:
        del ReliefSel

    # Check whether there are any features left
    if len(X_train[0]) == 0:
        # TODO: Make a specific WORC exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably RELIEF could not properly select features. Parameters:'
            )
            print(parameters)
        para_estimator = delete_nonestimator_parameters(para_estimator)
        return _result()

    # ------------------------------------------------------------------------
    # Perform feature selection using a model. Like all other steps this is
    # optional: it only runs when the configuration explicitly enables it.
    if para_estimator.get('SelectFromModel') == 'True':
        model = para_estimator['SelectFromModel_estimator']
        if verbose:
            print(f"Selecting features using model {model}.")

        if model == 'Lasso':
            # Use lasso model for feature selection
            alpha = para_estimator['SelectFromModel_lasso_alpha']
            selectestimator = Lasso(alpha=alpha)

        elif model == 'LR':
            # Use logistic regression model for feature selection
            selectestimator = LogisticRegression()

        elif model == 'RF':
            # Use random forest model for feature selection
            n_estimators = para_estimator['SelectFromModel_n_trees']
            selectestimator = RandomForestClassifier(n_estimators=n_estimators)
        else:
            raise ae.WORCKeyError(
                f'Model {model} is not known for SelectFromModel. Use Lasso, LR, or RF.'
            )

        # Prefit model
        selectestimator.fit(X_train, y_train)

        # Use fit to select optimal features
        SelectModel = SelectFromModel(selectestimator, prefit=True)
        if verbose:
            print("\t Original Length: " + str(len(X_train[0])))

        # Transform into a temporary first, so the step can be skipped when
        # it would remove every feature.
        X_train_temp = SelectModel.transform(X_train)
        if len(X_train_temp[0]) == 0:
            if verbose:
                print(
                    '[WORC WARNING]: No features are selected! Probably your data is too noisy or the selection too strict. Skipping SelectFromModel.'
                )
            SelectModel = None
            parameters['SelectFromModel'] = 'False'
        else:
            X_train = X_train_temp
            X_test = SelectModel.transform(X_test)
            feature_labels = SelectModel.transform(feature_labels)

        if verbose:
            print("\t New Length: " + str(len(X_train[0])))

    # Drop the settings of this step, whether or not the step was used
    for key in ('SelectFromModel', 'SelectFromModel_lasso_alpha',
                'SelectFromModel_estimator', 'SelectFromModel_n_trees'):
        para_estimator.pop(key, None)

    # Delete the object if we do not need to return it
    if not return_all:
        del SelectModel

    # Check whether there are any features left
    if len(X_train[0]) == 0:
        # TODO: Make a specific WORC exception for this warning.
        if verbose:
            print(
                '[WARNING]: No features are selected! Probably SelectFromModel could not properly select features. Parameters:'
            )
            print(parameters)
        para_estimator = delete_nonestimator_parameters(para_estimator)
        return _result()

    # ----------------------------------------------------------------
    # PCA dimensionality reduction
    # Principle Component Analysis
    if 'UsePCA' in para_estimator.keys(
    ) and para_estimator['UsePCA'] == 'True':
        if verbose:
            print('Fitting PCA')
            print("\t Original Length: " + str(len(X_train[0])))
        if para_estimator['PCAType'] == '95variance':
            # Select first X components that describe 95 percent of the
            # explained variance
            pca = PCA(n_components=None, random_state=random_seed)
            try:
                pca.fit(X_train)
            except (ValueError, LinAlgError) as e:
                if verbose:
                    print(
                        f'[WARNING]: skipping this setting due to PCA Error: {e}.'
                    )
                return _result()

            evariance = pca.explained_variance_ratio_
            # Count components until 95% is explained; the bound on num
            # guards against running past the end of evariance.
            num = 0
            explained = 0
            while explained < 0.95 and num < len(evariance):
                explained += evariance[num]
                num += 1

            # Make a PCA based on the determined amount of components
            pca = PCA(n_components=num, random_state=random_seed)
            try:
                pca.fit(X_train)
            except (ValueError, LinAlgError) as e:
                if verbose:
                    print(
                        f'[WARNING]: skipping this setting due to PCA Error: {e}.'
                    )
                return _result()

            X_train = pca.transform(X_train)
            X_test = pca.transform(X_test)

        else:
            # Assume a fixed number of components: cannot be larger than
            # n_samples
            n_components = min(len(X_train), int(para_estimator['PCAType']))

            if n_components >= len(X_train[0]):
                if verbose:
                    print(
                        f"[WORC WARNING] PCA n_components ({n_components})> n_features ({len(X_train[0])}): skipping PCA."
                    )
            else:
                pca = PCA(n_components=n_components,
                          random_state=random_seed)
                pca.fit(X_train)
                X_train = pca.transform(X_train)
                X_test = pca.transform(X_test)

        if verbose:
            print("\t New Length: " + str(len(X_train[0])))

    # Delete the object if we do not need to return it
    if not return_all:
        del pca

    if 'UsePCA' in para_estimator.keys():
        del para_estimator['UsePCA']
        del para_estimator['PCAType']

    # --------------------------------------------------------------------
    # Feature selection based on a statistical test
    if 'StatisticalTestUse' in para_estimator.keys():
        if para_estimator['StatisticalTestUse'] == 'True':
            metric = para_estimator['StatisticalTestMetric']
            threshold = para_estimator['StatisticalTestThreshold']
            if verbose:
                print(
                    f"Selecting features based on statistical test. Method {metric}, threshold {round(threshold, 5)}."
                )
                print("\t Original Length: " + str(len(X_train[0])))

            # Fit on the training split with its matching labels (y_train,
            # not the full label array y).
            StatisticalSel = StatisticalTestThreshold(metric=metric,
                                                      threshold=threshold)
            StatisticalSel.fit(X_train, y_train)

            # Transform into a temporary first, so the step can be skipped
            # when it would remove every feature.
            X_train_temp = StatisticalSel.transform(X_train)
            if len(X_train_temp[0]) == 0:
                if verbose:
                    print(
                        '[WORC WARNING]: No features are selected! Probably your statistical test feature selection was too strict. Skipping thresholding.'
                    )
                StatisticalSel = None
                parameters['StatisticalTestUse'] = 'False'
            else:
                X_train = X_train_temp
                X_test = StatisticalSel.transform(X_test)
                feature_labels = StatisticalSel.transform(feature_labels)

            if verbose:
                print("\t New Length: " + str(len(X_train[0])))

        del para_estimator['StatisticalTestUse']
        del para_estimator['StatisticalTestMetric']
        del para_estimator['StatisticalTestThreshold']

    # Delete the object if we do not need to return it
    if not return_all:
        del StatisticalSel

    # ------------------------------------------------------------------------
    # Use object resampling
    if 'Resampling_Use' in para_estimator.keys():
        if para_estimator['Resampling_Use'] == 'True':

            # Determine our starting balance
            pos_initial = int(np.sum(y_train))
            neg_initial = int(len(y_train) - pos_initial)
            len_in = len(y_train)

            # Fit ObjectSampler and transform dataset
            # NOTE: need to save random state for this one as well!
            Sampler =\
                ObjectSampler(method=para_estimator['Resampling_Method'],
                              sampling_strategy=para_estimator['Resampling_sampling_strategy'],
                              n_jobs=para_estimator['Resampling_n_cores'],
                              n_neighbors=para_estimator['Resampling_n_neighbors'],
                              k_neighbors=para_estimator['Resampling_k_neighbors'],
                              threshold_cleaning=para_estimator['Resampling_threshold_cleaning'],
                              verbose=verbose)

            try:
                Sampler.fit(X_train, y_train)
                X_train_temp, y_train_temp = Sampler.transform(
                    X_train, y_train)

            except ae.WORCValueError as e:
                message = str(e)
                if verbose:
                    print('[WORC WARNING] Skipping resampling: ' + message)
                Sampler = None
                parameters['Resampling_Use'] = 'False'

            except RuntimeError as e:
                if 'ADASYN is not suited for this specific dataset. Use SMOTE instead.' in str(
                        e):
                    # Seldomly occurs, therefore return performance dummy
                    if verbose:
                        print(
                            f'[WARNING]: {e}. Returning dummies. Parameters: ')
                        print(parameters)
                    para_estimator = delete_nonestimator_parameters(
                        para_estimator)
                    return _result()
                else:
                    raise

            else:
                pos = int(np.sum(y_train_temp))
                neg = int(len(y_train_temp) - pos)
                if pos < 10 or neg < 10:
                    if verbose:
                        print(
                            f'[WORC WARNING] Skipping resampling: to few objects returned in one or both classes (pos: {pos}, neg: {neg}).'
                        )
                    Sampler = None
                    parameters['Resampling_Use'] = 'False'

                else:
                    X_train = X_train_temp
                    y_train = y_train_temp

                    # Notify the user what the resampling did
                    pos = int(np.sum(y_train))
                    neg = int(len(y_train) - pos)
                    if verbose:
                        message = f"Resampling from {len_in} ({pos_initial} pos," +\
                            f" {neg_initial} neg) to {len(y_train)} ({pos} pos, {neg} neg) patients."
                        print(message)

                    # Also reset train and test indices
                    train = np.arange(0, len(y_train))
                    test = np.arange(len(y_train),
                                     len(y_train) + len(y_test))

        del para_estimator['Resampling_Use']
        del para_estimator['Resampling_Method']
        del para_estimator['Resampling_sampling_strategy']
        del para_estimator['Resampling_n_neighbors']
        del para_estimator['Resampling_k_neighbors']
        del para_estimator['Resampling_threshold_cleaning']
        del para_estimator['Resampling_n_cores']

    # Delete the object if we do not need to return it
    if not return_all:
        del Sampler

    # ----------------------------------------------------------------
    # Fitting and scoring
    # Only when using fastr this is an entry
    if 'Number' in para_estimator.keys():
        del para_estimator['Number']

    # For certainty, we delete all parameters again
    para_estimator = delete_nonestimator_parameters(para_estimator)

    # NOTE: This just has to go to the construct classifier function,
    # although it is more convenient here due to the hyperparameter search
    if type(y) is list:
        labellength = 1
    else:
        try:
            labellength = y.shape[1]
        except IndexError:
            labellength = 1

    if labellength > 1 and type(estimator) not in [
            RankedSVM, RandomForestClassifier
    ]:
        # Multiclass, hence employ a multiclass classifier for e.g. SVM, LR
        estimator.set_params(**para_estimator)
        estimator = OneVsRestClassifier(estimator)

    if verbose:
        print(f"Fitting ML method: {parameters['classifiers']}.")

    # Recombine feature values and label for train and test set
    feature_values = np.concatenate((X_train, X_test), axis=0)
    y = np.concatenate((y_train, y_test), axis=0)
    para_estimator = None

    try:
        ret = _fit_and_score(estimator, feature_values, y, scorers, train,
                             test, verbose, para_estimator, fit_params,
                             return_train_score=return_train_score,
                             return_parameters=return_parameters,
                             return_n_test_samples=return_n_test_samples,
                             return_times=return_times,
                             return_estimator=return_estimator,
                             error_score=error_score)
    except (ValueError, LinAlgError) as e:
        if type(estimator) == LDA:
            # ret still holds the dummy result built at the start
            if verbose:
                print(
                    f'[WARNING]: skipping this setting due to LDA Error: {e}.')
            return _result()
        else:
            raise

    # Add original parameters to return object
    ret.append(parameters)

    return _result()
# -------------- #Code Starts here #Import Libraries from sklearn.svm import LinearSVC from sklearn.feature_selection import SelectFromModel #Intiate the linear svc class by fitting on X_train and y_train as follows (save it as lsvc) #C:0.01, penalty = 'l1', dual = False, random_state =42 lsvc = LinearSVC(C=0.01, penalty='l1', dual=False, random_state=42) lsvc.fit(X_train, y_train) #Initiate SelectFromModel class on lsvc and set prefit as True, also name the result of this variable as model_2. model_2 = SelectFromModel(lsvc, prefit=True) #Create new_train_features and new_test_features using model_2 to transform on X_train and X_test respectively. new_train_features = model_2.transform(X_train) new_test_features = model_2.transform(X_test) #Initiate the SVC class and call it as classifier_2. classifier_2 = SVC() #Fit the SVC classifier on new_train_features and y_train and store it in clf_2. clf_2 = classifier_2.fit(new_train_features, y_train) #Use clf_2 to predict on new_test_features and save it as y_pred_new. y_pred_new = clf_2.predict(new_test_features) #Store the accuracy score of the model in the variable named as model2_score. model2_score = accuracy_score(y_test, y_pred_new) precision, recall, f_score, support = error_metric(y_test, y_pred_new,
# Banner line printed between the report sections.
separator = "==========================================================================================="

print(separator)
print("Shape of Dataset:", maldata.shape)
print(separator)

# The first 88 columns are used as features, column 88 as the target.
df = maldata
X = df.iloc[:, 0:88].values
y = df.iloc[:, 88].values

# Random Forest importance: fit the forest, then let SelectFromModel keep
# the columns it considers important (the forest is already fitted).
clf = RandomForestClassifier(random_state=0)
model = clf.fit(X, y)
select = SelectFromModel(model, prefit=True)
X_new = select.transform(X)

print("*Feature Selection*")
print("Shape before using feature selection:", X.shape)
print("Shape after feature selection:", X_new.shape)

# Feature indices ordered from most to least important.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
# print(importances)

# List the top-ranked features, one line per retained column.
print("Feature ranking:")
for f, feature_idx in enumerate(indices[:X_new.shape[1]]):
    print("%d. %s (%f)" %
          (f + 1, df.columns[feature_idx], importances[feature_idx]))

print(separator)

# Visualization feature
def RF():
    """Train a Random Forest on automatically selected features and show the results.

    Fits a forest on the global ``x_train``/``y_train``, keeps the features
    chosen by ``SelectFromModel``, trains a second, tuned forest on the
    reduced data, writes the predictions for ``New_data`` to a CSV file and
    prints the metrics into the global ``text`` widget.

    Side effects: rebinds the globals ``new_x_train``, ``new_x_test`` and
    ``new_data`` and writes the submission CSV.
    """
    global New_data, data_test
    global x_train, x_test, y_train, y_test
    global new_x_train, new_x_test, new_data
    text.delete('1.0', END)
    text.insert(END, "\t\t\t\tRandom Forest Classifier\n\n")

    # First forest: only used to rank and select features.
    clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
    clf = clf.fit(x_train, y_train)

    features = pd.DataFrame()
    features['Feature'] = x_train.columns
    features['Importance'] = clf.feature_importances_
    features.sort_values(by=['Importance'], ascending=False, inplace=True)
    features.set_index('Feature', inplace=True)
    text.insert(
        END,
        "Selected Important Features Automatically by using *feature_importances_* & *SelectFromModel*\n\n"
    )
    text.insert(END, features[:5])

    selector = SelectFromModel(clf, prefit=True)
    # Take the column labels from the selector itself. The number and order of
    # kept features depend on the fitted importances, so a hard-coded list
    # can fail (wrong length) or silently mislabel columns.
    selected_columns = x_train.columns[selector.get_support()]
    new_x_train = pd.DataFrame(selector.transform(x_train),
                               columns=selected_columns)
    new_x_test = pd.DataFrame(selector.transform(x_test),
                              columns=selected_columns)
    new_data = pd.DataFrame(selector.transform(New_data),
                            columns=selected_columns)

    # Second forest: the tuned model trained on the reduced feature set.
    parameters = {
        'bootstrap': False,
        'min_samples_leaf': 3,
        'n_estimators': 50,
        'min_samples_split': 10,
        'max_features': 'sqrt',
        'max_depth': 6
    }
    rf = RandomForestClassifier(**parameters)
    rf.fit(new_x_train, y_train)
    pred = rf.predict(new_x_test)
    acc = accuracy_score(y_test, pred)
    cm = confusion_matrix(y_test, pred)
    CR = classification_report(y_test, pred)

    # Predictions for the unlabeled data, mapped to the 'Y'/'N' labels.
    output = rf.predict(new_data).astype(int)
    df_output = pd.DataFrame()
    df_output['Loan_ID'] = data_test['Loan_ID']
    # np.where also copes with an empty prediction array, unlike np.vectorize.
    df_output['Loan_Predicted_Status'] = np.where(output == 1, 'Y', 'N')
    df_output[['Loan_ID', 'Loan_Predicted_Status'
               ]].to_csv('*****@*****.**', index=False)

    text.insert(END, "\n\nConfusion Matrix:\n" + str(cm) + "\n\n")
    text.insert(
        END,
        "Accuracy Score:\n" + str(np.round(acc * 100, 4)) + ' %' + "\n\n")
    text.insert(END, "Predicted Values on Test Data:\n" + str(pred) + "\n\n")
    text.insert(END, "Classification Report:\n" + str(CR))
    text.insert(END, "\n\nFinal Predicted values on New Data:\n\n")
    text.insert(END, df_output)
    text.insert(END,
                "\n\nCheck the Project Directory for Submission CSV file\n\n")
    text.insert(END, "@@@------------------Thank You--------------------@@@")
feats[feature] = importance importances = pd.DataFrame.from_dict( feats, orient='index').rename(columns={0: 'Gini-importance'}) Feature_Importance = importances.sort_values(by='Gini-importance') #print(Feature_Importance) feat_labels = df.loc[:, 'BTC':].columns #for feature in zip(feat_labels, regressor.feature_importances_): #print(feature) sfm = SelectFromModel(regressor, threshold=0.005) sfm.fit(X_train, y_train) #for feature_list_index in sfm.get_support(indices=True): #print(feat_labels[feature_list_index]) X_important_train = sfm.transform(X_train) X_important_test = sfm.transform(X_test) rfr_important = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1) rfr_important.fit(X_important_train, y_train) y_important_pred = rfr_important.predict(X_important_test) #print("Explained Variance 2:", explained_variance_score(y_test, y_important_pred)) #cross validation #cvscores_10 = cross_val_score(regressor, X, y, cv = 10) #print("CV Score",np.mean(cvscores_10)) #svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
# ============================================================================= # 2) 확장된 dataset을 SelectFromModel에 적용 m_rf = rf_c() m_select1 = SelectFromModel(m_rf, # 변수 중요도를 파악할 모델 명 전달 threshold='median') # 선택 범위 m_select1.fit(df_iris.data, df_iris.target) m_select1.get_support() m_select1.fit(df_iris_new, df_iris.target) m_select1.get_support() # 3) 선택된 변수의 dataset 추출 df_iris_new[:, m_select1.get_support()] # 중요변수 선택 후 dataset m_select1.transform(df_iris_new) # 4) 변수 중요도 확인 m_select1.estimator_.feature_importances_ # 2.2.2 변수선택 방법 2 : 일변량 통계 기법 # - 변수 하나와 종속변수와의 상관 관계 중심으로 변수 선택 # - 다른 변수가 함께 학습될때의 판단과는 다른 결과가 나올 수 있음 # - 학습 시킬 모델이 필요 없어 연산속도가 매우 빠름 from sklearn.feature_selection import SelectPercentile # 1) 변수 선택 모델 생성 및 적용 m_select2 = SelectPercentile(percentile=30) m_select2.fit(df_iris_new, df_iris.target) # 2) 변수 선택 결과 dataset 확인
def main():
    """Select important features from ``data.csv`` and compare classifiers.

    Reads the pipe-separated dataset, ranks features with an extra-trees
    classifier, keeps those chosen by ``SelectFromModel``, trains every
    candidate algorithm on the reduced data, and prints the scores, the
    winner and its false positive / false negative rates.
    """
    data = pd.read_csv('data.csv', sep='|')
    # Take the feature names from the same frame that produces X, so the
    # name lookup does not depend on where the dropped columns sit.
    feature_frame = data.drop(['Name', 'md5', 'legitimate'], axis=1)
    feature_names = feature_frame.columns
    X = feature_frame.values
    y = data['legitimate'].values
    print('Researching important feature based on %i total features\n' %
          X.shape[1])

    # Feature selection using Trees Classifier
    fsel = ske.ExtraTreesClassifier().fit(X, y)
    model = SelectFromModel(fsel, prefit=True)
    X_new = model.transform(X)
    nb_features = X_new.shape[1]

    # NOTE(review): test_size=0.9 trains on only 10% of the rows - confirm
    # this is intended.
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X_new, y, test_size=0.9)

    print('%i features identified as important:' % nb_features)
    # Indices of the nb_features most important features, best first.
    indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
    for rank, idx in enumerate(indices, start=1):
        print("%d. feature %s (%f)" %
              (rank, feature_names[idx], fsel.feature_importances_[idx]))

    # Names of the kept features in original column order (the order of the
    # columns of X_new); only needed when the feature list is saved below.
    features = [feature_names[idx] for idx in sorted(indices)]

    # Algorithm comparison. Every estimator is fitted in the loop below, so
    # none of them is pre-fitted here.
    algorithms = {
        "Toms Classifier rand":
        TomsClassifier(),
        "Toms Classifier const":
        TomsClassifier(random=False),
        "DecisionTree":
        sklearn.tree.DecisionTreeClassifier(max_depth=10),
        "RandomForest":
        ske.RandomForestClassifier(n_estimators=100),
        "GradientBoosting":
        ske.GradientBoostingClassifier(n_estimators=100),
        "AdaBoost":
        ske.AdaBoostClassifier(n_estimators=100),
        "Logistic Regression":
        LogisticRegression(random_state=0,
                           solver='lbfgs',
                           multi_class='multinomial'),
        "Gaussian Naive Bayes":
        GaussianNB(),
        "SVM":
        SVC(),
        "Perceptron":
        MLPClassifier(solver='lbfgs',
                      alpha=1e-5,
                      hidden_layer_sizes=(5, 2),
                      random_state=1),
        "Perceptron sgd":
        MLPClassifier(solver='sgd',
                      alpha=1e-2,
                      hidden_layer_sizes=(5, 2),
                      random_state=1),
        "Perceptron adam":
        MLPClassifier(solver='adam',
                      alpha=1e-5,
                      hidden_layer_sizes=(10, 10, 10, 10)),
    }

    results = {}
    print("\nNow testing algorithms")
    for algo, clf in algorithms.items():
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print('{:>25} : {}'.format(algo, score * 100))
        results[algo] = score

    winner = max(results, key=results.get)
    print('\nWinner algorithm is %s with a %f %% success' %
          (winner, results[winner] * 100))

    # # Save the algorithm and the feature list for later predictions
    # print('Saving algorithm and feature list in classifier directory...')
    # joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
    # open('classifier/features.pkl', 'wb').write(pickle.dumps(features))
    # print('Saved')

    # Identify false and true positive rates
    clf = algorithms[winner]
    res = clf.predict(X_test)
    mt = confusion_matrix(y_test, res)
    # Row 0 holds the true-negative class, row 1 the true-positive class.
    print("False positive rate : %f %%" %
          ((mt[0][1] / float(sum(mt[0]))) * 100))
    print('False negative rate : %f %%' %
          ((mt[1][0] / float(sum(mt[1])) * 100)))
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix

# Load the pipe-separated dataset; the first 41323 rows are sliced off as
# "Legit" and the remainder as "Malware" (both without the label column).
MalwareDataset = pd.read_csv('MalwareData.csv', sep='|')
Legit = MalwareDataset[0:41323].drop(['legitimate'], axis=1)
Malware = MalwareDataset[41323::].drop(['legitimate'], axis=1)
#print('[+] Number of important features is %i \n' % Legit.shape[1])

# Feature matrix (identifier and label columns removed) and label vector.
Data = MalwareDataset.drop(['Name', 'md5', 'legitimate'], axis=1).values
Target = MalwareDataset['legitimate'].values

# Rank features with extra trees and keep the ones SelectFromModel selects.
FeatSelect = ExtraTreesClassifier().fit(Data, Target)
Model = SelectFromModel(FeatSelect, prefit=True)
Data_new = Model.transform(Data)

# NOTE: despite the names, train_test_split returns (X_train, X_test,
# y_train, y_test): Legit_* hold feature rows and Malware_* hold labels.
Legit_Train, Legit_Test, Malware_Train, Malware_Test = train_test_split(Data_new, Target ,test_size=0.2)
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
clf.fit(Legit_Train, Malware_Train)
score = clf.score(Legit_Test, Malware_Test)
print("[+] model accuracy score of Random Forest Algorithm is: {}%".format(score*100))

# Confusion matrix on the held-out split: row 0 is true class 0, row 1 is
# true class 1.
Result = clf.predict(Legit_Test)
CM = confusion_matrix(Malware_Test, Result)
print("[+] False positive rate : %f %%" % ((CM[0][1] / float(sum(CM[0])))*100))
print('[+] False negative rate : %f %%' % ( (CM[1][0] / float(sum(CM[1]))*100)))
# with open('feature_model.pickle', 'wb') as fp: # pickle.dump(clf, fp) with open('feature_model.pickle') as fp: clf = pickle.load(fp) feature = pd.DataFrame() feature['feature'] = train_set.columns feature['importance'] = clf.feature_importances_ feature.sort_values(by=['importance'], ascending=True, inplace=True) feature.set_index('feature', inplace=True) # feature.plot(kind='barh', figsize=(20, 20)) # plt.savefig('figure1.png') model = SelectFromModel(clf, prefit=True, threshold=0.1) train_reduced = model.transform(train_set) test_reduced = model.transform(test_set) print 'Dimension after Feature Selection: ', train_reduced.shape[1] header = feature.index.tolist()[::-1][:train_reduced.shape[1]] header.append('label') train_reduced = np.concatenate( [train_reduced, np.array(train_labels).reshape((-1, 1))], axis=1) test_reduced = np.concatenate( [test_reduced, np.array(test_labels).reshape((-1, 1))], axis=1) pd.DataFrame(train_reduced).to_csv('dataset/kddcup.train.data.reduced.csv', index=False, header=header) pd.DataFrame(test_reduced).to_csv('dataset/kddcup.test.data.reduced.csv',
# 加载数据 cancer = load_breast_cancer() # 获得确定性的随机数 rng = np.random.RandomState(42) noise = rng.normal(size=(len(cancer.data), 50)) # 添加噪声 X_w_noise = np.hstack([cancer.data, noise]) X = X_w_noise y = cancer.target X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) # 模型训练 select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold='median') select.fit(X_train, y_train) X_train_l1 = select.transform(X_train) # print(X_train.shape) # print(X_train_l1.shape) X_test_l1 = select.transform(X_test) score = LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test) print(score) # 可视化 mask = select.get_support() plt.matshow(mask.reshape(1, -1), cmap='gray_r') plt.xlabel('sample index') plt.show()
# Hold out a validation split of the data.
X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
    X, y, test_size=validation_size, random_state=seed)
X_train.shape

# ### L1-based feature selection
# Our dataset contains a lot of features (216, to be specific).
#
# Some features are collinear, so we can and must transform our data. To do that, I chose to use an L1-based feature selection method.
#
# It is important to note that a smaller C results in fewer features being selected.

# In[82]:

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train)
modellsvc = SelectFromModel(lsvc, prefit=True)
X_train_new = modellsvc.transform(X_train)
X_train_new.shape

# ### Select a classifier
# We will evaluate six classifiers to choose the best model for classifying our validation data. The selection criterion is the accuracy of the model on the training data.
#
# We use cross-validation (k-fold with k = 10) to evaluate the models.

# In[83]:

# (name, estimator) pairs to be evaluated.
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# The array holds every predictor column followed by the "Survived" label, so
# slice relative to the last column instead of hard-coding the predictor count.
array_FS = titanic_train[predictors + ["Survived"]].values
X_FS = array_FS[:, :-1]
Y_FS = array_FS[:, -1]

# Rank the predictors with an extra-trees classifier.
model = ExtraTreesClassifier()
model.fit(X_FS, Y_FS)
features = pandas.DataFrame()
features['feature'] = predictors
features['importance'] = model.feature_importances_
print(predictors)
print(model.feature_importances_)
# DataFrame.sort() was removed from pandas and its result was discarded
# anyway; sort in place so `features` really is ordered by importance.
features.sort_values(by=['importance'], ascending=False, inplace=True)

# Keep only the predictors SelectFromModel selects from the fitted model.
model_tr = SelectFromModel(model, prefit=True)
train_new = model_tr.transform(titanic_train[predictors])
train_new.shape
test_new = model_tr.transform(titanic_test[predictors])
test_new.shape

# Grid-search a random forest on the reduced training data.
forest = RandomForestClassifier()
parameter_grid = {
    'max_depth': [4, 5, 6, 7, 8],
    'n_estimators': [200, 210, 220, 230, 240, 250, 260, 270, 280, 290],
    'criterion': ['gini', 'entropy']
}
# NOTE(review): StratifiedKFold(y, n_folds=...) is the signature of the old
# sklearn.cross_validation module - confirm which module is imported.
cross_validation = StratifiedKFold(Y_FS, n_folds=10)
grid_search = GridSearchCV(forest,
                           param_grid=parameter_grid,
                           cv=cross_validation)
grid_search.fit(train_new, Y_FS)
mse = mean_squared_error(y_test, y_pred) #, multioutput='raw_values') r2 = r2_score(y_test, y_pred) #, multioutput='raw_values') ONE_MEGABYTE = 1048576 print("Prediction score (MAE): %.2f" % (mae / ONE_MEGABYTE)) print("Prediction score (MSE): %.2f" % (mse / ONE_MEGABYTE)) print("Prediction score (R2): %.2f" % (r2)) # In[21]: from sklearn.ensemble import ExtraTreesClassifier from sklearn.feature_selection import SelectFromModel model = SelectFromModel(clf, prefit=True) tuxdata_reduced = model.transform(tuxdata.drop(columns=size_methods)) tuxdata_reduced.shape, tuxdata.shape # In[22]: #lass = SelectFromModel(LassoCV(tol = 0.001)) #lass.fit(X_train, y_train) #tuxdata_reduced_lass = lass.transform(tuxdata.drop(columns=size_methods)) #tuxdata_reduced_lass.shape, tuxdata.shape # In[23]: ft_vals = ['y', 'n'] tri_state_values = ['y', 'n', 'm'] all(x in tri_state_values for x in ft_vals)
clf_mri.fit(scaled_X, labels_train) # Create a selector object that will use the random forest classifier to identify # features that have an importance of more than median sfm_mri = SelectFromModel(clf_mri, threshold="mean") # Train the selector sfm_mri.fit(scaled_X, labels_train) # Collect the feature with importance anatomy = [] for feature_list_index in sfm_mri.get_support(indices=True): anatomy.append(data_train.columns[feature_list_index]) # Transform the data to create a new dataset containing only the most important features # to both the training X and test X data. X_important_train = sfm_mri.transform(scaled_X) X_important_test = sfm_mri.transform(scaled_test) # Create a new random forest classifier for the most important features clf_mri_features = RandomForestClassifier(n_estimators=10000, random_state=1988, oob_score=True, n_jobs=-1) # Train the new classifier on the new dataset containing the most important features clf_mri_features.fit(X_important_train, labels_train) from sklearn.metrics import accuracy_score # Apply the full featured classifier to the Test Data y_important_pred = clf_mri_features.predict(X_important_test) # View the Accuracy of the Limited Feature Model
np.std(results['test_accuracy']) * 2)) print("precision score: {0:.2%} (+/- {1:.2%})".format( np.mean(results['test_precision']), np.std(results['test_precision']) * 2)) print("recall score: {0:.2%} (+/- {1:.2%})".format( np.mean(results['test_recall']), np.std(results['test_recall']) * 2)) print("f1_score: {0:.2%} (+/- {1:.2%})".format( np.mean(results['test_f1_score']), np.std(results['test_f1_score']) * 2)) # plot feature importance plot_importance(model) pyplot.show() """# TRAIN SELECTED FEATURES""" thresholds = sort(model.feature_importances_) for thresh in thresholds: # select features using threshold selection = SelectFromModel(model, threshold=thresh, prefit=True) select_X_train = selection.transform(X_train) # train model selection_model = XGBClassifier() selection_model.fit(select_X_train, y_train) # eval model select_X_test = selection.transform(X_test) y_pred = selection_model.predict(select_X_test) predictions = [round(value) for value in y_pred] accuracy = accuracy_score(y_test, predictions) print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy * 100.0))
from sklearn import tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

# Load the pipe-separated dataset and split off identifiers and the label.
data = pd.read_csv('data.csv', sep='|')
X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
y = data['legitimate'].values

print('Researching important feature based on %i total features\n' % X.shape[1])

# Feature selection using Trees Classifier
fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]

#X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_new, y ,test_size=0.2)

features = []

print('%i features identified as important:' % nb_features)

# Indices of the nb_features most important features, best first.
indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
    # The +2 offset maps a column index of X back to a column of `data`;
    # it assumes 'Name' and 'md5' are the first two columns - TODO confirm.
    print("%d. feature %s (%f)" % (f + 1, data.columns[2+indices[f]], fsel.feature_importances_[indices[f]]))

# XXX : take care of the feature order
# Collect the same feature names again, this time in original column order.
for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(data.columns[2+f])
def xgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    """Select features with an XGBoost model and write importance reports.

    Fits an ``XGBClassifier`` on the input, keeps the features whose
    importance passes ``th`` via ``SelectFromModel``, prints the importance
    ranking, and writes it to ``../eda/xgb_feature_importance.txt``, the
    chosen names to ``../eda/xgb_feature_chose.txt`` and a '0'/'1'
    chromosome string to ``../config/chromosome.pkl``.

    :param fe_name: Sequence of feature names, paired positionally with the
        model's feature importances.
    :param matrix_x_temp: Feature matrix before selection.
    :param label_y: Target labels.
    :param th: Threshold passed to ``SelectFromModel``.
    :return: Tuple ``(matrix_x, feature_not_used_name, n_used)``: the reduced
        matrix, the names not chosen, and the number of chosen names.
    """
    # SelectFromModel on a fitted XGBoost classifier
    clf = XGBClassifier(n_estimators=50)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Print how many features have a non-zero importance.
    feature_score_dict = dict(zip(fe_name, clf.feature_importances_))
    zero_count = sum(1 for score in feature_score_dict.values()
                     if score == 0.0)
    print('number of not-zero features:' +
          str(len(feature_score_dict) - zero_count))

    # Print the feature importances, most important first.
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1],
                                       reverse=True)
    print('xgb_feature_importance:')
    for name, score in feature_score_dict_sorted:
        print('%s %s' % (name, score))
    print('\n')

    # `with` closes the report even if a write fails.
    with open('../eda/xgb_feature_importance.txt', 'w') as f:
        f.write('Rank\tFeature Name\tFeature Importance\n')
        for rank, (name, score) in enumerate(feature_score_dict_sorted):
            f.write(str(rank) + '\t' + str(name) + '\t' + str(score) + '\n')

    # Names of the features actually used: the top-ranked names, as many as
    # there are columns in the reduced matrix.
    how_long = matrix_x.shape[1]
    feature_used_name = [
        name for name, _ in feature_score_dict_sorted[:how_long]
    ]
    print('feature_chooesed:')
    for name in feature_used_name:
        print(name)
    print('\n')

    with open('../eda/xgb_feature_chose.txt', 'w') as f:
        f.write('Feature Chose Name :\n')
        for name in feature_used_name:
            f.write(str(name) + '\n')

    # Names that were not used; a set makes each membership test O(1).
    used_names = set(feature_used_name)
    feature_not_used_name = [
        name for name in fe_name if name not in used_names
    ]

    # Build a chromosome such as '01011100': one bit per feature name.
    # NOTE(review): the last entry of fe_name is skipped here - presumably it
    # is not a real feature (e.g. the label); confirm against the caller.
    chromosome_temp = ''.join('1' if name in used_names else '0'
                              for name in fe_name[:-1])
    print('Chromosome:')
    print(chromosome_temp)
    joblib.dump(chromosome_temp, '../config/chromosome.pkl')
    print('\n')
    return matrix_x, feature_not_used_name[:], len(feature_used_name)
# confusion matrix print('confusion_matrix') print(pd.DataFrame(confusion_matrix(y_test, pred))) model = model.best_estimator_ n = 0 b_acc = acc thresholds = np.sort(model.feature_importances_) for thresh in thresholds: selection = SelectFromModel(model, threshold=thresh, prefit=True) select_x_train = selection.transform(x_train) selection_model = XGBClassifier() selection_model.fit(select_x_train,y_train) select_x_test = selection.transform(x_test) y_predict = selection_model.predict(select_x_test) acc = selection_model.score(select_x_test,y_test) acc_score = accuracy_score(y_test,y_predict) if acc > b_acc: n = select_x_train.shape[1] b_acc = acc L_selection = selection print("Thresh=%.3f, n=%d, acc: %.15f%%, acc_score: %.15f%%"%(thresh,select_x_train.shape[1],acc,acc_score))
# Cluster the training data into two groups and cross-tabulate the cluster
# labels against the actual classes.
kmeans = KMeans(n_clusters=2)
kmeans.fit(x_train)
labels = kmeans.predict(x_train)
a = np.array(y_train)
df = pd.DataFrame({'Labels': labels, 'Actual': a.flatten()})
ct = pd.crosstab(df['Labels'], df['Actual'])
print(ct)

from sklearn.feature_selection import SelectFromModel

# Keep the features whose random-forest importance is at least the median.
select = SelectFromModel(RandomForestClassifier(max_depth=9),
                         threshold='median')
select.fit(x_train, y_train)
x_train_l1 = select.transform(x_train)
print(x_train.shape)
print(x_train_l1.shape)

# Show the selection mask, one cell per feature.
mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.yticks([0])
# 59 of the 117 features were selected.

dtc3 = DecisionTreeClassifier(max_depth=7)
dtc3.fit(x_train_l1, y_train)
# Cross-validate on the selected features; scoring the full x_train here
# would ignore the feature selection this attempt is meant to evaluate.
dtcscores = cross_val_score(dtc3, x_train_l1, y_train, cv=5)
print("DecisionTreeClassifier Cross Validation Attempt 3: " + str(dtcscores))
# Print the final results
#Read in data bid = pd.read_csv("full_features.csv", index_col=0) bid = bid.drop(["address", "payment_account"], axis=1) bid = bid[(bid.outcome==0) | (bid.numbids > 10)] test = pd.read_csv("full_features_test.csv", index_col=0) test = test.drop("address", axis=1) X = bid.iloc[:,2:] Y = bid.iloc[:,1] testX = test.iloc[:,2:] testY = test.iloc[:,1] #SVM lsvc = LinearSVC(C=1, penalty="l1", dual=False).fit(X, Y) model = SelectFromModel(lsvc, prefit=True) X_new = model.transform(X) testX = model.transform(testX) #Random Forest print 'Random Forest' algo_rf = RandomForestClassifier(280) algo_rf.fit(X_new,Y) hyp = algo_rf.predict(X_new) kfold = KFold(n_splits=20, shuffle=True, random_state=200) score = cross_val_score(algo_rf, X_new, Y, cv=kfold, scoring="roc_auc") preds = algo_rf.predict_proba(X_new) print "On Train: ", metrics.roc_auc_score(Y, preds[:,1]) print "Cross-Val: ", np.mean(score) #Get test prediction and write to csv for Kaggle evaluation
print("%s) %f" % (feature_index[i], feature_influence[feature_index[i]])) newlist.append(feature_influence[feature_index[i]]) np.cumsum(newlist) print(count) yf_pred = forest1.predict(Xd_test) print('Accuracy: {:.3f}'.format(accuracy_score(yd_test, yf_pred))) print('Accuracy: {:.2f}%'.format(accuracy_score(yd_test, yf_pred) * 100)) # Accuracy: 96.48% # using sklearn SelectFromModel ## to select important features feature_select = SelectFromModel(forest1, threshold=0.0100, prefit=True) features_selected = feature_select.transform(Xd_train) print('Number meeting threshold criterion:', features_selected.shape[1]) print("{} {}".format('Feature Number', 'Percentage Influence')) for t in range(features_selected.shape[1]): print("{0:>12} {1:>11.02f}%".format(feature_index[t], feature_influence[feature_index[t]])) for t in range(features_selected.shape[1]): print("{0:>3}) {1:^10.04f}".format(feature_index[t], feature_influence[feature_index[t]]))