def perform_rfe(model, train, test, filename, to_remove=None):
    if to_remove is None:
        to_remove = floor(0.3 * len(train.columns))
    X = train.drop(TARGET, axis=1)
    y = train[TARGET]
    model.fit(X, y)
    preds = model.predict_proba(test)[:, 1]
    build_results_csv(filename, X.columns,
                      send_submission("doesnt_matter.csv", preds),
                      create_file=True)
    sleep(3)
    for i in range(to_remove):
        rfe = RFE(model, n_features_to_select=len(X.columns) - 1).fit(X, y)
        preds = rfe.predict_proba(test)[:, 1]
        X = X.iloc[:, rfe.get_support()]
        test = test.iloc[:, rfe.get_support()]
        results = build_results_csv(filename, X.columns,
                                    send_submission("doesnt_matter.csv", preds))
        sleep(3)
    return results
def SelectFeatureByRFE(self):
    rfe_selector = RFE(estimator=LogisticRegression(),
                       n_features_to_select=self.k, step=10, verbose=5)
    rfe_selector.fit(self.X.values, self.y)
    rfe_indices = rfe_selector.get_support(indices=True)
    rfe_mask = rfe_selector.get_support()
    save_feat = [self.X.columns[i] for i in rfe_indices]
    return save_feat, rfe_mask
def select_features_RFE_LR(X, y, columns, iteration):
    selection = RFE(estimator=LogisticRegression(max_iter=iteration)).fit(X, y)
    selected_features = np.array(columns)[selection.get_support()]
    print("Features selected by RFE from logistic regression: {}".format(
        selected_features))
    return selected_features
def feature_selection_LR():
    # Note: despite the "_LR" suffix, this selector wraps a RandomForestClassifier.
    from sklearn.feature_selection import RFE
    rfe_selector = RFE(estimator=RandomForestClassifier(),
                       n_features_to_select=30, step=5, verbose=5)
    rfe_selector.fit(X_train_scaled, y_train)
    y_pred = rfe_selector.predict(X_test_scaled)
    y_predprob = rfe_selector.predict_proba(X_test_scaled)[:, 1]
    rfe_support = rfe_selector.get_support()
    rfe_feature = X_train[predictors].loc[:, rfe_support].columns.tolist()
    print(str(len(rfe_feature)), 'selected features')
    print('RFE features')
    print(rfe_feature)

    # Print model report:
    print("\nModel Report")
    # print("Train Accuracy : %.4g" % metrics.accuracy_score(y_train, y_pred_train))
    print("Test Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
    # print('Train error: {:.3f}'.format(1 - metrics.accuracy_score(y_train, y_pred_train)))
    print('Test error: {:.3f}'.format(1 - metrics.accuracy_score(y_test, y_pred)))
    print("AUC Score : %f" % metrics.roc_auc_score(y_test, y_predprob))
    print("Recall : %f" % metrics.recall_score(y_test, y_pred))
    print("Precision : %f" % metrics.precision_score(y_test, y_pred))
    print("F-measure : %f" % metrics.f1_score(y_test, y_pred))
    c_matrix = metrics.confusion_matrix(y_test, y_pred)
    print('========Confusion Matrix==========')
    print("          Rejected  Accepted")
    print('Rejected  {}        {}'.format(c_matrix[0][0], c_matrix[0][1]))
    print('Accepted  {}        {}'.format(c_matrix[1][0], c_matrix[1][1]))
def do_learning(X_training, Y_training, X_test, Y_test, reference_dic,
                model_class):
    '''
    credit: Juan Arroyo-Miranda & Dani Alcala

    With training and testing data, select the best features with the
    recursive feature elimination method, then fit a classifier and return
    a tuple containing the predicted values on the test data and a list of
    the best features used.
    '''
    model = model_class
    # Recursive Feature Elimination
    rfe = RFE(model)
    rfe = rfe.fit(X_training, Y_training)
    best_features = rfe.get_support(indices=True)
    best_features_names = [reference_dic[i] for i in best_features]
    predicted = rfe.predict(X_test)
    expected = Y_test
    accuracy = accuracy_score(expected, predicted)
    return (expected, predicted, best_features_names, accuracy)
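# A minimal, self-contained usage sketch for do_learning above; the synthetic
# data, the index-to-name reference_dic, and the LogisticRegression estimator
# are illustrative assumptions, not part of the original snippet (RFE and
# accuracy_score are assumed imported, as in the snippet's own context).
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_all, y_all = make_classification(n_samples=200, n_features=8, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, random_state=0)
ref = {i: 'feat_{}'.format(i) for i in range(8)}
expected, predicted, best_names, acc = do_learning(
    X_tr, y_tr, X_te, y_te, ref, LogisticRegression(max_iter=1000))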
def feature_selection(X_res, y_res, xcol, FEATURE_NUM):
    ###################### feature selections ###########################
    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FEATURE SELECTION >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
    rfc = RandomForestClassifier()
    # fit random forest classifier on the training set
    y_res = y_res.reshape(-1, 1)  # reshape the labels
    rfc.fit(X_res, y_res)
    # extract important features
    score = np.round(rfc.feature_importances_, 3)
    importances = pd.DataFrame({'feature': xcol, 'importance': score})
    importances = importances.sort_values('importance',
                                          ascending=False).set_index('feature')
    # plot importances
    plt.rcParams['figure.figsize'] = (11, 4)
    importances.plot.bar()
    # create the RFE model and select FEATURE_NUM attributes
    rfe = RFE(rfc, n_features_to_select=FEATURE_NUM)
    rfe = rfe.fit(X_res, y_res)
    # summarize the selection of the attributes
    feature_map = [(i, v) for i, v in
                   itertools.zip_longest(rfe.get_support(), xcol)]
    selected_features = [v for i, v in feature_map if i]
    return selected_features
def RFE(self):
    X_norm = MinMaxScaler().fit_transform(self.df_X)
    rfe_selector = RFE(estimator=LogisticRegression(),
                       n_features_to_select=10, step=10, verbose=5)
    rfe_selector.fit(X_norm, self.y)
    self.rfe_support = rfe_selector.get_support()
    self.rfe_feature = list(self.df_X.loc[:, self.rfe_support].columns)
    # index the full boolean mask by all column names so lengths match
    return pd.DataFrame(self.rfe_support, index=self.df_X.columns,
                        columns=['selected'])
def feature_selection_RFE(self, data_table, X, y, input_file):
    no_best_features = int(X.shape[1] / 10)
    # This will call the svc parameter tuning function of MLTwithPython3
    # svc_best_model = MLT.Execute_machine_learning().parameter_tuning_SVM(X, y, 5)
    class_weight_values = class_weight.compute_class_weight(
        'balanced', classes=np.unique(y), y=y)
    # class weights into dictionary format
    class_weights = dict(zip(np.unique(y), class_weight_values))
    RF = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1,
                                class_weight=class_weights)
    recursive_feature_selection = RFE(
        estimator=RF, n_features_to_select=no_best_features).fit(X, y)
    idxs_selected = recursive_feature_selection.get_support(indices=True)
    selected_data_table = Execute_feature_selection(
    ).generate_dataframe_feature_selection(idxs_selected, data_table)
    selected_data_table.to_csv("Feature_selection_RF_RFE_" + str(input_file),
                               index=False)
def select_from_tree_recursively(x_data, y_data, select_k):
    print("Applying tree derived importance filter")
    print(f"cat variables before tree derived recursive importance filter "
          f"{x_data.select_dtypes(include='object').shape}")
    print(f"num variables before tree derived recursive importance filter "
          f"{x_data.select_dtypes(include='number').shape}")
    num_cols = x_data.select_dtypes(include='number').columns
    temp = x_data[num_cols]
    # use the requested number of features instead of a hard-coded 10
    select_ = RFE(estimator=RandomForestRegressor(n_estimators=100),
                  n_features_to_select=select_k)
    select_.fit(temp, y_data)
    cols_to_keep = temp.columns[select_.get_support()]
    cols_to_drop = [x for x in num_cols if x not in cols_to_keep]
    x_data.drop(labels=cols_to_drop, axis=1, inplace=True)
    print(f"cat variables after tree derived recursive importance filter "
          f"{x_data.select_dtypes(include='object').shape}")
    print(f"num variables after tree derived recursive importance filter "
          f"{x_data.select_dtypes(include='number').shape}")
    return x_data
def feature_select(x_train, y_train, method='iv', kb=100, rfe=30):
    if method == 'iv':
        method = mutual_info_classif
    elif method == 'f':
        method = f_classif

    # chi2
    fn = x_train.columns
    selector1 = SelectKBest(chi2, k=kb)
    selector1.fit(x_train, y_train)

    # information value
    selector2 = SelectKBest(method, k=kb)
    selector2.fit(x_train, y_train)

    left_features = list(
        set(fn[selector2.get_support()].tolist() +
            fn[selector1.get_support()].tolist()))

    # RFE
    _X_tmp = x_train[left_features]
    fn = _X_tmp.columns
    clf = LogisticRegression(penalty='l2', C=0.2)
    selector = RFE(estimator=clf, n_features_to_select=rfe)
    selector.fit(_X_tmp, y_train)
    left_features = fn[selector.get_support()].tolist()
    x_train = x_train[left_features]
    return left_features
def in46():
    from sklearn.feature_selection import RFE
    from sklearn.ensemble import RandomForestClassifier
    select = RFE(RandomForestClassifier(n_estimators=100, random_state=42),
                 n_features_to_select=40)

    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    cancer = load_breast_cancer()
    rng = np.random.RandomState(42)
    noise = rng.normal(size=(len(cancer.data), 50))
    # print(cancer.data.shape)  # (569, 30)
    x_w_noise = np.hstack([cancer.data, noise])
    x_train, x_test, y_train, y_test = train_test_split(x_w_noise,
                                                        cancer.target,
                                                        random_state=0,
                                                        test_size=0.5)
    select.fit(x_train, y_train)
    mask = select.get_support()
    plt.matshow(mask.reshape(1, -1), cmap='gray_r')
    plt.xlabel('sample index')
    plt.show()
    x_train_rfe = select.transform(x_train)
    x_test_rfe = select.transform(x_test)

    from sklearn.linear_model import LogisticRegression
    print(LogisticRegression().fit(x_train, y_train).score(x_test, y_test))
    print(LogisticRegression().fit(x_train_rfe, y_train).score(x_test_rfe, y_test))
def select_features(X, y, n_features, attributes, folder):
    string_attributes = '-'.join(attributes)
    file_features = (f"{folder}/features/selected_features_"
                     + string_attributes + "_" + str(n_features) + ".pkl")
    if os.path.exists(file_features):
        with open(file_features, "rb") as f:
            selected_features = pickle.load(f)
    else:
        estimator = DecisionTreeClassifier()
        rfe = RFE(estimator=estimator, n_features_to_select=n_features)
        rfe.fit(X, y)
        # alternatively RFECV
        # rfe = RFECV(estimator=DecisionTreeClassifier(), step=0.01,
        #             scoring='accuracy', min_features_to_select=10)
        # rfe.fit(X, y)
        # return the selected features
        selected_features = X.columns[rfe.get_support()]
        # save selected features to avoid retraining again
        os.makedirs(f'{folder}/features', exist_ok=True)
        with open(file_features, "wb") as f:
            pickle.dump(selected_features, f)
    return selected_features
def RFE_filter(df: DataFrame, y: Series, col_list: List, estimator: Any,
               keep: float = 0.5, step: int = 1) -> List:
    """
    Recursive feature elimination.

    :param df: feature DataFrame
    :param y: target Series
    :param col_list: candidate feature columns
    :param estimator: the estimator to use
    :param keep: number (int) or proportion (float) of features to keep
    :param step: number of features removed per iteration
    :return: list of selected feature names
    """
    if keep >= 1 and isinstance(keep, float):
        raise Exception('When keep >= 1, please pass an integer')
    if isinstance(keep, float):
        keep = int(np.ceil(len(col_list) * keep))  # RFE expects an int here
    selector = RFE(estimator, n_features_to_select=keep, step=step)
    selector = selector.fit(df[col_list], y)
    mask = selector.get_support()
    res = np.array(col_list)[mask].tolist()
    return res
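# A minimal, self-contained sketch of calling RFE_filter above; the toy
# DataFrame and the LogisticRegression estimator are illustrative assumptions.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_arr, y_arr = make_classification(n_samples=100, n_features=6, random_state=0)
toy_df = pd.DataFrame(X_arr, columns=['f{}'.format(i) for i in range(6)])
kept = RFE_filter(toy_df, pd.Series(y_arr), list(toy_df.columns),
                  LogisticRegression(max_iter=500), keep=0.5)
print(kept)  # three of the six columns survive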
def wrapper(self, X, y, k='all'):
    """
    Wrapper documentation for RFE:
    http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
    Normalization: depends on the model used; yes for LR
    Impute missing values: depends on the model used; yes for LR
    """
    X_norm = MinMaxScaler().fit_transform(X)
    if k != 'all':
        if k > len(X.columns.tolist()):
            raise ValueError(
                "Number of selected features (k) greater than total features")
    else:
        k = len(X.columns.tolist())
    rfe_selector = RFE(estimator=LogisticRegression(),
                       n_features_to_select=k, step=10, verbose=0)
    rfe_selector.fit(X_norm, y)
    rfe_support = rfe_selector.get_support()
    rfe_feature = X.loc[:, rfe_support].columns.tolist()
    print(str(len(rfe_feature)), 'selected features')
    return rfe_support, rfe_feature, rfe_selector
def get_top_features(train_x, train_Y):
    RFC = RandomForestClassifier()
    rfe = RFE(RFC, n_features_to_select=50)
    rfe = rfe.fit(train_x, train_Y)
    feature_map = [(i, v) for i, v in
                   itertools.zip_longest(rfe.get_support(), train_x.columns)]
    selected_features = [v for i, v in feature_map if i]
    return selected_features
def rf_rfe(df):
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    model = RandomForestClassifier()
    # keep half of the features (integer division: RFE needs an int)
    rfe = RFE(model, n_features_to_select=len(X.columns) // 2)
    rfe.fit(X, y)
    return X.columns.values[rfe.get_support()].tolist()
def df5():
    from sklearn.feature_selection import RFE
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    import time
    from sklearn.linear_model import LogisticRegression

    select = RFE(RandomForestClassifier(n_estimators=100, random_state=42),
                 n_features_to_select=40)
    cancer = load_breast_cancer()
    rng = np.random.RandomState(42)
    noise = rng.normal(size=(len(cancer.data), 50))
    X_w_noise = np.hstack([cancer.data, noise])
    X_train, X_test, y_train, y_test = train_test_split(X_w_noise,
                                                        cancer.target,
                                                        random_state=0,
                                                        test_size=.5)
    start_time = time.time()
    select.fit(X_train, y_train)
    print("Estimated execution time: {} seconds".format(time.time() - start_time))
    X_train_rfe = select.transform(X_train)
    X_test_rfe = select.transform(X_test)
    score = LogisticRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test)
    print("Score: {:.3f}".format(score))
    mask = select.get_support()
    plt.matshow(mask.reshape(1, -1), cmap='gray_r')
    plt.xlabel("Sample index")
    plt.show()
class RFE_RandomForestRegPrim(primitive):
    def __init__(self, random_state=0):
        super(RFE_RandomForestRegPrim, self).__init__(name='RFE_RandomForestReg')
        self.id = 44
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = ("Feature ranking with recursive feature elimination "
                            "with a Random-Forest regressor. Given an external estimator that assigns "
                            "weights to features (e.g., the coefficients of a linear model), the goal of "
                            "recursive feature elimination (RFE) is to select features by recursively "
                            "considering smaller and smaller sets of features. First, the estimator is "
                            "trained on the initial set of features and the importance of each feature is "
                            "obtained either through a coef_ attribute or through a feature_importances_ "
                            "attribute. Then, the least important features are pruned from the current set "
                            "of features. That procedure is recursively repeated on the pruned set until "
                            "the desired number of features to select is eventually reached.")
        self.hyperparams_run = {'default': True}
        self.selector = RFE(RandomForestRegressor())
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        mask = self.selector.get_support(indices=False)
        final_cols = list(compress(cols, mask))
        output['X'] = pd.DataFrame(self.selector.transform(output['X']),
                                   columns=final_cols)
        final_output = {0: output}
        return final_output
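# A minimal, self-contained sketch of the procedure the description above
# refers to: RFE refits the estimator and prunes the least important features
# (read from feature_importances_ for a random forest) until the target count
# remains. The toy regression data is an illustrative assumption.
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

X_toy, y_toy = make_regression(n_samples=120, n_features=8, n_informative=3,
                               random_state=0)
sel = RFE(RandomForestRegressor(n_estimators=50, random_state=0),
          n_features_to_select=3).fit(X_toy, y_toy)
print(sel.ranking_)  # rank 1 marks the surviving features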
def RFE_selector(estimator, n_features_to_select, X_data, Y_data):
    columns = X_data.columns
    selector = RFE(estimator=estimator,
                   n_features_to_select=n_features_to_select)
    # fit once and reuse the transformed matrix (the original called
    # fit_transform twice, refitting the selector needlessly)
    transformed = selector.fit_transform(X_data, Y_data)
    labels = [columns[x] for x in selector.get_support(indices=True)]
    feature = pd.DataFrame(transformed, columns=labels)
    return feature
def stepwise_recur_select(data_df, target_df, model, step_val=0.1, k_vals=[]):
    col_names = list(data_df.columns.values)
    data_np = data_df.values
    target_np = target_df.values
    scorers = {'Accuracy': 'accuracy', 'roc_auc': 'roc_auc'}
    auc_results = []
    acc_results = []
    feature_select = []
    auc_std_results = []
    acc_std_results = []
    for k_val in k_vals:
        sel = RFE(model, n_features_to_select=k_val, step=step_val)
        data_np_fs = sel.fit_transform(data_np, target_np)
        scores = cross_validate(model, data_np_fs, target_np,
                                scoring=scorers, cv=5)
        auc_score = scores['test_roc_auc'].mean()
        acc_score = scores['test_Accuracy'].mean()
        auc_results = np.append(auc_results, auc_score)
        acc_results = np.append(acc_results, acc_score)
        auc_std = scores['test_roc_auc'].std() * 2
        acc_std = scores['test_Accuracy'].std() * 2
        auc_std_results = np.append(auc_std_results, auc_std)
        acc_std_results = np.append(acc_std_results, acc_std)
        feature_select.append(sel.get_support())
    optimal_ndx = np.where(auc_results == auc_results.max())[0]
    if len(optimal_ndx) > 1:
        optimal_ndx = optimal_ndx[0]
    else:
        optimal_ndx = int(optimal_ndx)
    sel_idx = feature_select[int(optimal_ndx)]
    keep_cols, del_cols = get_keep_del_cols(col_names, sel_idx)
    print('-- Stepwise Recursive Feature Selection --')
    print('K Selected: {}'.format(k_vals[optimal_ndx]))
    print('Selected Model Mean Accuracy Score: {}'.format(acc_results[optimal_ndx]))
    print('Selected Model Accuracy Deviation: {}'.format(acc_std_results[optimal_ndx]))
    print('Selected Model Mean AUC Score: {}'.format(auc_results[optimal_ndx]))
    print('Selected Model AUC Deviation: {}'.format(auc_std_results[optimal_ndx]))
    print('Number of Original Features: {}'.format(len(col_names)))
    print('Number of Selected Features: {}'.format(len(keep_cols)))
    print('Features Selected:')
    print(keep_cols)
    print('Features Removed:')
    print(del_cols)
    new_data_df = data_df.drop(del_cols, axis=1)
    return new_data_df, del_cols
def selector_RFE(features, label):
    model = LogisticRegression()
    selector = RFE(model, n_features_to_select=15)
    selector.fit(features, label)
    # `names` is assumed to be a module-level list of feature names;
    # the 0/1 support flags sort the selected features first
    print(sorted(zip([int(x) for x in selector.get_support()], names),
                 reverse=True))
def eva_model(c, n, X, y, X_test, y_test, class_names, outdir):
    model = svm.LinearSVC(class_weight='balanced', dual=False,
                          max_iter=10000, C=c)
    rfe = RFE(model, n_features_to_select=n)

    ## learning curve
    plt.clf()
    viz_LC = LearningCurve(rfe, scoring='f1_weighted', n_jobs=4)
    viz_LC.fit(X, y)
    viz_LC.show(outpath=outdir + '/LC.png')

    ## classification report
    plt.clf()
    viz_CR = ClassificationReport(rfe, classes=class_names, support=True)
    viz_CR.fit(X, y)
    viz_CR.score(X_test, y_test)
    viz_CR.show(outpath=outdir + '/CR.png')

    ## confusion matrix
    plt.clf()
    viz_CM = ConfusionMatrix(rfe, classes=class_names)
    viz_CM.fit(X, y)
    viz_CM.score(X_test, y_test)
    viz_CM.show(outpath=outdir + '/CM.png')

    ## precision recall curve
    plt.clf()
    viz_PRC = PrecisionRecallCurve(rfe, per_class=True, iso_f1_curves=True,
                                   fill_area=False, micro=False,
                                   classes=class_names)
    viz_PRC.fit(X, y)
    viz_PRC.score(X_test, y_test)
    viz_PRC.show(outpath=outdir + '/PRC.png', size=(1080, 720))

    ## class prediction error
    plt.clf()
    viz_CPE = ClassPredictionError(rfe, classes=class_names)
    viz_CPE.fit(X, y)
    viz_CPE.score(X_test, y_test)
    viz_CPE.show(outpath=outdir + '/CPE.png')

    ## ROCAUC
    plt.clf()
    viz_RA = ROCAUC(rfe, classes=class_names, size=(1080, 720))
    viz_RA.fit(X, y)
    viz_RA.score(X, y)
    viz_RA.show(outpath=outdir + '/RA.png')

    fit = rfe.fit(X, y)
    y_predict = fit.predict(X_test)
    f1 = f1_score(y_test, y_predict, average='weighted')

    features_retained_RFE = X.columns[rfe.get_support()].values
    feature_df = pd.DataFrame(features_retained_RFE.tolist())
    feature_df.to_csv(outdir + '/features.csv', sep='\t', index=False)
    return f1
def q4():
    LR = LinearRegression()
    x_train, y_train = fifa.drop('Overall', axis=1), fifa['Overall']
    LR.fit(x_train, y_train)
    rfe = RFE(LR, n_features_to_select=5)
    rfe.fit(x_train, y_train)
    selected_columns = list(x_train.columns[rfe.get_support()])
    return selected_columns
def rf_rfe(df):
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    model = RandomForestClassifier()
    # create the RFE model with the default number of attributes (half)
    rfe = RFE(model)
    rfe = rfe.fit(X, y)
    # print(rfe.ranking_)
    return X.columns.values[rfe.get_support()].tolist()
def q4():
    # Return the result of question 4 here.
    reg = LinearRegression()
    X = fifa.drop(columns="Overall")
    y = fifa["Overall"]
    rfe_fit = RFE(estimator=reg, n_features_to_select=5, step=1).fit(X, y)
    features_selected = rfe_fit.get_support(indices=True)
    return list(X.columns[features_selected])
def q4():
    # Return the result of question 4 here.
    regl = LinearRegression()
    X_train = fifa.drop(columns=['Overall'], axis=1)
    y = fifa.Overall
    rfe = RFE(regl, n_features_to_select=5, step=1)
    rfe.fit(X_train, y)
    return list(X_train.columns[rfe.get_support()])
def selectFeaturesFromSubsetRecursive(self, subset, numFeatures):
    # the parameter is class_weight (not class_weights), and 'auto' was
    # replaced by 'balanced' in current scikit-learn
    model = svm.LinearSVC(class_weight='balanced')
    rfe = RFE(model, n_features_to_select=numFeatures)
    rfe = rfe.fit(self.instances[:, subset], self.classes)
    # summarize the selection of the attributes
    # print(rfe.get_support(indices=True))
    # print(rfe.ranking_)
    return rfe.get_support(indices=True)
def rfe_selector(X, y, num_feats, random_state=None):
    this_selector = RFE(estimator=LogisticRegression(C=0.1,
                                                     solver='liblinear',
                                                     random_state=random_state),
                        n_features_to_select=num_feats, step=0.2, verbose=5)
    this_selector.fit(X, y)
    return this_selector.get_support()
def q4():
    X = df.drop('Overall', axis=1)
    y = df.Overall
    reg = LinearRegression().fit(X, y)
    selecao = RFE(reg, n_features_to_select=5)
    selecao = selecao.fit(X, y)
    index = selecao.get_support(True)
    return list(X.columns[index])
def selectionRecursiveFE(X, y, paramlist):
    # create estimator
    n_features_to_select = paramlist['number_of_features']
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=n_features_to_select, step=3)
    Xnew = rfe.fit_transform(X, y)
    indexarr = rfe.get_support(indices=True)
    return [Xnew, indexarr]
def top_rfe(mod, x, y, n, step=0.05, **params):
    selector = RFE(mod(**params), n_features_to_select=n, step=step, verbose=1)
    selector.fit(x, y)
    selected = selector.get_support()
    rfe_ftrs = np.asarray(x.columns)[selected]
    rfe_ftrs = pd.Series(1, index=rfe_ftrs)
    return rfe_ftrs
def rfe():
    """Recursive feature elimination"""
    model = LogisticRegression()
    # create the RFE model and select 3 attributes
    rfe = RFE(model, n_features_to_select=3)
    rfe = rfe.fit(features_train, labels_train)
    # summarize the selection of the attributes
    print([features_considered[i + 1] for i in rfe.get_support(indices=True)])
    print(rfe.ranking_)
    for i in range(len(rfe.ranking_)):
        print(features_considered[i + 1], ": ", rfe.ranking_[i])
def test_rfe_2():
    """Ensure that the TPOT RFE outputs the same result as the sklearn RFE
    when num_features > number of features in the dataframe
    """
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']
    training_features = training_testing_data.loc[
        training_testing_data['group'] == 'training'].drop(non_feature_columns,
                                                           axis=1)
    estimator = LinearSVC()
    rfe = RFE(estimator, n_features_to_select=100, step=0.1)
    rfe.fit(training_features, training_classes)
    mask = rfe.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

    assert np.array_equal(training_testing_data[mask_cols],
                          tpot_obj._rfe(training_testing_data, 64, 0.1))
def model_logistic(training_data, test_data, features, label):
    '''
    With training and testing data and the data's features and label, select
    the best features with the recursive feature elimination method, then fit
    a logistic regression model and return predicted values on the test data
    and a list of the best features used.
    '''
    model = LogisticRegression()
    rfe = RFE(model)
    rfe = rfe.fit(training_data[features], training_data[label])
    predicted = rfe.predict(test_data[features])
    best_features = rfe.get_support(indices=True)
    return predicted, best_features
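# A minimal, self-contained sketch of calling model_logistic above; the toy
# DataFrames and column names are illustrative assumptions (RFE and
# LogisticRegression are assumed imported, as in the snippet's own context).
import pandas as pd
from sklearn.datasets import make_classification

X_arr, y_arr = make_classification(n_samples=150, n_features=5, random_state=1)
frame = pd.DataFrame(X_arr, columns=['a', 'b', 'c', 'd', 'e'])
frame['label'] = y_arr
train_df, test_df = frame.iloc[:100], frame.iloc[100:]
preds, best = model_logistic(train_df, test_df,
                             features=['a', 'b', 'c', 'd', 'e'], label='label')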
def feat3(matrix):
    last_column = [row[len(matrix[0]) - 1] for row in matrix]
    data_class = transform_to_int(last_column, matrix[0][len(matrix[0]) - 1])
    indices = list(range(len(matrix[0]) - 1))
    # materialize the map object so numpy sees a 2-D list (Python 3)
    new_list = list(map(operator.itemgetter(*indices), matrix))
    data = np.asarray(new_list)
    data = data.astype(np.float64)
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=5, step=1)
    matrix_new = rfe.fit_transform(data, data_class)
    data_class = np.array([data_class])
    features_selected = np.concatenate((matrix_new, data_class.T), axis=1)
    # get_support takes a boolean `indices` flag, not the data
    indices_resultados = rfe.get_support(indices=True)
    features = list(indices_resultados)
    return features
def classify(X_train, X_test, y_train):
    '''
    Train the best classifier on (X_train, y_train), then predict X_test labels.

    :param X_train: A dictionary with the following structure
            { instance_id: [w_1 count, w_2 count, ...], ... }
    :param X_test: A dictionary with the following structure
            { instance_id: [w_1 count, w_2 count, ...], ... }
    :param y_train: A dictionary with the following structure
            { instance_id : sense_id }
    :return: results: a list of tuples (instance_id, label) where labels are
            predicted by the best classifier
    '''
    results = []
    trainVectors, _, trainOutcomes = A.getFeatureVectors(X_train, y_train)
    testVectors, testKeys = A.getFeatureVectors(X_test)

    # Select Features
    svm_clf = svm.LinearSVC()
    selector = RFE(svm_clf, verbose=0, step=10)
    selector = selector.fit(trainVectors, trainOutcomes)
    featMask = selector.get_support()

    # Mask Features
    nItems = testVectors.shape[0]
    testVectorsNew = np.zeros((nItems, np.sum(featMask)))
    for k in range(nItems):
        testVectorsNew[k, :] = testVectors[k, :][featMask]

    model = selector.estimator_
    svm_predict = model.predict(testVectorsNew)
    # svm_clf.fit(trainVectorsNew, trainOutcomes)
    # svm_predict = svm_clf.predict(testVectors)
    results = [(testKeys[k], svm_predict[k]) for k in range(len(testKeys))]
    return results
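# The manual masking loop in classify above is equivalent to letting the
# fitted selector slice the columns itself; a small standalone sketch with
# toy arrays (an illustrative assumption, not the snippet's real data):
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
train_vecs, outcomes = rng.randn(40, 12), rng.randint(0, 2, 40)
test_vecs = rng.randn(10, 12)
sel = RFE(LinearSVC(), step=10).fit(train_vecs, outcomes)
assert np.allclose(sel.transform(test_vecs),
                   test_vecs[:, sel.get_support()])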
def _rfe(self, input_df, num_features, step):
    """Uses Scikit-learn's Recursive Feature Elimination to learn the subset
    of features that have the highest weights according to the estimator

    Parameters
    ----------
    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
        Input DataFrame to perform feature selection on
    num_features: int
        The number of features to select
    step: float
        The percentage of features to drop each iteration

    Returns
    -------
    subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']}
        Returns a DataFrame containing the `num_features` best features

    """
    training_features = input_df.loc[input_df['group'] == 'training'].drop(
        ['class', 'group', 'guess'], axis=1)
    training_class_vals = input_df.loc[input_df['group'] == 'training',
                                       'class'].values

    if step < 0.1:
        step = 0.1
    elif step >= 1.:
        step = 0.99
    if num_features < 1:
        num_features = 1
    elif num_features > len(training_features.columns):
        num_features = len(training_features.columns)

    if len(training_features.columns.values) == 0:
        return input_df.copy()

    estimator = SVC(kernel='linear')
    selector = RFE(estimator, n_features_to_select=num_features, step=step)
    try:
        selector.fit(training_features, training_class_vals)
        mask = selector.get_support(True)
        mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group']
        return input_df[mask_cols].copy()
    except ValueError:
        return input_df[['guess', 'class', 'group']].copy()
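# A minimal standalone sketch of the pattern _rfe implements above: fit RFE
# on the training rows only, then subset the whole frame to the selected
# columns plus the bookkeeping columns. The toy frame layout is an assumption.
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

rng = np.random.RandomState(0)
frame = pd.DataFrame(rng.randn(60, 5), columns=['x0', 'x1', 'x2', 'x3', 'x4'])
frame['class'] = rng.randint(0, 2, 60)
frame['group'] = ['training'] * 40 + ['testing'] * 20
train_rows = frame[frame['group'] == 'training']
sel = RFE(SVC(kernel='linear'), n_features_to_select=2, step=0.5)
sel.fit(train_rows.drop(['class', 'group'], axis=1), train_rows['class'])
keep = list(train_rows.drop(['class', 'group'],
                            axis=1).columns[sel.get_support()])
subset = frame[keep + ['class', 'group']]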
# In[ ]:

from sklearn.feature_selection import SelectFromModel

svc = LinearSVC(C=20, penalty='l1', dual=False)
svc.fit(X, y)
# columns whose coefficient is non-zero in at least one class
selected_feature_names = feature_cols[np.unique(np.where(svc.coef_ != 0)[-1])]
# LinearSVC.transform was removed from scikit-learn; SelectFromModel is the
# current way to reduce X to the non-zero-coefficient columns
X_svm = SelectFromModel(svc, prefit=True).transform(X)
print("X_svm L1 transformed:", X_svm.shape)
X = X_svm

# In[ ]:

rfeSelect = RFE(estimator=rf, n_features_to_select=10, step=0.15)
X_RFE = rfeSelect.fit_transform(X, y)
print(X_RFE.shape)

# In[ ]:

RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
print("RFE_FeatureNames: \n", RFE_FeatureNames)

# In[ ]:

# http://stackoverflow.com/questions/21548750/plotting-histograms-against-classes-in-pandas-matplotlib
for featName in RFE_FeatureNames:
    df.groupby("class")[featName].hist(alpha=0.4)
    df.groupby("classname")[featName].plot(kind='kde')
def main():
    root = 'data/raw/'
    windowData = None
    windowLabelInfo = None
    files = [f for f in os.listdir(root) if path.isfile(path.join(root, f))]
    labels = sorted([l for l in files if "label" in l])
    gl_data = sorted([g for g in files if "glove" in g])
    for glove_data, label_data in zip(gl_data, labels):
        user = read_user(root, glove_data, label_data, False)
        if windowData is None:
            windowData = user.windowData
            windowLabelInfo = user.windowLabel
        else:
            windowData = pd.concat([windowData, user.windowData])
            windowLabelInfo = pd.concat([windowLabelInfo, user.windowLabel])

    print("permute data")
    # TODO: here compute the labels the way we want it for analysis!
    # first simple approach: just take the major label in each window:
    windowLabelInfo = windowLabelInfo.drop('Unnamed: 0', axis=1)
    windowData = windowData.drop(u'gesture', axis=1)

    # permute the data
    indices = np.random.permutation(windowData.index)
    windowData = windowData.reindex(indices)
    windowLabelInfo = windowLabelInfo.reindex(indices)

    # prepare data for feature selection:
    selectLabelDF, exclude = labelMatrixToArray(windowLabelInfo, 150)

    # now we need to balance the amount of the zero class to the other classes
    # get all 0 indexes:
    selectLabelDF = selectLabelDF.drop(exclude)
    selectData = windowData.drop(exclude)
    selectLabelDF, selectData, _ = normalizeZeroClass(selectLabelDF, selectData)

    # feature selection using VarianceThreshold filter
    # sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    # fit = sel.fit(selectData.values)
    # colIndex = fit.get_support(indices=True)
    # windowData = windowData[windowData.columns[colIndex]]

    # the below is somewhat valid, however:
    # first I would need to transform the features so each X > 0
    # (meaning for each column add the column's max negative offset to each value)
    # but I am more in doubt I should do that as these are univariate
    # selections, and I am not sure if we are more in the multivariate
    # world here.
    # - feature selection getting the X best features based on
    # - statistical tests for the data. We have 65 sensors,
    # - or about 12 different single movements in our case
    # - since in our gesture only complete finger flexion
    # - or relaxation is interesting so the minimum
    # - number of features should be in the range of
    # - 12-65. A good set might be the double amount of that
    # fit = SelectKBest(chi2, k=65).fit(selectData.values, selectLabelDF.values)
    # colIndex = fit.get_support(indices=True)
    # windowData = windowData[windowData.columns[colIndex]]

    # important todo!
    # todo: I think also for feature selection we should take care the 0 class is balanced!
    # todo: if you use it that way, scale the features
    print("Recursively eliminate features: ")
    svc = sklearn.linear_model.Lasso(alpha=0.1)  # svm.SVR(kernel="linear")
    print("test fit.")
    svc.fit(selectData.values, np.ravel(selectLabelDF.values))
    print("run rfe..")
    rfecv = RFE(estimator=svc, step=0.1, verbose=2)
    rfecv.fit(selectData.values, np.ravel(selectLabelDF.values))
    print("get support...")
    colIndex = rfecv.get_support(indices=True)
    print("shrink data to selected features....")
    windowData = windowData[windowData.columns[colIndex]]
    print(windowData.shape)
    print("selected headers: ")
    print(windowData.columns)

    # First we split training and test data already here; this is because of
    # the different learning approach.
    # windowData['gesture'] = windowLabelInfo.idxmax(axis=1)
    splitpoint = int(windowData.index.size * 0.7)
    trainData = windowData[0:splitpoint]
    testData = windowData[splitpoint + 1:]
    trainLabels = windowLabelInfo[0:splitpoint]
    testLabels = windowLabelInfo[splitpoint + 1:]

    # a complete window has 201 frames. we count the label with
    # more than 150, aka. 3/4 as the real label
    labelDF, exclude = labelMatrixToArray(trainLabels, 150)

    # now we need to balance the amount of the zero class to the other classes
    # get all 0 indexes:
    labelDF = labelDF.drop(exclude)
    trainData = trainData.drop(exclude)
    labelDF, trainData, _ = normalizeZeroClass(labelDF, trainData)

    print("++++++++++++++++")
    print(labelDF)
    print("++++++++++++++++")
    print("train data size:")
    print(trainData.shape)
    print("++++++++++++++++")

    headers = Constants.headers
    # d = trainData.loc[:, headers]
    d = trainData.values
    d = preprocessing.scale(d)
    print(d)

    clf = None
    # modern KFold API: pass n_splits, then iterate over kf.split(data)
    kf = KFold(n_splits=5)
    score = 0
    for train_index, test_index in kf.split(d):
        X_train = d[train_index, :]
        X_ct = d[test_index, :]
        y_train = labelDF.values[train_index]
        y_ct = labelDF.values[test_index]
        # lin_clf = sklearn.linear_model.LogisticRegression()
        # lin_clf = sklearn.linear_model.LogisticRegression(class_weight='auto')
        # lin_clf = svm.LinearSVC()
        # lin_clf = svm.LinearSVC(class_weight='auto')
        # lin_clf = svm.SVR()
        # lin_clf = svm.SVC()
        # lin_clf = svm.SVC(class_weight='auto')
        lin_clf = svm.SVC(decision_function_shape='ovo')
        # lin_clf = sklearn.neighbors.nearest_centroid.NearestCentroid()
        # lin_clf = sklearn.linear_model.Lasso(alpha = 0.1)
        # lin_clf = sklearn.linear_model.SGDClassifier(loss="hinge", penalty="l2")
        # lin_clf = sklearn.linear_model.SGDClassifier(loss="hinge", penalty="l2", class_weight='auto')
        # lin_clf = sklearn.naive_bayes.MultinomialNB()
        # lin_clf = sklearn.tree.DecisionTreeClassifier()
        # lin_clf = sklearn.tree.DecisionTreeClassifier(class_weight='auto')
        # lin_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=10)
        # lin_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=10, class_weight='auto')
        # lin_clf = sklearn.ensemble.AdaBoostClassifier(n_estimators=100)
        # lin_clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
        lin_clf.fit(X_train, y_train)
        s = lin_clf.score(X_ct, y_ct)
        if s > score:
            score = s
            clf = lin_clf

    # clf = svm.SVC(decision_function_shape='ovo')
    # clf.fit(d, labelDF.values)

    # TODO: test label approach:
    # compute our binary matrix with labels per frame
    # also compute our label vector as above
    # then correct the label vector by looking
    # at multilabel entries if they match with the prediction
    # and set the label to that
    testLabelDF, exclude = labelMatrixToArray(testLabels, 10)
    # testLabelDF, testData, removalIndex = normalizeZeroClass(testLabelDF, testData)
    # testLabels.drop(removalIndex)
    testLabels = testLabels.fillna(0)
    testLabels[testLabels > 0] = 1

    # d = testData.loc[:, headers]
    d = testData.values
    d = preprocessing.scale(d)
    prediction = clf.predict(d)
    for row in range(prediction.size):
        p = prediction[row]
        val = testLabels.loc[testLabels.index[row]][p]
        if val == 1.0:
            testLabelDF.loc[testLabelDF.index[row]] = p

    print("------------")
    print(prediction)
    print("------------")
    print(testLabelDF)
    print("------------")
    print(classification_report(testLabelDF.values, prediction))
training_features = result1.loc[training_indices].drop('class', axis=1)

if len(training_features.columns.values) > 0 and len(training_features.columns.values) <= 700:
    # The feature constructor must be fit on only the training data
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly.fit(training_features.values.astype(np.float64))
    constructed_features = poly.transform(result1.drop('class', axis=1).values.astype(np.float64))
    result2 = pd.DataFrame(data=constructed_features)
    result2['class'] = result1['class'].values
else:
    result2 = result1.copy()

# Perform classification with a logistic regression classifier
lrc3 = LogisticRegression(C=0.48148148148148145)
lrc3.fit(result2.loc[training_indices].drop('class', axis=1).values,
         result2.loc[training_indices, 'class'].values)
result3 = result2.copy()
result3['lrc3-classification'] = lrc3.predict(result3.drop('class', axis=1).values)

# Use Scikit-learn's Recursive Feature Elimination (RFE) for feature selection
training_features = result3.loc[training_indices].drop('class', axis=1)
training_class_vals = result3.loc[training_indices, 'class'].values

if len(training_features.columns.values) == 0:
    result4 = result3.copy()
else:
    selector = RFE(SVC(kernel='linear'),
                   n_features_to_select=min(77, len(training_features.columns)),
                   step=0.99)
    selector.fit(training_features.values, training_class_vals)
    mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + ['class']
    result4 = result3[mask_cols]
selector = RFE(SVC(kernel='linear', C=1.), n_features_to_select=1500, step=0.25)
classifier = SGDClassifier(loss='log', penalty='l1')

X_train, X_test, Y_train, Y_test = train_test_split(documents, labels,
                                                    test_size=.25,
                                                    random_state=42)

# X_vec = vectorizer.fit_transform(X_train)
# X_sel = selector.fit_transform(X_vec, Y_train)
# classifier.fit(X_sel, Y_train)
# X_prep = vectorizer.transform(X_test)
# X_new = selector.transform(X_prep)
# X_pred = classifier.predict(X_new)

steps = [('vectorizer', vectorizer),
         ('selector', selector),
         ('classifier', classifier)]
pipeline = Pipeline(steps)
# fit, not fit_transform: a pipeline ending in a classifier has no transform
pipeline.fit(X_train, Y_train)
X_pred = pipeline.predict(X_test)

fnames = vectorizer.get_feature_names()
indices = selector.get_support(True)
selected_terms = [fnames[i] for i in indices]
show_most_informative_features(selected_terms, classifier, n=25)

print(classification_report(Y_test, X_pred))
print(confusion_matrix(Y_test, X_pred))
# -*- coding: utf-8 -*-
import pandas
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

data = pandas.read_csv('D:\\PDM\\6.2\\data2.csv')
# columns: 月份 = month, 季度 = quarter, 广告费用 = advertising spend,
# 客流量 = customer traffic; target 销售额 = sales revenue
feature = data[['月份', '季度', '广告费用', '客流量']]

rfe = RFE(
    estimator=LinearRegression(),
    n_features_to_select=2
)
sFeature = rfe.fit_transform(
    feature,
    data['销售额']
)
rfe.get_support()
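# get_support() above returns a boolean mask; a small follow-up sketch (same
# objects as above) to print the names of the two surviving columns:
print(feature.columns[rfe.get_support()].tolist())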
num_round = 250  # Number of rounds of training; increasing this increases the range of output values
clf = xgbw.XGBWrapper(param, num_round, verbose_eval=0)
k = 500
result_all = []
for step in [400, 200, 100, 50, 25]:
    selector = RFE(clf, step=step, n_features_to_select=k, verbose=2)
    print("Fitting Selector: k = {}, step = {}".format(k, step))
    start = time.time()
    selector = selector.fit(X_train, y_train)
    train_time = time.time() - start
    support = selector.get_support(indices=True)
    file_name = (str(data[0]).rjust(2, "0") + str(data[1]).rjust(2, "0")
                 + "_k" + str(k) + "_s" + str(step))
    addr_out = os.path.join("/home/ubuntu/Weiyi/RFE_Select", file_name)
    np.save(addr_out, support)
    start = time.time()
    prob = selector.predict_proba(X_test)
    test_time = round(time.time() - start, 2)
    score, recall, filter_rate, cut, net_savings = search_cut(prob)
    result_all.append([k, train_time, test_time, score, recall, filter_rate,
                       cut, net_savings, step])

data = pd.DataFrame(np.array(result_all),
                    columns=["k", "train time", "test time", "score", "recall",
                             "filter rate", "cut", "net savings", "step"])
data.to_csv("/home/ubuntu/Weiyi/RFE_Select/RFE_0604.csv")
# List of feature importances
importances = pandas.DataFrame(grid.best_estimator_.feature_importances_,
                               index=explanatory_df.columns,
                               columns=['importance'])
# DataFrame.sort was removed; sort_values is the current API
importances.sort_values(by='importance', ascending=False, inplace=True)
print(importances)

# Recursive feature elimination
# rfWithCoef = RandomForestsWithCoef(n_estimators=500)
rfe = RFE(estimator=rfWithCoef, n_features_to_select=3, step=1, verbose=0)
rfe.fit(explanatory_df, response_series)
features_used = explanatory_df.columns[rfe.get_support()]
print(features_used)

# Run random forests on 3 best features
conn = sqlite3.connect('/Users/MatthewCohen/Documents/SQLite/TeamSeason1.sqlite')
query = """SELECT t.won as wins, g.good_team,
t.o_fgm as field_goals_made, t.o_fga as field_goals_attempted,
t.o_ftm as free_throws_made, t.o_fta as free_throws_attempted,
t.o_oreb as offensive_rebounds, t.o_dreb as defensive_rebounds,
t.o_reb as total_rebounds, t.o_asts as assists,
t.o_pf as personal_fouls, t.o_stl as steals, t.o_to as turnovers,
t.o_3pm as three_pointers_made, t.o_3pa as three_pointers_attempted,
t.d_fgm as field_goals_allowed, t.d_fga as field_goal_attempts_allowed,
t.d_reb as rebounds_allowed, t.d_asts as assists_allowed,
t.d_pf as fouls_against, t.d_3pm as three_point_makes_allowed,
((o_fgm / o_fga)*100) as field_goal_percentage,
((o_ftm / o_fta)*100) as free_throw_percentage,
((o_3pm / o_3pa)*100) as three_point_percentage,
o_blk as blocks, o_pts as points, d_pts as points_against
# # Recursive Feature Elimination

# In[14]:

from sklearn.feature_selection import RFE

lr = LogisticRegression()
rfe = RFE(estimator=lr, n_features_to_select=15, step=1)
rfe.fit(bc_X, bc_y)

# In[15]:

select_features_rfe = rfe.get_support()
feature_names_rfe = bc_data.feature_names[select_features_rfe]
print(feature_names_rfe)

# In[16]:

set(feature_names_kbest) & set(feature_names_rfe)

# # Model based selection

# In[17]:

from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import RFE
# sklearn.cross_validation was removed; model_selection is the current module
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVR

model = LogisticRegression()
selector = RFE(model, n_features_to_select=12, step=1)
selector.fit(X, y)

# summarize the selection of the features
print(X.columns[selector.get_support()])

# get only the selected features from X
X_new = selector.transform(X)
X_new = pd.DataFrame(X_new, columns=X.columns[selector.get_support()])

# 5-fold cross validation
y_pred = cross_val_predict(model, X_new, y, cv=5)
# print(precision_score(y, y_pred, average=None))
# print(recall_score(y, y_pred, average=None))
# print(f1_score(y, y_pred, average=None))
# print(accuracy_score(y, y_pred))
# print(classification_report(y, y_pred))
#######################################################
def select_train_predict(X, Y, Z, feature_list, selection_method,
                         estimator_method, n_features, selection_args,
                         estimator_args):
    W = []
    features = []

    if selection_method != '2step_kbest':
        n_features = min(n_features, len(feature_list))

    if estimator_method == 'svm' and selection_method == 'rfe':
        estimator_args['kernel'] = 'linear'

    estimator = ESTIMATORS[estimator_method](**estimator_args)

    if selection_method == 'cluster':
        agglom = FeatureAgglomeration(n_clusters=n_features,
                                      affinity='cosine', linkage='average')
        clusters = agglom.fit_predict(X).tolist()
        sample = [clusters.index(i) for i in range(n_features)]
        X = X[:, sample]
        Z = Z[:, sample]
        selection_method = None

    if selection_method is None:
        for i, y in enumerate(Y):
            estimator.fit(X, y)
            w = estimator.predict(Z)
            W.append(w)
            if (i + 1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    if selection_method == 'rfe':
        selector = RFE(estimator=estimator, n_features_to_select=n_features,
                       **selection_args)
        for i, y in enumerate(Y):
            selector = selector.fit(X, y)
            features.append(feature_list[selector.support_])
            w = selector.predict(Z)
            W.append(w)
            if (i + 1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    if selection_method == 'myrfe':
        selector = MyRFE(estimator=estimator, n_features=n_features,
                         **selection_args)
        for i, y in enumerate(Y):
            selector.fit(X, y)
            features.append(feature_list[selector.support])
            w = selector.predict(Z)
            W.append(w)
            if (i + 1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    if selection_method == 'kbest':
        selector = SelectKBest(f_regression, k=n_features, **selection_args)
        for i, y in enumerate(Y):
            X2 = selector.fit_transform(X, y)
            Z2 = selector.transform(Z)
            features.append(feature_list[selector.get_support()])
            estimator.fit(X2, y)
            w = estimator.predict(Z2)
            W.append(w)
            if (i + 1) % (len(Y) // 10) == 0:
                print('.', end=' ')

    print()
    return W, features