def select_by_linearmodel(self, models=None):
    # Embedded (model-based) selection with linear models.
    """
    Linear-model feature selection.
    :param models: list of linear estimators; defaults to LinearRegression, Ridge and Lasso
    :return: None (prints the selected features per model)
    """
    if self.numNull != 0:
        print('Features contain NaN!!!')
    elif self.numInf != 0:
        print('Features contain Inf!!!')
    else:
        if not models:
            models = [LinearRegression(), Ridge(), Lasso()]
        # Fit each model once inside SelectFromModel and keep the top-K features.
        for model in models:
            model_name = str(model).split('(')[0]
            selector = SelectFromModel(model, max_features=self.K, threshold=-np.inf)
            selector.fit_transform(X=self.train_X, y=self.train_y)
            mask = selector.get_support(True)
            feature_names = np.array(self.continuous_feature_names)[mask]
            print("{} selected feature:{}".format(model_name, feature_names))
        if self.showFig:
            for clf in models:
                model_name = str(clf).split('(')[0]
                model = clf.fit(self.train_X, self.train_y)
                self.dict_features_score(model.coef_)
                sns.barplot(x=self.continuous_feature_names, y=abs(model.coef_))
                plt.title('{} coef of features'.format(model_name))
                plt.show()
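# A minimal, self-contained sketch of the top-K pattern used above: setting
# threshold=-np.inf disables the importance cutoff, so SelectFromModel keeps
# exactly max_features features with the largest |coef_|. The synthetic data
# and the choice of K=3 below are illustrative assumptions, not part of the
# original snippet.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

X_demo, y_demo = make_regression(n_samples=200, n_features=8, n_informative=3,
                                 random_state=0)
top3 = SelectFromModel(Lasso(alpha=0.1), max_features=3, threshold=-np.inf)
X_top3 = top3.fit_transform(X_demo, y_demo)
print(top3.get_support(indices=True))  # indices of the 3 largest-|coef_| features
print(X_top3.shape)                    # (200, 3)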
def test_threshold_and_max_features():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0,
    )
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)

    transformer2 = SelectFromModel(estimator=est, threshold=0.04)
    X_new2 = transformer2.fit_transform(X, y)

    transformer3 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=0.04)
    X_new3 = transformer3.fit_transform(X, y)

    # Both constraints must hold: the result is the intersection of the two.
    assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1])

    selected_indices = transformer3.transform(
        np.arange(X.shape[1])[np.newaxis, :])
    assert_allclose(X_new3, X[:, selected_indices[0]])
def SelectFromModel_selector(estimator, threshold, X_data, Y_data):
    columns = X_data.columns
    selector = SelectFromModel(estimator, threshold=threshold)
    # Fit once and reuse the result (the original called fit_transform twice,
    # fitting the estimator a second time for no reason).
    transformed = selector.fit_transform(X_data, Y_data)
    labels = [columns[x] for x in selector.get_support(indices=True)]
    feature = pd.DataFrame(transformed, columns=labels)
    return feature
def FeatureSelect(df, YcolName, featSelectMethod="SFM", printNumCoeff=False):
    X = df.iloc[:, EXPRESSION_START:]                 # all expression columns
    Y = df[YcolName].to_numpy()                       # the output we want to predict
    Yclass = label_binarize(Y, classes=list(set(Y)))  # binarized labels (classes must be a keyword arg in current sklearn)
    best_model, coeff_used = grid_search(X, Yclass)   # best model and alpha
    if printNumCoeff:
        print("Number of coefficients used ", coeff_used)
    best_model.fit(X, Yclass)                         # fit best model
    predictor = OneVsRestClassifier(SVC(C=1, kernel='linear', probability=True))
    Xred = None
    if featSelectMethod == "SFM":
        selector = SFM(best_model, prefit=True)
        Xred = selector.transform(X)
    elif featSelectMethod == "PCA":
        selector = PCA(n_components=PCA_TSNE_COMP, random_state=RANDOM_STATE)
        Xred = selector.fit_transform(X)
    elif featSelectMethod == "tSNE":
        selector = TSNE(n_components=PCA_TSNE_COMP, random_state=RANDOM_STATE)
        Xred = selector.fit_transform(X)
    return Xred, Yclass, predictor
def select_from_model(df):
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    features = X.columns
    clf = RandomForestClassifier(random_state=9)
    model = SelectFromModel(clf)
    model.fit_transform(X, y)
    return features[model.get_support()].tolist()
def test_max_features_callable_data(max_features):
    """Tests that the callable passed to `fit` is called on X."""
    clf = RandomForestClassifier(n_estimators=50, random_state=0)
    m = Mock(side_effect=max_features)
    transformer = SelectFromModel(estimator=clf, max_features=m,
                                  threshold=-np.inf)
    transformer.fit_transform(data, y)
    m.assert_called_with(data)
def selectDecisionTree(x_train_ds, y_train_ds, x_test_ds, y_test_ds, max_features):
    # Fit the selector on the training split only, then apply the same feature
    # mask to the test split. The original fit a second selector on the test
    # data, which leaks test labels and can select a different feature set.
    selector = SelectFromModel(ExtraTreesClassifier(n_estimators=100),
                               max_features=max_features)
    x_train = selector.fit_transform(x_train_ds, y_train_ds)
    x_test = selector.transform(x_test_ds)
    return x_train, x_test
def selectRandomForests(x_train_ds, y_train_ds, x_test_ds, y_test_ds, max_features):
    # Same fix as selectDecisionTree: fit on train, transform the test split.
    selector = SelectFromModel(RandomForestClassifier(n_estimators=100),
                               max_features=max_features)
    x_train = selector.fit_transform(x_train_ds, y_train_ds)
    x_test = selector.transform(x_test_ds)
    return x_train, x_test
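# A hedged alternative to the two helpers above: wrapping SelectFromModel in a
# Pipeline makes the fit-on-train / transform-on-test discipline automatic and
# cross-validation safe. The estimator choices and max_features=10 below are
# illustrative assumptions, not taken from the original snippets.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("select", SelectFromModel(RandomForestClassifier(n_estimators=100),
                               max_features=10)),
    ("clf", RandomForestClassifier(n_estimators=100)),
])
# pipe.fit(x_train_ds, y_train_ds); pipe.predict(x_test_ds)
# The selector inside the pipeline is only ever fitted on the training data.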
def select_from_model(data):
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    m = SelectFromModel(estimator=RandomForestClassifier())
    m.fit_transform(X, y)
    selected_columns = X.columns[m.get_support()]
    return selected_columns.tolist()
def select_features_from_model(self, x, y):
    # Pass an unfitted estimator and let fit_transform fit it. The original
    # pre-fitted the estimator and also forwarded prefit=self.prefit; with
    # prefit=True that combination makes the subsequent fit_transform invalid.
    selector = SelectFromModel(estimator=LogisticRegression(),
                               threshold=self.threshold,
                               norm_order=self.norm_order,
                               max_features=self.max_features)
    selector.fit_transform(x, y)
    features = selector.get_support(indices=True)
    self.best_features = [column for column in x.columns[features]]
    x_select = self.select_features_in_test_set(x)
    return x_select
def select_from_model(data):
    X = data.drop('SalePrice', axis=1)
    y = data['SalePrice']
    # SalePrice is a continuous target, so use a regressor; the original used
    # RandomForestClassifier, which treats every distinct price as a class.
    rf_model = RandomForestRegressor()
    select_fm = SelectFromModel(rf_model)
    select_fm.fit_transform(X, y)
    return list(X.columns[select_fm.get_support()])
def select_from_model(data):
    X = data.drop('SalePrice', axis=1)
    y = data['SalePrice']
    model = RandomForestRegressor()  # regression target; see the note in the variant above
    sfm = SelectFromModel(model)
    sfm.fit_transform(X, y)
    feature_name = list(X.columns[sfm.get_support()])
    return feature_name
def random_forest(data_set, y_values, want_graph, random_state, max_depth):
    model = RandomForestRegressor(random_state=random_state, max_depth=max_depth)
    # one-hot encode the categorical columns here
    data_set = pd.get_dummies(data_set)
    model.fit(data_set, y_values)
    indices = []
    if want_graph:
        features = data_set.columns
        importances = model.feature_importances_
        indices = np.argsort(importances)
        plt.title('Feature Importances')
        plt.barh(range(len(indices)), importances[indices], color='b', align='center')
        plt.yticks(range(len(indices)), [features[i] for i in indices])
        plt.xlabel('Relative Importance')
        plt.show()
    feature = SelectFromModel(model)
    fit = feature.fit_transform(data_set, y_values)
    return fit
def featureSelectionFromModel(patient, mrna, trainingData, type):
    cols = [col for col in mrna.columns]
    x = trainingData[cols].copy()
    x.drop('track_name', axis=1, inplace=True)
    if type == 'cancerStage' or type == 'grade' or type == 'survivalDays':
        x = pd.concat([x[:], trainingData['diagonosisAge'],
                       trainingData['mutation']], axis=1)
    if type == 'vitalStatus':
        x = pd.concat([x[:], trainingData['diagonosisAge'],
                       trainingData['mutation'],
                       trainingData['survivalDays']], axis=1)
    y = trainingData[type]

    # Feature selection from model: a regressor for the continuous target,
    # a classifier otherwise.
    if type == 'survivalDays':
        clf = DecisionTreeRegressor()
    else:
        clf = DecisionTreeClassifier()
    trans = SelectFromModel(clf, threshold='median')  # 0.01
    xTrans = trans.fit_transform(x, y)
    columnsSelected = x.columns[trans.get_support()].values
    print('Selected features from model for ', type, ': Total number is ',
          len(columnsSelected), '\nFeatures are: \n', columnsSelected)

    # Plot feature_importances_
    clf.fit(x, y)
    plt.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
    plt.xticks(range(len(clf.feature_importances_)), x.columns, rotation=270)
    plt.title('Feature Importance for ' + type)
    plt.show()

    dfSelectedFeatures = pd.DataFrame(xTrans, columns=columnsSelected)
    dfSelectedFeatures4Merge = pd.concat([dfSelectedFeatures[:],
                                          trainingData['track_name']], axis=1)
    otherColumns = patient.columns.difference(dfSelectedFeatures.columns)
    trainingData = pd.merge(dfSelectedFeatures4Merge, patient[otherColumns],
                            on='track_name', how='inner')
    return trainingData
def RandomForest_select(dataframe, num):
    """
    Random forest feature selection. The incoming data is assumed to have had
    the necessary feature engineering applied already.

    Random forests are a widely used feature-selection tool: the model computes
    each feature's importance automatically, with no extra code, which helps us
    keep a smaller feature subset. Before reducing dimensionality, convert the
    data to numeric form, since random forests only accept numeric input; the
    ID column, though numeric, carries no signal here and can be dropped.

    Args:
        dataframe: feature matrix including a `label` column
        num: number of top features to plot and return
    Returns:
        indices of the `num` most important features
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.feature_selection import SelectFromModel

    X = dataframe.drop('label', axis=1)  # the original also fed the label in as a feature
    features = X.columns
    model = RandomForestRegressor(random_state=1, max_depth=10)
    model.fit(X, dataframe.label)  # the original sliced labels to 5000 rows, mismatching X
    importances = model.feature_importances_
    indices = np.argsort(importances)[-num:]  # top `num`: argsort all, then slice
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.show()

    feature = SelectFromModel(model)
    feature.fit_transform(X, dataframe.label)  # the original referenced an undefined `df`
    return indices
def l1_dim_reduce(self, M):
    df = self.df
    y = df['class']
    X = pd.DataFrame(M)
    dim_reduce = SelectFromModel(LogisticRegression(solver='liblinear',
                                                    class_weight='balanced',
                                                    C=0.04, penalty='l1'))
    X_ = dim_reduce.fit_transform(X, y)
    return X_
def lasso(XTraining, YTraining, XTest, number_of_features):
    # Despite the name, the original used a plain (L2-penalized) LogisticRegression;
    # an L1 penalty is what actually gives lasso-style sparsity.
    embeded_lr_selector = SelectFromModel(
        LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000),
        max_features=number_of_features)
    # Fit once; the original called fit and then fit_transform, fitting twice.
    lasso_selected = embeded_lr_selector.fit(XTraining, YTraining)
    XTraining = embeded_lr_selector.transform(XTraining)
    XTest, m = new_features(lasso_selected, XTest)
    return XTraining, XTest, m
def process_feature_label(self, feature_extraction=False):
    """
    Prepare the features and labels used for the final training step.
    :param feature_extraction: whether to run feature selection
    """
    t = time()
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer()
    # Map textual labels to numeric ids.
    self.data_bunch.labels = [
        self.cate_dict[label] for label in self.data_bunch.labels
    ]
    self.data_bunch.tfidfs = tfidf.fit_transform(self.data_bunch.contents)
    print("tfidf matrix shape: {}".format(self.data_bunch.tfidfs.shape))
    if feature_extraction:
        from sklearn.feature_selection import SelectFromModel
        from sklearn import svm
        selector = SelectFromModel(
            svm.LinearSVC(C=1, penalty="l1", dual=False))
        self.data_bunch.tfidfs = selector.fit_transform(
            self.data_bunch.tfidfs, self.data_bunch.labels)
        print("tfidf matrix shape after selection: {}".format(
            self.data_bunch.tfidfs.shape))
    print("Features and labels processed! Took {:.2f}s".format(time() - t))
def learn_general(self, nfold, task, model_label, X_train, y_train, X_indexes,
                  outfolder, feature_selection=False,
                  instance_data_source_tags=None, accepted_ds_tags: list = None):
    if feature_selection:
        # The L1 penalty needs a solver that supports it (e.g. liblinear).
        select = SelectFromModel(
            LogisticRegression(class_weight='balanced', penalty="l1",
                               C=0.01, solver='liblinear'))
        X_train = select.fit_transform(X_train, y_train)

    cls, model_file = self.create_classifier(outfolder, model_label, task)
    nfold_predictions = cross_val_predict(cls, X_train, y_train, cv=nfold)

    ml_util.save_scores(
        nfold_predictions, y_train, None, None,
        X_indexes,  # nfold index
        None,       # heldout index
        model_label, task, 2, outfolder,
        instance_data_source_tags, accepted_ds_tags)
def test_max_features_dim(max_features):
    clf = RandomForestClassifier(n_estimators=50, random_state=0)
    transformer = SelectFromModel(estimator=clf, max_features=max_features,
                                  threshold=-np.inf)
    X_trans = transformer.fit_transform(data, y)
    assert X_trans.shape[1] == max_features
def select_rfecv_sfm(selection, features, labels):
    if selection[0] == "rfecv":
        for key, method in methods.items():
            # A fixed train/test split expressed as a single CV fold.
            recursive = RFECV(method, step=1,
                              cv=[(list(range(134)), list(range(134, 200)))],
                              scoring="accuracy")
            recursive.fit(features, labels)
            # Plot number of features vs. cross-validation scores.
            plt.figure()
            plt.xlabel("Number of features selected")
            plt.ylabel("accuracy score " + key + " (nb of correct classifications)")
            plt.plot(range(1, len(recursive.grid_scores_) + 1),
                     recursive.grid_scores_)
            plt.savefig(glbs.RESULTS_PATH + "\\" + key + ".jpg",
                        bbox_inches="tight")
    if selection[0] == "sfm":
        score = {}
        for key, method in methods.items():
            sfm = SelectFromModel(method, max_features=int(selection[1]))
            train_new = sfm.fit_transform(features[0], labels[0])
            test_new = sfm.transform(features[1])
            clf = method
            clf.fit(train_new, labels[0])
            pred = clf.predict(test_new)
            acc = accuracy_score(labels[1], pred)
            score[key] = acc
        write_sfm(score)
def logit_feature_selection(C_params):
    for C in C_params:
        # liblinear supports the L1 penalty used here.
        est = linear_model.LogisticRegression(random_state=100, penalty="l1",
                                              C=C, tol=1e-4, solver="liblinear")
        transformer = SelectFromModel(estimator=est)
        train_features = transformer.fit_transform(X_train, Y_train)
        test_features = transformer.transform(X_test)
        print("\nWith C={}".format(C))
        print("Logistic regression reduced number of features to {}.".format(
            test_features.shape[1]))
        model = linear_model.LogisticRegression(random_state=100)
        if test_features.shape[1] <= 200:
            model = model_tune_params(model, logit_params)
        model.fit(train_features, Y_train)
        score = recall_score(y_pred=model.predict(test_features),
                             y_true=Y_test, average="macro")
        print("Logistic regression recall after FEATURE SELECTION: {:5f}".format(
            score))
        n_features_logit.append(test_features.shape[1])
        recall_logit.append(score)
def feature_selection(self):
    """
    sklearn.feature_selection provides VarianceThreshold, recursive feature
    elimination (RFE) and SelectFromModel. I use SelectFromModel directly;
    plain RFE is not very good, so I will write a more powerful variant
    (a minimal sketch follows this function).
    """
    print("\n\n************Feature Selection************\n\n")
    print("\n1. -------SelectFromModel-------\n")
    ncolumns = self._training_data.shape[1]
    threshold = 1 / (ncolumns * 5)
    select_from_model = SelectFromModel(
        RandomForestClassifier(n_estimators=600), prefit=False,
        threshold=threshold)
    training_data1 = select_from_model.fit_transform(
        self._training_data, self._training_result)

    # Boolean mask of the variables that are KEPT (importance above threshold);
    # the original named this `deleted_vars_index`, which was misleading.
    kept_mask = select_from_model.estimator_.feature_importances_ > threshold
    names1 = self._training_data.columns[list(kept_mask)]
    # Convert the numpy array back to a pandas DataFrame with the kept columns.
    training_data1 = pd.DataFrame(training_data1, columns=names1)
    # Save the global transformation.
    self._global_transform["select_from_model"] = select_from_model

    print("\n-------These variables are eliminated: -------\n")
    print(list(self._training_data.columns[list(~kept_mask)]))
    print("\n2. -------Recursive Feature Elimination-------\n")
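# A minimal sketch of the RFE step that the function above announces but the
# snippet does not show. Assumptions: the same RandomForestClassifier as the
# ranking estimator and a 5-fold RFECV; the author's own "more powerful"
# variant may well differ from this.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

rfecv = RFECV(RandomForestClassifier(n_estimators=600), step=1, cv=5,
              scoring="accuracy")
# rfecv.fit(training_data1, self._training_result)
# After fitting, rfecv.support_ is the boolean mask of kept columns and
# rfecv.n_features_ is the automatically chosen feature count.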
def generate_train_data(train_data, test_data, poly=False, select=False):
    # '发电量' is the power-output column used as the regression target.
    y = train_data['发电量']
    X = train_data.drop(['发电量', 'ID'], axis=1)
    sub_data = test_data.drop(['ID'], axis=1)

    polynm = None
    if poly:
        from sklearn.preprocessing import PolynomialFeatures
        polynm = PolynomialFeatures(degree=2, interaction_only=True)
        X = polynm.fit_transform(X)
        sub_data = polynm.transform(sub_data)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=123)
    sm = None
    if select:
        from sklearn.feature_selection import SelectFromModel
        sm = SelectFromModel(GradientBoostingRegressor(random_state=2))
        X_train = sm.fit_transform(X_train, y_train)
        X_test = sm.transform(X_test)
        sub_data = sm.transform(sub_data)
    return X_train, X_test, y_train, y_test, sub_data, sm, polynm
def randomforest(XTraining, YTraining, XTest, number_of_features):
    embeded_lr_selector = SelectFromModel(
        RandomForestClassifier(n_estimators=100),
        max_features=number_of_features)
    # Fit once and transform; the original refit via a second fit_transform.
    randomforest_selected = embeded_lr_selector.fit(XTraining, YTraining)
    XTraining = embeded_lr_selector.transform(XTraining)
    XTest, m = new_features(randomforest_selected, XTest)
    return XTraining, XTest, m
def test_inferred_max_features_callable(max_features):
    """Check max_features_ and output shape for callable max_features."""
    clf = RandomForestClassifier(n_estimators=5, random_state=0)
    transformer = SelectFromModel(estimator=clf, max_features=max_features,
                                  threshold=-np.inf)
    X_trans = transformer.fit_transform(data, y)
    assert transformer.max_features_ == max_features(data)
    assert X_trans.shape[1] == transformer.max_features_
def check_valid_max_features(est, X, y):
    max_features = X.shape[1]
    for valid_max_n_feature in [0, max_features, 'all', 5]:
        transformer = SelectFromModel(estimator=est,
                                      max_features=valid_max_n_feature)
        X_new = transformer.fit_transform(X, y)
        if valid_max_n_feature == 'all':
            valid_max_n_feature = max_features
        assert_equal(X_new.shape[1], valid_max_n_feature)
def remove_based_on_select_from_model(dataframe, max_features):
    features = dataframe.drop(columns=["target"])
    target = dataframe["target"]
    model = ensemble.RandomForestRegressor()
    sfm = SelectFromModel(estimator=model, max_features=max_features)
    features_transformed = sfm.fit_transform(features, target)
    columns_kept = features.loc[:, sfm.get_support()].columns
    features_df = pd.DataFrame(features_transformed, columns=columns_kept)
    return pd.concat([features_df, target], axis=1)
def selectFromModel(df, method, **col):
    # Keys are the Chinese method names the caller passes in:
    # '线性回归' = linear regression, '朴素贝叶斯' = naive Bayes,
    # '逻辑回归' = logistic regression.
    # Note: SelectFromModel needs an estimator that exposes coef_ or
    # feature_importances_ after fitting; GaussianNB and the default
    # RBF-kernel SVC provide neither, so those two entries will fail.
    models = {
        '线性回归': LinearRegression,
        '朴素贝叶斯': GaussianNB,
        '逻辑回归': LogisticRegression,
        'SVM': SVC
    }
    model = SelectFromModel(models[method]())
    X_new = model.fit_transform(df[col['X']].values, df[col['y']].values.ravel())
    return X_new
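# A hedged illustration of the requirement noted above: a linear-kernel SVC
# exposes coef_ after fitting, so it works with SelectFromModel, whereas the
# default RBF kernel does not. The synthetic data below is illustrative.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC

Xc, yc = make_classification(n_samples=100, n_features=6, random_state=0)
ok = SelectFromModel(SVC(kernel='linear')).fit_transform(Xc, yc)  # coef_ exists
print(ok.shape)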
def test_max_features():
    # Test max_features parameter using various values
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0,
    )
    max_features = X.shape[1]
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est, threshold=-np.inf)
    transformer2 = SelectFromModel(estimator=est, max_features=max_features,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)
    X_new2 = transformer2.fit_transform(X, y)
    assert_allclose(X_new1, X_new2)

    # Test max_features against actual model.
    transformer1 = SelectFromModel(estimator=Lasso(alpha=0.025, random_state=42))
    X_new1 = transformer1.fit_transform(X, y)
    scores1 = np.abs(transformer1.estimator_.coef_)
    candidate_indices1 = np.argsort(-scores1, kind="mergesort")

    for n_features in range(1, X_new1.shape[1] + 1):
        transformer2 = SelectFromModel(
            estimator=Lasso(alpha=0.025, random_state=42),
            max_features=n_features,
            threshold=-np.inf,
        )
        X_new2 = transformer2.fit_transform(X, y)
        scores2 = np.abs(transformer2.estimator_.coef_)
        candidate_indices2 = np.argsort(-scores2, kind="mergesort")
        assert_allclose(X[:, candidate_indices1[:n_features]],
                        X[:, candidate_indices2[:n_features]])
    assert_allclose(transformer1.estimator_.coef_, transformer2.estimator_.coef_)
def test_max_features_tiebreak():
    # Test if max_features can break ties among feature importances.
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0,
    )
    max_features = X.shape[1]
    feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])

    for n_features in range(1, max_features + 1):
        transformer = SelectFromModel(
            FixedImportanceEstimator(feature_importances),
            max_features=n_features, threshold=-np.inf)
        X_new = transformer.fit_transform(X, y)
        selected_feature_indices = np.where(transformer._get_support_mask())[0]
        assert_array_equal(selected_feature_indices, np.arange(n_features))
        assert X_new.shape[1] == n_features
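# The test above assumes a FixedImportanceEstimator helper defined elsewhere in
# its test module. A minimal sketch of what such a stub can look like (my
# reconstruction, not necessarily the original definition): it ignores the data
# and exposes a preset feature_importances_ array for SelectFromModel to rank.
import numpy as np
from sklearn.base import BaseEstimator

class FixedImportanceEstimator(BaseEstimator):
    def __init__(self, importances):
        self.importances = importances

    def fit(self, X, y=None):
        # Report the preset importances regardless of X and y.
        self.feature_importances_ = np.asarray(self.importances, dtype=np.float64)
        return self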
def logistic_l1(X, y, tol):
    DEBUG = False
    # if DEBUG: print('X ', X, ' y ', y, ' tol ', tol)
    # liblinear supports the L1 penalty (the Python 2 prints below were
    # converted to the print() function).
    lr = LogisticRegression(penalty='l1', C=0.65, dual=False, solver='liblinear')
    model = SelectFromModel(lr, prefit=False, threshold=tol)
    if DEBUG:
        print(X.shape, y.shape)
    x_select = model.fit_transform(X, y)
    x_logreg = lr.fit(X, y)
    x_logreg_trans = lr.predict(X)
    x_irls = irls(X, y)
    support = model.get_support(indices=True)
    if DEBUG:
        print('support', support, 'x_select', x_select,
              'x_logreg', x_logreg_trans, 'x_irls', x_irls)
        print('len_support', len(support), 'len_x_select', len(x_select),
              'len_x_logreg', len(x_logreg_trans), 'len_x_irls', len(x_irls))
        print('x_logreg_coef', x_logreg.coef_, 'len', len(x_logreg.coef_[0]),
              'intercept', x_logreg.intercept_)
    return x_logreg.coef_[0]
def how_many_variables_used(word_list, inputs, outputs, num_vars,
                            l1_step=LinearSVC(penalty='l1', dual=False, C=1)):
    from sklearn.ensemble import ExtraTreesClassifier

    # Modern KFold API: construct with n_splits, then split the data
    # (the original used the removed KFold(n, n_folds=...) signature).
    kf = KFold(n_splits=10, shuffle=True)
    for train_indices, val_indices in kf.split(inputs):
        # pipeline = Pipeline([('chi2_top_k', SelectKBest(chi2, k=num_vars)),
        #                      ('l1_step', SelectFromModel(l1_step))])
        kbest = SelectKBest(chi2, k=num_vars)
        l1_selector = SelectFromModel(l1_step)
        x_new = kbest.fit_transform(inputs[train_indices],
                                    outputs[train_indices].ravel())
        indices = kbest.get_support(indices=True)
        x_new = l1_selector.fit_transform(x_new, outputs[train_indices].ravel())
        new_indices = l1_selector.get_support(indices=True)

        model = ExtraTreesClassifier()
        model.fit(x_new, outputs[train_indices].ravel())
        importance = np.argsort(model.feature_importances_)[::-1]
        print([word_list[indices[i]] for i in new_indices])
        print([word_list[indices[new_indices[i]]] for i in importance])
        print(x_new.shape)
# -*- coding: utf-8 -*-
import pandas
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel

data = pandas.read_csv('D:\\PDM\\6.2\\data2.csv')
# Columns: '月份' = month, '季度' = quarter, '广告费用' = advertising spend,
# '客流量' = customer traffic; target '销售额' = sales revenue.
feature = data[['月份', '季度', '广告费用', '客流量']]

lrModel = LinearRegression()
selectFromModel = SelectFromModel(lrModel)
selectFromModel.fit_transform(feature, data['销售额'])

# Names of the columns the selector kept (the original computed this
# expression but discarded the result).
print(feature.columns[selectFromModel.get_support()])
# "Function transform is deprecated; support for using estimators as feature
# selectors will be removed in version 0.19. Use SelectFromModel instead."
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X_train = [
    [0, 1, 2],
    [3, 5, 4],
    [6, 8, 7],
    [9, 11, 10],
    [12, 14, 13],
    [15, 16, 17],
    [18, 19, 20],
    [21, 22, 23],
]
Y_train = [0, 0, 0, 0, 1, 1, 1, 1]

rf = RandomForestClassifier(n_estimators=400, n_jobs=-1)
sf = SelectFromModel(rf)
Xr_train = sf.fit_transform(X_train, Y_train)
print(Xr_train)
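# A small follow-up sketch on the default cutoff used above: with no threshold
# given, SelectFromModel uses the mean of the estimator's importances (or 1e-5
# for L1-penalized models), so the number of kept columns is data-dependent.
# threshold="median" instead keeps roughly half the features.
sf_median = SelectFromModel(RandomForestClassifier(n_estimators=400, n_jobs=-1),
                            threshold="median")
print(sf_median.fit_transform(X_train, Y_train).shape)
print(sf_median.threshold_)  # the resolved numeric cutoff after fitting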
# (Reconstructed loop header: the snippet arrived truncated mid-loop; the
#  second half below shows the same pairwise feature-crossing pattern, so the
#  opening over `features` -> `features2` is restored to match it.)
features2 = []
for f in features:
    newFeatures = []
    for i in range(len(f)):
        for j in range(i, len(f)):
            newFeatures.append(f[i] * f[j])
            newFeatures.append((f[i] + 0.1) / (f[j] + 0.1))
            newFeatures.append(f[i] - f[j])
            newFeatures.append(f[i] + f[j])
        newFeatures.append(f[i])
    features2.append(newFeatures)

target = results[:, 2]
weights = results[:, 3]
w = [t for t in results[:, 2] if t > 0]

clf = GradientBoostingRegressor(learning_rate=0.08, n_estimators=20,
                                max_depth=40, min_samples_leaf=20)
clfR = GradientBoostingRegressor(learning_rate=0.08, n_estimators=120,
                                 max_depth=40, min_samples_leaf=20)
clffit = clf.fit(features2, target)

featuresSelectionModel = SelectFromModel(clffit)
features3 = featuresSelectionModel.fit_transform(features2, target)

# Second round of pairwise feature crossing on the selected columns.
features4 = []
for f in features3:
    newFeatures = []
    for i in range(len(f)):
        for j in range(i, len(f)):
            newFeatures.append(f[i] * f[j])
            newFeatures.append((f[i] + 0.2) / (f[j] + 0.2))
            newFeatures.append(f[i] - f[j])
            newFeatures.append(f[i] + f[j])
        newFeatures.append(f[i])
    features4.append(newFeatures)

clffit2 = clf.fit(features4, target)