def select_features(data, neg_cmpd, pos_cmpd, compound_col="Metadata_compound", C=0.01):
    """
    Return the features selected by an L1-penalised linear SVC.

    ``_split_classes`` (defined elsewhere) supplies the feature table and the
    class labels.  A ``LinearSVC`` with an L1 penalty is fitted to them and
    the columns that ``SelectFromModel`` keeps for that fitted model are
    returned by name.

    Parameters
    -----------
    data : pandas DataFrame
    neg_cmpd : string
        name of negative control in compound_col
    pos_cmpd : string
        name of positive control in compound_col
    compound_col : string
        name of column in data that contains compound labels
    C : float (default=0.01)
        Sparsity, the lower the number the fewer features are selected

    Returns
    -------
    selected_features : list
        Selected features
    """
    features, labels = _split_classes(data, neg_cmpd, pos_cmpd, compound_col)

    # dual=False is passed together with the L1 penalty, as in the original.
    classifier = LinearSVC(C=C, penalty="l1", dual=False)
    classifier.fit(features, labels)

    # The classifier is already fitted, so the selector only reads it.
    selector = SelectFromModel(classifier, prefit=True)
    keep = np.array(selector.get_support())
    column_names = np.array(features.columns.tolist())
    return list(column_names[keep])
class ModelFeatureSelectionWrapper(BaseEstimator):
    """Put a ``SelectFromModel`` feature selector in front of an inner model.

    ``fit`` first fits the selector (built lazily from ``estimator`` with a
    threshold of ``<feature_selection_threshold_coef>*mean``), then fits
    ``inner_model`` on the reduced matrix.  ``predict`` applies the same
    reduction before delegating to ``inner_model``.

    Parameters
    ----------
    estimator : estimator
        Model handed to ``SelectFromModel`` to rank the features.
    inner_model : estimator
        Model fitted on the selected features; it must also expose
        ``get_support(indices=True)`` for :meth:`get_support` to work.
    feature_selection_threshold_coef : number (default=3)
        Multiplier of the mean importance used as the selection threshold.
    """

    def __init__(self, estimator, inner_model, feature_selection_threshold_coef=3):
        self.estimator = estimator
        self.inner_model = inner_model
        # Created on first use by _get_feature_selector().
        self.feature_selector = None
        self.feature_selection_threshold_coef = feature_selection_threshold_coef

    def _get_feature_selector(self):
        """Return the selector, creating it on the first call."""
        if self.feature_selector is None:
            self.feature_selector = SelectFromModel(
                self.estimator,
                threshold='{}*mean'.format(float(self.feature_selection_threshold_coef)),
            )
        return self.feature_selector

    def get_support(self, indices=False):
        """Combine the selector's support with the inner model's support.

        Both supports are requested as index arrays and merged by
        ``get_support_for_feature_selection_wrapper`` (defined elsewhere),
        which also receives ``indices``.  Must be called after ``fit``.
        """
        feature_selector_support = self.feature_selector.get_support(indices=True)
        inner_support = self.inner_model.get_support(indices=True)
        return get_support_for_feature_selection_wrapper(
            feature_selector_support,
            inner_support,
            indices,
        )

    def fit(self, X, y):
        """Fit the selector on ``(X, y)``, then the inner model on the reduced X.

        Copies are passed to every step, as in the original implementation,
        so that none of them can modify the caller's data.  Returns ``self``.
        """
        # The previous debug dump of X and its shape was removed: it was a
        # Python-2-only print statement and flooded stdout on every fit.
        X = self._get_feature_selector().fit(X.copy(), y.copy()).transform(X.copy())
        self.inner_model.fit(X.copy(), y)
        return self

    def predict(self, X):
        """Reduce ``X`` with the fitted selector and predict with the inner model."""
        X = self._get_feature_selector().transform(X.copy())
        return self.inner_model.predict(X.copy())
def final_feats(df_data):
    """Build the training matrix: L1-selected columns plus two PCA columns.

    Returns
    -------
    tuple
        ``(X_train_new, y_train, feat_ix_keep, pca, del_constants,
        del_identicals)``: the reduced frame with the ``PCAOne``/``PCATwo``
        columns inserted, the ``TARGET`` column, the positions kept by the
        L1 selection, the fitted PCA object, and the second return values of
        ``remove_feat_constants`` / ``remove_feat_identicals``.
    """
    # Columns 1..369; per the original note this leaves out the "ID" and the
    # "Target" columns.
    x_train = df_data.iloc[:, 1:370]

    # First two principal components, computed on the column-normalised block
    # before any column is removed.
    pca = PCA(n_components=2)
    x_train_projected = pca.fit_transform(normalize(x_train, axis=0))

    # Original notes: the first helper drops columns with no variance (the
    # all-zero columns), the second keeps one column of each identical group.
    x_train, del_constants = remove_feat_constants(x_train)
    x_train, del_identicals = remove_feat_identicals(x_train)

    y_train = df_data["TARGET"]

    # L1-based feature selection on the remaining columns.
    l1_svc = svm.LinearSVC(C=0.01, penalty="l1", dual=False)
    l1_svc.fit(x_train, y_train)
    feat_ix_keep = SelectFromModel(l1_svc, prefit=True).get_support(indices=True)

    # Drop the rejected columns by label rather than calling transform(), so
    # the result stays a DataFrame.
    all_positions = np.arange(x_train.columns.size)
    rejected_positions = np.delete(all_positions, feat_ix_keep)
    X_train_new = x_train.drop(labels=x_train.columns[rejected_positions], axis=1)

    # Both columns are inserted at position 1, so PCATwo ends up before PCAOne.
    for column_name, component in (('PCAOne', 0), ('PCATwo', 1)):
        X_train_new.insert(1, column_name, x_train_projected[:, component])

    return X_train_new, y_train, feat_ix_keep, pca, del_constants, del_identicals
class SelectFromModelSelection(SelectionModel):
    """Selection model that delegates to scikit-learn's ``SelectFromModel``."""

    name = "SelectFromModel"

    def __init__(self, *args):
        # The base initialiser receives every argument unchanged; this class
        # then reads self.estimator, self.x_array and self.y_array.
        SelectionModel.__init__(self, *args)

        chooser = SelectFromModel(self.estimator)
        self.selector = chooser
        chooser.fit(self.x_array, self.y_array)

        # Mask of the features retained by the fitted selector.
        self.support_ = chooser.get_support()
def rf_feat_reduction(rf_model, features):
    """Reduce ``features`` to the columns an already fitted model finds important.

    Parameters
    ----------
    rf_model : fitted estimator
        Passed to ``SelectFromModel`` with ``prefit=True``, so it is not
        refitted here.
    features : array-like
        Feature matrix to reduce.

    Returns
    -------
    feat_subset
        ``features`` restricted to the selected columns.
    feat_bool
        The selector's support mask.
    """
    print(" Reducing number of input features based on feature importance.")
    subset_model = SelectFromModel(rf_model, prefit=True)
    feat_subset = subset_model.transform(features)
    feat_bool = subset_model.get_support()
    # shape[1] instead of len(feat_subset[0]): the latter raises on a matrix
    # with zero rows and on sparse output.
    print(" " + str(feat_subset.shape[1]) + " features chosen after model selection.")
    return feat_subset, feat_bool
def logistic_l1(X, y, tol):
    """Fit an L1-penalised logistic regression and return its coefficients.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels.
    tol : threshold accepted by ``SelectFromModel``
        Only used for the selector fit below; it does not influence the
        returned coefficients.

    Returns
    -------
    The first row of the fitted model's ``coef_``.
    """
    # solver='liblinear' is stated explicitly: it was the default when this
    # code was written, and the later default solver rejects penalty='l1'.
    lr = LogisticRegression(penalty='l1', C=0.65, dual=False, solver='liblinear')

    # NOTE(review): the selector result was only ever used by debug output
    # that was permanently disabled (DEBUG = False).  The fit is kept so that
    # an invalid `tol` is still reported exactly as before; confirm whether
    # it can be dropped.
    model = SelectFromModel(lr, prefit=False, threshold=tol)
    model.fit_transform(X, y)

    lr.fit(X, y)

    # NOTE(review): irls() is defined elsewhere and its return value was only
    # used by the disabled debug output; the call is kept in case it has side
    # effects -- confirm.
    irls(X, y)

    return lr.coef_[0]
def feature_select(clf):
    """Search for the ``SelectFromModel`` threshold that maximises the CV score.

    Works on the module-level ``X`` (features), ``t`` (targets) and
    ``features`` (feature names).  ``clf`` is fitted on the full ``X``; then,
    for every ``<scaling>*mean`` and ``<scaling>*median`` threshold, the
    cross-validated score of ``clf`` on the reduced matrix is recorded.  The
    best threshold is used for the final selection and ``clf`` is refitted
    on the reduced data.

    Returns
    -------
    tuple
        ``(cvscore, feature_importances, best_select, feature_mask,
        cvscore_selected, clf)`` where ``feature_importances`` is the list of
        feature names ordered from most to least important (``None`` when the
        estimator exposes no ``feature_importances_``) and ``best_select`` is
        the winning ``(statistic, scaling)`` pair.
    """
    cvscore = np.mean(cross_val_score(clf, X, t))
    clf.fit(X, t)

    # Only estimators without the attribute fall back to None; a bare
    # `except:` would also hide unrelated errors.
    try:
        feature_importances = list(reversed(np.array(features)[np.argsort(clf.feature_importances_)]))
    except AttributeError:
        feature_importances = None

    selection_results = {'mean': dict(), 'median': dict()}
    scalings = [0, 0.25, 0.5, 0.75, 0.9, 1, 1.1, 1.25, 1.5, 1.75, 2]
    for scaling in scalings:
        for statistic in ('mean', 'median'):
            threshold = str(scaling) + '*' + statistic
            X_new = SelectFromModel(clf, threshold=threshold, prefit=True).transform(X)
            selection_results[statistic][scaling] = np.mean(cross_val_score(clf, X_new, t))

    # Index into the pair instead of unpacking it in the lambda signature,
    # which is Python-2-only syntax.
    best_select = max(
        itertools.product(['mean', 'median'], scalings),
        key=lambda pair: selection_results[pair[0]][pair[1]],
    )

    model = SelectFromModel(clf, threshold=str(best_select[1]) + '*' + best_select[0], prefit=True)
    X_new = model.transform(X)
    feature_mask = model.get_support()
    cvscore_selected = np.mean(cross_val_score(clf, X_new, t))
    clf.fit(X_new, t)
    return cvscore, feature_importances, best_select, feature_mask, cvscore_selected, clf
def how_many_variables_used(word_list, inputs, outputs, num_vars, l1_step=LinearSVC(penalty='l1', dual=False, C=1)):
    """Print, for each of 10 shuffled folds, the words surviving two filters.

    On the training part of every fold the features pass a chi2
    ``SelectKBest(num_vars)`` filter and then ``SelectFromModel(l1_step)``.
    The surviving words are printed in column order, then ordered by the
    importances of an ``ExtraTreesClassifier`` fitted on them, followed by
    the shape of the reduced matrix.  Nothing is returned and the validation
    indices are not used.
    """
    folds = KFold(inputs.shape[0], n_folds=10, shuffle=True)
    for train_indices, val_indices in folds:
        train_y = outputs[train_indices].ravel()

        # Stage 1: chi2 top-k filter; `indices` maps back to word_list.
        chi2_filter = SelectKBest(chi2, num_vars)
        l1_filter = SelectFromModel(l1_step)
        chi2_kept = chi2_filter.fit_transform(inputs[train_indices], train_y)
        indices = chi2_filter.get_support(indices=True)

        # Stage 2: L1 model filter; `new_indices` index into stage-1 output.
        x_new = l1_filter.fit_transform(chi2_kept, train_y)
        new_indices = l1_filter.get_support(indices=True)

        from sklearn.ensemble import ExtraTreesClassifier
        forest = ExtraTreesClassifier()
        forest.fit(x_new, train_y)
        importance = np.argsort(forest.feature_importances_)[::-1]

        surviving = indices[new_indices]
        print([word_list[position] for position in surviving])
        print([word_list[position] for position in surviving[importance]])
        print(x_new.shape)
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Binary toy data whose target is the XOR of the first two columns.
rng = np.random.RandomState(1)
X = rng.randint(0, 2, (200, 20))
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

fs_univariate = SelectKBest(k=10)
fs_modelbased = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='median')

# Fit each selector and print its support mask, with one empty line between
# the two reports.
reports = (
    ('Features selected by univariate selection:', fs_univariate),
    ('Features selected by model-based selection:', fs_modelbased),
)
for position, (title, selector) in enumerate(reports):
    if position:
        print('')
    selector.fit(X, y)
    print(title)
    print(selector.get_support())
def get_C_panel(method, features, labels, C):
    """Return the indices of the features selected for regularisation ``C``.

    ``method(C)`` must return an unfitted estimator.  It is fitted on the
    features as normalised by ``mt.normalize_features(..., normal="scaled")``
    and then wrapped in a prefit ``SelectFromModel``.
    """
    estimator = method(C)
    scaled = mt.normalize_features(features, normal="scaled")
    fitted = estimator.fit(scaled, labels)
    selector = SelectFromModel(fitted, prefit=True)
    # Positional True requests integer indices instead of a boolean mask.
    return selector.get_support(True)
def logistic_dimension(data, label, parameter=1):
    """Reduce ``data`` with an L1-penalised logistic regression.

    Parameters
    ----------
    data : array-like
        Feature matrix.
    label : array-like
        Class labels.
    parameter : float (default=1)
        Passed as ``C`` to the logistic regression.

    Returns
    -------
    new_data
        ``data`` restricted to the selected columns.
    mask
        Integer indices of the selected columns (``get_support(indices=True)``),
        not a boolean mask.
    """
    # solver='liblinear' is stated explicitly: it was the default when this
    # code was written, and the later default solver rejects penalty="l1".
    logistic_ = LogisticRegression(penalty="l1", C=parameter, max_iter=30, solver="liblinear")
    model = SelectFromModel(logistic_)
    new_data = model.fit_transform(data, label)
    mask = model.get_support(indices=True)
    return new_data, mask
#df.TARGET.describe()
# Label-based column slices use .loc; the former .ix indexer no longer exists
# in current pandas and behaves the same for these string labels.
y = df["TARGET"].values
X = df.loc[:, "var3":"var38"].values
X_labels = df.loc[:, "var3":"var38"].columns.values

# Select the features whose LassoLarsCV coefficient magnitude reaches 1e-3,
# using standardised inputs.
lr = LassoLarsCV()
sfm = SelectFromModel(lr, threshold=1e-3)
X_std = StandardScaler().fit_transform(X, y)
sfm.fit(X_std, y)
# `lr` is fitted separately; only the commented-out plot below reads it.
lr.fit(X_std, y)

#feat_imp = pd.DataFrame(lr.coef_, index=X_labels)
#feat_imp.plot(kind="bar", title="Feature Importance", use_index=False)

# One mask lookup instead of calling get_support() once per label.
chosen_feat = list(X_labels[sfm.get_support()])
#chosen_feat = pickle.load(open("feat", "rb"))
print(len(chosen_feat))
chosen_feat

# kaggle forum
df.var3 = df.var3.replace(-999999, 2)
y = df["TARGET"].values
X = df.loc[:, "var3":"var38"].values
X_labels = df.loc[:, "var3":"var38"].columns.values

test = pd.read_csv("processed_test.csv", header=0, index_col="ID")
test.var3 = test.var3.replace(-999999, 2)
X_test = test[chosen_feat].values
def regressor(raster_path, vector_path, vector_field, newRasterfn, global_src_extent=False):
    """Train a random-forest regressor on polygon band means and map it.

    Steps, as implemented below:

    1. Open the raster and the first layer of the vector file and count the
       features whose ``vector_field`` value is not None.
    2. For every raster band, rasterise each labelled polygon and store the
       mean of the covered, non-nodata pixels in ``mean[feature][band]``;
       the field values themselves become the ``training`` targets.
    3. Grid-search a ``RandomForestRegressor`` over ``PARAMETER_GRID`` using
       all bands, scoring each candidate with
       ``(2 * cross-validated R^2 + OOB score) / 3``.
    4. Reduce the bands with ``SelectFromModel`` and repeat the grid search;
       a reduced model replaces the full one only if it beats the best score
       found so far.
    5. Write ``Stats.<name>.txt`` and ``CV.<name>.csv`` next to the output
       raster and, unless ``SEPAL`` is set in the environment, save and show
       the feature-importance and cross-validation plots.
    6. Create ``newRasterfn`` as a single-band Float32 GeoTIFF and fill it
       block by block with the model's predictions; pixels where any input
       band equals 0 are written as 0, which is also the nodata value.

    Parameters
    ----------
    raster_path : str
        Raster opened read-only with GDAL; every band becomes one feature.
    vector_path : str
        Vector file opened with OGR; its first layer holds the polygons.
    vector_field : str
        Attribute holding the target value; features where it is None are
        ignored.
    newRasterfn : str
        Path of the output raster.  The last four characters of its base
        name are stripped to build the names of the side files.
    global_src_extent : bool (default=False)
        If True each band is read once for the whole layer extent, otherwise
        one window is read per feature.

    Returns
    -------
    None

    Notes
    -----
    The body keeps the Python 2 dialect of the original (``xrange``, integer
    division in ``nr_of_feat / 5``).  Print statements are written in the
    single-argument call form, which prints the same text.
    """
    # open raster file
    rds = gdal.Open(raster_path, GA_ReadOnly)
    # check the handle before its first use (it used to be checked afterwards)
    assert rds, "could not open raster file: " + str(raster_path)

    # get geo data
    rgt = rds.GetGeoTransform()

    # get number of bands
    bands = rds.RasterCount

    myBlockSize = rds.GetRasterBand(1).GetBlockSize()
    x_block_size = myBlockSize[0]
    y_block_size = myBlockSize[1]

    # Get image sizes
    cols = rds.RasterXSize
    rows = rds.RasterYSize

    geotransform = rds.GetGeoTransform()
    originX = geotransform[0]
    originY = geotransform[3]
    pixelWidth = geotransform[1]
    pixelHeight = geotransform[5]

    # we need this for file creation
    outRasterSRS = osr.SpatialReference()
    outRasterSRS.ImportFromWkt(rds.GetProjectionRef())

    # get datatype and transform to numpy readable
    data_type = rds.GetRasterBand(1).DataType
    data_type_name = gdal.GetDataTypeName(data_type)
    if data_type_name == "Byte":
        data_type_name = "uint8"

    # open vector file
    vds = ogr.Open(vector_path, GA_ReadOnly)  # TODO maybe open update if we want to write stats
    assert vds, "could not open vector file: " + str(vector_path)

    # get the layer
    vlyr = vds.GetLayer(0)

    # count valid features
    c = 0
    feat = vlyr.GetNextFeature()
    while feat is not None:
        label = feat.GetField(vector_field)
        if label is not None:
            c = c + 1
        feat = vlyr.GetNextFeature()
    # assigned after the loop so it is defined even without labelled features
    nr_of_feat = c
    vlyr.ResetReading()

    print(" Reading the training data ...")
    print(" Number of training samples:" + str(nr_of_feat))

    # create a python list to fill during subsequent loop for mean and training data
    mean = [[0 for _ in range(bands)] for _ in range(nr_of_feat)]
    training = []

    print(" Extracting band values for each training sample ...")

    # extract mean value for each polygon on each band (zonal stats function)
    for x in xrange(1, bands + 1):
        rb = rds.GetRasterBand(x)
        nodata_value = rb.GetNoDataValue()
        if nodata_value:
            nodata_value = float(nodata_value)
            rb.SetNoDataValue(nodata_value)

        # create an in-memory numpy array of the source raster data
        # covering the whole extent of the vector layer
        if global_src_extent:
            # use global source extent
            # useful only when disk IO or raster scanning inefficiencies are your limiting factor
            # advantage: reads raster data in one pass
            # disadvantage: large vector extents may have big memory requirements
            src_offset = bbox_to_pixel_offsets(rgt, vlyr.GetExtent())
            src_array = rb.ReadAsArray(*src_offset)

            # calculate new geotransform of the layer subset
            new_gt = (
                (rgt[0] + (src_offset[0] * rgt[1])),
                rgt[1],
                0.0,
                (rgt[3] + (src_offset[1] * rgt[5])),
                0.0,
                rgt[5]
            )

        mem_drv = ogr.GetDriverByName('Memory')
        driver = gdal.GetDriverByName('MEM')

        # reset feature reading for every band
        vlyr.ResetReading()
        # get the first feature (subsequent features call is in the loop)
        feat = vlyr.GetNextFeature()

        # loop through the features and keep an index i per loop;
        # i only advances for labelled features, matching the rows of `mean`
        i = 0
        while feat is not None:
            label = feat.GetField(vector_field)
            if label is not None:
                # extract the training (we only need once)
                if x == 1:
                    training.append(label)

                # extract the band values
                if not global_src_extent:
                    # use local source extent
                    # fastest option when you have fast disks and well indexed raster (ie tiled Geotiff)
                    # advantage: each feature uses the smallest raster chunk
                    # disadvantage: lots of reads on the source raster
                    src_offset = bbox_to_pixel_offsets(rgt, feat.geometry().GetEnvelope())
                    src_array = rb.ReadAsArray(*src_offset)

                    # calculate new geotransform of the feature subset
                    new_gt = (
                        (rgt[0] + (src_offset[0] * rgt[1])),
                        rgt[1],
                        0.0,
                        (rgt[3] + (src_offset[1] * rgt[5])),
                        0.0,
                        rgt[5]
                    )

                # Create a temporary vector layer in memory
                mem_ds = mem_drv.CreateDataSource('out')
                mem_layer = mem_ds.CreateLayer('poly', None, ogr.wkbPolygon)
                mem_layer.CreateFeature(feat.Clone())

                # Rasterize it
                rvds = driver.Create('', src_offset[2], src_offset[3], 1, gdal.GDT_Byte)
                rvds.SetGeoTransform(new_gt)
                gdal.RasterizeLayer(rvds, [1], mem_layer, burn_values=[1])
                rv_array = rvds.ReadAsArray()

                # Mask the source data array with our current feature
                # we take the logical_not to flip 0<->1 to get the correct mask effect
                # we also mask out nodata values explicitly
                masked = np.ma.MaskedArray(
                    src_array,
                    mask=np.logical_or(
                        src_array == nodata_value,
                        np.logical_not(rv_array)
                    )
                )

                # index array by bands and feature
                ar_row = i
                ar_col = x - 1
                i = i + 1

                # fill the array with the respective values
                mean[ar_row][ar_col] = float(masked.mean())

                # drop the temporary datasets
                rvds = None
                mem_ds = None

            feat = vlyr.GetNextFeature()

    # python lists to numpy array
    training = np.array(training)
    mean = np.array(mean)

    # we do not need the vector layer anymore
    vds = None

    # prepare parameter testing
    PARAMETER_GRID = [
        (50, 75),  # , 100, 125, 150, 200),  # nr. of estimators
        ('auto', 'sqrt'),  # , 'log2'),  # max nr. of features
        (1, 2),  # , 3, 5)  # min nr. of leaves
    ]

    # set a preliminary score
    best_score = float("-inf")
    best_tot_score = float("-inf")
    best_r2 = float("-inf")

    print(" Testing for different parameter sets of the RF classifier with all features ...")
    for n, f, l in product(*PARAMETER_GRID):
        print(" Combination of " + str(n) + " estimators, " + str(f) + " feature subset and " + str(l) + " as minimum number of leaves")

        # create the rf classifier
        rf_initial = RandomForestRegressor(n_estimators=n, max_features=f, min_samples_leaf=l, oob_score=True, n_jobs=-1)

        # Fit our model to training data
        rf_initial.fit(mean, training)
        splits = int(round(nr_of_feat / 5))
        cv_predicted = cross_validation.cross_val_predict(rf_initial, mean, training, cv=splits)
        r2 = r2_score(training, cv_predicted)
        print(" oob model: " + str(rf_initial.oob_score_))
        print(" r^2 model: " + str(r2))

        # combined score: cross-validated R^2 counts twice as much as OOB
        tot_score = (2 * r2 + rf_initial.oob_score_) / 3
        if tot_score > best_tot_score:
            best_tot_score = tot_score
            best_r2 = r2
            best_score = rf_initial.oob_score_
            print(" best oob: " + str(best_score))
            print(" best r^2: " + str(r2))
            rf = rf_initial
            est, features, leaves = n, f, l

    # get OOB score and score
    oob = rf.oob_score_
    score = rf.score(mean, training)

    # print results of best model
    print('-------------------------------- ')
    print(' 1) Best model using all features:')
    print('-------------------------------- ')
    print(' RF paramters: ')
    print(' Number of estimators: ' + str(est))
    print(' Max. number of features: ' + str(features))
    print(' Min. number of samples per leave: ' + str(leaves))
    print('')
    print(' R^2 model score: ' + str(score))
    print(' OOB prediction score: ' + str(oob))
    print(' R^2 cross-val score: ' + str(best_r2))
    print('--------------------------------')

    # create stats and figure files
    outname_fi = os.path.basename(newRasterfn)
    outname_fi = outname_fi.replace(' ', '')[:-4]
    outpath_fi = os.path.dirname(newRasterfn)
    text_file = outpath_fi + '/Stats.' + outname_fi + '.txt'
    fig_file = outpath_fi + '/FeatImp.' + outname_fi + '.jpg'
    fig_file2 = outpath_fi + '/FeatImp.reduced.' + outname_fi + '.jpg'

    # write to stats file (from here on `f` is the file handle, no longer the
    # grid-search loop variable)
    f = open(text_file, 'w')
    f.write('-------------------------------- \n')
    f.write('1) Best model using all features: \n')
    f.write('-------------------------------- \n')
    f.write(' RF parameters: \n')
    f.write(' Number of estimators: ' + str(est) + ' \n')
    f.write(' Max. number of features: ' + str(features) + ' \n')
    f.write(' Min. number of samples per leave: ' + str(leaves) + ' \n')
    f.write('\n')
    f.write(' R^2 model score: ' + str(score) + '\n')
    f.write(' OOB prediction score: ' + str(oob) + ' \n')
    f.write(' R^2 cross-val score: ' + str(best_r2) + '\n')
    f.write('-------------------------------- \n')

    print(' The importance of our bands are:')
    f.write(' The importance of our bands are:\n')

    # get band importance; `bands` becomes the list of 1-based band numbers
    imps = []
    bands = list(range(1, bands + 1))
    for b, imp in zip(bands, rf.feature_importances_):
        print(' Band {b} importance: {imp}'.format(b=b, imp=imp))
        f.write(' Band ' + str(b) + ' importance: ' + str(imp) + '\n')
        imps.append(imp)

    if "SEPAL" not in os.environ:
        # create a plot for the feature importance
        # (b is the last band number, i.e. the number of bands)
        index = np.arange(b)
        bar_width = 0.8
        fig, ax = plt.subplots()
        plt.bar(index + 0.6, imps, bar_width, alpha=0.4, color='b')
        ax.set_xlabel('Band number')
        ax.set_ylabel('Score')
        plt.xticks(index + 1)
        ax.set_title('Feature importance for RF regressor')
        # save plot to file
        plt.savefig(fig_file)
        plt.show()

    print(" Reducing number of input features based on feature importance.")
    feat_subset = SelectFromModel(rf, prefit=True)
    mean_new = feat_subset.transform(mean)
    feat_bool = feat_subset.get_support()
    print(" " + str(len(mean_new[0])) + " features chosen after model selection.")

    print(" Testing for different parameter sets of the RF classifier with the selected features ...")
    # best_tot_score carries over from the first search, so a reduced model
    # is only accepted when it beats the best full model.
    for n, mf, l in product(*PARAMETER_GRID):
        print(" Combination of " + str(n) + " estimators, " + str(mf) + " feature subset and " + str(l) + " as minimum number of leaves")

        # create the rf classifier
        rf_opt = RandomForestRegressor(n_estimators=n, max_features=mf, min_samples_leaf=l, oob_score=True, n_jobs=-1)

        # Fit our model to training data
        rf_opt.fit(mean_new, training)
        splits = int(round(nr_of_feat / 5))
        cv_predicted = cross_validation.cross_val_predict(rf_opt, mean_new, training, cv=splits)
        r2 = r2_score(training, cv_predicted)
        print(" oob model: " + str(rf_opt.oob_score_))
        print(" r^2 model: " + str(r2))

        tot_score = (2 * r2 + rf_opt.oob_score_) / 3
        if tot_score > best_tot_score:
            best_tot_score = tot_score
            best_r2 = r2
            best_score = rf_opt.oob_score_
            print(" best oob : " + str(best_score))
            print(" best r^2 : " + str(best_r2))
            rf = rf_opt
            print(rf)
            est, features, leaves = n, mf, l
            # from now on the model works on the reduced feature matrix
            mean = mean_new

    # get OOB score and score
    oob = rf.oob_score_
    score = rf.score(mean, training)
    print(rf.feature_importances_)

    # `mean` was only replaced by `mean_new` if a reduced model won
    if mean.shape != mean_new.shape:
        print('------------------------------------- ')
        print(' No improvements by feature reduction. ')
        print('------------------------------------- ')
        f.write('------------------------------------- \n')
        f.write(' No improvements by feature reduction. \n')
        f.write('------------------------------------- \n')
    else:
        # print results of best model
        print('-------------------------------- ')
        print(' 1) Best model using reduced set of features:')
        print('-------------------------------- ')
        print(' RF paramters: ')
        print(' Number of estimators: ' + str(est))
        print(' Max. number of features: ' + str(features))
        print(' Min. number of samples per leave: ' + str(leaves) + '\n')
        print(' R^2 model score: ' + str(score))
        print(' OOB prediction score: ' + str(oob))
        print(' R^2 cross-val score: ' + str(best_r2))
        print('-------------------------------- ')

        f.write('\n')
        f.write('\n')
        f.write('-------------------------------- \n')
        f.write(' 2) Best model using reduced set of features: \n')
        f.write('-------------------------------- \n')
        f.write(' RF paramters: \n')
        f.write(' Number of estimators: ' + str(est) + ' \n')
        f.write(' Max. number of features: ' + str(features) + ' \n')
        f.write(' Min. number of samples per leave: ' + str(leaves) + ' \n')
        f.write('')
        f.write(' R^2 model score: ' + str(score) + ' \n')
        f.write(' OOB prediction score: ' + str(oob) + ' \n')
        f.write(' R^2 cross-val score: ' + str(best_r2) + '\n')
        f.write('-------------------------------- \n')

        print(' The importance of our bands are:')
        f.write(' The importance of our bands are:\n')

        # j walks the reduced model's importances, i the original bands
        imps = []
        bands = []
        j = 0
        for i in xrange(len(feat_bool)):
            if feat_bool[i] == True:
                band = i + 1
                # get band importance
                imp = rf.feature_importances_[j]
                print(' Band ' + str(band) + ' importance: ' + str(imp))
                f.write(' Band ' + str(band) + ' importance: ' + str(imp) + '\n')
                j = j + 1
                imps.append(imp)
                bands.append(band)

        if "SEPAL" not in os.environ:
            # create a plot for the feature importance
            index = np.arange(j)
            bar_width = 0.8
            fig, ax = plt.subplots()
            plt.bar(index + 0.6, imps, bar_width, alpha=0.4, color='b')
            ax.set_xlabel('Band number')
            ax.set_ylabel('Score')
            plt.xticks(index + 1, bands)
            #plt.xticks(bands)
            ax.set_title('Feature importance for RF regressor')
            # save plot to file
            plt.savefig(fig_file2)
            plt.show()

    print(" Cross-validating the final model (Leave-5-out CV) ...")
    splits = int(round(nr_of_feat / 5))
    cv_predicted = cross_validation.cross_val_predict(rf, mean, training, cv=splits)
    cv_score = cross_validation.cross_val_score(rf, mean, training, cv=splits, scoring='r2', n_jobs=-1)

    # calculate some quality criteria
    r2 = r2_score(training, cv_predicted)
    mse = mean_squared_error(training, cv_predicted)
    rmse = sqrt(mse)
    mae = mean_absolute_error(training, cv_predicted)
    mape = np.mean(np.abs((training - cv_predicted) / training)) * 100
    evs = explained_variance_score(training, cv_predicted, multioutput='uniform_average')

    print('--------------------------------')
    print(' Final Model cross-validation')
    print('--------------------------------')
    print(" R^2: " + str(r2))
    print(" MAE: " + str(mae))
    print(" MAPE: " + str(mape))
    print(" MSE: " + str(mse))
    print(" RMSE: " + str(rmse))
    print(" EVS: " + str(evs))
    print('--------------------------------')
    print(" Accuracy: %0.2f (+/- %0.2f)" % (cv_score.mean(), cv_score.std() * 2))
    print('--------------------------------')

    f.write('-------------------------------- \n')
    f.write(' Final Model cross-validation\n')
    f.write('--------------------------------\n')
    f.write(" R^2: " + str(r2) + '\n')
    f.write(" MAE: " + str(mae) + '\n')
    f.write(" MAPE: " + str(mape) + '\n')
    f.write(" MSE: " + str(mse) + '\n')
    f.write(" RMSE: " + str(rmse) + '\n')
    f.write(" EVS: " + str(evs) + '\n')
    f.write('--------------------------------\n')
    f.write(" Accuracy: %0.2f (+/- %0.2f)\n" % (cv_score.mean(), cv_score.std() * 2))
    f.write('--------------------------------\n')

    # close our stats file
    f.close()

    # write cross validation data to file (semicolon-separated)
    d = {'measured': training, 'predicted': cv_predicted}
    df = DataFrame(data=d)
    df.to_csv(outpath_fi + '/CV.' + outname_fi + '.csv', sep=';')

    if "SEPAL" not in os.environ:
        # create a cross-val plot
        y = training
        fig, ax = plt.subplots()
        ax.scatter(training, cv_predicted, edgecolors=(0, 0, 0))
        ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
        ax.set_xlabel('Measured')
        ax.set_ylabel('Predicted')
        plt.show()

    print(" Create empty output file ...")
    # create out array
    driver = gdal.GetDriverByName('GTIff')
    outRaster = driver.Create(
        newRasterfn, cols, rows, 1, gdal.GDT_Float32,
        options=[
            # Format-specific creation options.  The comma after BLOCKXSIZE
            # was missing, which silently merged the next two strings into
            # one malformed option.
            'BIGTIFF=IF_SAFER',
            'BLOCKXSIZE=128',  # must be a power of 2
            'BLOCKYSIZE=128',  # also power of 2, need not match BLOCKXSIZE
            # 'COMPRESS=LZW'
        ]
    )
    outRaster.SetGeoTransform((originX, pixelWidth, 0, originY, 0, pixelHeight))
    outband = outRaster.GetRasterBand(1)
    outRaster.SetProjection(outRasterSRS.ExportToWkt())
    outRaster.GetRasterBand(1).SetNoDataValue(0)

    # print " Predicting the model to the dataset and write to output band ..."
    # classify by raster blocksize
    # loop through y direction
    r = 1
    for y in xrange(0, rows, y_block_size):
        if y + y_block_size < rows:
            ysize = y_block_size
        else:
            ysize = rows - y

        # loop through x direction
        for x in xrange(0, cols, x_block_size):
            if x + x_block_size < cols:
                xsize = x_block_size
            else:
                xsize = cols - x

            # create empty stack with one layer per model feature
            img = np.empty((ysize, xsize, len(mean[0])), dtype=data_type_name)

            # loop through the timeseries and fill the stacked array part
            if mean.shape == mean_new.shape:
                # read input according to feature reduction
                j = 0
                for i in xrange(len(feat_bool)):
                    if feat_bool[i] == True:
                        img[:, :, j] = np.array(rds.GetRasterBand(i + 1).ReadAsArray(x, y, xsize, ysize))
                        bands[j] = i + 1
                        j = j + 1
            else:
                # read full input
                for i in xrange(rds.RasterCount):
                    img[:, :, i] = np.array(rds.GetRasterBand(i + 1).ReadAsArray(x, y, xsize, ysize))

            # for later masking
            min_val = np.min(img, axis=2)

            # reshape the stacked array for actual classification
            new_shape = (img.shape[0] * img.shape[1], img.shape[2])
            img_as_array = img.reshape(new_shape)

            # do the classification
            classification = rf.predict(img_as_array)

            # Reshape our classification map
            classification = np.array(classification.reshape(img[:, :, 0].shape))

            # mask out data where one of the values is 0
            classification[min_val == 0] = 0.

            # write part of the array to file
            outband.WriteArray(classification, x, y)
            print(" Run: " + str(r))
            r = r + 1
train_Y = label_dict.transform(train_Y) pd.to_pickle([train_X, train_Y], util.features_prefix + "/size_XY.pkl") else: [train_X, train_Y] = pd.read_pickle(util.features_prefix + "/size_XY.pkl") # 99 + 380 + 7*5*2 + 2 print len(train_X[0]), len(train_Y) if os.path.exists(util.features_prefix + "/salary_XY.pkl") is False: train_Y = list(train["predict_salary"].values) label_dict = LabelEncoder().fit(train_Y) label_dict_classes = len(label_dict.classes_) train_Y = label_dict.transform(train_Y) pd.to_pickle([train_X, train_Y], util.features_prefix + "/salary_XY.pkl") else: [train_X, train_Y] = pd.read_pickle(util.features_prefix + "/salary_XY.pkl") 99 + 380 + 7*5*2 + 2 from sklearn.tree import DecisionTreeClassifier clf = DecisionTreeClassifier(max_depth=3) clf.fit(np.array(train_X[:100]), np.array(train_Y[:100])) print clf.predict(np.array(train_X[100:200])) print train_Y[100:200] from sklearn.feature_selection import SelectFromModel model = SelectFromModel(clf, prefit=True) list_1 = model.get_support() for i in range(len(list_1)): if list_1[i] == True: print i print 'pickle end'
step=10, verbose=5) rfe_selector.fit(X_norm, y) rfe_support = rfe_selector.get_support() rfe_feature = X.loc[:, rfe_support].columns.tolist() print(str(len(rfe_feature)), 'selected features') ## 4) Lasso from sklearn.feature_selection import SelectFromModel from sklearn.linear_model import LogisticRegression embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"), max_features=num_feats) embeded_lr_selector.fit(X_norm, y) embeded_lr_support = embeded_lr_selector.get_support() embeded_lr_feature = X.loc[:, embeded_lr_support].columns.tolist() print(str(len(embeded_lr_feature)), 'selected features') ## 5) Tree-based SelectFromModel # RandomForest is used to calculate feature importance using node impurities in each decision tree; final feature # importance is calculated as average of all decision trees from sklearn.feature_selection import SelectFromModel from sklearn.ensemble import RandomForestClassifier embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats) embeded_rf_selector.fit(X, y) embeded_rf_support = embeded_rf_selector.get_support()
x_test_trans = sel.transform(x_test) vali_auc = np.mean( cross_val_score(clf, x_train_trans, y_train, cv=skf, scoring='roc_auc')) clf.fit(x_train_trans, y_train) predict_result = clf.predict_proba(x_test_trans)[:, 1] total_predict += predict_result test_auc = roc_auc_score(y_test, predict_result) soft_rank = [vali_auc if i == True else 0 for i in sel.get_support()] record.append([ clf.__class__.__name__, sum(sel.get_support()), test_auc, times, soft_rank ]) #print(clf.__class__.__name__ +" "+ str(sum(sel.get_support())) +" "+ str(test_auc)) total_test_auc = roc_auc_score(y_test, total_predict) record.append(["merge", 0, total_test_auc, times, []]) df_record2 = pd.DataFrame(record) df_record2.columns = ['clf', 'FeatureCount', 'AUC', 'Time', 'SoftFeatureRank'] # get mean df_record2.groupby('clf')['AUC', 'FeatureCount'].agg({ 'AUC': 'mean', 'FeatureCount': 'mean'
# Target column and feature names used for the importance ranking.
y_train = train['target']
feat_labels = X_train.columns

# Rank all features by random-forest importance, most important first.
rf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("{:2d}) {!s:<30} {:f}".format(f + 1, feat_labels[indices[f]], importances[indices[f]]))

# Keep the features whose importance reaches the median importance.
sfm = SelectFromModel(rf, threshold='median', prefit=True)
print('Number of features before selection: {}'.format(X_train.shape[1]))
n_features = sfm.transform(X_train).shape[1]
print('Number of features after selection: {}'.format(n_features))
selected_vars = list(feat_labels[sfm.get_support()])

# Restrict the training frame to the selected features plus the target.
train = train[selected_vars + ['target']]

# The scaler is fitted on the feature columns; the transformed array itself
# is not stored.
scaler = StandardScaler()
scaler.fit_transform(train.drop(['target'], axis=1))
# %% Prepare data for logistic regression y = df["is_red"] X = df.drop(["is_red", 'GH', 'GR', 'L', 'O', 'S', 'V'], axis=1) X.assign(y=y).to_csv(f"../data/2020-10-21_vcf-model.csv") # %% Calculate variable frequency for plotting var_freq = X.sum() / len(X) var_freq.rename("variant_frequency").to_csv( f"../data/2020-10-21_variant-freq.csv") # %% Fit logistic regression lr = LogisticRegression(penalty="l1", solver="liblinear") lr.fit(X, y) model = SelectFromModel(lr, prefit=True) indices = model.get_support() colnames = X.columns[indices] X_new = X.loc[:, indices] X_new.assign(y=y).to_csv( f"../data/2020-10-21_logistic-regression-lasso-selected-features.csv") coef_df = pd.DataFrame(lr.coef_, columns=X.columns) ors = coef_df.squeeze().transform("exp") ors = ors[ors != 1] ors.sort_values().tail(20) ors.to_csv(f"../data/2020-10-21_odds-ratios.csv") # %% Figure 2: Plot ROC curve prefix = f"2020-10-21_vcf_logistic-regression-model" suffixes = [
def feature_selection(context,
                      df_artifact,
                      k=2,
                      min_votes=0.5,
                      label_column: str = 'Y',
                      stat_filters=['f_classif', 'mutual_info_classif',
                                    'chi2', 'f_regression'],
                      model_filters={'LinearSVC': 'LinearSVC',
                                     'LogisticRegression': 'LogisticRegression',
                                     'ExtraTreesClassifier': 'ExtraTreesClassifier'},
                      max_scaled_scores=True):
    """Applies selected feature selection statistical functions or models on
    our 'df_artifact'.

    Each statistical function or model will vote for its best K selected
    features. If a feature has >= 'min_votes' votes, it will be selected.

    The default ``stat_filters`` / ``model_filters`` objects are only read,
    never mutated, so sharing them between calls is safe.

    :param context:           the function context
    :param df_artifact:       input dataset; ``str(df_artifact)`` must be a
                              path ending in 'csv', 'parquet' or 'pq'
    :param k:                 number of top features to select from each
                              statistical function or model
    :param min_votes:         minimal number of votes (from a model or by
                              statistical function) needed for a feature to be
                              selected. Can be specified by percentage of votes
                              (float) or absolute number of votes (int)
    :param label_column:      ground-truth (y) labels
    :param stat_filters:      statistical functions to apply to the features
                              (from sklearn.feature_selection)
    :param model_filters:     models to use for feature evaluation, can be
                              specified by model name (ex. LinearSVC),
                              formalized json (contains 'CLASS', 'FIT', 'META')
                              or a path to such json file.
    :param max_scaled_scores: produce feature scores table scaled with
                              max_scaler
    :raises ValueError:       if the dataset path has an unsupported extension
    """
    # Read input DF
    df_path = str(df_artifact)
    context.logger.info(f'input dataset {df_path}')
    if df_path.endswith('csv'):
        df = pd.read_csv(df_path)
    elif df_path.endswith(('parquet', 'pq')):
        df = pd.read_parquet(df_path)
    else:
        # Previously this fell through and failed later on an unbound `df`.
        raise ValueError(f'unsupported dataset format: {df_path}')

    # Set feature vector and labels
    y = df.pop(label_column)
    X = df

    # Create selected statistical estimators
    stat_functions_list = {
        stat_name: SelectKBest(
            create_class(f'sklearn.feature_selection.{stat_name}'), k=k)
        for stat_name in stat_filters
    }
    # Score functions that only accept non-negative feature values.
    requires_abs = ['chi2']

    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns)
    for stat_name, stat_func in stat_functions_list.items():
        try:
            # Compute statistics; only the filters listed in `requires_abs`
            # get the absolute values, everything else sees the raw data.
            params = (abs(X), y) if stat_name in requires_abs else (X, y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(index=X.columns,
                                   columns=[stat_name],
                                   data=stat.scores_)
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K Best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features
        except Exception as e:
            # Best effort: a failing filter is logged and simply casts no vote.
            context.logger.info(
                f"Couldn't calculate {stat_name} because of: {e}")

    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(
        all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        is_name = isinstance(model, str)
        if is_name and '.json' in model:
            with open(model, 'r') as model_file:
                current_model = json.load(model_file)
            ClassifierClass = create_class(current_model["META"]["class"])
            selected_models[model_name] = ClassifierClass(
                **current_model["CLASS"])
        elif is_name and model in all_sklearn_estimators:
            # Look the estimator up by the requested class name (the value),
            # not by the user-chosen key.
            selected_models[model_name] = all_sklearn_estimators[model]()
        else:
            try:
                current_model = json.loads(model) if is_name else model
                ClassifierClass = create_class(current_model["META"]["class"])
                selected_models[model_name] = ClassifierClass(
                    **current_model["CLASS"])
            except Exception:
                context.logger.info(f'unable to load {model}')

    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():
        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X, y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()

        # Collect model feature importance
        fitted = select_from_model.estimator_
        if hasattr(fitted, 'coef_'):
            scores = np.asarray(fitted.coef_)
        elif hasattr(fitted, 'feature_importances_'):
            scores = np.asarray(fitted.feature_importances_)
        else:
            continue
        # 2-D coefficients keep the historical "first row" behavior; 1-D
        # vectors are used whole instead of being collapsed to one scalar.
        if scores.ndim > 1:
            scores = scores[0]

        stat_df = pd.DataFrame(index=X.columns,
                               columns=[model_name],
                               data=scores)
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)

    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(key='feature_scores',
                        df=result_matrix_df,
                        local_path='feature_scores.parquet',
                        format='parquet')
    if max_scaled_scores:
        normalized_df = result_matrix_df.replace([np.inf, -np.inf],
                                                 np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(data=normalized_df,
                                     columns=result_matrix_df.columns,
                                     index=result_matrix_df.index)
        context.log_dataset(key='max_scaled_scores_feature_scores',
                            df=normalized_df,
                            local_path='max_scaled_scores_feature_scores.parquet',
                            format='parquet')

    # Create feature count DataFrame: each score column is overwritten with
    # a 0/1 vote for that filter.
    for test_name in selected_features_agg:
        voted = set(selected_features_agg[test_name])
        result_matrix_df[test_name] = [
            1 if x in voted else 0 for x in X.columns
        ]
    # Sum only the vote columns so that a leftover raw-score column (from a
    # filter that failed after scoring) cannot inflate the count.
    vote_columns = list(selected_features_agg)
    result_matrix_df.loc[:, 'num_votes'] = \
        result_matrix_df[vote_columns].sum(axis=1)
    context.log_dataset(key='selected_features_count',
                        df=result_matrix_df,
                        local_path='selected_features_count.parquet',
                        format='parquet')

    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f'votes needed to be selected: {votes_needed}')

    # Create final feature dataframe
    selected_features = result_matrix_df[
        result_matrix_df.num_votes >= votes_needed].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df, y], axis=1)
    context.log_dataset(key='selected_features',
                        df=final_df,
                        local_path='selected_features.parquet',
                        format='parquet')
param_grid=param_grid, scoring=make_scorer(roc_auc_score), # n_jobs=4, iid=False, cv=5) start_time = time.time() gsearch.fit(X_trn, y_trn) elapsed_time = time.time() - start_time print elapsed_time gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_ sfm = SelectFromModel(lr, threshold=0.2) sfm.fit(trn[use_columns], trn[target]) support = sfm.get_support() new_use_columns = [c for c, s in zip(use_columns, support) if s] del_columns = [c for c, s in zip(use_columns, support) if not s] use_columns = new_use_columns trn.drop(del_columns, axis=1, inplace=True) tst.drop(del_columns, axis=1, inplace=True) # sfm = SelectFromModel(lr, threshold=0.2) # sfm.fit(trn[all_tfidf_columns], trn[target]) # sfm.fit(trn[all_tfidf_columns], trn[target]) # support = sfm.get_support() # new_tfidf_columns = [c for c, s in zip(all_tfidf_columns, support) if s]
# In[23]:

from sklearn.feature_selection import SelectFromModel

# Selector built on the random forest classifier: keeps the features whose
# importance is above 0.01.
sfm = SelectFromModel(clf, threshold=0.01)

# Train the selector
sfm.fit(X_train, y_train)


# In[24]:

# Names of the features the selector kept.
selected_features = [
    feat_labels[position] for position in sfm.get_support(indices=True)
]

data_selected = data[selected_features]
data_selected.head()


# In[25]:

selected_features


# In[29]:

data_selected.set_index('battery_power').to_csv('scale.csv')


# In[26]:
# model = ElasticNet(l1_ratio = 0.5)
# model.fit(features, labels)
# print(list(zip(features, model.coef_.tolist())))

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# TRANSFORMER METHODS
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Fit one SelectFromModel per linear model (LASSO, Ridge, ElasticNet, in that
# order) and print the features each one keeps. `model` and `sfm` are left
# bound to the last pair, i.e. the ElasticNet one.
for _heading, _estimator_cls in (
        ("LASSO Results", Lasso),
        ("Ridge Results", Ridge),
        ("ElasticNet Results", ElasticNet),
):
    model = _estimator_cls()
    sfm = SelectFromModel(model)
    sfm.fit(features, labels)
    print(_heading)
    print(list(features[sfm.get_support(indices=True)]))
# Train the selector
rF.fit(train, response)


# In[66]:

features = train.columns.tolist()


# In[67]:

# Column names at the positions the selector kept.
model_features = [features[f_index]
                  for f_index in rF.get_support(indices=True)]


# In[68]:

# Two extra columns are appended after the selected ones.
model_features.extend(['SK_ID_CURR', 'TARGET'])
model_features


# In[69]:
# loop through algorithms and append the score into the list model.fit(X_train, y_train) prediction5 = model5.predict(X_test) #score = model.score(X_test, y_test) print("The accuracy score of ensemble is {:.2%}".format( accuracy_score(Y_test, prediction5))) print(classification_report(Y_test, prediction5)) # %% #%% ### ENSEMBLE -2 from sklearn.ensemble import RandomForestClassifier from sklearn.feature_selection import SelectFromModel rfc = RandomForestClassifier(n_estimators=200, random_state=5678) model_feature_selection = SelectFromModel(rfc) model_feature_selection.fit(X, y) model_feature_selection.get_support() selected_features = X.columns[model_feature_selection.get_support()] print("Number of selected features: ", len(selected_features)) print("Selected features are: ", list(selected_features)) # %% ## Modifying our test data and splitting X = X[selected_features] X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=1122) print("Train data shape: X:", X_train.shape, ", Y: ", Y_train.shape) print("Test data shape: X:", X_test.shape, ", Y: ", Y_test.shape) #%% model2_0 = DecisionTreeClassifier(random_state=687) model2_0.fit(X_train, Y_train)
predictions = predictions prec = sklearn.metrics.precision_score(ground_truth, predictions) rec = sklearn.metrics.recall_score(ground_truth, predictions) f1 = sklearn.metrics.f1_score(ground_truth, predictions) print "prec: " + str(prec) print "rec: " + str(rec) print "f1: " + str(f1) return f1 # Build linear SVM classifier, l1 regularization to perform implicit feature selection lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, Y_train) model = SelectFromModel(lsvc, prefit=True) features_selected = [elem for selected, elem in zip(model.get_support(), data_loader.get_feature_names()) if selected] print "Feature names:" print features_selected Y_pred = lsvc.predict(X_test) score = custom_scorer(Y_test, Y_pred) ## Conclusions: # # prec: 0.714285714286 # rec: 0.111607142857 # f1: 0.19305019305 # # By L1 regularize feature specific weights, we achieve a feature selection since the weights of some features turn to zero. By inspecting
from sklearn.feature_selection import SelectFromModel
from tensorflow.python.keras.utils import to_categorical

x_train, y_train = Dataloader().getTrain()
x_test, y_test = Dataloader().getTest()

y_train = to_categorical(y_train)

# Drop the two timestamp columns from both feature frames (train first).
for _frame in (x_train, x_test):
    for _column in ("start_time", "end_time"):
        _frame.pop(_column)

print(x_train.shape)
print(y_train.shape)

clf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)

# Fit the classifier on the full feature set.
clf.fit(x_train, y_train)

# NOTE(review): SelectFromModel.fit trains its own clone of `clf`, so the
# 1000-tree forest is trained a second time here. Passing prefit=True would
# reuse the fitted `clf`; confirm nothing downstream relies on `sel.estimator_`.
sel = SelectFromModel(clf)
sel.fit(x_train, y_train)

selected_feat = x_train.columns[sel.get_support()]
len(selected_feat)
print(selected_feat)


def getSelectedFeature():
    """Return the module-level index of column names kept by the selector."""
    return selected_feat
def split_and_encode_Xy(X,
                        y,
                        encoding='le',
                        feat_scaler=True,
                        tgt_scaler=True,
                        freqs=None,
                        dummy_cols=10,
                        ohe_dates=False,
                        test_size=.25,
                        feat_select=True,
                        shuffle=True,
                        enc_Xy=False,
                        X_test=None,
                        scoring='r2'):
    """
    Splits X, y into train and test sub sets, encode them

    Steps, in order: optional train/test split, date-feature expansion,
    categorical encoding, median imputation of missing values, optional
    feature scaling, optional target scaling and optional model-based
    feature selection.

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source; the nesting of the branches below is a best guess and must be
    confirmed against the original file.

    ---
    encoding: 'le' or 'ohe'; any other value raises ValueError
    feat_scaler: if truthy, a StandardScaler is created for the features
    tgt_scaler: if truthy, the target is scaled (MinMaxScaler when scoring
        is 'neg_rmsle', StandardScaler otherwise)
    freqs, dummy_cols, ohe_dates: forwarded to the `lc` encoding helpers
    test_size, shuffle: forwarded to train_test_split;
        set shuffle to False to preserve items order
    feat_select: if truthy and more than 10 columns remain after encoding,
        a SelectFromModel(LinearSVR) selector is created
    enc_Xy: if truthy, no split is made and X, y are encoded as a whole
    X_test: optional test features, used when enc_Xy is truthy
        (overwritten by the split otherwise)
    scoring: only compared against 'neg_rmsle' here

    Returns a dict with keys "scalers" (feature scaler, target scaler),
    "data" (X_train, X_test, y_train, y_test) and, when feature selection
    ran, "f_selector".
    """
    X_train, y_train, y_test = (None, None, None)

    # do not shuffle the data before splitting to respect row order
    if not enc_Xy:
        # check X, y are valid dataframes or numpy arrays...
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, shuffle=shuffle)
    else:
        print()
        print("Encoding full data set 'X' -> 'X_train'")
        X_train = X
        y_train = y

    print("Let's have a look at the first row and output")
    print("X_train\n", X_train.head())
    print("y_train\n", y_train.head())
    print()

    # Expand datetime columns into date features (detected on the full X).
    if list(X.select_dtypes(include=["datetime"]).columns):
        print("datetime type found.")
        X_train = lc.get_date_features(X_train, freqs)
        if X_test is not None:
            X_test = lc.get_date_features(X_test, freqs)
        # print(X_train["Month"].head(3))

    if encoding == 'le':
        X_train = lc.dummy_encode(X_train.copy()).astype(np.float32)
        if X_test is not None:
            X_test = lc.dummy_encode(X_test.copy()).astype(np.float32)
    elif encoding == 'ohe':
        # do this for mixed label-onehot encoding !
        # X_train.reset_index(drop=True, inplace=True)
        # X_test.reset_index(drop=True, inplace=True)
        X_train = lc.get_dummies_or_label_encode(X_train.copy(),
                                                 dummy_cols=dummy_cols,
                                                 ohe_dates=ohe_dates).astype(
                                                     np.float32)
        # print("oheencoded X_train['month'] \n", X_train["Month"].head(3))
        if X_test is not None:
            X_test = lc.get_dummies_or_label_encode(
                X_test.copy(), dummy_cols=dummy_cols,
                ohe_dates=ohe_dates).astype(np.float32)
            # presumably aligns X_test's one-hot columns with X_train's;
            # verify against the helper
            X_test = eu.reorder_ohencoded_X_test_columns(X_train, X_test)
    else:
        raise ValueError("%r is not a valid value for var 'encoding', \n"
                         "valid values are in ['le', 'ohe']" % encoding)

    print()
    # Fill missing values with each frame's own column medians.
    if X_train.isnull().values.any():
        X_train = X_train.fillna(X_train.median())
    if X_test is not None and X_test.isnull().values.any():
        X_test = X_test.fillna(X_test.median())

    print("After encoding, first row and output")
    print("X_train\n", X_train.head())
    print("X_train.columns\n", list(X_train.columns))
    print("y_train\n", y_train.head())
    print()

    scalers = (None, None)
    data_and_scalers = {"scalers": scalers}

    if feat_scaler:
        print("scaling train and test data")
        scaler = StandardScaler()
        # you're going to perform scaling at training time before finalization
        if not enc_Xy:
            X_train_scaled = scaler.fit_transform(X_train)
            X_train = DataFrame(data=X_train_scaled,
                                columns=X_train.columns,
                                index=X_train.index)
            print()
            print("X_train shape:", X_train.shape)
            if X_test is not None:
                # test data reuses the statistics fitted on the train data
                X_test_scaled = scaler.transform(X_test)
                X_test = DataFrame(data=X_test_scaled,
                                   columns=X_test.columns,
                                   index=X_test.index)
                print("X_test shape:", X_test.shape)
            print()
            print("After scaling...")
            print("X_train\n", X_train[:1])
            print("X_train type", type(X_train))
            if X_test is not None:
                print("X_test\n", X_test[:1])
                print("X_test type", type(X_test))
            print()
        # NOTE(review): with enc_Xy truthy the scaler stored here appears to
        # be returned unfitted; confirm.
        scalers = (scaler, None)
        data_and_scalers["scalers"] = scalers

    print("scoring:", scoring)
    # tgt_scaler = False if scoring == 'neg_rmsle' else True
    # standard scaling introduces negative values,
    # which can't be fed to log, hence to rmsle
    if tgt_scaler:
        print("Scaling target...")
        if scoring != 'neg_rmsle':
            y_scaler = StandardScaler()
            y_train = y_scaler.fit_transform(y_train.values.reshape(
                -1, 1)).ravel()
        else:
            y_scaler = MinMaxScaler()
            # NOTE(review): unlike the branch above, the result is not
            # flattened with ravel(); confirm this is intended.
            y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
        print("y_train and its type\n", (y_train[:1], type(y_train)))
        if not enc_Xy:
            if scoring != 'neg_rmsle':
                y_test = y_scaler.transform(y_test.values.reshape(-1,
                                                                  1)).ravel()
            else:
                # NOTE(review): y_scaler is re-fitted on y_test here, so the
                # scaler returned to the caller carries the test-set range.
                # Possibly deliberate to keep y_test non-negative; confirm.
                y_test = y_scaler.fit_transform(y_test.values.reshape(-1, 1))
            print("y_test and its type\n", (y_test[:3], type(y_test)))
        scalers = (scalers[0], y_scaler)
        data_and_scalers["scalers"] = scalers
    print()

    # this works for classifiers
    # featsel_tuple = eu.create_feature_selector(X_train, None, seed)
    if feat_select and X_train.shape[1] > 10:
        lsvr = LinearSVR(max_iter=1e4)
        lsvr = lsvr.set_params(C=0.01,
                               loss="squared_epsilon_insensitive",
                               dual=False)
        # threshold=[1e-2, 1e-1] or in ["mean", "median"]
        thsd = "median"  # "median", "median"
        featselector = SelectFromModel(lsvr, threshold=thsd)
        # tscv_fs = TimeSeriesSplit(n_splits=5)
        # featselector = RFECV(lsvr, step=1, cv=tscv_fs)
        data_and_scalers["f_selector"] = featselector
        if not enc_Xy:
            # featselector = featsel_tuple[1]
            X_train_selected = featselector.fit_transform(X_train, y_train)
            xtr_indices = featselector.get_support()
            X_train = DataFrame(data=X_train_selected,
                                columns=X_train.columns[xtr_indices],
                                index=X_train.index)
            print("After feature selection...")
            print("X_train shape:", X_train.shape)
            if X_test is not None:
                X_test_selected = featselector.transform(X_test)
                xtt_indices = featselector.get_support()
                X_test = DataFrame(data=X_test_selected,
                                   columns=X_test.columns[xtt_indices],
                                   index=X_test.index)
                print("X_test shape:", X_test.shape)

    data_and_scalers["data"] = (X_train, X_test, y_train, y_test)
    return data_and_scalers
def select_from_model(df):
    """Return the names of the columns kept by a random-forest selector.

    The last column of ``df`` is used as the target and every other column
    as a feature. A ``RandomForestClassifier(random_state=9)`` is fitted
    through ``SelectFromModel`` with its default threshold.

    Parameters
    ----------
    df : pandas DataFrame
        Feature columns followed by one target column.

    Returns
    -------
    list
        Names of the selected feature columns, in their original order.
    """
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    clf = RandomForestClassifier(random_state=9)
    model = SelectFromModel(clf)
    # Only the fitted support mask is needed; the original called
    # fit_transform and threw the transformed matrix away.
    model.fit(X, y)
    return X.columns.values[model.get_support()].tolist()
class ExtraTreeBasedSelectorRegression(Transformer):
    """Feature selector that keeps the columns an ExtraTreesRegressor rates
    at or above the mean feature importance.

    Several hyperparameters may arrive as strings (``'False'``, ``'None'``,
    ``'15'``); they are converted to their real types in :meth:`operate`
    right before the estimator is built.
    """

    def __init__(self, n_estimators=100, criterion='mse', min_samples_leaf=1,
                 min_samples_split=2, max_features=1., bootstrap='False',
                 max_leaf_nodes='None', max_depth='15',
                 min_weight_fraction_leaf=0., oob_score=False, n_jobs=-1,
                 random_state=1, verbose=0):
        """Store the hyperparameters.

        :raises ValueError: if ``criterion`` is not one of
            'mse', 'friedman_mse', 'mae'.
        """
        super().__init__("extra_trees_based_selector_regression", 31)
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'

        self.n_estimators = n_estimators
        self.estimator_increment = 10
        if criterion not in ("mse", "friedman_mse", "mae"):
            raise ValueError("'criterion' is not in ('mse', 'friedman_mse', "
                             "'mae'): %s" % criterion)
        self.criterion = criterion

        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def operate(self, input_datanode, target_fields=None, sample_weight=None):
        """Select features among ``target_fields`` and return a new DataNode.

        :param input_datanode: node exposing ``feature_types``, ``data``
            (an ``(X, y)`` pair), ``task_type`` and ``trans_hist``
        :param target_fields: column indices to run the selection on;
            defaults to every column whose type is in ``self.input_type``
        :param sample_weight: forwarded to ``ExtraTreesRegressor.fit``
        :return: DataNode holding the selected target columns followed by
            all non-target columns, with matching feature types
        """
        feature_types = input_datanode.feature_types
        X, y = input_datanode.data
        if target_fields is None:
            target_fields = collect_fields(feature_types, self.input_type)
        X_new = X[:, target_fields]

        # Columns that do not take part in the selection are passed through.
        n_fields = len(feature_types)
        target_set = set(target_fields)
        irrevalent_fields = [
            field_id for field_id in range(n_fields)
            if field_id not in target_set
        ]

        # NOTE(review): `self.model` is assumed to be initialised (to None)
        # by the Transformer base class - confirm.
        if self.model is None:
            from sklearn.feature_selection import SelectFromModel
            from sklearn.ensemble import ExtraTreesRegressor

            # Hyperparameters may have been supplied as strings.
            self.n_estimators = int(self.n_estimators)
            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_samples_split = int(self.min_samples_split)
            self.max_features = float(self.max_features)
            self.bootstrap = check_for_bool(self.bootstrap)
            self.n_jobs = int(self.n_jobs)
            self.verbose = int(self.verbose)

            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)

            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)

            self.min_weight_fraction_leaf = float(
                self.min_weight_fraction_leaf)

            # Size the per-split feature budget from the matrix that is
            # actually fitted (X_new), so it can never exceed its width.
            num_features = X_new.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(num_features / 2), max_features))

            estimator = ExtraTreesRegressor(
                n_estimators=self.n_estimators, criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs, verbose=self.verbose,
                random_state=self.random_state)

            estimator.fit(X_new, y, sample_weight=sample_weight)
            self.model = SelectFromModel(estimator=estimator,
                                         threshold='mean',
                                         prefit=True)

        _X = self.model.transform(X_new)
        is_selected = self.model.get_support()

        irrevalent_types = [feature_types[idx] for idx in irrevalent_fields]
        # `is_selected` is aligned with the columns of X_new, i.e. with the
        # position inside `target_fields`, not with the original field id.
        selected_types = [
            feature_types[idx]
            for position, idx in enumerate(target_fields)
            if is_selected[position]
        ]
        selected_types.extend(irrevalent_types)

        new_X = np.hstack((_X, X[:, irrevalent_fields]))
        new_feature_types = selected_types
        output_datanode = DataNode((new_X, y), new_feature_types,
                                   input_datanode.task_type)
        output_datanode.trans_hist = input_datanode.trans_hist.copy()
        output_datanode.trans_hist.append(self.type)
        self.target_fields = target_fields.copy()

        return output_datanode

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        """Return the search space for the given optimizer.

        :param dataset_properties: unused
        :param optimizer: 'smac' returns a ConfigurationSpace, 'tpe' returns
            a hyperopt-style dict; any other value returns None
        """
        if optimizer == 'smac':
            cs = ConfigurationSpace()

            n_estimators = Constant("n_estimators", 100)
            criterion = CategoricalHyperparameter(
                "criterion", ["mse", "friedman_mse"])
            max_features = UniformFloatHyperparameter(
                "max_features", 0.1, 1.0, default_value=1.0, q=0.05)

            max_depth = UnParametrizedHyperparameter(name="max_depth",
                                                     value="15")
            max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes",
                                                          "None")

            min_samples_split = UniformIntegerHyperparameter(
                "min_samples_split", 2, 20, default_value=2)
            min_samples_leaf = UniformIntegerHyperparameter(
                "min_samples_leaf", 1, 20, default_value=1)
            min_weight_fraction_leaf = Constant('min_weight_fraction_leaf',
                                                0.)

            bootstrap = CategoricalHyperparameter(
                "bootstrap", ["True", "False"], default_value="False")

            cs.add_hyperparameters([
                n_estimators, criterion, max_features, max_depth,
                max_leaf_nodes, min_samples_split, min_samples_leaf,
                min_weight_fraction_leaf, bootstrap
            ])

            return cs
        elif optimizer == 'tpe':
            from hyperopt import hp
            space = {
                'n_estimators': 100,
                # Regression criteria only: the classifier values
                # ('gini', 'entropy') are rejected by __init__.
                'criterion': hp.choice('etsreg_criterion',
                                       ['mse', 'friedman_mse']),
                'max_features': hp.uniform('etsreg_max_features', 0, 1),
                'max_depth': "15",
                'max_leaf_nodes': "None",
                'min_samples_leaf':
                    hp.randint('etsreg_samples_leaf', 20) + 1,
                'min_samples_split':
                    hp.randint('etsreg_samples_split', 19) + 2,
                # NOTE(review): __init__ has no `min_impurity_decrease`
                # parameter - confirm how the caller consumes this key.
                'min_impurity_decrease': 0.,
                'bootstrap': hp.choice('etsreg_bootstrap',
                                       ['True', 'False'])
            }
            return space
###--------------------------------------###
###------------COMPARE SCORES------------###
###--------------------------------------###

# One report line per feature set, in the order the features were added.
for _prefix, _accuracy in (
        ('Accuracy without any features, but stars: ', score_nofeatures),
        ('Accuracy after adding basic features: ', score_basicfeatures),
        ('Accuracy after adding readability features: ', score_readable),
        ('Accuracy after adding pos features: ', score_pos),
        ('Accuracy after adding review counts features: ',
         score_review_count),
        ('Accuracy after adding sentiment features from Vader: ',
         score_sentiment_vader),
        ('Accuracy after adding review sentiment features from textblob: ',
         score_sentiment_textblob),
        ('Accuracy after adding rating deviance features: ',
         score_rating_deviance),
        ('Accuracy after adding concreteness feature: ', score_concreteness),
):
    print(_prefix + str(_accuracy * 100) + '%')

# Maybe use sklearn.feature_selection
from sklearn.feature_selection import SelectFromModel

df = df.dropna()
df = df.drop('text', axis=1)

y = df['useful_dummy'].values
X = df.drop('useful_dummy', axis=1).values

# Fit a logistic-regression based selector on the full data.
selector = SelectFromModel(estimator=LogisticRegression())
selector = selector.fit(X, y)

# Bare expressions kept for interactive inspection.
selector.estimator_.coef_
selector.threshold_
selector.get_support()
selector.transform(X)
data.append(tmp[1:]) import numpy as np data = np.array(data, dtype=np.float32).T labels = np.array(labels, dtype=np.float32) from sklearn.model_selection import train_test_split data_train, data_test, labels_train, labels_test = train_test_split( data, labels, test_size=0.2, stratify=labels) from sklearn.ensemble import RandomForestClassifier from sklearn.feature_selection import SelectFromModel clf = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=5) clf = clf.fit(data_train, labels_train) model = SelectFromModel(clf, threshold=0.01, prefit=True) data_train = model.transform(data_train) features = [genes[i] for i in model.get_support(indices=True)] from scipy.cluster.hierarchy import linkage Z = linkage(data_train.T, method='single', metric='correlation') import matplotlib.pyplot as plt from scipy.cluster.hierarchy import dendrogram plt.xlabel('Gene Symbol') plt.ylabel('Distance') dendrogram(Z, labels=features, color_threshold=0.6) plt.axhline(y=0.6, c='k', linestyle='--') plt.show()
test = odps.get_table('jz_combine_tl_test_6_2').to_df().to_pandas()

predict_features = ['sys', 'dia', 'tl', 'hdl', 'ldl']
# Every training column except the id and the prediction targets.
use_features = [
    t for t in train.columns
    if not (t == 'vid' or t in predict_features)
]

x_train = train.loc[:, use_features]
label = train['tl']

gbdt = GradientBoostingRegressor(random_state=1)
rf = RandomForestRegressor(random_state=1)
l2 = RidgeCV()

# Gradient boosting: keep features with importance >= 0.001.
sfm_gbdt = SelectFromModel(gbdt, threshold=0.001)
sfm_gbdt.fit_transform(x_train, label)
gbdt_mask = sfm_gbdt.get_support()
gbdt_features = set(x_train.columns[gbdt_mask])
print('*************************************')
print(gbdt_features)

# Random forest: same threshold.
sfm_rf = SelectFromModel(rf, threshold=0.001)
sfm_rf.fit_transform(x_train, label)
rf_mask = sfm_rf.get_support()
rf_features = set(x_train.columns[rf_mask])
print('*************************************')
print(rf_features)

# Features both tree models agree on.
print(gbdt_features & rf_features)

# Ridge: keep features with |coefficient| >= 0.5.
sfm_l2 = SelectFromModel(l2, threshold=0.5)
sfm_l2.fit_transform(x_train, label)
l2_mask = sfm_l2.get_support()
l2_features = set(x_train.columns[l2_mask])
print('*************************************')
print(l2_features)
DecisionTreeClassifier(max_depth=10), ensemble.RandomForestClassifier(max_depth=10, n_estimators=100), ensemble.AdaBoostClassifier(DecisionTreeClassifier(max_depth=10), n_estimators=300), ensemble.GradientBoostingClassifier(n_estimators = 500, learning_rate = 0.1, max_depth = 10 , random_state = 0)] scores = np.zeros((6, 6)) i = 0 for clf in classifiers: lsvc = clf.fit(X_dummytrain,y_dummytrain) model = SelectFromModel(lsvc, prefit = True) Train_new = model.transform(X_dummytrain) print Train_new.shape newindices = model.get_support(True) FinalTrainLessFeature = X_dummytrain[np.ix_(np.arange(40000), newindices)] FinalTestLessFeature = X_dummytest[np.ix_(np.arange(10000), newindices)] print FinalTrainLessFeature.shape print FinalTestLessFeature.shape j =0 for clf in classifiers: rng = np.random.RandomState(1) estimate = clf.fit(FinalTrainLessFeature,y_dummytrain) predictions = estimate.predict(FinalTestLessFeature) scores[i][j] = accuracy_score(y_dummytest,predictions) print scores
#%% ''' Creat different combinations of features to give a better training result ''' poly = PolynomialFeatures()#default = 2 X_train = poly.fit_transform(X_train) feature_name = poly.get_feature_names(data.columns)#show the combinations print(feature_name) ''' Select the above combination of features by run the random forest technique once and abandon those low weight arguments. ''' select = SelectFromModel(RandomForestClassifier(),max_features = 30)#maximum feature to retain is 30 select = select.fit(X_train,Y_train) featuresSupport = select.get_support() X_train = select.transform(X_train) ''' find the selected features' name ''' selecred_feature_name = [] for i in range(len(featuresSupport)): if featuresSupport[i] == True: selecred_feature_name.append(feature_name[i]) print(selecred_feature_name) #%% ''' Creat many random forest model with different split and leaf size, run each forest with some sample data, and leave the best performed one ''' gs_b = GridSearchCV(
# Performing PCA
pca = PCA(n_components='mle')
pca.fit(train_panel)

# Explained variance ratio: count the components above 0.05.
NO_cols = int(np.count_nonzero(pca.explained_variance_ratio_ > 0.05))
print(str(NO_cols) + " columsn have variance ration greater than 0.05")

# Feature selection using Lasso
Lass = Lasso(alpha=0.1)
Lass = Lass.fit(train_panel, train_target)
model_selecting = SelectFromModel(Lass, prefit=True)
support_mask = model_selecting.get_support()
features_selected = train_panel.columns[support_mask]
train_features_subset = model_selecting.transform(train_panel)
print(str(train_features_subset.shape[1]) + " columns selected")

# After feature selection
LR_scores = cross_val_score(LR, train_features_subset, train_target,
                            cv=10, scoring='mean_squared_error')
LR_Cross_val = LR_scores.mean()
print("CV Score for Linear Regression : " + str(-1 * LR_Cross_val))

# Tuning alpha for Lasso
for i in np.arange(0.01, 0.5, 0.05):
    las = Lasso(alpha=i)
    las_scores = cross_val_score(las, train_features_subset, train_target,
                                 cv=10, scoring='mean_squared_error')
    print("Aplha =" + str(i) + " CV: " + str(-1 * las_scores.mean()))
# Lasso metrics on the test split.
r2 = metrics.r2_score(y_test, y_pred)
rmse = metrics.mean_squared_error(y_test, y_pred, squared=False)
print("### LOG LASSO REGRESSION ###")
print("Test Lasso r2-score is {}".format(r2))
print("Test Lasso RMSE is {}".format(rmse))

# Lasso metrics on the train split.
y_pred = lasso.predict(X_train_chi_sel)
r2 = metrics.r2_score(y_train, y_pred)
rmse = metrics.mean_squared_error(y_train, y_pred, squared=False)
print("Train Lasso r2-score is {}".format(r2))
print("Train Lasso RMSE is {}".format(rmse))

# lasso feature selection
sel = SelectFromModel(lasso)
sel.fit(X_train_chi_sel, y_train)
lasso_mask = sel.get_support()
selected_feat = X_train_chi_sel[:, lasso_mask]
X_train_selected = sel.transform(X_train_chi_sel)
# The last column of test_chi_sel is left out of the features.
X_test_selected = sel.transform(test_chi_sel[:, :-1])
n_kept = X_train_selected.shape[1]
print("datasets trasformed to {} features...".format(n_kept))

# Random Forest on the Lasso-selected features.
clf = RandomForestRegressor(random_state=0)
clf.fit(X_train_selected, y_train)
y_pred = clf.predict(X_test_selected)
r2 = metrics.r2_score(y_test, y_pred)
rmse = metrics.mean_squared_error(y_test, y_pred, squared=False)
print("\nRANDOM FOREST")
print('selected features by lasso: {}'.format(X_train_selected.shape[1]))
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Synthetic binary data: the label is the XOR of the first two columns.
rng = np.random.RandomState(1)
X = rng.randint(0, 2, (200, 20))
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

fs_univariate = SelectKBest(k=10)
fs_modelbased = SelectFromModel(RandomForestClassifier(n_estimators=100),
                                threshold='median')

# Fit each selector in turn, then print and plot its support mask.
for _caption, _selector in (
        ('Features selected by univariate selection:', fs_univariate),
        ('Features selected by model-based selection:', fs_modelbased),
):
    _selector.fit(X, y)
    print(_caption)
    print(_selector.get_support())
    plt.matshow(_selector.get_support().reshape(1, -1), cmap='gray_r')
# One-hot encode the categorical columns of the test frame and append them.
dummies_carbin = pd.get_dummies(test['Cabin'], prefix='Cabin')
dummies_embarked = pd.get_dummies(test['Embarked'], prefix='Embarked')
dummies_sex = pd.get_dummies(test['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(test['Pclass'], prefix='Pclass')

encoded_parts = [
    test, dummies_carbin, dummies_embarked, dummies_sex, dummies_Pclass
]
test = pd.concat(encoded_parts, axis=1)

from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier, plot_importance

# Column 1 of `train` is used as the target, columns 2.. as the features.
selected_feature = SelectFromModel(estimator=XGBClassifier())
selected_feature = selected_feature.fit(train.iloc[:, 2:].values,
                                        train.iloc[:, 1])
print(selected_feature)
print(selected_feature.get_support())

train_x = train.iloc[:, 2:]
print(train_x.columns[selected_feature.get_support()])

# A second, separate classifier is fitted for the importance plot.
model_XGB = XGBClassifier()
model_XGB.fit(train.iloc[:, 2:].values, train.iloc[:, 1])
plot_importance(model_XGB)

import matplotlib.pyplot as plt

plt.show()

# test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
#
# test.to_csv("./test0.csv", index=False)
# train.to_csv("./train0.csv", index=False)
# Report, for three different selectors, the indices and the names of the
# features each one keeps. The single-argument parenthesised prints behave
# the same on Python 2 and Python 3.

print("Variance Threshold")
sel = VarianceThreshold(threshold=(0.90 * (1 - 0.90)))
selector = sel.fit(training[features])
# Compute the kept indices once instead of once per candidate feature.
_kept = selector.get_support(indices=True)
print(_kept)
for i in _kept:
    print(features[i])

print("Select from Model - Logistic")
modelLReg = LogisticRegression()
modelLReg = modelLReg.fit(training[features], training['crime'])
model = SelectFromModel(modelLReg, prefit=True)
_kept = model.get_support(indices=True)
print(_kept)
for i in _kept:
    print(features[i])

print("Tree Based Feature Selection")
clf = ExtraTreesClassifier()
clf = clf.fit(training[features], training['crime'])
model = SelectFromModel(clf, prefit=True)
_kept = model.get_support(indices=True)
print(_kept)
for i in _kept:
    print(features[i])
# Last column of df1 is the target, everything before it is a feature.
X = df1.iloc[:, :-1]
y = df1.iloc[:, -1]
print("Input data with columns", X.shape)
print("Predictions shape: ", y.shape)

#%%
print("WITHOUT PREPROCESSING")
raw_split = train_test_split(X, y, test_size=0.25, random_state=2606)
test_all(*raw_split)

#%%
print("WITH PREPROCESSING")
from sklearn.preprocessing import StandardScaler

# StandardScaler rescales each feature to zero mean and unit variance.
scaler = StandardScaler()
X = scaler.fit_transform(X)
scaled_split = train_test_split(X, y, test_size=0.25, random_state=2606)
test_all(*scaled_split)

#%%
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

sel = SelectFromModel(RandomForestClassifier(n_estimators=100))
cols = df1.iloc[:, :-1].columns  # feature column names
# The selector is fitted on the original (unscaled) df1 columns.
sel.fit(df1.iloc[:, :-1], df1.iloc[:, -1])  # X, y
# sel.get_support()
support_mask = sel.get_support()
selected_feat = df1[cols].columns[support_mask]  # keep only the True columns
len(selected_feat)
print(selected_feat)
columnsData = df1[selected_feat]

#%%
print("After preprocessing, and FEATURE SELECTION")
selected_split = train_test_split(columnsData, y, test_size=0.25,
                                  random_state=2606)
test_all(*selected_split)

#%%
def get_esembel_score(name):
    """Evaluate the persisted models for ``name`` and print a majority vote score.

    Loads the pickled data split ``<features_prefix><name>_XXXYYY.pkl`` and the
    stored random forest, XGBoost booster, CNN and LSTM models, collects each
    model's predictions on the test split, prints per-model diagnostics, and
    finally prints the score of a per-sample majority vote over the four
    prediction lists.

    Parameters
    ----------
    name : str
        Base name concatenated with ``util.features_prefix`` /
        ``util.models_prefix`` to build every file path.

    Returns
    -------
    None
        All results are printed.

    Notes
    -----
    * Calls ``exit()`` when the split pickle does not exist.
    * ``pd.read_pickle`` unpickles arbitrary objects; only use trusted files.
    * Written with single-argument ``print(...)`` and ``max(..., key=...)`` so
      the function runs on both Python 2 and Python 3.
    """
    split_path = util.features_prefix + name + "_XXXYYY.pkl"
    if not os.path.exists(split_path):
        print('file does not exist')
        exit()

    [X_train, X_validate, X_test, y_train, y_validate, y_test] = pd.read_pickle(
        util.features_prefix + name + '_XXXYYY.pkl')
    import xgboost as xgb

    # --- random forest ---
    rf_clf_2 = pd.read_pickle(util.models_prefix + name + '_rf.pkl')
    list_all = []
    rf_2_list = rf_clf_2.predict(X_test)
    from sklearn.feature_selection import SelectFromModel
    model = SelectFromModel(rf_clf_2, prefit=True)
    temp = model.get_support()
    print(sum(temp))  # number of features the forest-based selector keeps
    list_all.append(rf_2_list)
    print(rf_clf_2.score(X_test, y_test))

    # --- XGBoost booster ---
    xgb_2 = xgb.Booster({'nthread': 4})  # init model
    xgb_2.load_model(util.models_prefix + name + '_xgb.pkl')  # load data
    print(len(xgb_2.get_fscore().keys()))
    dtest = xgb.DMatrix(X_test)
    xgb_2_test = xgb_2.predict(dtest)
    list_all.append(xgb_2_test)
    print(score_lists(xgb_2_test, y_test))

    # --- neural-network inputs: slice the test matrix into column groups ---
    from keras.utils import np_utils
    import copy
    [train_X, train_Y] = pd.read_pickle(util.features_prefix + name + '_XY.pkl')
    X_semantic = np.array(copy.deepcopy(X_test[:, range(95, 475)]))
    X_manual = np.array(copy.deepcopy(X_test[:, range(0, 95)]))
    X_cluster = np.array(copy.deepcopy(X_test[:, range(475, 545)]))
    X_document = np.array(copy.deepcopy(X_test[:, range(545, 547)]))
    # Offset the first document column by the maximum of train_X's last column.
    X_document[:, [0]] = X_document[:, [0]] + train_X[:, [-1]].max()
    # Reshape each sample to 10 rows, then transpose every sample.
    X_semantic = X_semantic.reshape(X_semantic.shape[0], 10, -1)
    X_semantic_1 = np.zeros(
        (X_semantic.shape[0], X_semantic.shape[2], X_semantic.shape[1]))
    for i in range(int(X_semantic.shape[0])):
        X_semantic_1[i] = np.transpose(X_semantic[i])

    # --- CNN ---
    json_string = pd.read_pickle(
        util.models_prefix + name + '_json_string_cnn.pkl')
    model_cnn = model_from_json(json_string)
    model_cnn.load_weights(util.models_prefix + name + '_nn_weight_cnn.h5')
    cnn_list = model_cnn.predict_classes(
        [X_document, X_cluster, X_manual, X_semantic_1])
    # cnn_list_prob = model_cnn.predict_proba([X_document, X_cluster, X_manual, X_semantic_1])
    kk = list(cnn_list)
    list_all.append(kk)
    print(score_lists(kk, y_test))

    # --- LSTM ---
    json_string = pd.read_pickle(
        util.models_prefix + name + '_json_string_lstm.pkl')
    model_lstm = model_from_json(json_string)
    model_lstm.load_weights(util.models_prefix + name + '_nn_weight_lstm.h5')
    lstm_list = model_lstm.predict_classes(
        [X_document, X_cluster, X_manual, X_semantic_1])
    # cnn_list_prob = model_cnn.predict_proba([X_document, X_cluster, X_manual, X_semantic_1])
    kk = list(lstm_list)
    list_all.append(kk)
    print(score_lists(kk, y_test))

    # --- majority vote across the prediction lists ---
    list_ensemble = []
    for i in range(len(y_test)):
        dict_all = {}
        for predictions in list_all:
            vote = predictions[i]
            dict_all[vote] = dict_all.get(vote, 0) + 1
        # max() returns the first entry with the highest count in dict
        # iteration order, the same element the former stable descending
        # cmp-sort placed first.
        winner = max(dict_all.items(), key=lambda item: item[1])[0]
        list_ensemble.append(winner)
    print(score_lists(list_ensemble, y_test))
    print('**************************')
# Flatten the label arrays to 1-D; -1 lets NumPy infer the length instead of
# hard-coding the sample counts (previously 307 and 77).
y_train = y_train.reshape(-1)
y_test = y_test.reshape(-1)

# Feature matrix = columns 1..200 of test_sample, label = the 'target' column.
X = np.matrix(test_sample.iloc[:, 1:201])
y = np.matrix(test_sample[['target']])
# Column labels that correspond one-to-one with the columns of X.
feature_columns = test_sample.columns[1:201]

from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32,
                      colsample_bytree=0.2, reg_alpha=3, reg_lambda=1,
                      min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, max_features=199)
# Fit on plain ndarrays with a 1-D label vector: np.matrix input and column
# vector labels are not accepted by current scikit-learn. X and y themselves
# are left untouched for any later code.
embeded_lgb_selector.fit(np.asarray(X), np.asarray(y).ravel())

embeded_lgb_support = embeded_lgb_selector.get_support()
# The mask has one entry per column of X, so it must index the same column
# slice. Applying it to every column of test_sample (as before) can never
# match in length because column 0 is excluded from X.
embeded_lgb_feature = feature_columns[embeded_lgb_support].tolist()
print(str(len(embeded_lgb_feature)), 'selected features')
# 'target' is only present if it lies inside the 1:201 slice; guard the
# removal so a missing entry does not raise ValueError.
if 'target' in embeded_lgb_feature:
    embeded_lgb_feature.remove('target')
embeded_lgb_feature

# importing the classifier model libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# -*- coding: utf-8 -*-
import pandas
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression

# Load the data set and pick the candidate predictor columns and the target.
data = pandas.read_csv('D:\\PDM\\6.2\\data2.csv')
feature_columns = ['月份', '季度', '广告费用', '客流量']
feature = data[feature_columns]
target = data['销售额']

# Wrap a linear regression in SelectFromModel. The transformed matrix returned
# by fit_transform is discarded; only the fitted selector is used below.
lrModel = LinearRegression()
selectFromModel = SelectFromModel(lrModel)
selectFromModel.fit_transform(feature, target)

# Names of the columns the selector kept.
feature.columns[selectFromModel.get_support()]
def feature_selection(df, train_y):
    """Collect the names of features to drop, using several independent filters.

    Each filter stores its result under its own key of the module-level
    ``to_drop`` dict:

    * ``'variance'``        -- features whose variance is below 0.25
    * ``'corr'``            -- columns whose absolute Pearson correlation with
      an earlier column exceeds ``correlation_threshold``
    * ``'importance_zero'`` -- features with zero LightGBM importance, averaged
      over ``n_iterations`` fits
    * ``'importance_low'``  -- features lying beyond the ``cumulative_importance``
      share of normalized cumulative importance
    * ``'L1'``              -- features not selected by ``SelectFromModel`` on a
      fitted ``LassoCV``

    Relies on the module-level names ``to_drop``, ``feature_kinds2``,
    ``correlation_threshold``, ``n_iterations`` and ``cumulative_importance``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; ``feature_kinds2[i]`` is used as the name of column ``i``.
    train_y : array-like
        Labels; flattened to 1-D and cast to int for the LightGBM fits.

    Returns
    -------
    list
        De-duplicated union of every list stored in ``to_drop`` (unordered).
    """
    ################
    # Data preprocessing / feature selection
    global to_drop

    # Drop features that barely vary.
    variances = VarianceThreshold().fit(df).variances_.tolist()
    drop_variance = [
        feature_kinds2[i]
        for i, variance in enumerate(variances)
        if variance < 0.25
    ]
    to_drop['variance'] = drop_variance

    # Pearson correlation: drop features that are too strongly correlated.
    coef = df.corr()
    # Keep only the strict upper triangle. The builtin bool is used because
    # the np.bool alias no longer exists in current NumPy.
    upper = coef.where(np.triu(np.ones(coef.shape), k=1).astype(bool))
    drop_corr = [
        column for column in upper.columns
        if any(upper[column].abs() > correlation_threshold)
    ]
    to_drop['corr'] = drop_corr

    # Use LightGBM to find features with zero importance.
    features = pd.get_dummies(df)
    feature_names = list(features)
    features = np.array(features)
    labels = np.array(train_y).reshape((-1, ))
    feature_importance_values = np.zeros(len(feature_names))
    for round_idx in range(n_iterations):
        model = lgb.LGBMClassifier(n_estimators=1000,
                                   learning_rate=0.05,
                                   verbose=-1)
        print("Start lgbm fit ", round_idx, " times")
        model.fit(features, labels.astype('int'))
        # Accumulate the mean importance over all fits.
        feature_importance_values += model.feature_importances_ / n_iterations
    feature_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance_values
    })
    # Sort features by importance, most important first.
    feature_importances = feature_importances.sort_values(
        'importance', ascending=False).reset_index(drop=True)
    # Normalize the importances and build their running total.
    feature_importances['normalized_importance'] = feature_importances[
        'importance'] / feature_importances['importance'].sum()
    feature_importances['cumulative_importance'] = np.cumsum(
        feature_importances['normalized_importance'])
    # Features whose importance is exactly zero.
    record_zero_importance = feature_importances[
        feature_importances['importance'] == 0.0]
    drop_importance_zero = list(record_zero_importance['feature'])
    to_drop['importance_zero'] = drop_importance_zero
    print("lightgbm finished.")

    # Features that fall outside the requested cumulative importance.
    feature_importances = feature_importances.sort_values(
        'cumulative_importance')
    record_low_importance = feature_importances[
        feature_importances['cumulative_importance'] > cumulative_importance]
    drop_importance_low = list(record_low_importance['feature'])
    to_drop['importance_low'] = drop_importance_low

    # L1 regularization (LassoCV).
    print("df.shape: ", df.shape)
    print("labels.shape: ", labels.shape)
    clf = LassoCV(max_iter=10000)
    clf.fit(df, labels)
    selector = SelectFromModel(estimator=clf, prefit=True)
    support = selector.get_support()
    print(support)
    # get_support() yields numpy.bool_ values, for which "is False" is never
    # true; test truthiness so unselected features are actually recorded.
    drop_feature_lassocv = [
        feature_kinds2[i]
        for i in range(len(feature_kinds2))
        if not support[i]
    ]
    to_drop['L1'] = drop_feature_lassocv

    print(to_drop)
    # Merge all drop lists and remove duplicates.
    features_to_drop = list(set(chain.from_iterable(to_drop.values())))
    print("Selected features.")
    return features_to_drop
# Repeated, differently seeded ExtraTrees fits: record which transcripts the
# importance-based selector keeps and with what importance, then summarise
# per transcript.
# NOTE(review): the original indentation was lost; everything below is assumed
# to belong to the loop body -- confirm against the original file.
for label, y in (('All', y0), ): #*((key, np.where(y0==key, key, 'Other')) for key in np.unique(y0))):
    #%%
    transcripts = []
    scores = []
    trials = 1000
    for seed in tqdm(range(trials)):
        # One shallow forest per seed; only random_state varies between trials.
        clf = ExtraTreesClassifier(n_estimators=50, random_state=seed, max_depth=5, criterion='entropy', min_impurity_decrease=0.05)
        clf.fit(X, y)
        dimred = SelectFromModel(clf, prefit=True, max_features=50)
        # Index labels of ft.T for the selected features, and their importances.
        transcripts.extend(ft.T.index[dimred.get_support()])
        scores.extend(clf.feature_importances_[dimred.get_support()])
    df0 = pd.DataFrame()
    df0['transcripts'] = transcripts
    df0['scores'] = scores
    #%%
    def f(gp):
        # One-row summary per group: number of rows and the largest score.
        return pd.DataFrame([[len(gp), max(gp['scores'])]], columns=['count', 'max'])
    df1 = df0.groupby('transcripts').apply(f).reset_index()
    # Fraction of trials in which the transcript was selected.
    df1['frequency'] = df1['count'] / trials
    df1['selection'] = (df1['max'] > 0.101) | (df1['frequency'] > 0.06)
    # NOTE(review): this call is cut off here; its remaining arguments are not visible.
    fig1 = px.scatter(df1,
# Fit the forest and report its score on both splits.
forest.fit(X_train, y_train)
importances = forest.feature_importances_
print('R2 for Train)', forest.score(X_train, y_train))
print('R2 for Test (cross validation)', forest.score(X_test, y_test))

## Feature Selection
# SelectFromModel is a meta-transformer that can be used along with any
# estimator that has a coef_ or feature_importances_ attribute after fitting.
# The features are considered unimportant and removed, if the corresponding
# coef_ or feature_importances_ values are below the provided threshold
# parameter. Apart from specifying the threshold numerically, there are
# built-in heuristics for finding a threshold using a string argument.
from sklearn.feature_selection import SelectFromModel
# Imported before its first use below; it previously appeared only after
# LinearRegression() had already been called.
from sklearn.linear_model import LinearRegression

# Reuse the already fitted forest (prefit=True), keeping at most 3 features.
model = SelectFromModel(forest, prefit=True, max_features=3)
feature_idx = model.get_support()
feature_names = X.columns[feature_idx]
X_NEW = model.transform(X)
pd.DataFrame(X_NEW, columns=feature_names)

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X_NEW, y, test_size=0.3, random_state=0)

lm = LinearRegression()
lm.fit(X_train, y_train)

print('R2 for Train)', lm.score(X_train, y_train))
print('R2 for Test (cross validation)', lm.score(X_test, y_test))
from sklearn.feature_selection import SelectFromModel
import sys
import pandas as pd
import json
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

if __name__ == "__main__":
    # python selectfeatures.py [input_file] [output_file - new_data.csv]
    #
    # Reads a CSV whose last column is the class label, keeps the features
    # whose ExtraTrees importance reaches 0.01, and writes them plus a 'class'
    # column to the output CSV.
    if len(sys.argv) == 3:
        df = pd.read_csv(sys.argv[1], sep=',', header=0)
        features = df.iloc[:, :-1]

        # .values replaces DataFrame.as_matrix(), which was removed in
        # pandas 1.0; it returns the same ndarray on older versions too.
        X = features.values
        y = df.iloc[:, -1].values.astype('U')

        lsvc = ExtraTreesClassifier().fit(X, y)
        model = SelectFromModel(lsvc, prefit=True, threshold=0.01)

        idx_important_features = model.get_support()
        print(np.where(idx_important_features)[0])

        # Copy the selected slice so that adding the label column does not
        # write into a view of df.
        df2 = features.iloc[:, idx_important_features].copy()
        df2['class'] = y
        df2.to_csv(sys.argv[2], sep=',', index=False)
    else:
        # Previously a wrong argument count did nothing at all; say why.
        sys.stderr.write(
            "usage: python {} <input_file> <output_file>\n".format(sys.argv[0]))
def rf_importance(X_train, y_train, threshold):
    """Return the indices of the features a random forest selects.

    A 10000-tree ``RandomForestClassifier`` (``random_state=0``, all cores) is
    fitted through ``SelectFromModel`` with the given importance ``threshold``.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : object with ``reshape``
        Training labels; flattened with ``reshape(-1)`` before fitting.
    threshold : str or float
        Passed unchanged to ``SelectFromModel(threshold=...)``.

    Returns
    -------
    Integer indices of the selected features (``get_support(indices=True)``).
    """
    forest = RandomForestClassifier(
        n_estimators=10000,
        random_state=0,
        n_jobs=-1,
    )
    selector = SelectFromModel(forest, threshold=threshold)
    flat_labels = y_train.reshape(-1)
    selector.fit(X_train, flat_labels)
    selected_indices = selector.get_support(indices=True)
    return selected_indices
f.close()
print(len(headers))

# First 40000 rows are used for training, the remainder for validation.
# The last column is the label; feature columns are max-normalized per column.
# NOTE(review): the validation block is normalized with its own column maxima,
# not the training ones -- confirm that is intended.
train = np.array(train)
train_X = train[:40000, :-1]
norm_X = normalize(train_X, axis=0, norm='max')
train_y = train[:40000, -1]
valid_X = train[40000:, :-1]
norm_vX = normalize(valid_X, axis=0, norm='max')
valid_y = train[40000:, -1]

# Perform logistic regression for feature selection.
# solver='liblinear' is stated explicitly: the default solver in current
# scikit-learn (lbfgs) rejects penalty='l1', while liblinear was the default
# when this code was written, so the fitted model is unchanged there.
logreg_feat_sel = LogisticRegression(penalty='l1', C=0.01, solver='liblinear')
logreg_feat_sel.fit(norm_X, train_y)
model = SelectFromModel(logreg_feat_sel, prefit=True)
X_new = model.transform(norm_X)
X_inds = model.get_support(indices=True)
vX_new = model.transform(norm_vX)
print(X_new.shape)
print(X_inds)
rel_cols = [headers[i] for i in X_inds]
print(rel_cols)

# One validation score slot per penalty strength tried below.
v_score = np.zeros(5)

# Perform logistic regression on the restricted list of features
# Modify penalty term and plot the cross-validation errors of each
for i in range(5):
    restr_logreg = LogisticRegression(penalty='l2', C=np.power(10., -i))
    restr_logreg.fit(X_new, train_y)
    logreg_pred_y = restr_logreg.predict(X_new)
# Keyword sets shared by the repeated calls below.
split_options = dict(train_size=0.8, test_size=0.2, shuffle=False)
report_options = dict(plot_learning_curves=True, plot_histogram=True,
                      save_path=output_directory)

# Univariate selection: keep the 150 best features according to f_regression.
feature_selection_skb = SelectKBest(score_func=f_regression, k=150)
feature_selection_skb.fit(X_train, y_train)

# Restrict the full data set to the columns SelectKBest kept.
skb_columns = X_selected.columns[feature_selection_skb.get_support()]
X_selected_skb = X_selected[skb_columns]

# Report how many features survived.
print("SelectKBest, selected features:", len(X_selected_skb.columns))

X_train_fs_skb, X_test_fs_skb = train_test_split(X_selected_skb,
                                                 **split_options)

print("Feature selection - SelectFromModel")
# Model-based selection: keep features at or above the median importance.
feature_selection_sfm = SelectFromModel(ridge_reg_cv, threshold="median")
feature_selection_sfm.fit(X_train, y_train)

# Restrict the full data set to the columns SelectFromModel kept.
sfm_columns = X_selected.columns[feature_selection_sfm.get_support()]
X_selected_sfm = X_selected[sfm_columns]

# Report how many features survived.
print("SelectFromModel, selected features:", len(X_selected_sfm.columns))

X_train_fs_sfm, X_test_fs_sfm = train_test_split(X_selected_sfm,
                                                 **split_options)

print("Perform test - basic data")
results = utilities.test_regressions(
    reg_list, X_train, X_test, y_train, y_test, '', **report_options)
results_log = utilities.test_regressions(
    reg_list, X_train, X_test, y_train_log, y_test_log, '_log',
    **report_options)

print("Perform test - PCA")
results_pca = utilities.test_regressions(
    reg_list, X_train_pca, X_test_pca, y_train, y_test, '_pca',
    **report_options)
results_pca_log = utilities.test_regressions(
    reg_list, X_train_pca, X_test_pca, y_train_log, y_test_log, '_pca_log',
    **report_options)
def feature_selection_embeded(data, label, feature_return='embeded_rf_feature'):
    """Select features with an embedded (model-based) method.

    ``label`` and ``data`` are joined on their indices, the chosen estimator is
    fitted through ``SelectFromModel``, and the names of the columns of
    ``data`` that the selector keeps are returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table (loaded from train_data.csv according to the original
        note); its columns are the candidate features.
    label : pandas.DataFrame
        Frame with a ``"city"`` column holding the class of each sample.
    feature_return : str, optional
        Which estimator drives the selection:

        * ``'embeded_rf_feature'``   -- RandomForestClassifier, threshold
          ``'3.7*mean'``
        * ``'embeded_lr_selector'``  -- L1 LogisticRegression, threshold
          ``'2*mean'``
        * ``'embeded_lgb_selector'`` -- LGBMClassifier, threshold ``'mean'``

    Returns
    -------
    list
        Names of the selected feature columns.

    Raises
    ------
    AssertionError
        If ``feature_return`` is not one of the three supported values.
    """
    tmp = pd.merge(label, data, left_index=True, right_index=True)
    X, y = tmp[data.columns].values, tmp["city"].values
    assert feature_return in [
        'embeded_rf_feature', 'embeded_lr_selector', 'embeded_lgb_selector'
    ]

    if feature_return == 'embeded_rf_feature':
        # feature selected by Random Forest model
        estimator = RandomForestClassifier(
            criterion='gini',
            # 'sqrt' is what the removed 'auto' option meant for classifiers.
            max_features='sqrt',
            # np.random.seed(13) returns None, so the old argument never set
            # the estimator's seed; pin it explicitly, without touching
            # NumPy's global random state.
            random_state=13,
            n_jobs=-1,
            class_weight='balanced',
            n_estimators=500)
        threshold = '3.7*mean'  # 1.5
        description = 'RandomForestClassifier selected features'
    elif feature_return == 'embeded_lr_selector':
        # feature selected by Logistic Regression model
        estimator = LogisticRegression(
            penalty='l1',
            C=0.6,
            class_weight='balanced',
            # The L1 penalty needs a solver that supports it; liblinear was
            # the default when this code was written.
            solver='liblinear')
        threshold = '2*mean'
        description = 'LogisticRegression selected features'
    else:
        # feature selected by LGBM model
        estimator = LGBMClassifier(n_estimators=500,
                                   learning_rate=0.05,
                                   num_leaves=32,
                                   colsample_bytree=0.2,
                                   reg_alpha=3,
                                   reg_lambda=1,
                                   min_split_gain=0.01,
                                   min_child_weight=40)
        threshold = 'mean'
        description = 'LGBMClassifier selected features'

    selector = SelectFromModel(estimator, threshold=threshold)
    selector.fit(X, y)
    support = selector.get_support()
    # Every branch returns the feature-name list. The LGBM branch used to
    # return the fitted selector object by mistake.
    selected_features = data.loc[:, support].columns.tolist()
    print(str(len(selected_features)), description)
    return selected_features
from sklearn.datasets import load_iris

iris = load_iris()
ix, iy = iris.data, iris.target

from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel

# Fit two tree ensembles and inspect their feature importances.
model1 = ExtraTreesClassifier()
model2 = GradientBoostingClassifier()
model1.fit(ix, iy)
model2.fit(ix, iy)
model1.feature_importances_
model2.feature_importances_

# Wrap the already fitted models and look at which features each one keeps.
clf1 = SelectFromModel(model1, prefit=True)
clf2 = SelectFromModel(model2, prefit=True)
clf1.get_support()
clf2.get_support()

#---
# sklearn cross-validation
# The sklearn.cross_validation module was removed from scikit-learn; the same
# helpers are provided by sklearn.model_selection.
from sklearn.model_selection import cross_val_score
#cross_val_score(model, X, y, cv=10)

from sklearn.model_selection import cross_val_predict
#cross_val_predict(model, X, y, cv=10)

from sklearn.model_selection import LeaveOneOut
#scores = cross_val_score(model, X, y, cv=LeaveOneOut())

# ---
from sklearn.pipeline import Pipeline, make_pipeline
# Wait for the worker pool to finish, then label every negative sample 0.
pool.close()
pool.join()
negative_labels = np.zeros(len(neg_data))
y_data_labels = np.append(y_data_labels, negative_labels)
print("Done")

# Training set = positives followed by negatives.
x_data_train = pos_data + neg_data

# Fit the model on the full training set.
print("Fitting model: ", end="")
clf.fit_transform(x_data_train, y_data_labels)
print("Done")

# Cross-validated weighted F1, reported as the mean over the folds.
scores = cross_validation.cross_val_score(
    clf,
    x_data_train,
    y_data_labels,
    cv=cv,
    scoring='f1_weighted',
)
mean_score = np.average(scores)
print("scores:", mean_score)

# Feature importance table: names of the selected features paired with the
# classifier's importances, most important first.
feature_names = vect.get_feature_names()
selected_positions = sel.get_support(True)
selected_feature_names = [feature_names[pos] for pos in selected_positions]
importances = rfc.feature_importances_
indices = np.argsort(importances)[::-1]
table = [[selected_feature_names[rank], importances[rank]]
         for rank in indices]

# Pretty-print the table.
df = pd.DataFrame(table)
print(df.to_string(header=False))

# Enter interactive mode
code.interact(local=locals())
def get_support(X, y, C):
    """Return the boolean feature mask chosen by an L1-penalized linear SVC.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.
    C : float
        Regularization parameter handed to ``LinearSVC``.

    Returns
    -------
    Boolean mask from ``SelectFromModel.get_support()``, one entry per column
    of ``X``.
    """
    svc = LinearSVC(C=C, penalty="l1", dual=False)
    fitted_svc = svc.fit(X, y)
    selector = SelectFromModel(fitted_svc, prefit=True)
    return selector.get_support()