def variance_threshold(features_train, features_valid): """Return the initial dataframes after dropping some features according to variance threshold Parameters: ---------- features_train: pd.DataFrame features of training set features_valid: pd.DataFrame features of validation set Output: ------ features_train: pd.DataFrame features_valid: pd.DataFrame """ from sklearn.feature_selection import VarianceThreshold threshold=0.01 selector = VarianceThreshold(threshold=threshold) selector.fit(features_train) ## Instead of using the transform() method, we look at which columns have been dropped, to be able to drop in both training and validation set the same features. This way, we keep the column names to make interpretation easier variances = selector.variances_ dropped_features = features_train.columns.values[variances < threshold] #name of features to drop features_train.drop(dropped_features, axis=1, inplace=True) features_valid.drop(dropped_features, axis=1, inplace=True) return features_train, features_valid
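# A minimal usage sketch for the helper above, assuming two hypothetical pandas
# DataFrames with identical columns; because the same low-variance column names are
# dropped from both frames, the training and validation schemas stay aligned.
import pandas as pd

X_train = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [0, 0, 0, 0], 'c': [1, 0, 1, 0]})
X_valid = pd.DataFrame({'a': [2, 3, 1, 4], 'b': [0, 0, 0, 0], 'c': [0, 1, 1, 0]})
X_train, X_valid = variance_threshold(X_train, X_valid)
print(X_train.columns.tolist())  # 'b' is constant (variance < 0.01), so it should be dropped from both frames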
def test_same_variances(self): local = VarianceThreshold() dist = SparkVarianceThreshold() shapes = [((10, 5), None), ((1e3, 20), None), ((1e3, 20), 100), ((1e4, 100), None), ((1e4, 100), 600)] for shape, block_size in shapes: X_dense, X_dense_rdd = self.make_dense_rdd() X_sparse, X_sparse_rdd = self.make_sparse_rdd() Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y')) local.fit(X_dense) dist.fit(X_dense_rdd) assert_array_almost_equal(local.variances_, dist.variances_) local.fit(X_sparse) dist.fit(X_sparse_rdd) assert_array_almost_equal(local.variances_, dist.variances_) dist.fit(Z) assert_array_almost_equal(local.variances_, dist.variances_)
def _variance_threshold(self, input_df, threshold): """Uses Scikit-learn's VarianceThreshold feature selection to learn the subset of features that pass the threshold Parameters ---------- input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} Input DataFrame to perform feature selection on threshold: float The variance threshold that removes features that fall under the threshold Returns ------- subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']} Returns a DataFrame containing the features that are above the variance threshold """ training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1) selector = VarianceThreshold(threshold=threshold) try: selector.fit(training_features) except ValueError: # None features are above the variance threshold return input_df[['guess', 'class', 'group']].copy() mask = selector.get_support(True) mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group'] return input_df[mask_cols].copy()
def select_features(x, y, methods=('variance', 'correlation', 'l1', 'forest')):
    '''
    methods = ('variance', 'correlation', 'l1', 'forest')
    - variance: use variance threshold to discard features that are mostly 0 or 1
    - correlation: use a univariate test (f_regression) to remove the most correlated features
    - l1: use l1 penalty to remove features that make the solution sparse
    - forest: use RandomForestRegressor to rank feature importance and select important ones
    '''
    features = x.loc[:, 'Feature_1':'Feature_2']
    idx_list = []  # assumed aggregation: collect the support indices chosen by each method
    if 'variance' in methods:
        vt = VT(threshold=(0.99 * (1 - 0.99)))
        vt.fit(features)
        idx_list.append(set(vt.get_support(indices=True)))
    if 'correlation' in methods:
        cr = SP(f_regression, percentile=80)
        cr.fit(features, y)  # assumes y is a single target column for the univariate test
        idx_list.append(set(cr.get_support(indices=True)))
    if 'l1' in methods:
        rgr = MultiTaskLassoCV(cv=5, n_jobs=-1)
        m = SFM(rgr)
        m.fit(x.values, y.values)
        idx_list.append(set(m.get_support(indices=True)))
    if 'forest' in methods:
        clf = RandomForestRegressor(n_estimators=300, max_features=0.7, n_jobs=-1).fit(x, y)
        m = SFM(clf)
        m.fit(x.values, y.values)
        idx_list.append(set(m.get_support(indices=True)))
    # keep only the feature indices selected by every requested method
    x_indices = set(range(x.shape[1]))
    for indices in idx_list:
        x_indices = x_indices & indices
    print('All: %s' % len(x_indices))
    return list(x_indices)
def remove_feat_constants(data_frame):
    # Remove feature vectors containing one unique value,
    # because such features do not have predictive value.
    print("")
    print("Deleting zero variance features...")
    # Let's get the zero variance features by fitting a VarianceThreshold
    # selector to the data, but let's not transform the data with the selector,
    # because that would also turn our pandas data frame into a NumPy array
    # and we would like to keep the pandas data frame. Therefore,
    # let's delete the zero variance features manually.
    n_features_originally = data_frame.shape[1]
    selector = VarianceThreshold()
    selector.fit(data_frame)
    # Get the indices of zero variance feats
    feat_ix_keep = selector.get_support(indices=True)
    orig_feat_ix = np.arange(data_frame.columns.size)
    feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)
    # Delete zero variance feats from the original pandas data frame
    data_frame = data_frame.drop(labels=data_frame.columns[feat_ix_delete], axis=1)
    # Print info
    n_features_deleted = feat_ix_delete.size
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (float(n_features_deleted) / n_features_originally)))  # np.float was removed in NumPy >= 1.24
    return data_frame
def variance_threshold(self, dframe=None, columns=None, skip_columns=None, thresh=0.0, autoremove=False):
    """
    Wrapper around sklearn's VarianceThreshold for use on pandas DataFrames
    :param dframe:
    :param columns:
    :param skip_columns:
    :param thresh:
    :param autoremove:
    :return:
    """
    logging.debug("Finding low-variance features")
    removed_features = []
    try:
        all_columns = dframe.columns
        # remove the skip columns
        remaining_cols = all_columns.drop(skip_columns)
        # get length of new index.
        max_index = len(remaining_cols) - 1
        skipped_idx = [all_columns.get_loc(column) for column in skip_columns]
        for idx, item in enumerate(skipped_idx):
            if item > max_index:
                diff = item - max_index
                skipped_idx[idx] -= diff
            if item == max_index:
                diff = item - len(skip_columns)
                skipped_idx[idx] -= diff
            if idx == 0:
                skipped_idx[idx] = item
        skipped_values = dframe.iloc[:, skipped_idx].values  # column-wise positional indexing, not a slice
        X = dframe.loc[:, remaining_cols].values
        vt = VarianceThreshold(threshold=thresh)
        vt.fit(X)
        feature_indices = vt.get_support(indices=True)
        feature_names = [remaining_cols[idx] for idx, _ in enumerate(remaining_cols)
                         if idx in feature_indices]
        removed_features = list(np.setdiff1d(remaining_cols, feature_names))
        logging.debug("Found %d low-variance columns" % len(removed_features))
    except Exception as e:
        logging.error(e)
        logging.error("Could not remove low-variance features; something went wrong")
        print(e)
        pass
    return dframe, removed_features
def test_variance_threshold(): tpot_obj = TPOT() non_feature_columns = ['class', 'group', 'guess'] training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1) selector = VarianceThreshold(threshold=0) selector.fit(training_features) mask = selector.get_support(True) mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns assert np.array_equal(tpot_obj._variance_threshold(training_testing_data, 0), training_testing_data[mask_cols])
def feature_selection(train_instances): logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) logger.info('Crossvalidation started... ') selector = VarianceThreshold() selector.fit(train_instances) logger.info('Number of features used... ' + str(Counter(selector.get_support())[True])) logger.info('Number of features ignored... ' + str(Counter(selector.get_support())[False])) return selector
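# Hedged usage sketch for the helper above: it returns the fitted selector, so the
# caller is expected to apply the same transform to both the training matrix and a
# hypothetical held-out matrix (`test_instances` is an assumption, not from the source).
selector = feature_selection(train_instances)
train_reduced = selector.transform(train_instances)
test_reduced = selector.transform(test_instances)  # reuse the selector fitted on training data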
def _variance_threshhold(self, variance):
    '''Remove columns that do not meet the variance threshold'''
    logging.info('Removing data that has variance less than %f.' % (variance))
    vt = VarianceThreshold(variance)
    vt.fit(self.X)  # XXX: Because idx should have high variance we pass all of X
    self.X = vt.transform(self.X)
    self.X_submit = vt.transform(self.X_submit)
    # Repeat this process for X_submit
    # XXX: This might not be kosher outside of competition
    vt.fit(self.X_submit)
    self.X = vt.transform(self.X)
    self.X_submit = vt.transform(self.X_submit)
def pre_process_datasets(datasets, filter_method=None, threshold=(0, 0), normalize=True, use_cnv=False, use_mut=False): exp_train_data = datasets['exp_train_data'] exp_board_data = datasets['exp_board_data'] if use_cnv: cnv_train_data = datasets['cnv_train_data'] cnv_board_data = datasets['cnv_board_data'] if filter_method == 'cv': exp_cv = exp_train_data.std(1).values / exp_train_data.mean(1).values exp_train_data = exp_train_data.loc[exp_cv > threshold[0], :] exp_board_data = exp_board_data.loc[exp_cv > threshold[0], :] if use_cnv: cnv_train_data = cnv_train_data.apply(exp) cnv_cv = cnv_train_data.std(1).values / cnv_train_data.mean(1).values cnv_train_data = cnv_train_data.loc[cnv_cv > threshold[1], :] cnv_board_data = cnv_board_data.loc[cnv_cv > threshold[1], :] if filter_method == 'var': selector = VarianceThreshold(threshold[0]) selector.fit(exp_train_data.values.T) exp_train_data = exp_train_data.loc[selector.get_support(), :] exp_board_data = exp_board_data.loc[selector.get_support(), :] if use_cnv: selector = VarianceThreshold(threshold[1]) selector.fit(cnv_train_data.values.T) cnv_train_data = cnv_train_data.loc[selector.get_support(), :] cnv_board_data = cnv_board_data.loc[selector.get_support(), :] if use_cnv: feat_train_data = exp_train_data.append(cnv_train_data) feat_board_data = exp_board_data.append(cnv_board_data) print 'features after filtering', exp_train_data.shape[0], '+', cnv_train_data.shape[0], '=', feat_train_data.shape[0] else: feat_train_data = exp_train_data feat_board_data = exp_board_data print 'features after filtering', exp_train_data.shape[0] if use_mut: feat_train_data = feat_train_data.append(datasets['mut_train_data']) feat_board_data = feat_board_data.append(datasets['mut_board_data']) if normalize: scaler = StandardScaler().fit(feat_train_data.values.T) feat_train_data.values[:,:] = scaler.transform(feat_train_data.values.T).T feat_board_data.values[:,:] = scaler.transform(feat_board_data.values.T).T datasets['feat_train_data'] = feat_train_data datasets['feat_board_data'] = feat_board_data
def main(): from sklearn.feature_selection import VarianceThreshold X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]] root = Feature('root') featureList = np.array([]) for i in range(len(X[0])): feature = Feature('feature_%d' % i) root.transform('init', feature) featureList = np.append(featureList, feature) model = VarianceThreshold() model.fit(X) doWithSelector(model, featureList) root.printTree()
def feature_select(word,instance_dic,feature_dic, thre_hold=0.01, num_feature=100): instances_list = instance_dic[word] feature_words=feature_dic[word] feature_xs = [] labels = [] for instance in instances_list: label = ' '.join(instance.senseid) feature_x_dic = feature_vector(instance,feature_words) feature_vals=[] for word in feature_words: feature_vals.append(feature_x_dic[word]) feature_xs.append(feature_vals) labels.append(label) # 1st round feature selection by removing low variance features sel_lowvr = VarianceThreshold(threshold=(thre_hold)) feature_xs_selected = sel_lowvr.fit(feature_xs) lowvr_index = feature_xs_selected.get_support(indices=True).tolist() feature_xs_selected = feature_xs_selected.transform(feature_xs).tolist() # 2nd round feature selection using sklearn's SelectKBest() if num_feature < len(feature_xs_selected[0]): sel_chi2 = SelectKBest(chi2, k= num_feature).fit(feature_xs_selected, labels) chi2_index= sel_chi2.get_support(indices=True).tolist() #feature_xs_selected = sel_chi2.transform(feature_xs_selected).tolist()# transform from numpy array back to lis return lowvr_index, chi2_index else: print str(word) + ": chi2 selection not executed due to low # of features" return lowvr_index, [i for i in range(len(lowvr_index))]
def removeZeroVariance(data_frame):
    n_features_originally = data_frame.shape[1]
    selector = VarianceThreshold()
    selector.fit(data_frame)
    # Get the indices of zero variance feats
    feat_ix_keep = selector.get_support(indices=True)
    orig_feat_ix = np.arange(data_frame.columns.size)
    feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)
    # Delete zero variance feats from the original pandas data frame
    data_frame = data_frame.drop(labels=data_frame.columns[feat_ix_delete], axis=1)
    # Print info
    n_features_deleted = feat_ix_delete.size
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (float(n_features_deleted) / n_features_originally)))  # np.float was removed in NumPy >= 1.24
    return data_frame
def test_variance_k_best_random_tree_k_fold(self): # Feature Selection samples, responses = open_model("models.obj") samples = np.array(samples) responses = np.array(responses) FeatureSelection = True if FeatureSelection: selection = VarianceThreshold(threshold=0.00) selection.fit(samples) idxs = selection.get_support(indices=True) samples = samples[:, idxs] samples = preprocessing.scale(samples) # Stratified cross-validation scv = StratifiedKFold(responses, n_folds=10) sum = 0 for i, (train, test) in enumerate(scv): print('Case %d' % (i)) # Modeling rdmForest = RandomForest_scikit() # Train init = time() rdmForest.train(samples[train, :], responses[train]) # Test a, confusionPre = rdmForest.test(samples[test, :], responses[test], True) print('Time: %0.3fs' % (time() - init)) for idx, fila in enumerate(confusionPre): for jdx, entrada in enumerate(fila): if idx != jdx: sum += entrada print("Wrong Cases: "+str(sum)) print(' Full Case ') rdmForest = RandomForest_scikit() rdmForest.train(samples, responses) rdmForest.test(samples, responses, True)
def test_same_variances(self): local = VarianceThreshold() dist = SparkVarianceThreshold() shapes = [((10, 5), None), ((1e3, 20), None), ((1e3, 20), 100), ((1e4, 100), None), ((1e4, 100), 600)] for shape, block_size in shapes: X, X_rdd = self.generate_dataset(shape, block_size) local.fit(X) dist.fit(X_rdd) assert_array_almost_equal(local.variances_, dist.variances_) X, X_rdd = self.generate_sparse_dataset() local.fit(X) dist.fit(X_rdd) assert_array_almost_equal(local.variances_, dist.variances_)
def main(): # shape (#rows,18) train_users_raw = pd.read_csv('train_users_pruned.csv',delimiter=',',encoding='utf-8') test_users_raw = pd.read_csv('test_users.csv',delimiter=',',encoding='utf-8') del train_users_raw['id'] user_id = test_users_raw['id'] del test_users_raw['id'] train_users_raw=train_users_raw.drop(train_users_raw.columns[[0]], axis=1) test_users_raw=test_users_raw.drop(test_users_raw.columns[[0]], axis=1) country_destination = train_users_raw['country_destination'] del train_users_raw['country_destination'] del train_users_raw['year_booked'] del train_users_raw['month_booked'] del train_users_raw['date_booked'] del test_users_raw['year_booked'] del test_users_raw['month_booked'] del test_users_raw['date_booked'] selector = VarianceThreshold(threshold=2.0) selector.fit(train_users_raw) selected_col_ind = selector.get_support(indices=True) selected_col_ind = np.append(selected_col_ind, train_users_raw.shape[1]-1) #print selected_col_ind #print train_users_raw.columns.values # shape (#rows,11) train_users_downsized = train_users_raw.ix[:,selected_col_ind] train_users_downsized['country_destination'] = country_destination print train_users_downsized.columns.values test_users_downsized = test_users_raw.ix[:,selected_col_ind] test_users_downsized['id'] = user_id print test_users_downsized.columns.values train_users_downsized.to_csv('training_data_processed.csv', sep=',', encoding='utf-8') test_users_downsized.to_csv('testing_data_processed.csv', sep=',', encoding='utf-8')
# load data train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv') ids_tr = train.pop('id').values ids_te = test.pop('id').values magic_tr = train.pop('wheezy-copper-turtle-magic').values magic_te = test.pop('wheezy-copper-turtle-magic').values target = train.pop('target').values train = train.values test = test.values # infomative columns of each magic value vt = VarianceThreshold(threshold=1.5) infomative_cols = [] for i in range(MAX_MAGIC_NO): vt.fit(train[magic_tr == i]) infomative_cols.append(vt.get_support(indices=True)) ### Step-1 ### oof_all = [] pred_all = [] for n in range(1, MAX_COMPONENTS + 1): oof_n = np.zeros(len(train)) pred_n = np.zeros(len(test)) gmm0 = GaussianMixture(n_components=n, covariance_type='full', random_state=RANDOM_SEED) gmm1 = GaussianMixture(n_components=n, covariance_type='full', random_state=RANDOM_SEED) for i in range(MAX_MAGIC_NO):
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# Load the data
iris = datasets.load_iris()

# Create features and target
features = iris.data
target = iris.target

# Create a VarianceThreshold object
thresholder = VarianceThreshold(threshold=.5)

# Create the high-variance feature matrix
features_high_variance = thresholder.fit_transform(features)

# Show the high-variance feature matrix
print(features_high_variance[0:3])

# Show the variances
print(thresholder.fit(features).variances_)

# Standardize the feature matrix
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Compute the variance of each standardized feature
selector = VarianceThreshold()
print(selector.fit(features_std).variances_)
def variance_threshold_selector(data, threshold=0.5): selector = VarianceThreshold(threshold) selector.fit(data) return data[data.columns[selector.get_support(indices=True)]]
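# A small illustration of the selector above, using a hypothetical DataFrame `df`;
# because the helper indexes back into data.columns, the returned frame keeps its
# original column names instead of becoming an anonymous NumPy array.
import pandas as pd

df = pd.DataFrame({'x1': [1.0, 2.0, 3.0], 'x2': [5.0, 5.0, 5.0], 'x3': [0.0, 10.0, 20.0]})
reduced = variance_threshold_selector(df, threshold=0.5)
print(reduced.columns.tolist())  # 'x2' is constant and should be removed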
######## # Main # ######## if __name__ == '__main__': # Load training and test set LS = utils.load_from_csv(TRAINING_SET) TS = utils.load_from_csv(TEST_SET) # Create fingerprint features and output of learning set X_LS = fingerprints.transform(LS['SMILES'].values, FINGERPRINT) y_LS = LS['ACTIVE'].values # Variance threshold (feature selection) selector = VarianceThreshold() selector.fit(X_LS) X_LS = selector.transform(X_LS) # Cross validation score cv = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0) scores = cross_val_score(MODEL, X_LS, y_LS, cv=cv, scoring='roc_auc') # Estimated AUC AUC = scores.mean() # Train model MODEL.fit(X_LS, y_LS) # Create fingerprint features of test set X_TS = fingerprints.transform(TS['SMILES'].values, FINGERPRINT) X_TS = selector.transform(X_TS)
def remove_features_with_low_variance(x_data): variance = VarianceThreshold(threshold=1.4) print ('before transform', len(x_data[4]), x_data[4]) variance.fit(x_data) transformed_x = variance.transform(x_data) print ('after transform', len(transformed_x[4]), transformed_x[4])
def get_removed_feats(df, model): return df.columns.values[1:][~model.get_support()] def update_df(df, removed_descriptors, inplace=True): if inplace: df.drop(removed_descriptors, 1, inplace=True) # print(df.shape) return df else: new_df = df.drop(removed_descriptors, 1, inplace=False) # print(new_df.shape) return new_df # find the names of the columns with zero variance var_sel = VarianceThreshold() var_sel.fit(df.iloc[:,1:]) removed_descriptors = get_removed_feats(df, var_sel) # update the data frame update_df(df, removed_descriptors) # correlation filter def find_correlated(data): correlation_matrix = data.iloc[:,1:].corr(method='spearman') removed_descs = set() all_descs = correlation_matrix.columns.values for label in all_descs: if label not in removed_descs: correlations_abs = correlation_matrix[label].abs() mask = (correlations_abs > 0.7).values to_remove = set(all_descs[mask])
# ## Variance based thresholding # In[3]: df = pd.read_csv('datasets/Pokemon.csv') poke_gen = pd.get_dummies(df['Generation']) poke_gen.head() # In[4]: from sklearn.feature_selection import VarianceThreshold vt = VarianceThreshold(threshold=.15) vt.fit(poke_gen) # In[5]: pd.DataFrame({'variance': vt.variances_, 'select_feature': vt.get_support()}, index=poke_gen.columns).T # In[6]: poke_gen_subset = poke_gen.iloc[:,vt.get_support()].head() poke_gen_subset
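# Because the poke_gen columns are one-hot indicators, each variance equals p*(1-p),
# where p is the fraction of rows in that generation; a threshold of .15 therefore
# keeps only generations covering roughly 18%-82% of the rows. A quick sanity check:
p = poke_gen.mean()
print((p * (1 - p)).round(3))  # should match vt.variances_ column by column
print(poke_gen.columns[vt.get_support()].tolist())  # generations that pass the threshold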
x_train_um = x_train_1[x_train_1['fl_severidade'] == 1]
x_train_zero_down = resample(x_train_zero,
                             replace=True,
                             n_samples=len(x_train_um),
                             random_state=123)
x_train_1 = pd.concat([x_train_zero_down, x_train_um])
print(x_train_1[x_train_1['fl_severidade'] == 0].count())
y_train_1 = x_train_1['fl_severidade']
x_train_1 = x_train_1.drop('fl_severidade', axis=1)  # assign the result; drop() is not in-place by default

#################################################################
# REMOVING CONSTANT VARIABLES
##################################################################
vl_limpa_const = 0.1  # variables with 99% of their fields repeated
limpa_const = VarianceThreshold(threshold=vl_limpa_const)
limpa_const.fit(amostra_paci)
vars_const = [
    v for v in amostra_paci.columns
    if v not in amostra_paci.columns[limpa_const.get_support()]
]
qt_var = len(vars_const)
print('There are {} constant variables at threshold {}'.format(qt_var, vl_limpa_const))
print('Constant variables at threshold {}'.format(vl_limpa_const))
print(vars_const)
d = {'vars_const': vars_const}
df = pd.DataFrame(data=d)
def variance_threshold_selector(data, threshold): selector = VarianceThreshold(threshold) selector.fit(data) variances = selector.variances_ print(variances) return variances, data[data.columns[selector.get_support(indices=True)]]
def fs_attack(self, clf, do_vt=None, do_rfe=None, verbose=None): """ :param clf: classifier :param do_vt: do variance thresholding :param do_rfe: do recursive feature selection :return: [auc, auc_lv, auc_rfe] always 3 values. if no features were removed, the regular auc repeats. """ retarr = [] train_ = pd.read_csv(self.out_datapath + self.train_fname, index_col=0) test_ = pd.read_csv(self.out_datapath + self.test_fname, index_col=0) X_train, y_train = train_.iloc[:, 2:-3].values, train_[ self.attribute].values X_test, y_test = test_.iloc[:, 2:-3].values, test_[self.attribute].values clf.fit(X_train, y_train) pred_ = clf.predict(X_test) auc = roc_auc_score(y_test, pred_) if auc >= 0.5: print(self.vf_fname + ',', auc) else: print(self.vf_fname + ',', 1 - auc) retarr.append(auc) if do_vt: sel = VarianceThreshold() sel.fit(X_train) #print (sel.variances_) X_train_lv = sel.transform(X_train) #print(sel.get_support(indices=True)) if (X_train.shape[1] > X_train_lv.shape[1]): if verbose: print("X_train.shape[1], X_train_lv.shape[1]", X_train.shape[1], X_train_lv.shape[1]) # , X_test_lv.shape) X_test_lv = sel.transform(X_test) clf.fit(X_train_lv, y_train) pred_ = clf.predict(X_test_lv) auc_lv = roc_auc_score(y_test, pred_) if auc_lv >= 0.5: print(self.vf_fname + '_lv,', auc_lv) else: print(self.vf_fname + '_lv,', 1 - auc_lv) X_train = X_train_lv X_test = X_test_lv retarr.append(auc_lv) else: retarr.append(retarr[-1]) if do_rfe: if not hasattr(clf, 'score'): print( "WARNING! The classifier passed should have a 'score' method for RFE! You are probably using BinaryDNN! RFE will be skipped!" ) retarr.append(retarr[-1]) else: if X_train.shape[1] <= 14: # too few features if verbose: print("too few features, skipping RFE") retarr.append(retarr[-1]) else: selector = RFECV(clf, step=1, cv=5, n_jobs=-2) selector.fit(X_train, y_train) if (selector.n_features_ < X_train.shape[1]): if verbose: print(selector.n_features_, " feats selected out of", X_train.shape[1]) X_train_fe = selector.transform(X_train) X_test_fe = selector.transform(X_test) clf.fit(X_train_fe, y_train) pred_ = clf.predict(X_test_fe) auc_fe = roc_auc_score(y_test, pred_) if auc_fe >= 0.5: print(self.vf_fname + '_lv_fe,', auc_fe) else: print(self.vf_fname + '_lv_fe,', 1 - auc_fe) retarr.append(auc_fe) else: # if nothing was removed retarr.append(retarr[-1]) return retarr """
def remove_low_variance_features(input_df, thres=0):
    sel = VarianceThreshold(threshold=thres)
    sel.fit(input_df)
    index = np.where(sel.variances_ > thres)[0]
    # .ix was removed from pandas; use positional indexing instead
    return input_df.iloc[:, index]
target_np = np.ravel(target_np_bin)

#############################################################################
#
# Feature Selection
#
##########################################

# Low Variance Filter
if lv_filter == 1:
    print('--LOW VARIANCE FILTER ON--', '\n')

    # LV Threshold
    sel = VarianceThreshold(threshold=0.5)  # Removes any feature whose variance is below 0.5
    fit_mod = sel.fit(data_np)
    fitted = sel.transform(data_np)
    sel_idx = fit_mod.get_support()

    # Get lists of selected and non-selected features (names and indexes)
    temp = []
    temp_idx = []
    temp_del = []
    for i in range(len(data_np[0])):
        if sel_idx[i] == 1:
            # Selected features get added to temp header
            temp.append(header[i + feat_start])
            temp_idx.append(i)
        else:
            # Indexes of non-selected features get added to delete array
            temp_del.append(i)
    print('Selected', temp)
def preprocess(X, LB, datasets, use_mut, use_CNV, use_exp, exp_threshold, use_methyl, use_cell_info, scale): """Preprocesses data""" # Remove COMBINATION_ID column # Remove CELL_LINE column X = X.drop("COMBINATION_ID", 1) LB = LB.drop("COMBINATION_ID", 1) if use_mut: mut = datasets["mut_data"] X = ( X.reset_index().merge(mut, how="left", on="CELL_LINE", sort=False).set_index("index") ) # to preserve original order LB = LB.reset_index().merge(mut, how="left", on="CELL_LINE", sort=False).set_index("index") if use_CNV: cnv = datasets["cnv_data"] X = X.reset_index().merge(cnv, how="left", on="CELL_LINE", sort=False).set_index("index") LB = LB.reset_index().merge(cnv, how="left", on="CELL_LINE", sort=False).set_index("index") if use_exp: gex = datasets["gex_data"] col1 = gex.loc[:, "CELL_LINE"] exp_data = gex.iloc[:, 1:] # Need to impute missing values (because of the added CCLE data) before being able to filter imp = preprocessing.Imputer(strategy="median") exp_data = pd.DataFrame(data=imp.fit_transform(exp_data.values), columns=list(exp_data.columns.values)) # Filter by variance filt = VarianceThreshold(exp_threshold) filt.fit(exp_data.values) gex = pd.concat( [col1, exp_data.loc[:, filt.get_support()]], axis=1 ) # gex dataframe, now without missing values and filtered by variance X = X.reset_index().merge(gex, how="left", on="CELL_LINE", sort=False).set_index("index") LB = LB.reset_index().merge(gex, how="left", on="CELL_LINE", sort=False).set_index("index") if use_methyl: X = X.reset_index().merge(datasets["methyl_data"], how="left", on="CELL_LINE", sort=False).set_index("index") LB = LB.reset_index().merge(datasets["methyl_data"], how="left", on="CELL_LINE", sort=False).set_index("index") if use_cell_info: X = X.reset_index().merge(datasets["cell_data"], how="left", on="CELL_LINE", sort=False).set_index("index") LB = LB.reset_index().merge(datasets["cell_data"], how="left", on="CELL_LINE", sort=False).set_index("index") X = X.sort_index(axis=0) LB = LB.sort_index(axis=0) # Remove CELL_LINE column X = X.drop("CELL_LINE", 1) LB = LB.drop("CELL_LINE", 1) # X = X.drop(['COMPOUND_A', 'COMPOUND_B'], 1) # LB = LB.drop(['COMPOUND_A', 'COMPOUND_B'], 1) # Encode categorical data obj_cols = list(X.select_dtypes(include=["object"]).columns) col_names = list(X.columns.values) last_col = col_names.index(col_names[-1]) X = pd.get_dummies(X, columns=obj_cols) LB = pd.get_dummies(LB, columns=obj_cols) missing_classes_X = list(set(list(LB.columns.values)[last_col:]).difference(set(list(X.columns.values)[last_col:]))) missing_classes_LB = list( set(list(X.columns.values)[last_col:]).difference(set(list(LB.columns.values)[last_col:])) ) X = pd.concat( [ X, pd.DataFrame( data=np.zeros((len(list(X.index.values)), len(missing_classes_X))), index=X.index, columns=missing_classes_X, ), ], axis=1, ) LB = pd.concat( [ LB, pd.DataFrame( data=np.zeros((len(list(LB.index.values)), len(missing_classes_LB))), index=LB.index, columns=missing_classes_LB, ), ], axis=1, ) # # Remove features with more than 50% NaN # keep_features = [] # nan = X.isnull().sum()/len(X.index) # for col_name in X: # if nan[col_name] <= 0.5: # keep_features.append(col_name) # X = X.loc[:, keep_features] # LB = LB.loc[:, keep_features] keep_features = list(X.columns.values) # Impute missing values imp = preprocessing.Imputer(missing_values="NaN", strategy="median") X = imp.fit_transform(X.values) LB = imp.fit_transform(LB.values) # Remove features with zero variance filt = VarianceThreshold() filt.fit(X) X = X[:, filt.get_support()] LB = LB[:, 
filt.get_support()] keep_features = [keep_features[i] for i in xrange(len(keep_features)) if list(filt.get_support())[i]] if scale: scaler = preprocessing.StandardScaler().fit(X) X = scaler.transform(X) LB = scaler.transform(LB) return X, LB, keep_features
# Thresholding Numerical Feature Variance # removing features with low variance by selecting a subset of features with variances above a given threshold iris = datasets.load_iris() features = iris.data target = iris.target # Create thresholder thresholder = VarianceThreshold(threshold=.5) # Create high variance feature matrix features_high_variance = thresholder.fit_transform(features) # View high variance feature matrix print(features_high_variance[0:3]) # We can see the variance for each feature using variances_: print(thresholder.fit(features).variances_) # VT first calculates the variance of each feature, then it drops those whose variance does not meet that threshold. # If the features have been standardized (mean zero and unit variance), of course variance thresholding will not work. # Thresholding Binary Feature Variance # You have a set of binary categorical features and want to remove those with low variance. # We select a subset of features with a Bernoulli random variable variance above a given threshold. # Create feature matrix with: # Feature 0: 80% class 0 # Feature 1: 80% class 1 # Feature 2: 60% class 0, 40% class 1 features = [[0, 1, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0]] # Run threshold by variance
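# A minimal sketch completing the Bernoulli case set up above: for a binary feature
# the variance is p(1-p), so a threshold of .75 * (1 - .75) drops any column in which
# a single class appears in more than 75% of the rows.
from sklearn.feature_selection import VarianceThreshold

thresholder = VarianceThreshold(threshold=(.75 * (1 - .75)))
print(thresholder.fit_transform(features))
# Only the third column (a 60/40 split, variance 0.24) exceeds 0.1875 and is kept.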
    print('PERCENTAGE OF THE DATASET EXCLUDED:', (1 - df[col].count() / 80000) * 100, '%')

"""
4) REMOVING CONSTANT VARIABLES

EXAMPLE: VARIABLES WITH 99% OF THEIR VALUES FILLED WITH 0 OR NULL SHOULD BE
EXCLUDED, SINCE THEY ARE NOT SIGNIFICANT FOR THE MODEL
"""

from sklearn.feature_selection import VarianceThreshold

var_thres = VarianceThreshold(threshold=0.01)
var_thres.fit(df)
var_thres.get_support()

constant_columns = [column for column in df.columns
                    if column not in df.columns[var_thres.get_support()]]

df = df.drop(constant_columns, axis=1)  # assign the result; drop() is not in-place by default

print('NUMBER OF CONSTANT VARIABLES EXCLUDED:', len(constant_columns))

"""
5) SELECTING THE BEST VARIABLES FOR THE MODEL
# for feature selection i have a few ideas. 1) run feature selection over the whole matrix of features. #2) remove some of the recordings and do it a few times (so manually k-folding), because that way if the same features are removed #then we know that for real those are the features not helpful xtrain_aud = sio.loadmat('xtrain_all_aud.mat') xtrain_aud = xtrain_aud['xtrain'] ytrain_aud = sio.loadmat('ytrain_all_aud.mat') ytrain_aud = ytrain_aud['ytrain'] # method 1: variance threshold Var_selector = VarThresh(.5) # without any parameters passed to varthresh it defaults to anything with all feautres the exact same # am going to start with .1 Var_selector.fit(xtrain_aud) which_feats = Var_selector.get_support() x_aud_fitted = Var_selector.transform(xtrain_aud) print x_aud_fitted.shape xtrunclength = sio.loadmat('xtrunclength.mat') xtrunclength = xtrunclength['xtrunclength'] xtesting = sio.loadmat('xtesting.mat') xtesting = xtesting['xtesting'] xtesting = xtesting[~np.isnan(xtesting).any(axis=1),:] xtesting = xtesting[~np.isinf(xtesting).any(axis=1),:]
y[categorical_ix]) * 1 # use "* 1" to convert it into int results_array[numerical_ix] = np.abs( x[numerical_ix] - y[numerical_ix]) / norm_range[numerical_ix] return np.sum(np.square(results_array)) tidy_data = pd.read_csv('tidy.csv') X_data = tidy_data.drop(['CHT_No', 'reStroke'], axis=1) X_data = X_data.drop(['Adm_AF_0otEKG', 'EKG_AF', 'Adm_AntiCO'], axis=1) # highly related to AF y_data = tidy_data[['reStroke']] # remove constant features from sklearn.feature_selection import VarianceThreshold selector = VarianceThreshold(threshold=0) selector.fit(X_data) X_data = X_data[X_data.columns[selector.get_support(indices=True)]] categorical_ix = np.array( [0, 2, 3, 4, 5, 6, 7, 8, 34, 35, 36, 41, 42, 43, 44, 46, 47, 48, 49, 51]) categorical_columns = X_data.columns[categorical_ix].values numerical_columns = np.setdiff1d(X_data.columns, categorical_columns) numerical_ix = np.array([X_data.columns.get_loc(c) for c in numerical_columns]) X_data[numerical_columns] = StandardScaler().fit_transform( X_data[numerical_columns]) norm_range = np.array( np.nanmax(X_data.values, axis=0) - np.nanmin(X_data.values, axis=0)) heom_metric = distython.HEOM(X_data, categorical_ix, nan_equivalents=[np.nan]) reducer = umap.UMAP(metric=heom_metric.heom, random_state=369)
def model_xgb(features_train, labels_train, features_test): # Remove constant features selector_vt = VarianceThreshold() selector_vt.fit(features_train) # Get the indices of zero variance features features_kept = selector_vt.get_support(indices=True) orig_features = np.arange(features_train.columns.size) features_deleted = np.delete(orig_features, features_kept) #print ("Indices of deleted features:", features_deleted) print ("- Number of constant features removed:", len(features_deleted)) # Delete zero variance features from train and test sets features_train = features_train.drop(labels=features_train.columns[features_deleted], axis=1) features_test = features_test.drop(labels=features_test.columns[features_deleted], axis=1) #print (features_train.shape, features_test.shape) """ # Another way of removing constant features. Slightly slower than the above method # Count the number of unique values in each feature nuniques_train = features_train.apply(lambda x:x.nunique()) no_variation_train = nuniques_train[nuniques_train==1].index features_train = features_train.drop(no_variation_train, axis=1) features_test = features_test.drop(no_variation_train, axis=1) print (features_train.shape, features_test.shape) """ # Remove idential features features_deleted = [] # Find the names of identical features by going through all the combinations of features for f1, f2 in itertools.combinations(iterable=features_train.columns, r=2): if np.array_equal(features_train[f1], features_train[f2]): features_deleted.append(f2) features_deleted = np.unique(features_deleted) # Delete the identical features features_train = features_train.drop(labels=features_deleted, axis=1) features_test = features_test.drop(labels=features_deleted, axis=1) print ("- Number of idential features removed:", len(features_deleted)) # Add a column to count the number of zeros per row features_train['n0'] = (features_train == 0).sum(axis=1) features_test['n0'] = (features_test == 0).sum(axis=1) # Feature normalization f_train_normalized = normalize(features_train, axis=0) f_test_normalized = normalize(features_test, axis=0) # Do PCA print ("- Do PCA") pca = PCA(n_components=2) f_train_pca = pca.fit_transform(f_train_normalized) features_train['PCA1'] = f_train_pca[:,0] features_train['PCA2'] = f_train_pca[:,1] f_test_pca = pca.fit_transform(f_test_normalized) features_test['PCA1'] = f_test_pca[:,0] features_test['PCA2'] = f_test_pca[:,1] # Feature selection #p = 75, AUC = 0.834348 p = 70 # AUC = 0.834820 #p = 65, AUC = print ("- Do feature selection") f_train_binarized = Binarizer().fit_transform(scale(features_train)) selector_chi2 = SelectPercentile(chi2, percentile=p).fit(f_train_binarized, labels_train) selected_chi2 = selector_chi2.get_support() # a list of True/False to indicate if a feature is selected or not #selected_chi2_features = [f for i, f in enumerate(features_train.columns) if selected_chi2[i]] #print (selected_chi2_features) select_f_classif = SelectPercentile(f_classif, percentile=p).fit(f_train_binarized, labels_train) selected_f_classif = select_f_classif.get_support() # a list of True/False to indicate if a feature is selected or not #selected_f_classif_features = [f for i, f in enumerate(features_train.columns) if selected_f_classif[i]] #print (selected_f_classif_features) selected = selected_chi2 & selected_f_classif selected_features = [f for i, f in enumerate(features_train.columns) if selected[i]] #print (selected_features) features_train = features_train[selected_features] features_test = 
features_test[selected_features] # xgboost print ("- Perform xgboost") params = { "objective": "binary:logistic", "silent": 1, "eval_metric": "auc", "eta": 0.03, # tried 0.01 "subsample": 0.5, # tried 1.0, 0.4 "colsample_bytree": 0.7, # tried 0.5, 0.9 "max_depth": 2 # 2-->AUC=0.836347; 5 --> AUC=0.835131; 7 -> AUC=0.834351 #"min_child_weight": 1, # tried 2 & 5 #"gamma": 0 # tried 4 } train_xgb = xgb.DMatrix(features_train, labels_train) test_xgb = xgb.DMatrix(features_test) clf = xgb.train(params, train_xgb, num_boost_round=500) # tried 400, 500, 600 # Get the importances of features, returning pairs of features and their importances importance = clf.get_fscore() # Sort features by importance, and return the top features only # 'key' parameter specifies a function to be called on each list element prior to making comparisons # itemgetter(1) returns importances, itemgetter(0) returns features sorted_importance = sorted(importance.items(), key=operator.itemgetter(1))[-15:] #print (sorted_importance) # Put pairs of features and their importances into a DataFrame for plotting df_importance = pd.DataFrame(sorted_importance, columns=['feature', 'fscore']) # Plot the importance of features, which is useful for data exploration phase df_importance.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(20, 6)) plt.title('XGBoost Feature Importance') plt.xlabel('feature importance') plt.gcf().savefig('feature_importance_xgb.png') #plt.show() # if putting show() before gcf().savefig, the figure won't be saved return clf.predict(test_xgb)
def FeatureSelection(x): selector = VarianceThreshold() selector.fit(x) return selector.get_support(True)
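# Hedged usage note for the helper above: it returns the indices of the kept columns,
# so a hypothetical NumPy matrix can be reduced with plain integer indexing.
import numpy as np

x = np.array([[0., 1., 3.], [0., 2., 3.], [0., 3., 3.]])
kept = FeatureSelection(x)
print(kept)          # expected: [1] -- only the middle column varies
x_reduced = x[:, kept]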
gs.fit(X_all, y_all) scores_post = cross_val_score(gs.best_estimator_, X_all, y_all, scoring=scorer, cv=5) print '*' * 20 print clf.__class__.__name__ print "Params %s" % gs.best_params_ print "Score %.4f" % gs.best_score_ return gs.best_estimator_ from sklearn.feature_selection import VarianceThreshold sel = VarianceThreshold(.5) result = sel.fit(X_all) remove = [] for x in xrange(0, len(sel.variances_)): if(sel.variances_[x] < 0.0): remove.append(X_all.dtypes.index[x]) for a in remove: print "Removing %s" % a del X_all[a] good_cols = ['failures', 'absences', #'schoolsup', #'goout', # 'paid', # 'guardian_other',
def delete_low_variance(x_train, x_test): low_var = VarianceThreshold(threshold=0.1) low_var.fit(x_train) x_train, x_test = low_var.transform(x_train), low_var.transform(x_test) return (x_train, x_test)
print("Classification report: ") print(classification_report(Y_test, Y_pred)) accuracy_score = accuracy_score(Y_test, Y_pred) print("Accuracy of the model: ", accuracy_score) # ** Variance Threshold** # In[45]: X = adult_df_rev.values[:, :-1] Y = adult_df_rev.values[:, -1] # In[46]: from sklearn.feature_selection import VarianceThreshold # In[53]: #scaling required vt = VarianceThreshold() fit1 = vt.fit(X, Y) print(fit1.variances_) features = fit1.transform(X) print(features) print(features.shape[1]) print(list(zip(colname, fit1.get_support())))
# import some data to play with iris = datasets.load_iris() # Create features and target features = iris.data target = iris.target # Create thresholder thresholder = VarianceThreshold(threshold=.5) # Create high variance feature matrix features_high_variance = thresholder.fit_transform(features) # View high variance feature matrix features_high_variance[0:3] # View variances thresholder.fit(features).variances_ # Load library from sklearn.preprocessing import StandardScaler # Standardize feature matrix scaler = StandardScaler() features_std = scaler.fit_transform(features) # Caculate variance of each feature selector = VarianceThreshold() selector.fit(features_std).variances_
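# Worth noting for the snippet above: after StandardScaler every column has unit
# variance, so the final call returns an array of values that are all (within floating
# point error) equal to 1.0, and variance thresholding can no longer separate features.
print(selector.fit(features_std).variances_)  # roughly [1. 1. 1. 1.] for the four iris columns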
y_donate, train_size=0.70, random_state=123) X_train_scaled, X_test_scaled, y_train, y_test = train_test_split( X_scaled, y_donate, train_size=0.70, random_state=123) print(X_train.shape, X_test.shape, y_train.shape, y_test.shape) print(X_train_scaled.shape, X_test_scaled.shape, y_train.shape, y_test.shape) # ### IV-2. Feature Selection Using Variance Threshold # In[41]: # Remove all features whose variance are less than 0.05 var_selector = VarianceThreshold(threshold=0.05) var_selector.fit(X_train_scaled) indices_selected = var_selector.get_support(indices=True) colnames_vtselected = [X_train_scaled.columns[i] for i in indices_selected] print(colnames_vtselected) len(colnames_vtselected) # ### IV-3. Further Feature Selection using RFECV # In[42]: # Specify the model estimator = LogisticRegression( ) # estimator for RFE, select the suitable model # Select variables using RFECV rfe_selector = RFECV(estimator, step=1, cv=5, n_jobs=-1, scoring='roc_auc')
rbscale = preprocessing.RobustScaler(quantile_range=(25, 75)).fit(ModelSample[FeatureList]) RbIndex = rbscale.transform(ModelSample[FeatureList]) RbIndex = pd.DataFrame(RbIndex,columns=ModelSample[FeatureList].columns,index=ModelSample[FeatureList].index) RbIndexCorr = RbIndex.corr() RbIndexCorr.to_csv('RbIndexCorr.csv',encoding='utf8') RbIndex['dis_index'] = ModelSample['dis_index'] #################Index Selection################## ######1##### #VarTIndexSelect = ModelSample.drop(['con_index','com_index','dis_index'],axis=1) #VarTYSelect = ModelSample[['con_index','com_index','dis_index']] from sklearn.feature_selection import VarianceThreshold VarT = VarianceThreshold(threshold=(0.8*(1-0.8))) VarTvari = VarT.fit(RbIndex[FeatureList]).variances_ VarTvari = pd.DataFrame(VarTvari,index=RbIndex[FeatureList].columns) VarTvari.columns = ['variances'] VarTFeatureList = VarTvari[VarTvari.variances>=0.2].index.tolist() VarTFeatureList.append('dis_index') VarTSample = ModelSample[VarTFeatureList] VarTFeatureList.pop() #####2##### #UniIndexSelect = IndexVarT.drop(['con_index','com_index','dis_index'],axis=1) #UniYSelect = IndexVarT[['con_index','com_index','dis_index']] from sklearn.feature_selection import f_classif, mutual_info_classif,chi2 mi = mutual_info_classif(VarTSample[VarTFeatureList],VarTSample.dis_index) mi = pd.Series(mi,index=VarTSample[VarTFeatureList].columns) ff,fp = f_classif(VarTSample[VarTFeatureList],VarTSample.dis_index)
from sklearn.feature_selection import VarianceThreshold

# decide on a variance threshold first
vt = VarianceThreshold(threshold=0.005)  # or 0.003

# numeric data only
ansur_male_num = ansur_male.select_dtypes(include='number')

# normalize all features by dividing them by their mean
normalized_df = ansur_male_num / ansur_male_num.mean()

# check the variances on a comparable scale
normalized_df.var()

# fit the selector on the normalized copy, then apply the mask to the original columns
_ = vt.fit(normalized_df)
mask = vt.get_support()
ansur_male_num = ansur_male_num.loc[:, mask]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1121218)

# Init, fit, score
forest = RandomForestRegressor(random_state=1121218)
_ = forest.fit(X_train, y_train)

# Training Score
print(f"Training Score: {forest.score(X_train, y_train)}")
print("{0:.1%} accuracy on test set vs. {1:.1%} on training set".format(accuracy_test, accuracy_train)) # 93.3% accuracy on test set vs. 94.9% on training set # Wow, what just happened!? On the full dataset the model is rubbish but with a single feature we can make good predictions? This is an example of the curse of dimensionality! The model badly overfits when we feed it too many features. It overlooks that neck circumference by itself is pretty different for males and females. # Features with missing values or little variance # Low variance features are so similar between different observation that they may contain little information we can use in an analysis # To remove them we can use from sklearn.feature_selection import VarianceThreshold sel = VarianceThreshold(threshold=1) # set the minimal variance threshold # Fit the selector to our dataset sel.fit(ansur_df) mask = sel.get_support() # This will give us a TRUE or FALSE value on whether each feature's variance is above the threshold or not # loc method and specify we want to select all rows using a colon for the first argument and subselect the columns by using our mask as the second reduced_df = ansur_df.loc[:, mask] # Normalize the variance before using it for feature selection. To do so we divide each column by its mean value before fitting the selector sel = VarianceThreshold(threshold=0.005) sel.fit(ansur_df / ansur_df.mean()) # After normlisation the variance in the dataset will be lower. # When we apply the selector to our dataset the nr of features is more than haved, to 45
def demo(self): with open('abalone.data', 'r') as open_file: abalone = open_file.read() abalone = abalone.strip() abalone = re.split('[\n ,]', abalone) #print abalone for index in range(len(abalone)): if abalone[index] == 'M': abalone[index] = '0' elif abalone[index] == 'F': abalone[index] = '1' elif abalone[index] == 'I': abalone[index] = '2' abalone = [abalone[i:i + 9] for i in range(0, len(abalone), 9)] abalone = np.array(abalone, dtype=float) X = np.delete(abalone, [0], axis=1) y = abalone.T[0] # feature selection # VarianceThreshold sel = VarianceThreshold(threshold=1) sel.fit(X, y) scores1 = sel.variances_ index1 = np.argsort(scores1) n = index1[:-4] X_new_1 = np.delete(X, [n], axis=1) # SelectKBest skb = SelectKBest(chi2, k=3) skb.fit(X, y) scores2 = skb.scores_ index2 = np.argsort(scores2) n = index2[:-4] X_new_2 = np.delete(X, [n], axis=1) # L1 lsvc = LinearSVC(C=0.043, penalty="l1", dual=False) lsvc.fit(X, y) model = SelectFromModel(lsvc, prefit=True) X_new_3 = lsvc.transform(X) scores3 = lsvc.coef_ np.abs(scores3) index3 = np.argsort(scores3) # tree clf = ExtraTreesClassifier() clf.fit(X, y) model = SelectFromModel(clf, prefit=True) scores4 = clf.feature_importances_ index4 = np.argsort(scores4) n = index4[:-4] X_new_4 = np.delete(X, [n], axis=1) # pipline clf = Pipeline([('feature_selection', SelectFromModel(LinearSVC(penalty="l2"))), ('classification', RandomForestClassifier())]) clf.fit(X, y) X = PolynomialFeatures( interaction_only=True).fit_transform(X_new_1).astype(float) clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X_new_1, y) clf.predict(X_new_1) score1 = clf.score(X_new_1, y) X = PolynomialFeatures( interaction_only=True).fit_transform(X_new_2).astype(float) clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X_new_2, y) clf.predict(X_new_2) score2 = clf.score(X_new_2, y) X = PolynomialFeatures( interaction_only=True).fit_transform(X_new_3).astype(float) clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X_new_3, y) clf.predict(X_new_3) score3 = clf.score(X_new_3, y) X = PolynomialFeatures( interaction_only=True).fit_transform(X_new_4).astype(float) clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X_new_4, y) clf.predict(X_new_4) score4 = clf.score(X_new_4, y) print score1, score2, score3, score4 # 0.385683504908 0.385683504908 0.386641129998 0.531242518554 #0.385683504908 0.385683504908 0.386641129998 0.403878381614 #0.385683504908 0.385683504908 0.386641129998 0.456787167824 #0.385683504908 0.385683504908 0.386641129998 0.531481924826 #0.385683504908 0.385683504908 0.386641129998 0.427100790041 #fig, ax = plt.subplots() '''fig = plt.figure(1) ax2 = fig.add_subplot(311) ax3 = fig.add_subplot(312) ax4=fig.add_subplot(313) y1=[] y2=[] y3=[] for i in range(8): x_1= np.linspace(i,8,1) x_2=np.linspace(i,9,1) x_3=np.linspace(i,10,1) y1=scores_1[i] y2=scores_2[i] y3=scores_3[i]''' '''ax.cla() ax.set_title("festure selection") ax.set_xlabel("features") ax.set_ylabel("scores") ax.set_xlim(0, 10) ax.grid() ax.plot(y1, 'r^',label='Varience') ax.plot(y2, 'k^' ,label='selectbeskt') ax.plot(y3, 'bs',label='tree') ax.legend(loc='best') ax2.set_title("Varience") ax3.set_title("SelectKBest") ax4.set_title("ExtraTreesClassifier") ax2.set_xlabel("features") ax2.set_ylabel("scores") ax2.set_xlim(-1,10,1) n1=ax2.plot(x_1,y1,'r^') n2=ax3.plot(x_2,y2,'k^') n3=ax4.plot(x_3,y3,'bs')''' '''ax2.legend(loc='best') ax3.legend(loc='best') ax4.legend(loc='best') #if ax2.legend in fig2: plt.pause(1.5)''' '''plt.clf()
def compute(train, test): #Train data train_X = []; train_restaurant_ids = []; test_X = []; test_restaurant_ids = []; train_Y = []; #Common feature values in train/test train_feature_val = {}; test_feature_val = {}; build_FeatureVal(train, train_feature_val); build_FeatureVal(test, test_feature_val); buildFeatures(train, train_feature_val, test_feature_val, train_X, train_Y, train_restaurant_ids, "train"); buildFeatures(test, train_feature_val, test_feature_val, test_X, None, test_restaurant_ids, "test"); train_Y = np.array(train_Y); enc = OneHotEncoder(categorical_features=np.array([3,4,5,32,33,34,35,36,37,38,39,40,41,42]), sparse=False, n_values=100); enc.fit(test_X); train_X = enc.transform(train_X); test_X = enc.transform(test_X); print("No of train features " + str(len(train_X[0]))); print("No of test features " + str(len(test_X[0]))); #Remove features with similar values selector = VarianceThreshold(); selector.fit(train_X); train_X = selector.transform(train_X); test_X = selector.transform(test_X); print("No of train features " + str(len(train_X[0]))); print("No of test features " + str(len(test_X[0]))); parameters_to_try = generateParams(); print("No of Paramters to test " + str(len(parameters_to_try))); #Contruct parameters as s list models_to_try = [ (copy.copy(train_X), copy.copy(train_Y), parameters_to_try[i] ) for i in range(0, len(parameters_to_try)) ]; #Create a Thread pool. pool = Pool(8); results = pool.map( train_model_wrapper, models_to_try ); pool.close(); pool.join(); best_params = None; best_rmse = sys.float_info.max; for i in range(0, len(results)): if results[i][1] < best_rmse: best_rmse = results[i][1]; best_params = results[i][0]; print("Best Params : " + str(best_params)); print("Best RMSE : " + str(best_rmse)); #estimator = SVR(**params) #estimator = RandomForestRegressor(**best_params) estimator = GradientBoostingRegressor(**best_params) estimator.fit(train_X, train_Y); print("Writing Output"); predict_and_save(estimator, test_X, test_restaurant_ids);
class VarianceSelector(Transformer): type = 9 def __init__(self, threshold=1e-7): super().__init__("variance_selector") self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL] self.compound_mode = 'only_new' self.threshold = threshold def operate(self, input_datanode, target_fields=None): from sklearn.feature_selection import VarianceThreshold feature_types = input_datanode.feature_types X, y = input_datanode.data if target_fields is None: target_fields = collect_fields(feature_types, self.input_type) X_new = X.copy() else: X_new = X[:, target_fields] n_fields = len(feature_types) irrevalent_fields = list(range(n_fields)) for field_id in target_fields: irrevalent_fields.remove(field_id) is_selected = [True] * len(target_fields) if self.model is None: self.model = VarianceThreshold(threshold=self.threshold) self.model.fit(X_new) for idx, var in enumerate(self.model.variances_): is_selected[idx] = True if var > self.threshold else False irrevalent_types = [feature_types[idx] for idx in irrevalent_fields] selected_types = [ feature_types[idx] for idx in target_fields if is_selected[idx] ] selected_types.extend(irrevalent_types) _X = self.model.transform(X_new) if len(irrevalent_fields) > 0: new_X = np.hstack((_X, X[:, irrevalent_fields])) if input_datanode.feature_names is not None: feature_names = np.hstack(([ input_datanode.feature_names[idx] for idx in irrevalent_fields ], [ input_datanode.feature_names[idx] for idx in self.model.get_support(True) ])) else: feature_names = None else: new_X = _X if input_datanode.feature_names is not None: feature_names = [ input_datanode.feature_names[idx] for idx in self.model.get_support(True) ] else: feature_names = None new_feature_types = selected_types output_datanode = DataNode((new_X, y), new_feature_types, input_datanode.task_type, feature_names=feature_names) output_datanode.trans_hist = input_datanode.trans_hist.copy() output_datanode.trans_hist.append(self.type) output_datanode.enable_balance = input_datanode.enable_balance output_datanode.data_balance = input_datanode.data_balance self.target_fields = target_fields.copy() return output_datanode @staticmethod def get_hyperparameter_search_space(dataset_properties=None, optimizer='smac'): if optimizer == 'smac': cs = ConfigurationSpace() return cs elif optimizer == 'tpe': from hyperopt import hp space = {} return space
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0, stratify = y) # In[80]: x_train.shape # ### Removing Constant, Quasi & Duplicated features # In[81]: constant_filter = VarianceThreshold(threshold =0.01) constant_filter.fit(x_train) x_train_filter = constant_filter.transform(x_train) x_test_filter = constant_filter.transform(x_test) # In[82]: x_train_filter.shape # In[83]: x_train_T = x_train_filter.T x_test_T = x_test_filter.T
).add(flights[AIRPORTS[1]].value_counts(), fill_value=0) for airport in AIRPORTS: flights = join_aggregates( flights, airport, airport_size, airport+"_TRAFFIC") for airport in AIRPORT_NAMES: airport_intl = airport+"_INTL" flights[airport_intl] = flights[airport].apply(is_international) for coordinate in COORDINATES: flights[coordinate+"_DIF"] = flights[coordinate + '_ORIGIN'] - flights[coordinate+'_DESTINATION'] flights.drop(UNAVAILABLE + EXTRA + DUP_COLS, axis=1, inplace=True) STR_VAR = [v for v in flights.columns if is_string_dtype(flights[v])] flights = categorize_multiple(flights, STR_VAR) flights.drop(STR_VAR, axis=1, inplace=True) NULL_VAR = get_col_with_null(flights) NUM_VAR_COMPLETE = [v for v in flights.columns if is_numeric_dtype( flights[v]) and v not in NULL_VAR] selector = VarianceThreshold() selector.fit(flights[NUM_VAR_COMPLETE]) NUM_VAR_REMOVED = [NUM_VAR_COMPLETE[i] for i in range( len(NUM_VAR_COMPLETE)) if selector.get_support()[i] == False] flights.drop(NUM_VAR_REMOVED, axis=1, inplace=True) flights.to_csv("./data/flights_processed.csv", index=False)
def variance_threshold_selector(X, threshold=0.01): selector = VarianceThreshold(threshold) selector.fit(X) return X[X.columns[selector.get_support(indices=True)]]
def preProcessData(trainFeatureMatrix, testFeatureMatrix): totalFeatureNum = 52 singleValueIndexList = [17, 19, 20, 23] categoricalAttriIndexList = [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 44, 45, 46] categoricalFeatureValueNumList = [13, 112, 2, 13, 13, 112, 2, 13, 145, 4, 3031, 4, 138, 102, 102, 2090] cateNumericIndexList = [1, 6, 15, 16, 18,21,22,24,25,26,27,28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,49,50,51] numericAttriIndexList = [1, 6, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 47, 48, 49, 50, 51] # for i in range(len(trainFeatureSpace[0])): # if not i in categoricalAttriIndexList: # #print 'numerical', i, len(list(set(trainFeatureSpace[:,i]))) # print '%s, numerical, train: %s, test:%s' % (i, len(list(set(trainFeatureMatrix[:,i]))), len(list(set(testFeatureMatrix[:,i])))) # else: # print '%s, categorical, train: %s, test:%s' % (i, len(list(set(trainFeatureMatrix[:,i]))), len(list(set(testFeatureMatrix[:,i])))) tempResultMatrix = np.concatenate((trainFeatureMatrix, testFeatureMatrix), axis=0) # print len(trainFeatureMatrix), len(trainFeatureMatrix[0]) # print len(testFeatureMatrix), len(testFeatureMatrix[0]) # print len(tempResultMatrix), len(tempResultMatrix[0]) # exit() # for i in range(len(trainFeatureMatrix)): # for j in range(len(trainFeatureMatrix[0])): # if j in cateNumericIndexList: # trainFeatureMatrix[i][j] = int(trainFeatureMatrix[i][j]) # for i in range(len(testFeatureMatrix)): # for j in range(len(testFeatureMatrix[0])): # if j in cateNumericIndexList: # testFeatureMatrix[i][j] = int(testFeatureMatrix[i][j]) #selectedFeatureList = [] # for i in range(53): # if not i in singleValueIndexList: # selectedFeatureList.append(i) # trainFeatureMatrix = trainFeatureMatrix[ : , selectedFeatureList] # testFeatureMatrix = testFeatureMatrix[ : , selectedFeatureList] from sklearn.preprocessing import OneHotEncoder enc = OneHotEncoder() enc.__init__(categorical_features = categoricalAttriIndexList + cateNumericIndexList) enc.fit(tempResultMatrix) trainFeatureMatrix = enc.transform(trainFeatureMatrix).toarray() testFeatureMatrix = enc.transform(testFeatureMatrix).toarray() print 'old feature num is ', len(trainFeatureMatrix[0]), len(testFeatureMatrix[0]) #tempResultMatrix = np.concatenate((trainFeatureMatrix, testFeatureMatrix), axis=0) sel = VarianceThreshold() sel.fit(trainFeatureMatrix) trainFeatureMatrix = sel.transform(trainFeatureMatrix) testFeatureMatrix = sel.transform(testFeatureMatrix) print 'new feature num is ', len(trainFeatureMatrix[0]), len(testFeatureMatrix[0]) #exit() return trainFeatureMatrix, testFeatureMatrix
def get_low_variance_columns(dframe=None, columns=[], skip_columns=[], threshold=0.0, autoremove=False): """Wrapper for sklearn VarianceThreshold for use on pandas dataframes.""" print("Finding low-variance features.") removed_features = [] ranking_variance_thresholds = {} try: # get list of all the original df columns all_columns = dframe.columns # remove `skip_columns` remaining_columns = all_columns.drop(skip_columns) # get length of new index max_index = len(remaining_columns) - 1 # get indices for `skip_columns` skipped_idx = [all_columns.get_loc(col) for col in skip_columns] # adjust insert location by the number of columns removed # (for non-zero insertion locations) to keep relative # locations intact for idx, item in enumerate(skipped_idx): if item > max_index: diff = item - max_index skipped_idx[idx] -= diff if item == max_index: diff = item - len(skip_columns) skipped_idx[idx] -= diff if idx == 0: skipped_idx[idx] = item # get values of `skip_columns` skipped_values = dframe.iloc[:, skipped_idx].values # get dataframe values X = dframe.loc[:, remaining_columns].values # instantiate VarianceThreshold object vt = VarianceThreshold(threshold=threshold) # fit vt to data vt.fit(X) # threshold ranking ranking_variance_thresholds = dict( list(zip(remaining_columns, vt.variances_))) # get the indices of the features that are being kept feature_indices = vt.get_support(indices=True) # remove low-variance columns from index feature_names = [ remaining_columns[idx] for idx, _ in enumerate(remaining_columns) if idx in feature_indices ] # get the columns to be removed removed_features = list(np.setdiff1d(remaining_columns, feature_names)) print(("""Found {0} low-variance columns. """.format(len(removed_features)))) # remove the columns if autoremove: print("Removing low-variance features.") # remove the low-variance columns X_removed = vt.transform(X) print("Reassembling the dataframe (with low-variance " "features removed).") # re-assemble the dataframe dframe = pd.DataFrame(data=X_removed, columns=feature_names) # add back the `skip_columns` for idx, index in enumerate(skipped_idx): dframe.insert(loc=index, column=skip_columns[idx], value=skipped_values[:, idx]) print("Succesfully removed low-variance columns.") # do not remove columns else: print("No changes have been made to the dataframe.") except Exception as e: print(e) print("Could not remove low-variance features. Something " "went wrong.") return dframe, [], {} return dframe, removed_features, ranking_variance_thresholds
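# Hedged usage sketch for the wrapper above, with a hypothetical DataFrame: the id
# column is skipped, near-constant columns are reported (and dropped only when
# autoremove=True), and the per-column variances come back as a dict.
import pandas as pd

df = pd.DataFrame({'id': [1, 2, 3, 4],
                   'flat': [7.0, 7.0, 7.0, 7.0],
                   'signal': [0.1, 0.9, 0.4, 0.6]})
df_out, dropped, ranking = get_low_variance_columns(dframe=df,
                                                    skip_columns=['id'],
                                                    threshold=0.0,
                                                    autoremove=True)
print(dropped)   # expected: ['flat'], the only zero-variance column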
train_data['Y'] = train_X_Y['Y']
train_data['X'] = train_X_Y['X']
training, validation = train_test_split(train_data, train_size=.60)

features = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
            'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN', 'PARK', 'RICHMOND',
            'SOUTHERN', 'TARAVAL', 'TENDERLOIN', 'X', 'Y']
features2 = [x for x in range(0, 24)]
features = features + features2

print "Variance Threshold"
sel = VarianceThreshold(threshold=(0.90 * (1 - 0.90)))
selector = sel.fit(training[features])
print selector.get_support(indices=True)
for i in range(0, len(features)):
    if i in selector.get_support(indices=True):
        print features[i]

print "Select from Model - Logistic"
modelLReg = LogisticRegression()
modelLReg = modelLReg.fit(training[features], training['crime'])
model = SelectFromModel(modelLReg, prefit=True)
print model.get_support(indices=True)
for i in range(0, len(features)):
    if i in model.get_support(indices=True):
        print features[i]
# To drop the year column
retail_set = retail_set.drop('YEAR', axis=1)

# To fill the missing values with the mean of their respective columns
retail_set.fillna(retail_set.mean(), inplace=True)

# To convert the categorical values to numeric so that they can be included in the prediction
X = pd.get_dummies(retail_set, columns=['CITY', 'STATE', 'FORMAT', 'REGION', 'SPECIAL'], drop_first=True)

# Eliminate the columns whose variance is below .5 * (1 - .5) = 0.25
sel = VarianceThreshold(threshold=(.5 * (1 - .5)))
sel.fit(X)
X = sel.transform(X)
X = StandardScaler().fit_transform(X)

# To reduce the dimensions using principal component analysis
pca = PCA(n_components=9)
pc = pca.fit_transform(X)

# Assigning the required components to features and output labels so that they can be used later for training and testing
X = pc
y = retail_set['RS_SALES']

# Dividing the set into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y,
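# A sketch of the same preprocessing chain wrapped in a scikit-learn Pipeline, so the
# variance filter, scaler, and PCA are fitted together and can be reapplied to new data
# with a single call. The random toy matrix and the component count are illustrative
# assumptions, not the retail data above.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

preprocess = Pipeline([
    ("variance", VarianceThreshold(threshold=.5 * (1 - .5))),
    ("scale", StandardScaler()),
    ("pca", PCA(n_components=2)),
])

X_demo = np.random.RandomState(0).randn(20, 6)  # standard-normal columns, variance close to 1
X_demo[:, 0] = 1.0                              # constant column, removed by the variance step
X_reduced = preprocess.fit_transform(X_demo)
print(X_reduced.shape)                          # (20, 2)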
df = pd.read_csv(data_path('train.csv'))
df_test = pd.read_csv(data_path('test.csv'))

target = df['TARGET']
del df['TARGET']
id = df_test['ID']

# Drop highly correlated columns, applying the same drop list to the test set
from src.transfomations import remove_correlated
_, to_remove = remove_correlated(df, 0.99)
df_test.drop(to_remove, axis=1, inplace=True)

# Fit the variance filter on the training features and reuse it for the test features
variance_threshold = VarianceThreshold(threshold=0.001)
df = variance_threshold.fit_transform(df)
df_test = variance_threshold.transform(df_test)

m2_xgb = XGBClassifier(n_estimators=110, nthread=1, max_depth=4, scale_pos_weight=.8)
m2_xgb.fit(df, target, eval_metric='auc')

param_dist = {
    "n_estimators": [80, 100, 110, 130],
    "max_depth": [3, 4, 5],
    "scale_pos_weight": [0.8, 1, 1.2],
    "learning_rate": [0.1, 0.05, 0.02],
}
randomizedSearch = RandomizedSearchCV(m2_xgb, n_iter=20, param_distributions=param_dist, verbose=2)
randomizedSearch.fit(df, target)
best = randomizedSearch.best_estimator_
""" from sklearn.feature_selection import VarianceThreshold from sklearn.feature_selection import chi2 from sklearn.feature_selection import SelectKBest kbestfilter = SelectKBest(chi2,k=500) train_features = kbestfilter.fit_transform(dataset_small.get_train_features(), dataset_small.get_train_labels()) test_features = kbestfilter.transform(dataset_small.get_test_features()) ## threshold = 0.8*(1-0.8) sel_var = VarianceThreshold(threshold = threshold) sel_var.fit(np.sign(dataset_small.get_train_features())) train_selected_features = sel_var.transform(dataset_small.get_train_features()) test_selected_features = sel_var.transform(dataset_small.get_test_features()) ## train naive bayes import sklearn.naive_bayes as naive_bayes bnb = naive_bayes.BernoulliNB() spam_filter = bnb.fit(np.sign(train_selected_features), dataset_small.get_train_labels()) spam_pred = spam_filter.predict(test_selected_features) ## evaluate goodness of prediction import sklearn.metrics report = sklearn.metrics.classification_report(dataset_small.get_test_labels(),
pass

# Import train and test raw data
train_data_raw = pd.read_csv("train.csv")
test_data_raw = pd.read_csv("test.csv")
train_data = optimize_data(train_data_raw)
test_data = optimize_data(test_data_raw)

# Remove the features with low variance: fit on the training set once,
# then apply the same column mask to both sets
from sklearn.feature_selection import VarianceThreshold
P = .8
sel = VarianceThreshold(threshold=(P * (1 - P)))
sel.fit(train_data)
kept_columns = train_data.columns[sel.variances_ > (P * (1 - P))]
test_data = test_data[[col for col in kept_columns if col in test_data.columns]]
train_data = train_data[kept_columns]

# test_data must contain the same columns as train data due to model fitting and prediction
for column in train_data.columns:
    if column not in test_data.columns:
        test_data[column] = pd.DataFrame().apply(lambda _: '', axis=1)
test_data.fillna(0, inplace=True)

# train_data = train_data_raw[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]
train_data_results = train_data_raw["Survived"]

# scaling train data
train_data_scaled = scale(train_data)
# scaling test data
test_data_scaled = scale(test_data)
def get_low_variance_columns(dframe=None, columns=None, skip_columns=None, thresh=0.0,
                             autoremove=False):
    """
    Wrapper for sklearn VarianceThreshold for use on pandas dataframes.
    """
    print("Finding low-variance features.")
    try:
        # get list of all the original df columns
        all_columns = dframe.columns

        # remove `skip_columns`
        remaining_columns = all_columns.drop(skip_columns)

        # get length of new index
        max_index = len(remaining_columns) - 1

        # get indices for `skip_columns`
        skipped_idx = [all_columns.get_loc(column)
                       for column in skip_columns]

        # adjust insert location by the number of columns removed
        # (for non-zero insertion locations) to keep relative
        # locations intact
        for idx, item in enumerate(skipped_idx):
            if item > max_index:
                diff = item - max_index
                skipped_idx[idx] -= diff
            if item == max_index:
                diff = item - len(skip_columns)
                skipped_idx[idx] -= diff
            if idx == 0:
                skipped_idx[idx] = item

        # get values of `skip_columns`
        skipped_values = dframe.iloc[:, skipped_idx].values

        # get dataframe values
        X = dframe.loc[:, remaining_columns].values

        # instantiate VarianceThreshold object
        vt = VarianceThreshold(threshold=thresh)

        # fit vt to data
        vt.fit(X)

        # get the indices of the features that are being kept
        feature_indices = vt.get_support(indices=True)

        # remove low-variance columns from index
        feature_names = [remaining_columns[idx]
                         for idx, _ in enumerate(remaining_columns)
                         if idx in feature_indices]

        # get the columns to be removed
        removed_features = list(np.setdiff1d(remaining_columns,
                                             feature_names))
        print("Found {0} low-variance columns.".format(len(removed_features)))

        # remove the columns
        if autoremove:
            print("Removing low-variance features.")

            # remove the low-variance columns
            X_removed = vt.transform(X)
            print("Reassembling the dataframe (with low-variance "
                  "features removed).")

            # re-assemble the dataframe
            dframe = pd.DataFrame(data=X_removed, columns=feature_names)

            # add back the `skip_columns`
            for idx, index in enumerate(skipped_idx):
                dframe.insert(loc=index, column=skip_columns[idx],
                              value=skipped_values[:, idx])
            print("Successfully removed low-variance columns.")

        # do not remove columns
        else:
            print("No changes have been made to the dataframe.")

    except Exception as e:
        print(e)
        print("Could not remove low-variance features. Something "
              "went wrong.")
        pass

    return dframe
# -*-coding:utf-8-*-
# @auth ivan
# @time 20200611
# @goal test 054.Test_Feature_selection

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_classif

X = [[100, 1, 2, 3],
     [100, 4, 5, 6],
     [100, 7, 8, 9],
     [101, 11, 12, 13]]
selector = VarianceThreshold(1)
selector.fit(X)
print('Variances is %s' % selector.variances_)
print('After transform is \n%s' % selector.transform(X))
print('The support is %s' % selector.get_support(True))
print('The support is %s' % selector.get_support(False))
print('After reverse transform is \n%s' % selector.inverse_transform(selector.transform(X)))
# Variances is [ 0.1875 13.6875 13.6875 13.6875]
# After transform is
# [[ 1  2  3]
#  [ 4  5  6]
#  [ 7  8  9]
#  [11 12 13]]
# The support is [1 2 3]
# The support is [False  True  True  True]
# After reverse transform is
# [[ 0  1  2  3]
#  [ 0  4  5  6]
#  [ 0  7  8  9]
#  [ 0 11 12 13]]

X = [[1, 2, 3, 4, 5],
     [5, 4, 3, 2, 1],
     [3, 3, 3, 3, 3],
     [1, 1, 1, 1, 1]]
def remove_variance(features, p):
    # Drop features whose variance is below the Bernoulli bound p * (1 - p)
    t = p * (1 - p)
    sel = VarianceThreshold(threshold=t)
    return sel.fit_transform(features), sel
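# Hypothetical usage of remove_variance on a small boolean feature matrix (the data are
# illustrative only): with p = 0.8 the threshold is 0.8 * (1 - 0.8) = 0.16, so the constant
# first column falls below it and is dropped while the other two columns are kept.
import numpy as np
X = np.array([[0, 1, 0],
              [0, 1, 1],
              [0, 1, 0],
              [0, 0, 1]])
X_reduced, selector = remove_variance(X, 0.8)
print(X_reduced.shape)         # (4, 2)
print(selector.get_support())  # [False  True  True]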