def recursive_feature_selection(info_humans, info_bots, params, scale=False):
    X, y, features, scaler = get_Xy(info_humans, info_bots, scale=scale)

    print "first feature selection by variance test"
    skb = VarianceThreshold(threshold=(.8 * (1 - .8)))
    X_new = skb.fit_transform(X)
    features_1 = features[skb.get_support()]

    print "second feature selection by chi2 test"
    skb = SelectKBest(chi2, k=200)
    # skb = SelectFpr(chi2, alpha=0.005)
    X_new = skb.fit_transform(X_new, y)
    features_2 = features_1[skb.get_support()]

    # skb = PCA(n_components=250)
    # X_new = skb.fit_transform(X_new, y)

    print "third feature selection by recursive feature elimination (RFECV)"
    clf = LogisticRegression(penalty=params['penalty'], C=params['C'])
    # clf = SVC(kernel="linear")
    rfecv = RFECV(estimator=clf, step=1,
                  cv=cross_validation.StratifiedKFold(y, 5),
                  scoring='roc_auc', verbose=1)
    rfecv.fit(X_new, y)

    print("Optimal number of features : %d" % rfecv.n_features_)

    return skb, rfecv
def feature_selection(train_instances):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info('Crossvalidation started... ')
    selector = VarianceThreshold()
    selector.fit(train_instances)
    logger.info('Number of features used... ' +
                str(Counter(selector.get_support())[True]))
    logger.info('Number of features ignored... ' +
                str(Counter(selector.get_support())[False]))
    return selector
def pre_process_datasets(datasets, filter_method=None, threshold=(0, 0), normalize=True, use_cnv=False, use_mut=False): exp_train_data = datasets['exp_train_data'] exp_board_data = datasets['exp_board_data'] if use_cnv: cnv_train_data = datasets['cnv_train_data'] cnv_board_data = datasets['cnv_board_data'] if filter_method == 'cv': exp_cv = exp_train_data.std(1).values / exp_train_data.mean(1).values exp_train_data = exp_train_data.loc[exp_cv > threshold[0], :] exp_board_data = exp_board_data.loc[exp_cv > threshold[0], :] if use_cnv: cnv_train_data = cnv_train_data.apply(exp) cnv_cv = cnv_train_data.std(1).values / cnv_train_data.mean(1).values cnv_train_data = cnv_train_data.loc[cnv_cv > threshold[1], :] cnv_board_data = cnv_board_data.loc[cnv_cv > threshold[1], :] if filter_method == 'var': selector = VarianceThreshold(threshold[0]) selector.fit(exp_train_data.values.T) exp_train_data = exp_train_data.loc[selector.get_support(), :] exp_board_data = exp_board_data.loc[selector.get_support(), :] if use_cnv: selector = VarianceThreshold(threshold[1]) selector.fit(cnv_train_data.values.T) cnv_train_data = cnv_train_data.loc[selector.get_support(), :] cnv_board_data = cnv_board_data.loc[selector.get_support(), :] if use_cnv: feat_train_data = exp_train_data.append(cnv_train_data) feat_board_data = exp_board_data.append(cnv_board_data) print 'features after filtering', exp_train_data.shape[0], '+', cnv_train_data.shape[0], '=', feat_train_data.shape[0] else: feat_train_data = exp_train_data feat_board_data = exp_board_data print 'features after filtering', exp_train_data.shape[0] if use_mut: feat_train_data = feat_train_data.append(datasets['mut_train_data']) feat_board_data = feat_board_data.append(datasets['mut_board_data']) if normalize: scaler = StandardScaler().fit(feat_train_data.values.T) feat_train_data.values[:,:] = scaler.transform(feat_train_data.values.T).T feat_board_data.values[:,:] = scaler.transform(feat_board_data.values.T).T datasets['feat_train_data'] = feat_train_data datasets['feat_board_data'] = feat_board_data
def main(): parser = argparse.ArgumentParser(description='Normalize the feature values') required = parser.add_argument_group('required options') required.add_argument('-x', '--outlist', required=True, help='File containing feature values') required.add_argument('-y', '--execlist', required=True, help='File containing exec list') args = parser.parse_args() #X = np.loadtxt(args.outlist, skiprows=1) np.set_printoptions(precision=2) X = np.genfromtxt(args.outlist, skiprows=1) X=np.nan_to_num(X) Y = np.loadtxt(args.execlist, ndmin=2) #f = open("trainlist","wb") #newResult = X/Y #sel = VarianceThreshold(threshold=(.8*(1-.8))) sel = VarianceThreshold(threshold=(.8*(1-.8))) result1 = sel.fit_transform(X) newResult = result1/Y #result2 = sel.fit_transform(newResult) #feature collection for test programs if os.path.isfile('eventlist'): features = np.genfromtxt('eventlist',dtype='str') featureFromVariance = sel.get_support(indices=True) text_file = open("variancefeatures.txt","w") for i in featureFromVariance: text_file.write(features[i]) text_file.write("\n") text_file.close() np.savetxt('normfeaturelist', newResult, fmt='%.2f', delimiter='\t')
def remove_feat_constants(data_frame):
    # Remove feature vectors containing one unique value,
    # because such features do not have predictive value.
    print("")
    print("Deleting zero variance features...")
    # Let's get the zero variance features by fitting VarianceThreshold
    # selector to the data, but let's not transform the data with
    # the selector because it will also transform our Pandas data frame into
    # NumPy array and we would like to keep the Pandas data frame. Therefore,
    # let's delete the zero variance features manually.
    n_features_originally = data_frame.shape[1]
    selector = VarianceThreshold()
    selector.fit(data_frame)
    # Get the indices of zero variance feats
    feat_ix_keep = selector.get_support(indices=True)
    orig_feat_ix = np.arange(data_frame.columns.size)
    feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)
    # Delete zero variance feats from the original pandas data frame
    data_frame = data_frame.drop(labels=data_frame.columns[feat_ix_delete],
                                 axis=1)
    # Print info
    n_features_deleted = feat_ix_delete.size
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (np.float(n_features_deleted) / n_features_originally)))
    return data_frame
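# A minimal alternative sketch (not the author's code above, just an equivalent shortcut,
# assuming data_frame holds only numeric columns): the boolean mask from get_support()
# can index the pandas DataFrame directly, so the frame never has to leave pandas.
from sklearn.feature_selection import VarianceThreshold

def remove_constant_columns(data_frame):
    selector = VarianceThreshold()  # default threshold=0.0 keeps only non-constant columns
    selector.fit(data_frame)
    return data_frame.loc[:, selector.get_support()]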
def _variance_threshold(self, input_df, threshold):
    """Uses Scikit-learn's VarianceThreshold feature selection to learn the
    subset of features that pass the threshold

    Parameters
    ----------
    input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
        Input DataFrame to perform feature selection on
    threshold: float
        The variance threshold that removes features that fall under the threshold

    Returns
    -------
    subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']}
        Returns a DataFrame containing the features that are above the variance threshold

    """
    training_features = input_df.loc[input_df['group'] == 'training'].drop(
        ['class', 'group', 'guess'], axis=1)

    selector = VarianceThreshold(threshold=threshold)
    try:
        selector.fit(training_features)
    except ValueError:
        # No features are above the variance threshold
        return input_df[['guess', 'class', 'group']].copy()

    mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group']
    return input_df[mask_cols].copy()
def varianceSelection(self, df, threshold=.8):
    if not isinstance(df, pandas.core.frame.DataFrame):
        logger.error('[%s] : [ERROR] Variance selection only possible on Dataframe not %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(df))
        sys.exit(1)
    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    sel.fit_transform(df)
    return df[[c for (s, c) in zip(sel.get_support(), df.columns.values) if s]]
def test_zero_variance():
    """Test VarianceThreshold with default setting, zero variance."""
    for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]:
        sel = VarianceThreshold().fit(X)
        assert_array_equal([0, 1, 3, 4], sel.get_support(indices=True))

    assert_raises(ValueError, VarianceThreshold().fit, [0, 1, 2, 3])
    assert_raises(ValueError, VarianceThreshold().fit, [[0, 1], [0, 1]])
def variance_threshold(self, dframe=None, columns=None, skip_columns=None,
                       thresh=0.0, autoremove=False):
    """
    Wrapper for sklearn VarianceThreshold for use on a pandas dataframe
    :param dframe:
    :param columns:
    :param skip_columns:
    :param thresh:
    :param autoremove:
    :return:
    """
    logging.debug("Finding low-variance features")
    removed_features = []
    try:
        all_columns = dframe.columns
        # remove the skip columns
        remaining_cols = all_columns.drop(skip_columns)
        # get length of new index.
        max_index = len(remaining_cols) - 1
        skipped_idx = [all_columns.get_loc(column) for column in skip_columns]
        for idx, item in enumerate(skipped_idx):
            if item > max_index:
                diff = item - max_index
                skipped_idx[idx] -= diff
            if item == max_index:
                diff = item - len(skip_columns)
                skipped_idx[idx] -= diff
            if idx == 0:
                skipped_idx[idx] = item
        skipped_values = dframe.iloc[:, skipped_idx].values
        X = dframe.loc[:, remaining_cols].values
        vt = VarianceThreshold(threshold=thresh)
        vt.fit(X)
        feature_indices = vt.get_support(indices=True)
        feature_names = [remaining_cols[idx] for idx, _ in enumerate(remaining_cols)
                         if idx in feature_indices]
        removed_features = list(np.setdiff1d(remaining_cols, feature_names))
        logging.debug("Found %d low-variance columns" % len(removed_features))
    except Exception as e:
        logging.error(e)
        logging.error("Could not remove low variance features, something went wrong")
        print(e)
        pass
    return dframe, removed_features
def test_variance_threshold():
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']
    training_features = training_testing_data.loc[
        training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)

    selector = VarianceThreshold(threshold=0)
    selector.fit(training_features)
    mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

    assert np.array_equal(tpot_obj._variance_threshold(training_testing_data, 0),
                          training_testing_data[mask_cols])
def filter_features(info_humans, info_bots, k=200, scale=False):
    """
    Carry out 2-layer feature filtering
    """
    X, y, features, scaler = get_Xy(info_humans, info_bots, scale=scale)

    vt = VarianceThreshold(threshold=(.8 * (1 - .8)))
    X_new = vt.fit_transform(X)
    features_1 = features[vt.get_support()]

    skb = SelectKBest(chi2, k=min(k, len(features_1)))
    X_new = skb.fit_transform(X_new, y)
    features_2 = features_1[skb.get_support()]

    return features_1, features_2, vt, skb
def feat1(matrix): last_column = [row[len(matrix[0])-1] for row in matrix] data_class = transform_to_int(last_column, matrix[0][len(matrix[0])-1]) indices = list(range(len(matrix[0])-1)) new_list = map(operator.itemgetter(*indices), matrix) data = np.asarray(new_list) data = data.astype(np.float) sel = VarianceThreshold(threshold=(0.35)) matrix_new = sel.fit_transform(data) data_class = np.array([data_class]) features_selected = np.concatenate((matrix_new,data_class.T),axis=1) indices_resultados = sel.get_support(new_list) features = [] for data in indices_resultados: features.append(data) return features
def calCorrMat():
    '''
    Delete variables whose variance is lower than the threshold (10 here),
    then compute the Pearson correlation matrix of the remaining descriptors.
    '''
    df = pd.read_csv('183_descs_3763.csv', header=0, index_col=None)
    sel = VarianceThreshold(10)
    data = sel.fit_transform(df.values)
    aMask = sel.get_support(True)
    newDf = df.iloc[:, aMask]
    print newDf.shape
    raw_input()
    corrMat = newDf.corr(method='pearson')
    corrMat.to_csv('./data/corrNew.csv')
    newDf.to_csv('./data/reducedDescs.csv')
    return corrMat
def removeZeroVariance(data_frame):
    n_features_originally = data_frame.shape[1]
    selector = VarianceThreshold()
    selector.fit(data_frame)
    # Get the indices of zero variance feats
    feat_ix_keep = selector.get_support(indices=True)
    orig_feat_ix = np.arange(data_frame.columns.size)
    feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)
    # Delete zero variance feats from the original pandas data frame
    data_frame = data_frame.drop(labels=data_frame.columns[feat_ix_delete],
                                 axis=1)
    # Print info
    n_features_deleted = feat_ix_delete.size
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (np.float(n_features_deleted) / n_features_originally)))
    return data_frame
def bayes(): recipeData = getRecipeData() sel = VarianceThreshold() ingredients = sorted(set([e for sublist in map(lambda e: e['ingredients'], recipeData) for e in sublist])) labels = [recipe['cuisine'] for recipe in recipeData] features = sel.fit_transform([buildFeaturesArray(ingredients, recipe) for recipe in recipeData]) ingredients = [ingredients[i] for i in sel.get_support(True)] clf = MultinomialNB() clf.fit(features, labels) testRecipes = getTestData() testFeatures = [buildFeaturesArray(ingredients, recipe) for recipe in testRecipes] predictions = clf.predict(testFeatures) outputPercentCorrect(predictions) copyAndOutput(predictions, testRecipes)
def test_variance_k_best_random_tree_k_fold(self): # Feature Selection samples, responses = open_model("models.obj") samples = np.array(samples) responses = np.array(responses) FeatureSelection = True if FeatureSelection: selection = VarianceThreshold(threshold=0.00) selection.fit(samples) idxs = selection.get_support(indices=True) samples = samples[:, idxs] samples = preprocessing.scale(samples) # Stratified cross-validation scv = StratifiedKFold(responses, n_folds=10) sum = 0 for i, (train, test) in enumerate(scv): print('Case %d' % (i)) # Modeling rdmForest = RandomForest_scikit() # Train init = time() rdmForest.train(samples[train, :], responses[train]) # Test a, confusionPre = rdmForest.test(samples[test, :], responses[test], True) print('Time: %0.3fs' % (time() - init)) for idx, fila in enumerate(confusionPre): for jdx, entrada in enumerate(fila): if idx != jdx: sum += entrada print("Wrong Cases: "+str(sum)) print(' Full Case ') rdmForest = RandomForest_scikit() rdmForest.train(samples, responses) rdmForest.test(samples, responses, True)
def main(): # shape (#rows,18) train_users_raw = pd.read_csv('train_users_pruned.csv',delimiter=',',encoding='utf-8') test_users_raw = pd.read_csv('test_users.csv',delimiter=',',encoding='utf-8') del train_users_raw['id'] user_id = test_users_raw['id'] del test_users_raw['id'] train_users_raw=train_users_raw.drop(train_users_raw.columns[[0]], axis=1) test_users_raw=test_users_raw.drop(test_users_raw.columns[[0]], axis=1) country_destination = train_users_raw['country_destination'] del train_users_raw['country_destination'] del train_users_raw['year_booked'] del train_users_raw['month_booked'] del train_users_raw['date_booked'] del test_users_raw['year_booked'] del test_users_raw['month_booked'] del test_users_raw['date_booked'] selector = VarianceThreshold(threshold=2.0) selector.fit(train_users_raw) selected_col_ind = selector.get_support(indices=True) selected_col_ind = np.append(selected_col_ind, train_users_raw.shape[1]-1) #print selected_col_ind #print train_users_raw.columns.values # shape (#rows,11) train_users_downsized = train_users_raw.ix[:,selected_col_ind] train_users_downsized['country_destination'] = country_destination print train_users_downsized.columns.values test_users_downsized = test_users_raw.ix[:,selected_col_ind] test_users_downsized['id'] = user_id print test_users_downsized.columns.values train_users_downsized.to_csv('training_data_processed.csv', sep=',', encoding='utf-8') test_users_downsized.to_csv('testing_data_processed.csv', sep=',', encoding='utf-8')
# Filter complete null columns cols = np.where((np.sum(df_2.isnull(), axis=0).values) == df_2.shape[0])[0] print (cols) filt_cols = [c for c in df_2.columns if c not in df_2.columns[cols]] df_3 = df_2[filt_cols] print ("df_3",df_3.shape) #Fill na df_4 = df_3.fillna(value=np.mean(df_3,axis=0),inplace=False,axis=0).values print ("df_4",df_4.shape) data=df_4 selector = VarianceThreshold(threshold=(.99 * (1 - .99))) newdata=selector.fit_transform(data) idxs = selector.get_support(indices=True) print(data[:, idxs]) print("indices",idxs) columnslist=df_2.columns.tolist() print("lenindex",len(idxs)) for z in range(0,len(columnslist)): if z not in idxs: print(columnslist[z]) print("after",newdata.shape) print("initial",data.shape) print("Headers_FINAL: ", df_2.columns.values.tolist()) # In[27]: for i in range (0,df_2['diag_3'].size):
X_test = bash_testleg_cc.drop('labels', axis=1) y1 = bash_testleg_cc['labels'] #print(y1) print(type(y1)) print(type(y)) X_train_T = y.T y_train = pd.DataFrame(X_train_T) X_test_T = y1.T y_test = pd.DataFrame(X_test_T) #X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state = 0,stratify = y) ##constant feature removall constant_filter = VarianceThreshold(threshold=0) constant_filter.fit(X) print(constant_filter.get_support().sum()) constant_list = [not temp for temp in constant_filter.get_support()] print(constant_list) print(X.columns[constant_list]) X_train_filter = constant_filter.transform(X) X_test_filter = constant_filter.transform(X_test) print(X_train_filter.shape) print(X_test_filter.shape) print(X.shape) ##Quasi constant feature removal quasi_constant_filter = VarianceThreshold(threshold=0.01) quasi_constant_filter.fit(X_train_filter) print(quasi_constant_filter.get_support().sum()) X_train_quasi_filter = quasi_constant_filter.transform(X_train_filter)
# If you want to remove the 2 very low variance features, what would be a
# good variance threshold?
# A threshold of 1.0e-03 (0.001) will remove the two low variance features.

## Features with low variance
from sklearn.feature_selection import VarianceThreshold

# Create a VarianceThreshold feature selector
sel = VarianceThreshold(threshold=0.001)

# Fit the selector to normalized head_df
sel.fit(head_df / head_df.mean())

# Create a boolean mask
mask = sel.get_support()

# Apply the mask to create a reduced dataframe
reduced_df = head_df.loc[:, mask]

print("Dimensionality reduced from {} to {}.".format(head_df.shape[1], reduced_df.shape[1]))
# Dimensionality reduced from 6 to 4

## Removing features with many missing values
school_df.isna().sum() / len(school_df)

# Create a boolean mask on whether each feature has less than 50% missing values.
mask = school_df.isna().sum() / len(school_df) < 0.5
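# Hypothetical continuation (not part of the original exercise): apply the missing-value
# mask built above the same way the variance mask was applied, keeping only the columns
# of school_df with less than 50% missing values.
reduced_school_df = school_df.loc[:, mask]
print("Kept {} of {} columns.".format(reduced_school_df.shape[1], school_df.shape[1]))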
# GSE55145_exprs_B = pd.DataFrame.transpose(GSE55145_exprs_B) # GSE9782_exprs_B = pd.DataFrame.transpose(GSE9782_exprs_B) GDSC_exprs_z = pd.DataFrame.transpose(GDSC_exprs_z) GSE1_exprs_z = pd.DataFrame.transpose(GSE1_exprs_z) GSE2_exprs_z = pd.DataFrame.transpose(GSE2_exprs_z) GSE3_exprs_z = pd.DataFrame.transpose(GSE3_exprs_z) GSE4_exprs_z = pd.DataFrame.transpose(GSE4_exprs_z) TCGA_exprs_z = pd.DataFrame.transpose(TCGA_exprs_z) # # Remove genes with low signal (i.e. below the variance threshold) from expression data selector = VarianceThreshold(0.05) selector.fit_transform(GDSC_exprs_z) GDSC_exprs_z = GDSC_exprs_z[GDSC_exprs_z.columns[selector.get_support( indices=True)]] ls = GSE1_exprs_z.columns.intersection(GDSC_exprs_z.columns) ls = ls.intersection(GSE2_exprs_z.columns) ls = ls.intersection(GSE3_exprs_z.columns) ls = ls.intersection(GSE4_exprs_z.columns) ls = ls.intersection(TCGA_exprs_z.columns) GSE1_exprs_z = GSE1_exprs_z.loc[:, ls] GSE2_exprs_z = GSE2_exprs_z.loc[:, ls] GSE3_exprs_z = GSE3_exprs_z.loc[:, ls] GSE4_exprs_z = GSE4_exprs_z.loc[:, ls] TCGA_exprs_z = TCGA_exprs_z.loc[:, ls] # Obtain selected genes GDSC_exprs_z_genes = list(GDSC_exprs_z.columns.values) GSE1_exprs_z_genes = list(GSE1_exprs_z.columns.values) GSE2_exprs_z_genes = list(GSE2_exprs_z.columns.values)
def featureSelectionVarianceThreshold(data, probability=0.8):
    dataRaw = data[:, 2:]
    sel = VarianceThreshold(threshold=(probability * (1 - probability)))
    dataNew = sel.fit_transform(dataRaw)
    fd = open('History.txt', 'a')
    history = ('Feature Selection: Variance Threshold' + '\n' +
               'Selected Feature: ' + str(sel.get_support(True)) + '\n')
    fd.write(history)
    fd.close()
    return np.c_[data[:, :2], dataNew]
sel = VarianceThreshold(threshold=(theIndex* (1 - theIndex))) data = np.loadtxt("train.nmv.txt") firstData = data.copy() tagUno = [ row[-1] for row in data] tagUno = np.array([tagUno]) #arr = np.concatenate( (arr , for_arr.T ), axis =1) ''' Idea : -Save class labels which will be used after the fit_transform thing cuts the poor variance labels down ''' data = sel.fit_transform(data) data = np.concatenate( (data , tagUno.T ), axis =1) guillotine = sel.get_support() prelimData = np.genfromtxt("prelim-nmv-noclass.txt") prelimData = [i[:-1] for i in prelimData] prelimData = np.array(prelimData) #guillotine = guillotine[:-1] #assert len(guillotine) == len(prelimData[:-1]) guillotine_full = guillotine.copy() guillotine = guillotine[:-1] ''' #assigning this to prelimData verifies that we collapse the #preliminary test set correctly prelimData = firstData.copy()[:-1] '''
def train_test(X_train, X_test):
    try:
        vs_constant = VarianceThreshold(threshold=0)

        # select the numerical columns only.
        numerical_x_train = X_train[X_train.select_dtypes([np.number]).columns]

        # fit the object to our data.
        vs_constant.fit(numerical_x_train)

        # get the constant column names.
        constant_columns = [column for column in numerical_x_train.columns
                            if column not in numerical_x_train.columns[vs_constant.get_support()]]

        # detect constant categorical variables.
        constant_cat_columns = [column for column in X_train.columns
                                if (X_train[column].dtype == "O" and len(X_train[column].unique()) == 1)]

        all_constant_columns = constant_cat_columns + constant_columns
        X_train.drop(labels=all_constant_columns, axis=1, inplace=True)
        X_test.drop(labels=all_constant_columns, axis=1, inplace=True)
        print(X_train.shape)

        ####### Quasi-Constant Features
        # threshold value for quasi constant.
        threshold = 0.98
        # create empty list
        quasi_constant_feature = []
        # loop over all the columns
        for feature in X_train.columns:
            # calculate the ratio of the predominant value.
            predominant = (X_train[feature].value_counts() /
                           np.float(len(X_train))).sort_values(ascending=False).values[0]
            # append the column name if it is bigger than the threshold
            if predominant >= threshold:
                quasi_constant_feature.append(feature)
        X_train.drop(labels=quasi_constant_feature, axis=1, inplace=True)
        X_test.drop(labels=quasi_constant_feature, axis=1, inplace=True)
        print(X_train.shape)

        ####### Duplicated Features
        # transpose the feature matrix
        train_features_T = X_train.T
        # select the duplicated feature column names
        duplicated_columns = train_features_T[train_features_T.duplicated()].index.values
        # drop those columns
        X_train.drop(labels=duplicated_columns, axis=1, inplace=True)
        X_test.drop(labels=duplicated_columns, axis=1, inplace=True)
        print(X_train.shape)

        ####### Correlation Filter Methods
        correlated_features = set()
        correlation_matrix = X_train.corr()
        for i in range(len(correlation_matrix.columns)):
            for j in range(i):
                if abs(correlation_matrix.iloc[i, j]) > 0.8:
                    colname = correlation_matrix.columns[i]
                    correlated_features.add(colname)
        X_train.drop(labels=correlated_features, axis=1, inplace=True)
        X_test.drop(labels=correlated_features, axis=1, inplace=True)
        print(X_train.shape)

        return X_train, X_test
    except Exception as e:
        print('QC did not complete successfully:', e)
Xs = Xs.fillna(Xs.mean())

# rescale the Xs so that PCA and the other algorithms work
scaler = StandardScaler()
scaler.fit(Xs)
Xs_res = scaler.transform(Xs)
Xs_res = pd.DataFrame(data=Xs_res, index=Xs.index, columns=Xs.columns)

# variable selection using a minimum-variance threshold
cov = Xs_res.cov()
correls = Xs.corr()
sel = VarianceThreshold(threshold=0.01)
filtered1_Xs = sel.fit_transform(Xs)
filtered1_Xs = pd.DataFrame(data=filtered1_Xs, index=Xs.index, columns=Xs.columns)
sel_cols1 = sel.get_support(indices=True).T

# variable reduction using PCA
pca = PCA(n_components=17)
pca.fit(Xs)
evals = pca.explained_variance_           # corresponds to the eigenvalues
var_expl = pca.explained_variance_ratio_  # variance explained by each principal component
evecs = pca.components_.T                 # corresponds to the eigenvectors
loadings = evecs * np.sqrt(evals)
loadings_filt = np.where(np.abs(loadings) > 0.3, loadings, float('nan'))
loadings_filt = pd.DataFrame(data=loadings_filt, index=Xs.columns)

# repeat the operation, now with the n factors that explain the desired % of total variance
dataset[min_max_attributes]) minmaxScaling(X_train) minmaxScaling(X_validation) minmaxScaling(X_test) # ------------------------------------------------------------------- # ------------------------ E: Feature Selection --------------------- # ------------------------------------------------------------------- # Being done only on the train set to determine the features to select # Filter method = Variance Threshold filter = VarianceThreshold(threshold=0.2) filter.fit_transform(X_train) print(filter.get_support(indices=True)) # Wrapper method = SFS knn = KNeighborsClassifier(n_neighbors=3) sfs = SFS(knn, k_features=30, forward=True, floating=False, verbose=2, scoring='accuracy', cv=0) sfs = sfs.fit(X_train, Y_train) print(sfs.k_feature_idx_) # ------------------------------------------------------------------- # ------------------------ 5: Saving the prepared data --------------
poke_gen = pd.get_dummies(df['Generation']) poke_gen.head() # In[4]: from sklearn.feature_selection import VarianceThreshold vt = VarianceThreshold(threshold=.15) vt.fit(poke_gen) # In[5]: pd.DataFrame({'variance': vt.variances_, 'select_feature': vt.get_support()}, index=poke_gen.columns).T # In[6]: poke_gen_subset = poke_gen.iloc[:,vt.get_support()].head() poke_gen_subset # # Statistical Methods # In[7]: from sklearn.datasets import load_breast_cancer
def main(): set_option('display.width', 2000) pd.set_option("display.max_rows", 500, "display.max_columns", 2000) set_option('precision', 3) pd.options.mode.chained_assignment = None input_file = './raw data_edit/data_fd.csv' data_input_ori = pd.read_csv(input_file) data_input_ori = data_input_ori.drop(columns=['Subject']) # Create correlation heatmap cols = data_input_ori.keys() cols_edit = cols[1:] corr = data_input_ori[cols_edit].corr() mask = np.zeros_like(corr) mask[np.triu_indices_from(mask)] = True thd = 0.35 corr_np = corr.values corr_edit = np.zeros_like(corr) corr_pairs = [] for i in range(corr_np.shape[0]): for j in range(corr_np.shape[1]): if j > i: if corr_np[i, j] >= thd or corr_np[i, j] <= -thd: corr_edit[i, j] = corr_np[i, j] tmp1 = cols_edit[i] tmp2 = cols_edit[j] tmp = [tmp1, tmp2, corr_np[i, j]] corr_pairs.append(tmp) print('Feature pairs with a high correlation (|cc| >=0.35):', corr_pairs) plt.figure(1) sns.heatmap(corr, annot=False, vmin=-1, vmax=1, xticklabels=1, yticklabels=1, mask=mask, cmap='seismic') data_input_ori = data_input_ori.drop( columns=['INDEPEND', 'TOBAC100', 'TOBAC30']) keys = data_input_ori.keys() # Remove the features with low variance var = 0.10 sel = VarianceThreshold(threshold=var) data_edit = sel.fit_transform(data_input_ori) indices = sel.get_support(indices=True) keys0 = [] # The features with low variance for i in range(len(keys)): if i not in indices: keys0.append(keys[i]) print('The features removed due to low variance', keys0) keys1 = keys[indices] # Exclude the 'sumbox' keys1 = keys1[2:] features = data_edit[:, 2:] label = data_edit[:, 1] N1 = 24 fv, pv = f_regression(features, label) indices3 = np.argsort(pv) print('keys3 without removing feature', keys1[indices3]) indices3c = indices3[0:N1] print('Features removed due to the high p-value', keys1[indices3[N1:]]) features_new = features[:, indices3c] x_axis = np.linspace(1, len(pv), len(pv)) pv = sorted(pv) # Merge the data together label_norm = np.reshape(label, (len(label), 1)) # Make sure the dimension is the same for the data sets data_edit = np.concatenate((features_new, label_norm), axis=1) np.save('./raw data_edit/data_ml', data_edit) font = {'size': 16} plt.rc('font', **font) plt.figure(2) plt.scatter(x_axis, pv, color='black') plt.xlabel("Feature") plt.ylabel("P-value") plt.show()
data.shape [col for col in data.columns if data[col].isnull().sum() > 0] x_train, x_test, y_train, y_test = train_test_split(data.drop( labels=["TARGET"], axis=1), data["TARGET"], test_size=0.3, random_state=0) x_train.shape x_test.shape #variance Threshhold sel = VarianceThreshold(threshold=0) sel.fit(x_train) sum(sel.get_support()) #another way len(x_train.columns[sel.get_support()]) print( len([ x for x in x_train.columns if x not in x_train.columns[sel.get_support()] ])) [x for x in x_train.columns if x not in x_train.columns[sel.get_support()]] x_train['ind_var2_0'].unique() x_train = sel.transform(x_train) x_test = sel.transform(x_test)
def varianceSelection(X, THRESHOLD=10):
    from sklearn.feature_selection import VarianceThreshold
    sel = VarianceThreshold(threshold=THRESHOLD)
    sel.fit_transform(X)
    return X[[c for (s, c) in zip(sel.get_support(), X.columns.values) if s]]
from sklearn.feature_selection import VarianceThreshold import dataframe debug = True # x, y = dataframe.get_dataset_from_file('corrected') # # print 'Dataset contains %d instances with %d initial features.' % (len(y), len(x[0])) # threshold = 0 threshold_increment = 0.01 def get_transformed_matrix_with_threshold(x, y, threshold): sel = VarianceThreshold(threshold) return sel.fit_transform(x, y) if not debug: while threshold <= 1.0: x, y = dataframe.df_data, dataframe.df_target selector = VarianceThreshold(threshold) result = selector.fit_transform(x, y) print 'Threshold = %f, Features remained after fit_transform %d' % (threshold, len(result[0])) threshold += threshold_increment print selector.get_support(indices=True)
X_test.drop(labels=constant_features, axis=1, inplace=True) X_train.shape, X_test.shape # # remove quasi-constant features # In[7]: sel = VarianceThreshold( threshold=0.01) # 0.1 indicates 99% of observations approximately sel.fit(X_train) # fit finds the features with low variance sum(sel.get_support()) # how many not quasi-constant? # In[8]: features_to_keep = X_train.columns[sel.get_support()] # In[9]: X_train = sel.transform(X_train) X_test = sel.transform(X_test) X_train.shape, X_test.shape
Y.append(1 if d==0 else 0) return normalize(np.array(X),norm='l2'),Y def sample_data(X, Y, value=0): XX=[] for i in xrange(len(Y)): if Y[i]==value: XX.append(X[i]) return XX out=open(sys.argv[1],"r") model=svm.OneClassSVM(kernel='rbf') X, Y = read_fea(sys.argv[1]) sel = VarianceThreshold(threshold=0) model.fit(sample_data(sel.fit_transform(X),Y, 1)) warning("useful features dim: "+str(len(sel.get_support(True)))) if hasattr(model,'score'): warning("accuracy on training set: "+str(model.score(sel.transform(X), Y))) if len(sys.argv)>2: X, Y = read_fea(sys.argv[2]) warning("accuracy on cv set: "+str(model.score(sel.transform(X), Y))) if len(sys.argv)>3: X, Y = read_fea(sys.argv[3]) warning("accuracy on dev set: "+str(model.score(sel.transform(X), Y))) if len(sys.argv)>4: ref = model.decision_function(sel.transform(X)) X, Y = read_fea(sys.argv[4], True) Z = model.decision_function(sel.transform(X)).tolist() Z = (Z-ref.mean())/ref.std()
from sklearn.feature_selection import VarianceThreshold sel = VarianceThreshold(threshold=0.01) sel.fit(x_train) # In[39]: ### if we sum over get_support, we get the number of features that are not constant # In[178]: sum(sel.get_support()) # In[179]: x_train = sel.transform(x_train) test = sel.transform(test) # In[180]: test.shape
from sklearn.feature_selection import VarianceThreshold from numpy import genfromtxt, savetxt dataset = genfromtxt(open('/Users/larryhan/Dropbox/SML Project2/code/Part B/data_4/classification.csv','r'), delimiter=',', dtype='f3')[1:] title = genfromtxt(open('/Users/larryhan/Dropbox/SML Project2/code/Part B/data_4/classification.csv','r'), delimiter=',',dtype="S5")[0] target = [x[418] for x in dataset] train = [x[0:418] for x in dataset] sel = VarianceThreshold(0.9*(1-0.9)) sel.fit_transform(train) support = sel.get_support() # for i in range(len(support)): # if support[i]: # print(title[i]) sub_title = [] for i in range(len(support)): if support[i]: sub_title.append(title[i])
def main(): df = joblib.load('modelDataset.pkl') # Split dataframe into features and target y = df.iloc[:, 1] # .as_matrix() X = df.iloc[:, 2:] # .as_matrix() id = df.iloc[:, 0] # Scalings sc = StandardScaler() # Apply scaler colNames = X.columns X = sc.fit_transform(X) X = pd.DataFrame(X, columns=colNames) # Remove features with less than 20% variance colNames = X.columns sel = VarianceThreshold(threshold=0.16) X = sel.fit_transform(X) # Get column names back newCols = [] for remain, col in zip(sel.get_support(), colNames): if remain == True: newCols.append(col) X = pd.DataFrame(X, columns=newCols) # Perform univariate feature selection (ANOVA F-values) colNames = X.columns selection_Percent = SelectPercentile(percentile=5) X = selection_Percent.fit_transform(X, y) # Get column names back newCols = [] for remain, col in zip(selection_Percent.get_support(), colNames): if remain == True: newCols.append(col) X = pd.DataFrame(X, columns=newCols) # Perform tree-based feature selection clf = ExtraTreesRegressor() clf = clf.fit(X, y) colNames = X.columns sel = SelectFromModel(clf, prefit=True) X = sel.transform(X) newCols = [] for remain, col in zip(sel.get_support(), colNames): if remain == True: newCols.append(col) X = pd.DataFrame(X, columns=newCols) # Split train/test X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1555) def testRegressor(clf): ''' #RF grid param_grid = [{'n_estimators': range(320, 350, 10), 'min_samples_split': range(2, 20, 2), 'min_samples_leaf': range(2, 20, 2), 'max_leaf_nodes': range(140, 170, 5) }] grid = GridSearchCV(clf, param_grid, cv=3, verbose=1, n_jobs=-1) fitted_classifier = grid.fit(X_train, y_train) print(grid.best_score_, grid.best_params_) predictions = fitted_classifier.predict(X_train)''' ''' #XGB tuning - concept, not in use param_grid = [{'max_depth': range(2, 4, 1), 'min_child_weight': range(3, 6, 1), 'n_estimators': range(80, 110, 10), 'learning_rate': [0.1], 'gamma': [0], 'subsample': [0.9, 1], 'colsample_bytree': [0.7], 'reg_alpha': [15, 50, 100, 150, 200], 'reg_lambda': [15, 20, 25, 30, 40, 50]}] fit_params = {"early_stopping_rounds": 8, "eval_metric": "mae", "eval_set": [[X_test, y_test]], "verbose": False} grid = GridSearchCV(clf, param_grid, fit_params=fit_params, cv=3, verbose=1, n_jobs=-1) fitted_classifier = grid.fit(X_train, y_train) print(grid.best_score_, grid.best_params_) predictions = fitted_classifier.predict(X_train) ''' fitted = clf.fit(X_train, y_train) scoresCV = cross_val_score(clf, X_train, y_train, cv=3, verbose=0, n_jobs=-1) trainPredictionsCV = cross_val_predict(clf, X_train, y_train, cv=3, verbose=0, n_jobs=-1) trainPredictions = clf.predict(X_train) testPredictions = clf.predict(X_test) score1 = metrics.explained_variance_score(y_test.values, testPredictions) score2 = metrics.mean_absolute_error(y_test.values, testPredictions) score3 = metrics.mean_squared_error(y_test.values, testPredictions) score4 = metrics.r2_score(y_test.values, testPredictions) print('Train score: ', metrics.mean_absolute_error(y_train.values, trainPredictions)) print('CV score: ', scoresCV) print('Explained Variance Score, MAE, MSE, R^2') print(score1, score2, score3, score4) tempIndex = range(0, len(y_test.values), 1) plt.scatter(tempIndex, y_test.values, color='black', s=20, alpha=0.8) plt.scatter(tempIndex, testPredictions, color='red', s=20, alpha=0.4) plt.show() #Results appear to be highly interesting #MSE (and thus penalising large errors more) suggests that the model does not deal well with #particular 
categories of retweets where there is a significant difference between true value and predicted #Data appears to have high bias in terms of selection, as if tweets were selected from specific pools #based on retweet value #While the random forest deals well with those particular types of tweets, more analysis is needed # Further steps would start by understanding the sampling procedure that produced these tweets # From there, features need to be relooked at, dimensionality reduction (such as PCA) might be needed # Simpler / more powerful models to then be appropriately applied #The target retweets actually seem to be created from a Decision Tree Model print('x') lr = LinearRegression() dt = DecisionTreeRegressor() rf = RandomForestRegressor() gb = xgboost.XGBRegressor() #print('LR') #testRegressor(lr) #print('DT') #testRegressor(dt) print('RF') testRegressor(dt)
class SemiSupervisedFeatureSelection(FeatureSelection): def __init__(self, conf): FeatureSelection.__init__(self, conf) def setBestParameters(self, instances): return def getFittingInstances(self, instances): return instances.getLabeledInstances() # Remove instances those family is too rare (num_instances < k = 3) def generateInputLabels(self, instances): if self.conf.families_supervision: families_count = instances.getFamiliesCount() drop_ids = [] for family, count in families_count.iteritems(): if count < 3: drop_ids += instances.getFamilyIds(family) selected_ids = [i for i in instances.getIds() if i not in drop_ids] selected_instances = instances.getInstancesFromIds(selected_ids) labels = selected_instances.families else: selected_instances = instances labels = selected_instances.labels ## String labels are transformed into integer labels (0 -> num_labels-1). ## This format is required blabels the library metric-learn. labels_values = list(set(labels)) if len(labels_values) < 2: raise FewerThanTwoLabels() labels = np.array([labels_values.index(x) for x in labels]) return labels, selected_instances def generateInputParameters(self, instances): fitting_instances = self.getFittingInstances(instances) labels, fitting_instances = self.generateInputLabels(fitting_instances) features = self.featuresPreprocessing(fitting_instances) return features, labels def fit(self, instances): features, labels = self.generateInputParameters(instances) self.setBestParameters(instances) self.createPipeline() self.pipeline.fit(features, labels) self.setProjectionMatrix() def createPipeline(self): # Remove features with null variance self.var_filter = VarianceThreshold() self.pipeline = Pipeline([('var_filter', self.var_filter), ('projection', self.projection)]) def getSelectedFeatures(self, features_names): non_constant_features = np.array(features_names)[ self.var_filter.get_support()] selected_features = list( non_constant_features[self.projection.get_support()]) return selected_features ## The name of the selected features. def componentLabels(self, features_names): return self.getSelectedFeatures(features_names)
# fature selection # ============================================================================= col = ['formation_energy_ev_natom', 'bandgap_energy_ev'] X = train1.drop(['id'] + col, axis=1) T = test1.drop(['id'] + col, axis=1) y = np.log(train1[col] + 1) plt.hist(y[col[0]]) plt.hist(y[col[1]], color='r') selector = VarianceThreshold(threshold=0) selector.fit(X) # Fit to train without id and target variables f = np.vectorize(lambda x: not x) # Function to toggle boolean array elements v = X.columns[f(selector.get_support())] print('{} variables have too low variance.'.format(len(v))) print('These variables are {}'.format(list(v))) selected_feat = X.columns.drop(v) #update X = X[selected_feat] T = T[selected_feat] # RFE Recursive feature elimination rf = RandomForestRegressor(n_estimators=500, random_state=seed) selector = RFECV(rf, cv=3, step=5) y = np.log(train1[col[0]] + 1) # formation ev selector = selector.fit(X, y) selector.support_
'XRP' : ['XRP', 'Ripple'], 'ZEC' : ['ZEC', 'ZCash'], 'ZRX' : ['ZRX', '0x'] } if __name__ == '__main__': index = {} df = pd.read_csv('data/google_trends/trends_all_20101112000000_20190101000000.csv', sep=',', encoding='utf-8', index_col='date', parse_dates=True) for sym, columns in COLUMNS.items(): _df = df.loc[:, columns] _df.columns = ['gtrends_{}_{}'.format(sym, c.lower()) for c in _df.columns] _df = _df.drop_duplicates().resample('D').mean().fillna(method='ffill') sel = VarianceThreshold() sel.fit(_df.values) sel_columns = [c for c, s in zip(_df.columns, sel.get_support()) if s] _df = _df.loc[:, sel_columns] print("{}: {} Features, {} Selected".format(sym, len(columns), len(sel_columns))) os.makedirs('data/preprocessed/google_trends/csv/', exist_ok=True) os.makedirs('data/preprocessed/google_trends/excel/', exist_ok=True) csv_path = 'data/preprocessed/google_trends/csv/{}.csv'.format(sym.lower()) xls_path = 'data/preprocessed/google_trends/excel/{}.xlsx'.format(sym.lower()) _df.to_csv(csv_path, sep=',', encoding='utf-8', index=True, index_label='Date') _df.to_excel(xls_path, index=True, index_label='Date') index[sym] = {'csv':csv_path, 'xls':xls_path} print('Saved {} in data/preprocessed/google_trends/'.format(sym)) with open('data/preprocessed/google_trends/index.json', 'w') as f: json.dump(index, f, sort_keys=True, indent=4)
def sample_data(X, Y, value=0): XX = [] for i in xrange(len(Y)): if Y[i] == value: XX.append(X[i]) return XX out = open(sys.argv[1], "r") model = svm.OneClassSVM(kernel='rbf') X, Y = read_fea(sys.argv[1]) sel = VarianceThreshold(threshold=0) model.fit(sample_data(sel.fit_transform(X), Y, 1)) warning("useful features dim: " + str(len(sel.get_support(True)))) if hasattr(model, 'score'): warning("accuracy on training set: " + str(model.score(sel.transform(X), Y))) if len(sys.argv) > 2: X, Y = read_fea(sys.argv[2]) warning("accuracy on cv set: " + str(model.score(sel.transform(X), Y))) if len(sys.argv) > 3: X, Y = read_fea(sys.argv[3]) warning("accuracy on dev set: " + str(model.score(sel.transform(X), Y))) if len(sys.argv) > 4: ref = model.decision_function(sel.transform(X)) X, Y = read_fea(sys.argv[4], True)
GDSCM = pd.read_csv("GDSC_mutations.Erlotinib.tsv", sep="\t", index_col=0, decimal=",") GDSCM = pd.DataFrame.transpose(GDSCM) GDSCC = pd.read_csv("GDSC_CNA.Erlotinib.tsv", sep="\t", index_col=0, decimal=",") GDSCC.drop_duplicates(keep='last') GDSCC = pd.DataFrame.transpose(GDSCC) selector = VarianceThreshold(0.05) selector.fit_transform(GDSCE) GDSCE = GDSCE[GDSCE.columns[selector.get_support(indices=True)]] ls = GDSCE.columns.intersection(GDSCM.columns) ls = ls.intersection(GDSCC.columns) ls = ls.intersection(PDXE.columns) ls = ls.intersection(PDXM.columns) ls = ls.intersection(PDXC.columns) ls2 = GDSCE.index.intersection(GDSCM.index) ls2 = ls2.intersection(GDSCC.index) ls3 = PDXE.index.intersection(PDXM.index) ls3 = ls3.intersection(PDXC.index) ls = pd.unique(ls) PDXE = PDXE.loc[ls3, ls] PDXM = PDXM.loc[ls3, ls] PDXC = PDXC.loc[ls3, ls]
# 2) remove some of the recordings and do it a few times (so manually k-folding),
#    because that way if the same features are removed each time, we know those
#    really are the features that are not helpful.
xtrain_aud = sio.loadmat('xtrain_all_aud.mat')
xtrain_aud = xtrain_aud['xtrain']
ytrain_aud = sio.loadmat('ytrain_all_aud.mat')
ytrain_aud = ytrain_aud['ytrain']

# method 1: variance threshold
# Without any parameters passed to VarThresh it defaults to removing only the
# features whose values are all exactly the same; starting with .5 here.
Var_selector = VarThresh(.5)
Var_selector.fit(xtrain_aud)
which_feats = Var_selector.get_support()
x_aud_fitted = Var_selector.transform(xtrain_aud)

print x_aud_fitted.shape

xtrunclength = sio.loadmat('xtrunclength.mat')
xtrunclength = xtrunclength['xtrunclength']
xtesting = sio.loadmat('xtesting.mat')
xtesting = xtesting['xtesting']
xtesting = xtesting[~np.isnan(xtesting).any(axis=1), :]
xtesting = xtesting[~np.isinf(xtesting).any(axis=1), :]

from CurrentThingsNeededtoRun import FinalClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

# Keep the DataFrame in its own variable so it does not shadow the pandas alias.
df = pd.read_csv("train_label.csv")

constant_filter = VarianceThreshold(threshold=0.05)
constant_filter.fit(df)
len(df.columns[constant_filter.get_support()])

constant_columns = [column for column in df.columns
                    if column not in df.columns[constant_filter.get_support()]]

df.drop(labels=constant_columns, axis=1, inplace=True)
print(constant_columns)
df.to_csv("train_label_p.csv", index=False)
df = file[columnList] df.dropna(axis=1, how='all', thresh=40051, inplace=True) #drop columns which only contain 1 unique values. (variance = 0) df = df[[col for col in df if not df[col].nunique() == 1]] df.to_csv('./cleanData.csv', index=False) #drop columns with low variance import pandas from sklearn.feature_selection import VarianceThreshold sel = VarianceThreshold(threshold=1) sel.fit_transform(data) m = 0 for i in sel.get_support(True): print(data.columns[i]) m += 1 print(m) #use selectFromModel data = pandas.read_csv( '/Users/sherry/Desktop/python/energyProject/cleanData.csv', quotechar="'") data = data.astype('float64', inplace=True) data.dropna(inplace=True) feature = data[[col for col in data.columns if col != "GASAMT"]] lrModel = LinearRegression() selectFromModel = SelectFromModel(lrModel) selectFromModel.fit_transform(feature, data['GASAMT'])
newdf_test.drop('protocol_type', axis=1, inplace=True) newdf_test.drop('service', axis=1, inplace=True) print(newdf_test['label'].value_counts()) X_Probe = newdf.drop('label', 1) Y_Probe = newdf.label X_Probe_test = newdf_test.drop('label', 1) Y_Probe_test = newdf_test.label colNames = list(X_Probe) from sklearn.feature_selection import VarianceThreshold variance_threshold = VarianceThreshold() variance_threshold.fit(X_Probe) true = variance_threshold.get_support() varcolindex_Probe = [i for i, x in enumerate(true) if x] varcolname_Probe = list(colNames[i] for i in varcolindex_Probe) print('Features selected :', varcolname_Probe) features = newdf[varcolname_Probe].astype(float) features1 = newdf_test[varcolname_Probe].astype(float) lab = newdf['label'] lab1 = newdf_test['label'] from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(random_state=0) t0 = time() clf.fit(features, lab) tt = time() - t0 print("Classifier trained in {} seconds.".format(round(tt, 3)))
def variance_threshold_selector(data, threshold=0.5):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]
for i in normalize.drop(['oferta_id', 'target', 'CONTROL'], axis=1).columns.values.tolist(): normalize[i] = normalize[i].map(float) normalize[i] = StandardScaler().fit_transform( normalize[i].values.reshape(-1, 1)) normal = normalize[normalize['CONTROL'] == 0] anormal = normalize[normalize['CONTROL'] == 1] del normal['CONTROL'] del anormal['CONTROL'] # VARIANCE REDUCTION selection = VarianceThreshold(threshold=0.0) selection.fit(normal.drop(['oferta_id', 'target'], axis=1)) features = selection.get_support(indices=True) features = list(normal.columns[features]) + ['oferta_id', 'target'] normal = normal[features] test_anormal = anormal[features] train, valid, _, _ = train_test_split(normal, normal, test_size=0.30, random_state=42) valid, test_normal, _, _ = train_test_split(valid, valid, test_size=len(anormal.index), random_state=42) valid = valid.drop(['oferta_id', 'target'], axis=1)
# ## Feature selection

# ### Removing features with low or zero variance

# Personally, I prefer to let the classifier algorithm choose which features to keep. But there is
# one thing that we can do ourselves: removing features with no or a very low variance. Sklearn has
# a handy method to do that: **VarianceThreshold**. By default it removes features with zero
# variance. This will not be applicable for this competition, as we saw there are no zero-variance
# variables in the previous steps. But if we were to remove features with less than 1% variance, we
# would remove 31 variables.

# In[ ]:

selector = VarianceThreshold(threshold=.01)
selector.fit(train.drop(['id', 'target'], axis=1))  # Fit to train without id and target variables

f = np.vectorize(lambda x: not x)  # Function to toggle boolean array elements

v = train.drop(['id', 'target'], axis=1).columns[f(selector.get_support())]
print('{} variables have too low variance.'.format(len(v)))
print('These variables are {}'.format(list(v)))

# We would lose rather many variables if we selected based on variance. But because we do not have
# so many variables, we'll let the classifier choose. For data sets with many more variables this
# could reduce the processing time.
#
# Sklearn also comes with other [feature selection methods](http://scikit-learn.org/stable/modules/feature_selection.html).
# One of these methods is *SelectFromModel*, in which you let another classifier select the best
# features and continue with these. Below I'll show you how to do that with a Random Forest.

# ### Selecting features with a Random Forest and SelectFromModel

# Here we'll base feature selection on the feature importances of a random forest. With Sklearn's
# SelectFromModel you can then specify how many variables you want to keep. You can set a threshold
# on the level of feature importance manually. But we'll simply select the top 50% best variables.
#
# > The code in the cell below is borrowed from the [GitHub repo of Sebastian Raschka](https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch04/ch04.ipynb).
# > This repo contains code samples of his book *Python Machine Learning*, which is an absolute
# > must-read.

# In[ ]:

X_train = train.drop(['id', 'target'], axis=1)
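# The original notebook cell with the Random Forest is not included in this excerpt. As a rough
# sketch of the approach described above (illustrative code, not the author's), keeping roughly the
# top 50% of features by random-forest importance with SelectFromModel could look like this;
# y_train is assumed to hold the target column.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

rf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)

# threshold='median' keeps the features whose importance is above the median,
# i.e. approximately the top 50%.
sfm = SelectFromModel(rf, threshold='median', prefit=True)
X_train_selected = X_train.loc[:, sfm.get_support()]
print('Kept {} of {} features.'.format(X_train_selected.shape[1], X_train.shape[1]))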
total = total + float(tp+tn)/(tp+tn+fp+fn)*100 return total/len(labels) # train_text,train_classfi_number,train_classfi,train_feature_name = getTargetData("Breast_train.data") # test_text,test_classfi_number,test_classfi,test_feature_name = getTargetData("Breast_test.data") # for i in range(len(train_text)): # for j in range(len(train_text[0])): # train_text[i][j] = float(train_text[i][j]) # print type(train_text[i][j] ) # selector = VarianceThreshold() # data = selector.fit_transform(train_text) # index = selector.get_support(True) # train = data # test = [] # df = pd.DataFrame(test_text) # for line in index: # test.append(df[line]) X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]] selector = VarianceThreshold() selector.fit_transform(X) print selector.get_support() # clf = DecisionTreeClassifier(max_depth=4) # clf = SVC(kernel='rbf', probability=True) # clf.fit(data, train_classfi) # result = clf.predict(test_text)
def variance(self, X, threshold):
    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    sel_var = sel.fit_transform(X)
    X = X[X.columns[sel.get_support(indices=True)]]
    return X
def preprocess(self): print 'Preprocess...' print 'Start: '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') data = self.data.copy() label = self.label.copy() m = data.shape[0] print data['MarriageStatus'].dtype #fillna for i in data.columns: if i!='AppId' and i!='InstallmentStartedOn': if data[i].hasnans: t0=pd.DataFrame(np.ones((data.shape[0],1),dtype=np.int),columns=[i+'_Ex'],index=data.index) ind0=data[data[i].isnull()].index t0.ix[ind0]=0 data[i+'_Ex']=t0 if data[i].dtype==np.object: if data[i].value_counts().sort_values().shape[0]>0: data[i].fillna(data[i].value_counts().sort_values().index[-1],inplace=True,downcast='infer') else: data[i].fillna('0',inplace=True,downcast='infer') else: if np.isnan(data[i].mean())==False: data[i].fillna(data[i].mean(),inplace=True,downcast='infer') else: data[i].fillna(0,inplace=True,downcast='infer') train,train_label,test,test_label=self.split(data,label) self.raw_train=train.copy() self.raw_train_label=train_label.copy() self.raw_test=test.copy() self.raw_test_label=test_label.copy() #delete AppId and InstallmentStartedOn data.drop(['AppId','InstallmentStartedOn'],axis=1,inplace=True) train.drop(['AppId','InstallmentStartedOn'],axis=1,inplace=True) test.drop(['AppId','InstallmentStartedOn'],axis=1,inplace=True) data.reset_index(inplace=True,drop=True) train.reset_index(inplace=True,drop=True) test.reset_index(inplace=True,drop=True) #preprocess enc0=LabelEncoder() enc1 = OneHotEncoder() scaler = MinMaxScaler() for i in train.columns: if train[i].dtype==np.object: t0=enc0.fit_transform(train[i].values.reshape(-1,1)) t1=enc1.fit_transform(t0.reshape(-1,1)).toarray() tf=pd.DataFrame(t1,index=train.index) tf.rename(columns=lambda x: i+'_'+str(x)+'_E', inplace=True) train.drop(i,inplace=True,axis=1) train=train.join(tf,how='inner') clas = enc0.classes_ if test[i][~test[i].isin(clas)].size != 0: ind = test[i][~test[i].isin(clas)].index test[i].iloc[ind] = clas[0] t0=enc0.transform(test[i].values.reshape(-1,1)) t1=enc1.transform(t0.reshape(-1,1)).toarray() tf=pd.DataFrame(t1,index=test.index) tf.rename(columns=lambda x: i+'_'+str(x)+'_E', inplace=True) test.drop(i,inplace=True,axis=1) test=test.join(tf,how='inner') else: tt0=train[i].values.reshape(-1,1) tt0_s=scaler.fit_transform(tt0) train[i+'_S']=tt0_s train.drop(i,inplace=True,axis=1) tt2=test[i].values.reshape(-1,1) tt2_s=scaler.transform(tt2) test[i+'_S']=tt2_s test.drop(i,inplace=True,axis=1) #feature selection sel = VarianceThreshold(threshold=0.0002) train_new=sel.fit_transform(train) sup=sel.get_support() features=train.columns.tolist() for i in xrange(train.shape[1]): if sup[i]==False: features.remove(train.columns[i]) train=pd.DataFrame(train_new,columns=features) test_new=sel.transform(test) test=pd.DataFrame(test_new,columns=features) self.train=train.copy() self.train_label=train_label.copy() self.test=test.copy() self.test_label=test_label.copy() print 'End: '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') return train,train_label,test,test_label
combo_ctr=0 feat_arr=[0 for col in range(feat_cnt)] #Initialize feature array for idx in range(feat_cnt): roll_idx=idx feat_space_search(feat_arr, idx) #Recurse feat_arr=[0 for col in range(feat_cnt)] #Reset feature array after each iteration print('# of Feature Combos Tested:', combo_ctr) print(best_score, sel_idx, len(data_np[0])) print("Wrapper Feat Sel Runtime:", time.time()-start_ts) if fs_type ==5: print("L2 Regularization") sel = SelectFromModel(LogisticRegression(penalty = 'l2', C = 1.0, solver = 'liblinear')) sel.fit(data_np, target_np) sel_idx = sel.get_support() ##2) Get lists of selected and non-selected features (names and indexes) ####### temp=[] temp_idx=[] temp_del=[] for i in range(len(data_np[0])): if sel_idx[i]==1: #Selected Features get added to temp header temp.append(header[i+feat_start]) temp_idx.append(i) else: #Indexes of non-selected features get added to delete array temp_del.append(i) print('Selected', temp) print('Features (total/selected):', len(data_np[0]), len(temp)) print('\n')
df_gram, df_edit, df_w2v, df_tdif, df_char_nstem, df_word_nstem, df_gram_nstem, df_edit_nstem, df_w2v_nstem, df_tdif_nstem, ), axis=1).replace(np.inf, 1e20) # Some infinites appear on the middle of the data... del df_w2v, df_char_nstem, df_word_nstem, df_gram_nstem, df_w2v_nstem del df_gram, df_edit, df_tdif, df_tdif_nstem, df_char, df_word, joblib.dump(df_metrics, 'df_metrics.pkl', compress=9) #id_test = df_tudo.iloc[num_train:]['id'] #y_train = df_tudo.iloc[:num_train].relevance.values #joblib.dump(y_train, 'y_train.pkl') #joblib.dump(id_test, 'id_test.pkl') id_test = joblib.load('id_test.pkl') y_train = joblib.load('y_train.pkl') var = VarianceThreshold() var.fit_transform(df_metrics) df_val_metrics = df_metrics[var.get_support(indices=True)] joblib.dump(df_val_metrics, 'df_val_metrics.pkl', compress=9) df_train = df_val_metrics.iloc[:num_train] df_test = df_val_metrics.iloc[num_train:]
# ----------------
nifti_masker = NiftiMasker(standardize=False, smoothing_fwhm=2,
                           memory='nilearn_cache')  # cache options
gm_maps_masked = nifti_masker.fit_transform(gm_imgs_train)

# The features with too low between-subject variance are removed using
# :class:`sklearn.feature_selection.VarianceThreshold`.
from sklearn.feature_selection import VarianceThreshold
variance_threshold = VarianceThreshold(threshold=.01)
gm_maps_thresholded = variance_threshold.fit_transform(gm_maps_masked)
gm_maps_masked = variance_threshold.inverse_transform(gm_maps_thresholded)

# Then we convert the data back to the mask image in order to use it for
# the decoding process
mask = nifti_masker.inverse_transform(variance_threshold.get_support())

############################################################################
# Prediction pipeline with ANOVA and SVR using
# :class:`nilearn.decoding.DecoderRegressor` Object

# In nilearn we can benefit from the built-in DecoderRegressor object to
# do ANOVA with SVR instead of manually defining the whole pipeline.
# This estimator also uses Cross Validation to select best models and ensemble
# them. Furthermore, you can pass n_jobs=<some_high_value> to the
# DecoderRegressor class to take advantage of a multi-core system.
# To save time (because these are anat images with many voxels), we include
# only the 1-percent voxels most correlated with the age variable to fit. We
# also want to set the mask hyperparameter to be the mask we just obtained above.
from nilearn.decoding import DecoderRegressor
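# Illustrative sketch only (assumed parameter values, not the original example's code):
# an SVR decoder restricted to the variance-thresholded mask, screening the 1% of voxels
# most correlated with age, and spread over several cores via n_jobs.
decoder = DecoderRegressor(estimator='svr', mask=mask,
                           scoring='neg_mean_absolute_error',
                           screening_percentile=1, n_jobs=4)
# decoder.fit(gm_imgs_train, age_train)  # age_train is an assumed target vector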
def spearm_cor_func(expected, pred): return spearmanr(expected, pred)[0] # Folders submission_filename_prefix = 'sc3_emanuel_phase2_' # Import data train_exp, train_cnv, train_ess, leader_exp, leader_cnv, prioritized_genes = read_data_sets() X_train_pre = train_exp X_test_pre = leader_exp var_thres = VarianceThreshold(0.65).fit(X_train_pre) X_train_pre = X_train_pre.loc[:, var_thres.get_support()] X_test_pre = X_test_pre.loc[:, var_thres.get_support()] # Prepare features features = X_train_pre.columns important_features = [] for gene in prioritized_genes: # Assemble prediction variables X_train = X_train_pre y_train = train_ess.ix[:, gene] X_test = X_test_pre # Feature selection fs = SelectKBest(f_regression, k=100) X_train = fs.fit_transform(X_train, y_train)
- Embedded Method

Filter Methods fall broadly into three groups:
- based on the feature values alone
- based on the correlation coefficients between features
- based on statistical evaluation metrics

# Feature values alone
- Zero variance => every value is the same => drop the feature

from sklearn.feature_selection import VarianceThreshold

X = desc_df.values
select = VarianceThreshold()
X_new = select.fit_transform(X)

np.array(descs)[select.get_support()==False]  # check the number of features after reduction

- Nearly zero variance => inspect the data carefully and decide whether to drop
- A feature that is identical to another feature

# Correlation coefficients between features
Benefits
- By dropping one of each pair of highly correlated features, the dimensionality of the
  feature space is reduced with little impact on accuracy
- It improves the interpretability of linear models

Pearson correlation coefficient (the usual correlation coefficient)

threshold = 0.8  # threshold
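A small sketch of the correlation filter described above (a hypothetical helper, not part of the
original note), dropping one feature from each pair whose absolute Pearson correlation exceeds
the threshold:

import numpy as np
import pandas as pd

def drop_highly_correlated(df, threshold=0.8):
    corr = df.corr().abs()
    # keep only the upper triangle so each pair is inspected once
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    to_drop = [col for col in upper.columns if (upper[col] > threshold).any()]
    return df.drop(columns=to_drop)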
def do(): print 'loading data' #X_train, y_train = load_svmlight_file("sparse_input.log") #X_train, y_train = load_svmlight_file("GetDict_80w_new_input.txt") X_train, y_train = load_svmlight_file("GetUniq_100w_new_input.txt") #X_test, y_test = load_svmlight_file("svm_test.log", n_features=X_train.shape[1]) startime= time.clock() print 'fs start' _tree=''' print 'tree start' print '1:',X_train.shape clf_tree = ExtraTreesClassifier() X_train = clf_tree.fit(X_train.toarray(), y_train).transform(X_train) print '2:',X_train.shape #X_train_tree = sp.sparse.csr_matrix(X_new_tree) #''' #_threshold=''' th=0.99 print 'v start,threshold=',th print '1:',X_train.shape sel = VarianceThreshold(threshold=(th * (1 - th))) X_train=sel.fit_transform(X_train) print '2:',X_train.shape X_train=X_train.toarray() #print sel.variances_ fet_got = sel.get_support(True) print type(fet_got) str='' for it in fet_got: str+='%d,'%it print str return #''' _Kbest=''' num=180 print '_Kbest,',num print X_train.shape X_train = SelectKBest(chi2, k=num).fit_transform(X_train, y_train) print X_train.shape X_train=X_train.toarray() #''' _Kper=''' print X_train.shape X_train = SelectPercentile(f_classif, percentile=10).fit_transform(X_train, y_train) print X_train.shape #''' _svd=''' print 'PCA+RF 1:',X_train.shape svd = TruncatedSVD(n_components=500, random_state=42) svd.fit(X_train) TruncatedSVD(algorithm='randomized', n_components=500, n_iter=5, random_state=42, tol=0.0) print 'var percent=',(svd.explained_variance_ratio_.sum()) X_train = svd.transform(X_train) #X_test = svd.transform(X_test) print '2:',X_train.shape #print '2:',X_test.shape #''' fstime = time.clock() print 'fs end,time=%fs'%(fstime-startime) #_cv=''' print 'cv start,LSVM' #clf = svm.SVC() #clf = svm.LinearSVC() #clf = tree.DecisionTreeClassifier() clf = RandomForestClassifier(n_estimators=10) scores=cross_validation.cross_val_score(clf,X_train,y_train,cv=5,scoring="accuracy") print(scores,scores.mean()) #cv end #''' _svm=''' print 'SVM trainning start' clf = svm.SVC() clf.fit(X_train, y_train) print 'SVM predicting start' y_pred = clf.predict(X_train) print y_pred print y_train print 'SVM predict end' print "Accuracy", np.mean(y_pred == y_train) #''' _dtm=''' print 'DTM train start' clf = tree.DecisionTreeClassifier() #pca出来的是dense的,否则X.toarray() clf.fit(X_train, y_train) print 'DTM predict start' traintime = time.clock() print 'trainning end, time=%fs'%(traintime-startime) y_pred = clf.predict(X_train) #print y_pred print 'DTM predict end' print "Accuracy", np.mean(y_pred == y_train) # ''' predtime = time.clock() print 'predict end,time=%fs'%(predtime-startime)
def find_low_variance_features(self, threshold=0.0, skip_columns=[]): """ Wrapper for sklearn VarianceThreshold for use on pandas dataframes. """ df = self.dataset.data #print("Finding low-variance features.") #try: # get list of all the original df columns all_columns = df.columns # remove `skip_columns` remaining_columns = all_columns.drop(skip_columns) # get length of new index max_index = len(remaining_columns) - 1 # get indices for `skip_columns` skipped_idx = [all_columns.get_loc(column) for column in skip_columns] # adjust insert location by the number of columns removed # (for non-zero insertion locations) to keep relative # locations intact for idx, item in enumerate(skipped_idx): if item > max_index: diff = item - max_index skipped_idx[idx] -= diff if item == max_index: diff = item - len(skip_columns) skipped_idx[idx] -= diff if idx == 0: skipped_idx[idx] = item # get values of `skip_columns` skipped_values = df.iloc[:, skipped_idx].values # get dataframe values X = df.loc[:, remaining_columns].values # instantiate VarianceThreshold object vt = VarianceThreshold(threshold=threshold) # fit vt to data vt.fit(X) # get the indices of the features that are being kept feature_indices = vt.get_support(indices=True) # remove low-variance columns from index feature_names = [ remaining_columns[idx] for idx, _ in enumerate(remaining_columns) if idx in feature_indices ] # get the columns to be removed low_variance_features = list( np.setdiff1d(remaining_columns, feature_names)) self.low_variance_features += low_variance_features if (len(low_variance_features)): self.log.info( "find_low_variance_features: {0} features below {1}.".format( len(low_variance_features), threshold)) else: self.log.info( "find_low_variance_features: none found below threshold %s:", threshold) return low_variance_features
train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv') ids_tr = train.pop('id').values ids_te = test.pop('id').values magic_tr = train.pop('wheezy-copper-turtle-magic').values magic_te = test.pop('wheezy-copper-turtle-magic').values target = train.pop('target').values train = train.values test = test.values # infomative columns of each magic value vt = VarianceThreshold(threshold=1.5) infomative_cols = [] for i in range(MAX_MAGIC_NO): vt.fit(train[magic_tr == i]) infomative_cols.append(vt.get_support(indices=True)) ### Step-1 ### oof_all = [] pred_all = [] for n in range(1, MAX_COMPONENTS + 1): oof_n = np.zeros(len(train)) pred_n = np.zeros(len(test)) gmm0 = GaussianMixture(n_components=n, covariance_type='full', random_state=RANDOM_SEED) gmm1 = GaussianMixture(n_components=n, covariance_type='full', random_state=RANDOM_SEED) for i in range(MAX_MAGIC_NO): print('.', end='')
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0) # ### Constant Feature Removal # In[12]: constant_filter = VarianceThreshold(threshold=0) constant_filter.fit(x_train) # In[13]: constant_filter.get_support().sum() # In[14]: constant_list = [not temp for temp in constant_filter.get_support()] constant_list # In[15]: x.columns[constant_list] # In[16]: x_train_filter = constant_filter.transform(x_train) x_test_filter = constant_filter.transform(x_test)