class f_regressionFDRPrim(primitive):
    """Feature-selection primitive that wraps ``SelectFdr(f_regression)``.

    ``fit`` builds and fits the selector; ``produce`` applies it and returns
    the reduced feature frame under key ``0``.
    """

    def __init__(self, random_state=0):
        """Set the primitive's static metadata.

        ``random_state`` is accepted but not used anywhere in this class.
        """
        super(f_regressionFDRPrim, self).__init__(name='f_regressionFDR')
        self.id = 34
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = (
            "Filter: Select the p-values for an estimated false discovery "
            "rate with F-value between label/feature for regression tasks. "
            "This uses the Benjamini-Hochberg procedure. alpha is an upper "
            "bound on the expected false discovery rate."
        )
        self.hyperparams_run = {'default': True}
        self.selector = None  # replaced by a SelectFdr instance in fit()
        self.accept_type = 'c_r'

    def can_accept(self, data):
        """Delegate the acceptance check to ``can_accept_c`` for 'Regression'."""
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        """Return True only when ``data['X']`` has at least three columns."""
        n_features = data['X'].shape[1]
        return n_features >= 3

    def fit(self, data):
        """Create the selector and fit it on ``X`` / ``Y`` of the handled data."""
        prepared = handle_data(data)
        self.selector = SelectFdr(f_regression)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Apply the fitted selector and return ``{0: output}``.

        ``output['X']`` is rebuilt as a DataFrame holding only the selected
        columns, labelled with their original names.
        """
        output = handle_data(data)
        frame = output['X']
        names = list(frame.columns)
        support = self.selector.get_support(indices=False)
        # Keep the names whose support flag is truthy, in column order.
        kept_names = [name for name, keep in zip(names, support) if keep]
        reduced = self.selector.transform(frame)
        output['X'] = pd.DataFrame(reduced, columns=kept_names)
        return {0: output}
class UnivariateSelectChiFDRPrim(primitive):
    """Feature-selection primitive that wraps ``SelectFdr(chi2, alpha=0.05)``.

    ``fit`` builds and fits the selector; ``produce`` applies it on a
    best-effort basis and returns the result under key ``0``.
    """

    def __init__(self, random_state=0):
        """Set the primitive's static metadata.

        ``random_state`` is accepted but not used anywhere in this class.
        """
        super(UnivariateSelectChiFDRPrim, self).__init__(name='UnivariateSelectChiFDR')
        self.id = 31
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = (
            "Filter: Select the p-values for an estimated false discovery "
            "rate with Chi-square. This uses the Benjamini-Hochberg "
            "procedure. alpha is an upper bound on the expected false "
            "discovery rate."
        )
        self.hyperparams_run = {'default': True}
        self.selector = None  # replaced by a SelectFdr instance in fit()
        self.accept_type = 'd'

    def can_accept(self, data):
        """Delegate the acceptance check to ``can_accept_d`` for 'Classification'."""
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        """Return True only when ``data['X']`` has at least three columns."""
        n_features = data['X'].shape[1]
        return n_features >= 3

    def fit(self, data):
        """Create the selector and fit it on ``X`` / ``Y`` of the handled data."""
        prepared = handle_data(data)
        self.selector = SelectFdr(chi2, alpha=0.05)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Apply the fitted selector and return ``{0: output}``.

        If selection fails for any reason the exception is printed and
        ``output['X']`` is left as produced by ``handle_data``.
        """
        output = handle_data(data)
        names = list(output['X'].columns)
        try:
            support = self.selector.get_support(indices=False)
            # Keep the names whose support flag is truthy, in column order.
            kept_names = [name for name, keep in zip(names, support) if keep]
            reduced = self.selector.transform(output['X'])
            output['X'] = pd.DataFrame(reduced, columns=kept_names)
        except Exception as e:
            print(e)
        return {0: output}
def svm_cv(data, data_target):
    """Train a linear SVM on FDR-selected features and print test metrics.

    The data is split with ``cross_validation.train_test_split``, features
    are selected with ``SelectFdr(f_classif)`` fitted on the training part
    only, and a probability-enabled linear ``SVC`` is trained on the
    selected features.

    Args:
        data: Feature matrix passed to ``train_test_split``.
        data_target: Labels passed to ``train_test_split``.

    Returns:
        None. The selector support mask, accuracy and classification report
        are printed; the precision/recall curve is computed but not returned.
    """
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        data, data_target)

    print("*" * 79)
    print("Training...")
    selector = SelectFdr(f_classif)
    selector.fit(X_train, y_train)
    clf = svm.SVC(kernel='linear', probability=True)
    clf.fit(selector.transform(X_train), y_train)

    print("Testing...")
    # Transform the held-out split once and reuse it for both calls.
    X_test_selected = selector.transform(X_test)
    pred = clf.predict(X_test_selected)
    # Probabilities come from the classifier, not from the prediction array.
    probs = clf.predict_proba(X_test_selected)

    # Local names deliberately differ from the sklearn.metrics functions so
    # that same-named imports are not shadowed.
    accuracy = metrics.accuracy_score(y_test, pred)
    report = metrics.classification_report(y_test, pred)
    support = selector.get_support()

    print(support)
    print(accuracy)
    print(report)

    # Column 1 is the probability of the second class in clf.classes_.
    precision, recall, thresholds = precision_recall_curve(y_test, probs[:, 1])
def select_fdr(input_data, feature_names=None, score_func=f_classif, alpha=0.05):
    """Select features with ``SelectFdr``, relaxing ``alpha`` until >1 survive.

    The selector is fitted on the training split and applied to the test
    split. If fewer than two features survive, ``alpha`` is increased by its
    initial value (capped at 1.0) and selection is retried.

    Args:
        input_data: Indexable holding ``x_train, y_train, x_test, y_test``
            at positions 0-3.
        feature_names: Optional names indexed with the boolean support mask
            (so they must support boolean-mask indexing).
        score_func: Scoring function handed to ``SelectFdr``. When it is
            ``f_classif`` the data is first passed through
            ``remove_constant``.
        alpha: Initial FDR bound; also used as the retry increment.

    Returns:
        Tuple ``(output_data, feature_names, final_args)`` where
        ``output_data`` is ``(x_train, y_train, x_test, y_test)`` after
        selection and ``final_args`` records the ``score_func`` and the
        ``alpha`` that was finally used.

    Raises:
        ValueError: If fewer than two features are kept and ``alpha`` cannot
            be relaxed any further (it already reached 1.0, or the increment
            is not positive). Previously this case looped forever.
    """
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data, feature_names)

    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    # 3-D inputs are flattened for selection and restored afterwards.
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    increment = alpha
    while True:
        feature_selector = SelectFdr(score_func=score_func, alpha=alpha)
        temp_x_train = feature_selector.fit_transform(x_train, y_train)
        temp_x_test = feature_selector.transform(x_test)
        if temp_x_train.shape[1] > 1 and temp_x_test.shape[1] > 1:
            x_train = temp_x_train
            x_test = temp_x_test
            break

        # Guard against an endless retry loop: alpha is bounded by 1.0 and a
        # non-positive increment would never make progress.
        if increment <= 0 or alpha >= 1.0:
            raise ValueError(
                'SelectFdr kept fewer than 2 features at alpha={} and alpha '
                'cannot be increased further'.format(alpha))

        next_alpha = min(alpha + increment, 1.0)
        msg = 'Feature selection was too aggresive, '
        msg += 'increasing alpha from {} to {}'.format(alpha, next_alpha)
        alpha = next_alpha
        logging.warning(msg)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    logging.info('Selected {} features'.format(x_train.shape[1]))

    final_args = {'score_func': score_func, 'alpha': alpha}
    return output_data, feature_names, final_args
def select_fdr(df, target_col):
    """Keep only the feature columns that pass a chi-square FDR test.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_col: Name of the target column in ``df``.

    Returns:
        A new DataFrame with the selected feature columns followed by
        ``target_col``, carrying the same row index as ``df``. If no feature
        is selected a message is printed and ``df`` is returned unchanged.
    """
    y = df[target_col]
    X = df.drop(target_col, axis=1)

    selector = SelectFdr(chi2, alpha=0.01).fit(X, y)
    mask = selector.get_support()

    if not mask.any():
        print(
            'No features were selected: either the data is too noisy or the selection Test_data too strict.'
        )
        return df

    saved_columns = list(X.columns[mask])
    # Reuse the original index: assigning ``y`` below aligns on index, so a
    # fresh RangeIndex would misalign (or NaN-fill) the target whenever
    # ``df`` does not have a default 0..n-1 index.
    result = pd.DataFrame(
        selector.transform(X), columns=saved_columns, index=X.index)
    result[target_col] = y
    return result
###### MASKING FOR SELECTED STIMS targetNames = ['bottle', 'face', 'scissors'] # the ttims of interest stimMask = targetData.labels.isin(targetNames) # indices for the stim of interest X_fMRI_selected = X_fMRI[stimMask] # features (for selected stimuli only) y = np.array(targetData.labelInd)[stimMask] # labels ###### FEATURE SELECTION # FDR feature selector selector = SelectFdr(f_classif, alpha=0.01) # FDR selector object selector.fit(X_fMRI_selected, y) # learning from the data X = selector.transform(X_fMRI_selected) # Selected features only indVoxels = selector.get_support(indices=True) # indices of surviving voxels ###### VISUALIZING FEATURE LOCATIONS # binary vector with 1s indicating selected voxels bROI = np.zeros(X_fMRI.shape[-1]) bROI[indVoxels] = 1 # reverse masking bROI_img = masker.inverse_transform(bROI) # Create the figure plot_stat_map(bROI_img, imgAnat, title='Voxels surviving FDR')