示例#1
0
class f_regressionFDRPrim(primitive):
    def __init__(self, random_state=0):
        super(f_regressionFDRPrim, self).__init__(name='f_regressionFDR')
        self.id = 34
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the p-values for an estimated false discovery rate with F-value between label/feature for regression tasks. This uses the Benjamini-Hochberg procedure. alpha is an upper bound on the expected false discovery rate."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFdr(f_regression)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        mask = self.selector.get_support(indices=False)
        final_cols = list(compress(cols, mask))
        output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        final_output = {0: output}
        return final_output
示例#2
0
class UnivariateSelectChiFDRPrim(primitive):
    def __init__(self, random_state=0):
        super(UnivariateSelectChiFDRPrim, self).__init__(name='UnivariateSelectChiFDR')
        self.id = 31
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the p-values for an estimated false discovery rate with Chi-square. This uses the Benjamini-Hochberg procedure. alpha is an upper bound on the expected false discovery rate."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'd'

    def can_accept(self, data):
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFdr(chi2, alpha=0.05)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        try:
            mask = self.selector.get_support(indices=False)
            final_cols = list(compress(cols, mask))
            output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        except Exception as e:
            print(e)
        final_output = {0: output}
        return final_output
示例#3
0
def svm_cv(data, data_target):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, data_target)
    print "*" * 79
    print "Training..."
    # selector = SelectFdr(chi2)
    selector = SelectFdr(f_classif)
    selector.fit(X_train, y_train)
    clf = svm.SVC(kernel='linear', probability=True)
    clf.fit(selector.transform(X_train), y_train)
    print "Testing..."
    pred = clf.predict(selector.transform(X_test))
    probs = pred.predict_proba(selector.transfrom(X_test))
    accuracy_score = metrics.accuracy_score(y_test, pred)
    classification_report = metrics.classification_report(y_test, pred)
    support = selector.get_support()
    print support
    print accuracy_score
    print classification_report
    precision, recall, thresholds = precision_recall_curve(y_test, probs[:, 1])
示例#4
0
def select_fdr(input_data,
               feature_names=None,
               score_func=f_classif,
               alpha=0.05):
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data,
                                                       feature_names)

    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    done = False
    increment = alpha
    while not done:
        feature_selector = SelectFdr(score_func=score_func, alpha=alpha)
        temp_x_train = feature_selector.fit_transform(x_train, y_train)
        temp_x_test = feature_selector.transform(x_test)
        if temp_x_train.shape[1] > 1 and temp_x_test.shape[1] > 1:
            done = True
            x_train = temp_x_train
            x_test = temp_x_test
        else:
            msg = 'Feature selection was too aggresive, '
            msg += 'increasing alpha from {} to {}'.format(
                alpha, alpha + increment)
            alpha += increment
            logging.warning(msg)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    logging.info('Selected {} features'.format(x_train.shape[1]))

    final_args = {'score_func': score_func, 'alpha': alpha}

    return output_data, feature_names, final_args
示例#5
0
def select_fdr(df, target_col):
    y = df[target_col]
    X = df.drop(target_col, axis=1)
    selector = SelectFdr(chi2, alpha=0.01).fit(X, y)
    true_list = list(selector.get_support())
    index = [i for i in range(len(true_list)) if true_list[i] == True]
    if len(index) == 0:
        print(
            'No features were selected: either the data is too noisy or the selection Test_data too strict.'
        )
        return df
    else:
        saved_columns = [list(X.columns)[i] for i in index]
        result = pd.DataFrame(selector.transform(X), columns=saved_columns)
        result[target_col] = y
    return result

###### MASKING FOR SELECTED STIMS
targetNames = ['bottle', 'face', 'scissors']  # the ttims of interest
stimMask = targetData.labels.isin(targetNames)  # indices for the stim of interest
X_fMRI_selected = X_fMRI[stimMask]   # features (for selected stimuli only)
y = np.array(targetData.labelInd)[stimMask]  # labels




###### FEATURE SELECTION
# FDR feature selector
selector = SelectFdr(f_classif, alpha=0.01)  # FDR selector object
selector.fit(X_fMRI_selected, y)   # learning from the data
X = selector.transform(X_fMRI_selected)   # Selected features only
indVoxels = selector.get_support(indices=True)   # indices of surviving voxels


###### VISUALIZING FEATURE LOCATIONS
# binary vector with 1s indicating selected voxels
bROI = np.zeros(X_fMRI.shape[-1])
bROI[indVoxels] = 1
# reverse masking
bROI_img = masker.inverse_transform(bROI)

# Create the figure
plot_stat_map(bROI_img, imgAnat, title='Voxels surviving FDR')