def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="percentile", param=25).fit(X, y).transform(X)
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert_true(sparse.issparse(X_r2inv))
    support_mask = safe_mask(X_r2inv, support)
    assert_equal(X_r2inv.shape, X.shape)
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert_equal(X_r2inv.getnnz(), X_r.getnnz())
Example No. 2
def test_select_kbest_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the k best heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (GenericUnivariateSelect(f_classif, mode="k_best",
                                    param=5).fit(X, y).transform(X))
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example No. 3
    def get_feature_selection_model(cls, model_name, estimator_name=None):
        feature_selection_model = None
        model_param = copy.deepcopy(
            feature_selection_config_dict['model_param'])

        if model_name == 'Embedded':
            # Resolve the estimator name to an actual estimator, as the
            # Wrapper branch does, before handing it to SelectFromModel.
            feature_selection_model = \
                SelectFromModel(get_estimator(estimator_name),
                                **model_param['Embedded'])
        elif model_name == 'Wrapper':
            feature_selection_model = \
                RFECV(estimator=get_estimator(estimator_name),
                      **model_param['Wrapper'])
        elif model_name == 'Filter':
            model_param['Filter']['score_func'] = get_score_func(
                model_param['Filter']['score_func'])
            feature_selection_model = GenericUnivariateSelect(
                **model_param['Filter'])
        elif model_name == 'KeepAll':
            feature_selection_model = 'KeepAll'
        else:
            raise ValueError(
                "model_name must be in ('Embedded', 'Wrapper', 'Filter', "
                "'KeepAll') but is %s" % model_name)

        return feature_selection_model
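For context, the branches above expect `feature_selection_config_dict['model_param']` to carry one kwargs dict per selection mode. The project's real config is not shown here, so the following shape is only an illustrative assumption:

# Hypothetical shape of feature_selection_config_dict (not the project's real
# config): one kwargs dict per selection mode.
feature_selection_config_dict = {
    'model_param': {
        'Embedded': {'threshold': 'mean'},                 # -> SelectFromModel(**...)
        'Wrapper': {'step': 1, 'cv': 5},                   # -> RFECV(**...)
        'Filter': {'score_func': 'f_classif',              # resolved by get_score_func()
                   'mode': 'percentile', 'param': 50},     # -> GenericUnivariateSelect(**...)
    }
}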
Example No. 4
def test_select_fdr_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fpr heuristic
    """
    X, y = make_classification(n_samples=200,
                               n_features=20,
                               n_informative=3,
                               n_redundant=2,
                               n_repeated=0,
                               n_classes=8,
                               n_clusters_per_class=1,
                               flip_y=0.0,
                               class_sep=10,
                               shuffle=False,
                               random_state=0)

    univariate_filter = SelectFdr(f_classif, alpha=0.0001)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='fdr',
                                   param=0.0001).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example No. 5
def test_select_heuristics_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the fdr, fwe and fpr heuristics
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = (GenericUnivariateSelect(f_classif, mode=mode,
                                        param=0.01).fit(X, y).transform(X))
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_allclose(support, gtruth)
Example No. 6
 def test_generic_univariate_select_float(self):
     model = GenericUnivariateSelect()
     X = np.array([[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]])
     y = np.array([0, 1, 0, 1])
     model.fit(X, y)
     model_onnx = convert_sklearn(
         model, 'generic univariate select',
         [('input', FloatTensorType([1, X.shape[1]]))])
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         X,
         model,
         model_onnx,
         basename="SklearnGenericUnivariateSelect",
         allow_failure=
         "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.4')")
Example No. 7
    def single_fdr(alpha, n_informative, random_state):
        X, y = make_regression(n_samples=150,
                               n_features=20,
                               n_informative=n_informative,
                               shuffle=False,
                               random_state=random_state,
                               noise=10)

        with warnings.catch_warnings(record=True):
            # Warnings can be raised when no features are selected
            # (low alpha or very noisy data)
            univariate_filter = SelectFdr(f_regression, alpha=alpha)
            X_r = univariate_filter.fit(X, y).transform(X)
            X_r2 = GenericUnivariateSelect(f_regression,
                                           mode='fdr',
                                           param=alpha).fit(X, y).transform(X)

        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        num_false_positives = np.sum(support[n_informative:] == 1)
        num_true_positives = np.sum(support[:n_informative] == 1)

        if num_false_positives == 0:
            return 0.
        false_discovery_rate = (num_false_positives /
                                (num_true_positives + num_false_positives))
        return false_discovery_rate
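A helper like this is typically exercised by averaging the empirical FDR over many seeds and checking that it stays close to the requested alpha; the seed count and slack below are illustrative assumptions, not the original test:

import numpy as np

for alpha in (0.01, 0.05, 0.1):
    # Average over several random datasets; a single draw is too noisy.
    mean_fdr = np.mean([single_fdr(alpha, n_informative=5, random_state=seed)
                        for seed in range(30)])
    assert mean_fdr <= alpha + 0.05, (alpha, mean_fdr)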
Example No. 8
class MutualInfoRazor:

    def __init__(self, percentile=50):
        self.percentile = percentile
        self.transformer = GenericUnivariateSelect(score_func=mutual_info_regression,
                                                   mode='percentile', param=self.percentile)

    @property
    def support_(self):
        return self.transformer.get_support()

    def fit(self, X, Y):
        self.transformer.fit(X, Y)

    def predict(self, X):
        return self.transformer.transform(X=X)
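A quick usage sketch for the wrapper above, on an assumed synthetic regression problem (percentile=25 of 20 features keeps about 5 columns):

import numpy as np
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                       random_state=0)
razor = MutualInfoRazor(percentile=25)
razor.fit(X, y)
X_reduced = razor.predict(X)        # delegates to transformer.transform()
print(X_reduced.shape)              # about (200, 5)
print(np.where(razor.support_)[0])  # indices of the retained columns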
Example No. 9
def test_select_percentile_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the percentile heuristic
    """
    X, y = make_regression(n_samples=200,
                           n_features=20,
                           n_informative=5,
                           shuffle=False,
                           random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(f_regression, mode='percentile',
                                   param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
    # Check inverse_transform respects dtype
    assert_array_equal(X_2.astype(bool),
                       univariate_filter.inverse_transform(X_r.astype(bool)))
Example No. 10
def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (GenericUnivariateSelect(f_classif, mode="percentile",
                                    param=25).fit(X, y).transform(X))
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert sparse.issparse(X_r2inv)
    support_mask = safe_mask(X_r2inv, support)
    assert X_r2inv.shape == X.shape
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert X_r2inv.getnnz() == X_r.getnnz()
Example No. 11
def feature_selection(X, y, test_size=0.2):
    (X_train, X_test, y_train, y_test) = train_test_split(X,
                                                          y,
                                                          test_size=test_size,
                                                          stratify=y,
                                                          random_state=42)

    selector = GenericUnivariateSelect(score_func=chi2,
                                       mode='percentile',
                                       param=70)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    print('Before selection shape:', X_train.shape)
    print('After selection shape:', X_train_selected.shape)

    return (X_train_selected, X_test_selected)
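Since the selector scores with chi2, the inputs must be non-negative; a usage sketch on assumed count-like data:

import numpy as np

rng = np.random.RandomState(0)
X_counts = rng.poisson(lam=3.0, size=(300, 50))   # chi2 needs non-negative values
y_labels = rng.randint(0, 2, size=300)
X_train_sel, X_test_sel = feature_selection(X_counts, y_labels, test_size=0.2)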
Example No. 12
def EvaluatePerformance(classifier, extractionMethod: str, reviews: [Review], label: str, featuresSelector, selectorParam):

    skf = StratifiedKFold(n_splits=10)

    global_features_index = GetGlobalFeaturesIndex(
        reviews, list(range(0, len(reviews))), extractionMethod)

    x, y = [], []

    for review in reviews:

        featuresVector = ExtractFeatureFromCorpus(
            global_features_index, review.review_content, extractionMethod)

        x.append(featuresVector)
        y.append(review.tag.tagId)

    transformer = None

    if extractionMethod in ef.USE_TFIDF:
        transformer = TfidfTransformer(smooth_idf=False)
        x = transformer.fit_transform(x, y).toarray().tolist()

    if extractionMethod in ef.USE_SMOTEENN:
        x, y = sme.fit_sample(x, y)
    elif extractionMethod in ef.USE_SMOTETOMEK:
        x, y = smt.fit_sample(x, y)

    selector = GenericUnivariateSelect(
        chi2, featuresSelector, param=selectorParam)

    x = selector.fit_transform(x, y)

    print("here1")
    grid_search = GridSearchCV(classifier, lg_param_grid, scoring=scorers, refit='accuracy_score',
                               cv=skf, return_train_score=True, n_jobs=-1)
    print("here2")
    grid_search.fit(x, y)
    print("here3")

    # make the predictions
    y_pred = grid_search.predict(x)

    print('Best params for {}'.format('accuracy_score'))
    print(grid_search.best_params_)
Example No. 13
    def operate(self, input_datanode, target_fields=None):
        from sklearn.feature_selection import GenericUnivariateSelect

        feature_types = input_datanode.feature_types
        X, y = input_datanode.data
        if target_fields is None:
            target_fields = collect_fields(feature_types, self.input_type)
        X_new = X[:, target_fields]

        n_fields = len(feature_types)
        irrelevant_fields = list(range(n_fields))
        for field_id in target_fields:
            irrelevant_fields.remove(field_id)

        # chi2 requires non-negative features, so clip any values
        # below zero to zero before fitting.
        if self.score_func == 'chi2':
            X_new[X_new < 0] = 0.0

        if self.model is None:
            self.model = GenericUnivariateSelect(score_func=self.call_func,
                                                 param=self.alpha,
                                                 mode=self.mode)
            self.model.fit(X_new, y)

        _X = self.model.transform(X_new)
        is_selected = self.model.get_support()

        irrelevant_types = [feature_types[idx] for idx in irrelevant_fields]
        # get_support() is indexed by position within target_fields, not by the
        # original field id, so enumerate the target fields here.
        selected_types = [
            feature_types[idx]
            for i, idx in enumerate(target_fields) if is_selected[i]
        ]
        selected_types.extend(irrelevant_types)

        new_X = np.hstack((_X, X[:, irrelevant_fields]))
        new_feature_types = selected_types
        output_datanode = DataNode((new_X, y), new_feature_types,
                                   input_datanode.task_type)
        output_datanode.trans_hist = input_datanode.trans_hist.copy()
        output_datanode.trans_hist.append(self.type)
        output_datanode.enable_balance = input_datanode.enable_balance
        output_datanode.data_balance = input_datanode.data_balance
        self.target_fields = target_fields.copy()

        return output_datanode
Example No. 14
 def test_generic_univariate_select_int(self):
     model = GenericUnivariateSelect()
     X = np.array([[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
                  dtype=np.int64)
     y = np.array([0, 1, 0, 1])
     model.fit(X, y)
     model_onnx = convert_sklearn(
         model, 'generic univariate select',
         [('input', Int64TensorType([1, X.shape[1]]))])
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         X,
         model,
         model_onnx,
         basename="SklearnGenericUnivariateSelect",
         # Operator cast-1 is not implemented in onnxruntime
         allow_failure=
         "StrictVersion(onnx.__version__) < StrictVersion('1.2')")
Example No. 15
def feature_scores(X, Y):
    fselector = GenericUnivariateSelect(f_classif)
    fselector.fit(X, Y)
    p2scores = -np.log10(fselector.pvalues_)
    p2scores /= p2scores.max()

    mutSelector = GenericUnivariateSelect(mutual_info_classif)
    mutSelector.fit(X, Y)
    mutscores = mutSelector.scores_

    return fselector.pvalues_, p2scores, mutscores
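A small sketch of how the three returned arrays could be compared on an assumed dataset (the ANOVA and mutual-information rankings need not agree):

import numpy as np
from sklearn.datasets import make_classification

X, Y = make_classification(n_samples=300, n_features=8, n_informative=3,
                           random_state=0)
pvalues, p2scores, mutscores = feature_scores(X, Y)
print(np.argsort(pvalues)[:3])     # features with the smallest ANOVA p-values
print(np.argsort(-mutscores)[:3])  # features with the largest MI scores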
Example No. 16
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }

    return model_map[type_of_estimator][model_name]
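A sketch of how this lookup table is typically consumed; note the 'RandomizedSparse' entries rely on RandomizedLogisticRegression/RandomizedLasso, which only exist in older scikit-learn releases, so this assumes such a version:

# Look up selectors by name; 'KeepAll' is a plain string sentinel, the rest
# are unfitted transformers ready for fit_transform().
for name in ('SelectFromModel', 'GenericUnivariateSelect', 'KeepAll'):
    selector = get_feature_selection_model_from_name('classifier', name)
    print(name, '->', selector if isinstance(selector, str) else type(selector).__name__)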
Example No. 17
def preprocess(X_train, y_train, X_test):
    print("== Preprocessing Data ==")
    X_merge = np.concatenate((X_train, X_test), axis=0)
    sep = X_train.shape[0]

    # Scale features to range [0, 1]
    scale = MinMaxScaler()
    X_merge = scale.fit_transform(X_merge)

    X_train = X_merge[:sep]
    X_test = X_merge[sep:]

    # Choose top features as ranked by chi squared test
    gus = GenericUnivariateSelect(score_func=chi2, mode="k_best", param=306)
    gus.fit(X_train, y_train)
    X_train = gus.transform(X_train)
    X_test = gus.transform(X_test)

    return X_train, X_test
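A usage sketch with assumed random data: the MinMaxScaler step guarantees the non-negative inputs that chi2 requires, and param=306 means the input must have at least 306 feature columns:

import numpy as np

rng = np.random.RandomState(0)
X_tr, X_te = rng.randn(500, 400), rng.randn(100, 400)
y_tr = rng.randint(0, 3, size=500)
X_tr_sel, X_te_sel = preprocess(X_tr, y_tr, X_te)
print(X_tr_sel.shape, X_te_sel.shape)  # (500, 306) (100, 306)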
Example No. 18
    def dtc03(self):
        # Flatten y into 1-D lists: self.y01_train, self.y01_test
        self.y01_train = list()
        self.y01_test = list()
        for a in range(len(self.y_train)):
            self.y01_train.append(self.y_train[a][0])
        for b in range(len(self.y_test)):
            self.y01_test.append(self.y_test[b][0])
            
        # Collect the distinct labels
        self.labels = list()
        for c in range(len(self.y_test)):
            if self.labels.count(self.y_test[c][0]) == 0:
                self.labels.append(self.y_test[c][0])
        print (self.labels)

        # Implementation of the SelectKBest algorithm
        # Retrieve the parameters from the UI
        if not self.kedit.text().strip():
            self.k = 10
        else:
            self.k = int(self.kedit.text())
        
        if not self.pedit.text().strip():
            self.param = 1e-05
        else:
            self.param = float(self.pedit.text())
        
        self.mode = self.mo_box.itemText(self.mo_box.currentIndex())
        
        # Define the model
        if self.sp_box.itemText(self.sp_box.currentIndex()) == 'SelectKBest':
            self.clf = SelectKBest(score_func= f_classif,  k=self.k) 
            self.clf.fit_transform(self.x_train,self.y01_train)
            self.f_c = self.clf.get_support()
        elif self.sp_box.itemText(self.sp_box.currentIndex()) == 'SelectPercentile':
            self.clf = SelectPercentile(score_func= f_classif,  percentile= self.k) 
            self.clf.fit_transform(self.x_train,self.y01_train)
            self.f_c = self.clf.get_support()
        else:
            self.clf = GenericUnivariateSelect(score_func= f_classif, mode= self.mode, param=self.param) 
            self.clf.fit_transform(self.x_train,self.y01_train)
            self.f_c = self.clf.get_support()
        '''
        This block configures the dtable01 widget, i.e. it displays the
        selection results for the training set.
        '''
        # Show which features the selector kept (True/False per feature)
        self.ufs_dtable.setRowCount(2)
        self.ufs_dtable.setColumnCount(len(self.x_train[0]))
        mlan = "是否保留该特征(T/F)"  # header text: "Keep this feature? (T/F)"
        self.ufs_dtable.setSpan(0, 0, 1, len(self.x_train[0]))
        self.ufs_dtable.setItem(0,0, QtGui.QTableWidgetItem(mlan.decode('utf-8')))
        for j in range(len(self.f_c)):
            self.ufs_dtable.setItem(1,j, QtGui.QTableWidgetItem(str(self.f_c[j])))
Example No. 19
def predictGenericUnivariateSelect(X, y, clf):
    # Fit a chi-squared univariate filter (scores only; the classifier below
    # is still trained on the full feature set).
    features = GenericUnivariateSelect(chi2)
    features.fit_transform(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

    fit_chi2 = clf.fit(X_train, y_train)
    y_pred = fit_chi2.predict(X_test)

    f1_scores = []
    precision_scores = []
    recall_scores = []
    f1 = metrics.f1_score(y_test, y_pred, average=None)
    precision = metrics.precision_score(y_test, y_pred, average=None)
    recall = metrics.recall_score(y_test, y_pred, average=None)
    f1_scores.append(f1)
    precision_scores.append(precision)
    recall_scores.append(recall)

    return np.mean(f1_scores), np.mean(precision_scores), np.mean(
        recall_scores)
Example No. 20
def select_features_univariate(X, y, method='decision_tree'):
    """With high-dimensional datasets it aids classifier performance to select
    features of interest.
    This function rejects features below a certain (univariate) threshold.

    Parameters
    ----------
    X : ndarray
        repetitions by features
    y : ndarray
        vector of labels for each repetition
    method : string
        method used for data reduction, one of
        {'decision_tree', 'decision_tree_RFECV', 'mutual_information',
         'univariate_select'}

    Returns
    -------
    X_transformed : ndarray
        repetitions by features (reduced)
    weights : ndarray or boolean
        relative importance of the features, or a binary mask (kept or not)
    """
    # based on the method we choose the clf to fit and transform the data
    if method == 'decision_tree_RFECV':
        clf = DecisionTreeClassifier()
        trans = RFECV(clf)
        X_transformed = trans.fit_transform(X, y)
        weights = trans.get_support()
    elif method == 'decision_tree':
        clf = DecisionTreeClassifier()
        clf.fit(X, y)
        # choose features with an importance that is more than avg.
        selected_features = np.where(
            clf.feature_importances_ > clf.feature_importances_.mean(0), 1, 0)
        X_transformed = X[:, selected_features == 1]
        weights = clf.feature_importances_
    elif method == 'mutual_information':
        mutual_info = mutual_info_classif(X, y)
        # choose features above the avg mutual information threshold.
        selected_features = np.where(mutual_info > mutual_info.mean(0), 1, 0)
        X_transformed = X[:, selected_features == 1]
        weights = mutual_info  #continuous
    elif method == 'univariate_select':
        # select features with more univariate activity than avg.
        trans = GenericUnivariateSelect(score_func=lambda X, y: X.mean(axis=0),
                                        mode='percentile',
                                        param=50)
        X_transformed = trans.fit_transform(X, y)
        weights = trans.get_support()  #binary

    return X_transformed, weights
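A usage sketch on an assumed synthetic classification problem, comparing two of the supported methods:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=30, n_informative=5,
                           random_state=0)
for method in ('decision_tree', 'mutual_information'):
    X_t, weights = select_features_univariate(X, y, method=method)
    print(method, X.shape, '->', X_t.shape)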
Example No. 21
def test_select_percentile_classif_sparse():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the percentile heuristic
    """
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='percentile',
                    param=25).fit(X, y).transform(X)
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example No. 22
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    # TODO(PRESTON): eventually let threshold be user-configurable (or grid_searchable)
    # TODO(PRESTON): optimize the params used here
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1)),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1)),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }

    return model_map[type_of_estimator][model_name]
Example No. 23
def univariate_feature_selection_with_GUS(dataframe):
    """ uses univariate statistics such as mutual information regression to select the k best features
    for a regression problem """
    from sklearn.feature_selection import GenericUnivariateSelect, mutual_info_regression

    X, y = xy_split(dataframe)
    start = default_timer()
    # Keep roughly a third of the feature columns, ranked by mutual information.
    selector = GenericUnivariateSelect(score_func=mutual_info_regression, mode="k_best",
                                       param=round((dataframe.shape[1] - 1) / 3))
    select_features_gus = selector.fit_transform(X, y)
    end = default_timer()
    print("Elapsed Time for feature selection: {}s".format(end - start))
    print(selector.scores_)
    return select_features_gus
Example No. 24
def test_skdatasets_classif_MIFilter_vs_f_classif(X, y, p, k):
    '''
    Test several scikit-learn datasets to compare feature selection based on MI and ANOVA
    for classification problems
    '''
    #scores, r = univariate_f_MI(X, y, type='cd',njobs=4)
    #scores, r= univariate_f_MI(X, y,k=30, type='cd') njobs=4
    #print("MI scores:",scores)
    #support_MI = get_support(scores, 4)
    kbest_univ_f_MI = _MI_Filter(univariate_f_MI, mode='k_best',param=k,type='cd',njobs=4)
    kbest_univ_f_MI.fit(X,y)
    support_MI = kbest_univ_f_MI._get_support_mask()
    print("MI Univariate scores:",kbest_univ_f_MI.scores_)
    print("MI Univariate r:",kbest_univ_f_MI.ranking_)
    print("MI Univariate support:",support_MI)
    kfirst_uforward_MI = _MI_Filter(univariate_forward_f_MI, mode='k_first',param=k,type='cd',njobs=4)
    kfirst_uforward_MI.fit(X,y)
    support_uforward_MI = kfirst_uforward_MI._get_support_mask()
    print("MI Univariate forward scores:",kfirst_uforward_MI.scores_)
    print("MI Univariate forward r:",kfirst_uforward_MI.ranking_)
    print("MI Univariate forward support:",support_uforward_MI)
    kfirst_mforward_MI = _MI_Filter(multivariate_forward_f_MI, mode='k_first',param=k,type='cd',njobs=4)
    kfirst_mforward_MI.fit(X,y)
    support_mforward_MI = kfirst_mforward_MI._get_support_mask()
    print("MI multivariate forward scores:",kfirst_mforward_MI.scores_)
    print("MI multivariate forward r:",kfirst_mforward_MI.ranking_)
    print("MI multivariate forward support:",support_mforward_MI)

    kfirst_mbackward_MI = _MI_Filter(multivariate_backward_f_MI, mode='k_first',param=-k,type='cd',njobs=4)
    kfirst_mbackward_MI.fit(X,y)
    support_mbackward_MI = kfirst_mbackward_MI._get_support_mask()
    print("MI multivariate backward scores:",kfirst_mbackward_MI.scores_)
    print("MI multivariate backward r:",kfirst_mbackward_MI.ranking_)
    print("MI multivariate backward support:",support_mbackward_MI)
    filter_F = GenericUnivariateSelect(f_classif, mode='k_best',param=k)
    filter_F.fit(X, y)
    print("F scores:",filter_F.scores_)
    support_F = filter_F._get_support_mask()
    print("F support :",support_F)
    '''
Example No. 25
    def feature_Selection(self, target=""):
        """Automated feature selection using sklearn GenericUnivariateSelect

        Keyword arguments:
        target -- target column for feature selection (default "")
        """
        data_set = self.DF
        y = data_set[target]
        X = data_set.drop(columns=[target])
        print(f"We are starting with the following columns:\n{X.columns}\n")
        transformer = GenericUnivariateSelect(
            f_classif if self.type == "classification" else f_regression,
            mode="percentile")
        self.data = transformer.fit_transform(X, y)
        columns_retained = self.DF.iloc[:, 1:].columns[
            transformer.get_support()].values
        self.DF = self.DF[columns_retained]
        self.DF[target] = y

        print(
            f"The following columns are left:\n{self.DF.drop(columns=[target]).columns}"
        )
Example No. 26
def test_mutual_info_classif():
    X, y = make_classification(
        n_samples=100,
        n_features=5,
        n_informative=1,
        n_redundant=1,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_classif, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (GenericUnivariateSelect(mutual_info_classif,
                                    mode="k_best",
                                    param=2).fit(X, y).transform(X))
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_classif, percentile=40)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (GenericUnivariateSelect(mutual_info_classif,
                                    mode="percentile",
                                    param=40).fit(X, y).transform(X))
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)
Example No. 27
def featureSelection(x_train_features, y_train):
    # Keep the 256 features with the best univariate F-test (regression) scores;
    # selected_features is the boolean support mask returned by get_support().
    selected_features = GenericUnivariateSelect(f_regression,
                                                mode='k_best',
                                                param=256).fit(
                                                    x_train_features,
                                                    y_train).get_support()

    return selected_features
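get_support() returns a boolean mask, so the result can index columns directly; a sketch on assumed regression data with at least 256 features:

import numpy as np
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=400, n_features=500, n_informative=50,
                       random_state=0)
mask = featureSelection(X, y)
X_reduced = X[:, mask]
print(X_reduced.shape)  # (400, 256): the 256 best-scoring features are kept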
Example No. 28
 def test_generic_univariate_select_float(self):
     model = GenericUnivariateSelect()
     X = np.array(
         [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
         dtype=np.float32,
     )
     y = np.array([0, 1, 0, 1])
     model.fit(X, y)
     model_onnx = convert_sklearn(
         model,
         "generic univariate select",
         [("input", FloatTensorType([None, X.shape[1]]))],
     )
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         X,
         model,
         model_onnx,
         basename="SklearnGenericUnivariateSelect",
         allow_failure="StrictVersion(onnx.__version__)"
         " < StrictVersion('1.2') or "
         "StrictVersion(onnxruntime.__version__)"
         " <= StrictVersion('0.2.1')",
     )
Example No. 29
def test_select_percentile_regression_full():
    # Test whether the relative univariate feature selection
    # selects all features when '100%' is asked.
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=100)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='percentile', param=100).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.ones(20)
    assert_array_equal(support, gtruth)
Example No. 30
    def decode(cls, obj):
        from sklearn.feature_selection import f_classif, f_regression, GenericUnivariateSelect

        new_obj = GenericUnivariateSelect.__new__(GenericUnivariateSelect)
        new_obj.__dict__ = obj['dict']

        if new_obj.score_func == 'f_classif':
            new_obj.score_func = f_classif
        elif new_obj.score_func == 'f_regression':
            new_obj.score_func = f_regression
        else:
            raise ValueError(
                'Unsupported GenericUnivariateSelect.score_func "%s"' %
                new_obj.score_func)

        return new_obj
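The decoder above maps a stored name back to a callable; presumably the matching encoder goes the other way. A sketch of that direction under the same assumption (this is not the project's actual encode method):

def encode_generic_univariate_select(selector):
    # Hypothetical counterpart to decode(): copy the object's state and replace
    # the score_func callable with its name so the dict is serialisable.
    state = dict(selector.__dict__)
    state['score_func'] = selector.score_func.__name__  # 'f_classif' or 'f_regression'
    return {'dict': state}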
Example No. 31
def test_select_kbest_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the k best heuristic
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='k_best', param=5).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example No. 32
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='fwe', param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support[:5], np.ones((5, ), dtype=bool))
    assert_less(np.sum(support[5:] == 1), 2)
Example No. 33
        train.drop(["id", "fault_severity", "location"], axis=1, inplace=True)
        test.drop(["id", "fault_severity", "location"], axis=1, inplace=True)

        train = train.fillna(0)
        train = train.astype(float)

        test = test.fillna(0)
        test = test.astype(float)

        print()
        print(train.shape)
        print(test.shape)

        # ch2 = SelectKBest(chi2, k=550)
        ch2 = GenericUnivariateSelect(score_func=chi2, mode="percentile", param=80)
        train = ch2.fit_transform(train, labels)
        test = ch2.transform(test)

        print(train.shape)
        print(test.shape)

        # print(train.shape)
        # print(test.shape)

        # pca = PCA(n_components=400)
        # print('transforming data')
        # train = pca.fit_transform(train)
        # test = pca.transform(test)
        # print('data transformed')
        # print(train.shape)
Example No. 34
                              ngram_range=(1,3), max_df=1.0, min_df=2, 
                              max_features=None, binary=False, norm=u'l2',
                              use_idf=True, smooth_idf=True, 
                              sublinear_tf=True,
                              tokenizer=rpg.SnowballEnglishStemmer())


tweet_data = rpg.tweet_corpus_maker(raw_tweet_data)  # form usable by sklearn
binary_region = tweet_data[2]    # selecting the East/West binary splits

initial_time = time()
X_tfidf = tfidf_vectz.fit_transform(tweet_data[0])  # tfidf vectorization
tfidf_time = time() - initial_time
print ("TFIDF Vectorization Time: %0.3f" % tfidf_time)

feature_selector = GenericUnivariateSelect(chi2, mode='percentile', param=25)
X_selected = feature_selector.fit_transform(X_tfidf, binary_region)

# This splits data into training/test splits etc.
clf_tests = rpg.make_full_data_test_samples(targets=binary_region, 
                                            vectz_dict = {'tfidfV': X_selected})

# Fits classifiers and prints evaluation metrics.
clf_bench = rpg.test_classifiers(data=clf_tests, clfs=CLASSIFIER_LIST, 
                                 vectz='tfidfV', ssize=len(binary_region), 
                                 scoring=['roc_auc'])

# Prints summary of ROC AUC estimate from 10-fold cross validation, and 
# a number of best features of each classifier.
top_feats = rpg.print_classifier(clfs=clf_bench, vectz=tfidf_vectz, 
                                 feat_sel=feature_selector, num_feats=20)
        ('filter', VarianceThreshold()),
    ])
    object_pipe = Pipeline([
        ('separator', DTypeSelector(key='object')),
        ('encoder', FeatureHasher(input_type='string')),
        ('filter', VarianceThreshold()),
    ])
    number_pipe = Pipeline([
        ('separator', DTypeSelector(key='number')),
        ('filter', VarianceThreshold()),
    ])
    feature_encoder = FeatureUnion(transformer_list=[('number', number_pipe),
                                                     ('datetime', datetime_pipe),
                                                     ('object', object_pipe),
                                                     ])
    feature_selector = GenericUnivariateSelect(mode='fwe', param=0.01)

    train_X_df = get_train_X_df(n_rows_with_caption=train_row_count_limit)
    y = get_train_y_values(n_rows_with_caption=train_row_count_limit)

    print('encoding features')
    X = feature_encoder.fit_transform(train_X_df, y)
    print('{} encoded'.format(X.shape), flush=True)

    print('selecting features')
    X = feature_selector.fit_transform(X, y)
    print('{} selected'.format(X.shape), flush=True)

    print_memory_usage()
    print('gc collecting')
    train_X_df_ref = [train_X_df]