Example No. 1
def test_feature_stacker():
    # basic sanity check for feature stacker
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
            select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
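These scikit-learn test snippets omit their module-level imports. A minimal sketch of the setup this first example assumes (note that RandomizedPCA was deprecated in scikit-learn 0.18 and removed in 0.20; PCA(svd_solver='randomized') is the modern replacement):

from scipy import sparse
from numpy.testing import assert_array_equal, assert_array_almost_equal
from sklearn.datasets import load_iris
from sklearn.decomposition import RandomizedPCA  # removed in 0.20; use PCA(svd_solver='randomized')
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion


def assert_equal(a, b):
    # stand-in for the nose-era assert_equal helper these tests relied on
    assert a == b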
Example No. 2
def test_set_feature_union_steps():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    mult5 = Mult(5)
    mult5.get_feature_names = lambda: ['x5']

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]])))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    # Directly setting attr
    ft.transformer_list = [('m5', mult5)]
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['m5__x5'], ft.get_feature_names())

    # Using set_params
    ft.set_params(transformer_list=[('mock', mult3)])
    assert_array_equal([[3]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x3'], ft.get_feature_names())

    # Using set_params to replace single step
    ft.set_params(mock=mult5)
    assert_array_equal([[5]], ft.transform(np.asarray([[1]])))
    assert_equal(['mock__x5'], ft.get_feature_names())
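Mult is a mock transformer from scikit-learn's pipeline test module, not a public class; the tests attach get_feature_names to each instance by hand. A rough sketch of what they assume it does:

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class Mult(BaseEstimator, TransformerMixin):
    # toy transformer: multiplies its input by a constant factor
    def __init__(self, mult=1):
        self.mult = mult

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.asarray(X) * self.mult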
Example No. 3
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
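TransfT (renamed Transf in later scikit-learn versions, and accompanied by NoTrans in the next two examples) is another test-module mock. Roughly:

from sklearn.base import BaseEstimator


class NoTrans(BaseEstimator):
    # has fit but no transform, so FeatureUnion must raise a TypeError for it
    def fit(self, X, y=None):
        return self


class Transf(NoTrans):
    # identity transform, deliberately without fit_transform, to exercise
    # FeatureUnion's fit-then-transform fallback path
    def transform(self, X):
        return X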
Example No. 4
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Test clone
    fs2 = assert_no_warnings(clone, fs)
    assert_false(fs.transformer_list[0][1] is fs2.transformer_list[0][1])

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))

    # test error if some elements do not support transform
    assert_raises_regex(TypeError,
                        'All estimators should implement fit and '
                        'transform.*\\bNoTrans\\b',
                        FeatureUnion,
                        [("transform", Transf()), ("no_transform", NoTrans())])

    # test that init accepts tuples
    fs = FeatureUnion((("svd", svd), ("select", select)))
    fs.fit(X, y)
Example No. 5
def test_feature_union():
    # basic sanity check for feature union
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert X_transformed.shape == (X.shape[0], 3)

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Test clone
    fs2 = assert_no_warnings(clone, fs)
    assert fs.transformer_list[0][1] is not fs2.transformer_list[0][1]

    # test setting parameters
    fs.set_params(select__k=2)
    assert fs.fit_transform(X, y).shape == (X.shape[0], 4)

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert X_transformed.shape == (X.shape[0], 8)

    # test error if some elements do not support transform
    assert_raises_regex(TypeError,
                        'All estimators should implement fit and '
                        'transform.*\\bNoTrans\\b',
                        FeatureUnion,
                        [("transform", Transf()), ("no_transform", NoTrans())])

    # test that init accepts tuples
    fs = FeatureUnion((("svd", svd), ("select", select)))
    fs.fit(X, y)
Example No. 6
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.pipeline import FeatureUnion


class FeatureUnionDataFrame(TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.fu = FeatureUnion(*args, **kwargs)

    def fit(self, X, y=None, **kwargs):
        self.fu.fit(X, y, **kwargs)
        return self

    def transform(self, X, y=None, **fit_params):
        return pd.DataFrame(self.fu.transform(X), columns=self.fu.get_feature_names())

    def get_feature_names(self):
        return self.fu.get_feature_names()

    def set_params(self, **kwargs):
        self.fu.set_params(**kwargs)
        return self  # scikit-learn expects set_params to return self

    def get_params(self, deep=False):
        return self.fu.get_params(deep=deep)
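A hypothetical usage of this wrapper, with DictVectorizer chosen because it returns a dense array and (in older scikit-learn) supports get_feature_names:

from sklearn.feature_extraction import DictVectorizer

records = [{"a": 1.0, "b": 2.0}, {"a": 3.0, "b": 4.0}]
fu_df = FeatureUnionDataFrame([("dv", DictVectorizer(sparse=False))])
print(fu_df.fit(records).transform(records))  # DataFrame with columns dv__a, dv__b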
Example No. 7
def test_set_feature_union_step_none():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    ft.set_params(m2=None)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_equal(['m3__x3'], ft.get_feature_names())

    ft.set_params(m3=None)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert_equal([], ft.get_feature_names())

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))
Example No. 8
def test_set_feature_union_step_drop(drop):
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    ft.set_params(m2=drop)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_equal(['m3__x3'], ft.get_feature_names())

    ft.set_params(m3=drop)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert_equal([], ft.get_feature_names())

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))

    # Check 'drop' step at construction time
    ft = FeatureUnion([('m2', drop), ('m3', mult3)])
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_equal(['m3__x3'], ft.get_feature_names())
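This variant receives drop as a pytest parameter; in the scikit-learn test suite it is parametrized roughly as follows, covering the then-deprecated None alongside the 'drop' string:

import pytest

@pytest.mark.parametrize("drop", ["drop", None])
def test_set_feature_union_step_drop(drop):
    ...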
Example No. 9
def test_feature_union(weights):
    X = np.ones((10, 5))
    y = np.zeros(10)

    union = FeatureUnion(
        [
            ("tr0", ScalingTransformer()),
            ("tr1", ScalingTransformer()),
            ("tr2", ScalingTransformer()),
        ]
    )

    factors = [(2, 3, 5), (2, 4, 5), (2, 4, 6), (2, 4, None), (None, None, None)]
    params, sols, grid = [], [], []
    for constants, w in product(factors, weights or [None]):
        p = {}
        for n, c in enumerate(constants):
            if c is None:
                p["tr%d" % n] = None
            elif n == 3:  # 3rd is always an estimator
                p["tr%d" % n] = ScalingTransformer(c)
            else:
                p["tr%d__factor" % n] = c
        sol = union.set_params(transformer_weights=w, **p).transform(X)
        sols.append(sol)
        if w is not None:
            p["transformer_weights"] = w
        params.append(p)
        p2 = {"union__" + k: [v] for k, v in p.items()}
        p2["est"] = [CheckXClassifier(sol[0])]
        grid.append(p2)

    # Need to recreate the union after setting estimators to `None` above
    union = FeatureUnion(
        [
            ("tr0", ScalingTransformer()),
            ("tr1", ScalingTransformer()),
            ("tr2", ScalingTransformer()),
        ]
    )

    pipe = Pipeline([("union", union), ("est", CheckXClassifier())])
    gs = dcv.GridSearchCV(pipe, grid, refit=False, cv=2)

    with warnings.catch_warnings(record=True):
        gs.fit(X, y)
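ScalingTransformer and CheckXClassifier are helpers from this test module (dask-ml style; dcv presumably aliases dask-ml's model_selection). A sketch of the transformer that the tr%d__factor parameters above target:

from sklearn.base import BaseEstimator, TransformerMixin


class ScalingTransformer(BaseEstimator, TransformerMixin):
    # sketch of the assumed helper: scales the input by a constant factor
    def __init__(self, factor=1):
        self.factor = factor

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X * self.factor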
Example No. 10
def test_set_feature_union_step_drop(get_names):
    mult2 = Mult(2)
    mult3 = Mult(3)

    if get_names == "get_feature_names":
        mult2.get_feature_names = lambda: ["x2"]
        mult3.get_feature_names = lambda: ["x3"]
    else:  # get_feature_names_out
        mult2.get_feature_names_out = lambda input_features: ["x2"]
        mult3.get_feature_names_out = lambda input_features: ["x3"]

    X = np.asarray([[1]])

    ft = FeatureUnion([("m2", mult2), ("m3", mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_array_equal(["m2__x2", "m3__x3"], getattr(ft, get_names)())

    with pytest.warns(None) as record:
        ft.set_params(m2="drop")
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert_array_equal(["m3__x3"], getattr(ft, get_names)())
    assert not record

    with pytest.warns(None) as record:
        ft.set_params(m3="drop")
        assert_array_equal([[]], ft.fit(X).transform(X))
        assert_array_equal([[]], ft.fit_transform(X))
    assert_array_equal([], getattr(ft, get_names)())
    assert not record

    with pytest.warns(None) as record:
        # check we can change back
        ft.set_params(m3=mult3)
        assert_array_equal([[3]], ft.fit(X).transform(X))
    assert not record

    with pytest.warns(None) as record:
        # Check 'drop' step at construction time
        ft = FeatureUnion([("m2", "drop"), ("m3", mult3)])
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert_array_equal(["m3__x3"], getattr(ft, get_names)())
    assert not record
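Note that pytest.warns(None) was deprecated in pytest 7.0, so under a recent pytest the no-warning assertions used in this and the next example would instead escalate warnings to errors, e.g.:

import warnings

with warnings.catch_warnings():
    warnings.simplefilter("error")  # any warning raised here fails the test
    ft.set_params(m2="drop")
    assert_array_equal([[3]], ft.fit(X).transform(X))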
Example No. 11
def test_set_feature_union_step_drop():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert ['m2__x2', 'm3__x3'] == ft.get_feature_names()

    with pytest.warns(None) as record:
        ft.set_params(m2='drop')
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert ['m3__x3'] == ft.get_feature_names()
    assert not record

    with pytest.warns(None) as record:
        ft.set_params(m3='drop')
        assert_array_equal([[]], ft.fit(X).transform(X))
        assert_array_equal([[]], ft.fit_transform(X))
    assert [] == ft.get_feature_names()
    assert not record

    with pytest.warns(None) as record:
        # check we can change back
        ft.set_params(m3=mult3)
        assert_array_equal([[3]], ft.fit(X).transform(X))
    assert not record

    with pytest.warns(None) as record:
        # Check 'drop' step at construction time
        ft = FeatureUnion([('m2', 'drop'), ('m3', mult3)])
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert ['m3__x3'] == ft.get_feature_names()
    assert not record
Example No. 12
class MCPTFeatureEvaluator(BaseEstimator, SelectorMixin):
    def __init__(self,
                 convert_datetime=False,
                 selector_recipe='univariate_unbiased',
                 impute=False,
                 verbose=False,
                 copy=True):
        assert selector_recipe in \
            ['univariate_unbiased',
             'univariate_cscv'], "Provided 'selector_recipe' is not valid!"
        self.convert_datetime = convert_datetime
        self.selector_recipe = selector_recipe
        self.impute = impute
        self.verbose = verbose
        self.copy = copy

    def fit(self,
            X,
            y,
            cols_to_discrete=None,
            method='discrete',
            measure='mi',
            n_bins_x=5,
            n_bins_y=5,
            n_reps=100,
            cscv_folds=None,
            target='cpu'):
        # make sure n_reps is valid
        assert isinstance(n_reps, int) and n_reps >= 0, \
            "n_reps must be an integer greater than or equal to 0."
        if n_reps == 0:
            warnings.warn("n_reps=0 ... selector_recipe is not applicable.",
                          RuntimeWarning)

        # make sure that a DataFrame has been passed in
        assert isinstance(X, pd.DataFrame)
        # if y is a pandas Series or a numpy array then make
        # sure it is the same length as the DataFrame
        if isinstance(y, pd.Series) or isinstance(y, np.ndarray):
            assert len(y) == X.shape[0]
            if isinstance(y, np.ndarray):
                # if it is a numpy array then need to make it into a series
                # and assign a name to that series
                y_name = 'y_auto'
                y = pd.Series(y, name='y_auto')
            else:
                # capture the name from the y pandas Series
                y_name = y.name
            # Capture original predictor columns prior to adding y into X
            self.original_predictor_columns = X.columns
            # add y into X so that it can be pre-processed correctly
            X = X.assign(y_auto=y).rename(columns={'y_auto': y_name})
        else:
            # make sure that 'y' is a column in X
            assert y in X.columns, "The target column 'y' does not exist in 'X'!"
            y_name = y
            # Capture the original predictor columns without removing the target
            self.original_predictor_columns = X.columns

        # Convenience check:
        # Remove any columns that might have a single value as information
        # calculations will not really be accurate
        for col in X.columns:
            if X[col].nunique() == 1:
                if col == y_name:
                    raise RuntimeError(
                        "The target variable has an insufficient " +
                        "number of unique values!")
                X = X.drop([col], axis=1)
                warnings.warn(
                    "Removing column '" + col + "' due to non-unique values",
                    RuntimeWarning)

        # convert any columns to discrete that are indicated in the fit method
        if cols_to_discrete is not None:
            try:
                X[cols_to_discrete] = X[cols_to_discrete].astype(object)
            except KeyError:
                cols_error = list(set(cols_to_discrete) - set(X.columns))
                raise KeyError("The DataFrame does not " +
                               "include the following " +
                               "columns to discretize: %s" % cols_error)

        t0 = time()
        # Capture the column names in the DataFrame. These will be used
        # later in the presentation of the results
        self.numeric_cols = TypeSelector(np.number, True).fit_transform(X)
        self.bool_cols = TypeSelector("bool", True).fit_transform(X)
        self.category_cols = TypeSelector("category", True).fit_transform(X)
        self.object_cols = TypeSelector("object", True).fit_transform(X)
        self.datetime_cols = TypeSelector("datetime", True).fit_transform(X)
        self.preprocessed_columns = []
        self.column_type_mask = []

        if self.verbose:
            t1 = time()
            print("Type selector time: {0:.3g} sec".format(t1 - t0))

        t2 = time()
        # Generate the feature union with all the possible different dtypes
        if self.impute:
            self.preprocess_features = \
                FeatureUnion(transformer_list=[
                    ("numeric_features", make_pipeline(
                        TypeSelector(np.number),
                        Imputer(strategy="median")
                    )),
                    ("boolean_features", make_pipeline(
                        TypeSelector("bool"),
                        Imputer(strategy="most_frequent")
                    )),
                    ("categorical_features", make_pipeline(
                        TypeSelector("category"),
                        MostFrequentImputer(),
                        MultiColumnLabelEncoder(self.category_cols)
                    )),
                    ("object_features", make_pipeline(
                        TypeSelector("object"),
                        MostFrequentImputer(),
                        MultiColumnLabelEncoder(self.object_cols)
                    )),
                    ("datetime_features", make_pipeline(
                        TypeSelector("datetime"),
                        Imputer(strategy="most_frequent")
                    ))
                ])
        else:
            # don't impute missing values (less execution time)
            self.preprocess_features = \
                FeatureUnion(transformer_list=[
                    ("numeric_features", make_pipeline(
                        TypeSelector(np.number)
                    )),
                    ("boolean_features", make_pipeline(
                        TypeSelector("bool")
                    )),
                    ("categorical_features", make_pipeline(
                        TypeSelector("category"),
                        MultiColumnLabelEncoder(self.category_cols)
                    )),
                    ("object_features", make_pipeline(
                        TypeSelector("object"),
                        MultiColumnLabelEncoder(self.object_cols)
                    )),
                    ("datetime_features", make_pipeline(
                        TypeSelector("datetime")
                    ))
                ])

        # If some of the dtypes are not present in the data set
        # and the feature union is run as-is, an error will be thrown.
        # As a result, those feature pipelines without any dtypes
        # need to be set to None prior to the FeatureUnion being executed
        # If they are present, then they need to be added to the
        # list of all columns being evaluated
        #
        # In addition, the categorical/object columns need to be tracked
        # so that the proper MI algorithm can be used in the calculation
        # (cont/cont, discrete/cont, discrete/discrete)
        #
        # Numerics
        if len(self.numeric_cols) == 0:
            self.preprocess_features.set_params(numeric_features=None)
        else:
            self.preprocessed_columns += list(self.numeric_cols)
            for i in range(len(self.numeric_cols)):
                self.column_type_mask += ['numeric']
        # Booleans
        if len(self.bool_cols) == 0:
            self.preprocess_features.set_params(boolean_features=None)
        else:
            self.preprocessed_columns += list(self.bool_cols)
            for i in range(len(self.bool_cols)):
                self.column_type_mask += ['discrete']
        # Categorical
        if len(self.category_cols) == 0:
            self.preprocess_features.set_params(categorical_features=None)
        else:
            self.preprocessed_columns += list(self.category_cols)
            for i in range(len(self.category_cols)):
                self.column_type_mask += ['discrete']
        # Object
        if len(self.object_cols) == 0:
            self.preprocess_features.set_params(object_features=None)
        else:
            self.preprocessed_columns += list(self.object_cols)
            for i in range(len(self.object_cols)):
                self.column_type_mask += ['discrete']
        # Datetime
        if len(self.datetime_cols) == 0:
            self.preprocess_features.set_params(datetime_features=None)
        else:
            self.preprocessed_columns += list(self.datetime_cols)
            for i in range(len(self.datetime_cols)):
                self.column_type_mask += ['datetime']

        X = self.preprocess_features.fit_transform(X)

        if self.verbose:
            t3 = time()
            print("Preprocess time: {0:.3g} sec".format(t3 - t2))

        # Convert back to a DataFrame so we can properly extract the target
        # variable. There may be a better way to do this, but we wanted to
        # preprocess the target variable inline with all the other variables.
        X = pd.DataFrame(data=X, columns=self.preprocessed_columns)
        # extract y series from X after preprocessing, determine if it is
        # discrete or continuous, and remove from X
        y = X[y_name]
        X = X.drop([y_name], axis=1)

        # Derive 'target_is_discrete'
        target_is_discrete = (y_name in self.category_cols) or \
                             (y_name in self.object_cols)
        # Adjust 'self.column_type_mask' and 'self.columns' given that
        # 'y' was just separated out from 'X'
        self.preprocessed_columns = np.array(self.preprocessed_columns)
        self.column_type_mask = np.array(self.column_type_mask)
        predictor_idx = np.where(self.preprocessed_columns != y_name)[0]
        self.column_type_mask = self.column_type_mask[predictor_idx]
        self.preprocessed_columns = self.preprocessed_columns[predictor_idx]

        # Next steps ... run the MCPT
        self.method = method
        self.measure = measure
        self.target = target
        self.n_bins_x = n_bins_x
        self.n_bins_y = n_bins_y
        self.n_mcpt_reps = n_reps
        if cscv_folds is None and ('cscv' in self.selector_recipe):
            self.n_cscv_folds = 4
            warnings.warn(
                "No value provided for 'cscv_folds'. "
                "Setting to 4 CSCV folds by default.", RuntimeWarning)
        else:
            self.n_cscv_folds = cscv_folds

        kwargs = {'method': self.method, 'measure': self.measure,
                  'target_is_discrete': target_is_discrete,
                  'column_type_mask': self.column_type_mask,
                  'n_bins_x': self.n_bins_x, 'n_bins_y': self.n_bins_y,
                  'n_reps': self.n_mcpt_reps, 'cscv_folds': self.n_cscv_folds,
                  'target': self.target, 'verbose': self.verbose}

        if self.selector_recipe in ['univariate_unbiased', 'univariate_cscv']:
            info_matrix = univariate.screen_univariate(X.values, y.values,
                                                       **kwargs)
        else:
            raise ValueError

        var_series = pd.Series(self.preprocessed_columns, name='Variables')
        if self.measure == 'mi':
            measure_name = 'MI'
        else:
            measure_name = 'UR'
        if self.n_mcpt_reps > 0:
            col_names = [measure_name, 'Solo p-value', 'Unbiased p-value']
        else:
            col_names = [measure_name]
        if self.n_cscv_folds is not None:
            col_names += ['P(<=median)']

        self.information = pd.DataFrame(info_matrix, columns=col_names)
        self.information.insert(0, 'Variable', var_series)

        self.information = self.information.sort_values(by=measure_name,
                                                        ascending=False)
        self.information = self.information.reset_index(drop=True)

        return self

    def _get_support_mask(self):
        check_is_fitted(self, 'information')

        orig_cols = self.original_predictor_columns

        if self.n_mcpt_reps > 0:
            if self.selector_recipe == "univariate_unbiased":
                # Get the variable names
                variable_mask = self.information['Unbiased p-value'] < 0.05
                selected = self.information['Variable'][variable_mask].values
            elif self.selector_recipe == "univariate_cscv":
                variable_mask_1 = self.information['P(<=median)'] <= 0.2
                variable_mask_2 = self.information['Unbiased p-value'] < 0.05
                variable_mask = variable_mask_1.values & variable_mask_2.values
                selected = self.information['Variable'][variable_mask].values
            else:
                raise ValueError
            mask = np.array([col in selected for col in orig_cols])
        else:
            # self.n_mcpt_reps = 0 .. return all original columns
            mask = np.repeat(True, len(orig_cols))

        return mask
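A hypothetical end-to-end run of this selector (X_df a pandas DataFrame that already contains the target column 'target'; both names are illustrative):

selector = MCPTFeatureEvaluator(selector_recipe='univariate_unbiased')
selector.fit(X_df, 'target', n_reps=100)  # 'target' must be a column of X_df
print(selector.information.head())        # MI scores with solo/unbiased p-values
mask = selector.get_support()             # SelectorMixin: boolean mask over original columns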
Example No. 13
# For training data, must remove the empty lines to ensure accurate training
(X_train, raw_train, y_train) = preprocessing_removeEmpty(raw_train, y_train)
print('Number of lines training set: ' + str(len(X_train)))
(X_test, raw_test, y_test) = preprocessing_removeEmpty(raw_test, y_test)
print('Number of lines testing set: ' + str(len(X_test)))

unigram_vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=1)
temp_uni_tfidf = unigram_vectorizer.fit_transform(X_train).toarray()
# n_features = len(unigram_vectorizer.get_feature_names())
n_features = 100
multigrams_vectorizer = TfidfVectorizer(ngram_range=(2, 3),
                                        min_df=2,
                                        max_features=n_features)
comb_vectorizer = FeatureUnion([("uni_vec", unigram_vectorizer),
                                ("multi_vec", multigrams_vectorizer)])
comb_vectorizer.set_params(multi_vec=None)
X_train_tfidf = comb_vectorizer.fit_transform(X_train).toarray()

feature_names = comb_vectorizer.get_feature_names()
print("num_features: " + str(len(feature_names)))
# print(feature_names[:50])
print("features extracted & tfidf transformed.")
# Transform documents to document-term matrix. (.transform) - No learning involved as it is test data
# For test data
# X_test_tfidf = vectorizer.transform(X_test).toarray()
X_test_tfidf = comb_vectorizer.transform(X_test).toarray()

# model = svm.SVC(kernel="linear",C=100,cache_size=5000,probability=True)
print('Creating Model...')
model = svm.LinearSVC(C=10, multi_class='ovr')
# model = svm.SVC(kernel="linear",C=10,decision_function_shape='ovr',cache_size=5000,probability=True)
Example No. 14
def extract_features(X,
                     sfreq,
                     selected_funcs,
                     funcs_params=None,
                     n_jobs=1,
                     return_as_df=False):
    """ Extraction of temporal or spectral features from epoched EEG signals.

    Parameters
    ----------
    X : ndarray, shape (n_epochs, n_channels, n_times)
        Array of epoched EEG data.

    sfreq : float
        Sampling rate of the data.

    selected_funcs : list of str
        The elements of `selected_funcs` are aliases for the feature
        functions which will be used to extract features from the data.
        (See the `mne_features` documentation for a complete list of
        available feature functions.)

    funcs_params : dict or None (default: None)
        If not None, dict of optional parameters to be passed to the feature
        functions. Each key of the `funcs_params` dict should be of the form
        `[alias_feature_function]__[optional_param]` (for example:
        `higuchi_fd__kmax`).

    n_jobs : int (default: 1)
        Number of CPU cores used when parallelizing the feature extraction.
        If given a value of -1, all cores are used.

    return_as_df : bool (default: False)
        If True, the extracted features will be returned as a Pandas DataFrame.
        The column index is a MultiIndex (see `pd.MultiIndex`) which contains
        the alias of each feature function which was used. If False, the
        features are returned as a 2d Numpy array.

    Returns
    -------
    array-like, shape (n_epochs, n_features)
    """
    if sfreq <= 0:
        raise ValueError('Sampling rate `sfreq` must be positive.')
    univariate_funcs = get_univariate_funcs(sfreq)
    bivariate_funcs = get_bivariate_funcs(sfreq)
    feature_funcs = univariate_funcs.copy()
    feature_funcs.update(bivariate_funcs)
    sel_funcs = _check_func_names(selected_funcs, feature_funcs.keys())

    # Feature extraction
    n_epochs = X.shape[0]
    _tr = [(n, FeatureFunctionTransformer(func=feature_funcs[n]))
           for n in sel_funcs]
    extractor = FeatureUnion(transformer_list=_tr)
    if funcs_params is not None:
        extractor.set_params(**funcs_params)
    res = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(_apply_extractor)(extractor, X[j, :, :])
        for j in range(n_epochs))
    Xnew = np.vstack(res)
    if return_as_df:
        return _format_as_dataframe(Xnew, extractor.get_feature_names())
    else:
        return Xnew
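A hedged usage sketch, assuming random data in the expected (n_epochs, n_channels, n_times) layout and aliases that exist in mne-features:

import numpy as np

rng = np.random.RandomState(42)
data = rng.randn(10, 4, 256)  # 10 epochs, 4 channels, 256 time samples
feats = extract_features(data, sfreq=128., selected_funcs=['mean', 'ptp_amp'])
print(feats.shape)            # (10, n_features)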
Example No. 15
def extract_features(X,
                     sfreq,
                     selected_funcs,
                     funcs_params=None,
                     n_jobs=1,
                     ch_names=None,
                     return_as_df=False):
    """Extraction of temporal or spectral features from epoched EEG signals.

    Parameters
    ----------
    X : ndarray, shape (n_epochs, n_channels, n_times)
        Array of epoched EEG data.

    sfreq : float
        Sampling rate of the data.

    selected_funcs : list of str or tuples
        The elements of ``selected_funcs`` are either strings or tuples of
        the form ``(str, callable)``. If an element is of type ``str``, it is
        the alias of a feature function. The aliases are built from the
        feature functions' names by removing ``compute_``. For instance, the
        alias of the feature function :func:`compute_ptp_amp` is ``ptp_amp``.
        (See the documentation of mne-features). If an element is of type
        ``tuple``, the first element of the tuple should be a string
        (name/alias given to a user-defined feature function) and the second
        element should be a callable (a user-defined feature function which
        accepts Numpy arrays with shape ``(n_channels, n_times)``). The
        names/aliases given to user-defined feature functions should not
        intersect the aliases used by mne-features. If the name given to a
        user-defined feature function is already used as an alias in
        mne-features, an error will be raised.

    funcs_params : dict or None (default: None)
        If not None, dict of optional parameters to be passed to the feature
        functions. Each key of the ``funcs_params`` dict should be of the form:
        ``[alias_feature_function]__[optional_param]`` (for example:
        ``higuchi_fd__kmax``).

    n_jobs : int (default: 1)
        Number of CPU cores used when parallelizing the feature extraction.
        If given a value of -1, all cores are used.

    ch_names : list of str or None (default: None)
        If not None, list containing the names of each input channel.

    return_as_df : bool (default: False)
        If True, the extracted features will be returned as a Pandas DataFrame.
        The column index is a MultiIndex (see :class:`~pandas.MultiIndex`)
        which contains the alias of each feature function which was used.
        If False, the features are returned as a 2d Numpy array.

    Returns
    -------
    array-like, shape (n_epochs, n_features)
    """
    if sfreq <= 0:
        raise ValueError('Sampling rate `sfreq` must be positive.')
    univariate_funcs = get_univariate_funcs(sfreq)
    bivariate_funcs = get_bivariate_funcs(sfreq)
    feature_funcs = univariate_funcs.copy()
    feature_funcs.update(bivariate_funcs)
    sel_funcs = _check_funcs(selected_funcs, feature_funcs)

    if ch_names is not None and len(ch_names) != X.shape[1]:
        raise ValueError('`ch_names` should be of length %s.' % X.shape[1])

    # Feature extraction
    n_epochs = X.shape[0]
    _tr = [(n, FeatureFunctionTransformer(func=func)) for n, func in sel_funcs]
    extractor = FeatureUnion(transformer_list=_tr)
    if funcs_params is not None:
        extractor.set_params(**funcs_params)
    res = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(_apply_extractor)(extractor, X[j, :, :], ch_names,
                                         return_as_df)
        for j in range(n_epochs))
    feature_names = res[0][1]
    res = list(zip(*res))[0]
    Xnew = np.vstack(res)
    if return_as_df:
        return _format_as_dataframe(Xnew, feature_names)
    else:
        return Xnew
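Per the funcs_params convention documented above, nested parameters are keyed like FeatureUnion's alias__param names; for example, reusing the data array from the previous sketch:

feats = extract_features(data, sfreq=128.,
                         selected_funcs=['higuchi_fd'],
                         funcs_params={'higuchi_fd__kmax': 5})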
Example No. 16
kf = KFold(len(X_trainDF), 5, shuffle=True, random_state=55)

while compteur < Iteration:
    print(compteur)
    C = 10**(uniform(-6,-2))
    p = uniform(3,6)
    npca = randrange(5, 30)
    
    which_feature = {k:int(proba.random()) for k in Feature.transformer_weights.keys()}
    which_feature['HOGFeature'] = 1
    which_feature['SobelFeature'] = 1
    Feature.transformer_weights = which_feature
    param = {'SobelFeature__PCA__n_components':npca,
             'RawImage__PCA__n_components':npca,
             'HOGFeature__PCA__n_components':npca}
    Feature.set_params(**param)
    
    scores = []; rocauctr = []; rocaucval = []
    print('Starting cross-validation')
    for train_index, val_index in kf:
        X_trDF, X_valDF = X_trainDF.iloc[train_index], X_trainDF.iloc[val_index]
        y_trDF, y_valDF = y_trainDF.iloc[train_index], y_trainDF.iloc[val_index]
        
        X_tr = Feature.fit_transform(X_trDF)
        y_tr = np.array(y_trDF)[:,np.newaxis]
        
        X_val = Feature.transform(X_valDF)
        y_val = np.array(y_valDF)[:,np.newaxis]
    
        model = LogisticRegression(penalty='l2', C=C,
                                   class_weight={0: 1, 1: p})