def test_different_pipelines():
    """Check a modular pipeline against RandomIntervalFeatureExtractor.

    A RandomIntervalSegmenter followed by a FeatureUnion of row-wise
    mean / std / slope transformers is fitted with the same seed as a single
    RandomIntervalFeatureExtractor configured with the same three functions.
    Both the transformed output and the fitted ``intervals_`` must be equal.
    """
    seed = 1233
    X_train, _ = load_gunpoint(return_X_y=True)

    segmenter = RandomIntervalSegmenter(n_intervals='sqrt')

    # One row-wise transformer per summary function, listed in the same order
    # as the ``features`` argument given to the extractor below.
    named_funcs = [
        ('mean', np.mean),
        ('std', np.std),
        ('slope', time_series_slope),
    ]
    union = FeatureUnion([
        (label,
         RowwiseTransformer(FunctionTransformer(func=func, validate=False)))
        for label, func in named_funcs
    ])

    pipe = Pipeline([('segment', segmenter), ('transform', union)],
                    random_state=seed)
    from_pipeline = pipe.fit_transform(X_train)

    extractor = RandomIntervalFeatureExtractor(
        n_intervals='sqrt',
        features=[np.mean, np.std, time_series_slope],
        random_state=seed)
    from_extractor = extractor.fit_transform(X_train)

    np.testing.assert_array_equal(from_pipeline, from_extractor)
    # The segmenter is the first pipeline step; it must have drawn the same
    # intervals as the extractor.
    np.testing.assert_array_equal(pipe.steps[0][1].intervals_,
                                  extractor.intervals_)
def test_output_format_dim(n_instances, len_series, n_intervals, features):
    """Check type, row count and values of the extractor output.

    The input is built from an all-ones array of length ``len_series`` with
    ``n_instances`` rows and a single column. The transformed result must be
    a pandas DataFrame with as many rows as the input and must contain only
    ones.

    Parameters
    ----------
    n_instances : number of rows requested from ``generate_df_from_array``.
    len_series : length of the all-ones array used as input.
    n_intervals : forwarded to ``RandomIntervalFeatureExtractor``.
    features : forwarded to ``RandomIntervalFeatureExtractor``.
    """
    constant_series = np.ones(len_series)
    X = generate_df_from_array(constant_series, n_rows=n_instances, n_cols=1)
    expected_rows = X.shape[0]

    extractor = RandomIntervalFeatureExtractor(n_intervals=n_intervals,
                                               features=features)
    Xt = extractor.fit_transform(X)

    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == expected_rows
    # Every extracted value is compared against 1.
    assert np.array_equal(Xt.values, np.ones(Xt.shape))
def test_results(n_instances, len_series, n_intervals):
    """Check extracted values against direct computation on each interval.

    A random series ``x`` is passed to ``generate_df_from_array`` to build the
    input. For each fitted interval ``(s, e)``, the output columns whose name
    contains ``_{s}_{e}_<feature>`` are compared with the feature function
    applied to ``x[s:e]``.

    Parameters
    ----------
    n_instances : number of rows requested from ``generate_df_from_array``.
    len_series : length of the random series.
    n_intervals : forwarded to ``RandomIntervalFeatureExtractor``.
    """
    x = np.random.normal(size=len_series)
    X = generate_df_from_array(x, n_rows=n_instances, n_cols=1)
    extractor = RandomIntervalFeatureExtractor(
        n_intervals=n_intervals,
        features=[np.mean, np.std, time_series_slope])
    Xt = extractor.fit_transform(X)

    # Check results
    for s, e in extractor.intervals_:
        window = x[s:e]
        checks = (
            (f'_{s}_{e}_mean', np.mean),
            (f'_{s}_{e}_std', np.std),
            (f'_{s}_{e}_time_series_slope', time_series_slope),
        )
        for pattern, func in checks:
            assert np.all(Xt.filter(like=pattern) == func(window))
def test_pipeline_predictions(n_intervals, n_estimators):
    """Check that two equivalent classification pipelines predict the same.

    The first pipeline chains a RandomIntervalSegmenter, a FeatureUnion of
    row-wise mean / std / slope transformers and a DecisionTreeClassifier.
    The second replaces the first two steps with a single
    RandomIntervalFeatureExtractor using the same functions. Both are built
    with the same ``random_state`` and their predictions must be identical.

    Parameters
    ----------
    n_intervals : forwarded to the segmenter and to the feature extractor.
    n_estimators : not used in the body; kept so the signature stays
        compatible with however the test is parametrised.

    Notes
    -----
    ``X_train``, ``y_train`` and ``X_test`` are not defined in this function;
    presumably they are module-level data (confirm against the full file).
    """
    random_state = 1234

    # Due to tie-breaking/floating point rounding in the final decision tree
    # classifier, the results depend on the exact column order of the input
    # data.

    # Compare pipeline predictions outside of ensemble.
    steps = [('segment', RandomIntervalSegmenter(n_intervals=n_intervals)),
             ('transform',
              FeatureUnion([('mean',
                             RowwiseTransformer(
                                 FunctionTransformer(func=np.mean,
                                                     validate=False))),
                            ('std',
                             RowwiseTransformer(
                                 FunctionTransformer(func=np.std,
                                                     validate=False))),
                            ('slope',
                             RowwiseTransformer(
                                 FunctionTransformer(func=time_series_slope,
                                                     validate=False)))])),
             ('clf', DecisionTreeClassifier())]
    clf1 = Pipeline(steps, random_state=random_state)
    clf1.fit(X_train, y_train)
    a = clf1.predict(X_test)

    steps = [('transform',
              RandomIntervalFeatureExtractor(
                  n_intervals=n_intervals,
                  features=[np.mean, np.std, time_series_slope])),
             ('clf', DecisionTreeClassifier())]
    clf2 = Pipeline(steps, random_state=random_state)
    clf2.fit(X_train, y_train)
    b = clf2.predict(X_test)

    # np.array_equal only returns a bool; without an assertion the comparison
    # result is discarded and the test can never fail.
    np.testing.assert_array_equal(a, b)
def test_TimeSeriesForest_predictions(n_estimators, n_intervals):
    """Check explicit vs. default base estimator of TimeSeriesForestClassifier.

    One forest is given an explicit pipeline (RandomIntervalFeatureExtractor
    with mean / std / slope, then a DecisionTreeClassifier) as its base
    estimator. The other is built without a base estimator and only has
    ``n_intervals`` set on its ``transform`` step. With the same
    ``random_state`` and ``n_estimators``, both must return identical
    ``predict_proba`` output.

    Parameters
    ----------
    n_estimators : forwarded to both forests.
    n_intervals : number-of-intervals setting applied to both forests.

    Notes
    -----
    ``X_train``, ``y_train`` and ``X_test`` are not defined in this function;
    presumably they are module-level data (confirm against the full file).
    A fully modular base estimator (segmenter + FeatureUnion of row-wise
    mean / std / slope + decision tree) is not exercised by this test.
    """
    seed = 1234

    # Forest with an explicitly constructed base estimator.
    extractor = RandomIntervalFeatureExtractor(
        n_intervals=n_intervals,
        features=[np.mean, np.std, time_series_slope])
    explicit_base = Pipeline([('transform', extractor),
                              ('clf', DecisionTreeClassifier())])
    explicit_forest = TimeSeriesForestClassifier(
        base_estimator=explicit_base,
        random_state=seed,
        n_estimators=n_estimators)
    explicit_forest.fit(X_train, y_train)
    explicit_proba = explicit_forest.predict_proba(X_test)

    # Forest relying on its default base estimator; only the interval setting
    # of the 'transform' step is overridden.
    default_forest = TimeSeriesForestClassifier(random_state=seed,
                                                n_estimators=n_estimators)
    default_forest.set_params(
        base_estimator__transform__n_intervals=n_intervals)
    default_forest.fit(X_train, y_train)
    default_proba = default_forest.predict_proba(X_test)

    np.testing.assert_array_equal(explicit_proba, default_proba)
def test_different_implementations():
    """Check chained transformers against RandomIntervalFeatureExtractor.

    A seeded RandomIntervalSegmenter followed by a row-wise ``np.mean``
    transformer must give the same output as a RandomIntervalFeatureExtractor
    with ``features=[np.mean]`` and the same seed.
    """
    seed = 1233
    X_train, _ = load_gunpoint(return_X_y=True)

    # Compare with chained transformations.
    segmenter = RandomIntervalSegmenter(n_intervals='sqrt', random_state=seed)
    rowwise_mean = RowwiseTransformer(
        FunctionTransformer(func=np.mean, validate=False))
    segments = segmenter.fit_transform(X_train)
    chained = rowwise_mean.fit_transform(segments)

    extractor = RandomIntervalFeatureExtractor(n_intervals='sqrt',
                                               features=[np.mean],
                                               random_state=seed)
    direct = extractor.fit_transform(X_train)

    np.testing.assert_array_equal(chained, direct)
def test_random_state():
    """Check that a fixed ``random_state`` gives repeatable output.

    A fresh RandomIntervalFeatureExtractor with ``n_intervals='random'`` and
    the same seed is fitted repeatedly on the same input; every result must
    equal the first one.
    """
    X = generate_df_from_array(np.random.normal(size=20))
    seed = 1234

    def _fit_fresh_extractor():
        # Build and fit a new, identically seeded extractor each time.
        extractor = RandomIntervalFeatureExtractor(n_intervals='random',
                                                   random_state=seed)
        return extractor.fit_transform(X)

    reference = _fit_fresh_extractor()
    n_repeats = 10
    for _ in range(n_repeats):
        assert reference.equals(_fit_fresh_extractor())
# Example #8
# 0
def test_Pipeline_random_state():
    """Check how ``random_state`` is handled by Pipeline and its steps.

    Three things are verified:

    1. A pipeline built without ``random_state`` reports ``None`` for itself
       and for each of its steps.
    2. After ``set_params(random_state=...)`` on the pipeline, the pipeline
       and each of its steps report that value.
    3. Repeatedly building and fitting a seeded pipeline from the same step
       list gives identical predictions on the test split.
    """
    pipe = Pipeline([
        ('transform', RandomIntervalFeatureExtractor(features=[np.mean])),
        ('clf', DecisionTreeClassifier()),
    ])

    # Check that pipe is initiated without random_state
    assert pipe.random_state is None
    assert pipe.get_params()['random_state'] is None

    # Check that all components are initiated without random_state
    for _, estimator in pipe.steps:
        assert estimator.random_state is None
        assert estimator.get_params()['random_state'] is None

    # Check that if random state is set, it's set to itself and all its
    # random components
    rs = 1234
    pipe.set_params(random_state=rs)

    assert pipe.random_state == rs
    assert pipe.get_params()['random_state'] == rs

    for _, estimator in pipe.steps:
        assert estimator.random_state == rs
        assert estimator.get_params()['random_state'] == rs

    # Check specific results
    X_train, y_train = load_gunpoint(return_X_y=True)
    X_test, _ = load_gunpoint("TEST", return_X_y=True)

    # The same step list (and therefore the same estimator objects) is reused
    # for every pipeline built below.
    steps = [
        ('segment', RandomIntervalSegmenter(n_intervals=3)),
        ('extract',
         RowwiseTransformer(FunctionTransformer(func=np.mean,
                                                validate=False))),
        ('clf', DecisionTreeClassifier()),
    ]

    def _fit_and_predict():
        # Build a seeded pipeline from the shared steps, fit, and predict.
        model = Pipeline(steps, random_state=rs)
        model.fit(X_train, y_train)
        return model.predict(X_test)

    reference = _fit_and_predict()
    n_repeats = 10
    for _ in range(n_repeats):
        np.testing.assert_array_equal(reference, _fit_and_predict())
# Example #9
# 0
    def __init__(self,
                 base_estimator=None,
                 n_estimators=500,
                 criterion='mse',
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features=None,
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.,
                 min_impurity_split=None,
                 bootstrap=False,
                 oob_score=False,
                 n_jobs=None,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 check_input=True):
        """Set up the forest and its pipeline base estimator.

        When ``base_estimator`` is None, a default pipeline is built: a
        RandomIntervalFeatureExtractor (``n_intervals='sqrt'``, features
        mean / std / slope) followed by a DecisionTreeRegressor. Otherwise
        the given estimator must be a Pipeline whose last step is a
        DecisionTreeRegressor.

        The tree parameters (``criterion`` through ``min_impurity_split``)
        are stored on ``self`` twice: under their plain names and under
        ``<last step name>__<param>``. Only the prefixed names are passed to
        the parent constructor as ``estimator_params``.

        Note that ``set_params`` is called on ``base_estimator`` itself, with
        ``random_state`` and ``check_input=False``, so a pipeline passed in
        by the caller is modified. The ``check_input`` argument is only
        stored on ``self``.

        Raises
        ------
        ValueError
            If ``base_estimator`` is not a Pipeline, or its last step is not
            a DecisionTreeRegressor.
        """
        if base_estimator is None:
            default_extractor = RandomIntervalFeatureExtractor(
                n_intervals='sqrt',
                features=[np.mean, np.std, time_series_slope])
            base_estimator = Pipeline([('transform', default_extractor),
                                       ('clf', DecisionTreeRegressor())])
        elif not isinstance(base_estimator, Pipeline):
            raise ValueError(
                'Base estimator must be pipeline with transforms.')
        elif not isinstance(base_estimator.steps[-1][1],
                            DecisionTreeRegressor):
            raise ValueError(
                'Last step in base estimator pipeline must be DecisionTreeRegressor.'
            )

        tree_params = {
            "criterion": criterion,
            "max_depth": max_depth,
            "min_samples_split": min_samples_split,
            "min_samples_leaf": min_samples_leaf,
            "min_weight_fraction_leaf": min_weight_fraction_leaf,
            "max_features": max_features,
            "max_leaf_nodes": max_leaf_nodes,
            "min_impurity_decrease": min_impurity_decrease,
            "min_impurity_split": min_impurity_split,
        }

        # Assign values, even though passed on to base estimator below,
        # necessary here for cloning.
        for plain_name, value in tree_params.items():
            setattr(self, plain_name, value)

        # Rename estimator params according to name in pipeline.
        final_step_name = base_estimator.steps[-1][0]
        prefixed_params = {
            f'{final_step_name}__{plain_name}': value
            for plain_name, value in tree_params.items()
        }

        # Pass on params.
        super(TimeSeriesForestRegressor, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            estimator_params=tuple(prefixed_params),
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
        )

        # Assign random state to pipeline.
        base_estimator.set_params(random_state=random_state,
                                  check_input=False)

        # Store renamed estimator params.
        for prefixed_name, value in prefixed_params.items():
            setattr(self, prefixed_name, value)
        self.check_input = check_input
def test_bad_features(bad_features):
    """Check that invalid ``features`` arguments raise ``ValueError``.

    Parameters
    ----------
    bad_features : an invalid value for the ``features`` argument of
        ``RandomIntervalFeatureExtractor``.
    """
    X, _ = load_gunpoint(return_X_y=True)
    with pytest.raises(ValueError):
        # The value must go in as ``features``; passing it as ``n_intervals``
        # would test interval validation instead. Construction, fit and
        # transform all sit inside the context manager so the error is
        # caught at whichever stage validation happens.
        RandomIntervalFeatureExtractor(features=bad_features).fit_transform(X)