def test_row_transformer_function_transformer_series_to_primitives():
    X, y = load_gunpoint(return_X_y=True)
    ft = FunctionTransformer(func=np.mean, validate=False)
    t = SeriesToPrimitivesRowTransformer(ft, check_transformer=False)
    Xt = t.fit_transform(X, y)
    assert Xt.shape == X.shape
    assert isinstance(Xt.iloc[0, 0], float)  # check series-to-primitive transforms
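# A toy sketch of what the series-to-primitives transform above does: in a
# nested DataFrame each cell holds a pd.Series, and the transform maps every
# cell to a scalar. Plain pandas/numpy only; assumes np/pd are imported at
# module level, as elsewhere in this file.
def _sketch_series_to_primitives():
    nested = pd.DataFrame({"dim_0": [pd.Series([1.0, 2.0, 3.0])]})
    primitives = nested.applymap(np.mean)  # each nested series -> one float
    assert primitives.iloc[0, 0] == 2.0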
def test_different_implementations():
    random_state = 1233
    X_train, y_train = make_classification_problem()

    # Compare with chained transformations.
    tran1 = RandomIntervalSegmenter(n_intervals=1, random_state=random_state)
    tran2 = SeriesToPrimitivesRowTransformer(
        FunctionTransformer(func=np.mean, validate=False),
        check_transformer=False,
    )
    A = tran2.fit_transform(tran1.fit_transform(X_train))

    tran = RandomIntervalFeatureExtractor(
        n_intervals=1, features=[np.mean], random_state=random_state
    )
    B = tran.fit_transform(X_train)

    np.testing.assert_array_almost_equal(A, B)
def test_different_pipelines():
    random_state = 1233
    X_train, y_train = make_classification_problem()
    steps = [
        (
            "segment",
            RandomIntervalSegmenter(n_intervals=1, random_state=random_state),
        ),
        (
            "transform",
            FeatureUnion([
                (
                    "mean",
                    SeriesToPrimitivesRowTransformer(
                        FunctionTransformer(func=np.mean, validate=False),
                        check_transformer=False,
                    ),
                ),
                (
                    "std",
                    SeriesToPrimitivesRowTransformer(
                        FunctionTransformer(func=np.std, validate=False),
                        check_transformer=False,
                    ),
                ),
                (
                    "slope",
                    SeriesToPrimitivesRowTransformer(
                        FunctionTransformer(func=time_series_slope, validate=False),
                        check_transformer=False,
                    ),
                ),
            ]),
        ),
    ]
    pipe = Pipeline(steps)
    a = pipe.fit_transform(X_train)

    tran = RandomIntervalFeatureExtractor(
        n_intervals=1,
        features=[np.mean, np.std, time_series_slope],
        random_state=random_state,
    )
    b = tran.fit_transform(X_train)

    np.testing.assert_array_equal(a, b)
    np.testing.assert_array_equal(pipe.steps[0][1].intervals_, tran.intervals_)
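# For reference, time_series_slope (used as a feature above) returns the
# gradient of a straight line fitted to the series; assuming an ordinary
# least-squares fit, np.polyfit recovers the same quantity on a toy series.
def _sketch_time_series_slope():
    y = np.array([1.0, 3.0, 5.0, 7.0])
    slope = np.polyfit(np.arange(len(y)), y, 1)[0]  # OLS slope of y over time
    np.testing.assert_almost_equal(slope, 2.0)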
def test_feature_importances_single_feature_interval_and_estimator():
    random_state = 1234

    # Compute using default method
    features = [np.mean]
    steps = [
        (
            "transform",
            RandomIntervalFeatureExtractor(
                n_intervals=1, features=features, random_state=random_state
            ),
        ),
        ("clf", DecisionTreeClassifier()),
    ]
    base_estimator = Pipeline(steps)
    clf1 = TimeSeriesForestClassifier(
        estimator=base_estimator, random_state=random_state, n_estimators=1
    )
    clf1.fit(X_train, y_train)

    # Extract the interval and the estimator, and compute using pipelines
    intervals = clf1.estimators_[0].steps[0][1].intervals_
    steps = [
        ("segment", IntervalSegmenter(intervals)),
        (
            "transform",
            FeatureUnion([
                (
                    "mean",
                    SeriesToPrimitivesRowTransformer(
                        FunctionTransformer(func=np.mean, validate=False),
                        check_transformer=False,
                    ),
                ),
            ]),
        ),
        ("clf", clone(clf1.estimators_[0].steps[-1][1])),
    ]
    clf2 = Pipeline(steps)
    clf2.fit(X_train, y_train)

    # Check for feature importances obtained from the estimators
    fi_expected = clf1.estimators_[0].steps[-1][1].feature_importances_
    fi_actual = clf2.steps[-1][1].feature_importances_
    np.testing.assert_array_equal(fi_actual, fi_expected)
def test_RowTransformer_pipeline():
    X_train, y_train = load_basic_motions(split="train", return_X_y=True)
    X_test, y_test = load_basic_motions(split="test", return_X_y=True)

    # using pure sklearn
    def row_mean(X):
        if isinstance(X, pd.Series):
            X = pd.DataFrame(X)
        Xt = pd.concat(
            [pd.Series(col.apply(np.mean)) for _, col in X.items()], axis=1
        )
        return Xt

    def row_first(X):
        if isinstance(X, pd.Series):
            X = pd.DataFrame(X)
        Xt = pd.concat(
            [
                pd.Series(from_nested_to_2d_array(col).iloc[:, 0])
                for _, col in X.items()
            ],
            axis=1,
        )
        return Xt

    # specify column as a list, otherwise pandas Series are selected and
    # passed on to the transformers
    transformer = ColumnTransformer([
        ("mean", FunctionTransformer(func=row_mean, validate=False), ["dim_0"]),
        ("first", FunctionTransformer(func=row_first, validate=False), ["dim_1"]),
    ])
    estimator = RandomForestClassifier(n_estimators=2, random_state=1)
    steps = [("extract", transformer), ("classify", estimator)]
    model = Pipeline(steps=steps)
    model.fit(X_train, y_train)
    expected = model.predict(X_test)

    # using sktime with sklearn pipeline
    transformer = ColumnTransformer([
        (
            "mean",
            SeriesToPrimitivesRowTransformer(
                FunctionTransformer(func=np.mean, validate=False),
                check_transformer=False,
            ),
            ["dim_0"],
        ),
        (
            "first",
            SeriesToPrimitivesRowTransformer(
                FunctionTransformer(func=lambda x: x[0], validate=False),
                check_transformer=False,
            ),
            ["dim_1"],
        ),
    ])
    estimator = RandomForestClassifier(n_estimators=2, random_state=1)
    steps = [("extract", transformer), ("classify", estimator)]
    model = Pipeline(steps=steps)
    model.fit(X_train, y_train)
    actual = model.predict(X_test)
    np.testing.assert_array_equal(expected, actual)
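# Why the columns above are given as lists: ColumnTransformer selects a
# DataFrame for a list of labels but a Series for a single label, mirroring
# plain pandas indexing (toy illustration).
def _sketch_column_selection():
    df = pd.DataFrame({"dim_0": [1, 2]})
    assert isinstance(df[["dim_0"]], pd.DataFrame)  # list of labels -> DataFrame
    assert isinstance(df["dim_0"], pd.Series)  # single label -> Series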
import numpy as np
from sklearn.preprocessing import FunctionTransformer

from sktime.classification.compose import TimeSeriesForestClassifier
from sktime.datasets import load_gunpoint
from sktime.transformers.panel.compose import (
    SeriesToPrimitivesRowTransformer,
)
from sktime.transformers.panel.segment import RandomIntervalSegmenter
from sktime.transformers.panel.summarize import (
    RandomIntervalFeatureExtractor,
)
from sktime.utils._testing.panel import make_classification_problem
from sktime.utils.time_series import time_series_slope

X, y = make_classification_problem()
n_classes = len(np.unique(y))

mean_transformer = SeriesToPrimitivesRowTransformer(
    FunctionTransformer(func=np.mean, validate=False, kw_args={"axis": 0}),
    check_transformer=False,
)
std_transformer = SeriesToPrimitivesRowTransformer(
    FunctionTransformer(func=np.std, validate=False, kw_args={"axis": 0}),
    check_transformer=False,
)


# Check simple cases.
def test_predict_proba():
    clf = TimeSeriesForestClassifier(n_estimators=2)
    clf.fit(X, y)
    proba = clf.predict_proba(X)

    assert proba.shape == (X.shape[0], n_classes)
    np.testing.assert_array_equal(np.ones(X.shape[0]), np.sum(proba, axis=1))
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier

from sktime.datasets import load_gunpoint
from sktime.transformers.panel.compose import (
    SeriesToPrimitivesRowTransformer,
)
from sktime.transformers.panel.segment import RandomIntervalSegmenter
from sktime.utils._testing import make_classification_problem

# load data
X, y = make_classification_problem()
X_train, X_test, y_train, y_test = train_test_split(X, y)

mean_transformer = SeriesToPrimitivesRowTransformer(
    FunctionTransformer(func=np.mean, validate=False), check_transformer=False
)
std_transformer = SeriesToPrimitivesRowTransformer(
    FunctionTransformer(func=np.std, validate=False), check_transformer=False
)


def test_FeatureUnion_pipeline():
    # pipeline with segmentation plus multiple feature extraction
    steps = [
        ("segment", RandomIntervalSegmenter(n_intervals=1)),
        (
            "transform",
            FeatureUnion([("mean", mean_transformer), ("std", std_transformer)]),
        ),
        ("clf", DecisionTreeClassifier()),
    ]
    clf = Pipeline(steps)

    # fit and predict on the held-out split; predictions should cover the
    # same label set and have one entry per test instance
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    assert y_pred.shape[0] == y_test.shape[0]
    np.testing.assert_array_equal(np.unique(y_pred), np.unique(y_test))
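# A toy illustration of the FeatureUnion used above: the outputs of its
# transformers are concatenated column-wise, so the "mean" and "std" features
# end up side by side. Plain sklearn/numpy on a 2D array.
def _sketch_feature_union():
    union = FeatureUnion([
        ("mean", FunctionTransformer(lambda X: X.mean(axis=1, keepdims=True))),
        ("std", FunctionTransformer(lambda X: X.std(axis=1, keepdims=True))),
    ])
    Xt = union.fit_transform(np.arange(6.0).reshape(2, 3))
    assert Xt.shape == (2, 2)  # one mean column + one std column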
def test_feature_importances_multi_intervals_estimators(n_intervals, n_estimators):
    random_state = 1234
    n_features = 2

    # Compute feature importances using the default method
    features = [np.mean, np.std]
    steps = [
        (
            "transform",
            RandomIntervalFeatureExtractor(
                n_intervals=n_intervals,
                features=features,
                random_state=random_state,
            ),
        ),
        ("clf", DecisionTreeClassifier()),
    ]
    base_estimator = Pipeline(steps)
    clf1 = TimeSeriesForestClassifier(
        estimator=base_estimator,
        random_state=random_state,
        n_estimators=n_estimators,
    )
    clf1.fit(X_train, y_train)

    fi_expected = np.zeros([n_estimators, n_intervals * n_features])
    fi_actual = np.zeros([n_estimators, n_intervals * n_features])

    # Obtain intervals and decision trees from fitted classifier
    for i in range(n_estimators):
        intervals = clf1.estimators_[i].steps[0][1].intervals_
        steps = [
            ("segment", IntervalSegmenter(intervals)),
            (
                "transform",
                FeatureUnion([
                    (
                        "mean",
                        SeriesToPrimitivesRowTransformer(
                            FunctionTransformer(func=np.mean, validate=False),
                            check_transformer=False,
                        ),
                    ),
                    (
                        "std",
                        SeriesToPrimitivesRowTransformer(
                            FunctionTransformer(func=np.std, validate=False),
                            check_transformer=False,
                        ),
                    ),
                ]),
            ),
            ("clf", clone(clf1.estimators_[i].steps[-1][1])),
        ]
        clf2 = Pipeline(steps)
        clf2.fit(X_train, y_train)

        # Compute and check for individual feature importances
        fi_expected[i, :] = clf1.estimators_[i].steps[-1][1].feature_importances_
        fi_actual[i, :] = clf2.steps[-1][1].feature_importances_
        np.testing.assert_array_equal(fi_actual[i, :], fi_expected[i, :])

    # Compute normalised feature values of the time series using the
    # default property
    fis_expected = clf1.feature_importances_

    # Compute normalised feature values of the time series from the pipeline
    # implementation
    n_timepoints = len(clf1.estimators_[0].steps[0][1]._time_index)
    fis_actual = np.zeros((n_timepoints, n_features))

    for i in range(n_estimators):
        intervals = clf1.estimators_[i].steps[0][1].intervals_
        for j in range(n_features):
            for k in range(n_intervals):
                start, end = intervals[k]
                fis_actual[start:end, j] += fi_actual[i, (j * n_intervals) + k]
    fis_actual = fis_actual / n_estimators / n_intervals
    np.testing.assert_array_equal(fis_actual, fis_expected)