def test_tabularize():
    n_obs_X = 20
    n_cols_X = 3
    X = generate_df_from_array(np.random.normal(size=n_obs_X), n_rows=10,
                               n_cols=n_cols_X)

    # Test single series input.
    Xt = tabularize(X.iloc[:, 0], return_array=True)
    assert Xt.shape[0] == X.shape[0]
    assert Xt.shape[1] == n_obs_X

    Xt = tabularize(X.iloc[:, 0])
    assert Xt.index.equals(X.index)

    # Test dataframe input with columns having series of different length.
    n_obs_Y = 13
    n_cols_Y = 2
    Y = generate_df_from_array(np.random.normal(size=n_obs_Y), n_rows=10,
                               n_cols=n_cols_Y)
    X = pd.concat([X, Y], axis=1)

    Xt = tabularize(X, return_array=True)
    assert Xt.shape[0] == X.shape[0]
    assert Xt.shape[1] == (n_cols_X * n_obs_X) + (n_cols_Y * n_obs_Y)

    Xt = tabularize(X)
    assert Xt.index.equals(X.index)
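# Shapes involved above, as the assertions encode them: X is a nested
# DataFrame with one pd.Series per cell (n_rows x n_cols, each series of
# length n_obs); tabularize() unpacks every series into its own columns,
# giving a flat (n_rows, n_cols * n_obs) DataFrame or 2D array.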
def test_output_format_dim():
    for n_cols in [1, 3]:
        for n_rows in [1, 3]:
            for n_obs in [2, 100]:
                for n_intervals in [0.1, 0.5, 1.0, 1, 3, 10, 'sqrt',
                                    'random', 'log']:
                    X = generate_df_from_array(np.ones(n_obs),
                                               n_rows=n_rows, n_cols=n_cols)
                    trans = RandomIntervalSegmenter(n_intervals=n_intervals)
                    Xt = trans.fit_transform(X)

                    # Check number of rows and output type.
                    assert isinstance(Xt, pd.DataFrame)
                    assert Xt.shape[0] == X.shape[0]

                    # Check number of generated intervals/columns.
                    if n_intervals != 'random':
                        if np.issubdtype(type(n_intervals), np.floating):
                            assert Xt.shape[1] == np.maximum(
                                1, int(n_obs * n_intervals)) * n_cols
                        elif np.issubdtype(type(n_intervals), np.integer):
                            assert Xt.shape[1] == n_intervals * n_cols
                        elif n_intervals == 'sqrt':
                            assert Xt.shape[1] == np.maximum(
                                1, int(np.sqrt(n_obs))) * n_cols
                        elif n_intervals == 'log':
                            assert Xt.shape[1] == np.maximum(
                                1, int(np.log(n_obs))) * n_cols
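# Summary of the n_intervals semantics encoded by the assertions above:
#   float f   -> max(1, int(f * n_obs)) intervals per column
#   int k     -> exactly k intervals per column
#   'sqrt'    -> max(1, int(sqrt(n_obs))) intervals per column
#   'log'     -> max(1, int(log(n_obs))) intervals per column
#   'random'  -> a random number of intervals (count not asserted here)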
def test_output_format_dim():
    for n_cols in [1, 3, 10]:
        for n_rows in [1, 3, 10]:
            for n_obs in [2, 3, 10]:
                X = generate_df_from_array(np.ones(n_obs), n_rows=n_rows,
                                           n_cols=n_cols)
                _test_output_format_dim(X)
def test_bad_input_args(bad_components):
    X = generate_df_from_array(np.ones(10), n_rows=10, n_cols=1)

    if isinstance(bad_components, str):
        with pytest.raises(TypeError):
            PCATransformer(n_components=bad_components).fit(X)
    else:
        with pytest.raises(ValueError):
            PCATransformer(n_components=bad_components).fit(X)
def test_random_state():
    X = generate_df_from_array(np.random.normal(size=20))
    random_state = 1234
    trans = RandomIntervalFeatureExtractor(n_intervals='random',
                                           random_state=random_state)
    first_Xt = trans.fit_transform(X)
    for _ in range(N_ITER):
        trans = RandomIntervalFeatureExtractor(n_intervals='random',
                                               random_state=random_state)
        Xt = trans.fit_transform(X)
        assert first_Xt.equals(Xt)
def test_output_format_dim(n_instances, len_series, n_intervals, features):
    X = generate_df_from_array(np.ones(len_series), n_rows=n_instances,
                               n_cols=1)
    n_rows, n_cols = X.shape
    trans = RandomIntervalFeatureExtractor(n_intervals=n_intervals,
                                           features=features)
    Xt = trans.fit_transform(X)

    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == n_rows
    assert np.array_equal(Xt.values, np.ones(Xt.shape))
def test_random_state():
    X = generate_df_from_array(np.random.normal(size=10))
    random_state = 1234
    for n_intervals in [0.5, 10, 'sqrt', 'random', 'log']:
        trans = RandomIntervalSegmenter(n_intervals=n_intervals,
                                        random_state=random_state)
        first_Xt = trans.fit_transform(X)
        for _ in range(N_ITER):
            trans = RandomIntervalSegmenter(n_intervals=n_intervals,
                                            random_state=random_state)
            Xt = trans.fit_transform(X)
            np.testing.assert_array_equal(tabularize(first_Xt).values,
                                          tabularize(Xt).values)
def test_rowwise_transformer_sklearn_transformer():
    mu = 10
    sd = 5
    X = generate_df_from_array(np.random.normal(loc=mu, scale=sd,
                                                size=(100,)),
                               n_rows=10, n_cols=1)
    t = StandardScaler(with_mean=True, with_std=True)
    r = RowwiseTransformer(t)
    Xt = r.fit_transform(X)
    assert Xt.shape == X.shape

    # Check series-to-series transform.
    assert isinstance(Xt.iloc[0, 0], (pd.Series, np.ndarray))

    # Check standardisation.
    np.testing.assert_almost_equal(Xt.iloc[0, 0].mean(), 0)
    np.testing.assert_almost_equal(Xt.iloc[0, 0].std(), 1, decimal=2)
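# Conceptual sketch of the row-wise pattern exercised above (an assumption
# inferred from the assertions, not RowwiseTransformer's actual code): fit
# and apply the wrapped sklearn transformer to each cell's series
# independently. The helper name _rowwise_sketch is illustrative only.
def _rowwise_sketch(transformer, X):
    return X.applymap(
        lambda s: pd.Series(
            transformer.fit_transform(np.asarray(s).reshape(-1, 1)).ravel()))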
def test_results(n_instances, len_series, n_intervals):
    x = np.random.normal(size=len_series)
    X = generate_df_from_array(x, n_rows=n_instances, n_cols=1)
    trans = RandomIntervalFeatureExtractor(
        n_intervals=n_intervals,
        features=[np.mean, np.std, time_series_slope])
    Xt = trans.fit_transform(X)

    # Check results.
    for s, e in trans.intervals_:
        assert np.all(Xt.filter(like=f'_{s}_{e}_mean') == np.mean(x[s:e]))
        assert np.all(Xt.filter(like=f'_{s}_{e}_std') == np.std(x[s:e]))
        assert np.all(Xt.filter(like=f'_{s}_{e}_time_series_slope')
                      == time_series_slope(x[s:e]))
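# For reference, a minimal sketch of what time_series_slope computes here
# (an assumption: the least-squares slope of the series against its integer
# time index; the actual sktime.utils.time_series implementation may differ).
# The helper name _slope_sketch is illustrative, not part of sktime.
def _slope_sketch(y):
    x = np.arange(len(y))
    return np.polyfit(x, y, 1)[0]  # leading coefficient = fitted slope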
def test_output_format_dim(len_series, n_instances, n_intervals):
    X = generate_df_from_array(np.ones(len_series), n_rows=n_instances,
                               n_cols=1)
    trans = RandomIntervalSegmenter(n_intervals=n_intervals)
    Xt = trans.fit_transform(X)

    # Check number of rows and output type.
    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == X.shape[0]

    # Check number of generated intervals/columns.
    if n_intervals != 'random':
        if np.issubdtype(type(n_intervals), np.floating):
            assert Xt.shape[1] == np.maximum(1, int(len_series * n_intervals))
        elif np.issubdtype(type(n_intervals), np.integer):
            assert Xt.shape[1] == n_intervals
        elif n_intervals == 'sqrt':
            assert Xt.shape[1] == np.maximum(1, int(np.sqrt(len_series)))
        elif n_intervals == 'log':
            assert Xt.shape[1] == np.maximum(1, int(np.log(len_series)))
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier
from sktime.datasets import load_gunpoint
from sktime.pipeline import FeatureUnion, Pipeline
from sktime.transformers.series_to_series import RandomIntervalSegmenter
from sktime.transformers.series_to_tabular import (
    RandomIntervalFeatureExtractor)
from sktime.utils.time_series import time_series_slope

N_ITER = 10

n = 20
d = 1
m = 20
n_classes = 2

X = generate_df_from_array(np.random.normal(size=m), n_rows=n, n_cols=d)
y = pd.Series(np.random.choice(np.arange(n_classes) + 1, size=n))


# Check if random state always gives same results.
def test_random_state():
    random_state = 1234
    clf = TimeSeriesForestClassifier(n_estimators=2,
                                     random_state=random_state)
    clf.fit(X, y)
    first_pred = clf.predict_proba(X)
    for _ in range(N_ITER):
        clf = TimeSeriesForestClassifier(n_estimators=2,
                                         random_state=random_state)
        clf.fit(X, y)
        y_pred = clf.predict_proba(X)
        np.testing.assert_array_equal(first_pred, y_pred)
def test_bad_input_args(bad_interval):
    X = generate_df_from_array(np.ones(10), n_rows=10, n_cols=2)
    with pytest.raises(ValueError):
        RandomIntervalSegmenter(n_intervals=bad_interval).fit(X)
def test_early_trans_fail():
    X = generate_df_from_array(np.ones(10), n_rows=1, n_cols=1)
    pca = PCATransformer(n_components=1)
    with pytest.raises(NotFittedError):
        pca.transform(X)
def test_bad_input_args():
    X = generate_df_from_array(np.ones(10), n_rows=10, n_cols=2)
    bad_n_intervals = [0, -0, 'str', 1.2, -1.2, -1]
    for arg in bad_n_intervals:
        with pytest.raises(ValueError):
            RandomIntervalSegmenter(n_intervals=arg).fit(X)
import numpy as np
import pandas as pd
import pytest
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier
from sktime.datasets import load_gunpoint
from sktime.pipeline import FeatureUnion, Pipeline
from sktime.transformers.segment import RandomIntervalSegmenter
from sktime.transformers.summarise import RandomIntervalFeatureExtractor
from sktime.utils.time_series import time_series_slope

n_instances = 20
n_columns = 1
len_series = 20
n_classes = 2

X = generate_df_from_array(np.random.normal(size=len_series),
                           n_rows=n_instances, n_cols=n_columns)
y = pd.Series(np.random.choice(np.arange(n_classes) + 1, size=n_instances))


# Check if random state always gives same results.
def test_random_state():
    N_ITER = 10
    random_state = 1234
    clf = TimeSeriesForestClassifier(n_estimators=2,
                                     random_state=random_state)
    clf.fit(X, y)
    first_pred = clf.predict_proba(X)
    for _ in range(N_ITER):
        clf = TimeSeriesForestClassifier(n_estimators=2,
                                         random_state=random_state)
        clf.fit(X, y)
        y_pred = clf.predict_proba(X)
        np.testing.assert_array_equal(first_pred, y_pred)