def test_different_pipelines():
    random_state = 1233
    X_train, y_train = make_classification_problem()
    steps = [
        ('segment', RandomIntervalSegmenter(n_intervals='sqrt',
                                            random_state=random_state)),
        ('transform', FeatureUnion([
            ('mean', RowTransformer(
                FunctionTransformer(func=np.mean, validate=False))),
            ('std', RowTransformer(
                FunctionTransformer(func=np.std, validate=False))),
            ('slope', RowTransformer(
                FunctionTransformer(func=time_series_slope, validate=False))),
        ])),
    ]
    pipe = Pipeline(steps)
    a = pipe.fit_transform(X_train)

    tran = RandomIntervalFeatureExtractor(
        n_intervals='sqrt',
        features=[np.mean, np.std, time_series_slope],
        random_state=random_state)
    b = tran.fit_transform(X_train)

    np.testing.assert_array_equal(a, b)
    np.testing.assert_array_equal(pipe.steps[0][1].intervals_,
                                  tran.intervals_)
def test_output_format_dim(n_instances, n_timepoints, n_intervals, features):
    X = generate_df_from_array(np.ones(n_timepoints),
                               n_rows=n_instances, n_cols=1)
    n_rows, n_cols = X.shape
    trans = RandomIntervalFeatureExtractor(n_intervals=n_intervals,
                                           features=features)
    Xt = trans.fit_transform(X)

    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == n_rows
    assert np.array_equal(Xt.values, np.ones(Xt.shape))
def test_different_implementations():
    random_state = 1233
    X_train, y_train = make_classification_problem()

    # Compare with chained transformations.
    tran1 = RandomIntervalSegmenter(n_intervals='sqrt',
                                    random_state=random_state)
    tran2 = RowTransformer(FunctionTransformer(func=np.mean, validate=False))
    A = tran2.fit_transform(tran1.fit_transform(X_train))

    tran = RandomIntervalFeatureExtractor(n_intervals='sqrt',
                                          features=[np.mean],
                                          random_state=random_state)
    B = tran.fit_transform(X_train)

    np.testing.assert_array_equal(A, B)
def test_TimeSeriesForest_predictions(n_estimators, n_intervals):
    random_state = 1234
    X_train, y_train = load_gunpoint(split="train", return_X_y=True)
    X_test, y_test = load_gunpoint(split="test", return_X_y=True)

    features = [np.mean, np.std, time_series_slope]
    steps = [('transform',
              RandomIntervalFeatureExtractor(random_state=random_state,
                                             features=features)),
             ('clf', DecisionTreeClassifier())]
    estimator = Pipeline(steps)

    clf1 = TimeSeriesForestClassifier(estimator=estimator,
                                      random_state=random_state,
                                      n_estimators=n_estimators)
    clf1.fit(X_train, y_train)
    a = clf1.predict_proba(X_test)

    # Default, semi-modular implementation using
    # RandomIntervalFeatureExtractor internally.
    clf2 = TimeSeriesForestClassifier(random_state=random_state,
                                      n_estimators=n_estimators)
    clf2.fit(X_train, y_train)
    b = clf2.predict_proba(X_test)

    np.testing.assert_array_equal(a, b)
def main():
    generator = DataGenerator(labeled_data_file=args.labeled_data_file,
                              data_util_file=args.data_util_file,
                              threshold=args.threshold,
                              dt=args.dt,
                              L=args.L,
                              tmin=args.tmin,
                              tmax=args.tmax)
    training_data, test_data = generator.get_data(
        ts_nth_element=args.ts_nth_element, training_frac=0.7)

    # Time series tree: interval feature extraction followed by a decision tree.
    steps = [
        ('extract', RandomIntervalFeatureExtractor(
            n_intervals='sqrt',
            features=[np.mean, np.std, time_series_slope])),
        ('clf', DecisionTreeClassifier())
    ]
    time_series_tree = Pipeline(steps)

    tsf = TimeSeriesForestClassifier(
        estimator=time_series_tree,
        n_estimators=args.n_estimators,
        criterion='entropy' if args.criterion == 'entropy' else 'gini',
        bootstrap=True,
        oob_score=True,
        random_state=1,
        # n_jobs=4,
        verbose=1
    )

    # First column holds the labels; the remaining columns hold the series values.
    x = detabularize(pd.DataFrame(training_data[:, 1:]))
    try:
        with parallel_backend('threading', n_jobs=args.n_jobs):
            tsf = tsf.fit(x, training_data[:, 0])
        with open('{save_file_name}.pickle'.format(
                save_file_name=args.save_file_name), 'wb') as f:
            pickle.dump(tsf, f, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as ex:
        print(ex)
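# A minimal sketch (not part of the original script) of restoring the forest
# persisted by main(); `load_model` is a hypothetical helper name, and it
# relies on the module-level pickle import used above.
def load_model(save_file_name):
    with open('{save_file_name}.pickle'.format(save_file_name=save_file_name),
              'rb') as f:
        return pickle.load(f)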
def _validate_estimator(self):

    if not isinstance(self.n_estimators, numbers.Integral):
        raise ValueError("n_estimators must be an integer, "
                         "got {0}.".format(type(self.n_estimators)))

    if self.n_estimators <= 0:
        raise ValueError("n_estimators must be greater than zero, "
                         "got {0}.".format(self.n_estimators))

    # Set base estimator
    if self.estimator is None:
        # Set default time series forest
        features = [np.mean, np.std, time_series_slope]
        steps = [('transform', RandomIntervalFeatureExtractor(
                      n_intervals='sqrt',
                      features=features,
                      random_state=self.random_state)),
                 ('clf', DecisionTreeRegressor(random_state=self.random_state))]
        self.estimator_ = Pipeline(steps)

    else:
        # Otherwise check that the given estimator is a pipeline with prior
        # transformations and a final decision tree.
        if not isinstance(self.estimator, Pipeline):
            raise ValueError('`estimator` must be a '
                             'pipeline with transforms.')
        if not isinstance(self.estimator.steps[-1][1], DecisionTreeRegressor):
            raise ValueError('Last step in `estimator` must be a '
                             'DecisionTreeRegressor.')
        self.estimator_ = self.estimator

    # Set parameters according to naming in pipeline
    estimator_params = {
        "criterion": self.criterion,
        "max_depth": self.max_depth,
        "min_samples_split": self.min_samples_split,
        "min_samples_leaf": self.min_samples_leaf,
        "min_weight_fraction_leaf": self.min_weight_fraction_leaf,
        "max_features": self.max_features,
        "max_leaf_nodes": self.max_leaf_nodes,
        "min_impurity_decrease": self.min_impurity_decrease,
        "min_impurity_split": self.min_impurity_split,
    }
    # Prefix each tree parameter with the name of the pipeline's final step.
    final_estimator = self.estimator_.steps[-1][0]
    self.estimator_params = {
        f'{final_estimator}__{pname}': pval
        for pname, pval in estimator_params.items()
    }

    # Set renamed estimator parameters
    for pname, pval in self.estimator_params.items():
        setattr(self, pname, pval)
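# Hedged illustration of the renaming above (not part of the estimator): with
# a final step named 'clf', a tree parameter such as max_depth is exposed as
# 'clf__max_depth', which sklearn's Pipeline.set_params resolves to the tree
# itself.
#
#   from sklearn.pipeline import Pipeline
#   from sklearn.preprocessing import FunctionTransformer
#   from sklearn.tree import DecisionTreeRegressor
#
#   pipe = Pipeline([('transform', FunctionTransformer(validate=False)),
#                    ('clf', DecisionTreeRegressor())])
#   pipe.set_params(**{'clf__max_depth': 3})
#   assert pipe.named_steps['clf'].max_depth == 3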
def test_results(n_instances, n_timepoints, n_intervals):
    x = np.random.normal(size=n_timepoints)
    X = generate_df_from_array(x, n_rows=n_instances, n_cols=1)
    t = RandomIntervalFeatureExtractor(
        n_intervals=n_intervals,
        features=[np.mean, np.std, time_series_slope])
    Xt = t.fit_transform(X)

    # Check results
    intervals = t.intervals_
    for start, end in intervals:
        expected_mean = np.mean(x[start:end])
        expected_std = np.std(x[start:end])
        expected_slope = time_series_slope(x[start:end])

        # DataFrame.filter(like=...) does substring matching, not globbing,
        # so the pattern must not contain a wildcard.
        actual_means = Xt.filter(like=f'_{start}_{end}_mean').values
        actual_stds = Xt.filter(like=f'_{start}_{end}_std').values
        actual_slopes = Xt.filter(
            like=f'_{start}_{end}_time_series_slope').values

        assert np.all(actual_means == expected_mean)
        assert np.all(actual_stds == expected_std)
        assert np.all(actual_slopes == expected_slope)
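# Hedged reference for the slope check above: time_series_slope is assumed
# here to be the ordinary least-squares slope of a series fitted against its
# time index. A minimal stand-in under that assumption, for illustration only:
def _ols_slope(y):
    # Degree-1 polyfit returns [slope, intercept]; keep the slope.
    return np.polyfit(np.arange(len(y)), y, 1)[0]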
def test_equivalent_model_specifications(n_intervals, n_estimators):
    random_state = 1234
    X_train, y_train = load_gunpoint(split="train", return_X_y=True)
    X_test, y_test = load_gunpoint(split="test", return_X_y=True)

    # Due to tie-breaking/floating-point rounding in the final decision tree
    # classifier, the results depend on the exact column order of the input
    # data.

    # Compare pipeline predictions outside of ensemble.
    steps = [
        ('segment', RandomIntervalSegmenter(n_intervals=n_intervals,
                                            random_state=random_state)),
        ('transform', FeatureUnion([
            ('mean', RowTransformer(
                FunctionTransformer(func=np.mean, validate=False))),
            ('std', RowTransformer(
                FunctionTransformer(func=np.std, validate=False))),
            ('slope', RowTransformer(
                FunctionTransformer(func=time_series_slope, validate=False)))
        ])),
        ('clf', DecisionTreeClassifier(random_state=random_state))
    ]
    clf1 = Pipeline(steps)
    clf1.fit(X_train, y_train)
    a = clf1.predict(X_test)

    steps = [('transform', RandomIntervalFeatureExtractor(
                  n_intervals=n_intervals,
                  features=[np.mean, np.std, time_series_slope],
                  random_state=random_state)),
             ('clf', DecisionTreeClassifier(random_state=random_state))]
    clf2 = Pipeline(steps)
    clf2.fit(X_train, y_train)
    b = clf2.predict(X_test)

    np.testing.assert_array_equal(a, b)
# Define param grid
n_estimators_list = [50, 100, 200, 300, 400, 500]
features_list = [
    [np.mean, np.std, time_series_slope],
    [np.mean, np.std, time_series_slope, skew],
    [np.mean, np.std, time_series_slope, kurtosis],
    [np.mean, np.std, time_series_slope, skew, kurtosis],
]
n_intervals_list = [0.05, 0.1, 0.25, 0.5, 'log', 'sqrt']

param_grid = {
    'n_estimators': n_estimators_list,
    'estimator__transform__n_intervals': n_intervals_list,
    'estimator__transform__features': features_list
}

BASE_ESTIMATOR = Pipeline([('transform', RandomIntervalFeatureExtractor()),
                           ('estimator', DecisionTreeClassifier())])

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

HOME = os.path.expanduser("~")
DATA_PATH = os.path.join(HOME, "Documents/Research/data/Univariate_ts")
RESULTS_PATH = "results"

RANDOM_STATE = 1
OUTER_CV_N_SPLITS = 30
INNER_N_SPLITS = 10

# Alternatively, we can use a helper function to create them automatically
datasets = make_datasets(path=DATA_PATH, dataset_cls=UEADataset,
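# A hedged sketch of how the grid above could be consumed (assumption:
# TimeSeriesForestClassifier exposes sklearn-compatible get_params/set_params,
# so GridSearchCV can reach the nested pipeline parameters):
#
#   from sklearn.model_selection import GridSearchCV
#
#   tsf = TimeSeriesForestClassifier(estimator=BASE_ESTIMATOR,
#                                    random_state=RANDOM_STATE)
#   grid = GridSearchCV(tsf, param_grid=param_grid, cv=INNER_N_SPLITS)
#   grid.fit(X_train, y_train)  # selects the best configuration from the grid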
def test_bad_features(bad_features):
    X, y = make_classification_problem()
    with pytest.raises(ValueError):
        RandomIntervalFeatureExtractor(features=bad_features).fit(X)