def test_highLevelsktime(network=catch22ForestClassifier()):
    """
    Truly generalised test with sktime tasks/strategies:
    load data, build task,
    construct classifier, build strategy,
    fit, score.
    """
    print("start test_highLevelsktime()")
    from sklearn.metrics import accuracy_score
    from sktime.datasets import load_gunpoint
    from sktime.highlevel.tasks import TSCTask
    from sktime.highlevel.strategies import TSCStrategy

    train = load_gunpoint(split='TRAIN')
    test = load_gunpoint(split='TEST')
    task = TSCTask(target='class_val', metadata=train)

    strategy = TSCStrategy(network)
    strategy.fit(task, train.iloc[:10])

    y_pred = strategy.predict(test.iloc[:10])
    # np.float is deprecated (removed in numpy >= 1.24); use the builtin float
    y_test = test.iloc[:10][task.target].values.astype(float)
    print(accuracy_score(y_test, y_pred))
    print("End test_highLevelsktime()")

def test_TSCStrategy(dataset):
    train = dataset(split='TRAIN')
    test = dataset(split='TEST')
    s = TSCStrategy(classifier)  # `classifier` is expected to be defined at module level
    task = TSCTask(target='class_val')
    s.fit(task, train)
    y_pred = s.predict(test)
    assert y_pred.shape == test[task.target].shape

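# A minimal sketch of how the test above might be driven: `dataset` arrives as
# a pytest parameter and `classifier` is assumed to live at module level. The
# loaders and classifier choice below are illustrative assumptions, not taken
# from the source.
import pytest
from sktime.datasets import load_gunpoint, load_italy_power_demand

classifier = TimeSeriesForestClassifier(n_estimators=2, random_state=1)

@pytest.mark.parametrize("dataset", [load_gunpoint, load_italy_power_demand])
def test_TSCStrategy_parametrized(dataset):
    test_TSCStrategy(dataset)
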
def test_orchestration():
    data = load_gunpoint()
    dataset = DatasetRAM(dataset=data, dataset_name='gunpoint')
    task = TSCTask(target='class_val')

    # create strategies
    clf = TimeSeriesForestClassifier(n_estimators=1, random_state=1)
    strategy = TSCStrategy(clf)

    # result backend
    resultRAM = ResultRAM()
    orchestrator = Orchestrator(datasets=[dataset],
                                tasks=[task],
                                strategies=[strategy],
                                cv=SingleSplit(random_state=1),
                                result=resultRAM)
    orchestrator.run(save_strategies=False)
    result = resultRAM.load()
    actual = result[0].y_pred

    # expected output
    task = TSCTask(target='class_val')
    cv = SingleSplit(random_state=1)
    for train_idx, test_idx in cv.split(data):
        train = data.iloc[train_idx, :]
        test = data.iloc[test_idx, :]
        clf = TimeSeriesForestClassifier(n_estimators=1, random_state=1)
        strategy = TSCStrategy(clf)
        strategy.fit(task, train)
        expected = strategy.predict(test)

    np.testing.assert_array_equal(actual, expected)

def main(args):
    # Load and wrangle data
    dataset = Dataset.get_by_name(run.experiment.workspace, "processed_json")
    raw_data_df = dataset.to_pandas_dataframe()
    processed_data_df = prepare_dataframe(raw_data_df,
                                          time_series_length=args.timeserieslength,
                                          threshold=args.threshold)

    # Split data
    train = processed_data_df.sample(frac=args.train_data_split, random_state=42)
    test = processed_data_df.drop(train.index)

    # Example for logging
    run.log(
        "data_split_fraction",
        args.train_data_split,
        "Fraction of samples used for training",
    )
    run.log("train_samples", train.shape[0], "Number of samples used for training")
    run.log("test_samples", test.shape[0], "Number of samples used for testing")

    # Train
    task = TSCTask(target="label", metadata=train)
    clf = TimeSeriesForestClassifier(n_estimators=args.n_estimators)
    strategy = TSCStrategy(clf)
    strategy.fit(task, train)
    run.log("n_estimators", args.n_estimators,
            "Number of tree estimators used in the model")

    # Metrics
    y_pred = strategy.predict(test)
    y_test = test[task.target]
    accuracy = accuracy_score(y_test, y_pred)
    run.log("Accuracy", f"{accuracy:1.3f}", "Accuracy of model")

    # Persist model
    os.makedirs(args.model_folder, exist_ok=True)
    model_path = os.path.join(args.model_folder, args.model_filename)
    dump(strategy, model_path)

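# main(args) expects an argparse-style namespace; below is a minimal sketch of a
# parser supplying the attributes referenced above. Only the attribute names
# come from the source; the types and defaults are illustrative assumptions,
# and `run` is assumed to be the Azure ML run context of the surrounding script.
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--timeserieslength", type=int, default=100)
    parser.add_argument("--threshold", type=float, default=0.5)
    parser.add_argument("--train_data_split", type=float, default=0.8)
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--model_folder", type=str, default="outputs")
    parser.add_argument("--model_filename", type=str, default="model.pkl")
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())
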
def test_single_dataset_single_strategy_against_sklearn(dataset, cv, metric_func,
                                                        estimator, results_cls,
                                                        tmpdir):
    # set up orchestration
    cv = cv(random_state=1)
    task = TSCTask(target="class_val")

    # create strategies
    clf = make_reduction_pipeline(estimator)
    strategy = TSCStrategy(clf)

    # result backend
    if results_cls in [HDDResults]:
        # for hard-drive results, create a temporary directory using pytest's
        # tmpdir fixture
        tempdir = tmpdir.mkdir("results/")
        path = tempdir.dirpath()
        results = results_cls(path=path)
    elif results_cls in [RAMResults]:
        results = results_cls()
    else:
        raise ValueError()

    orchestrator = Orchestrator(datasets=[dataset],
                                tasks=[task],
                                strategies=[strategy],
                                cv=cv,
                                results=results)
    orchestrator.fit_predict(save_fitted_strategies=False)
    evaluator = Evaluator(results)

    # create metric classes for evaluation and set metric kwargs
    if metric_func in [accuracy_score]:
        kwargs = {}  # empty kwargs for simple pairwise metrics
        metric = PairwiseMetric(func=metric_func, name="metric")
    elif metric_func in [f1_score]:
        kwargs = {"average": "macro"}  # set kwargs for composite metrics
        metric = AggregateMetric(func=metric_func, name="metric", **kwargs)
    else:
        raise ValueError()

    metrics = evaluator.evaluate(metric=metric)
    actual = metrics["metric_mean"].iloc[0]

    # compare against sklearn's cross_val_score
    data = dataset.load()  # load data
    X = data.loc[:, task.features]
    y = data.loc[:, task.target]
    expected = cross_val_score(clf, X, y,
                               scoring=make_scorer(metric_func, **kwargs),
                               cv=cv).mean()

    # compare results
    np.testing.assert_array_equal(actual, expected)

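# make_reduction_pipeline reduces the time-series task to a plain tabular one so
# that a standard sklearn estimator can be checked against cross_val_score, as
# above. A rough standalone sketch of the idea (not sktime's implementation),
# assuming each cell of X holds an equal-length pd.Series:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

def _tabularize(X):
    # flatten each row of nested series into a single 1d feature vector
    return np.vstack([
        np.concatenate([np.asarray(cell) for cell in row])
        for row in X.itertuples(index=False)
    ])

def sketch_reduction_pipeline(estimator):
    return Pipeline([
        ("tabularize", FunctionTransformer(_tabularize)),
        ("estimator", estimator),
    ])
# e.g. cross_val_score(sketch_reduction_pipeline(RandomForestClassifier()), X, y)
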
def test_stat():
    data = load_gunpoint()
    dataset = RAMDataset(dataset=data, name="gunpoint")
    task = TSCTask(target="class_val")

    fc = TimeSeriesForestClassifier(n_estimators=1, random_state=1)
    strategy_fc = TSCStrategy(fc, name="tsf")
    pf = ProximityForest(n_trees=1, random_state=1)
    strategy_pf = TSCStrategy(pf, name="pf")

    # result backend
    results = RAMResults()
    orchestrator = Orchestrator(datasets=[dataset],
                                tasks=[task],
                                strategies=[strategy_pf, strategy_fc],
                                cv=SingleSplit(random_state=1),
                                results=results)
    orchestrator.fit_predict(save_fitted_strategies=False)

    analyse = Evaluator(results)
    metric = PairwiseMetric(func=accuracy_score, name="accuracy")
    _ = analyse.evaluate(metric=metric)

    ranks = analyse.rank(ascending=True)
    pf_rank = ranks.loc[ranks.strategy == "pf", "accuracy_mean_rank"].item()  # 1
    fc_rank = ranks.loc[ranks.strategy == "tsf", "accuracy_mean_rank"].item()  # 2
    rank_array = [pf_rank, fc_rank]
    rank_array_test = [1, 2]

    _, sign_test_df = analyse.sign_test()
    sign_array = [
        [sign_test_df["pf"][0], sign_test_df["pf"][1]],
        [sign_test_df["tsf"][0], sign_test_df["tsf"][1]],
    ]
    sign_array_test = [[1, 1], [1, 1]]
    np.testing.assert_equal([rank_array, sign_array],
                            [rank_array_test, sign_array_test])

def test_automated_orchestration_vs_manual(data_loader):
    data = data_loader()
    dataset = RAMDataset(dataset=data, name="data")
    task = TSCTask(target="class_val")

    # create strategies
    # clf = TimeSeriesForestClassifier(n_estimators=1, random_state=1)
    clf = make_reduction_pipeline(
        RandomForestClassifier(n_estimators=2, random_state=1))
    strategy = TSCStrategy(clf)

    # result backend
    results = RAMResults()
    orchestrator = Orchestrator(datasets=[dataset],
                                tasks=[task],
                                strategies=[strategy],
                                cv=SingleSplit(random_state=1),
                                results=results)
    orchestrator.fit_predict(save_fitted_strategies=False)
    # get only first item of iterator
    result = next(results.load_predictions(cv_fold=0, train_or_test="test"))
    actual = result.y_pred

    # expected output
    task = TSCTask(target="class_val")
    cv = SingleSplit(random_state=1)
    train_idx, test_idx = next(cv.split(data))
    train = data.iloc[train_idx, :]
    test = data.iloc[test_idx, :]
    strategy.fit(task, train)
    expected = strategy.predict(test)

    # compare results
    np.testing.assert_array_equal(actual, expected)

#     max_features_subset=3,
#     p_sample_subset=0.5,
#     bootstrap_sample_subset=False,
#     transformation="pca",
#     random_state=1,
# )
estimator = RotationForestClassifier(n_estimators=200,
                                     min_features_subset=3,
                                     max_features_subset=3,
                                     p_sample_subset=0.5,
                                     bootstrap_sample_subset=False,
                                     n_jobs=-1)
strategies = [
    TSCStrategy(estimator=make_reduction_pipeline(estimator=estimator),
                name="rotf")
]

# define results output
results = HDDResults(path=RESULTS_PATH)
# results = RAMResults()

# run orchestrator
orchestrator = Orchestrator(datasets=datasets,
                            tasks=tasks,
                            strategies=strategies,
                            cv=PresplitFilesCV(),
                            results=results)
start = time.time()
orchestrator.fit_predict(save_fitted_strategies=False)

train = processed_data_df.sample(frac=args.train_data_split, random_state=42)
test = processed_data_df.drop(train.index)

# Example logging
run.log(
    "data_split_fraction",
    args.train_data_split,
    "Fraction of samples used for training",
)
run.log("train_samples", train.shape[0], "Number of samples used for training")
run.log("test_samples", test.shape[0], "Number of samples used for testing")

# Train
task = TSCTask(target="label", metadata=train)
clf = TimeSeriesForestClassifier(n_estimators=args.n_estimators)
strategy = TSCStrategy(clf)
strategy.fit(task, train)
run.log("n_estimators", args.n_estimators,
        "Number of tree estimators used in the model")

# Metrics
y_pred = strategy.predict(test)
y_test = test[task.target]
accuracy = accuracy_score(y_test, y_pred)
run.log("Accuracy", f"{accuracy:1.3f}", "Accuracy of model")

# Add to outputs
os.makedirs("outputs", exist_ok=True)
local_model_path = os.path.join("outputs", "model.pkl")
dump(strategy, local_model_path)
run.upload_file("pickled_model", local_model_path)

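# The fitted strategy is persisted with dump above (assumed here to be
# joblib's); a minimal sketch of loading it back for inference elsewhere. The
# paths and the scoring frame are illustrative assumptions:
import pandas as pd
from joblib import load

strategy = load("outputs/model.pkl")
new_data_df = pd.read_pickle("new_processed_data.pkl")  # prepared like the training frame
y_pred = strategy.predict(new_data_df)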