Example #1
# catch22ForestClassifier is expected to be imported at module level
# (its import path is not shown in this snippet)
def test_highLevelsktime(network=catch22ForestClassifier()):
    '''
    truly generalised test with sktime tasks/strategies
        load data, build task
        construct classifier, build strategy
        fit,
        score
    '''

    print("start test_highLevelsktime()")

    from sktime.highlevel.tasks import TSCTask
    from sktime.highlevel.strategies import TSCStrategy
    from sktime.datasets import load_gunpoint
    from sklearn.metrics import accuracy_score

    train = load_gunpoint(split='TRAIN')
    test = load_gunpoint(split='TEST')
    task = TSCTask(target='class_val', metadata=train)

    strategy = TSCStrategy(network)
    strategy.fit(task, train.iloc[:10])

    y_pred = strategy.predict(test.iloc[:10])
    y_test = test.iloc[:10][task.target].values.astype(float)
    print(accuracy_score(y_test, y_pred))

    print("End test_highLevelsktime()")
Example #2
def test_TSCStrategy(dataset):
    # `dataset` is a loader such as load_gunpoint; `classifier` is expected
    # to be defined at module level (see the setup sketch after this example)
    train = dataset(split='TRAIN')
    test = dataset(split='TEST')
    s = TSCStrategy(classifier)
    task = TSCTask(target='class_val')
    s.fit(task, train)
    y_pred = s.predict(test)
    assert y_pred.shape == test[task.target].shape
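
For context, the `dataset` argument is a loader function and `classifier` must exist at module level; one plausible setup uses pytest parametrization. Everything below is an illustrative assumption, not part of the original example:

import pytest
from sktime.datasets import load_gunpoint, load_italy_power_demand

# import path varies across sktime versions; adjust as needed
from sktime.classification.interval_based import TimeSeriesForestClassifier

# module-level classifier, as the test body above expects
classifier = TimeSeriesForestClassifier(n_estimators=2, random_state=1)


@pytest.mark.parametrize('dataset', [load_gunpoint, load_italy_power_demand])
def test_TSCStrategy(dataset):
    ...  # body as in Example #2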
Example #3
def test_orchestration():
    data = load_gunpoint()

    dataset = DatasetRAM(dataset=data, dataset_name='gunpoint')
    task = TSCTask(target='class_val')

    # create strategies
    clf = TimeSeriesForestClassifier(n_estimators=1, random_state=1)
    strategy = TSCStrategy(clf)

    # result backend
    resultRAM = ResultRAM()
    orchestrator = Orchestrator(datasets=[dataset],
                                tasks=[task],
                                strategies=[strategy],
                                cv=SingleSplit(random_state=1),
                                result=resultRAM)

    orchestrator.run(save_strategies=False)
    result = resultRAM.load()
    actual = result[0].y_pred

    # expected output
    task = TSCTask(target='class_val')
    cv = SingleSplit(random_state=1)
    # SingleSplit yields exactly one train/test split, so this loop runs once
    for train_idx, test_idx in cv.split(data):
        train = data.iloc[train_idx, :]
        test = data.iloc[test_idx, :]
        clf = TimeSeriesForestClassifier(n_estimators=1, random_state=1)
        strategy = TSCStrategy(clf)
        strategy.fit(task, train)
        expected = strategy.predict(test)

    np.testing.assert_array_equal(actual, expected)
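
Example #3 leans on sktime's early experiments/high-level API; a sketch of the imports it appears to assume, with paths that shifted between early releases (treat every flagged path as an assumption):

import numpy as np
from sktime.datasets import load_gunpoint
from sktime.experiments.data import DatasetRAM, ResultRAM  # path assumed
from sktime.experiments.orchestrator import Orchestrator  # path assumed
from sktime.highlevel.tasks import TSCTask
from sktime.highlevel.strategies import TSCStrategy
from sktime.model_selection import SingleSplit  # path assumed
from sktime.classifiers.compose import TimeSeriesForestClassifier  # path assumed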
Example #4
def main(args):
    # `run` is the ambient Azure ML run context (run = Run.get_context());
    # see the entry-point sketch after this example
    # Load and wrangle data
    dataset = Dataset.get_by_name(run.experiment.workspace, "processed_json")
    raw_data_df = dataset.to_pandas_dataframe()

    processed_data_df = prepare_dataframe(
        raw_data_df,
        time_series_length=args.timeserieslength,
        threshold=args.threshold)

    # Split data
    train = processed_data_df.sample(frac=args.train_data_split,
                                     random_state=42)
    test = processed_data_df.drop(train.index)

    # Example for logging
    run.log(
        "data_split_fraction",
        args.train_data_split,
        "Fraction of samples used for training",
    )
    run.log("train_samples", train.shape[0],
            "Number of samples used for training")
    run.log("test_samples", test.shape[0],
            "Number of samples used for testing")

    # Train
    task = TSCTask(target="label", metadata=train)
    clf = TimeSeriesForestClassifier(n_estimators=args.n_estimators)
    strategy = TSCStrategy(clf)
    strategy.fit(task, train)
    run.log("n_estimators", args.n_estimators,
            "Number of tree estimators used in the model")

    # Metrics
    y_pred = strategy.predict(test)
    y_test = test[task.target]
    accuracy = accuracy_score(y_test, y_pred)
    run.log("Accuracy", f"{accuracy:1.3f}", "Accuracy of model")

    # Persist model
    os.makedirs(args.model_folder, exist_ok=True)
    model_path = os.path.join(args.model_folder, args.model_filename)
    dump(strategy, model_path)
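
main(args) presumes an argparse namespace and an ambient Azure ML run object; a minimal entry-point sketch under those assumptions (argument names are taken from the body above, defaults are hypothetical; sktime imports omitted):

import argparse
import os

from azureml.core import Dataset
from azureml.core.run import Run
from joblib import dump
from sklearn.metrics import accuracy_score

# ambient run context used throughout main()
run = Run.get_context()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--timeserieslength", type=int, default=100)  # hypothetical default
    parser.add_argument("--threshold", type=float, default=0.5)  # hypothetical default
    parser.add_argument("--train_data_split", type=float, default=0.8)
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--model_folder", type=str, default="outputs")
    parser.add_argument("--model_filename", type=str, default="model.pkl")
    args = parser.parse_args()
    main(args)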
Example #5
def test_single_dataset_single_strategy_against_sklearn(
        dataset, cv, metric_func, estimator, results_cls, tmpdir):
    # set up orchestration
    cv = cv(random_state=1)
    task = TSCTask(target="class_val")

    # create strategies
    clf = make_reduction_pipeline(estimator)
    strategy = TSCStrategy(clf)

    # result backend
    if results_cls in [HDDResults]:
        # for hard-drive results, create a temporary directory using
        # pytest's tmpdir fixture
        tempdir = tmpdir.mkdir("results")
        path = tempdir.dirpath()
        results = results_cls(path=path)
    elif results_cls in [RAMResults]:
        results = results_cls()
    else:
        raise ValueError(f"Unknown results backend: {results_cls}")

    orchestrator = Orchestrator(datasets=[dataset],
                                tasks=[task],
                                strategies=[strategy],
                                cv=cv,
                                results=results)
    orchestrator.fit_predict(save_fitted_strategies=False)

    evaluator = Evaluator(results)

    # create metric classes for evaluation and set metric kwargs
    if metric_func in [accuracy_score]:
        kwargs = {}  # empty kwargs for simple pairwise metrics
        metric = PairwiseMetric(func=metric_func, name="metric")
    elif metric_func in [f1_score]:
        kwargs = {"average": "macro"}  # set kwargs for composite metrics
        metric = AggregateMetric(func=metric_func, name="metric", **kwargs)
    else:
        raise ValueError(f"Unsupported metric function: {metric_func}")

    metrics = evaluator.evaluate(metric=metric)
    actual = metrics["metric_mean"].iloc[0]

    # compare against sklearn cross_val_score
    data = dataset.load()  # load data
    X = data.loc[:, task.features]
    y = data.loc[:, task.target]
    expected = cross_val_score(clf,
                               X,
                               y,
                               scoring=make_scorer(metric_func, **kwargs),
                               cv=cv).mean()

    # compare results
    np.testing.assert_array_equal(actual, expected)
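
The fixtures here arrive via pytest parametrization; one plausible configuration (every value below is an illustrative assumption, not the original one):

import pytest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score


@pytest.mark.parametrize("dataset", [RAMDataset(dataset=load_gunpoint(),
                                                name="gunpoint")])
@pytest.mark.parametrize("cv", [SingleSplit])  # the class; instantiated inside
@pytest.mark.parametrize("metric_func", [accuracy_score, f1_score])
@pytest.mark.parametrize("estimator", [RandomForestClassifier(n_estimators=2,
                                                              random_state=1)])
@pytest.mark.parametrize("results_cls", [RAMResults, HDDResults])
def test_single_dataset_single_strategy_against_sklearn(
        dataset, cv, metric_func, estimator, results_cls, tmpdir):
    ...  # body as above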
Example #6
def test_stat():
    data = load_gunpoint()
    dataset = RAMDataset(dataset=data, name="gunpoint")
    task = TSCTask(target="class_val")

    fc = TimeSeriesForestClassifier(n_estimators=1, random_state=1)
    strategy_fc = TSCStrategy(fc, name="tsf")
    pf = ProximityForest(n_trees=1, random_state=1)
    strategy_pf = TSCStrategy(pf, name="pf")

    # result backend
    results = RAMResults()
    orchestrator = Orchestrator(datasets=[dataset],
                                tasks=[task],
                                strategies=[strategy_pf, strategy_fc],
                                cv=SingleSplit(random_state=1),
                                results=results)

    orchestrator.fit_predict(save_fitted_strategies=False)

    analyse = Evaluator(results)
    metric = PairwiseMetric(func=accuracy_score, name="accuracy")
    _ = analyse.evaluate(metric=metric)

    ranks = analyse.rank(ascending=True)
    pf_rank = ranks.loc[ranks.strategy == "pf",
                        "accuracy_mean_rank"].item()  # expected rank: 1
    fc_rank = ranks.loc[ranks.strategy == "tsf",
                        "accuracy_mean_rank"].item()  # expected rank: 2
    rank_array = [pf_rank, fc_rank]
    rank_array_test = [1, 2]
    _, sign_test_df = analyse.sign_test()

    sign_array = [[sign_test_df["pf"][0], sign_test_df["pf"][1]],
                  [sign_test_df["tsf"][0], sign_test_df["tsf"][1]]]
    sign_array_test = [[1, 1], [1, 1]]
    np.testing.assert_equal([rank_array, sign_array],
                            [rank_array_test, sign_array_test])
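
Examples #5 through #7 share the later benchmarking names; a sketch of the imports they appear to rely on (paths may need adjusting for the sktime version in use):

import numpy as np
from sklearn.metrics import accuracy_score

from sktime.benchmarking.data import RAMDataset
from sktime.benchmarking.results import RAMResults, HDDResults
from sktime.benchmarking.orchestration import Orchestrator
from sktime.benchmarking.evaluation import Evaluator
from sktime.benchmarking.metrics import PairwiseMetric, AggregateMetric
from sktime.benchmarking.strategies import TSCStrategy
from sktime.benchmarking.tasks import TSCTask
from sktime.classification.interval_based import TimeSeriesForestClassifier
from sktime.classification.distance_based import ProximityForest
from sktime.datasets import load_gunpoint
from sktime.series_as_features.model_selection import SingleSplit  # path assumed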
Example #7
def test_automated_orchestration_vs_manual(data_loader):
    data = data_loader()

    dataset = RAMDataset(dataset=data, name="data")
    task = TSCTask(target="class_val")

    # create strategies
    # clf = TimeSeriesForestClassifier(n_estimators=1, random_state=1)
    clf = make_reduction_pipeline(
        RandomForestClassifier(n_estimators=2, random_state=1))
    strategy = TSCStrategy(clf)

    # result backend
    results = RAMResults()
    orchestrator = Orchestrator(datasets=[dataset],
                                tasks=[task],
                                strategies=[strategy],
                                cv=SingleSplit(random_state=1),
                                results=results)

    orchestrator.fit_predict(save_fitted_strategies=False)
    result = next(results.load_predictions(
        cv_fold=0, train_or_test="test"))  # get only first item of iterator
    actual = result.y_pred

    # expected output
    task = TSCTask(target="class_val")
    cv = SingleSplit(random_state=1)
    train_idx, test_idx = next(cv.split(data))
    train = data.iloc[train_idx, :]
    test = data.iloc[test_idx, :]
    strategy.fit(task, train)
    expected = strategy.predict(test)

    # compare results
    np.testing.assert_array_equal(actual, expected)
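
make_reduction_pipeline reduces time-series classification to plain tabular classification: each nested series is flattened into an ordinary feature vector before the sklearn estimator sees it. A rough sketch of such a helper (the Tabularizer import path is an assumption, and older releases spell it Tabulariser):

from sklearn.pipeline import Pipeline

from sktime.transformations.panel.reduce import Tabularizer  # path assumed


def make_reduction_pipeline(estimator):
    # flatten nested time-series columns, then fit the tabular estimator
    return Pipeline([
        ("tabularize", Tabularizer()),
        ("clf", estimator),
    ])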
Example #8

estimator = RotationForestClassifier(n_estimators=200,
                                     min_features_subset=3,
                                     max_features_subset=3,
                                     p_sample_subset=0.5,
                                     bootstrap_sample_subset=False,
                                     n_jobs=-1)

strategies = [
    TSCStrategy(estimator=make_reduction_pipeline(estimator=estimator),
                name="rotf")
]

# define results output
results = HDDResults(path=RESULTS_PATH)
# results = RAMResults()

# run orchestrator
orchestrator = Orchestrator(datasets=datasets,
                            tasks=tasks,
                            strategies=strategies,
                            cv=PresplitFilesCV(),
                            results=results)

start = time.time()
orchestrator.fit_predict(save_fitted_strategies=False)
elapsed = time.time() - start
print(f"Orchestration finished in {elapsed:.1f}s")
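
This script fragment also presumes datasets, tasks, RESULTS_PATH, and the relevant imports; one way they might be set up (every name and path below is an assumption):

import time

from sktime.benchmarking.data import UEADataset
from sktime.benchmarking.results import HDDResults
from sktime.benchmarking.orchestration import Orchestrator
from sktime.benchmarking.strategies import TSCStrategy
from sktime.benchmarking.tasks import TSCTask
from sktime.series_as_features.model_selection import PresplitFilesCV  # path assumed

DATA_PATH = "data/Univariate_ts"  # hypothetical location of the .ts files
RESULTS_PATH = "results"

# one dataset/task pair per problem to benchmark (names are illustrative)
names = ["GunPoint", "ItalyPowerDemand"]
datasets = [UEADataset(path=DATA_PATH, name=name) for name in names]
tasks = [TSCTask(target="class_val") for _ in names]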
Example #9
# `run` is the ambient Azure ML run context (run = Run.get_context())
train = processed_data_df.sample(frac=args.train_data_split, random_state=42)
test = processed_data_df.drop(train.index)

# Example logging
run.log(
    "data_split_fraction",
    args.train_data_split,
    "Fraction of samples used for training",
)
run.log("train_samples", train.shape[0], "Number of samples used for training")
run.log("test_samples", test.shape[0], "Number of samples used for testing")

# Train
task = TSCTask(target="label", metadata=train)
clf = TimeSeriesForestClassifier(n_estimators=args.n_estimators)
strategy = TSCStrategy(clf)
strategy.fit(task, train)
run.log("n_estimators", args.n_estimators,
        "Number of tree estimators used in the model")

# Metrics
y_pred = strategy.predict(test)
y_test = test[task.target]
accuracy = accuracy_score(y_test, y_pred)
run.log("Accuracy", f"{accuracy:1.3f}", "Accuracy of model")

# Add to outputs
os.makedirs("outputs", exist_ok=True)
local_model_path = os.path.join("outputs", "model.pkl")
dump(strategy, local_model_path)
run.upload_file("pickled_model", local_model_path)
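
Once uploaded, the artifact can also be registered as a versioned model in the workspace; a minimal follow-up sketch (the model name is hypothetical):

# register the uploaded artifact; "tsc-model" is a hypothetical name
model = run.register_model(model_name="tsc-model", model_path="pickled_model")
print(model.name, model.version)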