예제 #1
0
def test_min_length(n_intervals, min_length):
    series_len = 30
    x = np.arange(series_len)

    tran = RandomIntervalSegmenter(n_intervals=n_intervals,
                                   min_length=min_length)
    intervals = tran._rand_intervals_fixed_n(x, n_intervals=n_intervals)
    starts = intervals[:, 0]
    ends = intervals[:, 1]
    assert np.all(ends - starts >= min_length)  # minimum length
예제 #2
0
def rise_benchmarking():
    for i in range(len(benchmark_datasets)):
        dataset = benchmark_datasets[i]
        print(str(i) + " problem = " + dataset)
        rise = fb.RandomIntervalSpectralForest(n_estimators=100)
        exp.run_experiment(overwrite=True,
                           problem_path=data_dir,
                           results_path=results_dir,
                           cls_name="PythonRISE",
                           classifier=rise,
                           dataset=dataset,
                           train_file=False)
        steps = [('segment',
                  RandomIntervalSegmenter(n_intervals=1, min_length=5)),
                 ('transform',
                  FeatureUnion([('acf',
                                 RowTransformer(
                                     FunctionTransformer(func=acf_coefs,
                                                         validate=False))),
                                ('ps',
                                 RowTransformer(
                                     FunctionTransformer(func=powerspectrum,
                                                         validate=False)))])),
                 ('tabularise', Tabularizer()),
                 ('clf', DecisionTreeClassifier())]
        base_estimator = Pipeline(steps)
        rise = TimeSeriesForestClassifier(estimator=base_estimator,
                                          n_estimators=100)
        exp.run_experiment(overwrite=True,
                           problem_path=data_dir,
                           results_path=results_dir,
                           cls_name="PythonRISEComposite",
                           classifier=rise,
                           dataset=dataset,
                           train_file=False)
def test_different_implementations():
    random_state = 1233
    X_train, y_train = make_classification_problem()

    # Compare with chained transformations.
    tran1 = RandomIntervalSegmenter(n_intervals='sqrt',
                                    random_state=random_state)
    tran2 = RowTransformer(FunctionTransformer(func=np.mean, validate=False))
    A = tran2.fit_transform(tran1.fit_transform(X_train))

    tran = RandomIntervalFeatureExtractor(n_intervals='sqrt',
                                          features=[np.mean],
                                          random_state=random_state)
    B = tran.fit_transform(X_train)

    np.testing.assert_array_equal(A, B)
def test_different_pipelines():
    random_state = 1233
    X_train, y_train = make_classification_problem()
    steps = [
        ('segment',
         RandomIntervalSegmenter(n_intervals='sqrt',
                                 random_state=random_state)),
        ('transform',
         FeatureUnion([
             ('mean',
              RowTransformer(FunctionTransformer(func=np.mean,
                                                 validate=False))),
             ('std',
              RowTransformer(FunctionTransformer(func=np.std,
                                                 validate=False))),
             ('slope',
              RowTransformer(
                  FunctionTransformer(func=time_series_slope,
                                      validate=False))),
         ])),
    ]
    pipe = Pipeline(steps)
    a = pipe.fit_transform(X_train)
    tran = RandomIntervalFeatureExtractor(
        n_intervals='sqrt',
        features=[np.mean, np.std, time_series_slope],
        random_state=random_state)
    b = tran.fit_transform(X_train)
    np.testing.assert_array_equal(a, b)
    np.testing.assert_array_equal(pipe.steps[0][1].intervals_, tran.intervals_)
예제 #5
0
def test_FeatureUnion_pipeline():
    # pipeline with segmentation plus multiple feature extraction
    steps = [
        ("segment", RandomIntervalSegmenter(n_intervals=3)),
        (
            "transform",
            FeatureUnion([
                (
                    "mean",
                    RowTransformer(
                        FunctionTransformer(func=np.mean, validate=False)),
                ),
                (
                    "std",
                    RowTransformer(
                        FunctionTransformer(func=np.std, validate=False)),
                ),
            ]),
        ),
        ("clf", DecisionTreeClassifier()),
    ]
    clf = Pipeline(steps)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    assert y_pred.shape[0] == y_test.shape[0]
    np.testing.assert_array_equal(np.unique(y_pred), np.unique(y_test))
예제 #6
0
def test_rand_intervals_rand_n(random_state):
    tran = RandomIntervalSegmenter(random_state=random_state)
    series_len = 30
    x = np.arange(series_len)

    intervals = tran._rand_intervals_rand_n(x)
    assert intervals.ndim == 2
    assert np.issubdtype(intervals.dtype, np.integer)
    # assert intervals.shape[0] == np.unique(intervals, axis=0).shape[0]  #
    # no duplicates

    starts = intervals[:, 0]
    ends = intervals[:, 1]
    assert np.all(ends <= x.size)  # within bounds
    assert np.all(starts >= 0)  # within bounds
    assert np.all(ends > starts)  # only non-empty intervals
예제 #7
0
def set_classifier(cls, resampleId):
    """
    Basic way of determining the classifier to build. To differentiate settings just and another elif. So, for example, if
    you wanted tuned TSF, you just pass TuneTSF and set up the tuning mechanism in the elif.
    This may well get superceded, it is just how e have always done it
    :param cls: String indicating which classifier you want
    :return: A classifier.

    """
    if cls.lower() == 'pf':
        return pf.ProximityForest(random_state=resampleId)
    elif cls.lower() == 'pt':
        return pf.ProximityTree(random_state=resampleId)
    elif cls.lower() == 'ps':
        return pf.ProximityStump(random_state=resampleId)
    elif cls.lower() == 'rise':
        return fb.RandomIntervalSpectralForest(random_state=resampleId)
    elif cls.lower() == 'tsf':
        return ib.TimeSeriesForest(random_state=resampleId)
    elif cls.lower() == 'boss':
        return db.BOSSEnsemble(random_state=resampleId)
    elif cls.lower() == 'cboss':
        return db.BOSSEnsemble(random_state=resampleId,
                               randomised_ensemble=True,
                               max_ensemble_size=50)
    elif cls.lower() == 'tde':
        return tde.TemporalDictionaryEnsemble(random_state=resampleId)
    elif cls.lower() == 'st':
        return st.ShapeletTransformClassifier(time_contract_in_mins=1500)
    elif cls.lower() == 'dtwcv':
        return nn.KNeighborsTimeSeriesClassifier(metric="dtwcv")
    elif cls.lower() == 'ee' or cls.lower() == 'elasticensemble':
        return dist.ElasticEnsemble()
    elif cls.lower() == 'tsfcomposite':
        #It defaults to TSF
        return ensemble.TimeSeriesForestClassifier()
    elif cls.lower() == 'risecomposite':
        steps = [('segment',
                  RandomIntervalSegmenter(n_intervals=1, min_length=5)),
                 ('transform',
                  FeatureUnion([('acf',
                                 RowTransformer(
                                     FunctionTransformer(func=acf_coefs,
                                                         validate=False))),
                                ('ps',
                                 RowTransformer(
                                     FunctionTransformer(func=powerspectrum,
                                                         validate=False)))])),
                 ('tabularise', Tabularizer()),
                 ('clf', DecisionTreeClassifier())]
        base_estimator = Pipeline(steps)
        return ensemble.TimeSeriesForestClassifier(estimator=base_estimator,
                                                   n_estimators=100)
    else:
        raise Exception('UNKNOWN CLASSIFIER')
예제 #8
0
def test_output_format_dim(n_timepoints, n_instances, n_intervals):
    X = generate_df_from_array(np.ones(n_timepoints),
                               n_rows=n_instances,
                               n_cols=1)

    trans = RandomIntervalSegmenter(n_intervals=n_intervals)
    Xt = trans.fit_transform(X)

    # Check number of rows and output type.
    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == X.shape[0]

    # Check number of generated intervals/columns.
    if n_intervals != 'random':
        if np.issubdtype(type(n_intervals), np.floating):
            assert Xt.shape[1] == np.maximum(1,
                                             int(n_timepoints * n_intervals))
        elif np.issubdtype(type(n_intervals), np.integer):
            assert Xt.shape[1] == n_intervals
        elif n_intervals == 'sqrt':
            assert Xt.shape[1] == np.maximum(1, int(np.sqrt(n_timepoints)))
        elif n_intervals == 'log':
            assert Xt.shape[1] == np.maximum(1, int(np.log(n_timepoints)))
def test_equivalent_model_specifications(n_intervals, n_estimators):
    random_state = 1234
    X_train, y_train = load_gunpoint(split="train", return_X_y=True)
    X_test, y_test = load_gunpoint(split="test", return_X_y=True)

    # Due to tie-breaking/floating point rounding in the final decision tree
    # classifier, the results depend on the
    # exact column order of the input data

    #  Compare pipeline predictions outside of ensemble.
    steps = [
        ('segment',
         RandomIntervalSegmenter(n_intervals=n_intervals,
                                 random_state=random_state)),
        ('transform',
         FeatureUnion([
             ('mean',
              RowTransformer(FunctionTransformer(func=np.mean,
                                                 validate=False))),
             ('std',
              RowTransformer(FunctionTransformer(func=np.std,
                                                 validate=False))),
             ('slope',
              RowTransformer(
                  FunctionTransformer(func=time_series_slope, validate=False)))
         ])), ('clf', DecisionTreeClassifier(random_state=random_state))
    ]
    clf1 = Pipeline(steps)
    clf1.fit(X_train, y_train)
    a = clf1.predict(X_test)

    steps = [('transform',
              RandomIntervalFeatureExtractor(
                  n_intervals=n_intervals,
                  features=[np.mean, np.std, time_series_slope],
                  random_state=random_state)),
             ('clf', DecisionTreeClassifier(random_state=random_state))]
    clf2 = Pipeline(steps)
    clf2.fit(X_train, y_train)
    b = clf2.predict(X_test)
    np.array_equal(a, b)
예제 #10
0
def test_random_state():
    X = generate_df_from_array(np.random.normal(size=10))
    random_state = 1234

    for n_intervals in [0.5, 10, 'sqrt', 'random', 'log']:
        trans = RandomIntervalSegmenter(n_intervals=n_intervals,
                                        random_state=random_state)
        first_Xt = trans.fit_transform(X)
        for _ in range(N_ITER):
            trans = RandomIntervalSegmenter(n_intervals=n_intervals,
                                            random_state=random_state)
            Xt = trans.fit_transform(X)
            np.testing.assert_array_equal(
                tabularize(first_Xt).values,
                tabularize(Xt).values)
예제 #11
0
def tsf_benchmarking():
    for i in range(len(benchmark_datasets)):
        dataset = benchmark_datasets[i]
        print(str(i) + " problem = " + dataset)
        tsf = ib.TimeSeriesForest(n_estimators=100)
        exp.run_experiment(overwrite=False,
                           problem_path=data_dir,
                           results_path=results_dir,
                           cls_name="PythonTSF",
                           classifier=tsf,
                           dataset=dataset,
                           train_file=False)
        steps = [
            ('segment', RandomIntervalSegmenter(n_intervals='sqrt')),
            ('transform',
             FeatureUnion([('mean',
                            RowTransformer(
                                FunctionTransformer(func=np.mean,
                                                    validate=False))),
                           ('std',
                            RowTransformer(
                                FunctionTransformer(func=np.std,
                                                    validate=False))),
                           ('slope',
                            RowTransformer(
                                FunctionTransformer(func=time_series_slope,
                                                    validate=False)))])),
            ('clf', DecisionTreeClassifier())
        ]
        base_estimator = Pipeline(steps)
        tsf = TimeSeriesForestClassifier(estimator=base_estimator,
                                         n_estimators=100)
        exp.run_experiment(overwrite=False,
                           problem_path=data_dir,
                           results_path=results_dir,
                           cls_name="PythonTSFComposite",
                           classifier=tsf,
                           dataset=dataset,
                           train_file=False)
예제 #12
0
def set_classifier(cls, resampleId):
    """
    Basic way of determining the classifier to build. To differentiate settings just and another elif. So, for example, if
    you wanted tuned TSF, you just pass TuneTSF and set up the tuning mechanism in the elif.
    This may well get superceded, it is just how e have always done it
    :param cls: String indicating which classifier you want
    :return: A classifier.

    """
    if cls.lower() == 'pf':
        return pf.ProximityForest(random_state=resampleId)
    elif cls.lower() == 'pt':
        return pf.ProximityTree(random_state=resampleId)
    elif cls.lower() == 'ps':
        return pf.ProximityStump(random_state=resampleId)
    elif cls.lower() == 'rise':
        return fb.RandomIntervalSpectralForest(random_state=resampleId)
    elif cls.lower() == 'tsf':
        return ib.TimeSeriesForest(random_state=resampleId)
    elif cls.lower() == 'boss':
        return db.BOSSEnsemble()
    elif cls.lower() == 'st':
        return st.ShapeletTransformClassifier(time_contract_in_mins=1500)
    elif cls.lower() == 'dtw':
        return nn.KNeighborsTimeSeriesClassifier(metric="dtw")
    elif cls.lower() == 'ee' or cls.lower() == 'elasticensemble':
        return dist.ElasticEnsemble()
    elif cls.lower() == 'shapedtw_raw':
        return ShapeDTW(subsequence_length=30,
                        shape_descriptor_function="raw",
                        metric_params=None)
    elif cls.lower() == 'shapedtw_dwt':
        return ShapeDTW(subsequence_length=30,
                        shape_descriptor_function="dwt",
                        metric_params={"num_levels_dwt": 3})
    elif cls.lower() == 'shapedtw_paa':
        return ShapeDTW(subsequence_length=30,
                        shape_descriptor_function="paa",
                        metric_params={"num_intervals_paa": 5})
    elif cls.lower() == 'shapedtw_slope':
        return ShapeDTW(subsequence_length=30,
                        shape_descriptor_function="slope",
                        metric_params={"num_intervals_slope": 5})
    elif cls.lower() == 'shapedtw_hog1d':
        return ShapeDTW(subsequence_length=30,
                        shape_descriptor_function="hog1d",
                        metric_params={
                            "num_bins_hog1d": 8,
                            "num_intervals_hog1d": 2,
                            "scaling_factor_hog1d": 0.1
                        })
    elif cls.lower() == 'tsfcomposite':
        #It defaults to TSF
        return ensemble.TimeSeriesForestClassifier()
    elif cls.lower() == 'risecomposite':
        steps = [('segment',
                  RandomIntervalSegmenter(n_intervals=1, min_length=5)),
                 ('transform',
                  FeatureUnion([('acf',
                                 RowTransformer(
                                     FunctionTransformer(func=acf_coefs,
                                                         validate=False))),
                                ('ps',
                                 RowTransformer(
                                     FunctionTransformer(func=powerspectrum,
                                                         validate=False)))])),
                 ('tabularise', Tabularizer()),
                 ('clf', DecisionTreeClassifier())]
        base_estimator = Pipeline(steps)
        return ensemble.TimeSeriesForestClassifier(estimator=base_estimator,
                                                   n_estimators=100)
    else:
        raise Exception('UNKNOWN CLASSIFIER')
예제 #13
0
def test_bad_input_args(bad_interval):
    X = generate_df_from_array(np.ones(10), n_rows=10, n_cols=2)
    with pytest.raises(ValueError):
        RandomIntervalSegmenter(n_intervals=bad_interval).fit(X)
예제 #14
0
def main():
    #1. Loading and splitting the dataset
    X_train, y_train = load_italy_power_demand(split='train', return_X_y=True)
    X_test, y_test = load_italy_power_demand(split='test', return_X_y=True)
    print('Shape of X, y train and test dataset', X_train.shape, y_train.shape,
          X_test.shape, y_test.shape, '\n')
    print('X_train:', X_train.head(), '\n')
    print('\nX_train info', X_train.info(), '\n')

    labels, counts = np.unique(y_train, return_counts=True)
    print(
        '\nThere are', labels,
        'labels in this dataset, one corresponds to winter and the other to summer. The counter of each one is',
        counts, '\n')

    #2. Creating a Model, Fit and Predict Sklearn Classifier
    #Sktime Tabularizing the data
    X_train_tab = tabularize(X_train)
    X_test_tab = tabularize(X_test)
    print('\n X_train tabularized\n', X_train_tab.head(), '\n')

    #2.1 SKlearn RandomForest Classifier
    classifier = RandomForestClassifier(n_estimators=100)
    classifier.fit(X_train_tab, y_train)
    y_pred = classifier.predict(X_test_tab)
    print('Accuracy sklearn RandomForestClassifier',
          round(accuracy_score(y_test, y_pred), 4), '\n')

    #2.2 Same SKlearn as above but using make_pipeline w/ Sktime Tabularizer
    classifier = make_pipeline(Tabularizer(),
                               RandomForestClassifier(n_estimators=100),
                               verbose=True)
    classifier.fit(X_train, y_train)
    print(
        'Accuracy sklearn RandomForestClassifier using sklearn make_pipeline in which the first step is to sktime Tabularize()',
        round(classifier.score(X_test, y_test), 4), '\n')

    #3 Sklearn using make_pipeline w/ Sktime TSFreshFeatureExtractor
    classifier = make_pipeline(TSFreshFeatureExtractor(show_warnings=False),
                               RandomForestClassifier(n_estimators=100))
    classifier.fit(X_train, y_train)
    print(
        'Accuracy sklearn RandomForestClassifier using sklearn make_pipeline in which the first step is to sktime TSFreshFeatureExtractor that automatically extracts and filters several key statistical features from the nested X_train time series',
        round(classifier.score(X_test, y_test), 4), '\n')

    #4. Using Time series algorithms and classifiers from sklearn/sktime
    steps = [
        ('segment', RandomIntervalSegmenter(n_intervals='sqrt')),  #Sktime
        (
            'transform',
            FeatureUnion([  #Sklearn
                ('mean',
                 RowTransformer(
                     FunctionTransformer(func=np.mean,
                                         validate=False))),  #sktime
                ('std',
                 RowTransformer(
                     FunctionTransformer(func=np.std,
                                         validate=False))),  #sktime
                ('slope',
                 RowTransformer(
                     FunctionTransformer(func=time_series_slope,
                                         validate=False)))  #sktime
            ])),
        ('clf', DecisionTreeClassifier())  #From Sklearn
    ]
    time_series_tree = Pipeline(steps, verbose=True)  #sklearn
    time_series_tree.fit(X_train, y_train)
    print(
        'Accuracy sklearn DecisionTreeClassifier using sklearn Pipeline() as well as segmentation and transformation techniques from sktime and sklearn',
        round(time_series_tree.score(X_test, y_test), 4))

    #5. Using Time series Sktime
    tsf = TimeSeriesForestClassifier(n_estimators=100, verbose=True)
    tsf.fit(X_train, y_train)
    print('Accuracy sktime TimeSeriesForestClassifier',
          round(tsf.score(X_test, y_test), 4))