def test_different_pipelines():
    """Check that the modular pipeline and the single extractor agree.

    A ``RandomIntervalSegmenter`` followed by a ``FeatureUnion`` of row-wise
    mean, std and slope transformers is compared against one
    ``RandomIntervalFeatureExtractor`` built with the same features and the
    same random seed. Both the transformed output and the fitted intervals
    must be equal.
    """
    seed = 1233
    X_train, y_train = make_classification_problem()

    def _row_wise(func):
        # Wrap a plain function so that it is applied row by row.
        return RowTransformer(FunctionTransformer(func=func, validate=False))

    # Modular specification: segment first, then compute the features.
    segmenter = RandomIntervalSegmenter(n_intervals='sqrt', random_state=seed)
    feature_union = FeatureUnion([
        ('mean', _row_wise(np.mean)),
        ('std', _row_wise(np.std)),
        ('slope', _row_wise(time_series_slope)),
    ])
    pipe = Pipeline([('segment', segmenter), ('transform', feature_union)])
    a = pipe.fit_transform(X_train)

    # Single-transformer specification of the same computation.
    tran = RandomIntervalFeatureExtractor(
        n_intervals='sqrt',
        features=[np.mean, np.std, time_series_slope],
        random_state=seed,
    )
    b = tran.fit_transform(X_train)

    np.testing.assert_array_equal(a, b)
    fitted_segmenter = pipe.steps[0][1]
    np.testing.assert_array_equal(fitted_segmenter.intervals_, tran.intervals_)
def test_output_format_dim(n_instances, n_timepoints, n_intervals, features):
    """Check the type, row count and values of the extractor output.

    The input is built by ``generate_df_from_array`` from an all-ones array
    of length ``n_timepoints``. The transformed result must be a
    ``pd.DataFrame`` with as many rows as the input, and every value in it
    must equal one.
    """
    ones = np.ones(n_timepoints)
    X = generate_df_from_array(ones, n_rows=n_instances, n_cols=1)
    expected_rows, _ = X.shape

    extractor = RandomIntervalFeatureExtractor(n_intervals=n_intervals,
                                               features=features)
    Xt = extractor.fit_transform(X)

    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == expected_rows
    assert np.array_equal(Xt.values, np.ones(Xt.shape))
def test_different_implementations():
    """Check chained transformers match the single feature extractor.

    Segmenting with ``RandomIntervalSegmenter`` and then applying a row-wise
    ``np.mean`` must give the same result as a
    ``RandomIntervalFeatureExtractor`` configured with ``features=[np.mean]``
    and the same random seed.
    """
    seed = 1233
    X_train, y_train = make_classification_problem()

    # Chained transformations: segment, then take the mean of each segment.
    segmenter = RandomIntervalSegmenter(n_intervals='sqrt', random_state=seed)
    mean_transformer = RowTransformer(
        FunctionTransformer(func=np.mean, validate=False))
    segments = segmenter.fit_transform(X_train)
    A = mean_transformer.fit_transform(segments)

    # Single transformer doing both steps at once.
    extractor = RandomIntervalFeatureExtractor(
        n_intervals='sqrt', features=[np.mean], random_state=seed)
    B = extractor.fit_transform(X_train)

    np.testing.assert_array_equal(A, B)
def test_TimeSeriesForest_predictions(n_estimators, n_intervals):
    """Check an explicit pipeline estimator reproduces the default forest.

    A ``TimeSeriesForestClassifier`` given an explicit pipeline (feature
    extractor plus decision tree) must return the same class probabilities
    on the GunPoint test split as a forest built with default settings and
    the same seed and number of estimators.
    """
    # NOTE(review): ``n_intervals`` is accepted but never used in this body.
    seed = 1234
    X_train, y_train = load_gunpoint(split="train", return_X_y=True)
    X_test, y_test = load_gunpoint(split="test", return_X_y=True)

    # Fully modular specification with an explicit base estimator.
    extractor = RandomIntervalFeatureExtractor(
        random_state=seed,
        features=[np.mean, np.std, time_series_slope],
    )
    estimator = Pipeline([
        ('transform', extractor),
        ('clf', DecisionTreeClassifier()),
    ])
    modular_forest = TimeSeriesForestClassifier(estimator=estimator,
                                                random_state=seed,
                                                n_estimators=n_estimators)
    modular_forest.fit(X_train, y_train)
    modular_proba = modular_forest.predict_proba(X_test)

    # Default specification, which builds its base estimator internally.
    default_forest = TimeSeriesForestClassifier(random_state=seed,
                                                n_estimators=n_estimators)
    default_forest.fit(X_train, y_train)
    default_proba = default_forest.predict_proba(X_test)

    np.testing.assert_array_equal(modular_proba, default_proba)
示例#5
0
def main():
    """Train a time series forest on generated data and pickle the model.

    All configuration is read from the module-level ``args`` object. The
    first column of the training data is used as the label and the remaining
    columns as the series values. Any exception raised while fitting or
    saving is printed rather than propagated.
    """
    generator = DataGenerator(
        labeled_data_file=args.labeled_data_file,
        data_util_file=args.data_util_file,
        threshold=args.threshold,
        dt=args.dt,
        L=args.L,
        tmin=args.tmin,
        tmax=args.tmax,
    )
    training_data, test_data = generator.get_data(
        ts_nth_element=args.ts_nth_element, training_frac=0.7)

    # Base estimator: interval features followed by a decision tree.
    extractor = RandomIntervalFeatureExtractor(
        n_intervals='sqrt',
        features=[np.mean, np.std, time_series_slope],
    )
    time_series_tree = Pipeline([
        ('extract', extractor),
        ('clf', DecisionTreeClassifier()),
    ])

    # Anything other than 'entropy' falls back to 'gini'.
    if args.criterion == 'entropy':
        split_criterion = 'entropy'
    else:
        split_criterion = 'gini'

    tsf = TimeSeriesForestClassifier(
        estimator=time_series_tree,
        n_estimators=args.n_estimators,
        criterion=split_criterion,
        bootstrap=True,
        oob_score=True,
        random_state=1,
        verbose=1,
    )

    series_columns = pd.DataFrame(training_data[:, 1:])
    x = detabularize(series_columns)
    try:
        with parallel_backend('threading', n_jobs=args.n_jobs):
            tsf = tsf.fit(x, training_data[:, 0])
        model_path = '{save_file_name}.pickle'.format(
            save_file_name=args.save_file_name)
        with open(model_path, 'wb') as TimeSeriesForestModel:
            pickle.dump(tsf, TimeSeriesForestModel,
                        protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as ex:
        # Best effort: report the failure and return normally.
        print(ex)
示例#6
0
    def _validate_estimator(self):

        if not isinstance(self.n_estimators, numbers.Integral):
            raise ValueError("n_estimators must be an integer, "
                             "got {0}.".format(type(self.n_estimators)))

        if self.n_estimators <= 0:
            raise ValueError("n_estimators must be greater than zero, "
                             "got {0}.".format(self.n_estimators))

        # Set base estimator
        if self.estimator is None:
            # Set default time series forest
            features = [np.mean, np.std, time_series_slope]
            steps = [('transform',
                      RandomIntervalFeatureExtractor(
                          n_intervals='sqrt',
                          features=features,
                          random_state=self.random_state)),
                     ('clf',
                      DecisionTreeRegressor(random_state=self.random_state))]
            self.estimator_ = Pipeline(steps)

        else:
            # else check given estimator is a pipeline with prior
            # transformations and final decision tree
            if not isinstance(self.estimator, Pipeline):
                raise ValueError('`estimator` must be '
                                 'pipeline with transforms.')
            if not isinstance(self.estimator.steps[-1][1],
                              DecisionTreeRegressor):
                raise ValueError('Last step in `estimator` must be '
                                 'DecisionTreeRegressor.')
            self.estimator_ = self.estimator

        # Set parameters according to naming in pipeline
        estimator_params = {
            "criterion": self.criterion,
            "max_depth": self.max_depth,
            "min_samples_split": self.min_samples_split,
            "min_samples_leaf": self.min_samples_leaf,
            "min_weight_fraction_leaf": self.min_weight_fraction_leaf,
            "max_features": self.max_features,
            "max_leaf_nodes": self.max_leaf_nodes,
            "min_impurity_decrease": self.min_impurity_decrease,
            "min_impurity_split": self.min_impurity_split,
        }
        final_estimator = self.estimator_.steps[-1][0]
        self.estimator_params = {
            f'{final_estimator}__{pname}': pval
            for pname, pval in estimator_params.items()
        }

        # Set renamed estimator parameters
        for pname, pval in self.estimator_params.items():
            self.__setattr__(pname, pval)
def test_results(n_instances, n_timepoints, n_intervals):
    """Check the extracted features against directly computed values.

    A single random series ``x`` is passed to ``generate_df_from_array`` to
    build the input. For every fitted interval, the mean, standard deviation
    and slope computed directly on ``x[start:end]`` must equal the values in
    the matching output columns.
    """
    x = np.random.normal(size=n_timepoints)
    X = generate_df_from_array(x, n_rows=n_instances, n_cols=1)
    t = RandomIntervalFeatureExtractor(
        n_intervals=n_intervals, features=[np.mean, np.std, time_series_slope])
    Xt = t.fit_transform(X)

    # Check results
    intervals = t.intervals_
    for start, end in intervals:
        segment = x[start:end]
        expected_mean = np.mean(segment)
        expected_std = np.std(segment)
        expected_slope = time_series_slope(segment)

        # ``DataFrame.filter(like=...)`` is a plain substring match, not a
        # glob. A leading ``*`` would be matched literally, select no
        # columns and make the assertion below pass vacuously.
        actual_means = Xt.filter(like=f'_{start}_{end}_mean').values
        actual_stds = Xt.filter(like=f'_{start}_{end}_std').values
        actual_slopes = Xt.filter(
            like=f'_{start}_{end}_time_series_slope').values

        assert np.all(actual_means == expected_mean)
        assert np.all(actual_stds == expected_std)
        assert np.all(actual_slopes == expected_slope)
def test_equivalent_model_specifications(n_intervals, n_estimators):
    """Check two equivalent pipeline specifications predict identically.

    A fully modular pipeline (segmenter, feature union, decision tree) and a
    pipeline using ``RandomIntervalFeatureExtractor`` directly are fitted on
    the GunPoint training split with the same seed. Their predictions on the
    test split must be equal.
    """
    # NOTE(review): ``n_estimators`` is accepted but never used in this body.
    random_state = 1234
    X_train, y_train = load_gunpoint(split="train", return_X_y=True)
    X_test, y_test = load_gunpoint(split="test", return_X_y=True)

    # Due to tie-breaking/floating point rounding in the final decision tree
    # classifier, the results depend on the
    # exact column order of the input data

    #  Compare pipeline predictions outside of ensemble.
    steps = [
        ('segment',
         RandomIntervalSegmenter(n_intervals=n_intervals,
                                 random_state=random_state)),
        ('transform',
         FeatureUnion([
             ('mean',
              RowTransformer(FunctionTransformer(func=np.mean,
                                                 validate=False))),
             ('std',
              RowTransformer(FunctionTransformer(func=np.std,
                                                 validate=False))),
             ('slope',
              RowTransformer(
                  FunctionTransformer(func=time_series_slope, validate=False)))
         ])), ('clf', DecisionTreeClassifier(random_state=random_state))
    ]
    clf1 = Pipeline(steps)
    clf1.fit(X_train, y_train)
    a = clf1.predict(X_test)

    steps = [('transform',
              RandomIntervalFeatureExtractor(
                  n_intervals=n_intervals,
                  features=[np.mean, np.std, time_series_slope],
                  random_state=random_state)),
             ('clf', DecisionTreeClassifier(random_state=random_state))]
    clf2 = Pipeline(steps)
    clf2.fit(X_train, y_train)
    b = clf2.predict(X_test)

    # Assert the comparison. A bare ``np.array_equal(a, b)`` only returns a
    # boolean, so the test could never fail.
    np.testing.assert_array_equal(a, b)
示例#9
0
# Experiment-wide constants.
RANDOM_STATE = 1
OUTER_CV_N_SPLITS = 30
INNER_N_SPLITS = 10
RESULTS_PATH = "results"

# Hyper-parameter search space.
n_estimators_list = [50, 100, 200, 300, 400, 500]
n_intervals_list = [0.05, 0.1, 0.25, 0.5, 'log', 'sqrt']
# Every feature set starts with mean/std/slope; the later ones add
# skew and/or kurtosis.
features_list = [
    [np.mean, np.std, time_series_slope, *extras]
    for extras in ([], [skew], [kurtosis], [skew, kurtosis])
]

# Keys follow the nested-parameter naming of the pipeline steps below.
param_grid = {
    'n_estimators': n_estimators_list,
    'estimator__transform__n_intervals': n_intervals_list,
    'estimator__transform__features': features_list,
}

# Base pipeline: interval feature extraction followed by a decision tree.
BASE_ESTIMATOR = Pipeline([
    ('transform', RandomIntervalFeatureExtractor()),
    ('estimator', DecisionTreeClassifier()),
])

# Silence user and future warnings from here on.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Location of the input data, relative to the user's home directory.
HOME = os.path.expanduser("~")
DATA_PATH = os.path.join(HOME, "Documents/Research/data/Univariate_ts")

# Alternatively, we can use a helper function to create them automatically
datasets = make_datasets(path=DATA_PATH,
                         dataset_cls=UEADataset,
def test_bad_features(bad_features):
    """Check that an invalid ``features`` argument raises ``ValueError``."""
    X, y = make_classification_problem()
    with pytest.raises(ValueError):
        # Pass the bad value as ``features``. Passing it as ``n_intervals``
        # would only exercise the interval validation.
        # ``fit_transform`` is used so the error surfaces whether the
        # argument is validated while fitting or while transforming.
        RandomIntervalFeatureExtractor(features=bad_features).fit_transform(X)