Пример #1
0
def main():
    """Fit a time-series forest on generated training data and pickle it.

    All settings are read from the module-level ``args`` object. The forest
    is composed of pipelines that extract mean, standard deviation and slope
    over random intervals and feed them to a decision tree. Any exception
    raised while fitting or saving is printed instead of propagated.
    """
    data_source = DataGenerator(
        labeled_data_file=args.labeled_data_file,
        data_util_file=args.data_util_file,
        threshold=args.threshold,
        dt=args.dt,
        L=args.L,
        tmin=args.tmin,
        tmax=args.tmax,
    )
    # Only the training part of the split is used in this function.
    training_rows, _held_out_rows = data_source.get_data(
        ts_nth_element=args.ts_nth_element,
        training_frac=0.7,
    )

    interval_features = RandomIntervalFeatureExtractor(
        n_intervals='sqrt',
        features=[np.mean, np.std, time_series_slope],
    )
    tree_pipeline = Pipeline([
        ('extract', interval_features),
        ('clf', DecisionTreeClassifier()),
    ])

    # Anything other than 'entropy' falls back to 'gini'.
    if args.criterion == 'entropy':
        split_criterion = 'entropy'
    else:
        split_criterion = 'gini'

    forest = TimeSeriesForestClassifier(
        estimator=tree_pipeline,
        n_estimators=args.n_estimators,
        criterion=split_criterion,
        bootstrap=True,
        oob_score=True,
        random_state=1,
        verbose=1,
    )

    # Column 0 is passed as the target below; the remaining columns are
    # converted to the nested format expected by the classifier.
    nested_features = detabularize(pd.DataFrame(training_rows[:, 1:]))

    try:
        with parallel_backend('threading', n_jobs=args.n_jobs):
            forest = forest.fit(nested_features, training_rows[:, 0])
        pickle_path = '{save_file_name}.pickle'.format(
            save_file_name=args.save_file_name)
        with open(pickle_path, 'wb') as model_file:
            pickle.dump(forest, model_file, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as ex:
        print(ex)
def test_TimeSeriesForest_predictions(n_estimators, n_intervals):
    """Check that the composite and default forests predict identically.

    A forest built from an explicit ``RandomIntervalFeatureExtractor`` +
    ``DecisionTreeClassifier`` pipeline must return the same class
    probabilities as a forest constructed with default settings, given the
    same random state. ``n_intervals`` is accepted but not used in this body.
    """
    seed = 1234
    X_train, y_train = load_gunpoint(split="train", return_X_y=True)
    X_test, y_test = load_gunpoint(split="test", return_X_y=True)

    # Explicitly composed base estimator.
    extractor = RandomIntervalFeatureExtractor(
        random_state=seed,
        features=[np.mean, np.std, time_series_slope],
    )
    composed_tree = Pipeline(
        [("transform", extractor), ("clf", DecisionTreeClassifier())]
    )
    composed_forest = TimeSeriesForestClassifier(
        estimator=composed_tree, random_state=seed, n_estimators=n_estimators
    )
    composed_forest.fit(X_train, y_train)
    composed_proba = composed_forest.predict_proba(X_test)

    # Forest relying on the classifier's default base estimator.
    default_forest = TimeSeriesForestClassifier(
        random_state=seed, n_estimators=n_estimators
    )
    default_forest.fit(X_train, y_train)
    default_proba = default_forest.predict_proba(X_test)

    np.testing.assert_array_equal(composed_proba, default_proba)
Пример #3
0
def tsf_classifier(X_train, X_test, y_train, y_test):
    """Fit a concatenate-then-forest pipeline and return its test score.

    The pipeline applies ``ColumnConcatenator`` followed by a
    ``TimeSeriesForestClassifier`` with 100 estimators; the returned value is
    whatever ``Pipeline.score`` yields on ``(X_test, y_test)``.
    """
    forest = TimeSeriesForestClassifier(n_estimators=100)
    pipeline = Pipeline([
        ('concatenate', ColumnConcatenator()),
        ('classify', forest),
    ])
    pipeline.fit(X_train, y_train)
    test_score = pipeline.score(X_test, y_test)
    return test_score
Пример #4
0
def rise_benchmarking():
    """Run the built-in and the composite RISE classifier on every dataset.

    For each entry of ``benchmark_datasets`` two experiments are executed via
    ``exp.run_experiment``: one with ``fb.RandomIntervalSpectralForest`` and
    one with a ``TimeSeriesForestClassifier`` whose base estimator is a
    segment -> (acf, power spectrum) -> tabularise -> decision tree pipeline.
    """
    for i, dataset in enumerate(benchmark_datasets):
        print(str(i) + " problem = " + dataset)

        # Settings common to both experiments on this dataset.
        shared_settings = dict(
            overwrite=True,
            problem_path=data_dir,
            results_path=results_dir,
            dataset=dataset,
            train_file=False,
        )

        builtin_rise = fb.RandomIntervalSpectralForest(n_estimators=100)
        exp.run_experiment(cls_name="PythonRISE",
                           classifier=builtin_rise,
                           **shared_settings)

        # Composite equivalent assembled from individual transformers.
        acf_step = RowTransformer(
            FunctionTransformer(func=acf_coefs, validate=False))
        ps_step = RowTransformer(
            FunctionTransformer(func=powerspectrum, validate=False))
        base_estimator = Pipeline([
            ('segment', RandomIntervalSegmenter(n_intervals=1, min_length=5)),
            ('transform', FeatureUnion([('acf', acf_step), ('ps', ps_step)])),
            ('tabularise', Tabularizer()),
            ('clf', DecisionTreeClassifier()),
        ])
        composite_rise = TimeSeriesForestClassifier(estimator=base_estimator,
                                                    n_estimators=100)
        exp.run_experiment(cls_name="PythonRISEComposite",
                           classifier=composite_rise,
                           **shared_settings)
Пример #5
0
def _rf_scorer(X_train, X_test, y_train, y_test):
    """Score a ColumnConcatenator + TimeSeriesForestClassifier pipeline.

    The pipeline is fitted on the training split and the result of
    ``Pipeline.score`` on the test split is returned.
    """
    scorer = Pipeline([
        ('concatenate', ColumnConcatenator()),
        ('classify', TimeSeriesForestClassifier(n_estimators=100)),
    ])
    scorer.fit(X_train, y_train)
    return scorer.score(X_test, y_test)
Пример #6
0
def tsf_benchmarking():
    """Benchmark the built-in and the composite time series forest.

    Each dataset in ``benchmark_datasets`` is evaluated twice through
    ``exp.run_experiment``: with ``ib.TimeSeriesForest`` and with a
    ``TimeSeriesForestClassifier`` built on a random-interval pipeline that
    extracts mean, standard deviation and slope per interval.
    """
    for i, dataset in enumerate(benchmark_datasets):
        print(str(i) + " problem = " + dataset)

        # Arguments that are the same for both runs on this dataset.
        run_settings = dict(
            overwrite=False,
            problem_path=data_dir,
            results_path=results_dir,
            dataset=dataset,
            train_file=False,
        )

        exp.run_experiment(
            cls_name="PythonTSF",
            classifier=ib.TimeSeriesForest(n_estimators=100),
            **run_settings,
        )

        # One row-wise transformer per summary statistic, in a fixed order.
        summary_functions = (
            ("mean", np.mean),
            ("std", np.std),
            ("slope", time_series_slope),
        )
        feature_union = FeatureUnion(
            [
                (
                    label,
                    make_row_transformer(
                        FunctionTransformer(func=function, validate=False)
                    ),
                )
                for label, function in summary_functions
            ]
        )
        base_estimator = Pipeline(
            [
                ("segment", RandomIntervalSegmenter(n_intervals="sqrt")),
                ("transform", feature_union),
                ("clf", DecisionTreeClassifier()),
            ]
        )
        exp.run_experiment(
            cls_name="PythonTSFComposite",
            classifier=TimeSeriesForestClassifier(
                estimator=base_estimator, n_estimators=100
            ),
            **run_settings,
        )
def test_predict_proba():
    """Check shape and normalisation of ``predict_proba`` output.

    Uses the module-level ``X``, ``y`` and ``n_classes`` fixtures. Verifies
    that probabilities have one row per sample and one column per class, that
    every row sums to one, and that single-row input yields single-row output
    from both ``predict_proba`` and ``predict``.
    """
    clf = TimeSeriesForestClassifier(n_estimators=2)
    clf.fit(X, y)
    proba = clf.predict_proba(X)

    assert proba.shape == (X.shape[0], n_classes)
    # The row sums are floating-point results, so compare with a tolerance
    # rather than demanding bit-exact equality with 1.0.
    np.testing.assert_allclose(np.sum(proba, axis=1), np.ones(X.shape[0]))

    # test single row input
    y_proba = clf.predict_proba(X.iloc[[0], :])
    assert y_proba.shape == (1, n_classes)

    y_pred = clf.predict(X.iloc[[0], :])
    assert y_pred.shape == (1,)
    def fit(self, luck_average_windows, assessment_windows, until=None, max_horizon=9 * 6):
        """Fit one column-ensemble classifier per assessment window.

        For every assessment window not larger than ``max_horizon`` (iteration
        stops at the first window that exceeds it) a temporary ts file is
        prepared, loaded, and used to fit a ``ColumnEnsembleClassifier`` that
        holds one ``TimeSeriesForestClassifier`` per entry of
        ``luck_average_windows``. Fitted ensembles are appended to
        ``self.classifiers``.

        :param luck_average_windows: sequence; its length determines how many
            per-column forests are created.
        :param assessment_windows: iterable of window sizes to fit for.
        :param until: optional end index into ``self.data_points``; when it is
            negative or not smaller than the number of data points an error
            is logged and nothing is fitted.
        :param max_horizon: largest assessment window that is still fitted;
            stored in ``self.horizon``.
        """
        debug_template = ("max_horizon: {} / avg windows: {} / assmnt windows: {} / until: {} / "
                          "total_data_size: {}")
        logger("MODEL-FIT").debug(
            debug_template.format(max_horizon,
                                  str(luck_average_windows),
                                  str(assessment_windows),
                                  until,
                                  len(self.data_points)))

        # Guard clause: reject an explicit end index outside the data range.
        if until is not None:
            if until < 0 or until >= len(self.data_points):
                logger("MODEL-FIT").error(
                    "Parameter until is too large for the given data points: {}".format(until))
                return

        self.horizon = max_horizon
        for window_index, window in enumerate(assessment_windows):
            if window > self.horizon:
                break

            # prepare data frame for sktime package
            end_index = until if until is not None else len(self.data_points)
            fit_file = self.prepare_ts_file(0, end_index, self.case_observation_size,
                                            window_index, window)

            # parse data frames from the temporary fit data file
            X, y = load_from_tsfile_to_dataframe(fit_file, replace_missing_vals_with="-100")

            # The class-weight dict depends on which label comes first.
            true_index = 1 if y[0] == "false" else 0
            new_class_weights = self.create_class_weight_dict(true_index=true_index)

            # One forest per column index, named TSF0, TSF1, ...
            estimators = [
                ("TSF{}".format(column),
                 TimeSeriesForestClassifier(
                     n_estimators=int(self.no_estimators),
                     n_jobs=16,
                     max_depth=self.max_depth,
                     class_weight=new_class_weights,
                     criterion=self.criterion,
                     min_samples_split=self.min_samples_split,
                     min_samples_leaf=self.min_samples_leaf,
                     oob_score=self.oob_score,
                     bootstrap=self.bootstrap),
                 [column])
                for column in range(len(luck_average_windows))
            ]

            ensemble = ColumnEnsembleClassifier(estimators=estimators)
            ensemble.fit(X, y)
            self.classifiers.append(ensemble)
Пример #9
0
def main(args):
    """Train, evaluate and persist a time series forest inside a run.

    Reads the "rawdata" input dataset of the module-level ``run``, prepares
    it, splits it into train/test, fits a ``TSCStrategy`` wrapping a
    ``TimeSeriesForestClassifier``, logs sample counts and accuracy to the
    run, and dumps the fitted strategy under ``outputs/``.

    :param args: object providing ``timeserieslength``, ``threshold``,
        ``train_data_split``, ``n_estimators`` and ``model_filename``.
    """
    # Load and wrangle data
    source_df = run.input_datasets["rawdata"].to_pandas_dataframe()
    prepared_df = prepare_dataframe(
        source_df,
        time_series_length=args.timeserieslength,
        threshold=args.threshold,
    )

    # Split data: sample the training rows, everything else is test data.
    train = prepared_df.sample(frac=args.train_data_split, random_state=42)
    test = prepared_df.drop(train.index)

    # Record the split in the run log.
    split_log_entries = (
        ("data_split_fraction", args.train_data_split,
         "Fraction of samples used for training"),
        ("train_samples", train.shape[0],
         "Number of samples used for training"),
        ("test_samples", test.shape[0],
         "Number of samples used for testing"),
    )
    for metric_name, metric_value, metric_description in split_log_entries:
        run.log(metric_name, metric_value, metric_description)

    # Train
    task = TSCTask(target="label", metadata=train)
    strategy = TSCStrategy(
        TimeSeriesForestClassifier(n_estimators=args.n_estimators))
    strategy.fit(task, train)
    run.log("n_estimators", args.n_estimators,
            "Number of tree estimators used in the model")

    # Metrics
    predictions = strategy.predict(test)
    expected = test[task.target]
    accuracy = accuracy_score(expected, predictions)
    run.log("Accuracy", f"{accuracy:1.3f}", "Accuracy of model")

    # Persist model
    os.makedirs("outputs", exist_ok=True)
    dump(strategy, os.path.join("outputs", args.model_filename))
Пример #10
0
def test_stat():
    """Check ranking and sign test of the evaluator on the gunpoint data.

    Two single-estimator strategies (proximity forest "pf" and time series
    forest "tsf") are orchestrated on one split; the resulting mean accuracy
    ranks and the sign-test table are compared against fixed expected values.
    """
    gunpoint = load_gunpoint(split="train")
    dataset = RAMDataset(dataset=gunpoint, name="gunpoint")
    task = TSCTask(target="class_val")

    strategy_fc = TSCStrategy(
        TimeSeriesForestClassifier(n_estimators=1, random_state=1), name="tsf"
    )
    strategy_pf = TSCStrategy(
        ProximityForest(n_estimators=1, random_state=1), name="pf"
    )

    # result backend
    results = RAMResults()
    Orchestrator(
        datasets=[dataset],
        tasks=[task],
        strategies=[strategy_pf, strategy_fc],
        cv=SingleSplit(random_state=1),
        results=results,
    ).fit_predict(save_fitted_strategies=False)

    evaluator = Evaluator(results)
    _ = evaluator.evaluate(
        metric=PairwiseMetric(func=accuracy_score, name="accuracy")
    )

    ranks = evaluator.rank(ascending=True)
    # Expected ranks: "pf" first, "tsf" second.
    observed_ranks = [
        ranks.loc[ranks.strategy == "pf", "accuracy_mean_rank"].item(),
        ranks.loc[ranks.strategy == "tsf", "accuracy_mean_rank"].item(),
    ]
    expected_ranks = [1, 2]

    _, sign_test_df = evaluator.sign_test()
    observed_signs = [
        [sign_test_df[strategy_name][0], sign_test_df[strategy_name][1]]
        for strategy_name in ("pf", "tsf")
    ]
    expected_signs = [[1, 1], [1, 1]]

    np.testing.assert_equal(
        [observed_ranks, observed_signs], [expected_ranks, expected_signs]
    )
Пример #11
0
def tsf_benchmarking():
    """Benchmark the built-in and the composite time series forest.

    Every dataset in ``benchmark_datasets`` is run through
    ``exp.run_experiment`` twice: once with ``ib.TimeSeriesForest`` and once
    with a ``TimeSeriesForestClassifier`` whose base estimator segments random
    intervals, extracts mean / std / slope row-wise and fits a decision tree.
    """
    for i, dataset in enumerate(benchmark_datasets):
        print(str(i) + " problem = " + dataset)

        # Keyword arguments identical for both experiments on this dataset.
        common = dict(overwrite=False,
                      problem_path=data_dir,
                      results_path=results_dir,
                      dataset=dataset,
                      train_file=False)

        builtin_forest = ib.TimeSeriesForest(n_estimators=100)
        exp.run_experiment(cls_name="PythonTSF",
                           classifier=builtin_forest,
                           **common)

        # Build the row-wise feature extractors in the order mean, std, slope.
        row_features = [
            (feature_name,
             RowTransformer(FunctionTransformer(func=feature_func,
                                                validate=False)))
            for feature_name, feature_func in (('mean', np.mean),
                                               ('std', np.std),
                                               ('slope', time_series_slope))
        ]
        base_estimator = Pipeline([
            ('segment', RandomIntervalSegmenter(n_intervals='sqrt')),
            ('transform', FeatureUnion(row_features)),
            ('clf', DecisionTreeClassifier()),
        ])
        composite_forest = TimeSeriesForestClassifier(
            estimator=base_estimator, n_estimators=100)
        exp.run_experiment(cls_name="PythonTSFComposite",
                           classifier=composite_forest,
                           **common)
Пример #12
0
def main():
    """Walk through several ways of classifying the Italy power demand data.

    The function loads the train/test splits, prints a short description of
    the data and then fits and reports the test accuracy of:

    1. a scikit-learn random forest on tabularized data,
    2. the same forest behind a ``Tabularizer`` pipeline step,
    3. a random forest on features from ``TSFreshFeatureExtractor``,
    4. a single decision tree on random-interval mean / std / slope features,
    5. sktime's ``TimeSeriesForestClassifier``.

    Everything is reported through ``print``; nothing is returned.
    """
    #1. Loading and splitting the dataset
    X_train, y_train = load_italy_power_demand(split='train', return_X_y=True)
    X_test, y_test = load_italy_power_demand(split='test', return_X_y=True)
    print('Shape of X, y train and test dataset', X_train.shape, y_train.shape,
          X_test.shape, y_test.shape, '\n')
    print('X_train:', X_train.head(), '\n')
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called on its own; passing it to print() would print a stray "None".
    print('\nX_train info')
    X_train.info()
    print('\n')

    labels, counts = np.unique(y_train, return_counts=True)
    print(
        '\nThere are', labels,
        'labels in this dataset, one corresponds to winter and the other to summer. The counter of each one is',
        counts, '\n')

    #2. Creating a Model, Fit and Predict Sklearn Classifier
    #Sktime Tabularizing the data
    X_train_tab = tabularize(X_train)
    X_test_tab = tabularize(X_test)
    print('\n X_train tabularized\n', X_train_tab.head(), '\n')

    #2.1 SKlearn RandomForest Classifier
    classifier = RandomForestClassifier(n_estimators=100)
    classifier.fit(X_train_tab, y_train)
    y_pred = classifier.predict(X_test_tab)
    print('Accuracy sklearn RandomForestClassifier',
          round(accuracy_score(y_test, y_pred), 4), '\n')

    #2.2 Same SKlearn as above but using make_pipeline w/ Sktime Tabularizer
    classifier = make_pipeline(Tabularizer(),
                               RandomForestClassifier(n_estimators=100),
                               verbose=True)
    classifier.fit(X_train, y_train)
    print(
        'Accuracy sklearn RandomForestClassifier using sklearn make_pipeline in which the first step is to sktime Tabularize()',
        round(classifier.score(X_test, y_test), 4), '\n')

    #3 Sklearn using make_pipeline w/ Sktime TSFreshFeatureExtractor
    classifier = make_pipeline(TSFreshFeatureExtractor(show_warnings=False),
                               RandomForestClassifier(n_estimators=100))
    classifier.fit(X_train, y_train)
    print(
        'Accuracy sklearn RandomForestClassifier using sklearn make_pipeline in which the first step is to sktime TSFreshFeatureExtractor that automatically extracts and filters several key statistical features from the nested X_train time series',
        round(classifier.score(X_test, y_test), 4), '\n')

    #4. Using Time series algorithms and classifiers from sklearn/sktime
    steps = [
        ('segment', RandomIntervalSegmenter(n_intervals='sqrt')),  #Sktime
        (
            'transform',
            FeatureUnion([  #Sklearn
                ('mean',
                 RowTransformer(
                     FunctionTransformer(func=np.mean,
                                         validate=False))),  #sktime
                ('std',
                 RowTransformer(
                     FunctionTransformer(func=np.std,
                                         validate=False))),  #sktime
                ('slope',
                 RowTransformer(
                     FunctionTransformer(func=time_series_slope,
                                         validate=False)))  #sktime
            ])),
        ('clf', DecisionTreeClassifier())  #From Sklearn
    ]
    time_series_tree = Pipeline(steps, verbose=True)  #sklearn
    time_series_tree.fit(X_train, y_train)
    print(
        'Accuracy sklearn DecisionTreeClassifier using sklearn Pipeline() as well as segmentation and transformation techniques from sktime and sklearn',
        round(time_series_tree.score(X_test, y_test), 4))

    #5. Using Time series Sktime
    tsf = TimeSeriesForestClassifier(n_estimators=100, verbose=True)
    tsf.fit(X_train, y_train)
    print('Accuracy sktime TimeSeriesForestClassifier',
          round(tsf.score(X_test, y_test), 4))
Пример #13
0
# Script flow: raw data -> generate_long_table -> from_long_to_nested
# -> (X_nested, y) -> train/test split -> column-ensemble classifier.

# Long-format table built from `ts` (defined outside this excerpt).
X = generate_long_table(ts)
# NOTE(review): the bare .head() calls have no effect outside an interactive
# session; they look like notebook leftovers.
X.head()

# Nested representation derived from the long table.
X_nested = from_long_to_nested(X)
X_nested.head()
# NOTE(review): only a single label is active here; the remaining labels are
# commented out. Confirm this matches the number of cases in X_nested.
y = np.array(['a'])  # , 'b', 'a', 'b', 'a', 'b', 'a', 'b'])

print(X_nested)

X_train, X_test, y_train, y_test = train_test_split(X_nested, y)
print(X.head())
# One time series forest per selected column; the trailing list in each tuple
# is the column selection handed to ColumnEnsembleClassifier.
classifier = ColumnEnsembleClassifier(estimators=[
    ("TSF1", TimeSeriesForestClassifier(n_estimators=100), [1]),
    ("TSF2", TimeSeriesForestClassifier(n_estimators=100), [2]),
])
classifier.fit(X_train, y_train)

# Predict on the held-out test portion to see how well the model learned.
y_pred = classifier.predict(X_test)
# Compare the predictions with the true test labels using `accuracy_score`.
print("Accuracy score is: " + str(accuracy_score(y_test, y_pred)))


def generate_example_long_table(num_cases=50, series_len=20, num_dims=2):
    """Allocate the case-id buffer for an example long-format table.

    NOTE(review): the remainder of this function is not visible in this
    excerpt; as shown, it only computes the table dimensions and allocates
    an uninitialised integer array with one slot per row, and returns None.

    Parameters
    ----------
    num_cases : int
        Number of cases in the table.
    series_len : int
        Number of readings per dimension of a case.
    num_dims : int
        Number of dimensions per case.
    """
    rows_per_case = series_len * num_dims
    total_rows = num_cases * series_len * num_dims

    # ``np.int`` was only an alias of the builtin ``int``; it was deprecated
    # in NumPy 1.20 and removed in 1.24, so the builtin is used directly.
    case_ids = np.empty(total_rows, dtype=int)
Пример #14
0
# Convert the incoming train/test objects to DataFrames.
# NOTE(review): presumably these are pandas Series at this point — confirm
# against the code above this excerpt.
X_train_timedata = X_train_timedata.to_frame()
X_test_timedata = X_test_timedata.to_frame()

# Rebuild a single-column frame from the 'combine' column, keeping the index.
ts_train = pd.Series(X_train_timedata['combine'].values,
                     index=X_train_timedata.index)
X_ts_train = ts_train.to_frame()

ts_test = pd.Series(X_test_timedata['combine'].values,
                    index=X_test_timedata.index)
X_ts_test = ts_test.to_frame()

# Wrap every cell of the training column in its own pd.Series so that each
# row holds one series object.
for row_num in range(0, X_ts_train.shape[0]):
    series1 = pd.Series(X_ts_train.iat[row_num, 0])
    X_ts_train.iat[row_num, 0] = series1

# Same cell-wise wrapping for the test column.
for row_num in range(0, X_ts_test.shape[0]):
    series2 = pd.Series(X_ts_test.iat[row_num, 0])
    X_ts_test.iat[row_num, 0] = series2

## =======================Column ensembling================================
# Ensemble with a single 5-tree time series forest on column 0.
clf = ColumnEnsembleClassifier(estimators=[
    ("TSF0", TimeSeriesForestClassifier(n_estimators=5), [0]),
])

# "Efficiency" is the wall-clock duration of fit() in seconds.
start_time = time.time()
clf.fit(X_ts_train, y_train)
Efficiency = time.time() - start_time
# "Accuracy" is the value returned by the ensemble's score() on the test set.
Accuracy = clf.score(X_ts_test, y_test)
print("Efficiency is:\n", Efficiency)
print("Accuracy is :\n", Accuracy)
Пример #15
0
    signal_names = ["chan_%d" % x for x in range(num_channels)]
    return signal_names, X, y

def testlime(signal_names, clf, x, y):
    """Explain one prediction with LIME for time series and plot it.

    Builds a ``LimeTimeSeriesExplainer`` whose only class name is ``y``,
    explains instance ``x`` using ``clf.predict_proba`` for label id 0, and
    shows the explanation as a matplotlib figure.
    """
    target_label = 0

    explainer = lime_ts.LimeTimeSeriesExplainer(
        class_names=[y],
        signal_names=signal_names,
    )

    # 20 slices, 10 reported features, 100 perturbed samples; perturbed
    # slices are replaced using the 'total_mean' method.
    explanation = explainer.explain_instance(
        x,
        clf.predict_proba,
        num_features=10,
        num_samples=100,
        num_slices=20,
        labels=[target_label],
        replacement_method='total_mean',
    )
    explanation.as_pyplot_figure(target_label)
    plt.show()

if __name__ == '__main__':
    # Let INFO-level records through the root logger for this demo run.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    signal_names, X, y = genDataset(80, 4, 30)

    # Concatenate the columns, then classify with a 100-tree forest.
    clf = Pipeline([
        ("concatenate", ColumnConcatenator()),
        ("classify", TimeSeriesForestClassifier(n_estimators=100)),
    ])
    clf.fit(X, y)

    # Explain the prediction for the first sample and its label.
    first_sample = X[0]
    first_label = y[0]
    testlime(signal_names, clf, first_sample, first_label)
Пример #16
0
models = {
    "features":
    make_pipeline(TruncationTransformer(lower=MAX_LENGTH),
                  TSFreshFeatureExtractor(default_fc_parameters="efficient",
                                          show_warnings=False,
                                          n_jobs=-1),
                  RandomForestClassifier(n_jobs=-1, random_state=1),
                  verbose=True),
    "interval":
    make_pipeline(TruncationTransformer(lower=15000),
                  TimeSeriesForestClassifier(
                      estimator=time_series_tree,
                      n_estimators=100,
                      criterion="entropy",
                      bootstrap=True,
                      oob_score=True,
                      random_state=1,
                      n_jobs=-1,
                  ),
                  verbose=True),
    "shapelet":
    make_pipeline(TruncationTransformer(lower=1000),
                  ContractedShapeletTransform(
                      time_contract_in_mins=10,
                      num_candidates_to_sample_per_case=10,
                      verbose=2,
                      random_state=1),
                  RandomForestClassifier(n_estimators=100,
                                         n_jobs=-1,
                                         random_state=1),
Пример #17
0
                                                               random_state=j)

        # set CV
        _, counts = np.unique(y_train, return_counts=True)
        n_splits = np.minimum(counts.min(), INNER_N_SPLITS)
        n_repeats = np.maximum(1, INNER_N_SPLITS // n_splits)
        # cv = StratifiedKFold(n_splits=n_splits, shuffle=True,
        # random_state=RANDOM_STATE)
        inner_cv = RepeatedStratifiedKFold(n_splits=n_splits,
                                           n_repeats=n_repeats,
                                           random_state=RANDOM_STATE)
        print(f'Dataset: {i + 1}/{n_datasets} {dataset.name} - n_splits: '
              f'{j + 1}/{OUTER_CV_N_SPLITS}')

        # set estimator
        estimator = TimeSeriesForestClassifier(BASE_ESTIMATOR, n_jobs=-1)
        gscv = GridSearchCV(estimator,
                            param_grid,
                            scoring='neg_log_loss',
                            cv=inner_cv,
                            refit=True,
                            iid=False,
                            error_score='raise',
                            verbose=True)

        # tune when enough samples for all classes are available
        start = time.time()
        gscv.fit(X_train, y_train)
        results[0] = time.time() - start

        # predict
Пример #18
0
# Convert the test set to the sktime dataset format.
X_test = tslearn.utils.to_sktime_dataset(X_test)
# Stack the validation labels under the training labels, then flatten both
# label arrays into 1-D pandas Series.
# NOTE(review): this assumes X_train already contains the validation samples
# in the same order — confirm against the code above this excerpt.
y_train = np.vstack([y_train, y_val])
y_train = pd.Series(y_train.reshape(-1))
y_test = pd.Series(y_test.reshape(-1))


# Time series random forest, fitted separately on each of the first two columns
for i, col in enumerate(col_names[:2]):
    print(col)

    # Select a single feature column (kept as a one-column frame)
    X_train_step = X_train.iloc[:, [i]]
    X_test_step = X_test.iloc[:, [i]]

    # Time series forest classifier with default settings
    classifier = TimeSeriesForestClassifier()
    classifier.fit(X_train_step, y_train)
    y_pred = classifier.predict(X_test_step)
    
    # Test-set metrics for this column
    print(f'accuracy_test: {accuracy_score(y_test, y_pred)}')
    print(f"recall_test: {recall_score(y_test, y_pred)}")
    print(f"precisoin_test: {precision_score(y_test, y_pred)}")
    print(f"f1_test: {f1_score(y_test, y_pred)}")



# clf2 = pickle.loads(s)
# clf2.predict(X_test[0:1])

Пример #19
0
def main(model, input_training_raster, train_feature, input_test_raster,
         test_feature, input_test_csv, result_path, n_channels, n_jobs,
         model_path, raster_to_classify, patch_size, output_raster,
         train_ratio, n_estimators, max_depth, max_num_of_samples_per_class):
    """Train, evaluate and optionally apply a land-cover classifier.

    Steps: build the train/test samples, remap class labels to ``0..n-1``,
    normalise the features with the 2nd/98th percentiles of the training
    data, fit the selected model, save it with ``joblib``, write the
    evaluation scores to ``trainlog.csv``, save the normalisation bounds to
    ``min_Max.txt`` and, when ``raster_to_classify`` is given, classify that
    raster.

    Args:
        model: one of ``"RF"`` (random forest), ``"SVM"`` (bagged linear
            SVC, one-vs-rest) or ``"RF_TS"`` (sktime time series forest).
        input_training_raster, train_feature: inputs handed to
            ``generate_training_data`` / ``split_train_feature`` for the
            training samples.
        input_test_raster, test_feature: optional separate test inputs; used
            only when both are truthy.
        input_test_csv: optional headerless CSV with test samples, used when
            no test raster/feature pair is given.
        result_path: output directory; results go to ``result_path/model``.
        n_channels: number of channels per time step (used by ``"RF_TS"`` to
            reshape the flat feature vectors, and by ``classify_image``).
        n_jobs: parallel jobs passed to the classifier.
        model_path: where to save the fitted model; defaults to
            ``Best_model.pkl`` inside the result directory when falsy.
        raster_to_classify, patch_size, output_raster: inference settings
            forwarded to ``classify_image``.
        train_ratio: ratio forwarded to ``split_train_feature``.
        n_estimators, max_depth: classifier hyper-parameters.
        max_num_of_samples_per_class: cap forwarded to
            ``generate_training_data``.

    Raises:
        ValueError: if ``model`` is not a supported model name. This is
            checked before any file or directory is touched.
    """
    supported_models = ("RF", "SVM", "RF_TS")
    if model not in supported_models:
        # Without this check the function would run the whole data
        # preparation and then fail with a NameError on the undefined
        # classifier.
        raise ValueError(
            f"Unsupported model {model!r}; expected one of {supported_models}")

    # ---- output files (makedirs also creates the parent result directory)
    result_path = os.path.join(result_path, model)
    os.makedirs(result_path, exist_ok=True)
    print("Model: ", model)

    # Generating train/test datasets; column 0 holds the class label.
    train_list, test_list, _ = split_train_feature(train_feature, train_ratio)
    train_data = generate_training_data(input_training_raster, train_feature,
                                        train_list,
                                        max_num_of_samples_per_class)
    X_train, y_train = train_data[:, 1:], train_data[:, 0]

    if input_test_raster and test_feature:
        _, test_list, _ = split_train_feature(test_feature, train_ratio=0)
        test_data = generate_training_data(input_test_raster, test_feature,
                                           test_list)
        X_test, y_test = test_data[:, 1:], test_data[:, 0]
    elif input_test_csv:
        df = pd.read_csv(input_test_csv, sep=',', header=None)
        test_data = np.asarray(df.values)
        # NOTE(review): the CSV layout skips column 1 (features start at
        # column 2) — confirm this matches the file format.
        X_test, y_test = test_data[:, 2:], test_data[:, 0]
    else:
        # Fall back to the held-out part of the training feature split.
        test_data = generate_training_data(input_training_raster,
                                           train_feature, test_list,
                                           max_num_of_samples_per_class)
        X_test, y_test = test_data[:, 1:], test_data[:, 0]

    # np.unique returns the labels already sorted.
    lc_ids_old = np.unique(y_train)
    n_classes_train = len(lc_ids_old)
    if len(np.unique(y_test)) != n_classes_train:
        print("WARNING: different number of classes in train and test")

    # Remap the land-cover ids to consecutive indices starting at 0. All
    # positions are located before any label is overwritten so that a new id
    # can never be mistaken for an old one.
    lc_ids_new = np.arange(n_classes_train)
    for labels in (y_train, y_test):
        positions = [np.where(labels == lc_id)[0] for lc_id in lc_ids_old]
        for index, lc_id_new in zip(positions, lc_ids_new):
            labels[index] = lc_id_new

    # Row 0: original ids, row 1: remapped ids.
    relation = np.vstack((lc_ids_old, lc_ids_new))

    if model in ("RF", "SVM"):
        is_ts = False
        # ---- Normalizing the data per feature column
        min_per = np.percentile(X_train, 2, axis=0)
        max_per = np.percentile(X_train, 100 - 2, axis=0)
        X_train = (X_train - min_per) / (max_per - min_per)
        X_test = (X_test - min_per) / (max_per - min_per)

        if model == "RF":
            clf = RandomForestClassifier(n_estimators=n_estimators,
                                         max_depth=max_depth,
                                         criterion='entropy',
                                         random_state=None,
                                         verbose=0,
                                         n_jobs=n_jobs)
        else:  # "SVM"
            clf = OneVsRestClassifier(
                BaggingClassifier(SVC(kernel='linear', cache_size=200),
                                  max_samples=1.0,
                                  n_estimators=n_estimators,
                                  verbose=0,
                                  n_jobs=n_jobs))

    else:  # "RF_TS"
        from sktime.classification.compose import TimeSeriesForestClassifier
        from sktime.transformations.panel.compose import ColumnConcatenator

        is_ts = True

        # Reshape the flat vectors to (samples, time steps, channels).
        X_train = X_train.reshape(X_train.shape[0],
                                  X_train.shape[1] // n_channels,
                                  n_channels)
        X_test = X_test.reshape(X_test.shape[0],
                                X_test.shape[1] // n_channels, n_channels)

        # ---- Normalizing the data per band (over samples and time steps)
        min_per = np.percentile(X_train, 2, axis=(0, 1))
        max_per = np.percentile(X_train, 100 - 2, axis=(0, 1))
        X_train = (X_train - min_per) / (max_per - min_per)
        X_test = (X_test - min_per) / (max_per - min_per)

        clf = Pipeline([
            ("concatenate", ColumnConcatenator()),
            ("classify",
             TimeSeriesForestClassifier(n_estimators=n_estimators,
                                        max_depth=max_depth,
                                        n_jobs=n_jobs)),
        ])

    # Train classifier
    clf.fit(X_train, y_train)
    # Save trained classifier
    if not model_path:
        model_path = os.path.join(result_path, 'Best_model.pkl')
    joblib.dump(clf, model_path)

    # Evaluation
    start = time.time()
    y_pred = clf.predict(X_test)

    Classes = [f'class {i}' for i in np.unique(y_test)]
    scores = metrics(y_test, y_pred, Classes)

    # Elapsed prediction + scoring time, in minutes.
    scores["time"] = (time.time() - start) / 60

    log_df = pd.DataFrame({k: [v] for k, v in scores.items()})
    log_df.to_csv(os.path.join(result_path, "trainlog.csv"))

    print(
        scores["report"]
    )  # In report, precision means User_accuracy, recall means Producer_accuracy
    print(scores["confusion_matrix"])

    # ---- Save min_max
    minMaxVal_file = os.path.join(result_path, 'min_Max.txt')
    save_minMaxVal(minMaxVal_file, min_per, max_per)

    # Inference on raster
    if raster_to_classify:
        classify_image(raster_to_classify,
                       model_path,
                       output_raster,
                       n_channels,
                       patch_size=patch_size,
                       minmax=[min_per, max_per],
                       is_ts=is_ts,
                       relation=relation)
Пример #20
0
import pytest
from sktime.benchmarking.strategies import TSCStrategy
from sktime.benchmarking.tasks import TSCTask
from sktime.datasets import load_gunpoint
from sktime.datasets import load_italy_power_demand
from sktime.classification.compose import TimeSeriesForestClassifier

# Shared classifier instance used by the test below; two estimators keep the
# fit small.
classifier = TimeSeriesForestClassifier(n_estimators=2)

# Dataset loader callables the strategy test is parametrized over.
DATASET_LOADERS = (load_gunpoint, load_italy_power_demand)


# Test output of time-series classification strategies
@pytest.mark.parametrize("dataset", DATASET_LOADERS)
def test_TSCStrategy(dataset):
    """Predictions of a fitted TSCStrategy match the target column's shape."""
    train_split = dataset(split='train')
    test_split = dataset(split='test')

    task = TSCTask(target='class_val')
    strategy = TSCStrategy(classifier)
    strategy.fit(task, train_split)

    predictions = strategy.predict(test_split)
    expected_shape = test_split[task.target].shape
    assert predictions.shape == expected_shape