Example #1
    def __init__(self, n_clusters=200):
        self.knn = KNeighborsTimeSeriesClassifier(n_neighbors=1,
                                                  weights='uniform',
                                                  metric='euclidean',
                                                  n_jobs=-1)
        self.d = None
        self.n_clusters = n_clusters
Example #2
class kNNClassifier:
    def __init__(self,
                 n_neighbours=5,
                 mac_neighbours=None,
                 weights="uniform",
                 metric_params=None,
                 n_jobs=-1):
        self.n_neighbours = n_neighbours
        self.mac_neighbours = mac_neighbours
        self.weights = weights
        # None instead of {} avoids the mutable-default-argument pitfall
        self.metric_params = metric_params if metric_params is not None else {}
        self.n_jobs = n_jobs

    def get_params(self, deep=True):
        return {
            "n_neighbours": self.n_neighbours,
            "mac_neighbours": self.mac_neighbours,
            "weights": self.weights,
            "metric_params": self.metric_params,
            "n_jobs": self.n_jobs
        }

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

        self.model = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.n_neighbours,
            metric="euclidean",
            weights=self.weights,
            n_jobs=self.n_jobs).fit(self.X_train, self.y_train)
        return self

    def predict(self, X_test):
        if self.mac_neighbours is None:
            return self.model.predict(X_test)
        else:
            y_hat = []
            k_neighbors = self.model.kneighbors(
                X_test, n_neighbors=self.mac_neighbours, return_distance=False)
            for idx, k in enumerate(k_neighbors):
                X_train = self.X_train[k]
                y_train = self.y_train[k]
                # Re-rank the Euclidean pre-selection with DTW. Use a local
                # variable so the Euclidean model in self.model survives.
                dtw_model = KNeighborsTimeSeriesClassifier(
                    n_neighbors=self.n_neighbours,
                    metric="dtw",
                    weights=self.weights,
                    n_jobs=self.n_jobs,
                    metric_params=self.metric_params).fit(X_train, y_train)
                # Predict a single series; slicing keeps the 3-d dataset shape
                pred = dtw_model.predict(X_test[idx:idx + 1])
                y_hat.append(pred[0])
            return y_hat
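A minimal usage sketch for the wrapper above (synthetic data from tslearn's random_walk_blobs; this harness is an illustration, not part of the original project):

from tslearn.generators import random_walk_blobs

# 40 labelled random-walk series, 20 per class, length 30
X, y = random_walk_blobs(n_ts_per_blob=20, sz=30, d=1, n_blobs=2)

# Plain Euclidean kNN (mac_neighbours=None) ...
clf = kNNClassifier(n_neighbours=3).fit(X[:30], y[:30])
print(clf.predict(X[30:]))

# ... or two-stage: pre-select 10 candidates with Euclidean distance,
# then re-rank them with DTW
clf = kNNClassifier(n_neighbours=3, mac_neighbours=10).fit(X[:30], y[:30])
print(clf.predict(X[30:]))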
Example #3
    def __init__(self, n_neighbors):
        '''
            initialize KNN class with dynamic time warping distance metric

            hyperparameters:
                n_neighbors           : number of neighbors on which to make classification decision
        '''
        self.n_neighbors = n_neighbors
        self.knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=n_neighbors,
                                                      metric="dtw")
Example #4
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

        self.model = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.n_neighbours,
            metric="euclidean",
            weights=self.weights,
            n_jobs=self.n_jobs).fit(self.X_train, self.y_train)
        return self
Example #5
def test_variable_length_knn():
    X = to_time_series_dataset([[1, 2, 3, 4], [1, 2, 3], [2, 5, 6, 7, 8, 9],
                                [3, 5, 6, 7, 8]])
    y = [0, 0, 1, 1]
    clf = KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    clf = KNeighborsTimeSeriesClassifier(metric="softdtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])
Example #6
    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.hyperparams["n_neighbors"],
            metric=self.hyperparams["distance_metric"],
            weights=self.hyperparams["sample_weighting"],
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False
Example #7
def test_serialize_knn_classifier():
    n, sz, d = 15, 10, 3
    rng = numpy.random.RandomState(0)
    X = rng.randn(n, sz, d)
    y = rng.randint(low=0, high=3, size=n)

    knc = KNeighborsTimeSeriesClassifier()

    _check_not_fitted(knc)

    knc.fit(X, y)

    _check_params_predict(knc, X, ['predict'])
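For reference, a standalone round-trip sketch of the serialization being tested, assuming tslearn >= 0.4's to_pickle/from_pickle helpers (the file name is arbitrary):

import numpy
from tslearn.neighbors import KNeighborsTimeSeriesClassifier

rng = numpy.random.RandomState(0)
X = rng.randn(15, 10, 3)
y = rng.randint(low=0, high=3, size=15)

clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw").fit(X, y)
clf.to_pickle("knn.pickle")  # to_json / to_hdf5 variants also exist
restored = KNeighborsTimeSeriesClassifier.from_pickle("knn.pickle")
assert (restored.predict(X) == clf.predict(X)).all()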
Example #8
class Knn():
    def __init__(self, n_neighbors):
        '''
            initialize KNN class with dynamic time warping distance metric

            hyperparameters:
                n_neighbors           : number of neighbors on which to make classification decision
        '''
        self.n_neighbors = n_neighbors
        self.knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=n_neighbors,
                                                      metric="dtw")

    def __ScaleData(self, input_data):
        ''' 
            scale input data to range [0,1]

            parameters:
                input_data        : input data to rescale
        '''

        return TimeSeriesScalerMinMax().fit_transform(input_data)

    def fit(self, X_train, y_train):
        '''
            fit KNN classifier on training data

            parameters:
                X_train                : training time series
                y_train                : training labels
        '''
        # scale training data to between 0 and 1
        X_train_scaled = self.__ScaleData(X_train)
        self.knn_clf.fit(X_train_scaled, y_train)

    def predict(self, X_test):
        '''
            classifications for time series in test data set

            parameters:
                X_test:     test time series on which to predict classes

            returns: classifications for test data set
        '''
        # scale test data to between 0 and 1
        X_test_scaled = self.__ScaleData(X_test)
        return self.knn_clf.predict(X_test_scaled)
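A brief usage sketch for this Knn wrapper (random data following tslearn's (n_ts, sz, d) convention; hypothetical, not from the original project):

import numpy as np

rng = np.random.RandomState(0)
X_train, y_train = rng.randn(20, 30, 1), rng.randint(0, 2, size=20)
X_test = rng.randn(5, 30, 1)

model = Knn(n_neighbors=3)
model.fit(X_train, y_train)   # inputs are min-max scaled internally
print(model.predict(X_test))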
Example #9
def test_variable_cross_val():
    # TODO: here we just check that they can accept variable-length TS, not
    # that they do clever things
    X = to_time_series_dataset([[1, 2, 3, 4], [1, 2, 3], [1, 2, 3, 4],
                                [1, 2, 3], [2, 5, 6, 7, 8, 9], [3, 5, 6, 7, 8],
                                [2, 5, 6, 7, 8, 9], [3, 5, 6, 7, 8]])
    y = [0, 0, 0, 0, 1, 1, 1, 1]
    rng = np.random.RandomState(0)

    cv = KFold(n_splits=2, shuffle=True, random_state=rng)
    for estimator in [
            TimeSeriesSVC(kernel="gak", random_state=rng),
            TimeSeriesSVR(kernel="gak"),
            KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1),
            KNeighborsTimeSeriesClassifier(metric="softdtw", n_neighbors=1)
    ]:
        # TODO: cannot test for clustering methods since they don't have a
        # score method yet
        cross_val_score(estimator, X=X, y=y, cv=cv)
Example #10
    def predict(self, X_test):
        if self.mac_neighbours is None:
            return self.model.predict(X_test)
        else:
            y_hat = []
            k_neighbors = self.model.kneighbors(
                X_test, n_neighbors=self.mac_neighbours, return_distance=False)
            for idx, k in enumerate(k_neighbors):
                X_train = self.X_train[k]
                y_train = self.y_train[k]
                # Re-rank the Euclidean pre-selection with DTW. Use a local
                # variable so the Euclidean model in self.model survives.
                dtw_model = KNeighborsTimeSeriesClassifier(
                    n_neighbors=self.n_neighbours,
                    metric="dtw",
                    weights=self.weights,
                    n_jobs=self.n_jobs,
                    metric_params=self.metric_params).fit(X_train, y_train)
                # Predict a single series; slicing keeps the 3-d dataset shape
                pred = dtw_model.predict(X_test[idx:idx + 1])
                y_hat.append(pred[0])
            return y_hat
Example #11
def test_sax_scale():
    n, sz, d = 10, 10, 3
    rng = np.random.RandomState(0)
    X = rng.rand(n, sz, d)
    y = rng.choice([0, 1], size=n)

    sax = SymbolicAggregateApproximation(n_segments=3,
                                         alphabet_size_avg=2,
                                         scale=True)
    sax.fit(X)
    np.testing.assert_array_almost_equal(X,
                                         sax._unscale(sax._scale(X)))

    np.testing.assert_array_almost_equal(np.zeros((d, )),
                                         sax._scale(X).reshape((-1, d)).mean())
    np.testing.assert_array_almost_equal(np.ones((d, )),
                                         sax._scale(X).reshape((-1, d)).std())

    # Case of kNN-SAX
    knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="sax",
                                             metric_params={"scale": True})
    knn_sax.fit(X, y)
    X_scale_unscale = knn_sax._sax._unscale(knn_sax._sax._scale(X))
    np.testing.assert_array_almost_equal(X, X_scale_unscale)

    knn_sax.predict(X)
Example #12
File: NCC.py  Project: zfbi/dtan
def NearestCentroidClassification(X_train, X_test, y_train_n, y_test_n,
                                  dataset_name):
    '''

    :param X_train: if using DTAN, should already be aligned
    :param X_test: if using DTAN, should already be aligned
    :param y_train_n: numerical labels (not one-hot)
    :param y_test_n: numerical labels (not one-hot)
    :param dataset_name:
    :return: test set NCC accuracy
    '''

    # vars and placeholders
    input_shape = X_train.shape[1:]
    n_classes = len(np.unique(y_train_n))
    class_names = np.unique(y_train_n, axis=0)

    aligned_means = np.zeros((n_classes, input_shape[0], input_shape[1]))
    ncc_labels = []

    # Train set within class Euclidean mean
    for class_num in class_names:
        train_class_idx = y_train_n == class_num  # get indices
        X_train_aligned_within_class = X_train[train_class_idx]
        aligned_means[int(class_num), :] = np.mean(
            X_train_aligned_within_class, axis=0)
        ncc_labels.append(class_num)

    ncc_labels = np.asarray(ncc_labels)

    # Nearest neighbor classification - using euclidean distance
    knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="euclidean")
    knn_clf.fit(aligned_means, ncc_labels)

    predicted_labels = knn_clf.predict(X_test)
    acc = accuracy_score(y_test_n, predicted_labels)

    print(f"{dataset_name} - NCC results: {acc}")
Example #13
def main():
    dm = EMGDataManager(PATH_TO_DATA,
                        path_to_timestamps=PATH_TO_ACTION_LABELS,
                        downsampler=True)
    clf = KNeighborsTimeSeriesClassifier(n_neighbors=N_NEIGHBOURS,
                                         metric="dtw")
    nbr_clf_rob = NeighbourClassifier(clf)
    nbr_clf_lap = NeighbourClassifier(clf)

    # LOADING DATA
    # we can only use downsampled data here
    rob_data_mus1, rob_data_mus2, rob_data_mus3, rob_data_mus4, rob_data_mus5, rob_data_mus6 = \
        dm.get_ROB_data_downsampled()
    _, _, timestamps = dm.get_ROB_metadata()
    nbr_clf_rob.load_data(rob_data_mus1, rob_data_mus2, rob_data_mus3,
                          rob_data_mus4, rob_data_mus5, rob_data_mus6,
                          timestamps)

    # PREDICTIONS
    predictions = nbr_clf_rob.predict()
    actual = nbr_clf_rob.get_test_timestamps()

    print("ROB Prediction")
    print(predictions)
    print("ROB Actual")
    print(actual)

    lap_data_mus1, lap_data_mus2, lap_data_mus3, lap_data_mus4, lap_data_mus5, lap_data_mus6 = \
        dm.get_LAP_data_downsampled()
    _, _, timestamps = dm.get_LAP_metadata()
    nbr_clf_lap.load_data(lap_data_mus1, lap_data_mus2, lap_data_mus3,
                          lap_data_mus4, lap_data_mus5, lap_data_mus6,
                          timestamps)

    # PREDICTIONS
    predictions = nbr_clf_lap.predict()
    actual = nbr_clf_lap.get_test_timestamps()
    print("LAP Prediction")
    print(predictions)
    print("LAP Actual")
    print(actual)
Example #14
def NN1_DTWClassifier(X_train, Y_train):

    knn1_clf = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="dtw")
    knn1_clf.fit(X_train, Y_train)
    return knn1_clf
Example #15
    seq = np.genfromtxt(rep + dataset,
                        delimiter=' ',
                        dtype=str,
                        encoding="utf8")
    ids, counts = np.unique(seq[:, 0], return_counts=True)

    No = ids.shape[0]
    D = seq.shape[1] - 3
    arr = np.asarray((ids, counts)).T
    Max_Seq_Len = np.max(arr[:, 1].astype(int))

    out_X = np.zeros((No, Max_Seq_Len, D))
    out_Y = np.zeros((No, ))

    for idx, id in enumerate(ids):
        seq_cpy = seq[seq[:, 0] == id]
        out_X[idx] = seq_cpy[:, 3:]
        out_Y[idx] = seq_cpy[0, 2]
    return out_X, out_Y


x_train, y_train = convert_mts(rep, ds_train)
x_test, y_test = convert_mts(rep, ds_test)

clf = KNeighborsTimeSeriesClassifier(n_neighbors=2, metric="dtw")

y_test_pred = clf.fit(x_train, y=y_train).predict(x_test)

print("the accuracy score of the testing data is : " +
      accuracy_score(y_test, y_test_pred))
Example #16
def get_models(device, include_knn=False):
    """
    Get the models used in the comparison.

    :param device: a PyTorch device used for training / inference
    :param include_knn: True if k-NN DTW model should be used
    :return: list of models
    """

    models = [
        FeatureBasedClassifier(
            LogisticRegression(n_jobs=-1,
                               class_weight='balanced',
                               C=100.0,
                               max_iter=1000)),
        FeatureBasedClassifier(RandomForestClassifier(
            class_weight='balanced',
            max_depth=20,
            min_samples_leaf=5,
            n_estimators=1000,
            n_jobs=-1,
        ),
                               pipeline=get_default_rf_pipeline()),
        FeatureBasedClassifier(XGBClassifier(
            colsample_bytree=0.8,
            gamma=0.1,
            learning_rate=0.1,
            max_depth=7,
            min_child_weight=4,
            n_estimators=1000,
            nthread=8,
            subsample=0.8,
        ),
                               pipeline=None),
        FeatureBasedClassifier(
            MLPClassifier(early_stopping=True,
                          hidden_layer_sizes=(512, ),
                          batch_size=128)),
        SignalBasedClassifier(ResNet18,
                              device,
                              batch_size=32,
                              epochs=20,
                              optimizer_args=dict(lr=3e-4, weight_decay=1e-4)),
        SignalBasedClassifier(CnnGru,
                              device,
                              batch_size=32,
                              epochs=10,
                              gru_dropout=0,
                              gru_layers=1,
                              optimizer_args=dict(lr=3e-4, weight_decay=1e-4))
    ]

    if include_knn:
        models.append(
            DistanceBasedClassifier(
                KNeighborsTimeSeriesClassifier(
                    metric='dtw',
                    n_jobs=-1,
                    n_neighbors=1,
                )))

    return models
Example #17
X_train = X_shuffle[:n_ts_per_blob * n_blobs // 2]
X_test = X_shuffle[n_ts_per_blob * n_blobs // 2:]
y_train = y_shuffle[:n_ts_per_blob * n_blobs // 2]
y_test = y_shuffle[n_ts_per_blob * n_blobs // 2:]

# Nearest neighbor search
knn = KNeighborsTimeSeries(n_neighbors=3, metric="dtw")
knn.fit(X_train, y_train)
dists, ind = knn.kneighbors(X_test)
print("1. Nearest neighbour search")
print("Computed nearest neighbor indices (wrt DTW)\n", ind)
print("First nearest neighbor class:", y_test[ind[:, 0]])

# Nearest neighbor classification
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n2. Nearest neighbor classification using DTW")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

# Nearest neighbor classification with a different metric (Euclidean distance)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n3. Nearest neighbor classification using L2")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

# Nearest neighbor classification based on SAX representation
sax_trans = SymbolicAggregateApproximation(n_segments=10, alphabet_size_avg=5)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
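The snippet is cut off at this point; a plausible continuation, assuming the intent stated in the comment (nearest neighbor classification on the SAX representation), would be:

X_train_sax = sax_trans.fit_transform(X_train)
X_test_sax = sax_trans.transform(X_test)
knn_clf.fit(X_train_sax, y_train)
predicted_labels = knn_clf.predict(X_test_sax)
print("\n4. Nearest neighbor classification using SAX + Euclidean")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))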
Example #18
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                                     Hyperparams]):
    """
    Primitive that applies the k nearest neighbor classification algorithm to time series data.
    The tslearn KNeighborsTimeSeriesClassifier implementation is wrapped.
    """

    metadata = metadata_base.PrimitiveMetadata({
        "id":
        "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version":
        __version__,
        "name":
        "kanine",
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                "https://github.com/kungfuai/d3m-primitives",
            ],
        },
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.16"
            },
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        "python_path":
        "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.hyperparams["n_neighbors"],
            metric=self.hyperparams["distance_metric"],
            weights=self.hyperparams["sample_weighting"],
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False

    def get_params(self) -> Params:
        if not self._is_fit:
            return Params(scaler=None, classifier=None, output_columns=None)

        return Params(
            scaler=self._scaler,
            classifier=self._knn,
            output_columns=self._output_columns,
        )

    def set_params(self, *, params: Params) -> None:
        self._scaler = params["scaler"]
        self._knn = params["classifier"]
        self._output_columns = params["output_columns"]
        self._is_fit = all(param is not None for param in params.values())

    def _get_cols(self, input_metadata):
        """private util function that finds grouping column from input metadata

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
            list[int] -- list of column indices annotated with GroupingKey metadata
        """

        # find column with ts value through metadata
        grouping_column = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))
        return grouping_column

    def _get_value_col(self, input_metadata):
        """
        private util function that finds the value column from input metadata

        Arguments:
        input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
        int -- index of column that contains time series value after Time Series Formatter primitive
        """

        # find attribute column but not file column
        attributes = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/Attribute", ))
        # this assumes a lot, but time series formatters typically place the value column at the end
        attribute_col = attributes[-1]
        return attribute_col

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Sets primitive's training data

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes
            outputs {Outputs} -- D3M dataframe containing targets
        """

        # load and reshape training data
        self._output_columns = outputs.columns
        outputs = np.array(outputs)
        n_ts = outputs.shape[0]
        ts_sz = inputs.shape[0] // n_ts

        attribute_col = self._get_value_col(inputs.metadata)
        self._X_train = inputs.iloc[:,
                                    attribute_col].values.reshape(n_ts, ts_sz)
        self._y_train = np.array(outputs).reshape(-1, )

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """Fits KNN model using training data from set_training_data and hyperparameters

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Returns:
            CallResult[None]
        """

        scaled = self._scaler.fit_transform(self._X_train)
        self._knn.fit(scaled, self._y_train)
        self._is_fit = True
        return CallResult(None, has_finished=self._is_fit)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Produce primitive's classifications for new time series data

        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- dataframe with a column containing a predicted class
                for each input time series
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # find column with ts value through metadata
        grouping_column = self._get_cols(inputs.metadata)

        n_ts = inputs.iloc[:, grouping_column[0]].nunique()
        ts_sz = inputs.shape[0] // n_ts
        attribute_col = self._get_value_col(inputs.metadata)
        x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)

        # make predictions
        scaled = self._scaler.transform(x_vals)
        preds = self._knn.predict(scaled)

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )

        return CallResult(result_df, has_finished=True)
Example #19
def test_constrained_paths():
    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, sz, d)
    y = rng.randint(low=0, high=3, size=n)

    model_euc = KNeighborsTimeSeriesClassifier(n_neighbors=3,
                                               metric="euclidean")
    y_pred_euc = model_euc.fit(X, y).predict(X)
    model_dtw_sakoe = KNeighborsTimeSeriesClassifier(n_neighbors=3,
                                                     metric="dtw",
                                                     metric_params={
                                                         "global_constraint":
                                                         "sakoe_chiba",
                                                         "sakoe_chiba_radius":
                                                         0
                                                     })
    y_pred_sakoe = model_dtw_sakoe.fit(X, y).predict(X)
    np.testing.assert_equal(y_pred_euc, y_pred_sakoe)

    model_softdtw = KNeighborsTimeSeriesClassifier(
        n_neighbors=3, metric="softdtw", metric_params={"gamma": 1e-6})
    y_pred_softdtw = model_softdtw.fit(X, y).predict(X)

    model_dtw = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw")
    y_pred_dtw = model_dtw.fit(X, y).predict(X)

    np.testing.assert_equal(y_pred_dtw, y_pred_softdtw)
Example #20
    def create_model(self, model_type, X, y, model_params, search_params):
        """
        Executes random search hyper parameter optimization for the specified model. Refer to sklearn
        and tslearn documentation for details.
        Sources:
        # https://www.kaggle.com/hatone/mlpclassifier-with-gridsearchcv
        # https://en.wikipedia.org/wiki/Hyperparameter_optimization#Grid_search
        # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
        # alternative to grid search: https://github.com/sahilm89/lhsmdu
        :param model_type:
        :param X:
        :param y:
        :param model_params:
        :param search_params:
        :param test_size:
        :return:
        """
        try:

            if X is None or y is None or model_params is None or search_params is None:
                raise TypeError(self.messages.ILLEGAL_ARGUMENT_NONE_TYPE.value)
            if (not isinstance(X, pandas.DataFrame) and
                    not isinstance(X, pandas.core.frame.DataFrame) \
                    and not isinstance(X, pandas.core.series.Series)):
                raise TypeError(self.messages.ILLEGAL_ARGUMENT_TYPE.value)
            if (not isinstance(y, pandas.DataFrame) and
                    not isinstance(y, pandas.core.frame.DataFrame) \
                    and not isinstance(y, pandas.core.series.Series)):
                raise TypeError(self.messages.ILLEGAL_ARGUMENT_TYPE.value)

            if not isinstance(model_params, dict) or not isinstance(
                    search_params, list):
                raise TypeError(self.messages.ILLEGAL_ARGUMENT_TYPE.value)

            model = None

            if model_type == 'tssvc':
                model = TimeSeriesSVC(search_params[0])

            if model_type == 'knn_classifier':
                model = KNeighborsTimeSeriesClassifier(search_params[0])

            if model is None:
                raise ValueError(
                    self.messages.PROVIDED_MODE_DOESNT_EXIST.value)

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=search_params[6], stratify=y)
            clf = RandomizedSearchCV(model,
                                     model_params,
                                     n_jobs=search_params[0],
                                     verbose=search_params[1],
                                     cv=search_params[2],
                                     n_iter=search_params[3])
            clf.fit(X_train, y_train)
            if search_params[4]:
                with open(r"{}".format(search_params[5]), "wb") as output_file:
                    pickle.dump(clf, output_file)

            return {'clf': clf, 'X_test': X_test, 'y_test': y_test}

        except (TypeError, ValueError):
            self.logger.error(traceback.format_exc())
            os._exit(1)

        except Exception:
            self.logger.error(traceback.format_exc())
            os._exit(2)
Example #21
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                                     Hyperparams]):
    """
        Primitive that applies the k nearest neighbor classification algorithm to time series data. 
        The tslearn KNeighborsTimeSeriesClassifier implementation is wrapped.
        
        Training inputs: 1) Feature dataframe, 2) Target dataframe
        Outputs: Dataframe with predictions for specific time series at specific future time instances 
    
        Arguments:
            hyperparams {Hyperparams} -- D3M Hyperparameter object
        
        Keyword Arguments:
            random_seed {int} -- random seed (default: {0})
    """

    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        "id":
        "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version":
        __version__,
        "name":
        "kanine",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name":
            __author__,
            "contact":
            __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/Yonder-OSS/D3M-Primitives",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.14"
            },
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/Yonder-OSS/D3M-Primitives.git@{git_commit}#egg=yonder-primitives"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        "python_path":
        "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.hyperparams["n_neighbors"],
            metric=self.hyperparams["distance_metric"],
            weights=self.hyperparams["sample_weighting"],
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False

    def get_params(self) -> Params:
        if not self._is_fit:
            return Params(scaler=None, classifier=None, output_columns=None)

        return Params(scaler=self._scaler,
                      classifier=self._knn,
                      output_columns=self._output_columns)

    def set_params(self, *, params: Params) -> None:
        self._scaler = params['scaler']
        self._knn = params['classifier']
        self._output_columns = params['output_columns']
        self._is_fit = all(param is not None for param in params.values())

    def _get_cols(self, input_metadata):
        """ private util function that finds grouping column from input metadata
        
        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame
        
        Returns:
            list[int] -- list of column indices annotated with GroupingKey metadata
        """

        # find column with ts value through metadata
        grouping_column = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))
        return grouping_column

    def _get_value_col(self, input_metadata):
        """
        private util function that finds the value column from input metadata

        Arguments:
        input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
        int -- index of column that contains time series value after Time Series Formatter primitive
        """

        # find attribute column but not file column
        attributes = input_metadata.list_columns_with_semantic_types(
            ('https://metadata.datadrivendiscovery.org/types/Attribute', ))
        # this assumes a lot, but time series formatters typically place the value column at the end
        attribute_col = attributes[-1]
        return attribute_col

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """ Sets primitive's training data

            Arguments:
                inputs {Inputs} -- D3M dataframe containing attributes
                outputs {Outputs} -- D3M dataframe containing targets
        """

        # load and reshape training data
        self._output_columns = outputs.columns
        outputs = np.array(outputs)
        n_ts = outputs.shape[0]
        ts_sz = inputs.shape[0] // n_ts

        attribute_col = self._get_value_col(inputs.metadata)
        self._X_train = inputs.iloc[:,
                                    attribute_col].values.reshape(n_ts, ts_sz)
        self._y_train = np.array(outputs).reshape(-1, )

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """ Fits KNN model using training data from set_training_data and hyperparameters
            
            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})
            
            Returns:
                CallResult[None]
        """

        scaled = self._scaler.fit_transform(self._X_train)
        self._knn.fit(scaled, self._y_train)
        self._is_fit = True
        return CallResult(None, has_finished=self._is_fit)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """ Produce primitive's classifications for new time series data

            Arguments:
                inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target
            
            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})

            Raises:
                PrimitiveNotFittedError: if primitive not fit

            Returns:
                CallResult[Outputs] -- dataframe with a column containing a predicted class 
                    for each input time series
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # find column with ts value through metadata
        grouping_column = self._get_cols(inputs.metadata)

        n_ts = inputs.iloc[:, grouping_column[0]].nunique()
        ts_sz = inputs.shape[0] // n_ts
        attribute_col = self._get_value_col(inputs.metadata)
        x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)

        # make predictions
        scaled = self._scaler.transform(x_vals)
        preds = self._knn.predict(scaled)

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )

        return CallResult(result_df, has_finished=True)
Example #22

# Set seed
numpy.random.seed(0)

# Defining dataset and the number of segments
data_loader = UCR_UEA_datasets()
datasets = [('SyntheticControl', 16), ('GunPoint', 64), ('FaceFour', 128),
            ('Lightning2', 256), ('Lightning7', 128), ('ECG200', 32),
            ('Plane', 64), ('Car', 256), ('Beef', 128), ('Coffee', 128),
            ('OliveOil', 256)]

# We will compare the accuracies & execution times of 1-NN using:
# (i) MINDIST on SAX representations, and
# (ii) euclidean distance on raw values
knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='sax')
knn_eucl = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='euclidean')

accuracies = {}
times = {}
for dataset, w in datasets:
    X_train, y_train, X_test, y_test = data_loader.load_dataset(dataset)

    ts_scaler = TimeSeriesScalerMeanVariance()
    X_train = ts_scaler.fit_transform(X_train)
    X_test = ts_scaler.fit_transform(X_test)

    # Fit 1-NN using SAX representation & MINDIST
    metric_params = {'n_segments': w, 'alphabet_size_avg': 10}
    knn_sax = clone(knn_sax).set_params(metric_params=metric_params)
    start = time.time()
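The example stops mid-loop; a hedged sketch of how the timing comparison presumably continues (the bookkeeping below is an assumption, and sklearn's accuracy_score is taken to be imported):

    knn_sax.fit(X_train, y_train)
    pred_sax = knn_sax.predict(X_test)
    time_sax = time.time() - start

    # 1-NN with Euclidean distance on the raw (rescaled) values
    start = time.time()
    knn_eucl.fit(X_train, y_train)
    pred_eucl = knn_eucl.predict(X_test)
    time_eucl = time.time() - start

    accuracies[dataset] = (accuracy_score(y_test, pred_sax),
                           accuracy_score(y_test, pred_eucl))
    times[dataset] = (time_sax, time_eucl)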
Example #23
from flask import Flask
from flask_restful import Api, Resource, reqparse
import numpy as np
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.shapelets import LearningShapelets
import pickle
from os import path as os_path
from pathlib import Path

from tslearn.utils import to_time_series_dataset, to_time_series

APP = Flask(__name__)
API = Api(APP)

# Load models from disk
k_nn_model = KNeighborsTimeSeriesClassifier.from_pickle('./models/k_nn.pickle')
shapelets_model = LearningShapelets.from_pickle(
    './models/learning_shapelets.pickle')

working_dir_path = Path.cwd()

filename = os_path.join(working_dir_path, './models/mlp_nn.pickle')
mlp_nn_model = pickle.load(open(filename, 'rb'))

filename = os_path.join(working_dir_path, './models/gak_svm.pickle')
gak_svm_model = pickle.load(open(filename, 'rb'))


class Classify(Resource):
    @staticmethod
    def post():
Example #24
def test_constrained_paths():
    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, sz, d)
    y = rng.randint(low=0, high=3, size=n)

    model_euc = KNeighborsTimeSeriesClassifier(n_neighbors=3,
                                               metric="euclidean")
    y_pred_euc = model_euc.fit(X, y).predict(X)
    model_dtw_sakoe = KNeighborsTimeSeriesClassifier(n_neighbors=3,
                                                     metric="dtw",
                                                     metric_params={
                                                         "global_constraint":
                                                         "sakoe_chiba",
                                                         "sakoe_chiba_radius":
                                                         0
                                                     })
    y_pred_sakoe = model_dtw_sakoe.fit(X, y).predict(X)
    np.testing.assert_equal(y_pred_euc, y_pred_sakoe)

    model_softdtw = KNeighborsTimeSeriesClassifier(
        n_neighbors=3, metric="softdtw", metric_params={"gamma": 1e-6})
    y_pred_softdtw = model_softdtw.fit(X, y).predict(X)

    model_dtw = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw")
    y_pred_dtw = model_dtw.fit(X, y).predict(X)

    np.testing.assert_equal(y_pred_dtw, y_pred_softdtw)

    model_sax = KNeighborsTimeSeriesClassifier(n_neighbors=3,
                                               metric="sax",
                                               metric_params={
                                                   "alphabet_size_avg": 6,
                                                   "n_segments": 10
                                               })
    model_sax.fit(X, y)

    # The MINDIST of SAX is a lower bound of the euclidean distance
    euc_dist, _ = model_euc.kneighbors(X, n_neighbors=5)
    sax_dist, _ = model_sax.kneighbors(X, n_neighbors=5)

    # First column will contain zeroes
    np.testing.assert_array_less(sax_dist[:, 1:], euc_dist[:, 1:])
Example #25
    def fit(self, X, y):
        """
        Fit early classifier.

        Parameters
        ----------
        X : array-like of shape (n_series, n_timestamps, n_features)
            Training data, where `n_series` is the number of time series,
            `n_timestamps` is the number of timestamps in the series
            and `n_features` is the number of features recorded at each
            timestamp.

        y : array-like of shape (n_samples,)
            Target values. Will be cast to X's dtype if necessary

        Returns
        -------
        self : returns an instance of self.
        """

        X = check_array(X, allow_nd=True)
        X = check_dims(X)
        X = to_time_series_dataset(X)
        y_arr = np.array(y)
        label_set = np.unique(y_arr)

        self.cluster_ = TimeSeriesKMeans(n_clusters=self.n_clusters,
                                         random_state=self.random_state)
        if self.base_classifier is not None:
            clf = self.base_classifier
        else:
            clf = KNeighborsTimeSeriesClassifier(n_neighbors=1,
                                                 metric="euclidean")
        self.__n_classes_ = len(label_set)
        self._X_fit_dims = X.shape
        sz = X.shape[1]
        self.classifiers_ = {t: clone(clf)
                             for t in range(self.min_t, sz + 1)}
        self.pyhatyck_ = np.empty((sz - self.min_t + 1,
                                   self.n_clusters,
                                   self.__n_classes_, self.__n_classes_))
        c_k = self.cluster_.fit_predict(X)
        X1, X2, c_k1, c_k2, y1, y2 = train_test_split(
            X, c_k, y_arr,
            test_size=0.5,
            stratify=c_k,
            random_state=self.random_state
        )

        label_to_ind = {lab: ind for ind, lab in enumerate(label_set)}
        y_ = np.array([label_to_ind.get(lab, self.__n_classes_ + 1)
                       for lab in y_arr])

        vector_of_ones = np.ones((X.shape[0], ))
        self.pyck_ = coo_matrix(
            (vector_of_ones, (y_, c_k)),
            shape=(self.__n_classes_, self.n_clusters),
        ).toarray()
        self.pyck_ /= self.pyck_.sum(axis=0, keepdims=True)
        for t in range(self.min_t, sz + 1):
            self.classifiers_[t].fit(X1[:, :t], y1)
            for k in range(0, self.n_clusters):
                index = (c_k2 == k)
                if index.sum() != 0:
                    X2_current_cluster = X2[index, :t]
                    y2_current_cluster = y2[index]
                    y2_hat = self.classifiers_[t].predict(
                        X2_current_cluster[:, :t]
                    )
                    conf_matrix = confusion_matrix(y2_current_cluster, y2_hat,
                                                   labels=label_set)
                    # normalize parameter seems to be quite recent in sklearn,
                    # so let's do it ourselves
                    normalizer = conf_matrix.sum(axis=0, keepdims=True)
                    normalizer[normalizer == 0] = 1  # Avoid divide by 0
                    conf_matrix = conf_matrix / normalizer

                    # pyhatyck_ stores
                    # P_{t+\tau}(\hat{y} | y, c_k) \delta_{y \neq \hat{y}}
                    # elements so it should have a null diagonal because of
                    # the \delta_{y \neq \hat{y}} term
                    np.fill_diagonal(conf_matrix, 0)
                    self.pyhatyck_[t - self.min_t, k] = conf_matrix
        return self
Example #26
    working_dir_path = Path.cwd()
    sys.path.append(str(working_dir_path))

    # Load the dataset
    raw_data = pd.read_csv(os_path.join(working_dir_path,
                                        "./data/train_curves.csv"),
                           header=None)
    time_series_train = to_time_series_dataset(raw_data)

    labels_train = genfromtxt(os_path.join(
        working_dir_path, "./data/train_clustering_result.csv"),
                              delimiter=',')

    # Define the model
    knn_classification_model = KNeighborsTimeSeriesClassifier(n_neighbors=5,
                                                              metric="dtw",
                                                              n_jobs=4)

    # fit the model using the training data
    knn_classification_model.fit(time_series_train, labels_train)

    #############################################################################################
    # save model
    #############################################################################################

    print(
        "#############################################################################################"
    )

    # return string with current datetime
    now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
Example #27
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.datasets import CachedDatasets

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

import numpy as np

import matplotlib.pyplot as plt

# Our pipeline consists of two phases. First, data will be normalized using
# min-max normalization. Afterwards, it is fed to a KNN classifier. For the
# KNN classifier, we tune the n_neighbors and weights hyper-parameters.
n_splits = 3
pipeline = GridSearchCV(Pipeline([('normalize', TimeSeriesScalerMinMax()),
                                  ('knn', KNeighborsTimeSeriesClassifier())]),
                        {
                            'knn__n_neighbors': [5, 25],
                            'knn__weights': ['uniform', 'distance']
                        },
                        cv=StratifiedKFold(n_splits=n_splits,
                                           shuffle=True,
                                           random_state=42))

X_train, y_train, _, _ = CachedDatasets().load_dataset("Trace")

# Keep only timeseries of class 1, 2, 3
X_train = X_train[y_train > 0]
y_train = y_train[y_train > 0]

# Keep only the first 50 timeseries of both train and
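The final comment is truncated; a plausible remainder, assuming it goes on to subsample the Trace set and fit the tuned pipeline (best_params_ and best_score_ are standard GridSearchCV attributes):

X_train = X_train[:50]
y_train = y_train[:50]

pipeline.fit(X_train, y_train)
print("Best parameters:", pipeline.best_params_)
print("Best cross-validation score:", pipeline.best_score_)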
Example #28
    
    correct = (preds == labels[:len(preds)])
    score = float(sum(correct))/len(correct)

    return score

# get train/test data/labels
inDataFile = 'data/160k_f100_20190908-1401.txt'
labels, data = dp.readProcFile(inDataFile)
labels = np.array(labels)
data = np.array(data)
trainData, trainLabels, testData, testLabels = dp.splitTestTrainSets(data, labels, 0.8, 'Stratified')
# z-normalisation
trainData, testData = dp.znorm(trainData, testData)

clf = KNeighborsTimeSeriesClassifier(n_jobs=-1)

print "Fitting..."
clf.fit(trainData, trainLabels)

print "Scoring..."
predictions = []
for i in range(len(testData)):
    if (i % 10 == 0) and (i > 0):
        print "{} complete...current score: {}".format(i, getScore(np.array(predictions), testLabels) )
    predictions += clf.predict([testData[i]]).tolist()
    
predictions = np.array(predictions)
test_acc = getScore(predictions, testLabels)
#test_acc = clf.score(testData, testLabels)
print(test_acc)
Example #29
def test_variable_length_knn():
    X = to_time_series_dataset([[1, 2, 3, 4], [1, 2, 3], [9, 8, 7, 6, 5, 2],
                                [8, 7, 6, 5, 3]])
    y = [0, 0, 1, 1]

    clf = KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    clf = KNeighborsTimeSeriesClassifier(metric="softdtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    scaler = TimeSeriesScalerMeanVariance()
    clf = KNeighborsTimeSeriesClassifier(metric="sax",
                                         n_neighbors=1,
                                         metric_params={'n_segments': 2})
    X_transf = scaler.fit_transform(X)
    clf.fit(X_transf, y)
    assert_allclose(clf.predict(X_transf), [0, 0, 1, 1])
Example #30
                tmpSum += 1
        # rsltCol.append([x,tmpSum])
        rsltCol.append(tmpSum)
    return [rsltRow, rsltCol]

def dataToSeries(dataset):
    rowArray = []
    # colArray = []
    for i in range(0, len(dataset)):
        row = mapper(dataset[i])
        rowArray.append(row)
        # colArray.append(col)
    return to_time_series(rowArray)

X_train, y_train, X_test, y_test = load_data('data/')
X_train_ts =  dataToSeries(X_train)
X_test_ts = dataToSeries(X_test)

knn_clf_dtw = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="dtw")
knn_clf_dtw.fit(X_train_ts, y_train)
predicted_labels_dtw = knn_clf_dtw.predict(X_test_ts)
print("knn with dtw: \n", accuracy_score(y_test, predicted_labels_dtw))
print("Classification report: \n", classification_report(y_test, predicted_labels_dtw))
print("Confusion matrix: \n", confusion_matrix(y_test, predicted_labels_dtw))


unlabeled = pd.read_csv("data/test.csv")
unlabeled = unlabeled.values
unlabeled_ts = dataToSeries(unlabeled)
plt.imshow(unlabeled[165].reshape((28, 28)))
predicted_label_dtw = knn_clf_dtw.predict(unlabeled_ts)