def test_sax_scale():
    n, sz, d = 10, 10, 3
    rng = np.random.RandomState(0)
    X = rng.rand(n, sz, d)
    y = rng.choice([0, 1], size=n)

    sax = SymbolicAggregateApproximation(n_segments=3,
                                         alphabet_size_avg=2,
                                         scale=True)
    sax.fit(X)
    np.testing.assert_array_almost_equal(X, sax._unscale(sax._scale(X)))
    # Scaled data should have zero mean and unit variance per dimension
    np.testing.assert_array_almost_equal(
        np.zeros((d, )),
        sax._scale(X).reshape((-1, d)).mean(axis=0))
    np.testing.assert_array_almost_equal(
        np.ones((d, )),
        sax._scale(X).reshape((-1, d)).std(axis=0))

    # Case of kNN-SAX
    knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1,
                                             metric="sax",
                                             metric_params={"scale": True})
    knn_sax.fit(X, y)
    X_scale_unscale = knn_sax._sax._unscale(knn_sax._sax._scale(X))
    np.testing.assert_array_almost_equal(X, X_scale_unscale)
    knn_sax.predict(X)
class kNNClassifier:
    def __init__(self, n_neighbours=5, mac_neighbours=None, weights="uniform",
                 metric_params=None, n_jobs=-1):
        self.n_neighbours = n_neighbours
        self.mac_neighbours = mac_neighbours
        self.weights = weights
        # avoid a mutable default argument
        self.metric_params = metric_params if metric_params is not None else {}
        self.n_jobs = n_jobs

    def get_params(self, deep=True):
        return {
            "n_neighbours": self.n_neighbours,
            "mac_neighbours": self.mac_neighbours,
            "weights": self.weights,
            "metric_params": self.metric_params,
            "n_jobs": self.n_jobs
        }

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        # First stage: fast whole-series Euclidean k-NN
        self.model = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.n_neighbours,
            metric="euclidean",
            weights=self.weights,
            n_jobs=self.n_jobs).fit(self.X_train, self.y_train)
        return self

    def predict(self, X_test):
        if self.mac_neighbours is None:
            return self.model.predict(X_test)

        # Second stage: for each query, re-rank its mac_neighbours Euclidean
        # candidates with a DTW k-NN restricted to that neighbourhood
        y_hat = []
        k_neighbors = self.model.kneighbors(
            X_test,
            n_neighbors=self.mac_neighbours,
            return_distance=False)
        for idx, k in enumerate(k_neighbors):
            X_neigh = self.X_train[k]
            y_neigh = self.y_train[k]
            # use a local model so the first-stage Euclidean model in
            # self.model is not clobbered between calls
            dtw_model = KNeighborsTimeSeriesClassifier(
                n_neighbors=self.n_neighbours,
                metric="dtw",
                weights=self.weights,
                n_jobs=self.n_jobs,
                metric_params=self.metric_params).fit(X_neigh, y_neigh)
            # predict expects a dataset, so keep the singleton sample axis
            pred = dtw_model.predict(X_test[idx:idx + 1])
            y_hat.append(pred[0])
        return y_hat
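# A minimal usage sketch for the two-stage classifier above. The random-walk
# data, labels, and parameter values here are illustrative assumptions, not
# part of the original code. With mac_neighbours set, the Euclidean model
# first shortlists 10 candidates per query, then DTW re-ranks within them.
import numpy as np
from tslearn.generators import random_walks

X = random_walks(n_ts=40, sz=64, d=1, random_state=0)
y = np.random.RandomState(0).choice([0, 1], size=40)

clf = kNNClassifier(n_neighbours=3, mac_neighbours=10).fit(X, y)
print(clf.predict(X[:5]))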
def test_variable_length_knn():
    X = to_time_series_dataset([[1, 2, 3, 4],
                                [1, 2, 3],
                                [2, 5, 6, 7, 8, 9],
                                [3, 5, 6, 7, 8]])
    y = [0, 0, 1, 1]

    clf = KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    clf = KNeighborsTimeSeriesClassifier(metric="softdtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])
def test_variable_length_knn():
    X = to_time_series_dataset([[1, 2, 3, 4],
                                [1, 2, 3],
                                [9, 8, 7, 6, 5, 2],
                                [8, 7, 6, 5, 3]])
    y = [0, 0, 1, 1]

    clf = KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    clf = KNeighborsTimeSeriesClassifier(metric="softdtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    scaler = TimeSeriesScalerMeanVariance()
    clf = KNeighborsTimeSeriesClassifier(metric="sax",
                                         n_neighbors=1,
                                         metric_params={'n_segments': 2})
    X_transf = scaler.fit_transform(X)
    clf.fit(X_transf, y)
    assert_allclose(clf.predict(X_transf), [0, 0, 1, 1])
class Knn:
    def __init__(self, n_neighbors):
        '''
        Initialize the KNN classifier with the dynamic time warping
        distance metric.

        hyperparameters:
            n_neighbors : number of neighbors on which to base the
                          classification decision
        '''
        self.n_neighbors = n_neighbors
        self.knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=n_neighbors,
                                                      metric="dtw")

    def __ScaleData(self, input_data):
        '''
        Scale input data to the range [0, 1].

        parameters:
            input_data : input data to rescale
        '''
        return TimeSeriesScalerMinMax().fit_transform(input_data)

    def fit(self, X_train, y_train):
        '''
        Fit the KNN classifier on the training data.

        parameters:
            X_train : training time series
            y_train : training labels
        '''
        # scale training data to between 0 and 1
        X_train_scaled = self.__ScaleData(X_train)
        self.knn_clf.fit(X_train_scaled, y_train)

    def predict(self, X_test):
        '''
        Predict classes for the time series in the test data set.

        parameters:
            X_test : test time series on which to predict classes

        returns:
            classifications for the test data set
        '''
        # scale test data to between 0 and 1
        X_test_scaled = self.__ScaleData(X_test)
        return self.knn_clf.predict(X_test_scaled)
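# Hypothetical usage of the Knn wrapper above (the synthetic data is an
# assumption): min-max scaling happens inside fit/predict, so raw series
# can be passed in directly.
from tslearn.generators import random_walks

X_train = random_walks(n_ts=20, sz=50, d=1, random_state=0)
y_train = [0] * 10 + [1] * 10
model = Knn(n_neighbors=3)
model.fit(X_train, y_train)
print(model.predict(X_train[:3]))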
def NearestCentroidClassification(X_train, X_test, y_train_n, y_test_n,
                                  dataset_name):
    '''
    :param X_train: if using DTAN, should already be aligned
    :param X_test: if using DTAN, should already be aligned
    :param y_train_n: numerical labels (not one-hot)
    :param y_test_n: numerical labels (not one-hot)
    :param dataset_name: name of the dataset, used for reporting
    :return: test set NCC accuracy
    '''
    # vars and placeholders
    input_shape = X_train.shape[1:]
    n_classes = len(np.unique(y_train_n))
    class_names = np.unique(y_train_n, axis=0)
    aligned_means = np.zeros((n_classes, input_shape[0], input_shape[1]))
    ncc_labels = []

    # Within-class Euclidean mean of the (aligned) training set
    for class_num in class_names:
        train_class_idx = y_train_n == class_num  # get indices
        X_train_aligned_within_class = X_train[train_class_idx]
        aligned_means[int(class_num), :] = np.mean(
            X_train_aligned_within_class, axis=0)
        ncc_labels.append(class_num)
    ncc_labels = np.asarray(ncc_labels)

    # Nearest centroid classification: 1-NN on the class means,
    # using Euclidean distance
    knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=1,
                                             metric="euclidean")
    knn_clf.fit(aligned_means, ncc_labels)
    predicted_labels = knn_clf.predict(X_test)
    acc = accuracy_score(y_test_n, predicted_labels)
    print(f"{dataset_name} - NCC results: {acc}")
    return acc
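# Illustrative call with synthetic, pre-aligned data (an assumption; in the
# original pipeline X_train/X_test would come out of DTAN alignment, and the
# function's module is assumed to import numpy and accuracy_score):
import numpy as np

rng = np.random.RandomState(0)
X_train = rng.rand(20, 50, 1)   # 20 series, length 50, univariate
X_test = rng.rand(10, 50, 1)
y_train = rng.choice([0., 1.], size=20)
y_test = rng.choice([0., 1.], size=10)
NearestCentroidClassification(X_train, X_test, y_train, y_test, "toy-data")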
X_train = X_shuffle[:n_ts_per_blob * n_blobs // 2]
X_test = X_shuffle[n_ts_per_blob * n_blobs // 2:]
y_train = y_shuffle[:n_ts_per_blob * n_blobs // 2]
y_test = y_shuffle[n_ts_per_blob * n_blobs // 2:]

# Nearest neighbor search
knn = KNeighborsTimeSeries(n_neighbors=3, metric="dtw")
knn.fit(X_train, y_train)
dists, ind = knn.kneighbors(X_test)
print("1. Nearest neighbour search")
print("Computed nearest neighbor indices (wrt DTW)\n", ind)
# kneighbors returns indices into the training set, so neighbor classes
# are looked up in y_train
print("First nearest neighbor class:", y_train[ind[:, 0]])

# Nearest neighbor classification
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n2. Nearest neighbor classification using DTW")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

# Nearest neighbor classification with a different metric (Euclidean distance)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n3. Nearest neighbor classification using L2")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

# Nearest neighbor classification based on SAX representation
sax_trans = SymbolicAggregateApproximation(n_segments=10, alphabet_size_avg=5)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
pipeline_model = Pipeline(steps=[('sax', sax_trans), ('knn', knn_clf)])
pipeline_model.fit(X_train, y_train)
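# The snippet stops at fit; a natural follow-up (an assumption, mirroring
# how the previous models are evaluated) would score the SAX pipeline too:
predicted_labels = pipeline_model.predict(X_test)
print("\n4. Nearest neighbor classification on SAX features")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))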
def getScore(preds, labels):
    # fraction of predictions made so far that match the ground-truth labels
    correct = (preds == labels[:len(preds)])
    score = float(sum(correct)) / len(correct)
    return score


# get train/test data/labels
inDataFile = 'data/160k_f100_20190908-1401.txt'
labels, data = dp.readProcFile(inDataFile)
labels = np.array(labels)
data = np.array(data)
trainData, trainLabels, testData, testLabels = dp.splitTestTrainSets(
    data, labels, 0.8, 'Stratified')

# z-normalisation
trainData, testData = dp.znorm(trainData, testData)

clf = KNeighborsTimeSeriesClassifier(n_jobs=-1)
print("Fitting...")
clf.fit(trainData, trainLabels)

print("Scoring...")
predictions = []
for i in range(len(testData)):
    if (i % 10 == 0) and (i > 0):
        print("{} complete...current score: {}".format(
            i, getScore(np.array(predictions), testLabels)))
    predictions += clf.predict([testData[i]]).tolist()
predictions = np.array(predictions)
test_acc = getScore(predictions, testLabels)
# test_acc = clf.score(testData, testLabels)
print(test_acc)
knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='sax')
knn_eucl = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='euclidean')

accuracies = {}
times = {}
for dataset, w in datasets:
    X_train, y_train, X_test, y_test = data_loader.load_dataset(dataset)
    # TimeSeriesScalerMeanVariance scales each series independently, so
    # calling fit_transform on the test set is equivalent to transform
    ts_scaler = TimeSeriesScalerMeanVariance()
    X_train = ts_scaler.fit_transform(X_train)
    X_test = ts_scaler.fit_transform(X_test)

    # Fit 1-NN using SAX representation & MINDIST
    metric_params = {'n_segments': w, 'alphabet_size_avg': 10}
    knn_sax = clone(knn_sax).set_params(metric_params=metric_params)
    start = time.time()
    knn_sax.fit(X_train, y_train)
    acc_sax = accuracy_score(y_test, knn_sax.predict(X_test))
    time_sax = time.time() - start

    # Fit 1-NN using Euclidean distance on raw values
    start = time.time()
    knn_eucl.fit(X_train, y_train)
    acc_euclidean = accuracy_score(y_test, knn_eucl.predict(X_test))
    time_euclidean = time.time() - start

    accuracies[dataset] = (acc_sax, acc_euclidean)
    times[dataset] = (time_sax, time_euclidean)

print_table(accuracies, times)
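# `datasets`, `data_loader`, and `print_table` are defined elsewhere in the
# project; a minimal print_table consistent with how it is called above
# (purely an assumed sketch, not the original helper) could look like:
def print_table(accuracies, times):
    print("{:<20} {:>8} {:>9} {:>9} {:>10}".format(
        "dataset", "acc_sax", "acc_eucl", "t_sax(s)", "t_eucl(s)"))
    for dataset in accuracies:
        acc_sax, acc_eucl = accuracies[dataset]
        t_sax, t_eucl = times[dataset]
        print("{:<20} {:>8.3f} {:>9.3f} {:>9.2f} {:>10.2f}".format(
            dataset, acc_sax, acc_eucl, t_sax, t_eucl))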
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                                     Hyperparams]):
    """Primitive that applies the k nearest neighbor classification algorithm
    to time series data. The tslearn KNeighborsTimeSeriesClassifier
    implementation is wrapped.

    Training inputs: 1) Feature dataframe, 2) Target dataframe

    Outputs: Dataframe with predictions for specific time series at specific
    future time instances

    Arguments:
        hyperparams {Hyperparams} -- D3M Hyperparameter object

    Keyword Arguments:
        random_seed {int} -- random seed (default: {0})
    """

    metadata = metadata_base.PrimitiveMetadata({
        # Simply a UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        "id": "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version": __version__,
        "name": "kanine",
        # Keywords do not have a controlled vocabulary. Authors can put here
        # whatever they find suitable.
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/Yonder-OSS/D3M-Primitives",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system
        # packages, or Docker images. Of course Python packages can also have
        # their own dependencies, but sometimes it is necessary to install a
        # Python package first to be even able to run setup.py of another
        # package. Or you have a dependency which is not on PyPi.
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.14"
            },
            {
                "type": metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                    "git+https://github.com/Yonder-OSS/D3M-Primitives.git@{git_commit}#egg=yonder-primitives".format(
                        git_commit=utils.current_git_commit(
                            os.path.dirname(__file__)),
                    ),
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        "python_path":
            "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        # Choose these from a controlled vocabulary in the schema. If anything
        # is missing which would best describe the primitive, make a merge request.
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS,
        ],
        "primitive_family":
            metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.hyperparams["n_neighbors"],
            metric=self.hyperparams["distance_metric"],
            weights=self.hyperparams["sample_weighting"],
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False

    def get_params(self) -> Params:
        if not self._is_fit:
            return Params(scaler=None, classifier=None, output_columns=None)
        return Params(scaler=self._scaler,
                      classifier=self._knn,
                      output_columns=self._output_columns)

    def set_params(self, *, params: Params) -> None:
        self._scaler = params['scaler']
        self._knn = params['classifier']
        self._output_columns = params['output_columns']
        self._is_fit = all(param is not None for param in params.values())

    def _get_cols(self, input_metadata):
        """Private util function that finds the grouping column from input metadata.

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
            list[int] -- list of column indices annotated with GroupingKey metadata
        """
        # find column with ts value through metadata
        grouping_column = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))
        return grouping_column

    def _get_value_col(self, input_metadata):
        """Private util function that finds the value column from input metadata.

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
            int -- index of column that contains the time series values after
            the Time Series Formatter primitive
        """
        # find attribute column but not file column
        attributes = input_metadata.list_columns_with_semantic_types(
            ('https://metadata.datadrivendiscovery.org/types/Attribute', ))
        # this assumes a lot, but time series formatters typically place the
        # value column at the end
        attribute_col = attributes[-1]
        return attribute_col

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Sets primitive's training data.

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes
            outputs {Outputs} -- D3M dataframe containing targets
        """
        # load and reshape training data
        self._output_columns = outputs.columns
        outputs = np.array(outputs)
        n_ts = outputs.shape[0]
        ts_sz = inputs.shape[0] // n_ts

        attribute_col = self._get_value_col(inputs.metadata)
        self._X_train = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)
        self._y_train = np.array(outputs).reshape(-1, )

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """Fits KNN model using training data from set_training_data and hyperparameters.

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Returns:
            CallResult[None]
        """
        scaled = self._scaler.fit_transform(self._X_train)
        self._knn.fit(scaled, self._y_train)
        self._is_fit = True
        return CallResult(None, has_finished=self._is_fit)

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Produce primitive's classifications for new time series data.

        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- dataframe with a column containing a predicted
            class for each input time series
        """
        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # find column with ts value through metadata
        grouping_column = self._get_cols(inputs.metadata)
        n_ts = inputs.iloc[:, grouping_column[0]].nunique()
        ts_sz = inputs.shape[0] // n_ts

        attribute_col = self._get_value_col(inputs.metadata)
        x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)

        # make predictions
        scaled = self._scaler.transform(x_vals)
        preds = self._knn.predict(scaled)

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )
        return CallResult(result_df, has_finished=True)
        tmpSum += 1
        # rsltCol.append([x, tmpSum])
        rsltCol.append(tmpSum)
    return [rsltRow, rsltCol]


def dataToSeries(dataset):
    rowArray = []
    # colArray = []
    for i in range(0, len(dataset)):
        row = mapper(dataset[i])
        rowArray.append(row)
        # colArray.append(col)
    return to_time_series(rowArray)


X_train, y_train, X_test, y_test = load_data('data/')
X_train_ts = dataToSeries(X_train)
X_test_ts = dataToSeries(X_test)

knn_clf_dtw = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="dtw")
knn_clf_dtw.fit(X_train_ts, y_train)
predicted_labels_dtw = knn_clf_dtw.predict(X_test_ts)

print("knn with dtw: \n", accuracy_score(y_test, predicted_labels_dtw))
print("Classification report: \n",
      classification_report(y_test, predicted_labels_dtw))
print("Confusion matrix: \n",
      confusion_matrix(y_test, predicted_labels_dtw))

unlabaled = pd.read_csv("data/test.csv")
unlabaled = unlabaled.values
unlabaled_ts = dataToSeries(unlabaled)
plt.imshow(unlabaled[165].reshape((28, 28)))
predicted_label_dtw = knn_clf_dtw.predict(unlabaled_ts)
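# The head of mapper() is cut off above. Based on its surviving tail, it
# appears to summarise each 28x28 image by per-row and per-column counts of
# active pixels. A hypothetical reconstruction consistent with that tail
# (an assumption, not the original code; the name and threshold are made up):
def mapper_sketch(flat_image, threshold=0):
    img = flat_image.reshape((28, 28))
    rsltRow, rsltCol = [], []
    for y in range(28):   # count active pixels in each row
        rsltRow.append(int((img[y, :] > threshold).sum()))
    for x in range(28):   # count active pixels in each column
        rsltCol.append(int((img[:, x] > threshold).sum()))
    return [rsltRow, rsltCol]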
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                                     Hyperparams]):
    """Primitive that applies the k nearest neighbor classification algorithm
    to time series data. The tslearn KNeighborsTimeSeriesClassifier
    implementation is wrapped.
    """

    metadata = metadata_base.PrimitiveMetadata({
        "id": "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version": __version__,
        "name": "kanine",
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                "https://github.com/kungfuai/d3m-primitives",
            ],
        },
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.16"
            },
            {
                "type": metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                    "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives".format(
                        git_commit=utils.current_git_commit(
                            os.path.dirname(__file__)),
                    ),
            },
        ],
        "python_path":
            "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS,
        ],
        "primitive_family":
            metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.hyperparams["n_neighbors"],
            metric=self.hyperparams["distance_metric"],
            weights=self.hyperparams["sample_weighting"],
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False

    def get_params(self) -> Params:
        if not self._is_fit:
            return Params(scaler=None, classifier=None, output_columns=None)
        return Params(
            scaler=self._scaler,
            classifier=self._knn,
            output_columns=self._output_columns,
        )

    def set_params(self, *, params: Params) -> None:
        self._scaler = params["scaler"]
        self._knn = params["classifier"]
        self._output_columns = params["output_columns"]
        self._is_fit = all(param is not None for param in params.values())

    def _get_cols(self, input_metadata):
        """Private util function that finds the grouping column from input metadata.

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
            list[int] -- list of column indices annotated with GroupingKey metadata
        """
        # find column with ts value through metadata
        grouping_column = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))
        return grouping_column

    def _get_value_col(self, input_metadata):
        """Private util function that finds the value column from input metadata.

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
            int -- index of column that contains the time series values after
            the Time Series Formatter primitive
        """
        # find attribute column but not file column
        attributes = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/Attribute", ))
        # this assumes a lot, but time series formatters typically place the
        # value column at the end
        attribute_col = attributes[-1]
        return attribute_col

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Sets primitive's training data.

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes
            outputs {Outputs} -- D3M dataframe containing targets
        """
        # load and reshape training data
        self._output_columns = outputs.columns
        outputs = np.array(outputs)
        n_ts = outputs.shape[0]
        ts_sz = inputs.shape[0] // n_ts

        attribute_col = self._get_value_col(inputs.metadata)
        self._X_train = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)
        self._y_train = np.array(outputs).reshape(-1, )

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """Fits KNN model using training data from set_training_data and hyperparameters.

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Returns:
            CallResult[None]
        """
        scaled = self._scaler.fit_transform(self._X_train)
        self._knn.fit(scaled, self._y_train)
        self._is_fit = True
        return CallResult(None, has_finished=self._is_fit)

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Produce primitive's classifications for new time series data.

        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- dataframe with a column containing a predicted
            class for each input time series
        """
        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # find column with ts value through metadata
        grouping_column = self._get_cols(inputs.metadata)
        n_ts = inputs.iloc[:, grouping_column[0]].nunique()
        ts_sz = inputs.shape[0] // n_ts

        attribute_col = self._get_value_col(inputs.metadata)
        x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)

        # make predictions
        scaled = self._scaler.transform(x_vals)
        preds = self._knn.predict(scaled)

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )
        return CallResult(result_df, has_finished=True)