import numpy as np

from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.piecewise import SymbolicAggregateApproximation


def test_sax_scale():
    n, sz, d = 10, 10, 3
    rng = np.random.RandomState(0)
    X = rng.rand(n, sz, d)
    y = rng.choice([0, 1], size=n)

    sax = SymbolicAggregateApproximation(n_segments=3, alphabet_size_avg=2,
                                         scale=True)
    sax.fit(X)
    np.testing.assert_array_almost_equal(X, sax._unscale(sax._scale(X)))
    # After scaling, each dimension has zero mean and unit variance
    np.testing.assert_array_almost_equal(
        np.zeros((d, )),
        sax._scale(X).reshape((-1, d)).mean(axis=0))
    np.testing.assert_array_almost_equal(
        np.ones((d, )),
        sax._scale(X).reshape((-1, d)).std(axis=0))

    # Case of kNN-SAX
    knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="sax",
                                             metric_params={"scale": True})
    knn_sax.fit(X, y)
    X_scale_unscale = knn_sax._sax._unscale(knn_sax._sax._scale(X))
    np.testing.assert_array_almost_equal(X, X_scale_unscale)
    knn_sax.predict(X)
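For context, a minimal sketch (assuming a tslearn version recent enough to expose the scale parameter, as the test above does) of the behaviour being exercised:

import numpy as np

from tslearn.piecewise import SymbolicAggregateApproximation

rng = np.random.RandomState(0)
X = rng.rand(10, 10, 3)  # 10 series, length 10, 3 dimensions

# scale=True z-normalizes each series before discretization
sax = SymbolicAggregateApproximation(n_segments=3, alphabet_size_avg=2,
                                     scale=True)
X_sax = sax.fit_transform(X)
print(X_sax.shape)  # (10, 3, 3): one symbol per segment and dimension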
from numpy.testing import assert_allclose

from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.utils import to_time_series_dataset


def test_variable_length_knn():
    X = to_time_series_dataset([[1, 2, 3, 4],
                                [1, 2, 3],
                                [2, 5, 6, 7, 8, 9],
                                [3, 5, 6, 7, 8]])
    y = [0, 0, 1, 1]

    clf = KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    clf = KNeighborsTimeSeriesClassifier(metric="softdtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])
import numpy

from tslearn.neighbors import KNeighborsTimeSeriesClassifier


def test_serialize_knn_classifier():
    n, sz, d = 15, 10, 3
    rng = numpy.random.RandomState(0)
    X = rng.randn(n, sz, d)
    y = rng.randint(low=0, high=3, size=n)

    knc = KNeighborsTimeSeriesClassifier()

    _check_not_fitted(knc)  # test-local helper: model must start unfitted

    knc.fit(X, y)

    # test-local helper: serialization round trip must preserve predictions
    _check_params_predict(knc, X, ['predict'])
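A plain pickle round trip gives the same guarantee that the test-local serialization helpers check; a small self-contained sketch:

import pickle

import numpy as np
from tslearn.neighbors import KNeighborsTimeSeriesClassifier

rng = np.random.RandomState(0)
X = rng.randn(15, 10, 3)
y = rng.randint(low=0, high=3, size=15)

knc = KNeighborsTimeSeriesClassifier().fit(X, y)
blob = pickle.dumps(knc)           # serialize the fitted model
knc_restored = pickle.loads(blob)  # restore it
np.testing.assert_array_equal(knc.predict(X), knc_restored.predict(X))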
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.preprocessing import TimeSeriesScalerMinMax


class Knn:
    def __init__(self, n_neighbors):
        '''
        Initialize a KNN classifier with the dynamic time warping
        distance metric.

        hyperparameters:
            n_neighbors : number of neighbors on which to base the
                          classification decision
        '''
        self.n_neighbors = n_neighbors
        self.knn_clf = KNeighborsTimeSeriesClassifier(
            n_neighbors=n_neighbors, metric="dtw")

    def __scale_data(self, input_data):
        '''
        Scale input data to the range [0, 1].

        parameters:
            input_data : input data to rescale
        '''
        return TimeSeriesScalerMinMax().fit_transform(input_data)

    def fit(self, X_train, y_train):
        '''
        Fit the KNN classifier on training data.

        parameters:
            X_train : training time series
            y_train : training labels
        '''
        # scale training data to between 0 and 1
        X_train_scaled = self.__scale_data(X_train)
        self.knn_clf.fit(X_train_scaled, y_train)

    def predict(self, X_test):
        '''
        Predict classes for the time series in the test data set.

        parameters:
            X_test : test time series on which to predict classes

        returns:
            predicted classes for the test data set
        '''
        # scale test data to between 0 and 1
        X_test_scaled = self.__scale_data(X_test)
        return self.knn_clf.predict(X_test_scaled)
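Hypothetical usage of the wrapper above on random data (all names below are illustrative):

import numpy as np

rng = np.random.RandomState(0)
X_train = rng.randn(20, 30, 1)   # 20 univariate series of length 30
y_train = rng.randint(0, 2, size=20)
X_test = rng.randn(5, 30, 1)

model = Knn(n_neighbors=3)
model.fit(X_train, y_train)      # data is min-max scaled internally
print(model.predict(X_test))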
import numpy as np
from sklearn.metrics import accuracy_score

from tslearn.neighbors import KNeighborsTimeSeriesClassifier


def NearestCentroidClassification(X_train, X_test, y_train_n, y_test_n,
                                  dataset_name):
    '''
    :param X_train: if using DTAN, should already be aligned
    :param X_test: if using DTAN, should already be aligned
    :param y_train_n: numerical labels (not one-hot)
    :param y_test_n: numerical labels (not one-hot)
    :param dataset_name: name of the dataset, used for reporting
    :return: test set NCC accuracy
    '''
    # vars and placeholders
    input_shape = X_train.shape[1:]
    n_classes = len(np.unique(y_train_n))
    class_names = np.unique(y_train_n, axis=0)
    aligned_means = np.zeros((n_classes, input_shape[0], input_shape[1]))
    ncc_labels = []

    # Within-class Euclidean mean over the training set
    for class_num in class_names:
        train_class_idx = y_train_n == class_num  # get indices
        X_train_aligned_within_class = X_train[train_class_idx]
        aligned_means[int(class_num), :] = np.mean(
            X_train_aligned_within_class, axis=0)
        ncc_labels.append(class_num)
    ncc_labels = np.asarray(ncc_labels)

    # Nearest centroid classification, using the Euclidean distance
    knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=1,
                                             metric="euclidean")
    knn_clf.fit(aligned_means, ncc_labels)
    predicted_labels = knn_clf.predict(X_test)
    acc = accuracy_score(y_test_n, predicted_labels)
    print(f"{dataset_name} - NCC results: {acc}")
    return acc
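An illustrative call on synthetic data (shapes and the dataset name are assumptions):

import numpy as np

rng = np.random.RandomState(0)
X_train = rng.randn(40, 50, 1)   # 40 series, length 50, 1 dimension
X_test = rng.randn(10, 50, 1)
y_train_n = rng.randint(0, 3, size=40).astype(float)
y_test_n = rng.randint(0, 3, size=10).astype(float)

acc = NearestCentroidClassification(X_train, X_test,
                                    y_train_n, y_test_n, "synthetic")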
import numpy as np

from tslearn.neighbors import KNeighborsTimeSeriesClassifier


def test_constrained_paths():
    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, sz, d)
    y = rng.randint(low=0, high=3, size=n)

    model_euc = KNeighborsTimeSeriesClassifier(n_neighbors=3,
                                               metric="euclidean")
    y_pred_euc = model_euc.fit(X, y).predict(X)

    # A Sakoe-Chiba band of radius 0 restricts DTW to the diagonal path,
    # which makes it equivalent to the Euclidean distance
    model_dtw_sakoe = KNeighborsTimeSeriesClassifier(
        n_neighbors=3,
        metric="dtw",
        metric_params={
            "global_constraint": "sakoe_chiba",
            "sakoe_chiba_radius": 0
        })
    y_pred_sakoe = model_dtw_sakoe.fit(X, y).predict(X)
    np.testing.assert_equal(y_pred_euc, y_pred_sakoe)

    # With a very small gamma, soft-DTW behaves like DTW
    model_softdtw = KNeighborsTimeSeriesClassifier(
        n_neighbors=3, metric="softdtw", metric_params={"gamma": 1e-6})
    y_pred_softdtw = model_softdtw.fit(X, y).predict(X)
    model_dtw = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw")
    y_pred_dtw = model_dtw.fit(X, y).predict(X)
    np.testing.assert_equal(y_pred_dtw, y_pred_softdtw)

    model_ctw = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="ctw")
    # Just testing that things run, nothing smart here :(
    model_ctw.fit(X, y).predict(X)

    model_sax = KNeighborsTimeSeriesClassifier(n_neighbors=3,
                                               metric="sax",
                                               metric_params={
                                                   "alphabet_size_avg": 6,
                                                   "n_segments": 10
                                               })
    model_sax.fit(X, y)

    # The MINDIST of SAX is a lower bound of the Euclidean distance
    euc_dist, _ = model_euc.kneighbors(X, n_neighbors=5)
    sax_dist, _ = model_sax.kneighbors(X, n_neighbors=5)
    # The first column contains zeros (distance of each series to itself)
    np.testing.assert_array_less(sax_dist[:, 1:], euc_dist[:, 1:])
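The same equivalences can be checked directly at the distance level with tslearn.metrics; a short sketch:

import numpy as np
from tslearn.metrics import dtw, soft_dtw

rng = np.random.RandomState(0)
s1, s2 = rng.randn(10, 3), rng.randn(10, 3)

# Radius-0 Sakoe-Chiba band forces the diagonal path, so DTW
# degenerates to the Euclidean distance
d_band = dtw(s1, s2, global_constraint="sakoe_chiba", sakoe_chiba_radius=0)
d_euc = np.linalg.norm(s1 - s2)
np.testing.assert_allclose(d_band, d_euc)

# As gamma -> 0, soft-DTW approaches the squared DTW cost
print(soft_dtw(s1, s2, gamma=1e-6), dtw(s1, s2) ** 2)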
from numpy.testing import assert_allclose

from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.utils import to_time_series_dataset


def test_variable_length_knn():
    X = to_time_series_dataset([[1, 2, 3, 4],
                                [1, 2, 3],
                                [9, 8, 7, 6, 5, 2],
                                [8, 7, 6, 5, 3]])
    y = [0, 0, 1, 1]

    clf = KNeighborsTimeSeriesClassifier(metric="dtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    clf = KNeighborsTimeSeriesClassifier(metric="softdtw", n_neighbors=1)
    clf.fit(X, y)
    assert_allclose(clf.predict(X), [0, 0, 1, 1])

    # SAX expects z-normalized input
    scaler = TimeSeriesScalerMeanVariance()
    clf = KNeighborsTimeSeriesClassifier(metric="sax",
                                         n_neighbors=1,
                                         metric_params={'n_segments': 2})
    X_transf = scaler.fit_transform(X)
    clf.fit(X_transf, y)
    assert_allclose(clf.predict(X_transf), [0, 0, 1, 1])
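For reference, to_time_series_dataset pads shorter series with NaN so that variable-length data fits one array; a quick sketch:

import numpy as np
from tslearn.utils import to_time_series_dataset

X = to_time_series_dataset([[1, 2, 3], [1, 2, 3, 4, 5]])
print(X.shape)                       # (2, 5, 1): padded to the longest series
print(np.isnan(X[0, 3:, 0]).all())   # True: padding is NaN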
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

from tslearn.neighbors import KNeighborsTimeSeries, \
    KNeighborsTimeSeriesClassifier
from tslearn.piecewise import SymbolicAggregateApproximation

X_train = X_shuffle[:n_ts_per_blob * n_blobs // 2]
X_test = X_shuffle[n_ts_per_blob * n_blobs // 2:]
y_train = y_shuffle[:n_ts_per_blob * n_blobs // 2]
y_test = y_shuffle[n_ts_per_blob * n_blobs // 2:]

# Nearest neighbor search
knn = KNeighborsTimeSeries(n_neighbors=3, metric="dtw")
knn.fit(X_train, y_train)
dists, ind = knn.kneighbors(X_test)
print("1. Nearest neighbour search")
print("Computed nearest neighbor indices (wrt DTW)\n", ind)
# The returned indices refer to the training set, so look up y_train
print("First nearest neighbor class:", y_train[ind[:, 0]])

# Nearest neighbor classification
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="dtw")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n2. Nearest neighbor classification using DTW")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

# Nearest neighbor classification with a different metric (Euclidean distance)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
knn_clf.fit(X_train, y_train)
predicted_labels = knn_clf.predict(X_test)
print("\n3. Nearest neighbor classification using L2")
print("Correct classification rate:", accuracy_score(y_test, predicted_labels))

# Nearest neighbor classification based on SAX representation
sax_trans = SymbolicAggregateApproximation(n_segments=10, alphabet_size_avg=5)
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, metric="euclidean")
pipeline_model = Pipeline(steps=[('sax', sax_trans), ('knn', knn_clf)])
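The snippet ends right after building pipeline_model; a hedged continuation, in the spirit of the tslearn gallery example it resembles, would fit and score the SAX+kNN pipeline:

# Assumed continuation: fit the SAX+kNN pipeline and evaluate it
pipeline_model.fit(X_train, y_train)
predicted_labels = pipeline_model.predict(X_test)
print("\n4. Nearest neighbor classification using SAX")
print("Correct classification rate:",
      accuracy_score(y_test, predicted_labels))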
import numpy as np

from tslearn.neighbors import KNeighborsTimeSeriesClassifier


def getScore(preds, labels):
    correct = (preds == labels[:len(preds)])
    score = float(sum(correct)) / len(correct)
    return score


# get train/test data/labels
inDataFile = 'data/160k_f100_20190908-1401.txt'
labels, data = dp.readProcFile(inDataFile)
labels = np.array(labels)
data = np.array(data)
trainData, trainLabels, testData, testLabels = dp.splitTestTrainSets(
    data, labels, 0.8, 'Stratified')

# z-normalisation
trainData, testData = dp.znorm(trainData, testData)

clf = KNeighborsTimeSeriesClassifier(n_jobs=-1)
print("Fitting...")
clf.fit(trainData, trainLabels)

print("Scoring...")
predictions = []
for i in range(len(testData)):
    if (i % 10 == 0) and (i > 0):
        print("{} complete...current score: {}".format(
            i, getScore(np.array(predictions), testLabels)))
    predictions += clf.predict([testData[i]]).tolist()
predictions = np.array(predictions)
test_acc = getScore(predictions, testLabels)
# test_acc = clf.score(testData, testLabels)
print(test_acc)
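The per-sample loop above exists only for incremental progress output; an equivalent batched evaluation (same names as above) is:

# One-shot evaluation, without incremental progress reporting
predictions = clf.predict(testData)
print(getScore(np.array(predictions), testLabels))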
import time

from sklearn.base import clone
from sklearn.metrics import accuracy_score

from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='sax')
knn_eucl = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='euclidean')

accuracies = {}
times = {}
for dataset, w in datasets:
    X_train, y_train, X_test, y_test = data_loader.load_dataset(dataset)

    # z-normalize each series independently (required for SAX); since the
    # scaler is per-series, fit_transform on the test set leaks nothing
    ts_scaler = TimeSeriesScalerMeanVariance()
    X_train = ts_scaler.fit_transform(X_train)
    X_test = ts_scaler.fit_transform(X_test)

    # Fit 1-NN using the SAX representation & MINDIST
    metric_params = {'n_segments': w, 'alphabet_size_avg': 10}
    knn_sax = clone(knn_sax).set_params(metric_params=metric_params)
    start = time.time()
    knn_sax.fit(X_train, y_train)
    acc_sax = accuracy_score(y_test, knn_sax.predict(X_test))
    time_sax = time.time() - start

    # Fit 1-NN using the Euclidean distance on raw values
    start = time.time()
    knn_eucl.fit(X_train, y_train)
    acc_euclidean = accuracy_score(y_test, knn_eucl.predict(X_test))
    time_euclidean = time.time() - start

    accuracies[dataset] = (acc_sax, acc_euclidean)
    times[dataset] = (time_sax, time_euclidean)

print_table(accuracies, times)
from datetime import datetime
from os import path as os_path

import pandas as pd
from numpy import genfromtxt

from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.utils import to_time_series_dataset

raw_data = pd.read_csv(os_path.join(working_dir_path,
                                    "./data/train_curves.csv"),
                       header=None)
time_series_train = to_time_series_dataset(raw_data)
labels_train = genfromtxt(os_path.join(
    working_dir_path, "./data/train_clustering_result.csv"), delimiter=',')

# Define the model
knn_classification_model = KNeighborsTimeSeriesClassifier(n_neighbors=5,
                                                          metric="dtw",
                                                          n_jobs=4)

# Fit the model using the training data
knn_classification_model.fit(time_series_train, labels_train)

#############################################################################################
# save model
#############################################################################################
print(
    "#############################################################################################"
)

# string with the current datetime, used to tag the saved model
now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
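The original snippet is cut off inside a try block that saves the model; a plausible completion, assuming joblib and a models/ folder (path and filename are guesses):

import joblib

try:
    # save model to models folder (hypothetical completion)
    model_path = os_path.join(working_dir_path,
                              "models/knn_model_{}.joblib".format(now))
    joblib.dump(knn_classification_model, model_path)
except OSError as err:
    print("Could not save the model:", err)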
        # (tail of the mapper() helper; the start of the function is not shown)
        tmpSum += 1
        # rsltCol.append([x, tmpSum])
        rsltCol.append(tmpSum)
    return [rsltRow, rsltCol]


def dataToSeries(dataset):
    rowArray = []
    # colArray = []
    for i in range(0, len(dataset)):
        row = mapper(dataset[i])
        rowArray.append(row)
        # colArray.append(col)
    return to_time_series(rowArray)


X_train, y_train, X_test, y_test = load_data('data/')
X_train_ts = dataToSeries(X_train)
X_test_ts = dataToSeries(X_test)

knn_clf_dtw = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="dtw")
knn_clf_dtw.fit(X_train_ts, y_train)
predicted_labels_dtw = knn_clf_dtw.predict(X_test_ts)

print("knn with dtw: \n", accuracy_score(y_test, predicted_labels_dtw))
print("Classification report: \n",
      classification_report(y_test, predicted_labels_dtw))
print("Confusion matrix: \n",
      confusion_matrix(y_test, predicted_labels_dtw))

unlabeled = pd.read_csv("data/test.csv")
unlabeled = unlabeled.values
unlabeled_ts = dataToSeries(unlabeled)
plt.imshow(unlabeled[165].reshape((28, 28)))
predicted_label_dtw = knn_clf_dtw.predict(unlabeled_ts)
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs,
                                                     Params, Hyperparams]):
    """
    Primitive that applies the k nearest neighbor classification algorithm
    to time series data. The tslearn KNeighborsTimeSeriesClassifier
    implementation is wrapped.
    """

    metadata = metadata_base.PrimitiveMetadata({
        "id": "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version": __version__,
        "name": "kanine",
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                "https://github.com/kungfuai/d3m-primitives",
            ],
        },
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.16"
            },
            {
                "type": metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        "python_path":
        "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION,
    })

    def __init__(self, *, hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.hyperparams["n_neighbors"],
            metric=self.hyperparams["distance_metric"],
            weights=self.hyperparams["sample_weighting"],
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False

    def get_params(self) -> Params:
        if not self._is_fit:
            return Params(scaler=None, classifier=None, output_columns=None)
        return Params(
            scaler=self._scaler,
            classifier=self._knn,
            output_columns=self._output_columns,
        )

    def set_params(self, *, params: Params) -> None:
        self._scaler = params["scaler"]
        self._knn = params["classifier"]
        self._output_columns = params["output_columns"]
        self._is_fit = all(param is not None for param in params.values())

    def _get_cols(self, input_metadata):
        """Private util function that finds the grouping column from input
        metadata.

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for
                input frame

        Returns:
            list[int] -- list of column indices annotated with GroupingKey
                metadata
        """
        # find column with ts value through metadata
        grouping_column = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))
        return grouping_column

    def _get_value_col(self, input_metadata):
        """Private util function that finds the value column from input
        metadata.

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for
                input frame

        Returns:
            int -- index of column that contains the time series values
                after the Time Series Formatter primitive
        """
        # find attribute column, but not the file column
        attributes = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/Attribute", ))
        # this assumes a lot, but time series formatters typically place
        # the value column at the end
        attribute_col = attributes[-1]
        return attribute_col

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Sets the primitive's training data.

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes
            outputs {Outputs} -- D3M dataframe containing targets
        """
        # load and reshape training data
        self._output_columns = outputs.columns
        outputs = np.array(outputs)
        n_ts = outputs.shape[0]
        ts_sz = inputs.shape[0] // n_ts
        attribute_col = self._get_value_col(inputs.metadata)
        self._X_train = inputs.iloc[:, attribute_col].values.reshape(
            n_ts, ts_sz)
        self._y_train = np.array(outputs).reshape(-1, )

    def fit(self, *, timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """Fits the KNN model using training data from set_training_data
        and hyperparameters.

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Returns:
            CallResult[None]
        """
        scaled = self._scaler.fit_transform(self._X_train)
        self._knn.fit(scaled, self._y_train)
        self._is_fit = True
        return CallResult(None, has_finished=self._is_fit)

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Produce the primitive's classifications for new time series data.

        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes,
                key, and target

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- dataframe with a column containing a
                predicted class for each input time series
        """
        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # find column with ts value through metadata
        grouping_column = self._get_cols(inputs.metadata)
        n_ts = inputs.iloc[:, grouping_column[0]].nunique()
        ts_sz = inputs.shape[0] // n_ts
        attribute_col = self._get_value_col(inputs.metadata)
        x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)

        # make predictions
        scaled = self._scaler.transform(x_vals)
        preds = self._knn.predict(scaled)

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )
        return CallResult(result_df, has_finished=True)
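The reshape in set_training_data and produce assumes a long-format frame with n_ts * ts_sz value rows; a tiny numpy illustration of that layout:

import numpy as np

# Hypothetical long-format values: 3 series of 4 timesteps stacked in one column
values = np.arange(12.0)
n_ts = 3
ts_sz = values.shape[0] // n_ts
X = values.reshape(n_ts, ts_sz)  # one row per series, as in set_training_data
print(X)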
from tslearn.neighbors import KNeighborsTimeSeriesClassifier


def NN1_DTWClassifier(X_train, Y_train):
    knn1_clf = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="dtw")
    knn1_clf.fit(X_train, Y_train)
    return knn1_clf
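Illustrative usage on random data (names are assumptions):

import numpy as np

rng = np.random.RandomState(0)
X_train = rng.randn(10, 20, 1)
Y_train = rng.randint(0, 2, size=10)

clf = NN1_DTWClassifier(X_train, Y_train)
print(clf.predict(X_train[:3]))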
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs,
                                                     Params, Hyperparams]):
    """
    Primitive that applies the k nearest neighbor classification algorithm
    to time series data. The tslearn KNeighborsTimeSeriesClassifier
    implementation is wrapped.

    Training inputs: 1) Feature dataframe, 2) Target dataframe

    Outputs: Dataframe with predictions for specific time series at specific
    future time instances

    Arguments:
        hyperparams {Hyperparams} -- D3M Hyperparameter object

    Keyword Arguments:
        random_seed {int} -- random seed (default: {0})
    """

    metadata = metadata_base.PrimitiveMetadata({
        # Simply a UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        "id": "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version": __version__,
        "name": "kanine",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/Yonder-OSS/D3M-Primitives",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system
        # packages, or Docker images. Of course Python packages can also have
        # their own dependencies, but sometimes it is necessary to install a
        # Python package first to be even able to run setup.py of another
        # package. Or you have a dependency which is not on PyPi.
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.14"
            },
            {
                "type": metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/Yonder-OSS/D3M-Primitives.git@{git_commit}#egg=yonder-primitives"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        "python_path":
        "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        # Choose these from a controlled vocabulary in the schema. If anything
        # is missing which would best describe the primitive, make a merge request.
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION,
    })

    def __init__(self, *, hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.hyperparams["n_neighbors"],
            metric=self.hyperparams["distance_metric"],
            weights=self.hyperparams["sample_weighting"],
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False

    def get_params(self) -> Params:
        if not self._is_fit:
            return Params(scaler=None, classifier=None, output_columns=None)
        return Params(scaler=self._scaler,
                      classifier=self._knn,
                      output_columns=self._output_columns)

    def set_params(self, *, params: Params) -> None:
        self._scaler = params['scaler']
        self._knn = params['classifier']
        self._output_columns = params['output_columns']
        self._is_fit = all(param is not None for param in params.values())

    def _get_cols(self, input_metadata):
        """Private util function that finds the grouping column from input
        metadata.

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for
                input frame

        Returns:
            list[int] -- list of column indices annotated with GroupingKey
                metadata
        """
        # find column with ts value through metadata
        grouping_column = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))
        return grouping_column

    def _get_value_col(self, input_metadata):
        """Private util function that finds the value column from input
        metadata.

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for
                input frame

        Returns:
            int -- index of column that contains the time series values
                after the Time Series Formatter primitive
        """
        # find attribute column, but not the file column
        attributes = input_metadata.list_columns_with_semantic_types(
            ('https://metadata.datadrivendiscovery.org/types/Attribute', ))
        # this assumes a lot, but time series formatters typically place
        # the value column at the end
        attribute_col = attributes[-1]
        return attribute_col

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Sets the primitive's training data.

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes
            outputs {Outputs} -- D3M dataframe containing targets
        """
        # load and reshape training data
        self._output_columns = outputs.columns
        outputs = np.array(outputs)
        n_ts = outputs.shape[0]
        ts_sz = inputs.shape[0] // n_ts
        attribute_col = self._get_value_col(inputs.metadata)
        self._X_train = inputs.iloc[:, attribute_col].values.reshape(
            n_ts, ts_sz)
        self._y_train = np.array(outputs).reshape(-1, )

    def fit(self, *, timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """Fits the KNN model using training data from set_training_data
        and hyperparameters.

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Returns:
            CallResult[None]
        """
        scaled = self._scaler.fit_transform(self._X_train)
        self._knn.fit(scaled, self._y_train)
        self._is_fit = True
        return CallResult(None, has_finished=self._is_fit)

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Produce the primitive's classifications for new time series data.

        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes,
                key, and target

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- dataframe with a column containing a
                predicted class for each input time series
        """
        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # find column with ts value through metadata
        grouping_column = self._get_cols(inputs.metadata)
        n_ts = inputs.iloc[:, grouping_column[0]].nunique()
        ts_sz = inputs.shape[0] // n_ts
        attribute_col = self._get_value_col(inputs.metadata)
        x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)

        # make predictions
        scaled = self._scaler.transform(x_vals)
        preds = self._knn.predict(scaled)

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )
        return CallResult(result_df, has_finished=True)
import numpy as np
from sklearn.metrics import accuracy_score

from tslearn.neighbors import KNeighborsTimeSeriesClassifier


def convert_mts(rep, dataset):
    seq = np.genfromtxt(rep + dataset, delimiter=' ', dtype=str,
                        encoding="utf8")
    ids, counts = np.unique(seq[:, 0], return_counts=True)
    No = ids.shape[0]
    D = seq.shape[1] - 3
    arr = np.asarray((ids, counts)).T
    Max_Seq_Len = np.max(arr[:, 1].astype(int))  # np.int is deprecated
    # pack the sequences into a zero-initialized array
    out_X = np.zeros((No, Max_Seq_Len, D))
    out_Y = np.zeros((No, ))
    for idx, id in enumerate(ids):
        seq_cpy = seq[seq[:, 0] == id]
        out_X[idx] = seq_cpy[:, 3:]
        out_Y[idx] = seq_cpy[0, 2]
    return out_X, out_Y


x_train, y_train = convert_mts(rep, ds_train)
x_test, y_test = convert_mts(rep, ds_test)

clf = KNeighborsTimeSeriesClassifier(n_neighbors=2, metric="dtw")
y_test_pred = clf.fit(x_train, y=y_train).predict(x_test)
print("the accuracy score of the testing data is: {}".format(
    accuracy_score(y_test, y_test_pred)))
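convert_mts packs all sequences into a zero-initialized array; an alternative sketch using tslearn's NaN padding, which DTW-based estimators handle natively:

import numpy as np
from tslearn.utils import to_time_series_dataset

# Hypothetical ragged input: two multivariate series of different lengths
series_list = [np.random.randn(5, 2), np.random.randn(8, 2)]
X = to_time_series_dataset(series_list)  # shape (2, 8, 2), NaN-padded
print(np.isnan(X[0, 5:, :]).all())       # True: padding is NaN, not zero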
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import roc_curve

from tslearn.clustering import TimeSeriesKMeans
from tslearn.neighbors import KNeighborsTimeSeriesClassifier


class AnomalyDetection(ClassifierMixin, BaseEstimator):
    """
    Anomaly detection with 1-NN and automatic calculation of the optimal
    threshold.
    """

    def __init__(self, n_clusters=200):
        self.knn = KNeighborsTimeSeriesClassifier(n_neighbors=1,
                                                  weights='uniform',
                                                  metric='euclidean',
                                                  n_jobs=-1)
        self.d = None
        self.n_clusters = n_clusters

    def fit(self, X, y):
        """
        Fit the algorithm according to the given training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features, n_channels)
            Training samples.
        y : array-like of shape (n_samples,)
            True labels for X.

        Returns
        -------
        self : object
            Fitted model
        """
        # Fit the anomaly detection 1-NN over k-means centroids of the
        # normal ("good") samples
        X_good = X[np.where(y == 0)]
        X_bad = X[np.where(y != 0)]
        km = TimeSeriesKMeans(n_clusters=self.n_clusters,
                              metric="euclidean",
                              max_iter=100,
                              random_state=0,
                              n_jobs=-1).fit(X_good)
        self.knn.fit(km.cluster_centers_, np.zeros((self.n_clusters, )))

        # Distances of all good and bad samples to their nearest centroid
        d_bad, _ = self.knn.kneighbors(X_bad)
        d_good, _ = self.knn.kneighbors(X_good)

        # Calculate the ROC curve over these distances
        y_true = np.hstack(
            (np.zeros(X_good.shape[0]), np.ones(X_bad.shape[0])))
        y_score = np.vstack((d_good, d_bad)).ravel()
        fpr, tpr, thresholds = roc_curve(y_true, y_score, pos_label=1)

        # Choose the threshold d that maximizes the Youden index
        self.d = thresholds[np.argmax(tpr - fpr)]
        return self

    def predict(self, X):
        """
        Perform a classification on samples in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features, n_channels)
            Test samples.

        Returns
        -------
        y_pred : array, shape (n_samples,)
            Predictions
        """
        # Binary prediction: 1 (anomaly) if the distance to the nearest
        # centroid exceeds the learned threshold d
        y_pred = np.squeeze(
            np.where(self.knn.kneighbors(X)[0] < self.d, 0, 1))
        return y_pred
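A usage sketch on synthetic data, assuming label 0 marks normal samples:

import numpy as np

rng = np.random.RandomState(0)
X_normal = rng.randn(300, 20, 1)
X_anom = rng.randn(20, 20, 1) + 5.0  # shifted, hence anomalous
X = np.vstack((X_normal, X_anom))
y = np.hstack((np.zeros(300), np.ones(20)))

det = AnomalyDetection(n_clusters=10)
det.fit(X, y)
print(det.predict(X[:5]))            # 0 = normal, 1 = anomaly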