def transform(self, X, y=None):
    """
    Compute the matrix profile of every time series in a dataset.

    Parameters
    ----------
    X: pandas.DataFrame
        Time series dataset. It is validated as univariate.
    y : optional
        Not used by this method.

    Output
    ------
    Xt: pandas.DataFrame
        Dataframe with the same number of rows as the input. The number
        of columns equals the number of subsequences of the desired
        length in each time series.
    """
    # Validate estimator state and input before doing any work
    self.check_is_fitted()
    check_X(X, enforce_univariate=True)
    n_rows = X.shape[0]

    # Flatten the nested input into a plain table, one row per series
    X = Tabularizer().fit_transform(X)

    # One matrix profile per row, using subsequence length self.m
    profiles = [
        stomp_self(np.array([X.iloc[row]]), self.m)
        for row in range(n_rows)
    ]
    return pd.DataFrame(profiles)
def predict_proba(self, X, input_checks=True, **kwargs):
    """
    Average the class-probability predictions of all stored networks.

    Parameters
    ----------
    X : 2d/3d numpy array, or pandas DataFrame with a single column of
        pandas Series objects
        The samples to predict for. A DataFrame with more than one
        column, or whose cells are not Series, is rejected because the
        networks cannot yet handle multivariate problems.
    input_checks: boolean
        whether to run ``check_X`` on the X parameter
    **kwargs
        Forwarded unchanged to each keras model's ``predict``.

    Returns
    -------
    output : array of shape = [n_instances, n_classes] of probabilities

    Raises
    ------
    TypeError
        If X is a DataFrame that is not a single column of Series.
    """
    if input_checks:
        check_X(X)

    if isinstance(X, pd.DataFrame):
        is_single_nested_column = (
            X.shape[1] <= 1 and isinstance(X.iloc[0, 0], pd.Series))
        if not is_single_nested_column:
            raise TypeError(
                "Input should either be a 2d numpy array, or a pandas "
                "dataframe with a single column of Series objects "
                "(networks cannot yet handle multivariate problems")
        X = np.asarray([cell.values for cell in X.iloc[:, 0]])

    if len(X.shape) == 2:
        # add a trailing axis so the data looks multivariate with a
        # single dimension
        n_rows, n_steps = X.shape
        X = X.reshape((n_rows, n_steps, 1))

    summed = np.zeros((X.shape[0], self.nb_classes))

    for skdl_model in self.skdl_models:
        if self.keep_in_memory:
            network = skdl_model.model
        else:
            # entries are file stems here; load the model from disk
            network = keras.models.load_model(
                Path(self.model_save_directory) / (skdl_model + ".hdf5"))

        # a keras model's predict already yields probability
        # distributions, i.e. what sklearn calls predict_proba
        summed = summed + network.predict(X, **kwargs)

        if not self.keep_in_memory:
            # release the freshly loaded model before loading the next
            del network
            gc.collect()
            keras.backend.clear_session()

    averaged = summed / len(self.skdl_models)

    # a single output column means binary classification: expand it to
    # [P(class 0), P(class 1)]
    if averaged.shape[1] == 1:
        return np.hstack([1 - averaged, averaged])
    return averaged
def test_check_X_enforce_min_columns():
    """check_X and check_X_y reject data with fewer columns than asked."""
    X, y = make_classification_problem(n_columns=2)
    expected_pattern = r"columns"

    # two columns available, three demanded
    with pytest.raises(ValueError, match=expected_pattern):
        check_X(X, enforce_min_columns=3)

    with pytest.raises(ValueError, match=expected_pattern):
        check_X_y(X, y, enforce_min_columns=3)
def test_check_X_enforce_univariate():
    """check_X and check_X_y reject two-column data when univariate is
    enforced."""
    X, y = make_classification_problem(n_columns=2)
    expected_pattern = r"univariate"

    with pytest.raises(ValueError, match=expected_pattern):
        check_X(X, enforce_univariate=True)

    with pytest.raises(ValueError, match=expected_pattern):
        check_X_y(X, y, enforce_univariate=True)
def test_check_enforce_min_instances():
    """check_X, check_X_y and check_y reject data with too few instances."""
    X, y = make_classification_problem(n_instances=3)
    expected_pattern = r"instance"

    # three instances available, four demanded
    with pytest.raises(ValueError, match=expected_pattern):
        check_X(X, enforce_min_instances=4)

    with pytest.raises(ValueError, match=expected_pattern):
        check_X_y(X, y, enforce_min_instances=4)

    with pytest.raises(ValueError, match=expected_pattern):
        check_y(y, enforce_min_instances=4)
def _set_oob_score(self, X, y):
    """Compute the out-of-bag score and decision function.

    Each estimator predicts probabilities only for the rows it did not
    see in its bootstrap sample; these are accumulated per output and
    then normalised. Results are stored in ``oob_decision_function_``
    and ``oob_score_``.

    Parameters
    ----------
    X : pandas DataFrame
        Training data, validated as univariate; rows are selected with
        ``.iloc``.
    y : 2d array
        Targets, indexed as ``y[:, k]`` for output ``k``.
    """
    check_X_y(X, y)
    check_X(X, enforce_univariate=True)

    n_outputs = self.n_outputs_
    class_counts = self.n_classes_
    n_rows = y.shape[0]

    # one accumulator of summed probabilities per output
    votes = [
        np.zeros((n_rows, class_counts[k])) for k in range(n_outputs)
    ]

    n_bootstrap = _get_n_samples_bootstrap(n_rows, self.max_samples)

    for pipeline in self.estimators_:
        # the random state of the last step reproduces the bootstrap
        last_step = pipeline.steps[-1][1]
        oob_rows = _generate_unsampled_indices(
            last_step.random_state, n_rows, n_bootstrap)
        oob_proba = pipeline.predict_proba(X.iloc[oob_rows, :])

        if n_outputs == 1:
            oob_proba = [oob_proba]

        for k in range(n_outputs):
            votes[k][oob_rows, :] += oob_proba[k]

    decisions = []
    score_total = 0.0
    for k in range(n_outputs):
        row_totals = votes[k].sum(axis=1)
        if (row_totals == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")

        decisions.append(votes[k] / row_totals[:, np.newaxis])
        score_total += np.mean(
            y[:, k] == np.argmax(votes[k], axis=1), axis=0)

    if n_outputs == 1:
        self.oob_decision_function_ = decisions[0]
    else:
        self.oob_decision_function_ = decisions

    self.oob_score_ = score_total / n_outputs
def transform(self, X, y=None):
    """Locate runs of ``self.value`` in each series of X.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_columns]
        Nested dataframe with time-series in cells; validated as
        univariate.
    y : optional
        Not used by this method.

    Returns
    -------
    Xt : pandas DataFrame
        Two columns, ``<prefix>_starts`` and ``<prefix>_lengths``, each
        holding one array per input series. Only runs of at least
        ``self.min_length`` points are kept.
    """
    # input checks
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True, coerce_to_pandas=True)

    first_column = X.columns[0]

    self._starts = []
    self._lengths = []

    for series in X.iloc[:, 0]:
        values = np.asarray(series)

        # 0/1 indicator of the points matching the target value; NaN
        # and inf need dedicated tests
        if np.isnan(self.value):
            hits = np.where(np.isnan(values), 1, 0)
        elif np.isinf(self.value):
            hits = np.where(np.isinf(values), 1, 0)
        else:
            hits = np.where(values == self.value, 1, 0)

        # zero-pad both ends so runs touching the boundaries still
        # produce a +1 (start) and a -1 (end) in the difference
        steps = np.diff(np.hstack([0, hits, 0]))

        run_starts = np.where(steps == 1)[0]
        run_ends = np.where(steps == -1)[0]
        run_lengths = run_ends - run_starts

        # drop runs shorter than the configured minimum
        long_enough = run_lengths >= self.min_length
        self._starts.append(run_starts[long_enough])
        self._lengths.append(run_lengths[long_enough])

    # put into dataframe
    value_label = "nan" if np.isnan(self.value) else str(self.value)
    column_prefix = "%s_%s" % (first_column, value_label)

    Xt = pd.DataFrame()
    Xt["%s_starts" % column_prefix] = pd.Series(self._starts)
    Xt["%s_lengths" % column_prefix] = pd.Series(self._lengths)
    return Xt
def fit(self, X, y=None):
    """Validate the parameters and compute the word breakpoints.

    Breakpoints come from ``_igb`` when ``self.igb`` is set and from
    ``_mcb`` otherwise.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The class labels. Required when ``self.igb`` is set.

    Returns
    -------
    self : object

    Raises
    ------
    ValueError
        If the alphabet size is outside 2..4, the word length is
        outside 1..16, or ``self.igb`` is set and ``y`` is None.
    """
    alphabet_size = self.alphabet_size
    if alphabet_size < 2 or alphabet_size > 4:
        raise ValueError(
            "Alphabet size must be an integer between 2 and 4")

    word_length = self.word_length
    if word_length < 1 or word_length > 16:
        raise ValueError("Word length must be an integer between 1 and 16")

    if self.igb and y is None:
        raise ValueError(
            "Class values must be provided for information gain binning")

    X = check_X(X, enforce_univariate=True)
    X = tabularize(X, return_array=True)

    self.n_instances, self.series_length = X.shape

    if self.igb:
        self.breakpoints = self._igb(X, y)
    else:
        self.breakpoints = self._mcb(X)

    self._is_fitted = True
    return self
def _transform_words(self, X):
    """Build one merged bag of SFA words per instance.

    For every column in ``self.col_names`` and every window size of
    that column, the matching SFA transformer is applied and its words
    are re-keyed through ``MUSE.shift_left`` before being merged into
    a single dictionary per instance.

    Parameters
    ----------
    X : nested pandas DataFrame (or input coercible to one)
        May be multivariate.

    Returns
    -------
    list of dict
        One dictionary per instance mapping re-keyed word to its value.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)

    if self.use_first_order_differences:
        X = self.add_first_order_differences(X)

    merged_bags = [{} for _ in range(len(X))]

    # On each dimension, perform SFA
    for dim, column in enumerate(self.col_names):
        X_dim = from_nested_to_3d_numpy(X[[column]])

        for w, window_size in enumerate(self.window_sizes[dim]):
            # SFA transform
            sfa_words = self.SFA_transformers[dim][w].transform(X_dim)
            bag = sfa_words[0]

            # merge the bags of the different window sizes into one
            # bag per instance; the re-keyed word tells window sizes
            # and dimensions apart
            highest = np.int32(self.highest_bits[dim])

            for row in range(len(bag)):
                target = merged_bags[row]
                for key, count in bag[row].items():
                    word = MUSE.shift_left(
                        key, highest, dim,
                        self.highest_dim_bit, window_size)
                    target[word] = count

    return merged_bags
def predict(self, X):
    """Predict regression target for X.

    The prediction for each sample is the mean of the predictions of
    all estimators in ``self.estimators_``.

    Parameters
    ----------
    X : input samples
        Validated as univariate by ``check_X`` and then passed through
        ``self._validate_X_predict``.

    Returns
    -------
    y : array
        The predicted values, averaged over the estimators.
    """
    self.check_is_fitted()

    # Check data
    X = check_X(X, enforce_univariate=True)
    X = self._validate_X_predict(X)

    # Only the number of jobs is needed from the partitioning
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # Run every estimator's predict in parallel
    run_parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose)
    per_estimator = run_parallel(
        delayed(est.predict)(X, check_input=True)
        for est in self.estimators_)

    total = np.sum(per_estimator, axis=0)
    return total / len(self.estimators_)
def transform(self, X, y=None):
    """
    Pad every series in X using ``self._create_pad``.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, n_columns]
        Nested dataframe with time-series in cells.
    y : optional
        Not used by this method.

    Returns
    -------
    Xt : pandas DataFrame
        One row per instance; each cell holds the padded series.

    Raises
    ------
    ValueError
        If the longest series in X is longer than ``self.pad_length_``.
    """
    self.check_is_fitted()
    X = check_X(X)

    # unpacking keeps the requirement that X is two-dimensional
    n_instances, _ = X.shape

    arr = [X.iloc[i, :].values for i in range(n_instances)]

    max_length = _get_max_length(arr)

    if max_length > self.pad_length_:
        # message split with implicit concatenation; the previous
        # backslash continuation leaked a stray backslash into the text
        raise ValueError(
            "Error: max_length of series "
            "is greater than the one found when fit or set.")

    pad = [
        pd.Series([self._create_pad(series) for series in out])
        for out in arr
    ]

    return pd.DataFrame(pad)
def predict(self, X):
    """Predict the class of each instance by nearest-neighbour search.

    Each test bag is compared with every stored training bag using
    ``boss_distance``; the label of the closest bag is returned, with
    ties broken at random.

    Parameters
    ----------
    X : input samples, validated as univariate.

    Returns
    -------
    numpy array
        One predicted label per instance.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True)

    rng = check_random_state(self.random_state)

    query_bags = self.transformer.transform(X).iloc[:, 0]

    predicted = []
    for query_bag in query_bags:
        closest = sys.float_info.max
        label = None

        for idx, train_bag in enumerate(self.transformed_data):
            # the current best is passed along to boss_distance
            dist = boss_distance(query_bag, train_bag, closest)

            # a random draw happens only on an exact tie
            if dist < closest or (dist == closest
                                  and rng.random() < 0.5):
                closest = dist
                label = self.class_vals[idx]

        predicted.append(label)

    return np.array(predicted)
def predict_proba(self, X):
    """
    Find probability estimates for each class for all cases in X.

    The estimates of all fitted estimators, each evaluated on its own
    intervals, are summed and divided by the number of estimators.

    Parameters
    ----------
    X : input samples, validated as univariate and coerced to a numpy
        array. The series length must equal the one stored in
        ``self.series_length``.

    Returns
    -------
    output : nd.array of shape = (n_instances, n_classes)
        Predicted probabilities

    Raises
    ------
    TypeError
        If the series length differs from ``self.series_length``.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)
    # drop the singleton axis at position 1
    X = X.squeeze(1)

    n_timepoints = X.shape[1]
    if n_timepoints != self.series_length:
        raise TypeError(
            " ERROR number of attributes in the train does not match "
            "that in the test data")

    run_parallel = Parallel(n_jobs=self.n_jobs)
    y_probas = run_parallel(
        delayed(_predict_proba_for_estimator)(
            X, self.estimators_[k], self.intervals_[k])
        for k in range(self.n_estimators))

    divisor = np.ones(self.n_classes) * self.n_estimators
    return np.sum(y_probas, axis=0) / divisor
def fit(self, X, y=None):
    """
    Fit transformer by resolving the interval specification.

    A numpy array in ``self.intervals`` is used as given; an integer
    splits the time index of X into that many parts.

    Parameters
    ----------
    X : pandas DataFrame of shape [n_samples, n_features]
        Input data, validated as univariate.
    y : pandas Series, shape (n_samples, ...), optional
        Not used by this method.

    Returns
    -------
    self : an instance of self.

    Raises
    ------
    ValueError
        If ``self.intervals`` is neither a numpy array nor an integer.
    """
    X = check_X(X, enforce_univariate=True)

    self.input_shape_ = X.shape

    # Retrieve time-series indexes from each column.
    self._time_index = get_time_index(X)

    requested = self.intervals
    if isinstance(requested, np.ndarray):
        self.intervals_ = requested
    elif is_int(requested):
        self.intervals_ = np.array_split(self._time_index, requested)
    else:
        raise ValueError(
            f"Intervals must be either an integer, an array with "
            f"start and end points, but found: {self.intervals}")

    self._is_fitted = True
    return self
def transform(self, X, y=None):
    """
    Apply PAA to every column of X independently.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, n_dims]
        Nested dataframe with multivariate time-series in cells.
    y : optional
        Not used by this method.

    Returns
    -------
    dims: Pandas data frame with first dimension in column zero,
          second in column one etc.
    """
    # Check the data
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=False)

    # The length of the very first series is what the parameters are
    # checked against
    series_length = len(X.iloc[0, 0])
    column_names = X.columns

    # Check the parameters are appropriate
    self._check_parameters(series_length)

    # On each dimension, perform PAA
    per_dimension = [
        self._perform_paa_along_dim(pd.DataFrame(X[name]))
        for name in column_names
    ]

    # Combine the dimensions together
    combined = pd.concat(per_dimension, axis=1, sort=False)
    combined.columns = column_names
    return combined
def predict_proba(self, X):
    """Predict class probabilities for X.

    The result is the mean of the class probabilities predicted by
    every estimator in ``self.estimators_``.

    Parameters
    ----------
    X : input samples
        Validated as univariate by ``check_X`` and then passed through
        ``self._validate_X_predict``.

    Returns
    -------
    p : array
        The class probabilities of the input samples, averaged over
        the estimators.
    """
    # Check data
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True)
    X = self._validate_X_predict(X)

    # Only the number of jobs is needed from the partitioning
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    run_parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose)
    per_estimator = run_parallel(
        delayed(est.predict_proba)(X) for est in self.estimators_)

    total = np.sum(per_estimator, axis=0)
    return total / len(self.estimators_)
def transform(self, X, y=None):
    """Concatenate multivariate time series/panel data into long
    univariate time series/panel data by simply concatenating times
    series in time.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_features]
        Nested dataframe with time-series in cells. Input that is not
        a DataFrame after validation is handled as a 3d numpy array.
    y : optional
        Not used by this method.

    Returns
    -------
    Xt : pandas DataFrame
        Transformed pandas DataFrame with same number of rows and
        single column
    """
    self.check_is_fitted()
    X = check_X(X)

    # Flatten all columns into one 2d table, then nest that table back
    # into a single column
    to_2d = (from_nested_to_2d_array if isinstance(X, pd.DataFrame)
             else from_3d_numpy_to_2d_array)
    return from_2d_array_to_nested(to_2d(X))
def predict(self, X):
    """Predict the class of each instance by nearest-neighbour search.

    Each test bag is compared with every stored training bag using
    ``histogram_intersection``; the label of the most similar bag is
    returned, with ties broken at random.

    Parameters
    ----------
    X : pandas Series/DataFrame or other input
        pandas input is validated as univariate and tabularized into
        an array; anything else goes to the transformer unchanged.

    Returns
    -------
    numpy array
        One predicted label per instance.
    """
    self.check_is_fitted()

    if isinstance(X, (pd.Series, pd.DataFrame)):
        X = check_X(X, enforce_univariate=True)
        X = tabularize(X, return_array=True)

    rng = check_random_state(self.random_state)

    query_bags = self.transformer.transform(X)[0]

    predicted = []
    for query_bag in query_bags:
        highest = -1
        label = None

        for idx, train_bag in enumerate(self.transformed_data):
            sim = histogram_intersection(query_bag, train_bag)

            # a random draw happens only on an exact tie
            if sim > highest or (sim == highest
                                 and rng.random() < 0.5):
                highest = sim
                label = self.class_vals[idx]

        predicted.append(label)

    return np.array(predicted)
def transform(self, X, y=None):
    """
    Transform X, transforms univariate time-series using the fitted
    PCA stored in ``self.pca``.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, 1]
        Nested dataframe with univariate time-series in cells.
    y : optional
        Not used by this method.

    Returns
    -------
    Xt : pandas DataFrame
        Transformed pandas DataFrame with the same number of rows and
        the (potentially reduced) PCA transformed column, carrying
        the row index and column names of X.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True)

    # Transform X using the fitted PCA
    table = tabularize(X)
    components = self.pca.transform(table)
    Xpca = pd.DataFrame(
        data=components,
        index=table.index,
        # keep only as many column labels as there are components
        columns=table.columns[:self.pca.n_components_])

    # Back-transform into time series data format
    nested = detabularise(Xpca, index=X.index)
    nested.columns = X.columns
    return nested
def predict_proba(self, X):
    """Return class probabilities for X.

    X is turned into word bags, vectorised with ``self.vectorizer``
    and passed to ``self.clf.predict_proba``.

    Parameters
    ----------
    X : input samples, validated as univariate.

    Returns
    -------
    Whatever ``self.clf.predict_proba`` returns for the vectorised bags.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True)

    word_bags = self._transform_words(X)
    features = self.vectorizer.transform(word_bags)
    return self.clf.predict_proba(features)
def predict_proba(self, X):
    """
    Find probability estimates for each class for all cases in X.

    The distributions predicted by every tree in ``self.trees`` are
    summed and then L1-normalised.

    Parameters
    ----------
    X : input samples
        Validated as univariate; an exception is thrown otherwise,
        since this classifier does not yet have multivariate
        capability.

    Returns
    -------
    output : array of shape = [n_instances, n_classes] of probabilities
    """
    X = check_X(X, enforce_univariate=True)
    X = dataset_properties.negative_dataframe_indices(X)

    # joblib is used only when more than one job is requested (a
    # negative n_jobs also selects the parallel path)
    use_joblib = self.n_jobs > 1 or self.n_jobs < 0
    if use_joblib:
        run_parallel = Parallel(self.n_jobs)
        per_tree = run_parallel(
            delayed(self._predict_proba_tree)(X, tree)
            for tree in self.trees)
    else:
        per_tree = [
            self._predict_proba_tree(X, tree) for tree in self.trees
        ]

    summed = np.sum(np.array(per_tree), axis=0)
    normalize(summed, copy=False, norm='l1')
    return summed
def transform(self, X, y=None):
    """
    Fit a clone of the forecaster to each series and collect the
    requested fitted parameters.

    Parameters
    ----------
    X : pd.DataFrame
        Univariate time series to transform.
    y : pd.DataFrame, optional (default=None)
        Exogenous variables. Not used by this method.

    Returns
    -------
    y_hat : pd.DataFrame
        Extracted parameters; columns are parameter values
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True)
    param_names = self._check_param_names(self.param_names)
    n_rows = X.shape[0]

    def _fit_extract(model, series, names):
        # fit on one series, then pull out the requested parameters
        model.fit(series)
        fitted = model.get_fitted_params()
        return np.hstack([fitted[name] for name in names])

    # iterate over rows, each with a fresh clone of the forecaster
    run_parallel = Parallel(n_jobs=self.n_jobs)
    extracted = run_parallel(
        delayed(_fit_extract)(
            clone(self.forecaster), X.iloc[row, 0], param_names)
        for row in range(n_rows))

    return pd.DataFrame(extracted, index=X.index, columns=param_names)
def predict(self, X):
    """Predict the class of each instance by nearest-neighbour search.

    Each test bag (converted to a dict) is compared with every stored
    training bag using ``boss_distance``; the label of the closest bag
    is stored in an integer array, with ties broken at random.

    Parameters
    ----------
    X : input samples, validated as univariate.

    Returns
    -------
    numpy array of integers
        One predicted label per instance.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True)

    rng = check_random_state(self.random_state)

    predicted = np.zeros(X.shape[0], dtype=np.int_)

    transformed = self.transformer.transform(X)
    query_bags = [bag.to_dict() for bag in transformed.iloc[:, 0]]

    for row, query_bag in enumerate(query_bags):
        closest = sys.float_info.max
        label = -1

        for idx, train_bag in enumerate(self.transformed_data):
            # the current best is passed along to boss_distance
            dist = boss_distance(query_bag, train_bag, closest)

            # a random draw happens only on an exact tie
            if dist < closest or (dist == closest
                                  and rng.random() < 0.5):
                closest = dist
                label = self.class_vals[idx]

        predicted[row] = label

    return predicted
def transform(self, X, y=None):
    """Resize every column of X with ``self._resize_col``.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_features]
        Nested dataframe with time-series in cells.
    y : optional
        Not used by this method.

    Returns
    -------
    pandas DataFrame : Transformed pandas DataFrame with same number of
        rows and columns
    """
    self.check_is_fitted()
    check_X(X)

    # DataFrame.apply hands each column to the resize helper
    resized = X.apply(self._resize_col)
    return resized
def _transform_words(self, X):
    """Build one merged bag of SFA words per instance.

    For every window size the matching SFA transformer is applied and
    its words are re-keyed so that words of different window sizes do
    not collide, then merged into a single dictionary per instance.

    Parameters
    ----------
    X : input samples, validated as univariate and coerced to numpy.

    Returns
    -------
    list of dict
        One dictionary per instance mapping re-keyed word to its value.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

    merged_bags = [{} for _ in range(len(X))]

    for w, window_size in enumerate(self.window_sizes):
        # SFA transform
        sfa_words = self.SFA_transformers[w].transform(X)
        bag = sfa_words[0]

        # merge the bags of the different window sizes into one bag
        # per instance, tagging each word with its window size
        for row in range(len(bag)):
            target = merged_bags[row]
            for key, count in bag[row].items():
                if isinstance(key, tuple):
                    # tuple keys: pack both parts, then the window size
                    packed = (key[0] << self.highest_bit) | key[1]
                    word = (packed << 3) | window_size
                else:
                    word = WEASEL.shift_left(
                        key, self.highest_bit, window_size)

                target[word] = count

    return merged_bags
def fit(self, X, y=None):
    """
    Fit transformer by fixing the lower bound ``lower_``.

    When ``self.lower`` is None the bound is taken from the data via
    ``self.get_min_length``; otherwise ``self.lower`` is used as is.

    Parameters
    ----------
    X : pandas DataFrame of shape [n_samples, n_features]
        Input data
    y : pandas Series, shape (n_samples, ...), optional
        Not used by this method.

    Returns
    -------
    self : an instance of self.
    """
    X = check_X(X, coerce_to_pandas=True)

    if self.lower is not None:
        self.lower_ = self.lower
    else:
        n_rows, _ = X.shape
        rows = [X.iloc[row, :].values for row in range(n_rows)]
        self.lower_ = self.get_min_length(rows)

    self._is_fitted = True
    return self
def transform(self, X, y=None):
    """
    Transform X, transforms univariate time-series using the fitted
    PCA stored in ``self.pca``.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, 1]
        Nested dataframe with univariate time-series in cells; it is
        coerced to a numpy array during validation.
    y : optional
        Not used by this method.

    Returns
    -------
    Xt : pandas DataFrame
        Transformed pandas DataFrame with the same number of rows and
        the (potentially reduced) PCA transformed column.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)
    # drop the singleton axis at position 1
    flat = X.squeeze(1)

    # Transform X using the fitted PCA
    components = pd.DataFrame(data=self.pca.transform(flat))

    # Back-transform into time series data format
    return from_2d_array_to_nested(components)
def predict_proba(self, X):
    """
    Find probability estimates for each class for all cases in X.

    Each distance ``d`` to an exemplar is mapped to ``1 / (d + 1)``
    and the result is L1-normalised.

    Parameters
    ----------
    X : input samples
        Validated as univariate; an exception is thrown otherwise,
        since this classifier does not yet have multivariate
        capability.

    Returns
    -------
    output : array of shape = [n_instances, n_classes] of probabilities
    """
    X = check_X(X, enforce_univariate=True)
    X = dataset_properties.negative_dataframe_indices(X)

    exemplar_distances = self.distance_to_exemplars(X)
    ones = np.ones(exemplar_distances.shape)

    # adding one keeps the reciprocal finite for a distance of zero
    shifted = np.add(exemplar_distances, ones)
    closeness = np.divide(ones, shifted)

    normalize(closeness, copy=False, norm='l1')
    return closeness
def fit(self, X, y=None):
    """Fit by validating X and storing the extraction parameters.

    Parameters
    ----------
    X : pd.DataFrame
        nested pandas DataFrame of shape [n_samples, n_columns]
    y : pd.Series or np.array
        Target variable. Not used by this method.

    Returns
    -------
    self : an instance of self
    """
    check_X(X)

    extraction_params = self._get_extraction_params()
    self.default_fc_parameters_ = extraction_params

    self._is_fitted = True
    return self
def find_closest_exemplar_indices(self, X):
    """
    find the index of the closest exemplar for every instance
    :param X: the dataframe containing instances
    :return: 1d numpy array of indices, one for each instance,
    reflecting the index of the closest exemplar
    """
    # todo make checks optional and propagate from forest downwards
    check_X(X)

    n_rows = X.shape[0]
    all_distances = self.distance_to_exemplars(X)

    closest = np.empty(X.shape[0], dtype=int)
    for row in range(n_rows):
        # self.random_state is handed to arg_min together with the
        # distances of this row
        closest[row] = comparison.arg_min(
            all_distances[row], self.random_state)

    return closest