def test_check_X_bad_input_args(X):
    """Verify that ``check_X`` and ``check_X_y`` reject a bad ``X``.

    Both validators are expected to raise ``ValueError``. ``y`` is not a
    parameter of this test; it is looked up in the enclosing scope
    (presumably a module-level fixture -- confirm against the test module).
    """
    # Thunks keep each call lazy so that every validator is exercised inside
    # its own ``pytest.raises`` context.
    validator_calls = (
        lambda: check_X(X),
        lambda: check_X_y(X, y),
    )
    for call in validator_calls:
        with pytest.raises(ValueError):
            call()
def distance_to_exemplars(self, X):
    """Compute the distance from every instance in X to the exemplars.

    Parameters
    ----------
    X : the dataset containing a list of instances; rows are read
        positionally through ``X.iloc``.

    Returns
    -------
    numpy array obtained by vertically stacking the per-instance results of
    ``self._distance_to_exemplars_inst`` (one entry per row of X).
    """
    check_X(X)

    if self.n_jobs > 1 or self.n_jobs < 0:
        # Any n_jobs other than 0 or 1 is dispatched through joblib.
        pool = Parallel(self.n_jobs)
        per_instance = pool(
            delayed(self._distance_to_exemplars_inst)(
                self.X_exemplar, X.iloc[row, :], self.distance_measure
            )
            for row in range(X.shape[0])
        )
    else:
        per_instance = [
            self._distance_to_exemplars_inst(
                self.X_exemplar, X.iloc[row, :], self.distance_measure
            )
            for row in range(X.shape[0])
        ]

    return np.vstack(np.array(per_instance))
def test_check_X_enforce_univariate():
    """Verify that ``enforce_univariate=True`` rejects two-column data.

    Both ``check_X`` and ``check_X_y`` must raise a ``ValueError`` whose
    message matches "univariate".
    """
    X, y = make_classification_problem(n_columns=2)
    expected = r"univariate"
    checks = (
        lambda: check_X(X, enforce_univariate=True),
        lambda: check_X_y(X, y, enforce_univariate=True),
    )
    for run_check in checks:
        with pytest.raises(ValueError, match=expected):
            run_check()
def test_check_X_enforce_min_columns():
    """Verify that ``enforce_min_columns=3`` rejects two-column data.

    Both ``check_X`` and ``check_X_y`` must raise a ``ValueError`` whose
    message matches "columns".
    """
    X, y = make_classification_problem(n_columns=2)
    expected = r"columns"
    checks = (
        lambda: check_X(X, enforce_min_columns=3),
        lambda: check_X_y(X, y, enforce_min_columns=3),
    )
    for run_check in checks:
        with pytest.raises(ValueError, match=expected):
            run_check()
def test_check_enforce_min_instances():
    """Verify that ``enforce_min_instances=4`` rejects three-instance data.

    ``check_X``, ``check_X_y`` and ``check_y`` must each raise a
    ``ValueError`` whose message matches "instance".
    """
    X, y = make_classification_problem(n_instances=3)
    expected = r"instance"
    checks = (
        lambda: check_X(X, enforce_min_instances=4),
        lambda: check_X_y(X, y, enforce_min_instances=4),
        lambda: check_y(y, enforce_min_instances=4),
    )
    for run_check in checks:
        with pytest.raises(ValueError, match=expected):
            run_check()
def _set_oob_score(self, X, y):
    """Compute the out-of-bag score and decision function.

    For every fitted estimator, class probabilities are predicted on the
    rows that were not part of its bootstrap sample and accumulated per
    output. The results are stored on ``self.oob_decision_function_`` and
    ``self.oob_score_``; nothing is returned.

    Parameters
    ----------
    X : validated with ``check_X(..., enforce_univariate=True)``; rows are
        selected through ``X.iloc``.
    y : indexed as ``y[:, k]``, i.e. one column per output.
    """
    check_X_y(X, y)
    check_X(X, enforce_univariate=True)

    class_counts = self.n_classes_
    n_samples = y.shape[0]
    votes = [
        np.zeros((n_samples, class_counts[k])) for k in range(self.n_outputs_)
    ]
    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples)

    # Accumulate predicted probabilities on each estimator's unsampled rows.
    for pipeline in self.estimators_:
        last_step = pipeline.steps[-1][1]
        oob_rows = _generate_unsampled_indices(
            last_step.random_state, n_samples, n_samples_bootstrap
        )
        proba = pipeline.predict_proba(X.iloc[oob_rows, :])
        if self.n_outputs_ == 1:
            # Single-output case: wrap so it can be indexed by output below.
            proba = [proba]
        for k in range(self.n_outputs_):
            votes[k][oob_rows, :] += proba[k]

    decisions = []
    score_total = 0.0
    for k in range(self.n_outputs_):
        row_totals = votes[k].sum(axis=1)
        if (row_totals == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")
        decisions.append(votes[k] / row_totals[:, np.newaxis])
        score_total += np.mean(
            y[:, k] == np.argmax(votes[k], axis=1), axis=0
        )

    if self.n_outputs_ == 1:
        self.oob_decision_function_ = decisions[0]
    else:
        self.oob_decision_function_ = decisions
    self.oob_score_ = score_total / self.n_outputs_
def transform(self, X, y=None):
    """Apply ``_perform_paa_along_dim`` to every column of X.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, n_dims]
        Nested dataframe with multivariate time-series in cells.
    y : not read by this method.

    Returns
    -------
    dims: Pandas data frame built by concatenating the per-column results
          side by side, carrying the column labels of X.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)

    # The length of the first cell is what the parameters are checked against.
    first_cell_length = len(X.iloc[0, 0])
    columns = X.columns
    self._check_parameters(first_cell_length)

    # One transformed frame per dimension, combined column-wise.
    per_dimension = [
        self._perform_paa_along_dim(pd.DataFrame(X[name])) for name in columns
    ]
    combined = pd.concat(per_dimension, axis=1, sort=False)
    combined.columns = columns
    return combined
def predict(self, X):
    """Predict by averaging the outputs of all fitted estimators.

    Parameters
    ----------
    X : pd.DataFrame or np.ndarray
        Panel data; must be univariate (enforced by ``check_X``).

    Returns
    -------
    np.ndarray
        Mean over estimators of the per-estimator ``_predict`` results.

    Raises
    ------
    TypeError
        If the series length of X differs from ``self.series_length``.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)
    X = X.squeeze(1)  # drop the singleton column axis

    _, n_timepoints = X.shape
    if n_timepoints != self.series_length:
        raise TypeError(
            "The number of time points in the training data does not match "
            "that in the test data.")

    jobs = (
        delayed(_predict)(X, self.estimators_[idx], self.intervals_[idx])
        for idx in range(self.n_estimators)
    )
    per_estimator = Parallel(n_jobs=self.n_jobs)(jobs)
    return np.mean(per_estimator, axis=0)
def _predict_proba_drcif(self, X, X_p, X_d, c22, n_intervals, intervals,
                         dims, atts):
    """Embedded predict proba for the DrCIF classifier.

    Each instance is passed, together with the matching slices of ``X_p``
    and ``X_d``, to ``self.root.predict_proba_drcif``; the remaining
    arguments are forwarded unchanged.

    Returns
    -------
    np.ndarray of shape (n_instances, self.n_classes)

    Raises
    ------
    NotFittedError
        If ``self._is_fitted`` is falsy.
    """
    if not self._is_fitted:
        raise NotFittedError(
            f"This instance of {self.__class__.__name__} has not "
            f"been fitted yet; please call `fit` first.")

    X = check_X(X, coerce_to_numpy=True)
    n_instances, n_dims, series_length = X.shape

    probas = np.zeros((n_instances, self.n_classes))
    for case in range(n_instances):
        # Every representation is reshaped to a single-instance 3D array.
        representations = [
            X[case].reshape((1, n_dims, series_length)),
            X_p[case].reshape((1, n_dims, X_p.shape[2])),
            X_d[case].reshape((1, n_dims, X_d.shape[2])),
        ]
        probas[case] = self.root.predict_proba_drcif(
            representations,
            c22,
            n_intervals,
            intervals,
            dims,
            atts,
            self.n_classes,
        )
    return probas
def _transform_words(self, X):
    """Build one merged bag of SFA words per instance.

    For every column in ``self.col_names`` and every window size configured
    for it, the matching SFA transformer is applied and its words are
    re-keyed with ``MUSE.shift_left`` before being written into the
    instance's bag.

    Returns
    -------
    list of dict, one per instance of X, mapping re-keyed word -> value.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=False, coerce_to_pandas=True)
    if self.use_first_order_differences:
        X = self.add_first_order_differences(X)

    merged_bags = [dict() for _ in range(len(X))]

    # Perform SFA on each dimension separately.
    for dim_idx, column in enumerate(self.col_names):
        dim_array = from_nested_to_3d_numpy(X[[column]])

        for win_idx, window_size in enumerate(self.window_sizes[dim_idx]):
            transformer = self.SFA_transformers[dim_idx][win_idx]
            bag = transformer.transform(dim_array)[0]
            highest = np.int32(self.highest_bits[dim_idx])

            # Prefix each word so that words from different dimensions and
            # window sizes stay distinguishable in the single merged bag.
            for j in range(len(bag)):
                target = merged_bags[j]
                for key, value in bag[j].items():
                    word = MUSE.shift_left(
                        key, highest, dim_idx, self.highest_dim_bit,
                        window_size)
                    target[word] = value

    return merged_bags
def fit(self, X, y):
    """Fit a random forest on catch22 features extracted from X.

    Parameters
    ----------
    X : panel data accepted by ``check_X``; it is coerced to numpy and
        multivariate input is not rejected (``enforce_univariate=False``).
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X = check_X(X, enforce_univariate=False, coerce_to_numpy=True)
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    features = Catch22(outlier_norm=self.outlier_norm).fit_transform(X)

    self.classifier = RandomForestClassifier(
        n_jobs=self.n_jobs,
        n_estimators=self.n_estimators,
        random_state=self.random_state,
    )

    # NaN and +/-inf feature values are all replaced by 0 before fitting.
    X_c22 = np.nan_to_num(
        np.array(features, dtype=np.float32),
        copy=False,
        nan=0,
        posinf=0,
        neginf=0,
    )
    self.classifier.fit(X_c22, y)

    self._is_fitted = True
    return self
def wrapper(self, data, labels=None, **kwargs):
    """Validate and reshape ``data``, call ``func``, and restore pandas output.

    ``func``, ``check_fitted`` and ``force_numpy`` are not parameters; they
    are taken from the enclosing scope.

    Parameters
    ----------
    data : input passed through ``check_X`` / ``check_X_y`` with
        ``coerce_to_pandas=True``.
    labels : optional; when given, validated together with ``data`` and
        forwarded to ``func``.
    **kwargs : forwarded to ``func`` unchanged.

    Returns
    -------
    The output of ``func``; wrapped in a ``pd.DataFrame`` carrying the input
    index when the input was a DataFrame, the output is an ``np.ndarray``
    and ``force_numpy`` is falsy.
    """
    # Remember whether the input was pandas so the output can be converted back.
    is_pandas = isinstance(data, pd.DataFrame)
    pd_idx = data.index if is_pandas else None

    if check_fitted:
        self.check_is_fitted()

    # Coerce to pandas first so every input follows the same path.
    if labels is None:
        data = check_X(data, coerce_to_pandas=True)
    else:
        data, labels = check_X_y(data, labels, coerce_to_pandas=True)

    # sktime uses [N, C, L] whereas the signature code uses [N, L, C]
    # (C being channels), hence the transpose of the last two axes.
    data = np.transpose(from_nested_to_3d_numpy(data), [0, 2, 1])

    positional = (data,) if labels is None else (data, labels)
    output = func(self, *positional, **kwargs)

    if is_pandas and isinstance(output, np.ndarray) and not force_numpy:
        output = pd.DataFrame(index=pd_idx, data=output)
    return output
def predict_proba(self, X):
    """Predict class probabilities for n_instances in X.

    X is transformed with ``self._transformer`` and NaN / infinite values
    are replaced by 0. If the wrapped estimator exposes a callable
    ``predict_proba`` it is used directly; otherwise its hard predictions
    are converted into one-hot rows.

    Parameters
    ----------
    X : pd.DataFrame of shape (n_instances, n_dims)

    Returns
    -------
    predicted_probs : array of shape (n_instances, n_classes)
        Predicted probability of each class.
    """
    self.check_is_fitted()
    X = check_X(X)

    X_t = self._transformer.transform(X)
    X_t = np.nan_to_num(X_t, copy=False, nan=0, posinf=0, neginf=0)

    if callable(getattr(self._estimator, "predict_proba", None)):
        return self._estimator.predict_proba(X_t)

    # Fallback: put probability 1 on the predicted class of each instance.
    one_hot = np.zeros((X.shape[0], self.n_classes))
    preds = self._estimator.predict(X_t)
    for i in range(X.shape[0]):
        one_hot[i, np.where(self.classes_ == preds[i])] = 1
    return one_hot
def predict_proba(self, X):
    """Find probability estimates for each class for all cases in X.

    The per-tree distributions are summed over ``self.trees`` and then
    l1-normalised.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The input samples. Univariate input is enforced by ``check_X``
        (``enforce_univariate=True``) and the data is coerced to pandas.

    Returns
    -------
    output : array of shape = [n_instances, n_classes] of probabilities
    """
    X = check_X(X, enforce_univariate=True, coerce_to_pandas=True)
    X = dataset_properties.negative_dataframe_indices(X)

    if self.n_jobs > 1 or self.n_jobs < 0:
        pool = Parallel(self.n_jobs)
        per_tree = pool(
            delayed(self._predict_proba_tree)(X, tree) for tree in self.trees
        )
    else:
        per_tree = [self._predict_proba_tree(X, tree) for tree in self.trees]

    summed = np.sum(np.array(per_tree), axis=0)
    # The return value of normalize is ignored: with copy=False the result
    # relies on ``summed`` being modified in place.
    normalize(summed, copy=False, norm="l1")
    return summed
def predict(self, X):
    """Predict a class for every case in X via ``self._test_nn``.

    When ``self.n_dims > 1`` each dimension listed in ``self.dims`` is
    transformed separately and its words are merged into one bag per case,
    keyed by ``word << self.highest_dim_bit | dim``. Otherwise the bags come
    straight from the first transformer.

    Returns
    -------
    np.ndarray with one ``_test_nn`` result per bag.
    """
    self.check_is_fitted()
    X = check_X(X, coerce_to_numpy=True)
    n_cases = X.shape[0]

    if self.n_dims > 1:
        merged = [defaultdict(int) for _ in range(n_cases)]
        for t_idx, dim in enumerate(self.dims):
            X_dim = X[:, dim, :].reshape(n_cases, 1, self.series_length)
            dim_bags = self.transformers[t_idx].transform(X_dim)[0]
            for case in range(n_cases):
                for word, count in dim_bags[case].items():
                    # Tag the word with its dimension in the low bits.
                    merged[case][word << self.highest_dim_bit | dim] = count
        test_bags = merged
    else:
        test_bags = self.transformers[0].transform(X)[0]

    classes = Parallel(n_jobs=self.n_jobs)(
        delayed(self._test_nn)(bag) for bag in test_bags
    )
    return np.array(classes)
def predict_proba(self, X):
    """Predict class probabilities for X.

    The result is the sum of ``predict_proba`` over ``self.estimators_``
    divided by the number of estimators, i.e. their mean.

    Parameters
    ----------
    X : input samples; validated with ``check_X(..., enforce_univariate=True)``
        and then with ``self._validate_X_predict``.

    Returns
    -------
    p : mean of the per-estimator class probabilities.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True)
    X = self._validate_X_predict(X)

    # Only the job count of the partitioning is needed here.
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    pool = Parallel(n_jobs=n_jobs, verbose=self.verbose)
    per_estimator = pool(
        delayed(estimator.predict_proba)(X) for estimator in self.estimators_
    )
    return np.sum(per_estimator, axis=0) / len(self.estimators_)
def transform(self, X, y=None):
    """Transform univariate time series with the fitted ``self.pca``.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, 1]
        Nested dataframe with univariate time-series in cells.
    y : not read by this method.

    Returns
    -------
    Xt : result of ``from_2d_array_to_nested`` applied to the PCA output,
        i.e. the transformed values converted back to the nested format.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)
    X = X.squeeze(1)  # drop the singleton column axis before calling PCA

    projected = pd.DataFrame(data=self.pca.transform(X))

    # Back-transform into time series data format.
    return from_2d_array_to_nested(projected)
def transform(self, X, y=None):
    """Find runs of ``self.value`` in each series of X.

    For every series in the first column, consecutive runs equal to
    ``self.value`` (NaN and infinity are matched with ``np.isnan`` /
    ``np.isinf``) are located; runs shorter than ``self.min_length`` are
    dropped. The per-series results are also stored on ``self._starts`` and
    ``self._lengths``.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_columns]
        Nested dataframe with time-series in cells; must be univariate.
    y : not read by this method.

    Returns
    -------
    Xt : pandas DataFrame
        Two columns, ``<prefix>_starts`` and ``<prefix>_lengths``.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True, coerce_to_pandas=True)

    column_name = X.columns[0]
    self._starts = []
    self._lengths = []

    for series in X.iloc[:, 0]:
        series = np.asarray(series)

        # Boolean mask of the points that match the target value.
        if np.isnan(self.value):
            hits = np.isnan(series)
        elif np.isinf(self.value):
            hits = np.isinf(series)
        else:
            hits = series == self.value
        indicator = np.where(hits, 1, 0)

        # Zero-pad both ends so runs touching a boundary still produce a
        # +1 (run start) and a -1 (run end) in the differences.
        transitions = np.diff(np.hstack([0, indicator, 0]))
        starts = np.where(transitions == 1)[0]
        ends = np.where(transitions == -1)[0]
        lengths = ends - starts

        long_enough = lengths >= self.min_length
        self._starts.append(starts[long_enough])
        self._lengths.append(lengths[long_enough])

    value_label = "nan" if np.isnan(self.value) else str(self.value)
    column_prefix = "%s_%s" % (column_name, value_label)

    Xt = pd.DataFrame()
    Xt["%s_starts" % column_prefix] = pd.Series(self._starts)
    Xt["%s_lengths" % column_prefix] = pd.Series(self._lengths)
    return Xt
def predict_proba(self, X):
    """Find probability estimates for each class for all cases in X.

    Each estimator votes for its predicted class with weight
    ``self.weights[n]``; the accumulated votes are divided by
    ``self.weight_sum`` and rounded to 8 decimals.

    Parameters
    ----------
    X : input samples, coerced to numpy by ``check_X``.

    Returns
    -------
    output : array of shape = [n_test_instances, num_classes] of
        probabilities
    """
    self.check_is_fitted()
    X = check_X(X, coerce_to_numpy=True)

    n_cases = X.shape[0]
    weighted_votes = np.zeros((n_cases, self.n_classes))

    for est_idx, clf in enumerate(self.estimators_):
        preds = clf.predict(X)
        for case in range(n_cases):
            class_col = self.class_dictionary[preds[case]]
            weighted_votes[case, class_col] += self.weights[est_idx]

    normaliser = np.ones(self.n_classes) * self.weight_sum
    return np.around(weighted_votes / normaliser, 8)
def predict_proba(self, X) -> np.array:
    """Predict label probabilities for sequences in X.

    Validates X according to this estimator's tags and delegates to
    ``self._predict_proba``.

    Parameters
    ----------
    X : 2D np.array (univariate, equal length series)
            of shape = [n_instances, series_length]
        or 3D np.array (any number of dimensions, equal length series)
            of shape = [n_instances,n_dimensions,series_length]
        or pd.DataFrame with each column a dimension, each cell a pd.Series
            (any number of dimensions, equal or unequal length series)

    Returns
    -------
    y : 2D array of shape = [n_instances, n_classes] - estimated class
        probabilities
    """
    self.check_is_fitted()

    # Coercion and multivariate handling are driven entirely by the tags.
    X = check_X(
        X,
        coerce_to_numpy=self.get_tag("coerce-X-to-numpy"),
        coerce_to_pandas=self.get_tag("coerce-X-to-pandas"),
        enforce_univariate=not self.get_tag("capability:multivariate"),
    )
    return self._predict_proba(X)
def fit(self, X, y=None):
    """Fit dilations and biases to input time series.

    Parameters are derived twice via ``self._get_parameter``: once from the
    (optionally normalised, possibly zero-padded) series and once from their
    first-order differences.

    Parameters
    ----------
    X : pandas DataFrame, input time series (sktime format)
    y : array_like, target values (optional, ignored as irrelevant)

    Returns
    -------
    self
    """
    _X = check_X(X, coerce_to_numpy=True)

    if self.normalise:
        # Per-series standardisation; the epsilon guards against zero std.
        centred = _X - _X.mean(axis=-1, keepdims=True)
        _X = centred / (_X.std(axis=-1, keepdims=True) + 1e-8)

    if _X.shape[2] < 10:
        # handling very short series (like PensDigit from the MTSC archive)
        # series have to be at least a length of 10 (including differencing)
        padded = np.zeros((_X.shape[0], _X.shape[1], 10), dtype=_X.dtype)
        padded[:, :, : _X.shape[2]] = _X
        _X = padded

    self.parameter = self._get_parameter(_X)
    self.parameter1 = self._get_parameter(np.diff(_X, 1))

    self._is_fitted = True
    return self
def transform(self, X, y=None):
    """Pad every series in X using ``self._create_pad``.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, n_columns]
        Nested dataframe with time-series in cells.
    y : not read by this method.

    Returns
    -------
    Xt : pandas DataFrame
        One row per instance; each cell holds the padded series.

    Raises
    ------
    ValueError
        If the longest series in X exceeds ``self.pad_length_``.
    """
    self.check_is_fitted()
    X = check_X(X, coerce_to_pandas=True)

    n_instances = X.shape[0]
    rows = [X.iloc[i, :].values for i in range(n_instances)]

    max_length = _get_max_length(rows)
    if max_length > self.pad_length_:
        # Implicit concatenation instead of a backslash continuation inside
        # the literal, which leaked stray whitespace into the message.
        raise ValueError(
            "Error: max_length of series "
            "is greater than the one found when fit or set."
        )

    padded = [
        pd.Series([self._create_pad(series) for series in row]) for row in rows
    ]
    return pd.DataFrame(padded)
def fit(self, X, y=None):
    """Fit transformer by fixing ``self.lower_``.

    When ``self.lower`` is ``None`` the value is obtained from
    ``self.get_min_length`` over the rows of X; otherwise ``self.lower`` is
    used as given.

    Parameters
    ----------
    X : pandas DataFrame of shape [n_samples, n_features]
        Input data
    y : not read by this method.

    Returns
    -------
    self : an instance of self.
    """
    X = check_X(X, coerce_to_pandas=True)

    if self.lower is not None:
        self.lower_ = self.lower
    else:
        n_instances, _ = X.shape
        rows = [X.iloc[i, :].values for i in range(n_instances)]
        self.lower_ = self.get_min_length(rows)

    self._is_fitted = True
    return self
def predict_proba(self, X):
    """Combine the weighted class probabilities of the four components.

    The ``predict_proba`` outputs of ``self.stc``, ``self.tsf``,
    ``self.rise`` and ``self.cboss`` are each scaled by the matching
    ``*_weight`` attribute, summed, and normalised so each row sums to 1.

    Parameters
    ----------
    X : input samples; must be univariate (enforced by ``check_X``).

    Returns
    -------
    array of shape (n_instances, self.n_classes)
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True)

    components = (
        (self.stc, self.stc_weight),
        (self.tsf, self.tsf_weight),
        (self.rise, self.rise_weight),
        (self.cboss, self.cboss_weight),
    )

    dists = np.zeros((X.shape[0], self.n_classes))
    for module, weight in components:
        scaled = module.predict_proba(X) * (np.ones(self.n_classes) * weight)
        dists = np.add(dists, scaled)

    return dists / dists.sum(axis=1, keepdims=True)
def predict_proba(self, X):
    """Find probability estimates for each class for all cases in X.

    The per-estimator probabilities are summed and divided by
    ``self.n_estimators``.

    Parameters
    ----------
    X : input samples; must be univariate (enforced by ``check_X``) and is
        coerced to numpy.

    Returns
    -------
    output : nd.array of shape = (n_instances, n_classes)
        Predicted probabilities

    Raises
    ------
    TypeError
        If the series length of X differs from ``self.series_length``.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)
    X = X.squeeze(1)  # drop the singleton column axis

    _, n_timepoints = X.shape
    if n_timepoints != self.series_length:
        raise TypeError(
            " ERROR number of attributes in the train does not match "
            "that in the test data")

    jobs = (
        delayed(_predict_proba_for_estimator)(
            X, self.estimators_[idx], self.intervals_[idx]
        )
        for idx in range(self.n_estimators)
    )
    y_probas = Parallel(n_jobs=self.n_jobs)(jobs)

    divisor = np.ones(self.n_classes) * self.n_estimators
    return np.sum(y_probas, axis=0) / divisor
def fit(self, X, y=None):
    """Fit dilations and biases to input time series.

    Parameters
    ----------
    X : pandas DataFrame, input time series (sktime format); must be
        univariate (enforced by ``check_X``).
    y : array_like, target values (optional, ignored as irrelevant)

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the series have fewer than 9 time points.
    """
    X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

    # Only an int random_state is used as a seed; anything else becomes None.
    if isinstance(self.random_state, int):
        seed = np.int32(self.random_state)
    else:
        seed = None

    X = X[:, 0, :].astype(np.float32)
    _, n_timepoints = X.shape
    if n_timepoints < 9:
        raise ValueError(
            (f"n_timepoints must be >= 9, but found {n_timepoints};"
             " zero pad shorter series so that n_timepoints == 9"))

    self.parameters = _fit(
        X, self.num_kernels, self.max_dilations_per_kernel, seed
    )
    self._is_fitted = True
    return self
def predict_proba(self, X):
    """Find probability estimates for each class for all cases in X.

    Distances to the exemplars are converted into scores ``1 / (1 + d)``
    and then l1-normalised.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The input samples. Univariate input is enforced by ``check_X``
        (``enforce_univariate=True``) and the data is coerced to pandas.

    Returns
    -------
    output : array of shape = [n_instances, n_classes] of probabilities
    """
    X = check_X(X, enforce_univariate=True, coerce_to_pandas=True)
    X = dataset_properties.negative_dataframe_indices(X)

    distances = self.distance_to_exemplars(X)
    ones = np.ones(distances.shape)

    # Smaller distance -> larger score; the +1 avoids division by zero.
    distributions = np.divide(ones, np.add(distances, ones))

    # The return value of normalize is ignored: with copy=False the result
    # relies on ``distributions`` being modified in place.
    normalize(distributions, copy=False, norm="l1")
    return distributions
def transform(self, X, y=None):
    """Transform input time series.

    Parameters
    ----------
    X : pandas DataFrame, input time series (sktime format); must be
        univariate (enforced by ``check_X``).
    y : array_like, target values (optional, ignored as irrelevant)

    Returns
    -------
    pandas DataFrame, transformed features
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)
    X = X[:, 0, :].astype(np.float32)

    # Fall back to all available cores when n_jobs is below 1 or exceeds
    # the core count.
    prev_threads = get_num_threads()
    n_cores = multiprocessing.cpu_count()
    if self.n_jobs < 1 or self.n_jobs > n_cores:
        n_jobs = n_cores
    else:
        n_jobs = self.n_jobs

    set_num_threads(n_jobs)
    try:
        X_ = _transform(X, self.parameters)
    finally:
        # The thread count is a global setting (presumably numba's -- confirm
        # against the file's imports); restore it even if the transform
        # raises so later code is not left with a changed value.
        set_num_threads(prev_threads)

    return pd.DataFrame(X_)
def transform(self, X, y=None):
    """Transform input time series using random convolutional kernels.

    Parameters
    ----------
    X : pandas DataFrame, input time series (sktime format)
    y : array_like, target values (optional, ignored as irrelevant)

    Returns
    -------
    pandas DataFrame, transformed features
    """
    self.check_is_fitted()
    _X = check_X(X, coerce_to_numpy=True)

    if self.normalise:
        # Per-series standardisation; the epsilon guards against zero std.
        _X = (_X - _X.mean(axis=-1, keepdims=True)) / (
            _X.std(axis=-1, keepdims=True) + 1e-8)

    # Fall back to all available cores when n_jobs is below 1 or exceeds
    # the core count.
    prev_threads = get_num_threads()
    n_cores = multiprocessing.cpu_count()
    if self.n_jobs < 1 or self.n_jobs > n_cores:
        n_jobs = n_cores
    else:
        n_jobs = self.n_jobs

    set_num_threads(n_jobs)
    try:
        t = pd.DataFrame(_apply_kernels(_X, self.kernels))
    finally:
        # The thread count is a global setting (presumably numba's -- confirm
        # against the file's imports); restore it even if applying the
        # kernels raises so later code is not left with a changed value.
        set_num_threads(prev_threads)

    return t
def transform(self, X, y=None):
    """Transform data into the catch22 features.

    ``self._transform_case`` is applied to every instance of X through
    joblib; when ``self.replace_nans`` is set, NaN and infinite values in
    the result are replaced by 0.

    Parameters
    ----------
    X : pandas DataFrame or 3d numpy array, input time series.
    y : array_like, target values (optional, ignored).

    Returns
    -------
    Pandas dataframe with one row of features per input instance.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=False, coerce_to_numpy=True)

    n_instances = X.shape[0]
    jobs = (delayed(self._transform_case)(X[i]) for i in range(n_instances))
    c22_list = Parallel(n_jobs=self.n_jobs)(jobs)

    if self.replace_nans:
        c22_list = np.nan_to_num(
            c22_list, copy=False, nan=0, posinf=0, neginf=0
        )

    return pd.DataFrame(c22_list)