def predict_proba(self, X, input_checks=True):
    """
    Estimate class probabilities for every case in X by pooling all trees.

    Each tree in ``self.trees`` is queried through
    ``self._predict_proba_tree``; the per-tree outputs are summed
    element-wise and the summed rows are then normalised with the l1 norm.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The input samples (sktime-format pandas DataFrame). The upstream
        documentation states that only single-column frames are supported
        because the classifier has no multivariate capability yet.
    input_checks : boolean
        Whether ``X`` is passed to ``validate_X`` before prediction.

    Returns
    -------
    output : array of shape = [n_instances, n_classes] of probabilities
    """
    if input_checks:
        validate_X(X)
    X = dataset_properties.negative_dataframe_indices(X)

    # Trees are evaluated through joblib only when more than one job (or a
    # negative job count) was requested; otherwise a plain loop is used.
    sequential = not (self.n_jobs > 1 or self.n_jobs < 0)
    if sequential:
        per_tree = [self._predict_proba_tree(X, tree) for tree in self.trees]
    else:
        pool = Parallel(self.n_jobs)
        per_tree = pool(
            delayed(self._predict_proba_tree)(X, tree) for tree in self.trees
        )

    # Sum over the tree axis, then rescale in place (copy=False).
    pooled = np.sum(np.array(per_tree), axis=0)
    normalize(pooled, copy=False, norm='l1')
    return pooled
def predict_proba(self, X, input_checks=True):
    """
    Estimate class probabilities for every case in X from exemplar distances.

    A distance ``d`` returned by ``self.distance_to_exemplars`` is converted
    into the score ``1 / (1 + d)``; each row of scores is then normalised
    with the l1 norm.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The input samples (sktime-format pandas DataFrame). The upstream
        documentation states that only single-column frames are supported
        because the classifier has no multivariate capability yet.
    input_checks : boolean
        Whether ``X`` is passed to ``validate_X`` before prediction.

    Returns
    -------
    output : array with the same shape as the distance matrix returned by
        ``self.distance_to_exemplars(X)`` (documented upstream as
        [n_instances, num_classes]) holding the probabilities
    """
    if input_checks:
        validate_X(X)
    X = dataset_properties.negative_dataframe_indices(X)

    distances = self.distance_to_exemplars(X)
    unit = np.ones(distances.shape)

    # Shift by one so that a zero distance does not divide by zero, then
    # invert: smaller distances yield larger scores.
    shifted = np.add(distances, unit)
    scores = np.divide(unit, shifted)

    # Rescale rows in place (copy=False).
    normalize(scores, copy=False, norm='l1')
    return scores
def predict_proba(self, X):
    """Predict class probabilities for X.

    The result is the element-wise sum of ``predict_proba`` over all fitted
    estimators in ``self.estimators_`` divided by the number of estimators,
    i.e. the mean of the per-estimator class probabilities.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The input samples. They are checked with ``validate_X`` and
        ``check_X_is_univariate`` and then converted by
        ``self._validate_X_predict``.

    Returns
    -------
    p : array of shape = [n_samples, n_classes], or a list of n_outputs
        such arrays if n_outputs > 1.
        The class probabilities of the input samples. Per the upstream
        documentation the class order corresponds to the attribute
        `classes_`.
    """
    check_is_fitted(self, 'estimators_')

    # Input checks and conversion
    validate_X(X)
    check_X_is_univariate(X)
    X = self._validate_X_predict(X)

    # Only the job count of the partitioning result is needed here
    effective_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    pool = Parallel(n_jobs=effective_jobs, verbose=self.verbose)
    all_proba = pool(
        delayed(estimator.predict_proba)(X) for estimator in self.estimators_
    )

    summed = np.sum(all_proba, axis=0)
    return summed / len(self.estimators_)
def predict(self, X):
    """Predict regression target for X.

    The result is the element-wise sum of ``predict`` over all fitted
    estimators in ``self.estimators_`` divided by the number of estimators,
    i.e. the mean of the per-estimator predictions.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The input samples. They are checked with ``validate_X`` and then
        converted by ``self._validate_X_predict``.

    Returns
    -------
    y : array of shape = [n_samples] or [n_samples, n_outputs]
        The predicted values.
    """
    check_is_fitted(self, 'estimators_')

    # Input checks and conversion
    validate_X(X)
    X = self._validate_X_predict(X)

    # Only the job count of the partitioning result is needed here
    effective_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # Each estimator predicts independently; input checking stays enabled
    pool = Parallel(n_jobs=effective_jobs, verbose=self.verbose)
    y_hat = pool(
        delayed(estimator.predict)(X, check_input=True)
        for estimator in self.estimators_
    )

    summed = np.sum(y_hat, axis=0)
    return summed / len(self.estimators_)
def transform(self, X):
    """
    Compute the matrix profile of every time series in the dataset.

    Parameters
    ----------
    X : pandas.DataFrame
        Time series dataset. It must pass ``validate_X`` and
        ``check_X_is_univariate``.

    Returns
    -------
    Xt : pandas.DataFrame
        One row per input row, built from the value returned by
        ``stomp_self`` for that row with window length ``self.m``. The
        upstream documentation states that the number of columns equals
        the number of subsequences of the desired length in each series.
    """
    # Input checks
    validate_X(X)
    check_X_is_univariate(X)

    # Row count is taken from the nested input, before tabularisation
    n_instances = X.shape[0]

    # Convert into tabular format (one row per series)
    X = Tabulariser().transform(X)

    # Each row is handed to stomp_self as a 2d array with a single row
    profiles = [
        stomp_self(np.array([X.iloc[i]]), self.m) for i in range(n_instances)
    ]
    return pd.DataFrame(profiles)
def fit(self, X, y=None):
    """
    Fit transformer, generating random interval indices.

    Stores the input shape in ``input_shape_``, the time index in
    ``_time_index`` and the generated intervals in ``intervals_``.

    Parameters
    ----------
    X : pandas DataFrame of shape [n_samples, n_features]
        Input data
    y : pandas Series, shape (n_samples, ...), optional
        Targets for supervised learning. Not used here.

    Returns
    -------
    self : RandomIntervalSegmenter
        This estimator
    """
    validate_X(X)
    self.input_shape_ = X.shape

    # Time-series index shared by the series in X.
    # TODO generalise to columns with series of unequal length
    time_index = get_time_index(X)
    self._time_index = time_index

    # TODO if multiple columns are passed, introduce option to compute one
    #  set of shared intervals, or rely on ColumnTransformer?
    if self.n_intervals != 'random':
        # a fixed number of intervals was requested
        self.intervals_ = self._rand_intervals_fixed_n(
            time_index, n_intervals=self.n_intervals)
    else:
        # the number of intervals itself is left to the helper
        self.intervals_ = self._rand_intervals_rand_n(time_index)

    return self
def transform(self, X, y=None):
    """Find plateaus (runs of ``self.value``) in every series of X.

    For each series in the single column of X, the start positions and
    lengths of all runs of ``self.value`` that are at least
    ``self.min_length`` long are collected. ``self.value`` may be NaN or
    +/-inf, which are matched with ``np.isnan`` / ``np.isinf``.

    The per-row results are also kept on ``self._starts`` and
    ``self._lengths``. Both are reset at the beginning of every call so
    that repeated calls do not accumulate rows from earlier inputs.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_columns]
        Nested dataframe with time-series in cells. It must pass
        ``validate_X`` and ``check_X_is_univariate``.
    y : ignored

    Returns
    -------
    Xt : pandas DataFrame
        Two columns, ``<column>_<value>_starts`` and
        ``<column>_<value>_lengths``, each cell holding an array.
    """
    # input checks
    validate_X(X)
    check_X_is_univariate(X)

    # get column name
    column_name = X.columns[0]

    # start from a clean slate: results of earlier calls must not leak
    # into this call's output
    self._starts = []
    self._lengths = []

    # find plateaus (segments of the same value)
    for x in X.iloc[:, 0]:
        x = np.asarray(x)

        # indicator series: 1 where x matches the target value, else 0
        if np.isnan(self.value):
            i = np.where(np.isnan(x), 1, 0)
        elif np.isinf(self.value):
            i = np.where(np.isinf(x), 1, 0)
        else:
            i = np.where(x == self.value, 1, 0)

        # pad with zeros so runs touching either end still produce a
        # rising (+1) and a falling (-1) transition
        transitions = np.diff(np.hstack([0, i, 0]))

        # compute starts, ends and lengths of the segments
        starts = np.where(transitions == 1)[0]
        ends = np.where(transitions == -1)[0]
        lengths = ends - starts

        # drop segments shorter than the minimum length
        long_enough = lengths >= self.min_length
        starts = starts[long_enough]
        lengths = lengths[long_enough]

        self._starts.append(starts)
        self._lengths.append(lengths)

    # put into dataframe
    Xt = pd.DataFrame()
    column_prefix = "%s_%s" % (
        column_name, "nan" if np.isnan(self.value) else str(self.value))
    Xt["%s_starts" % column_prefix] = pd.Series(self._starts)
    Xt["%s_lengths" % column_prefix] = pd.Series(self._lengths)
    return Xt
def transform(self, X, y=None):
    """Remove the seasonal components from the series in X.

    A seasonal decomposition is fitted on the tabularised first column of
    X via ``self._fit_seasonal_decomposition_model`` and removed from the
    data (subtracted if ``self.model == 'additive'``, divided otherwise).
    The components of the first seasonal period are stored in
    ``seasonal_components_`` for use by the inverse transform, together
    with ``_input_shape`` and ``_time_index``.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_features]
        Nested dataframe with time-series in cells. Only a single column
        is supported.
    y : ignored

    Returns
    -------
    Xt : pandas DataFrame
        Nested dataframe with the same column names as X. If ``self.sp``
        is 1, X itself is returned unchanged.

    Raises
    ------
    NotImplementedError
        If input checks are enabled and X has more than one column.
    """
    # All input validation is governed by `check_input`; there is no
    # second unconditional validation further down.
    if self.check_input:
        validate_X(X)
        if X.shape[1] > 1:
            raise NotImplementedError("Currently does not work on multiple columns, make use of ColumnTransformer "
                                      "instead")

    self._input_shape = X.shape

    # when seasonal periodicity is equal to 1 return X unchanged
    if self.sp == 1:
        return X

    # Fail before any expensive work or further state changes if the
    # transformer has not been fitted.
    check_is_fitted(self, 'is_fitted_')

    # keep time index as transform/inverse transform depends on it,
    # e.g. to carry forward trend in inverse_transform
    self._time_index = get_time_index(X)

    # convert into tabular format
    tabulariser = Tabulariser()
    Xs = tabulariser.transform(X.iloc[:, :1])

    # fit seasonal decomposition model
    seasonal_components = self._fit_seasonal_decomposition_model(Xs)

    # remove seasonal components from data
    if self.model == 'additive':
        Xt = Xs - seasonal_components
    else:
        Xt = Xs / seasonal_components

    # keep fitted seasonal components for inverse transform; they repeat
    # after the first seasonal period, so only that period is kept
    self.seasonal_components_ = seasonal_components[:, :self.sp]

    # convert back into nested format
    Xt = tabulariser.inverse_transform(pd.DataFrame(Xt))
    Xt.columns = X.columns
    return Xt
def predict_proba(self, X):
    """
    Find probability estimates for each class for all cases in X.

    The input is checked with ``validate_X`` and then handed unchanged to
    ``self.classifier.predict_proba``.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The input samples.

    Returns
    -------
    output : array of shape = [n_samples, num_classes] of probabilities
    """
    validate_X(X)
    probabilities = self.classifier.predict_proba(X)
    return probabilities
def find_closest_exemplar_indices(self, X):
    """
    Find the closest exemplar index for each instance in a dataframe.

    Distances come from ``self.distance_to_exemplars``; for every instance
    the winning exemplar is chosen by ``comparison.arg_min``, which also
    receives ``self.random_state``.

    :param X: the dataframe containing instances
    :return: 1d numpy array of indices, one for each instance, reflecting
        the index of the closest exemplar
    """
    validate_X(X)  # todo make checks optional and propagate from forest downwards
    n_instances = X.shape[0]
    distances = self.distance_to_exemplars(X)

    closest = np.empty(n_instances, dtype=int)
    # Instances are processed strictly in order so that the random state
    # is consumed in the same sequence on every run.
    for position in range(n_instances):
        closest[position] = comparison.arg_min(
            distances[position], self.random_state)
    return closest
def transform(self, X, y=None):
    """
    Segment the time-series in X into the intervals generated during `fit`.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_features]
        Nested dataframe with time-series in cells. It must have the same
        number of columns as the data passed to `fit`.
    y : ignored

    Returns
    -------
    Xt : pandas DataFrame
        Transformed pandas DataFrame with one column for each generated
        interval, named ``<first column of X>_<start>_<end>``.

    Raises
    ------
    ValueError
        If the number of columns differs from what was seen in `fit`.
    """
    # Check inputs.
    check_is_fitted(self, 'intervals_')
    validate_X(X)

    # Check that the input is of the same shape as the one passed
    # during fit.
    if X.shape[1] != self.input_shape_[1]:
        raise ValueError(
            'Number of columns of input is different from what was seen '
            'in `fit`')

    # Segment into intervals.
    # TODO generalise to non-equal-index cases
    # TODO validate that the time indexes match those seen in `fit`
    colname = X.columns[0]

    # Tabularise assuming series have equal indexes in any given column
    arr = tabularize(X, return_array=True)

    intervals = []
    colnames = []
    for start, end in self.intervals_:
        intervals.append(arr[:, start:end])
        colnames.append(f"{colname}_{start}_{end}")

    # Return nested pandas DataFrame.
    Xt = pd.DataFrame(concat_nested_arrays(intervals, return_arrays=True))
    Xt.columns = colnames
    return Xt
def _apply_rowwise(self, func, X, y=None):
    """Helper function to apply transform or inverse_transform function on
    each row of data container.

    Two strategies are tried in order:

    1. ``col.apply(func)`` on every column, which is relatively fast but
       breaks if ``func`` expects 2d input.
    2. If that raises, an explicit loop that wraps every cell in a 2d
       ``pandas.DataFrame`` before calling ``func``; most robust but very
       slow.

    Parameters
    ----------
    func : callable
        Function applied to every cell of X.
    X : pandas DataFrame
        Nested data container; it must pass ``validate_X``.
    y : ignored

    Returns
    -------
    Xt : pandas DataFrame
        The transformed data.
    """
    check_is_fitted(self, '_is_fitted')
    validate_X(X)

    # Fast path: apply per column.
    try:
        Xt = pd.concat(
            [pd.Series(col.apply(func)) for _, col in X.items()], axis=1)

    # Slow path: explicit for-loops. Only ordinary errors trigger the
    # fallback; KeyboardInterrupt / SystemExit propagate to the caller.
    except Exception:
        cols_t = []
        for c in range(X.shape[1]):  # loop over columns
            col = X.iloc[:, c]
            rows_t = []
            for row in col:  # loop over rows in each column
                row_2d = pd.DataFrame(row)  # convert into 2d dataframe
                row_t = func(row_2d).ravel()  # apply transform
                rows_t.append(row_t)  # append transformed rows
            cols_t.append(rows_t)  # append transformed columns

        # if series-to-series transform, flatten transformed series
        Xt = concat_nested_arrays(cols_t)  # concatenate transformed columns

        # tabularise/unnest series-to-primitive transforms
        xt = Xt.iloc[0, 0]
        if isinstance(xt, (pd.Series, np.ndarray)) and len(xt) == 1:
            Xt = tabularize(Xt)
    return Xt
def fit(self, X, y=None):
    """
    No-op fit: validates the input and marks the transformer as fitted.

    Parameters
    ----------
    X : input data
        The training input samples; only checked with ``validate_X``.
    y : None, as it is transformer on X

    Returns
    -------
    self : object
        Returns self.
    """
    validate_X(X)

    # this transformer needs no fitting, so only the flag is recorded
    self.is_fitted_ = True
    return self
def distance_to_exemplars(self, X):
    """
    Find the distance from every instance in X to the exemplars.

    Each row of X is passed, together with ``self.X_exemplar`` and
    ``self.distance_measure``, to ``self._distance_to_exemplars_inst``.

    :param X: the dataset containing a list of instances
    :return: 2d numpy array of distances from each instance to each
        exemplar (instance by exemplar)
    """
    validate_X(X)

    # joblib is used only when more than one job (or a negative job count)
    # was requested; otherwise rows are processed in a plain loop.
    sequential = not (self.n_jobs > 1 or self.n_jobs < 0)
    if sequential:
        per_instance = [
            self._distance_to_exemplars_inst(
                self.X_exemplar, X.iloc[row, :], self.distance_measure)
            for row in range(X.shape[0])
        ]
    else:
        pool = Parallel(self.n_jobs)
        per_instance = pool(
            delayed(self._distance_to_exemplars_inst)(
                self.X_exemplar, X.iloc[row, :], self.distance_measure)
            for row in range(X.shape[0])
        )

    # stack the per-instance results row by row
    stacked = np.vstack(np.array(per_instance))
    return stacked
def predict_proba(self, X, input_checks=True):
    """
    Estimate class probabilities for every case in X with this tree.

    Every instance is routed to the branch of its closest exemplar (as
    reported by ``self.stump.find_closest_exemplar_indices``). A branch
    without a sub-tree contributes a one-hot row for the exemplar's class;
    otherwise the sub-tree's own ``predict_proba`` is used for the routed
    instances. The accumulated rows are normalised with the l1 norm.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The input samples (sktime-format pandas DataFrame). The upstream
        documentation states that only single-column frames are supported
        because the classifier has no multivariate capability yet.
    input_checks : boolean
        Whether ``X`` is passed to ``validate_X`` before prediction.

    Returns
    -------
    output : array of shape = [n_instances, n_classes] of probabilities
    """
    if input_checks:
        validate_X(X)
    X = dataset_properties.negative_dataframe_indices(X)

    nearest = self.stump.find_closest_exemplar_indices(X)
    n_classes = len(self.label_encoder.classes_)
    distribution = np.zeros((X.shape[0], n_classes))

    for branch_index in range(len(self.branches)):
        members = np.argwhere(nearest == branch_index)
        if not members.shape[0] > 0:
            # no instance was routed to this branch
            continue
        members = np.ravel(members)

        sub_tree = self.branches[branch_index]
        if sub_tree is not None:
            sub_distribution = sub_tree.predict_proba(X.iloc[members, :])
        else:
            # leaf: a single one-hot row, broadcast over all members below
            sub_distribution = np.zeros((1, n_classes))
            class_label = self.stump.y_exemplar[branch_index]
            sub_distribution[0][class_label] = 1

        assert sub_distribution.shape[1] == n_classes
        np.add.at(distribution, members, sub_distribution)

    # Rescale rows in place (copy=False).
    normalize(distribution, copy=False, norm='l1')
    return distribution
def inverse_transform(self, X, y=None):
    """Transform tabular pandas dataframe into nested dataframe.

    Uses the row index and time index remembered on the instance
    (``_index`` and ``_time_index``) to rebuild the nested layout.

    Parameters
    ----------
    X : pandas DataFrame
        Tabular dataframe with primitives in cells.
    y : array-like, optional (default=None)
        Not used.

    Returns
    -------
    Xt : pandas DataFrame
        Transformed dataframe with series in cells.
    """
    # `_time_index` must have been stored before inverting is possible
    check_is_fitted_in_transform(self, '_time_index')

    # TODO check if for each column, all rows have equal-index series
    if self.check_input:
        validate_X(X)

    nested = detabularize(X, index=self._index, time_index=self._time_index)
    return nested
def fit(self, X, y=None):
    """
    Fit transformer, deriving the interval indices from ``self.intervals``.

    * If ``self.intervals`` is a numpy array it is used as is.
    * If it is an integer (Python ``int`` or numpy integer scalar), the
      time index of X is split into that many sections with
      ``np.array_split``.

    Parameters
    ----------
    X : pandas DataFrame of shape [n_samples, n_features]
        Input data; it must pass ``validate_X`` and
        ``check_X_is_univariate``.
    y : pandas Series, shape (n_samples, ...), optional
        Targets for supervised learning. Not used here.

    Returns
    -------
    self : an instance of self.

    Raises
    ------
    ValueError
        If ``self.intervals`` is neither a numpy array nor an integer.
    """
    validate_X(X)
    check_X_is_univariate(X)

    self.input_shape_ = X.shape

    # Retrieve time-series indexes from each column.
    self._time_index = get_time_index(X)

    intervals = self.intervals
    if isinstance(intervals, np.ndarray):
        self.intervals_ = intervals

    # An instance check is used instead of np.issubdtype: the latter
    # expects a dtype-like argument and raises TypeError for a plain
    # Python int value. bool is excluded even though it subclasses int.
    elif isinstance(intervals, (int, np.integer)) and not isinstance(intervals, bool):
        self.intervals_ = np.array_split(self._time_index, intervals)

    else:
        raise ValueError(
            f"Intervals must be either an integer, a single array with "
            f"start and end points, but found: {self.intervals}")

    return self
def transform(self, X, y=None):
    """Transform nested pandas dataframe into tabular dataframe.

    The columns, row index and time index of X are remembered on the
    instance (``_columns``, ``_index``, ``_time_index``).

    Parameters
    ----------
    X : pandas DataFrame
        Nested dataframe with pandas series or numpy arrays in cells.
    y : array-like, optional (default=None)
        Not used.

    Returns
    -------
    Xt : pandas DataFrame
        Transformed dataframe with only primitives in cells.
    """
    if self.check_input:
        validate_X(X)

    # remember the layout of the nested input
    self._columns = X.columns
    self._index = X.index
    self._time_index = get_time_index(X)

    flat = tabularize(X)
    return flat
def transform(self, X, y=None):
    """
    Segment the series in X into the intervals generated during `fit` and
    compute every feature function in ``self.features`` on every interval.

    Parameters
    ----------
    X : nested pandas.DataFrame of shape [n_samples, n_features]
        Nested dataframe with time-series in cells. It must be univariate
        and have the same number of columns as the data passed to `fit`.
    y : ignored

    Returns
    -------
    Xt : pandas.DataFrame
        One row per input row and one column per (feature, interval)
        pair, named ``<column>_<start>_<end>_<feature name>``. The column
        names are also stored in ``self.columns_``.

    Raises
    ------
    ValueError
        If the number of columns differs from what was seen in `fit`.
    """
    # Check is fit had been called
    check_is_fitted(self, 'intervals_')
    validate_X(X)
    check_X_is_univariate(X)

    # Check that the input is of the same shape as the one passed
    # during fit.
    if X.shape[1] != self.input_shape_[1]:
        raise ValueError(
            'Number of columns of input is different from what was seen '
            'in `fit`')

    # TODO validate that the time indexes match those seen in `fit`
    n_rows = X.shape[0]
    n_features = len(self.features)
    n_intervals = len(self.intervals_)

    # Allocate output array for transformed data
    Xt = np.zeros((n_rows, n_features * n_intervals))
    self.columns_ = []
    colname = X.columns[0]

    # Tabularize each column assuming series have equal indexes in any
    # given column.
    # TODO generalise to non-equal-index cases
    arr = tabularize(X, return_array=True)

    i = 0
    for func in self.features:
        # TODO generalise to series-to-series functions and function kwargs
        for start, end in self.intervals_:
            interval = arr[:, start:end]

            # Try to use optimised computations over axis if possible,
            # otherwise iterate over rows.
            try:
                Xt[:, i] = func(interval, axis=1)
            except TypeError as e:
                # Match on the stable tail of the message only: the
                # callable name that prefixes it is not guaranteed to
                # equal func.__name__ (e.g. qualified names).
                if "got an unexpected keyword argument 'axis'" in str(e):
                    Xt[:, i] = np.apply_along_axis(func, 1, interval)
                else:
                    raise
            i += 1
            self.columns_.append(
                f'{colname}_{start}_{end}_{func.__name__}')

    Xt = pd.DataFrame(Xt)
    Xt.columns = self.columns_
    return Xt
def transform(self, X):
    """
    Apply the `fit_transform()` method of the per-row transformer
    repeatedly on each row.

    Two strategies are tried in order:

    1. ``col.apply(self.transformer.fit_transform)`` on every column,
       which is relatively fast but breaks if the transformer expects 2d
       input.
    2. If that raises, an explicit loop that wraps every cell in a 2d
       ``pandas.DataFrame`` before calling ``fit_transform``; most robust
       but very slow.

    Parameters
    ----------
    X : pandas DataFrame
        The input samples; it must pass ``validate_X``. The code relies on
        the DataFrame API (``items()``, ``shape[1]``, ``iloc``).

    Returns
    -------
    Xt : pandas DataFrame
        The transformed data.
    """
    # check the validity of input
    validate_X(X)
    check_is_fitted(self, 'is_fitted_')

    # Fast path: apply per column.
    try:
        Xt = pd.concat([
            pd.Series(col.apply(self.transformer.fit_transform))
            for _, col in X.items()
        ], axis=1)

    # Slow path: explicit for-loops. Only ordinary errors trigger the
    # fallback; KeyboardInterrupt / SystemExit propagate to the caller.
    except Exception:
        cols_t = []
        for c in range(X.shape[1]):  # loop over columns
            col = X.iloc[:, c]
            rows_t = []
            for row in col:  # loop over rows in each column
                row_2d = pd.DataFrame(row)  # convert into 2d dataframe
                row_t = self.transformer.fit_transform(
                    row_2d).ravel()  # apply transform
                rows_t.append(row_t)  # append transformed rows
            cols_t.append(rows_t)  # append transformed columns

        # if series-to-series transform, flatten transformed series
        Xt = concat_nested_arrays(cols_t)  # concatenate transformed columns

        # tabularise/unnest series-to-primitive transforms
        xt = Xt.iloc[0, 0]
        if isinstance(xt, (pd.Series, np.ndarray)) and len(xt) == 1:
            Xt = tabularize(Xt)
    return Xt
def inverse_transform(self, X, y=None):
    """Inverse transform X by putting the seasonal components back.

    The seasonal components stored during `transform` are aligned with
    the time index of X, tiled or cut to the length of that index, and
    then added to the data (if ``self.model == 'additive'``) or
    multiplied with it (otherwise).

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_features]
        Nested dataframe with time-series in cells. It must have the same
        number of rows as the data seen during `transform`.
    y : ignored

    Returns
    -------
    Xit : pandas DataFrame
        Nested dataframe with the same column names as X. If ``self.sp``
        is 1, X itself is returned unchanged.

    Raises
    ------
    ValueError
        If the number of rows differs from what was seen in `transform`.
    """
    if self.check_input:
        validate_X(X)
        check_X_is_univariate(X)

    # check that number of samples are the same, inverse transform depends
    # on parameters fitted in transform and hence only works on data with
    # the same (number of) rows
    if not X.shape[0] == self._input_shape[0]:
        raise ValueError(f"Inverse transform only works on data with the same number samples "
                         f"as seen during transform, but found: {X.shape[0]} samples "
                         f"!= {self._input_shape[0]} samples (seen during transform)")

    # if the seasonal periodicity is 1, return unchanged X
    sp = self.sp
    if sp == 1:
        return X

    # check if seasonal decomposition model has been fitted in transform
    check_is_fitted_in_transform(self, 'seasonal_components_')

    # check if time index is aligned with time index seen during transform
    time_index = get_time_index(X)

    # align seasonal components with index of X
    if self._time_index.equals(time_index):
        # same index as used for fitting: the stored components can be
        # used directly and only need expanding below
        seasonal_components = self.seasonal_components_
    else:
        # otherwise align fitted seasonal components to the new index
        seasonal_components = self._align_seasonal_components_to_index(time_index)

    # expand or shorten aligned seasonal components to same size as X
    n_obs = len(time_index)
    if n_obs > sp:
        # built-in int: the `np.int` alias no longer exists in NumPy
        n_tiles = int(np.ceil(n_obs / sp))
        seasonal_components = np.tile(seasonal_components, n_tiles)
    seasonal_components = seasonal_components[:, :n_obs]

    # convert into tabular format
    tabulariser = Tabulariser()
    Xs = tabulariser.transform(X.iloc[:, :1])

    # inverse transform data
    if self.model == 'additive':
        Xit = Xs + seasonal_components
    else:
        Xit = Xs * seasonal_components

    # convert back into nested format
    Xit = tabulariser.inverse_transform(pd.DataFrame(Xit))
    Xit.columns = X.columns
    return Xit