def _get_extraction_params(self):
    """Set default tsfresh extraction parameters, overridden by user settings.

    Builds the keyword-argument dict passed to tsfresh's feature extraction:
    starts from tsfresh's own defaults, overwrites any entry for which this
    estimator has a non-None attribute of the same name, and resolves the
    ``default_fc_parameters`` convenience strings ("minimal", "efficient",
    "comprehensive") to tsfresh parameter objects.

    Returns
    -------
    dict
        Keyword arguments for tsfresh feature extraction.

    Raises
    ------
    ValueError
        If ``default_fc_parameters`` is a string other than one of the
        supported convenience names.
    """
    # make n_jobs compatible with scikit-learn
    self.n_jobs = check_n_jobs(self.n_jobs)

    # lazy imports to avoid hard dependency
    from tsfresh.defaults import CHUNKSIZE
    from tsfresh.defaults import DISABLE_PROGRESSBAR
    from tsfresh.defaults import N_PROCESSES
    from tsfresh.defaults import PROFILING
    from tsfresh.defaults import PROFILING_FILENAME
    from tsfresh.defaults import PROFILING_SORTING
    from tsfresh.defaults import SHOW_WARNINGS
    from tsfresh.feature_extraction.settings import ComprehensiveFCParameters
    from tsfresh.feature_extraction.settings import EfficientFCParameters
    from tsfresh.feature_extraction.settings import MinimalFCParameters
    from tsfresh.utilities.dataframe_functions import impute

    # Set defaults from tsfresh
    extraction_params = {
        "kind_to_fc_parameters": self.kind_to_fc_parameters,
        "n_jobs": N_PROCESSES,
        "chunksize": CHUNKSIZE,
        "show_warnings": SHOW_WARNINGS,
        "disable_progressbar": DISABLE_PROGRESSBAR,
        "impute_function": impute,
        "profiling_sorting": PROFILING_SORTING,
        "profiling_filename": PROFILING_FILENAME,
        "profile": PROFILING,
    }

    # Replace defaults with user defined parameters (None means "keep default")
    for name in extraction_params:
        value = getattr(self, name, None)
        if value is not None:
            extraction_params[name] = value

    # Convert convenience string arguments to tsfresh parameter classes
    fc_param_lookup = {
        "minimal": MinimalFCParameters(),
        "efficient": EfficientFCParameters(),
        "comprehensive": ComprehensiveFCParameters(),
    }
    if isinstance(self.default_fc_parameters, str):
        if self.default_fc_parameters not in fc_param_lookup:
            # list(...) so the message shows the names, not a dict_keys repr
            raise ValueError(
                f"If `default_fc_parameters` is passed as a string, "
                f"it must be one of {list(fc_param_lookup)}, but found: "
                f"{self.default_fc_parameters}"
            )
        fc_parameters = fc_param_lookup[self.default_fc_parameters]
    else:
        fc_parameters = self.default_fc_parameters
    extraction_params["default_fc_parameters"] = fc_parameters

    return extraction_params
def fit(self, X: TimeSeriesInstances, y=None) -> BaseEstimator:
    """Fit time series clusterer to training data.

    Parameters
    ----------
    X : Training time series instances to cluster. np.ndarray (2d or 3d array of
        shape (n_instances, series_length) or shape
        (n_instances, n_dimensions, series_length)) or pd.DataFrame (where each
        column is a dimension, each cell is a pd.Series (any number of dimensions,
        equal or unequal length series)).
        Converted to type _tags["X_inner_mtype"]
    y: ignored, exists for API consistency reasons.

    Returns
    -------
    self:
        Fitted estimator.
    """
    X = self._check_clusterer_input(X)

    # resolve thread count only for clusterers that declare the capability
    if self.get_tag("capability:multithreading"):
        try:
            self._threads_to_use = check_n_jobs(self.n_jobs)
        except NameError:
            raise AttributeError(
                "self.n_jobs must be set if capability:multithreading is True"
            )

    # record wall-clock fit time in milliseconds
    fit_start_ms = int(round(time.time() * 1000))
    self._fit(X)
    self.fit_time_ = int(round(time.time() * 1000)) - fit_start_ms

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Fit time series classifier to training data.

    Parameters
    ----------
    X : 3D np.array (any number of dimensions, equal length series)
            of shape [n_instances, n_dimensions, series_length]
        or 2D np.array (univariate, equal length series)
            of shape [n_instances, series_length]
        or pd.DataFrame with each column a dimension, each cell a pd.Series
            (any number of dimensions, equal or unequal length series)
        or of any other supported Panel mtype
            for list of mtypes, see datatypes.SCITYPE_REGISTER
            for specifications, see examples/AA_datatypes_and_datasets.ipynb
    y : 1D np.array of int, of shape [n_instances] - class labels for fitting
        indices correspond to instance indices in X

    Returns
    -------
    self : Reference to self.

    Notes
    -----
    Changes state by creating a fitted model that updates attributes
    ending in "_" and sets is_fitted flag to True.
    """
    # timing starts before conversion so fit_time_ covers all fit work
    start = int(round(time.time() * 1000))
    # convenience conversions to allow user flexibility:
    # if X is 2D array, convert to 3D, if y is Series, convert to numpy
    X, y = _internal_convert(X, y)
    X_metadata = _check_classifier_input(X, y)
    missing = X_metadata["has_nans"]
    multivariate = not X_metadata["is_univariate"]
    unequal = not X_metadata["is_equal_length"]
    # Check this classifier can handle characteristics
    self._check_capabilities(missing, multivariate, unequal)
    # Convert data as dictated by the classifier tags
    X = self._convert_X(X)
    multithread = self.get_tag("capability:multithreading")
    if multithread:
        try:
            self._threads_to_use = check_n_jobs(self.n_jobs)
        except NameError:
            # check_n_jobs raising NameError implies n_jobs was never defined
            raise AttributeError(
                "self.n_jobs must be set if capability:multithreading is True"
            )

    # class bookkeeping: sorted unique labels and a label -> column-index map
    # (used to order predict_proba columns)
    self.classes_ = np.unique(y)
    self.n_classes_ = self.classes_.shape[0]
    self._class_dictionary = {}
    for index, class_val in enumerate(self.classes_):
        self._class_dictionary[class_val] = index

    self._fit(X, y)
    self.fit_time_ = int(round(time.time() * 1000)) - start

    # this should happen last
    self._is_fitted = True
    return self
def fit(self, X, y):
    """Fit time series classifier to training data.

    Parameters
    ----------
    X : 2D np.array (univariate, equal length series) of shape =
        [n_instances, series_length]
        or 3D np.array (any number of dimensions, equal length series) of shape =
        [n_instances,n_dimensions,series_length]
        or pd.DataFrame with each column a dimension, each cell a pd.Series (any
        number of dimensions, equal or unequal length series)
    y : 1D np.array of shape =  [n_instances] - the class labels.

    Returns
    -------
    self : Reference to self.

    Notes
    -----
    Changes state by creating a fitted model that updates attributes ending
    in "_" and sets is_fitted flag to True.
    """
    # coercion behaviour is driven entirely by the estimator's tags
    coerce_to_numpy = self.get_tag("coerce-X-to-numpy")
    coerce_to_pandas = self.get_tag("coerce-X-to-pandas")
    allow_multivariate = self.get_tag("capability:multivariate")
    X, y = check_X_y(
        X,
        y,
        coerce_to_numpy=coerce_to_numpy,
        coerce_to_pandas=coerce_to_pandas,
        enforce_univariate=not allow_multivariate,
    )

    multithread = self.get_tag("capability:multithreading")
    if multithread:
        try:
            self._threads_to_use = check_n_jobs(self.n_jobs)
        except NameError:
            raise AttributeError(
                "self.n_jobs must be set if capability:multithreading is True"
            )

    self.classes_ = np.unique(y)
    self.n_classes_ = self.classes_.shape[0]
    # NOTE(review): _class_dictionary is filled but never created here —
    # presumably initialised to {} in __init__; verify against the class.
    for index, classVal in enumerate(self.classes_):
        self._class_dictionary[classVal] = index

    self._fit(X, y)

    # this should happen last
    self._is_fitted = True
    return self
def _fit(self, X, y):
    """Fit the shapelet transform classifier: transform then classify.

    Builds a RandomShapeletTransform on (X, y), then fits the (cloned)
    estimator — RotationForest by default — on the transformed data.
    When contracting, the time budget is split between transform and
    classifier.

    Parameters
    ----------
    X : 3D np.ndarray of shape (n_instances, n_dims, series_length)
        The training data.
    y : array-like of shape (n_instances,)
        The class labels.
    """
    self._n_jobs = check_n_jobs(self.n_jobs)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    if self.time_limit_in_minutes > 0:
        # contracting 2/3 transform (with 1/5 of that taken away for final
        # transform), 1/3 classifier
        third = self.time_limit_in_minutes / 3
        self._classifier_limit_in_minutes = third
        self._transform_limit_in_minutes = (third * 2) / 5 * 4
    elif self.transform_limit_in_minutes > 0:
        self._transform_limit_in_minutes = self.transform_limit_in_minutes

    # FIX: pass the checked/normalised job count (self._n_jobs) rather than
    # the raw self.n_jobs, consistent with how the estimator is configured.
    self._transformer = RandomShapeletTransform(
        n_shapelet_samples=self.n_shapelet_samples,
        max_shapelets=self.max_shapelets,
        max_shapelet_length=self.max_shapelet_length,
        time_limit_in_minutes=self._transform_limit_in_minutes,
        contract_max_n_shapelet_samples=self.contract_max_n_shapelet_samples,
        n_jobs=self._n_jobs,
        batch_size=self.batch_size,
        random_state=self.random_state,
    )

    self._estimator = _clone_estimator(
        RotationForest() if self.estimator is None else self.estimator,
        self.random_state,
    )

    if isinstance(self._estimator, RotationForest):
        self._estimator.save_transformed_data = self.save_transformed_data

    # FIX: use hasattr, not getattr(..., None) — sklearn estimators commonly
    # default n_jobs=None, which the old check wrongly treated as "no support".
    if hasattr(self._estimator, "n_jobs"):
        self._estimator.n_jobs = self._n_jobs

    if hasattr(self._estimator, "time_limit_in_minutes") and (
        self.time_limit_in_minutes > 0
    ):
        self._estimator.time_limit_in_minutes = self._classifier_limit_in_minutes

    X_t = self._transformer.fit_transform(X, y).to_numpy()

    if self.save_transformed_data:
        self.transformed_data = X_t

    self._estimator.fit(X_t, y)
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y).

    Parameters
    ----------
    X : np.ndarray or pd.DataFrame
        Panel training data of shape [n_instances, 1, series_length]
        (univariate); squeezed to 2d internally.
    y : np.ndarray
        The class labels.

    Returns
    -------
    self : object
        A fitted instance of the classifier
    """
    X, y = check_X_y(
        X,
        y,
        enforce_univariate=not self.capabilities["multivariate"],
        coerce_to_numpy=True,
    )
    # drop the singleton channel axis: (n, 1, m) -> (n, m)
    X = X.squeeze(1)

    # n_instances is unused below; kept for the shape unpack
    n_instances, self.series_length = X.shape

    n_jobs = check_n_jobs(self.n_jobs)

    rng = check_random_state(self.random_state)

    self.n_classes = np.unique(y).shape[0]

    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # sqrt(m) intervals per tree, at least 1; shrink min_interval for
    # very short series
    self.n_intervals = int(math.sqrt(self.series_length))
    if self.n_intervals == 0:
        self.n_intervals = 1
    if self.series_length < self.min_interval:
        self.min_interval = self.series_length

    # one interval set per ensemble member, all drawn from the same rng
    self.intervals_ = [
        _get_intervals(self.n_intervals, self.min_interval, self.series_length, rng)
        for _ in range(self.n_estimators)
    ]

    # NOTE(review): every clone shares the same rng object — estimator seeds
    # are therefore drawn sequentially from one stream; confirm intended.
    self.estimators_ = Parallel(n_jobs=n_jobs)(
        delayed(_fit_estimator)(
            _clone_estimator(self.base_estimator, rng), X, y, self.intervals_[i]
        )
        for i in range(self.n_estimators)
    )

    self._is_fitted = True
    return self
def _instantiate_model(self):
    """Build the underlying (T)BATS model from validated hyper-parameters.

    Normalises ``n_jobs`` and the seasonal periodicity first, then forwards
    all constructor settings to ``self._ModelClass``.
    """
    n_threads = check_n_jobs(self.n_jobs)
    seasonal_periods = check_sp(self.sp, enforce_list=True)

    model_kwargs = {
        "use_box_cox": self.use_box_cox,
        "box_cox_bounds": self.box_cox_bounds,
        "use_trend": self.use_trend,
        "use_damped_trend": self.use_damped_trend,
        "seasonal_periods": seasonal_periods,
        "use_arma_errors": self.use_arma_errors,
        "show_warnings": self.show_warnings,
        "n_jobs": n_threads,
        "multiprocessing_start_method": self.multiprocessing_start_method,
        "context": self.context,
    }
    return self._ModelClass(**model_kwargs)
def _fit(self, X, y):
    """Build the SFA word transform(s), selecting dimensions if multivariate.

    For multivariate input, a transformer is fitted per selected dimension
    and the resulting word counts are merged into one bag per instance with
    the dimension id packed into the low bits of each word. For univariate
    input a single SFA transformer is fitted directly.
    """
    self._n_jobs = check_n_jobs(self.n_jobs)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self._class_vals = y
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    # NOTE(review): _class_dictionary assumed initialised to {} in __init__
    for index, classVal in enumerate(self.classes_):
        self._class_dictionary[classVal] = index

    # select dimensions using accuracy estimate if multivariate
    if self.n_dims > 1:
        self._dims, self._transformers = self._select_dims(X, y)

        # one bag-of-words (word -> count) per training instance
        words = [defaultdict(int) for _ in range(self.n_instances)]

        for i, dim in enumerate(self._dims):
            X_dim = X[:, dim, :].reshape(self.n_instances, 1, self.series_length)
            dim_words = self._transformers[i].transform(X_dim, y)
            dim_words = dim_words[0]

            # pack the dimension index into each word so words from
            # different dimensions stay distinct
            # (assumes _highest_dim_bit is set by _select_dims — verify)
            for n in range(self.n_instances):
                for word, count in dim_words[n].items():
                    words[n][word << self._highest_dim_bit | dim] = count

        self._transformed_data = words
    else:
        # univariate: a single SFA transformer on the raw series
        # NOTE(review): _transformers assumed to be an empty list from __init__
        self._transformers.append(
            SFA(
                word_length=self.word_length,
                alphabet_size=self.alphabet_size,
                window_size=self.window_size,
                norm=self.norm,
                levels=self.levels,
                binning_method="information-gain" if self.igb else "equi-depth",
                bigrams=self.bigrams,
                remove_repeat_words=True,
                lower_bounding=False,
                save_words=False,
                use_fallback_dft=True,
                n_jobs=self._n_jobs,
            )
        )
        sfa = self._transformers[0].fit_transform(X, y)
        self._transformed_data = sfa[0]
def _fit(self, X, y):
    """Fit the interval forest: derive interval parameters, fit trees in parallel.

    Resolves the base estimator, derives interval count/length bounds from the
    series shape, then fits ``n_estimators`` members via joblib.
    """
    self._n_jobs = check_n_jobs(self.n_jobs)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    if self.base_estimator == "DTC":
        self._base_estimator = DecisionTreeClassifier(criterion="entropy")
    elif self.base_estimator == "CIT":
        self._base_estimator = ContinuousIntervalTree()
    elif isinstance(self.base_estimator, BaseEstimator):
        self._base_estimator = self.base_estimator
    else:
        # NOTE(review): message says "DrCIF" — possible copy-paste from a
        # sibling classifier; confirm the intended class name.
        raise ValueError("DrCIF invalid base estimator given.")

    # NOTE(review): the _-prefixed attributes below are only assigned on
    # some branches; they are presumably defaulted in __init__ — verify,
    # otherwise the comparisons on unset attributes would raise.
    if self.n_intervals is None:
        self._n_intervals = int(
            math.sqrt(self.series_length) * math.sqrt(self.n_dims)
        )
    if self._n_intervals <= 0:
        self._n_intervals = 1

    # attribute subsample is capped at the 25 implemented features
    if self.att_subsample_size > 25:
        self._att_subsample_size = 25

    if self.series_length < self.min_interval:
        self._min_interval = self.series_length
    elif self.min_interval < 3:
        self._min_interval = 3

    if self.max_interval is None:
        self._max_interval = self.series_length / 2
    if self._max_interval < self._min_interval:
        self._max_interval = self._min_interval

    fit = Parallel(n_jobs=self._n_jobs)(
        delayed(self._fit_estimator)(
            X,
            y,
            i,
        )
        for i in range(self.n_estimators)
    )

    self.estimators_, self.intervals, self.dims, self.atts = zip(*fit)
def fit(self, X, y):
    """Build an ensemble of ROCKET + RidgeClassifierCV pipelines.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y)
    threads = check_n_jobs(self.n_jobs)

    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    self.class_dictionary.update(
        (label, position) for position, label in enumerate(self.classes_)
    )

    # every member clones the same template pipeline with the same seed
    template = _make_estimator(self.num_kernels, self.random_state)
    self.estimators_ = Parallel(n_jobs=threads)(
        delayed(_fit_estimator)(
            _clone_estimator(template, self.random_state), X, y
        )
        for _ in range(self.n_estimators)
    )

    # member weight = its ridge classifier's cross-validated best score
    self.weights = [
        pipeline.steps[1][1].best_score_ for pipeline in self.estimators_
    ]
    self.weight_sum = sum(self.weights)

    self._is_fitted = True
    return self
def _fit(self, X, y):
    """Build the TDE ensemble by sampled/guided parameter selection.

    Members are IndividualTDE classifiers fitted on 70% subsamples. The first
    ``randomly_selected_params`` members use random parameter sets; afterwards
    a kernel-ridge model trained on (parameters, accuracy) history guides the
    choice. The ensemble keeps at most ``max_ensemble_size`` members, evicting
    the lowest-accuracy one.
    """
    self._n_jobs = check_n_jobs(self.n_jobs)

    if self.n_parameter_samples <= self.randomly_selected_params:
        print(  # noqa
            "TDE Warning: n_parameter_samples <= randomly_selected_params, ",
            "ensemble member parameters will be fully randomly selected.",
        )

    # NOTE(review): time_limit is recomputed identically further down —
    # this first assignment is redundant.
    time_limit = self.time_limit_in_minutes * 60
    self.n_instances, self.n_dims, self.series_length = X.shape
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self._class_dictionary[classVal] = index

    self.estimators_ = []
    self.weights = []
    self._prev_parameters_x = []
    self._prev_parameters_y = []

    # Window length parameter space dependent on series length
    max_window_searches = self.series_length / 4
    max_window = int(self.series_length * self.max_win_len_prop)
    win_inc = int((max_window - self.min_window) / max_window_searches)
    if win_inc < 1:
        win_inc = 1
    if self.min_window > max_window + 1:
        raise ValueError(
            f"Error in TemporalDictionaryEnsemble, min_window ="
            f"{self.min_window} is bigger"
            f" than max_window ={max_window},"
            f" series length is {self.series_length}"
            f" try set min_window to be smaller than series length in "
            f"the constructor, but the classifier may not work at "
            f"all with very short series"
        )

    possible_parameters = self._unique_parameters(max_window, win_inc)
    num_classifiers = 0
    subsample_size = int(self.n_instances * 0.7)
    lowest_acc = 1
    lowest_acc_idx = 0

    time_limit = self.time_limit_in_minutes * 60
    start_time = time.time()
    train_time = 0

    # contract mode: loop on time only (sample target set to 0)
    if time_limit > 0:
        n_parameter_samples = 0
    else:
        n_parameter_samples = self.n_parameter_samples

    rng = check_random_state(self.random_state)

    # bigrams default: off for multivariate, on for univariate
    if self.bigrams is None:
        if self.n_dims > 1:
            use_bigrams = False
        else:
            use_bigrams = True
    else:
        use_bigrams = self.bigrams

    # use time limit or n_parameter_samples if limit is 0
    while (
        train_time < time_limit or num_classifiers < n_parameter_samples
    ) and len(possible_parameters) > 0:
        if num_classifiers < self.randomly_selected_params:
            # exploration phase: uniform random parameter set
            parameters = possible_parameters.pop(
                rng.randint(0, len(possible_parameters))
            )
        else:
            # guided phase: predict accuracy of remaining parameter sets
            # from history and pick (a random) argmax
            scaler = preprocessing.StandardScaler()
            scaler.fit(self._prev_parameters_x)
            gp = KernelRidge(kernel="poly", degree=1)
            gp.fit(
                scaler.transform(self._prev_parameters_x),
                self._prev_parameters_y,
            )
            preds = gp.predict(scaler.transform(possible_parameters))
            parameters = possible_parameters.pop(
                rng.choice(np.flatnonzero(preds == preds.max()))
            )

        # each member trains on a fresh 70% subsample (without replacement)
        subsample = rng.choice(self.n_instances, size=subsample_size, replace=False)
        X_subsample = X[subsample]
        y_subsample = y[subsample]

        tde = IndividualTDE(
            *parameters,
            alphabet_size=self._alphabet_size,
            bigrams=use_bigrams,
            dim_threshold=self.dim_threshold,
            max_dims=self.max_dims,
            random_state=self.random_state,
        )
        tde.fit(X_subsample, y_subsample)
        tde._subsample = subsample

        if self.save_train_predictions:
            tde._train_predictions = np.zeros(subsample_size)

        # accuracy estimate can stop early once it cannot beat the
        # current worst member (lowest_acc threshold)
        tde._accuracy = self._individual_train_acc(
            tde,
            y_subsample,
            subsample_size,
            0 if num_classifiers < self.max_ensemble_size else lowest_acc,
        )
        if tde._accuracy > 0:
            weight = math.pow(tde._accuracy, 4)
        else:
            weight = 0.000000001

        if num_classifiers < self.max_ensemble_size:
            if tde._accuracy < lowest_acc:
                lowest_acc = tde._accuracy
                lowest_acc_idx = num_classifiers
            self.weights.append(weight)
            self.estimators_.append(tde)
        elif tde._accuracy > lowest_acc:
            # replace the current worst member, then rescan for the new worst
            self.weights[lowest_acc_idx] = weight
            self.estimators_[lowest_acc_idx] = tde
            lowest_acc, lowest_acc_idx = self._worst_ensemble_acc()

        self._prev_parameters_x.append(parameters)
        self._prev_parameters_y.append(tde._accuracy)

        num_classifiers += 1
        train_time = time.time() - start_time

    self.n_estimators = len(self.estimators_)
    self._weight_sum = np.sum(self.weights)
def _fit(self, X, y):
    """Fit DrCIF: build three representations, then fit trees in parallel.

    The three representations are the raw series, the (half-spectrum)
    periodogram of the zero-padded series, and the first difference.
    Interval counts and length bounds are derived per representation.
    Supports contract (time-limited) fitting in batches of ``_n_jobs``.
    """
    self._n_jobs = check_n_jobs(self.n_jobs)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    time_limit = self.time_limit_in_minutes * 60
    start_time = time.time()
    train_time = 0

    if self.base_estimator == "DTC":
        self._base_estimator = DecisionTreeClassifier(criterion="entropy")
    elif self.base_estimator == "CIT":
        self._base_estimator = ContinuousIntervalTree()
    elif isinstance(self.base_estimator, BaseEstimator):
        self._base_estimator = self.base_estimator
    else:
        raise ValueError("DrCIF invalid base estimator given.")

    # periodogram representation: zero-pad to the next power of two,
    # FFT, keep the magnitude of the first half of the spectrum
    X_p = np.zeros(
        (
            self.n_instances,
            self.n_dims,
            int(
                math.pow(2, math.ceil(math.log(self.series_length, 2)))
                - self.series_length
            ),
        )
    )
    X_p = np.concatenate((X, X_p), axis=2)
    X_p = np.abs(np.fft.fft(X_p)[:, :, : int(X_p.shape[2] / 2)])

    # first-difference representation
    X_d = np.diff(X, 1)

    # intervals per representation: derived from each representation's
    # length (and n_dims) unless given explicitly
    if self.n_intervals is None:
        self._n_intervals = [None, None, None]
        self._n_intervals[0] = 4 + int(
            (math.sqrt(self.series_length) * math.sqrt(self.n_dims)) / 3
        )
        self._n_intervals[1] = 4 + int(
            (math.sqrt(X_p.shape[2]) * math.sqrt(self.n_dims)) / 3
        )
        self._n_intervals[2] = 4 + int(
            (math.sqrt(X_d.shape[2]) * math.sqrt(self.n_dims)) / 3
        )
    elif isinstance(self.n_intervals, int):
        self._n_intervals = [self.n_intervals, self.n_intervals, self.n_intervals]
    elif isinstance(self.n_intervals, list) and len(self.n_intervals) == 3:
        self._n_intervals = self.n_intervals
    else:
        raise ValueError("DrCIF n_intervals must be an int or list of length 3.")
    for i, n in enumerate(self._n_intervals):
        if n <= 0:
            self._n_intervals[i] = 1

    # NOTE(review): _att_subsample_size only assigned when capping at 25 —
    # presumably defaulted in __init__; verify.
    if self.att_subsample_size > 25:
        self._att_subsample_size = 25

    if isinstance(self.min_interval, int):
        self._min_interval = [
            self.min_interval,
            self.min_interval,
            self.min_interval,
        ]
    elif isinstance(self.min_interval, list) and len(self.min_interval) == 3:
        self._min_interval = self.min_interval
    else:
        raise ValueError("DrCIF min_interval must be an int or list of length 3.")
    # clamp minimum interval lengths to each representation's length
    if self.series_length < self._min_interval[0]:
        self._min_interval[0] = self.series_length
    if X_p.shape[2] < self._min_interval[1]:
        self._min_interval[1] = X_p.shape[2]
    if X_d.shape[2] < self._min_interval[2]:
        self._min_interval[2] = X_d.shape[2]

    if self.max_interval is None:
        self._max_interval = [
            self.series_length / 2,
            X_p.shape[2] / 2,
            X_d.shape[2] / 2,
        ]
    elif isinstance(self.max_interval, int):
        self._max_interval = [
            self.max_interval,
            self.max_interval,
            self.max_interval,
        ]
    elif isinstance(self.max_interval, list) and len(self.max_interval) == 3:
        self._max_interval = self.max_interval
    else:
        raise ValueError("DrCIF max_interval must be an int or list of length 3.")
    for i, n in enumerate(self._max_interval):
        if n < self._min_interval[i]:
            self._max_interval[i] = self._min_interval[i]

    self.total_intervals = sum(self._n_intervals)

    if time_limit > 0:
        # contract mode: grow the ensemble in batches of _n_jobs until the
        # time budget or the contract cap is reached
        self._n_estimators = 0
        self.estimators_ = []
        self.intervals = []
        self.atts = []
        self.dims = []
        self.transformed_data = []

        while (
            train_time < time_limit
            and self._n_estimators < self.contract_max_n_estimators
        ):
            fit = Parallel(n_jobs=self._n_jobs)(
                delayed(self._fit_estimator)(
                    X,
                    X_p,
                    X_d,
                    y,
                    i,
                )
                for i in range(self._n_jobs)
            )

            (
                estimators,
                intervals,
                dims,
                atts,
                transformed_data,
            ) = zip(*fit)

            self.estimators_ += estimators
            self.intervals += intervals
            self.atts += atts
            self.dims += dims
            self.transformed_data += transformed_data

            self._n_estimators += self._n_jobs
            train_time = time.time() - start_time
    else:
        # NOTE(review): _n_estimators assumed set elsewhere (e.g. __init__)
        # when not contracting — verify.
        fit = Parallel(n_jobs=self._n_jobs)(
            delayed(self._fit_estimator)(
                X,
                X_p,
                X_d,
                y,
                i,
            )
            for i in range(self._n_estimators)
        )

        (
            self.estimators_,
            self.intervals,
            self.dims,
            self.atts,
            self.transformed_data,
        ) = zip(*fit)
def _fit(self, X, y):
    """Fit HIVE-COTE 1.0 to training data.

    Parameters
    ----------
    X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
        The training data.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self :
        Reference to self.

    Notes
    -----
    Changes state by creating a fitted model that updates attributes
    ending in "_" and sets is_fitted flag to True.

    Builds the four HC1 components (STC, TSF, RISE, cBOSS) and computes a
    weight for each as (train-estimate accuracy)^4, used when combining
    probabilities at predict time.
    """
    self.n_jobs_ = check_n_jobs(self.n_jobs)

    self.n_classes_ = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # Default values from HC1 paper
    # NOTE(review): the _x_params attributes are only assigned when the
    # public param is None — when a user supplies params, they are
    # presumably copied to _x_params in __init__; verify.
    if self.stc_params is None:
        self._stc_params = {"transform_limit_in_minutes": 120}
    if self.tsf_params is None:
        self._tsf_params = {"n_estimators": 500}
    if self.rise_params is None:
        self._rise_params = {"n_estimators": 500}
    if self.cboss_params is None:
        self._cboss_params = {}

    # Cross-validation size for TSF and RISE, capped by the rarest class
    cv_size = 10
    _, counts = np.unique(y, return_counts=True)
    min_class = np.min(counts)
    if min_class < cv_size:
        cv_size = min_class

    # Build STC
    self._stc = ShapeletTransformClassifier(
        **self._stc_params,
        save_transformed_data=True,
        random_state=self.random_state,
        n_jobs=self.n_jobs_,
    )
    self._stc.fit(X, y)

    if self.verbose > 0:
        print("STC ", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

    # Find STC weight using train set estimate
    train_probs = self._stc._get_train_probs(X, y)
    train_preds = self._stc.classes_[np.argmax(train_probs, axis=1)]
    self.stc_weight_ = accuracy_score(y, train_preds) ** 4

    if self.verbose > 0:
        print(  # noqa
            "STC train estimate ",
            datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
        )
        print("STC weight = " + str(self.stc_weight_))  # noqa

    # Build TSF
    self._tsf = TimeSeriesForestClassifier(
        **self._tsf_params,
        random_state=self.random_state,
        n_jobs=self.n_jobs_,
    )
    self._tsf.fit(X, y)

    if self.verbose > 0:
        print("TSF ", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

    # Find TSF weight using train set estimate found through CV
    # (a fresh, unfitted clone is passed to cross_val_predict)
    train_preds = cross_val_predict(
        TimeSeriesForestClassifier(
            **self._tsf_params, random_state=self.random_state
        ),
        X=X,
        y=y,
        cv=cv_size,
        n_jobs=self.n_jobs_,
    )
    self.tsf_weight_ = accuracy_score(y, train_preds) ** 4

    if self.verbose > 0:
        print(  # noqa
            "TSF train estimate ",
            datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
        )
        print("TSF weight = " + str(self.tsf_weight_))  # noqa

    # Build RISE
    self._rise = RandomIntervalSpectralForest(
        **self._rise_params,
        random_state=self.random_state,
        n_jobs=self.n_jobs_,
    )
    self._rise.fit(X, y)

    if self.verbose > 0:
        print("RISE ", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

    # Find RISE weight using train set estimate found through CV
    train_preds = cross_val_predict(
        RandomIntervalSpectralForest(
            **self._rise_params,
            random_state=self.random_state,
        ),
        X=X,
        y=y,
        cv=cv_size,
        n_jobs=self.n_jobs_,
    )
    self.rise_weight_ = accuracy_score(y, train_preds) ** 4

    if self.verbose > 0:
        print(  # noqa
            "RISE train estimate ",
            datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
        )
        print("RISE weight = " + str(self.rise_weight_))  # noqa

    # Build cBOSS (provides its own train estimate, no extra CV needed)
    self._cboss = ContractableBOSS(
        **self._cboss_params,
        random_state=self.random_state,
        n_jobs=self.n_jobs_,
    )
    self._cboss.fit(X, y)

    # Find cBOSS weight using train set estimate
    train_probs = self._cboss._get_train_probs(X, y)
    train_preds = self._cboss.classes_[np.argmax(train_probs, axis=1)]
    self.cboss_weight_ = accuracy_score(y, train_preds) ** 4

    if self.verbose > 0:
        print(  # noqa
            "cBOSS (estimate included)",
            datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
        )
        print("cBOSS weight = " + str(self.cboss_weight_))  # noqa
def _fit(self, X, y):
    """Fit the Arsenal ensemble of ROCKET pipelines.

    Members are clones of one ROCKET template with derived seeds. Supports
    contract (time-limited) fitting in batches of ``_n_jobs``. Each member's
    weight is its ridge classifier's cross-validated best score.
    """
    self._n_jobs = check_n_jobs(self.n_jobs)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    # NOTE(review): _class_dictionary assumed initialised to {} in __init__
    for index, classVal in enumerate(self.classes_):
        self._class_dictionary[classVal] = index

    time_limit = self.time_limit_in_minutes * 60
    start_time = time.time()
    train_time = 0

    base_rocket = Rocket(num_kernels=self.num_kernels)

    if time_limit > 0:
        # contract mode: grow the ensemble in batches of _n_jobs
        self.n_estimators = 0
        self.estimators_ = []
        self.transformed_data = []

        while (
            train_time < time_limit
            and self.n_estimators < self.contract_max_n_estimators
        ):
            fit = Parallel(n_jobs=self._n_jobs)(
                delayed(self._fit_estimator)(
                    _clone_estimator(
                        base_rocket,
                        # derive a distinct per-member seed; 0 is mapped to
                        # 255 so the multiplication never collapses to 0
                        None
                        if self.random_state is None
                        else (255 if self.random_state == 0 else self.random_state)
                        * 37
                        * (i + 1),
                    ),
                    X,
                    y,
                )
                for i in range(self._n_jobs)
            )

            estimators, transformed_data = zip(*fit)

            self.estimators_ += estimators
            self.transformed_data += transformed_data

            self.n_estimators += self._n_jobs
            train_time = time.time() - start_time
    else:
        fit = Parallel(n_jobs=self._n_jobs)(
            delayed(self._fit_estimator)(
                _clone_estimator(
                    base_rocket,
                    None
                    if self.random_state is None
                    else (255 if self.random_state == 0 else self.random_state)
                    * 37
                    * (i + 1),
                ),
                X,
                y,
            )
            for i in range(self.n_estimators)
        )

        self.estimators_, self.transformed_data = zip(*fit)

    # weight each member by its ridge CV best score
    self.weights = []
    self._weight_sum = 0
    for rocket_pipeline in self.estimators_:
        weight = rocket_pipeline.steps[1][1].best_score_
        self.weights.append(weight)
        self._weight_sum += weight
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y).

    Parameters
    ----------
    X : np.ndarray or pd.DataFrame
        2d tabular training data of shape [n_instances, n_attributes].
        A 3d array with a single channel is accepted and flattened to 2d.
    y : np.ndarray
        The class labels.

    Returns
    -------
    self : object
        A fitted instance of the classifier

    Raises
    ------
    ValueError
        If X is not 2d tabular data (RotationForest is not a time series
        classifier).
    """
    if isinstance(X, np.ndarray) and len(X.shape) == 3 and X.shape[1] == 1:
        X = np.reshape(X, (X.shape[0], -1))
    elif isinstance(X, pd.DataFrame) and len(X.shape) == 2:
        X = X.to_numpy()
    elif not isinstance(X, np.ndarray) or len(X.shape) > 2:
        raise ValueError(
            "RotationForest is not a time series classifier. "
            "A 2d numpy array is required."
        )
    X, y = check_X_y(X, y)

    self._n_jobs = check_n_jobs(self.n_jobs)

    self.n_instances, self.n_atts = X.shape
    self.classes_ = np.unique(y)
    self.n_classes = self.classes_.shape[0]
    for index, classVal in enumerate(self.classes_):
        self._class_dictionary[classVal] = index

    time_limit = self.time_limit_in_minutes * 60
    start_time = time.time()
    train_time = 0

    if self.base_estimator is None:
        self._base_estimator = DecisionTreeClassifier(criterion="entropy")

    # replace missing values with 0 and remove useless attributes
    X = np.nan_to_num(X, False, 0, 0, 0)
    self._useful_atts = ~np.all(X[1:] == X[:-1], axis=0)
    X = X[:, self._useful_atts]

    self._n_atts = X.shape[1]

    # normalise attributes (constant columns were removed above, so the
    # peak-to-peak range of each remaining column is non-zero)
    self._min = X.min(axis=0)
    self._ptp = X.max(axis=0) - self._min
    X = (X - self._min) / self._ptp

    X_cls_split = [X[np.where(y == i)] for i in self.classes_]

    if time_limit > 0:
        # contract mode: grow the ensemble in batches of _n_jobs
        self._n_estimators = 0
        self.estimators_ = []
        self._pcas = []
        self._groups = []
        # FIX: transformed_data must be (re)initialised alongside the other
        # accumulators — it is extended with += below, which would fail (or
        # accumulate stale data on refit) without this reset.
        self.transformed_data = []

        while (
            train_time < time_limit
            and self._n_estimators < self.contract_max_n_estimators
        ):
            fit = Parallel(n_jobs=self._n_jobs)(
                delayed(self._fit_estimator)(
                    X,
                    X_cls_split,
                    y,
                    i,
                )
                for i in range(self._n_jobs)
            )

            estimators, pcas, groups, transformed_data = zip(*fit)

            self.estimators_ += estimators
            self._pcas += pcas
            self._groups += groups
            self.transformed_data += transformed_data

            self._n_estimators += self._n_jobs
            train_time = time.time() - start_time
    else:
        # non-contract mode: _n_estimators assumed set in __init__
        fit = Parallel(n_jobs=self._n_jobs)(
            delayed(self._fit_estimator)(
                X,
                X_cls_split,
                y,
                i,
            )
            for i in range(self._n_estimators)
        )

        self.estimators_, self._pcas, self._groups, self.transformed_data = zip(
            *fit
        )

    self._is_fitted = True
    return self
def _fit(self, X, y=None):
    """Fit the shapelet transform to a specified X and y.

    Randomly samples shapelet candidates (in batches, optionally under a
    time contract), keeps the best-scoring ones per class in bounded heaps,
    removes self-similar and identical shapelets, and stores the surviving
    shapelets with their z-normalised series for use in transform.

    Parameters
    ----------
    X: pandas DataFrame or np.ndarray
        The training input samples.
    y: array-like or list
        The class values for X.

    Returns
    -------
    self : RandomShapeletTransform
        This estimator.
    """
    # this is a few versions away currently, and heaps dont support the
    # replacement
    warnings.simplefilter("ignore", category=NumbaPendingDeprecationWarning)

    self._n_jobs = check_n_jobs(self.n_jobs)

    self.classes_, self._class_counts = np.unique(y, return_counts=True)
    self.n_classes = self.classes_.shape[0]
    # NOTE(review): _class_dictionary assumed initialised to {} in __init__
    for index, classVal in enumerate(self.classes_):
        self._class_dictionary[classVal] = index

    # encode labels to 0..n_classes-1 for indexing the per-class heaps
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    self.n_instances, self.n_dims, self.series_length = X.shape

    # NOTE(review): _max_shapelets/_max_shapelet_length only assigned when
    # the public param is None — presumably copied in __init__ otherwise.
    if self.max_shapelets is None:
        self._max_shapelets = min(10 * self.n_instances, 1000)
    if self.max_shapelet_length is None:
        self._max_shapelet_length = self.series_length

    time_limit = self.time_limit_in_minutes * 60
    start_time = time.time()
    fit_time = 0

    max_shapelets_per_class = self._max_shapelets / self.n_classes
    # one bounded heap of (quality, length, start, dim, instance, class)
    # tuples per class, seeded with a sentinel entry
    shapelets = [[(-1.0, -1, -1, -1, -1, -1)] for _ in range(self.n_classes)]
    n_shapelets_extracted = 0

    if time_limit > 0:
        # contract mode: extract in batches until time or sample cap
        # (_batch_size assumed set elsewhere, e.g. __init__ — verify)
        while (
            fit_time < time_limit
            and n_shapelets_extracted < self.contract_max_n_shapelet_samples
        ):
            candidate_shapelets = Parallel(n_jobs=self._n_jobs)(
                delayed(self._extract_random_shapelet)(
                    X,
                    y,
                    n_shapelets_extracted + i,
                    shapelets,
                    max_shapelets_per_class,
                )
                for i in range(self._batch_size)
            )

            for i, heap in enumerate(shapelets):
                RandomShapeletTransform._merge_shapelets(
                    heap,
                    candidate_shapelets,
                    max_shapelets_per_class,
                    i,
                )

            if self.remove_self_similar:
                for i, heap in enumerate(shapelets):
                    to_keep = RandomShapeletTransform._remove_self_similar_shapelets(
                        heap
                    )
                    shapelets[i] = [n for (n, b) in zip(heap, to_keep) if b]

            n_shapelets_extracted += self._batch_size
            fit_time = time.time() - start_time
    else:
        # sample-count mode: extract exactly _n_shapelet_samples candidates
        while n_shapelets_extracted < self._n_shapelet_samples:
            # final batch may be smaller than _batch_size
            n_shapelets_to_extract = (
                self._batch_size
                if n_shapelets_extracted + self._batch_size
                <= self._n_shapelet_samples
                else self._n_shapelet_samples - n_shapelets_extracted
            )

            candidate_shapelets = Parallel(n_jobs=self._n_jobs)(
                delayed(self._extract_random_shapelet)(
                    X,
                    y,
                    n_shapelets_extracted + i,
                    shapelets,
                    max_shapelets_per_class,
                )
                for i in range(n_shapelets_to_extract)
            )

            for i, heap in enumerate(shapelets):
                RandomShapeletTransform._merge_shapelets(
                    heap,
                    candidate_shapelets,
                    max_shapelets_per_class,
                    i,
                )

            if self.remove_self_similar:
                for i, heap in enumerate(shapelets):
                    to_keep = RandomShapeletTransform._remove_self_similar_shapelets(
                        heap
                    )
                    shapelets[i] = [n for (n, b) in zip(heap, to_keep) if b]

            n_shapelets_extracted += n_shapelets_to_extract

    # flatten the per-class heaps, drop sentinels/zero-quality entries and
    # attach the decoded class label plus the z-normalised subsequence
    self.shapelets = [
        (
            s[0],
            s[1],
            s[2],
            s[3],
            s[4],
            self.classes_[s[5]],
            z_normalise_series(X[s[4], s[3], s[2] : s[2] + s[1]]),
        )
        for class_shapelets in shapelets
        for s in class_shapelets
        if s[0] > 0
    ]
    self.shapelets.sort(reverse=True, key=lambda s: (s[0], s[1], s[2], s[3], s[4]))

    to_keep = RandomShapeletTransform._remove_identical_shapelets(self.shapelets)
    self.shapelets = [n for (n, b) in zip(self.shapelets, to_keep) if b]

    # per shapelet, cache value positions ordered by descending magnitude
    # (used to speed up distance early-abandoning in transform)
    self._sorted_indicies = []
    for s in self.shapelets:
        sabs = np.abs(s[6])
        self._sorted_indicies.append(
            sorted(range(s[1]), reverse=True, key=lambda i: sabs[i])
        )

    return self
def _fit(self, X, y):
    """Fit HIVE-COTE 2.0 to training data.

    Parameters
    ----------
    X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
        The training data.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self :
        Reference to self.

    Notes
    -----
    Changes state by creating a fitted model that updates attributes
    ending in "_" and sets is_fitted flag to True.

    Builds the four HC2 components (STC, DrCIF, Arsenal, TDE) and computes
    a weight for each as (train-estimate accuracy)^4.
    """
    self.n_jobs_ = check_n_jobs(self.n_jobs)

    self.n_classes_ = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # Default values from HC2 paper
    # NOTE(review): _x_params only assigned when the public param is None —
    # when user params are supplied they are presumably copied in __init__;
    # verify (the contracting block below mutates these dicts in place).
    if self.stc_params is None:
        self._stc_params = {"transform_limit_in_minutes": 120}
    if self.drcif_params is None:
        self._drcif_params = {"n_estimators": 500}
    if self.arsenal_params is None:
        self._arsenal_params = {}
    if self.tde_params is None:
        self._tde_params = {}

    # If we are contracting split the contract time between each algorithm
    if self.time_limit_in_minutes > 0:
        # Leave 1/3 for train estimates
        ct = self.time_limit_in_minutes / 6
        self._stc_params["time_limit_in_minutes"] = ct
        self._drcif_params["time_limit_in_minutes"] = ct
        self._arsenal_params["time_limit_in_minutes"] = ct
        self._tde_params["time_limit_in_minutes"] = ct

    # Build STC
    self._stc = ShapeletTransformClassifier(
        **self._stc_params,
        save_transformed_data=True,
        random_state=self.random_state,
        n_jobs=self.n_jobs_,
    )
    self._stc.fit(X, y)

    if self.verbose > 0:
        print("STC ", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

    # Find STC weight using train set estimate
    train_probs = self._stc._get_train_probs(X, y)
    train_preds = self._stc.classes_[np.argmax(train_probs, axis=1)]
    self.stc_weight_ = accuracy_score(y, train_preds) ** 4

    if self.verbose > 0:
        print(  # noqa
            "STC train estimate ",
            datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
        )
        print("STC weight = " + str(self.stc_weight_))  # noqa

    # Build DrCIF
    self._drcif = DrCIF(
        **self._drcif_params,
        save_transformed_data=True,
        random_state=self.random_state,
        n_jobs=self.n_jobs_,
    )
    self._drcif.fit(X, y)

    if self.verbose > 0:
        print("DrCIF ", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

    # Find DrCIF weight using train set estimate
    train_probs = self._drcif._get_train_probs(X, y)
    train_preds = self._drcif.classes_[np.argmax(train_probs, axis=1)]
    self.drcif_weight_ = accuracy_score(y, train_preds) ** 4

    if self.verbose > 0:
        print(  # noqa
            "DrCIF train estimate ",
            datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
        )
        print("DrCIF weight = " + str(self.drcif_weight_))  # noqa

    # Build Arsenal
    self._arsenal = Arsenal(
        **self._arsenal_params,
        save_transformed_data=True,
        random_state=self.random_state,
        n_jobs=self.n_jobs_,
    )
    self._arsenal.fit(X, y)

    if self.verbose > 0:
        print("Arsenal ", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

    # Find Arsenal weight using train set estimate
    train_probs = self._arsenal._get_train_probs(X, y)
    train_preds = self._arsenal.classes_[np.argmax(train_probs, axis=1)]
    self.arsenal_weight_ = accuracy_score(y, train_preds) ** 4

    if self.verbose > 0:
        print(  # noqa
            "Arsenal train estimate ",
            datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
        )
        print("Arsenal weight = " + str(self.arsenal_weight_))  # noqa

    # Build TDE
    self._tde = TemporalDictionaryEnsemble(
        **self._tde_params,
        save_train_predictions=True,
        random_state=self.random_state,
        n_jobs=self.n_jobs_,
    )
    self._tde.fit(X, y)

    if self.verbose > 0:
        print("TDE ", datetime.now().strftime("%H:%M:%S %d/%m/%Y"))  # noqa

    # Find TDE weight using out-of-bag train set estimate
    train_probs = self._tde._get_train_probs(X, y, train_estimate_method="oob")
    train_preds = self._tde.classes_[np.argmax(train_probs, axis=1)]
    self.tde_weight_ = accuracy_score(y, train_preds) ** 4

    if self.verbose > 0:
        print(  # noqa
            "TDE train estimate ",
            datetime.now().strftime("%H:%M:%S %d/%m/%Y"),
        )
        print("TDE weight = " + str(self.tde_weight_))  # noqa