def test_check_X_enforce_min_columns():
    X, y = make_classification_problem(n_columns=2)
    msg = r"columns"
    with pytest.raises(ValueError, match=msg):
        check_X(X, enforce_min_columns=3)

    with pytest.raises(ValueError, match=msg):
        check_X_y(X, y, enforce_min_columns=3)


def test_check_X_enforce_univariate():
    X, y = make_classification_problem(n_columns=2)
    msg = r"univariate"
    with pytest.raises(ValueError, match=msg):
        check_X(X, enforce_univariate=True)

    with pytest.raises(ValueError, match=msg):
        check_X_y(X, y, enforce_univariate=True)


def test_check_enforce_min_instances():
    X, y = make_classification_problem(n_instances=3)
    msg = r"instance"
    with pytest.raises(ValueError, match=msg):
        check_X(X, enforce_min_instances=4)

    with pytest.raises(ValueError, match=msg):
        check_X_y(X, y, enforce_min_instances=4)

    with pytest.raises(ValueError, match=msg):
        check_y(y, enforce_min_instances=4)
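# A minimal usage sketch of the validators exercised by the tests above,
# assuming the same make_classification_problem, check_X and check_X_y
# helpers: when the constraints are satisfied the checks return the
# (possibly coerced) data instead of raising.
def _example_passing_checks():
    X, y = make_classification_problem(n_instances=5, n_columns=1)
    X_checked = check_X(X, enforce_univariate=True, enforce_min_instances=4)
    X_checked, y_checked = check_X_y(X, y, enforce_min_columns=1)
    return X_checked, y_checked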
def _set_oob_score(self, X, y):
    """Compute out-of-bag score."""
    check_X_y(X, y)
    check_X(X, enforce_univariate=True)
    n_classes_ = self.n_classes_
    n_samples = y.shape[0]

    oob_decision_function = []
    oob_score = 0.0
    predictions = [
        np.zeros((n_samples, n_classes_[k])) for k in range(self.n_outputs_)
    ]

    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                   self.max_samples)

    for estimator in self.estimators_:
        final_estimator = estimator.steps[-1][1]
        unsampled_indices = _generate_unsampled_indices(
            final_estimator.random_state, n_samples, n_samples_bootstrap)
        p_estimator = estimator.predict_proba(X.iloc[unsampled_indices, :])

        if self.n_outputs_ == 1:
            p_estimator = [p_estimator]

        for k in range(self.n_outputs_):
            predictions[k][unsampled_indices, :] += p_estimator[k]

    for k in range(self.n_outputs_):
        if (predictions[k].sum(axis=1) == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")

        decision = (predictions[k] /
                    predictions[k].sum(axis=1)[:, np.newaxis])
        oob_decision_function.append(decision)
        oob_score += np.mean(y[:, k] == np.argmax(predictions[k], axis=1),
                             axis=0)

    if self.n_outputs_ == 1:
        self.oob_decision_function_ = oob_decision_function[0]
    else:
        self.oob_decision_function_ = oob_decision_function

    self.oob_score_ = oob_score / self.n_outputs_
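# _generate_unsampled_indices comes from scikit-learn's forest internals and
# is not shown in this excerpt. A rough, hedged sketch of the idea: redraw
# the bootstrap sample that a tree saw (seeded by its random_state) and keep
# the indices that were never drawn, i.e. the out-of-bag cases scored above.
import numpy as np


def _unsampled_indices_sketch(random_state, n_samples, n_samples_bootstrap):
    rng = np.random.RandomState(random_state)
    sampled = rng.randint(0, n_samples, n_samples_bootstrap)
    mask = np.ones(n_samples, dtype=bool)
    mask[sampled] = False
    return np.flatnonzero(mask)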
def fit(self, X, y):
    """Perform a shapelet transform, then build a random forest.

    The contract default for the shapelet transform is 5 hours.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification); this
        classifier has no bespoke method for multivariate classification
        as yet.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True)
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    self.classifier.fit(X, y)
    # self.shapelet_transform.fit(X, y)
    # print("Shapelet Search complete")
    # self.st_X = self.shapelet_transform.transform(X)
    # print("Transform complete")
    # X = np.asarray([a.values for a in X.iloc[:, 0]])
    # self.classifier.fit(X, y)
    # print("Build classifier complete")
    self._is_fitted = True
    return self
def fit(self, X, y):
    """Build the classifier on the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed,
        column 0 is extracted.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True)
    self.X = dataset_properties.positive_dataframe_indices(X)
    self.random_state = check_random_state(self.random_state)
    # setup label encoding
    if self.label_encoder is None:
        self.label_encoder = LabelEncoder()
        y = self.label_encoder.fit_transform(y)
    self.y = y
    self.classes_ = self.label_encoder.classes_
    if self.distance_measure is None:
        if self.get_distance_measure is None:
            self.get_distance_measure = self.setup_distance_measure(self)
        self.distance_measure = self.get_distance_measure(self)
    self.X_exemplar, self.y_exemplar = self.pick_exemplars(self)
    self._is_fitted = True
    return self
def fit(self, X, y): """ Build the classifier on the training set (X, y) ---------- X : array-like or sparse matrix of shape = [n_instances, n_columns] The training input samples. If a Pandas data frame is passed, column 0 is extracted. y : array-like, shape = [n_instances] The class labels. Returns ------- self : object """ X, y = check_X_y(X, y, enforce_univariate=True) self.X = dataset_properties.positive_dataframe_indices(X) self.random_state = check_random_state(self.random_state) if self.find_stump is None: self.find_stump = best_of_n_stumps(self.n_stump_evaluations) # setup label encoding if self.label_encoder is None: self.label_encoder = LabelEncoder() y = self.label_encoder.fit_transform(y) self.y = y self.classes_ = self.label_encoder.classes_ if self.distance_measure is None: if self.get_distance_measure is None: self.get_distance_measure = self.setup_distance_measure(self) self.distance_measure = self.get_distance_measure(self) self.stump = self.find_stump(self) n_branches = len(self.stump.y_exemplar) self.branches = [None] * n_branches if self.depth < self.max_depth: for index in range(n_branches): sub_y = self.stump.y_branches[index] if not self.is_leaf(sub_y): sub_tree = ProximityTree( random_state=self.random_state, get_exemplars=self.get_exemplars, distance_measure=self.distance_measure, setup_distance_measure=self.setup_distance_measure, get_distance_measure=self.get_distance_measure, get_gain=self.get_gain, is_leaf=self.is_leaf, verbosity=self.verbosity, max_depth=self.max_depth, n_jobs=self.n_jobs, ) sub_tree.label_encoder = self.label_encoder sub_tree.depth = self.depth + 1 self.branches[index] = sub_tree sub_X = self.stump.X_branches[index] sub_tree.fit(sub_X, sub_y) self._is_fitted = True return self
def check_and_clean_data(X, y=None, input_checks=True):
    '''
    Performs basic sktime data checks and prepares the train data for input
    to Keras models.

    :param X: the train data
    :param y: the train labels
    :param input_checks: whether to perform the basic sktime checks
    :return: X
    '''
    if input_checks:
        if y is None:
            check_X(X)
        else:
            check_X_y(X, y)

    # want data in form: [instances = n][timepoints = m][dimensions = d]
    if isinstance(X, pd.DataFrame):
        if _is_nested_dataframe(X):
            if X.shape[1] > 1:
                # we have multiple columns, AND each cell contains a series,
                # so this is a multidimensional problem
                X = _multivariate_nested_df_to_array(X)
            else:
                # we have a single column containing a series, treat this as
                # a univariate problem
                X = _univariate_nested_df_to_array(X)
        else:
            # we have multiple columns each containing a primitive, treat as
            # univariate series
            X = _univariate_df_to_array(X)

    if len(X.shape) == 2:
        # add a dimension to make it multivariate with one dimension
        X = X.reshape(
            X.shape[0], X.shape[1], 1
        )  # go from [n][m] to [n][m][d=1]
    return X
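# A minimal, self-contained sketch of the reshaping idea above: converting a
# nested pandas DataFrame (each cell holds a pd.Series) into the
# [instances][timepoints][dimensions] array expected by the Keras models.
# This is an illustrative stand-in for the private helpers used above, and
# it assumes all series share the same length.
import numpy as np
import pandas as pd


def nested_df_to_3d_array(X: pd.DataFrame) -> np.ndarray:
    # stack each column (dimension) into shape [n_instances, n_timepoints]
    per_dim = [np.stack([np.asarray(s) for s in X[col]]) for col in X.columns]
    # combine dimensions last: [n][m][d]
    return np.stack(per_dim, axis=-1)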
def fit(self, X, y):
    X, y = check_X_y(X, y, enforce_univariate=True)

    sfa = self.transformer.fit_transform(X, y)
    self.transformed_data = [series.to_dict() for series in sfa.iloc[:, 0]]

    self.class_vals = y
    self.num_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self.class_dictionary[classVal] = index

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y) using random
    intervals and summary features.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification); this
        classifier has no bespoke method for multivariate classification
        as yet.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)
    X = X.squeeze(1)
    n_instances, self.series_length = X.shape

    rng = check_random_state(self.random_state)

    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    self.n_intervals = int(math.sqrt(self.series_length))
    if self.n_intervals == 0:
        self.n_intervals = 1
    if self.series_length < self.min_interval:
        self.min_interval = self.series_length

    self.intervals_ = [
        _get_intervals(self.n_intervals, self.min_interval,
                       self.series_length, rng)
        for _ in range(self.n_estimators)
    ]

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_estimator)(
            X,
            y,
            self.base_estimator,
            self.intervals_[i],
            self.random_state,
        )
        for i in range(self.n_estimators))

    self._is_fitted = True
    return self
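# The helper _get_intervals is used above but not defined in this excerpt.
# A plausible sketch, assuming it returns an (n_intervals, 2) array of
# [start, end] positions, each at least min_interval long and contained in
# the series; treat this as an illustrative assumption rather than the
# actual sktime implementation.
import numpy as np


def _get_intervals_sketch(n_intervals, min_interval, series_length, rng):
    intervals = np.zeros((n_intervals, 2), dtype=int)
    for i in range(n_intervals):
        start = rng.randint(0, series_length - min_interval + 1)
        length = rng.randint(min_interval, series_length - start + 1)
        intervals[i] = [start, start + length]
    return intervals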
def fit(self, X, y=None): """Fit. Parameters ---------- X : pd.DataFrame nested pandas DataFrame of shape [n_samples, n_columns] y : pd.Series or np.array Target variable Returns ------- self : an instance of self """ # lazy imports to avoid hard dependency from tsfresh.transformers.feature_selector import FeatureSelector # input checks if y is None: raise ValueError( f"{self.__class__.__name__} requires `y` in `fit`.") X, y = check_X_y(X, y, coerce_to_pandas=True) self.extractor_ = TSFreshFeatureExtractor( default_fc_parameters=self.default_fc_parameters, kind_to_fc_parameters=self.kind_to_fc_parameters, chunksize=self.chunksize, n_jobs=self.n_jobs, show_warnings=self.show_warnings, disable_progressbar=self.disable_progressbar, profiling=self.profiling, profiling_filename=self.profiling_filename, profiling_sorting=self.profiling_sorting, ) selection_params = self._get_selection_params() extraction_param = self._get_extraction_params() self.selector_ = FeatureSelector( n_jobs=extraction_param["n_jobs"], chunksize=extraction_param["chunksize"], ml_task=self.ml_task, **selection_params, ) Xt = self.extractor_.fit_transform(X) self.selector_.fit(Xt, y) self._is_fitted = True return self
def _set_oob_score(self, X, y): """ Compute out-of-bag scores.""" X, y = check_X_y(X, y, enforce_univariate=True) n_samples = y.shape[0] predictions = np.zeros((n_samples, self.n_outputs_)) n_predictions = np.zeros((n_samples, self.n_outputs_)) n_samples_bootstrap = _get_n_samples_bootstrap( n_samples, self.max_samples ) for estimator in self.estimators_: final_estimator = estimator.steps[-1][1] unsampled_indices = _generate_unsampled_indices( final_estimator.random_state, n_samples, n_samples_bootstrap) p_estimator = estimator.predict( X[unsampled_indices, :], check_input=False) if self.n_outputs_ == 1: p_estimator = p_estimator[:, np.newaxis] predictions[unsampled_indices, :] += p_estimator n_predictions[unsampled_indices, :] += 1 if (n_predictions == 0).any(): warn("Some inputs do not have OOB scores. " "This probably means too few trees were used " "to compute any reliable oob estimates.") n_predictions[n_predictions == 0] = 1 predictions /= n_predictions self.oob_prediction_ = predictions if self.n_outputs_ == 1: self.oob_prediction_ = \ self.oob_prediction_.reshape((n_samples, )) self.oob_score_ = 0.0 for k in range(self.n_outputs_): self.oob_score_ += r2_score(y[:, k], predictions[:, k]) self.oob_score_ /= self.n_outputs_
def fit(self, X, y):
    if isinstance(X, pd.Series) or isinstance(X, pd.DataFrame):
        X, y = check_X_y(X, y, enforce_univariate=True)
        X = tabularize(X, return_array=True)

    sfa = self.transformer.fit_transform(X, y)
    self.transformed_data = sfa[0]  # .iloc[:, 0]

    self.class_vals = y
    self.num_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self.class_dictionary[classVal] = index

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Train the classifier.

    Parameters
    ----------
    X : pandas DataFrame of training data of shape [n_instances, 1].
    y : list of class labels of shape [n_instances].

    Returns
    -------
    self : the shapeDTW object
    """
    # Perform preprocessing on params.
    if not isinstance(self.shape_descriptor_function, str):
        raise TypeError("shape_descriptor_function must be a 'str'. "
                        "Found '" +
                        type(self.shape_descriptor_function).__name__ +
                        "' instead.")

    X, y = check_X_y(X, y, enforce_univariate=False)

    if self.metric_params is None:
        self.metric_params = {}

    # If the shape descriptor is 'compound',
    # calculate the appropriate weighting_factor
    if self.shape_descriptor_function == "compound":
        self._calculate_weighting_factor_value(X, y)

    # Fit the SlidingWindowSegmenter
    sw = SlidingWindowSegmenter(self.subsequence_length)
    sw.fit(X)
    self.sw = sw

    # Transform the training data.
    X = self._preprocess(X)

    # Fit the kNN classifier
    self.knn = KNeighborsTimeSeriesClassifier(n_neighbors=self.n_neighbors)
    self.knn.fit(X, y)
    self.classes_ = self.knn.classes_

    return self
def fit(self, X, y): """ Build the classifier on the training set (X, y) ---------- X : array-like or sparse matrix of shape = [n_instances, n_columns] The training input samples. If a Pandas data frame is passed, column 0 is extracted. y : array-like, shape = [n_instances] The class labels. Returns ------- self : object """ X, y = check_X_y(X, y, enforce_univariate=True) self.X = dataset_properties.positive_dataframe_indices(X) self.random_state = check_random_state(self.random_state) # setup label encoding if self.label_encoder is None: self.label_encoder = LabelEncoder() y = self.label_encoder.fit_transform(y) self.y = y self.classes_ = self.label_encoder.classes_ if self.distance_measure is None: if self.get_distance_measure is None: self.get_distance_measure = self.setup_distance_measure_getter( self) self.distance_measure = self.get_distance_measure(self) if self.n_jobs > 1 or self.n_jobs < 0: parallel = Parallel(self.n_jobs) self.trees = parallel( delayed(self._fit_tree) (X, y, index, self.random_state.randint(0, self.n_estimators)) for index in range(self.n_estimators)) else: self.trees = [ self._fit_tree(X, y, index, self.random_state.randint(0, self.n_estimators)) for index in range(self.n_estimators) ] self._is_fitted = True return self
def fit(self, X, y):
    """Perform a shapelet transform, then build a random forest.

    The contract default for the shapelet transform is 5 hours.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification); this
        classifier has no bespoke method for multivariate classification
        as yet.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True)

    # if y is a pd.Series then convert to array.
    if isinstance(y, pd.Series):
        y = y.to_numpy()

    # generate pipeline in fit so that the random state can be propagated
    # properly.
    self.classifier_ = Pipeline([
        ('st', ContractedShapeletTransform(
            time_contract_in_mins=self.time_contract_in_mins,
            verbose=False,
            random_state=self.random_state)),
        ('rf', RandomForestClassifier(n_estimators=self.n_estimators,
                                      random_state=self.random_state))
    ])

    self.n_classes_ = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    self.classifier_.fit(X, y)

    self._is_fitted = True
    return self
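# A hedged usage sketch of the same transform-then-classify pattern built in
# the fit above, assuming the ContractedShapeletTransform constructor
# arguments shown there and an sktime-style nested DataFrame X_train with
# labels y_train already in scope.
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline


def build_st_pipeline(time_contract_in_mins=60, n_estimators=100, seed=0):
    return Pipeline([
        ("st", ContractedShapeletTransform(
            time_contract_in_mins=time_contract_in_mins,
            verbose=False,
            random_state=seed)),
        ("rf", RandomForestClassifier(n_estimators=n_estimators,
                                      random_state=seed)),
    ])

# clf = build_st_pipeline()
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)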
def fit(self, X, y): """Build an ensemble of 1-NN classifiers from th training set (X, y), Parameters ---------- X : array-like or sparse matrix of shape = [n_instances, n_columns] The training input samples. If a Pandas data frame is passed, it must have a single column. BOSS not configured to handle multivariate y : array-like, shape = [n_instances] The class labels. Returns ------- self : object """ X, y = check_X_y(X, y, enforce_univariate=True) # Derivative DTW (DDTW) uses the regular DTW algorithm on data that # are transformed into derivatives. # To increase the efficiency of DDTW we can pre-transform the data # into derivatives, and then call the # standard DTW algorithm on it, rather than transforming each series # every time a distance calculation # is made. Please note that using DDTW elsewhere will not benefit # from this speed enhancement if self.distance_measures.__contains__( ddtw_c) or self.distance_measures.__contains__(wddtw_c): der_X = DerivativeSlopeTransformer().fit_transform(X) # reshape X for use with the efficient cython distance measures der_X = np.array( [np.asarray([x]).reshape(len(x), 1) for x in der_X.iloc[:, 0]]) else: der_X = None # reshape X for use with the efficient cython distance measures X = np.array( [np.asarray([x]).reshape(len(x), 1) for x in X.iloc[:, 0]]) self.train_accs_by_classifier = np.zeros(len(self.distance_measures)) self.train_preds_by_classifier = [None] * len(self.distance_measures) self.estimators_ = [None] * len(self.distance_measures) self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0] rand = np.random.RandomState(self.random_state) # The default EE uses all training instances for setting parameters, # and 100 parameter options per # elastic measure. The prop_train_in_param_finding and # prop_of_param_options attributes of this class # can be used to control this however, using less cases to optimise # parameters on the training data # and/or using less parameter options. # # For using less training instances the appropriate number of cases # must be sampled from the data. 
# This is achieved through the use of a deterministic # StratifiedShuffleSplit # # For using less parameter options a RandomizedSearchCV is used in # place of a GridSearchCV param_train_x = None der_param_train_x = None param_train_y = None # If using less cases for parameter optimisation, use the # StratifiedShuffleSplit: if self.proportion_train_in_param_finding < 1: if self.verbose > 0: print( "Restricting training cases for parameter optimisation: ", end="") sss = StratifiedShuffleSplit( n_splits=1, test_size=1 - self.proportion_train_in_param_finding, random_state=rand) for train_index, test_index in sss.split(X, y): param_train_x = X[train_index, :] param_train_y = y[train_index] if der_X is not None: der_param_train_x = der_X[train_index, :] if self.verbose > 0: print("using " + str(len(param_train_x)) + " training cases instead of " + str(len(X)) + " for parameter optimisation") # else, use the full training data for optimising parameters else: if self.verbose > 0: print("Using all training cases for parameter optimisation") param_train_x = X param_train_y = y if der_X is not None: der_param_train_x = der_X self.constituent_build_times = [] if self.verbose > 0: print("Using " + str(100 * self.proportion_of_param_options) + " parameter " "options per " "measure") for dm in range(0, len(self.distance_measures)): this_measure = self.distance_measures[dm] # uses the appropriate training data as required (either full or # smaller sample as per the StratifiedShuffleSplit) param_train_to_use = param_train_x full_train_to_use = X if this_measure is ddtw_c or dm is wddtw_c: param_train_to_use = der_param_train_x full_train_to_use = der_X if this_measure is ddtw_c: this_measure = dtw_c elif this_measure is wddtw_c: this_measure = wdtw_c start_build_time = time.time() if self.verbose > 0: if self.distance_measures[dm] is ddtw_c or \ self.distance_measures[dm] is wddtw_c: print("Currently evaluating " + str(self.distance_measures[dm].__name__) + " (implemented as " + str(this_measure.__name__) + " with pre-transformed derivative data)") else: print("Currently evaluating " + str(self.distance_measures[dm].__name__)) # If 100 parameter options are being considered per measure, # use a GridSearchCV if self.proportion_of_param_options == 1: grid = GridSearchCV( estimator=KNeighborsTimeSeriesClassifier( metric=this_measure, n_neighbors=1, algorithm="brute"), param_grid=ElasticEnsemble._get_100_param_options( self.distance_measures[dm], X), cv=LeaveOneOut(), scoring='accuracy', verbose=self.verbose) grid.fit(param_train_to_use, param_train_y) # Else, used RandomizedSearchCV to randomly sample parameter # options for each measure else: grid = RandomizedSearchCV( estimator=KNeighborsTimeSeriesClassifier( metric=this_measure, n_neighbors=1, algorithm="brute"), param_distributions=ElasticEnsemble._get_100_param_options( self.distance_measures[dm], X), cv=LeaveOneOut(), scoring='accuracy', n_iter=100 * self.proportion_of_param_options, random_state=rand, verbose=self.verbose) grid.fit(param_train_to_use, param_train_y) # once the best parameter option has been estimated on the # training data, perform a final pass with this parameter option # to get the individual predictions with cross_cal_predict ( # Note: optimisation potentially possible here if a GridSearchCV # was used previously. 
TO-DO: determine how to extract # predictions for the best param option from GridSearchCV) best_model = KNeighborsTimeSeriesClassifier( algorithm="brute", n_neighbors=1, metric=this_measure, metric_params=grid.best_params_['metric_params']) preds = cross_val_predict(best_model, full_train_to_use, y, cv=LeaveOneOut()) acc = accuracy_score(y, preds) if self.verbose > 0: print("Training accuracy for " + str(self.distance_measures[dm].__name__) + ": " + str(acc) + " (with parameter setting: " + str(grid.best_params_['metric_params']) + ")") # Finally, reset the classifier for this measure and parameter # option, ready to be called for test classification best_model = KNeighborsTimeSeriesClassifier( algorithm="brute", n_neighbors=1, metric=this_measure, metric_params=grid.best_params_['metric_params']) best_model.fit(full_train_to_use, y) end_build_time = time.time() self.constituent_build_times.append( str(end_build_time - start_build_time)) self.estimators_[dm] = best_model self.train_accs_by_classifier[dm] = acc self.train_preds_by_classifier[dm] = preds self._is_fitted = True return self
def fit(self, X, y): """Fit the model using X as training data and y as target values Parameters ---------- X : sktime-format pandas dataframe with shape([n_cases,n_dimensions]), or numpy ndarray with shape([n_cases,n_readings,n_dimensions]) y : {array-like, sparse matrix} Target values of shape = [n_samples] """ X, y = check_X_y(X, y, enforce_univariate=False) y = np.asarray(y) X = nested_to_3d_numpy(X) check_classification_targets(y) # print(X) # if internal cv is desired, the relevant flag forces a grid search # to evaluate the possible values, # find the best, and then set this classifier's params to match if self._cv_for_params: grid = GridSearchCV( estimator=KNeighborsTimeSeriesClassifier(metric=self.metric, n_neighbors=1, algorithm="brute"), param_grid=self._param_matrix, cv=LeaveOneOut(), scoring='accuracy' ) grid.fit(X, y) self.metric_params = grid.best_params_['metric_params'] if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: if y.ndim != 1: warnings.warn("A column-vector y was passed when a 1d array " "was expected. Please change the shape of y to " "(n_samples, ), for example using ravel().", DataConversionWarning, stacklevel=2) self.outputs_2d_ = False y = y.reshape((-1, 1)) else: self.outputs_2d_ = True self.classes_ = [] self._y = np.empty(y.shape, dtype=np.int) for k in range(self._y.shape[1]): classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes) if not self.outputs_2d_: self.classes_ = self.classes_[0] self._y = self._y.ravel() if hasattr(check_array, '__wrapped__'): temp = check_array.__wrapped__.__code__ check_array.__wrapped__.__code__ = _check_array_ts.__code__ else: temp = check_array.__code__ check_array.__code__ = _check_array_ts.__code__ fx = self._fit(X) if hasattr(check_array, '__wrapped__'): check_array.__wrapped__.__code__ = temp else: check_array.__code__ = temp self._is_fitted = True return fx
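# The fit above temporarily swaps the __code__ object of scikit-learn's
# check_array (or its __wrapped__ target) for a time-series friendly variant
# so that 3-D arrays are not rejected by the underlying k-NN machinery. A
# hedged, generic sketch of that save/swap/restore pattern as a context
# manager; _check_array_ts is the permissive replacement assumed from the
# code above.
from contextlib import contextmanager


@contextmanager
def patched_code(func, replacement):
    target = getattr(func, "__wrapped__", func)
    saved = target.__code__
    target.__code__ = replacement.__code__
    try:
        yield
    finally:
        target.__code__ = saved

# usage sketch:
# with patched_code(check_array, _check_array_ts):
#     fx = self._fit(X)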
def test_check_X_bad_input_args(X):
    with pytest.raises(ValueError):
        check_X(X)

    with pytest.raises(ValueError):
        check_X_y(X, y)
def fit(self, X, y): """Build a WEASEL+MUSE classifiers from the training set (X, y), Parameters ---------- X : nested pandas DataFrame of shape [n_instances, 1] Nested dataframe with univariate time-series in cells. y : array-like, shape = [n_instances] The class labels. Returns ------- self : object """ X, y = check_X_y(X, y, coerce_to_pandas=True) y = np.asarray(y) # add first order differences in each dimension to TS if self.use_first_order_differences: X = self.add_first_order_differences(X) # Window length parameter space dependent on series length self.col_names = X.columns rng = check_random_state(self.random_state) self.n_dims = len(self.col_names) self.highest_dim_bit = (math.ceil(math.log2(self.n_dims))) + 1 self.highest_bits = np.zeros(self.n_dims) self.SFA_transformers = [[] for _ in range(self.n_dims)] # the words of all dimensions and all time series all_words = [dict() for _ in range(X.shape[0])] # On each dimension, perform SFA for ind, column in enumerate(self.col_names): X_dim = X[[column]] X_dim = from_nested_to_3d_numpy(X_dim) series_length = X_dim.shape[ -1] # TODO compute minimum over all ts ? # increment window size in steps of 'win_inc' win_inc = self.compute_window_inc(series_length) self.max_window = int(min(series_length, self.max_window)) self.window_sizes.append( list(range(self.min_window, self.max_window, win_inc))) self.highest_bits[ind] = math.ceil(math.log2(self.max_window)) + 1 for window_size in self.window_sizes[ind]: transformer = SFA( word_length=rng.choice(self.word_lengths), alphabet_size=self.alphabet_size, window_size=window_size, norm=rng.choice(self.norm_options), anova=self.anova, binning_method=rng.choice(self.binning_strategies), bigrams=self.bigrams, remove_repeat_words=False, lower_bounding=False, save_words=False, ) sfa_words = transformer.fit_transform(X_dim, y) self.SFA_transformers[ind].append(transformer) bag = sfa_words[0] # .iloc[:, 0] # chi-squared test to keep only relevant features relevant_features = {} apply_chi_squared = self.chi2_threshold > 0 if apply_chi_squared: bag_vec = DictVectorizer(sparse=False).fit_transform(bag) chi2_statistics, p = chi2(bag_vec, y) relevant_features = np.where( chi2_statistics >= self.chi2_threshold)[0] # merging bag-of-patterns of different window_sizes # to single bag-of-patterns with prefix indicating # the used window-length highest = np.int32(self.highest_bits[ind]) for j in range(len(bag)): for (key, value) in bag[j].items(): # chi-squared test if (not apply_chi_squared) or (key in relevant_features): # append the prefices to the words to # distinguish between window-sizes word = MUSE.shift_left(key, highest, ind, self.highest_dim_bit, window_size) all_words[j][word] = value self.clf = make_pipeline( DictVectorizer(sparse=False), StandardScaler(with_mean=True, copy=False), LogisticRegression( max_iter=5000, solver="liblinear", dual=True, # class_weight="balanced", penalty="l2", random_state=self.random_state, ), ) self.clf.fit(all_words, y) self._is_fitted = True return self
def fit(self, X, y=None): """A method to fit the shapelet transform to a specified X and y Parameters ---------- X: pandas DataFrame The training input samples. y: array-like or list The class values for X Returns ------- self : FullShapeletTransform This estimator """ X, y = check_X_y(X, y, enforce_univariate=True) # if y is a pd.series then convert to array. if isinstance(y, pd.Series): y = y.to_numpy() if type( self) is ContractedShapeletTransform and \ self.time_contract_in_mins <= 0: raise ValueError( "Error: time limit cannot be equal to or less than 0") X_lens = np.array([ len(X.iloc[r, 0]) for r in range(len(X)) ]) # note, assumes all dimensions of a case are the same # length. A shapelet would not be well defined if indices do not match! X = np.array([[X.iloc[r, c].values for c in range(len(X.columns))] for r in range(len(X)) ]) # may need to pad with nans here for uneq length, # look at later num_ins = len(y) distinct_class_vals = \ class_distribution(np.asarray(y).reshape(-1, 1))[0][0] candidates_evaluated = 0 if type(self) is _RandomEnumerationShapeletTransform: num_series_to_visit = min(self.num_cases_to_sample, len(y)) else: num_series_to_visit = num_ins shapelet_heaps_by_class = { i: ShapeletPQ() for i in distinct_class_vals } self._random_state = check_random_state(self.random_state) # Here we establish the order of cases to sample. We need to sample # x cases and y shapelets from each (where x = num_cases_to_sample # and y = num_shapelets_to_sample_per_case). We could simply sample # x cases without replacement and y shapelets from each case, but # the idea is that if we are using a time contract we may extract # all y shapelets from each x candidate and still have time remaining. # Therefore, if we get a list of the indices of the series and # shuffle them appropriately, we can go through the list again and # extract # another y shapelets from each series (if we have time). # We also want to ensure that we visit all classes so we will visit # in round-robin order. Therefore, the code below extracts the indices # of all series by class, shuffles the indices for each class # independently, and then combines them in alternating order. This # results in # a shuffled list of indices that are in alternating class order ( # e.g. 1,2,3,1,2,3,1,2,3,1...) def _round_robin(*iterables): sentinel = object() return (a for x in zip_longest(*iterables, fillvalue=sentinel) for a in x if a != sentinel) case_ids_by_class = { i: np.where(y == i)[0] for i in distinct_class_vals } # if transform is random/contract then shuffle the data initially # when determining which cases to visit if type(self) is _RandomEnumerationShapeletTransform or type( self) is ContractedShapeletTransform: for i in range(len(distinct_class_vals)): self._random_state.shuffle( case_ids_by_class[distinct_class_vals[i]]) num_train_per_class = { i: len(case_ids_by_class[i]) for i in case_ids_by_class } round_robin_case_order = _round_robin( *[list(v) for k, v in case_ids_by_class.items()]) cases_to_visit = [(i, y[i]) for i in round_robin_case_order] # this dictionary will be used to store all possible starting # positions and shapelet lengths for a give series length. This # is because we enumerate all possible candidates and sample without # replacement when assessing a series. 
If we have two series # of the same length then they will obviously have the same valid # shapelet starting positions and lengths (especially in standard # datasets where all series are equal length) so it makes sense to # store the possible candidates and reuse, rather than # recalculating each time # Initially the dictionary will be empty, and each time a new series # length is seen the dict will be updated. Next time that length # is used the dict will have an entry so can simply reuse possible_candidates_per_series_length = {} # a flag to indicate if extraction should stop (contract has ended) time_finished = False # max time calculating a shapelet # for timing the extraction when contracting start_time = time.time() def time_taken(): return time.time() - start_time max_time_calc_shapelet = -1 time_last_shapelet = time_taken() # for every series case_idx = 0 while case_idx < len(cases_to_visit): series_id = cases_to_visit[case_idx][0] this_class_val = cases_to_visit[case_idx][1] # minus 1 to remove this candidate from sums binary_ig_this_class_count = num_train_per_class[this_class_val] - 1 binary_ig_other_class_count = (num_ins - binary_ig_this_class_count - 1) if self.verbose: if type(self) == _RandomEnumerationShapeletTransform: print("visiting series: " + str(series_id) + " (#" + str(case_idx + 1) + "/" + str(num_series_to_visit) + ")") else: print("visiting series: " + str(series_id) + " (#" + str(case_idx + 1) + ")") this_series_len = len(X[series_id][0]) # The bound on possible shapelet lengths will differ # series-to-series if using unequal length data. # However, shapelets cannot be longer than the series, so set to # the minimum of the series length # and max shapelet length (which is inf by default) if self.max_shapelet_length == -1: this_shapelet_length_upper_bound = this_series_len else: this_shapelet_length_upper_bound = min( this_series_len, self.max_shapelet_length) # all possible start and lengths for shapelets within this # series (calculates if series length is new, a simple look-up # if not) # enumerate all possible candidate starting positions and lengths. # First, try to reuse if they have been calculated for a series # of the same length before. 
candidate_starts_and_lens = \ possible_candidates_per_series_length.get( this_series_len) # else calculate them for this series length and store for # possible use again if candidate_starts_and_lens is None: candidate_starts_and_lens = [ [start, length] for start in range( 0, this_series_len - self.min_shapelet_length + 1) for length in range(self.min_shapelet_length, this_shapelet_length_upper_bound + 1) if start + length <= this_series_len ] possible_candidates_per_series_length[ this_series_len] = candidate_starts_and_lens # default for full transform candidates_to_visit = candidate_starts_and_lens num_candidates_per_case = len(candidate_starts_and_lens) # limit search otherwise: if hasattr(self, "num_candidates_to_sample_per_case"): num_candidates_per_case = min( self.num_candidates_to_sample_per_case, num_candidates_per_case) cand_idx = list( self._random_state.choice(list( range(0, len(candidate_starts_and_lens))), num_candidates_per_case, replace=False)) candidates_to_visit = [ candidate_starts_and_lens[x] for x in cand_idx ] for candidate_idx in range(num_candidates_per_case): # if shapelet heap for this class is not full yet, set entry # criteria to be the predetermined IG threshold ig_cutoff = self.predefined_ig_rejection_level # otherwise if we have max shapelets already, set the # threshold as the IG of the current 'worst' shapelet we have if shapelet_heaps_by_class[ this_class_val].get_size() >= \ self.max_shapelets_to_store_per_class: ig_cutoff = max( shapelet_heaps_by_class[this_class_val].peek()[0], ig_cutoff) cand_start_pos = candidates_to_visit[candidate_idx][0] cand_len = candidates_to_visit[candidate_idx][1] candidate = ShapeletTransform.zscore( X[series_id][:, cand_start_pos:cand_start_pos + cand_len]) # now go through all other series and get a distance from # the candidate to each orderline = [] # initialise here as copy, decrease the new val each time we # evaluate a comparison series num_visited_this_class = 0 num_visited_other_class = 0 candidate_rejected = False for comparison_series_idx in range(len(cases_to_visit)): i = cases_to_visit[comparison_series_idx][0] if y[i] != cases_to_visit[comparison_series_idx][1]: raise ValueError("class match sanity test broken") if i == series_id: # don't evaluate candidate against own series continue if y[i] == this_class_val: num_visited_this_class += 1 binary_class_identifier = 1 # positive for this class else: num_visited_other_class += 1 binary_class_identifier = -1 # negative for any # other class bsf_dist = np.inf start_left = cand_start_pos start_right = cand_start_pos + 1 if X_lens[i] == cand_len: start_left = 0 start_right = 0 for num_cals in range( max(1, int(np.ceil( (X_lens[i] - cand_len) / 2)))): # max # used to force iteration where series len == # candidate len if start_left < 0: start_left = X_lens[i] - 1 - cand_len comparison = ShapeletTransform.zscore( X[i][:, start_left:start_left + cand_len]) dist_left = np.linalg.norm(candidate - comparison) bsf_dist = min(dist_left * dist_left, bsf_dist) # for odd lengths if start_left == start_right: continue # right if start_right == X_lens[i] - cand_len + 1: start_right = 0 comparison = ShapeletTransform.zscore( X[i][:, start_right:start_right + cand_len]) dist_right = np.linalg.norm(candidate - comparison) bsf_dist = min(dist_right * dist_right, bsf_dist) start_left -= 1 start_right += 1 orderline.append((bsf_dist, binary_class_identifier)) # sorting required after each add for early IG abandon. 
# timsort should be efficient as array is almost in # order - insertion-sort like behaviour in this case. # Can't use heap as need to traverse in order multiple # times, not just access root orderline.sort() if len(orderline) > 2: ig_upper_bound = \ ShapeletTransform.calc_early_binary_ig( orderline, num_visited_this_class, num_visited_other_class, binary_ig_this_class_count - num_visited_this_class, binary_ig_other_class_count - num_visited_other_class) # print("upper: "+str(ig_upper_bound)) if ig_upper_bound <= ig_cutoff: candidate_rejected = True break candidates_evaluated += 1 if self.verbose > 3 and candidates_evaluated % 100 == 0: print("candidates evaluated: " + str(candidates_evaluated)) # only do if candidate was not rejected if candidate_rejected is False: final_ig = ShapeletTransform.calc_binary_ig( orderline, binary_ig_this_class_count, binary_ig_other_class_count) accepted_candidate = Shapelet(series_id, cand_start_pos, cand_len, final_ig, candidate) # add to min heap to store shapelets for this class shapelet_heaps_by_class[this_class_val].push( accepted_candidate) # informal, but extra 10% allowance for self similar later if shapelet_heaps_by_class[ this_class_val].get_size() > \ self.max_shapelets_to_store_per_class * 3: shapelet_heaps_by_class[this_class_val].pop() # Takes into account the use of the MAX shapelet calculation # time to not exceed the time_limit (not exact, but likely a # good guess). if hasattr(self, 'time_contract_in_mins') and \ self.time_contract_in_mins \ > 0: time_now = time_taken() time_this_shapelet = (time_now - time_last_shapelet) if time_this_shapelet > max_time_calc_shapelet: max_time_calc_shapelet = time_this_shapelet print(max_time_calc_shapelet) time_last_shapelet = time_now # add a little 1% leeway to the timing incase one run was slightly faster than # another based on the CPU. time_in_seconds = self.time_contract_in_mins * 60 max_shapelet_time_percentage = (max_time_calc_shapelet / 100.0) * 0.75 if (time_now + max_shapelet_time_percentage) > \ time_in_seconds: if self.verbose > 0: print("No more time available! It's been {0:02d}:{" "1:02}".format( int(round(time_now / 60, 3)), int((round(time_now / 60, 3) - int(round(time_now / 60, 3))) * 60))) time_finished = True break else: if self.verbose > 0: if candidate_rejected is False: print( "Candidate finished. {0:02d}:{1:02} " "remaining".format( int( round( self.time_contract_in_mins - time_now / 60, 3)), int((round( self.time_contract_in_mins - time_now / 60, 3) - int( round( (self.time_contract_in_mins - time_now) / 60, 3))) * 60))) else: print( "Candidate rejected. {0:02d}:{1:02} " "remaining".format( int( round((self.time_contract_in_mins - time_now) / 60, 3)), int((round( (self.time_contract_in_mins - time_now) / 60, 3) - int( round( (self.time_contract_in_mins - time_now) / 60, 3))) * 60))) # stopping condition: in case of iterative transform (i.e. # num_cases_to_sample have been visited) # in case of contracted transform (i.e. 
time # limit has been reached) case_idx += 1 if case_idx >= num_series_to_visit: if hasattr(self, 'time_contract_in_mins') and time_finished is not \ True: case_idx = 0 elif case_idx >= num_series_to_visit or time_finished: if self.verbose > 0: print("Stopping search") break # remove self similar here # for each class value # get list of shapelets # sort by quality # remove self similar self.shapelets = [] for class_val in distinct_class_vals: by_class_descending_ig = sorted( shapelet_heaps_by_class[class_val].get_array(), key=itemgetter(0), reverse=True) if self.remove_self_similar and len(by_class_descending_ig) > 0: by_class_descending_ig = \ ShapeletTransform.remove_self_similar_shapelets( by_class_descending_ig) else: # need to extract shapelets from tuples by_class_descending_ig = [x[2] for x in by_class_descending_ig] # if we have more than max_shapelet_per_class, trim to that # amount here if len(by_class_descending_ig) > \ self.max_shapelets_to_store_per_class: max_n = self.max_shapelets_to_store_per_class by_class_descending_ig = by_class_descending_ig[:max_n] self.shapelets.extend(by_class_descending_ig) # final sort so that all shapelets from all classes are in # descending order of information gain self.shapelets.sort(key=lambda x: x.info_gain, reverse=True) self.is_fitted_ = True # warn the user if fit did not produce any valid shapelets if len(self.shapelets) == 0: warnings.warn( "No valid shapelets were extracted from this dataset and " "calling the transform method " "will raise an Exception. Please re-fit the transform with " "other data and/or " "parameter options.") self._is_fitted = True return self
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y) using random
    intervals and summary features.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification); this
        classifier has no bespoke method for multivariate classification
        as yet.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True)
    X = tabularize(X, return_array=True)
    n_instances, self.series_length = X.shape
    rng = check_random_state(self.random_state)

    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    self.n_intervals = int(math.sqrt(self.series_length))
    if self.n_intervals == 0:
        self.n_intervals = 1
    if self.series_length < self.min_interval:
        self.min_interval = self.series_length

    self.intervals = np.zeros((self.n_estimators, self.n_intervals, 2),
                              dtype=int)

    for i in range(self.n_estimators):
        transformed_x = np.empty(shape=(3 * self.n_intervals, n_instances))
        # Find the random intervals for classifier i and concatenate
        # features
        for j in range(self.n_intervals):
            self.intervals[i][j][0] = rng.randint(self.series_length -
                                                  self.min_interval)
            length = rng.randint(self.series_length -
                                 self.intervals[i][j][0] - 1)
            if length < self.min_interval:
                length = self.min_interval
            self.intervals[i][j][1] = self.intervals[i][j][0] + length
            # Transforms here, just hard coding it, so not configurable
            means = np.mean(
                X[:, self.intervals[i][j][0]:self.intervals[i][j][1]],
                axis=1)
            std_dev = np.std(
                X[:, self.intervals[i][j][0]:self.intervals[i][j][1]],
                axis=1)
            slope = self._lsq_fit(
                X[:, self.intervals[i][j][0]:self.intervals[i][j][1]])
            transformed_x[3 * j] = means
            transformed_x[3 * j + 1] = std_dev
            transformed_x[3 * j + 2] = slope

        tree = clone(self.base_estimator)
        tree.set_params(**{"random_state": self.random_state})
        transformed_x = transformed_x.T
        tree.fit(transformed_x, y)
        self.classifiers.append(tree)

    self._is_fitted = True
    return self
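# _lsq_fit is referenced above but not shown in this excerpt. A hedged sketch
# of a least-squares slope over an interval: fit a straight line to each row
# of the interval and return the fitted gradients, which is what the "slope"
# summary feature above is assumed to capture.
import numpy as np


def _lsq_fit_sketch(interval_x):
    # interval_x has shape (n_instances, interval_length)
    t = np.arange(interval_x.shape[1])
    # polyfit over all rows at once; row 0 of the coefficients holds slopes
    slopes = np.polyfit(t, interval_x.T, deg=1)[0]
    return slopes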
def fit(self, X, y, sample_weight=None): """ Build a forest of trees from the training set (X, y). Parameters ---------- X : array-like or sparse matrix of shape (n_samples, n_features) The training input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csc_matrix``. y : array-like of shape (n_samples,) or (n_samples, n_outputs) The target values (class labels in classification, real numbers in regression). sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. Returns ------- self : object """ X, y = check_X_y(X, y, enforce_univariate=True) # Validate or convert input data if sample_weight is not None: sample_weight = check_array(sample_weight, ensure_2d=False) if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. X.sort_indices() # Remap output self.n_columns = X.shape[1] self.n_features_ = X.shape[1] if X.ndim == 2 else 1 y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: warn( "A column-vector y was passed when a 1d array was" " expected. Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, stacklevel=2) if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs # [:, np.newaxis] that does not. y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] y, expanded_class_weight = self._validate_y_class_weight(y) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) if expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight # Get bootstrap sample size n_samples_bootstrap = _get_n_samples_bootstrap( n_samples=X.shape[0], max_samples=self.max_samples) # Check parameters self._validate_estimator() if not self.bootstrap and self.oob_score: raise ValueError("Out of bag estimation only available" " if bootstrap=True") random_state = check_random_state(self.random_state) if not self.warm_start or not hasattr(self, "estimators_"): # Free allocated memory, if any self.estimators_ = [] n_more_estimators = self.n_estimators - len(self.estimators_) if n_more_estimators < 0: raise ValueError('n_estimators=%d must be larger or equal to ' 'len(estimators_)=%d when warm_start==True' % (self.n_estimators, len(self.estimators_))) elif n_more_estimators == 0: warn("Warm-start fitting without increasing n_estimators does not " "fit new trees.") else: if self.warm_start and len(self.estimators_) > 0: # We draw from the random state to get the random state we # would have got if we hadn't used a warm_start. random_state.randint(MAX_INT, size=len(self.estimators_)) trees = [ self._make_estimator(append=False, random_state=random_state) for i in range(n_more_estimators) ] # Parallel loop: for standard random forests, the threading # backend is preferred as the Cython code for fitting the trees # is internally releasing the Python GIL making threading more # efficient than multiprocessing in that case. # However, in this case,for fitting pipelines in parallel, # multiprocessing is more efficient. 
trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)( delayed(_parallel_build_trees)( t, self, X, y, sample_weight, i, len(trees), verbose=self.verbose, class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap) for i, t in enumerate(trees)) # Collect newly grown trees self.estimators_.extend(trees) if self.oob_score: self._set_oob_score(X, y) # Decapsulate classes_ attributes if hasattr(self, "classes_") and self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] self._is_fitted = True return self
def fit(self, X, y): """Build a WEASEL classifiers from the training set (X, y), Parameters ---------- X : nested pandas DataFrame of shape [n_instances, 1] Nested dataframe with univariate time-series in cells. y : array-like, shape = [n_instances] The class labels. Returns ------- self : object """ X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True) # Window length parameter space dependent on series length self.n_instances, self.series_length = X.shape[0], X.shape[-1] win_inc = self.compute_window_inc() self.max_window = int(min(self.series_length, self.max_window)) self.window_sizes = list( range(self.min_window, self.max_window, win_inc)) self.highest_bit = (math.ceil(math.log2(self.max_window))) + 1 rng = check_random_state(self.random_state) all_words = [dict() for x in range(len(X))] for window_size in self.window_sizes: transformer = SFA( word_length=rng.choice(self.word_lengths), alphabet_size=self.alphabet_size, window_size=window_size, norm=rng.choice(self.norm_options), anova=self.anova, # levels=rng.choice([1, 2, 3]), binning_method=self.binning_strategy, bigrams=self.bigrams, remove_repeat_words=False, lower_bounding=False, save_words=False, ) sfa_words = transformer.fit_transform(X, y) self.SFA_transformers.append(transformer) bag = sfa_words[0] # .iloc[:, 0] # chi-squared test to keep only relevant features relevant_features = {} apply_chi_squared = self.chi2_threshold > 0 if apply_chi_squared: bag_vec = DictVectorizer(sparse=False).fit_transform(bag) chi2_statistics, p = chi2(bag_vec, y) relevant_features = np.where( chi2_statistics >= self.chi2_threshold)[0] # merging bag-of-patterns of different window_sizes # to single bag-of-patterns with prefix indicating # the used window-length for j in range(len(bag)): for (key, value) in bag[j].items(): # chi-squared test if (not apply_chi_squared) or (key in relevant_features): # append the prefices to the words to # distinguish between window-sizes if isinstance(key, tuple): word = (((key[0] << self.highest_bit) | key[1]) << 3) | window_size else: # word = ((key << self.highest_bit) << 3) \ # | window_size word = WEASEL.shift_left(key, self.highest_bit, window_size) all_words[j][word] = value self.clf = make_pipeline( DictVectorizer(sparse=False), StandardScaler(with_mean=True, copy=False), LogisticRegression( max_iter=5000, solver="liblinear", dual=True, # class_weight="balanced", penalty="l2", random_state=self.random_state, ), ) self.clf.fit(all_words, y) self._is_fitted = True return self
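# WEASEL.shift_left is used above to prefix each SFA word with the window
# size so that words from different window lengths stay distinct in the
# merged bag-of-patterns. A hedged sketch of that bit packing, assuming the
# commented-out variant in the loop above reflects its behaviour.
def shift_left_sketch(key, highest_bit, window_size):
    # reserve `highest_bit` bits for the word prefix, then OR in the window
    return ((key << highest_bit) << 3) | window_size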
def fit(self, X, y): """Build a forest of trees from the training set (X, y) using random intervals and spectral features Parameters ---------- X : array-like or sparse matrix of shape = [n_instances, series_length] or shape = [n_instances,n_columns] The training input samples. If a Pandas data frame is passed it must have a single column (i.e. univariate classification. RISE has no bespoke method for multivariate classification as yet. y : array-like, shape = [n_instances] The class labels. Returns ------- self : object """ X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True) X = X.squeeze(1) n_instances, self.series_length = X.shape rng = check_random_state(self.random_state) self.estimators_ = [] self.n_classes = np.unique(y).shape[0] self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0] self.intervals = np.zeros((self.n_estimators, 2), dtype=int) self.intervals[0][0] = 0 self.intervals[0][1] = self.series_length for i in range(1, self.n_estimators): self.intervals[i][0] = rng.randint(self.series_length - self.min_interval) self.intervals[i][1] = rng.randint( self.intervals[i][0] + self.min_interval, self.series_length) # Check lag against global properties self.acf_lag_ = self.acf_lag if self.acf_lag > self.series_length - self.acf_min_values: self.acf_lag_ = self.series_length - self.acf_min_values if self.acf_lag < 0: self.acf_lag_ = 1 self.lags = np.zeros(self.n_estimators, dtype=int) for i in range(0, self.n_estimators): temp_lag = self.acf_lag_ if (temp_lag > self.intervals[i][1] - self.intervals[i][0] - self.acf_min_values): temp_lag = (self.intervals[i][1] - self.intervals[i][0] - self.acf_min_values) if temp_lag < 0: temp_lag = 1 self.lags[i] = int(temp_lag) acf_x = np.empty(shape=(n_instances, self.lags[i])) ps_len = (self.intervals[i][1] - self.intervals[i][0]) / 2 ps_x = np.empty(shape=(n_instances, int(ps_len))) for j in range(0, n_instances): acf_x[j] = acf(X[j, self.intervals[i][0]:self.intervals[i][1]], temp_lag) ps_x[j] = ps(X[j, self.intervals[i][0]:self.intervals[i][1]]) transformed_x = np.concatenate((acf_x, ps_x), axis=1) # transformed_x=acf_x tree = clone(self.base_estimator) # set random state, but not the same, so that estimators vary tree.set_params( **{"random_state": rng.randint(np.iinfo(np.int32).max)}) tree.fit(transformed_x, y) self.estimators_.append(tree) self._is_fitted = True return self
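# The acf and ps helpers used above are not part of this excerpt. A hedged
# sketch of both spectral features for a single interval: acf(x, max_lag) is
# assumed to return autocorrelations at lags 1..max_lag, and ps(x) a power
# spectrum of the interval; exact lengths and conventions may differ from the
# sktime implementation.
import numpy as np


def acf_sketch(x, max_lag):
    x = np.asarray(x, dtype=float)
    x = x - x.mean()
    denom = np.sum(x * x)
    return np.array([
        np.sum(x[:-lag] * x[lag:]) / denom if denom > 0 else 0.0
        for lag in range(1, max_lag + 1)
    ])


def ps_sketch(x):
    # squared magnitude of the real FFT as a simple power-spectrum feature
    return np.abs(np.fft.rfft(x)) ** 2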
def fit(self, X, y): """Build an ensemble of individual TDE classifiers from the training set (X,y), through randomising over the parameter space to a set number of times then selecting new parameters using Gaussian processes Parameters ---------- X : nested pandas DataFrame of shape [n_instances, 1] Nested dataframe with univariate time-series in cells. y : array-like, shape = [n_instances] The class labels. Returns ------- self : object """ X, y = check_X_y(X, y, enforce_univariate=True) y = y.values if isinstance(y, pd.Series) else y self.time_limit = self.time_limit * 60 self.n_instances, self.series_length = X.shape[0], len(X.iloc[0, 0]) self.n_classes = np.unique(y).shape[0] self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0] for index, classVal in enumerate(self.classes_): self.class_dictionary[classVal] = index self.classifiers = [] self.weights = [] self.prev_parameters_x = [] self.prev_parameters_y = [] # Window length parameter space dependent on series length max_window_searches = self.series_length / 4 max_window = int(self.series_length * self.max_win_len_prop) win_inc = int((max_window - self.min_window) / max_window_searches) if win_inc < 1: win_inc = 1 possible_parameters = self._unique_parameters(max_window, win_inc) num_classifiers = 0 start_time = time.time() train_time = 0 subsample_size = int(self.n_instances * 0.7) lowest_acc = 0 lowest_acc_idx = 0 if self.time_limit > 0: self.n_parameter_samples = 0 rng = check_random_state(self.random_state) while (train_time < self.time_limit or num_classifiers < self.n_parameter_samples) and len(possible_parameters) > 0: if num_classifiers < self.randomly_selected_params: parameters = possible_parameters.pop( rng.randint(0, len(possible_parameters))) else: gp = GaussianProcessRegressor(random_state=self.random_state) gp.fit(self.prev_parameters_x, self.prev_parameters_y) preds = gp.predict(possible_parameters) parameters = possible_parameters.pop( rng.choice(np.flatnonzero(preds == preds.max()))) subsample = rng.choice(self.n_instances, size=subsample_size, replace=False) X_subsample = X.iloc[subsample, :] y_subsample = y[subsample] tde = IndividualTDE(*parameters, alphabet_size=self.alphabet_size, random_state=self.random_state) tde.fit(X_subsample, y_subsample) tde.accuracy = self._individual_train_acc(tde, y_subsample, subsample_size, lowest_acc) weight = math.pow(tde.accuracy, 4) if num_classifiers < self.max_ensemble_size: if tde.accuracy < lowest_acc: lowest_acc = tde.accuracy lowest_acc_idx = num_classifiers self.weights.append(weight) self.classifiers.append(tde) elif tde.accuracy > lowest_acc: self.weights[lowest_acc_idx] = weight self.classifiers[lowest_acc_idx] = tde lowest_acc, lowest_acc_idx = self._worst_ensemble_acc() self.prev_parameters_x.append(parameters) self.prev_parameters_y.append(tde.accuracy) num_classifiers += 1 train_time = time.time() - start_time self.n_estimators = len(self.classifiers) self.weight_sum = np.sum(self.weights) self._is_fitted = True return self
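# A compact, hedged sketch of the Gaussian-process guided parameter selection
# used in the TDE fit above: after a few randomly chosen parameter sets, fit
# a GP regressor on (parameters -> accuracy) pairs and pick the remaining
# candidate with the highest predicted accuracy. Names are illustrative only.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor


def pick_next_parameters(tried_params, tried_scores, candidates, rng,
                         n_random=5):
    if len(tried_params) < n_random:
        return candidates.pop(rng.randint(0, len(candidates)))
    gp = GaussianProcessRegressor()
    gp.fit(np.asarray(tried_params), np.asarray(tried_scores))
    preds = gp.predict(np.asarray(candidates))
    best = int(rng.choice(np.flatnonzero(preds == preds.max())))
    return candidates.pop(best)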
def fit(self, X, y): """Build an ensemble of BOSS classifiers from the training set (X, y), either through randomising over the para space to make a fixed size ensemble quickly or by creating a variable size ensemble of those within a threshold of the best Parameters ---------- X : nested pandas DataFrame of shape [n_instances, 1] Nested dataframe with univariate time-series in cells. y : array-like, shape = [n_instances] The class labels. Returns ------- self : object """ X, y = check_X_y(X, y, enforce_univariate=True) y = y.values if isinstance(y, pd.Series) else y self.time_limit = self.time_limit * 60 self.n_instances, self.series_length = X.shape[0], len(X.iloc[0, 0]) self.n_classes = np.unique(y).shape[0] self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0] for index, classVal in enumerate(self.classes_): self.class_dictionary[classVal] = index self.classifiers = [] self.weights = [] # Window length parameter space dependent on series length max_window_searches = self.series_length / 4 max_window = int(self.series_length * self.max_win_len_prop) win_inc = int((max_window - self.min_window) / max_window_searches) if win_inc < 1: win_inc = 1 # cBOSS if self.randomised_ensemble: possible_parameters = self._unique_parameters(max_window, win_inc) num_classifiers = 0 start_time = time.time() train_time = 0 subsample_size = int(self.n_instances * 0.7) lowest_acc = 0 lowest_acc_idx = 0 rng = check_random_state(self.random_state) if self.time_limit > 0: self.n_parameter_samples = 0 while (train_time < self.time_limit or num_classifiers < self.n_parameter_samples) and len(possible_parameters) > 0: parameters = possible_parameters.pop( rng.randint(0, len(possible_parameters))) subsample = rng.choice(self.n_instances, size=subsample_size, replace=False) X_subsample = X.iloc[subsample, :] y_subsample = y[subsample] boss = BOSSIndividual(*parameters, alphabet_size=self.alphabet_size, save_words=False, random_state=self.random_state) boss.fit(X_subsample, y_subsample) boss._clean() boss.accuracy = self._individual_train_acc(boss, y_subsample, subsample_size, lowest_acc) weight = math.pow(boss.accuracy, 4) if num_classifiers < self.max_ensemble_size: if boss.accuracy < lowest_acc: lowest_acc = boss.accuracy lowest_acc_idx = num_classifiers self.weights.append(weight) self.classifiers.append(boss) elif boss.accuracy > lowest_acc: self.weights[lowest_acc_idx] = weight self.classifiers[lowest_acc_idx] = boss lowest_acc, lowest_acc_idx = self._worst_ensemble_acc() num_classifiers += 1 train_time = time.time() - start_time # BOSS else: max_acc = -1 min_max_acc = -1 for i, normalise in enumerate(self.norm_options): for win_size in range(self.min_window, max_window + 1, win_inc): boss = BOSSIndividual(win_size, self.word_lengths[0], normalise, self.alphabet_size, save_words=True, random_state=self.random_state) boss.fit(X, y) best_classifier_for_win_size = boss best_acc_for_win_size = -1 # the used work length may be shorter best_word_len = boss.transformer.word_length for n, word_len in enumerate(self.word_lengths): if n > 0: boss = boss._shorten_bags(word_len) boss.accuracy = self._individual_train_acc( boss, y, self.n_instances, best_acc_for_win_size) # print(win_size, boss.accuracy) if boss.accuracy >= best_acc_for_win_size: best_acc_for_win_size = boss.accuracy best_classifier_for_win_size = boss best_word_len = word_len if self._include_in_ensemble(best_acc_for_win_size, max_acc, min_max_acc, len(self.classifiers)): best_classifier_for_win_size._clean() 
best_classifier_for_win_size._set_word_len( best_word_len) self.classifiers.append(best_classifier_for_win_size) # print("appending", best_acc_for_win_size, win_size) if best_acc_for_win_size > max_acc: max_acc = best_acc_for_win_size self.classifiers = list(compress( self.classifiers, [ classifier.accuracy >= max_acc * self.threshold for c, classifier in enumerate(self.classifiers)])) min_max_acc, min_acc_ind = \ self._worst_ensemble_acc() if len(self.classifiers) > self.max_ensemble_size: if min_acc_ind > -1: del self.classifiers[min_acc_ind] min_max_acc, min_acc_ind = \ self._worst_ensemble_acc() self.weights = [1 for n in range(len(self.classifiers))] self.n_estimators = len(self.classifiers) self.weight_sum = np.sum(self.weights) self._is_fitted = True return self
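# A hedged sketch of the two ensemble-maintenance rules used in the fit
# above: the randomised (cBOSS) branch weights each member by accuracy**4 and
# keeps only the max_ensemble_size best, while the classic BOSS branch
# retains every member whose train accuracy is within `threshold` of the best
# seen so far. Purely illustrative helpers.
def cboss_weight(accuracy):
    return accuracy ** 4


def filter_within_threshold(members, accuracies, threshold):
    best = max(accuracies)
    return [m for m, acc in zip(members, accuracies)
            if acc >= best * threshold]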
def fit(self, X, y): """Build a WEASEL classifiers from the training set (X, y), Parameters ---------- X : nested pandas DataFrame of shape [n_instances, 1] Nested dataframe with univariate time-series in cells. y : array-like, shape = [n_instances] The class labels. Returns ------- self : object """ X, y = check_X_y(X, y, enforce_univariate=True) y = y.values if isinstance(y, pd.Series) else y # Window length parameter space dependent on series length self.n_instances, self.series_length = X.shape[0], len(X.iloc[0, 0]) self.max_window = min(self.series_length, self.max_window) self.window_sizes = list( range(self.min_window, self.max_window, self.win_inc)) max_acc = -1 self.highest_bit = (math.ceil(math.log2(self.max_window)) + 1) final_bag_vec = None for norm in self.norm_options: # transformers = [] for w, word_length in enumerate(self.word_lengths): all_words = [dict() for x in range(len(X))] transformers = [] for i, window_size in enumerate(self.window_sizes): # if w == 0: # only compute once, otherwise shorten transformer = SFA(word_length=np.max(word_length), alphabet_size=self.alphabet_size, window_size=window_size, norm=norm, anova=self.anova, binning_method=self.binning_strategy, bigrams=self.bigrams, remove_repeat_words=False, lower_bounding=False, save_words=False) sfa_words = transformer.fit_transform(X, y) transformers.append(transformer) # use the shortening of words trick # sfa_words = transformers[i]._shorten_bags(word_length) # TODO refactor? dicts not really needed here ... bag = sfa_words.iloc[:, 0] # chi-squared test to keep only relevent features # bag_vec = DictVectorizer(sparse=False).fit_transform(bag) # chi2_statistics, p = chi2(bag_vec, y) # relevant_features = np.where( # chi2_statistics >= self.chi2_threshold)[0] # merging bag-of-patterns of different window_sizes # to single bag-of-patterns with prefix indicating # the used window-length for j in range(len(bag)): for (key, value) in bag[j].items(): # if key in relevant_features: # chi-squared test # append the prefices to the words to # distinguish between window-sizes word = (key << self.highest_bit) | window_size # X_all_words[j].append((word, value)) all_words[j][word] = value # TODO use CountVectorizer instead on actual words ... ??? vectorizer = DictVectorizer(sparse=True) bag_vec = vectorizer.fit_transform(all_words) clf = LogisticRegression(max_iter=5000, solver="liblinear", dual=True, penalty="l2", random_state=self.random_state) current_acc = cross_val_score(clf, bag_vec, y, cv=5).mean() # clf = RandomForestClassifier(oob_score=True, # n_estimators=1000, # n_jobs=-1).fit(bag_vec, y) # current_acc = clf.oob_score_ # print("Train acc:", norm, word_length, current_acc) if current_acc > max_acc: max_acc = current_acc self.vectorizer = vectorizer self.clf = clf self.SFA_transformers = transformers self.best_word_length = word_length final_bag_vec = bag_vec if max_acc == 1.0: break # there can be no better model than 1.0 # # fit final model using all words # for i, window_size in enumerate(self.window_sizes): # self.SFA_transformers[i] = \ # SFA(word_length=np.max(self.word_lengths), # alphabet_size=self.alphabet_size, # window_size=window_size, # norm=norm, # anova=self.anova, # binning_method=self.binning_strategy, # bigrams=self.bigrams, # remove_repeat_words=False, # lower_bounding=False, # save_words=False) # self.SFA_transformers[i].fit_transform(X, y) self.clf.fit(final_bag_vec, y) self._is_fitted = True return self