# Consolidated imports for the snippets below (import paths assume
# sktime ~0.5; boss_distance and histogram_intersection are module-level
# helpers defined alongside these classes in sktime's dictionary_based
# package).
import math
import sys

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.utils import check_random_state
from sklearn.utils.multiclass import class_distribution

from sktime.classification.base import BaseClassifier
from sktime.transformations.panel.dictionary_based import SFA
from sktime.utils.data_processing import from_nested_to_3d_numpy
from sktime.utils.validation.panel import check_X, check_X_y
# Closure used inside WEASEL.fit: X, y and self are captured from the
# enclosing scope. Builds one SFA bag-of-patterns for a single window size.
def _parallel_fit(window_size):
    rng = check_random_state(window_size)
    all_words = [dict() for _ in range(len(X))]
    relevant_features = set()
    relevant_features_count = 0

    transformer = SFA(
        word_length=rng.choice(self.word_lengths),
        alphabet_size=self.alphabet_size,
        window_size=window_size,
        norm=rng.choice(self.norm_options),
        anova=self.anova,
        binning_method=self.binning_strategy,
        bigrams=self.bigrams,
        remove_repeat_words=False,
        lower_bounding=False,
        save_words=False,
    )
    sfa_words = transformer.fit_transform(X, y)
    bag = sfa_words[0]

    # chi-squared test to keep only relevant features
    apply_chi_squared = self.p_threshold < 1
    if apply_chi_squared:
        vectorizer = DictVectorizer(sparse=True, dtype=np.int32, sort=False)
        bag_vec = vectorizer.fit_transform(bag)
        chi2_statistics, p = chi2(bag_vec, y)
        relevant_features_idx = np.where(p <= self.p_threshold)[0]
        relevant_features = set(
            np.array(vectorizer.feature_names_)[relevant_features_idx]
        )
        relevant_features_count += len(relevant_features_idx)

    # merge the bag-of-patterns of this window size into a single
    # bag-of-patterns, prefixing each word with the window length used
    for j in range(len(bag)):
        for (key, value) in bag[j].items():
            if (not apply_chi_squared) or (key in relevant_features):
                # append the prefix to the word to
                # distinguish between window sizes
                word = WEASEL.shift_left(key, self.highest_bit, window_size)
                all_words[j][word] = value

    return all_words, transformer, relevant_features_count
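# How a closure like _parallel_fit is typically driven (a sketch, not the
# original source; assumes joblib and that X, y and the window-size grid
# are in scope inside WEASEL.fit):
#
#     from joblib import Parallel, delayed
#
#     results = Parallel(n_jobs=self.n_jobs)(
#         delayed(_parallel_fit)(window_size)
#         for window_size in self.window_sizes
#     )
#     for window_words, transformer, rel_count in results:
#         self.SFA_transformers.append(transformer)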
class IndividualTDE(BaseClassifier):
    """Single TDE classifier, based on the Bag of SFA Symbols (BOSS) model."""

    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        levels=1,
        igb=False,
        alphabet_size=4,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.levels = levels
        self.igb = igb
        self.alphabet_size = alphabet_size
        self.random_state = random_state

        binning_method = "information-gain" if igb else "equi-depth"
        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            levels=levels,
            binning_method=binning_method,
            bigrams=True,
            remove_repeat_words=True,
            save_words=False,
        )
        self.transformed_data = []
        self.accuracy = 0
        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualTDE, self).__init__()

    def fit(self, X, y):
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X, y)
        self.transformed_data = sfa[0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self

    def predict(self, X):
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

        rng = check_random_state(self.random_state)

        classes = []
        test_bags = self.transformer.transform(X)
        test_bags = test_bags[0]

        # 1-nearest-neighbour search using histogram intersection as the
        # similarity measure; ties are broken at random
        for test_bag in test_bags:
            best_sim = -1
            nn = None

            for n, bag in enumerate(self.transformed_data):
                sim = histogram_intersection(test_bag, bag)

                if sim > best_sim or (sim == best_sim and rng.random() < 0.5):
                    best_sim = sim
                    nn = self.class_vals[n]

            classes.append(nn)

        return np.array(classes)

    def predict_proba(self, X):
        # degenerate one-hot distribution derived from the 1-NN prediction
        preds = self.predict(X)
        dists = np.zeros((X.shape[0], self.num_classes))

        for i in range(0, X.shape[0]):
            dists[i, self.class_dictionary.get(preds[i])] += 1

        return dists

    def _train_predict(self, train_num):
        # leave-one-out 1-NN prediction on the training data
        test_bag = self.transformed_data[train_num]
        best_sim = -1
        nn = None

        for n, bag in enumerate(self.transformed_data):
            if n == train_num:
                continue

            sim = histogram_intersection(test_bag, bag)

            if sim > best_sim:
                best_sim = sim
                nn = self.class_vals[n]

        return nn
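# Minimal usage sketch (illustrative, not part of the original source):
# fit IndividualTDE on toy data and predict. The data shapes and
# parameter values below are assumptions for demonstration only.
def _demo_individual_tde():
    rng = np.random.default_rng(0)
    X_train = rng.normal(size=(20, 1, 50))  # 20 univariate series of length 50
    y_train = np.array([0, 1] * 10)

    clf = IndividualTDE(window_size=12, word_length=6, levels=2, random_state=0)
    clf.fit(X_train, y_train)
    return clf.predict(X_train)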
def fit(self, X, y):
    """Build a WEASEL+MUSE classifier from the training set (X, y).

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, n_dims]
        Nested dataframe with multivariate time series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, coerce_to_pandas=True)
    y = np.asarray(y)

    # add first-order differences in each dimension to the time series
    if self.use_first_order_differences:
        X = self.add_first_order_differences(X)

    # window length parameter space is dependent on the series length
    self.col_names = X.columns

    rng = check_random_state(self.random_state)

    self.n_dims = len(self.col_names)
    self.highest_dim_bit = (math.ceil(math.log2(self.n_dims))) + 1
    self.highest_bits = np.zeros(self.n_dims)

    self.SFA_transformers = [[] for _ in range(self.n_dims)]

    # the words of all dimensions and all time series
    all_words = [dict() for _ in range(X.shape[0])]

    # on each dimension, perform SFA
    for ind, column in enumerate(self.col_names):
        X_dim = X[[column]]
        X_dim = from_nested_to_3d_numpy(X_dim)
        series_length = X_dim.shape[-1]  # TODO compute minimum over all ts ?

        # increment window size in steps of 'win_inc'
        win_inc = self.compute_window_inc(series_length)

        self.max_window = int(min(series_length, self.max_window))
        self.window_sizes.append(
            list(range(self.min_window, self.max_window, win_inc))
        )

        self.highest_bits[ind] = math.ceil(math.log2(self.max_window)) + 1

        for window_size in self.window_sizes[ind]:
            transformer = SFA(
                word_length=rng.choice(self.word_lengths),
                alphabet_size=self.alphabet_size,
                window_size=window_size,
                norm=rng.choice(self.norm_options),
                anova=self.anova,
                binning_method=rng.choice(self.binning_strategies),
                bigrams=self.bigrams,
                remove_repeat_words=False,
                lower_bounding=False,
                save_words=False,
            )
            sfa_words = transformer.fit_transform(X_dim, y)

            self.SFA_transformers[ind].append(transformer)
            bag = sfa_words[0]

            # chi-squared test to keep only relevant features
            relevant_features = {}
            apply_chi_squared = self.p_threshold < 1
            if apply_chi_squared:
                vectorizer = DictVectorizer(
                    sparse=True, dtype=np.int32, sort=False
                )
                bag_vec = vectorizer.fit_transform(bag)
                chi2_statistics, p = chi2(bag_vec, y)
                relevant_features_idx = np.where(p <= self.p_threshold)[0]
                relevant_features = set(
                    np.array(vectorizer.feature_names_)[relevant_features_idx]
                )

            # merge bag-of-patterns of different window sizes into a
            # single bag-of-patterns, with a prefix indicating the used
            # window length and dimension
            highest = np.int32(self.highest_bits[ind])
            for j in range(len(bag)):
                for (key, value) in bag[j].items():
                    if (not apply_chi_squared) or (key in relevant_features):
                        # append the prefixes to the words to distinguish
                        # between window sizes and dimensions
                        word = MUSE.shift_left(
                            key, highest, ind, self.highest_dim_bit, window_size
                        )
                        all_words[j][word] = value

    self.clf = make_pipeline(
        DictVectorizer(sparse=True, sort=False),
        LogisticRegression(
            max_iter=5000,
            solver="liblinear",
            dual=True,
            penalty="l2",
            random_state=self.random_state,
        ),
    )

    self.clf.fit(all_words, y)
    self._is_fitted = True
    return self
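# Illustrative sketch of the word-prefixing idea used above (a
# hypothetical helper, not the actual MUSE.shift_left implementation):
# identical SFA words coming from different dimensions or window sizes
# must stay distinct in the merged bag-of-patterns, so the window size
# and dimension index are packed into extra low-order bits beneath the
# word's own bits.
def _prefix_word(key, highest_bit, dim, dim_bits, window_size):
    # `highest_bit` low bits hold the window size, the next `dim_bits`
    # low bits hold the dimension index; both fit by construction since
    # highest_bit = ceil(log2(max_window)) + 1 and
    # dim_bits = ceil(log2(n_dims)) + 1
    return (((key << highest_bit) | window_size) << dim_bits) | dim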
class IndividualBOSS(BaseClassifier):
    """Single Bag of SFA Symbols (BOSS) classifier.

    Implementation of the BOSS model of Schäfer (2015), as used as a
    member of the BOSS ensemble.
    """

    def __init__(
        self,
        window_size=10,
        word_length=8,
        norm=False,
        alphabet_size=4,
        save_words=True,
        random_state=None,
    ):
        self.window_size = window_size
        self.word_length = word_length
        self.norm = norm
        self.alphabet_size = alphabet_size
        self.save_words = save_words
        self.random_state = random_state

        self.transformer = SFA(
            word_length=word_length,
            alphabet_size=alphabet_size,
            window_size=window_size,
            norm=norm,
            remove_repeat_words=True,
            bigrams=False,
            save_words=save_words,
        )
        self.transformed_data = []
        self.accuracy = 0
        self.class_vals = []
        self.num_classes = 0
        self.classes_ = []
        self.class_dictionary = {}
        super(IndividualBOSS, self).__init__()

    def fit(self, X, y):
        X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

        sfa = self.transformer.fit_transform(X)
        self.transformed_data = sfa[0]

        self.class_vals = y
        self.num_classes = np.unique(y).shape[0]
        self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
        for index, classVal in enumerate(self.classes_):
            self.class_dictionary[classVal] = index

        self._is_fitted = True
        return self

    def predict(self, X):
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)

        rng = check_random_state(self.random_state)

        classes = []
        test_bags = self.transformer.transform(X)
        test_bags = test_bags[0]

        # 1-nearest-neighbour search under the BOSS distance;
        # ties are broken at random
        for test_bag in test_bags:
            best_dist = sys.float_info.max
            nn = None

            for n, bag in enumerate(self.transformed_data):
                dist = boss_distance(test_bag, bag, best_dist)

                if dist < best_dist or (dist == best_dist and rng.random() < 0.5):
                    best_dist = dist
                    nn = self.class_vals[n]

            classes.append(nn)

        return np.array(classes)

    def predict_proba(self, X):
        # degenerate one-hot distribution derived from the 1-NN prediction
        preds = self.predict(X)
        dists = np.zeros((X.shape[0], self.num_classes))

        for i in range(0, X.shape[0]):
            dists[i, self.class_dictionary.get(preds[i])] += 1

        return dists

    def _train_predict(self, train_num):
        # leave-one-out 1-NN prediction on the training data
        test_bag = self.transformed_data[train_num]
        best_dist = sys.float_info.max
        nn = None

        for n, bag in enumerate(self.transformed_data):
            if n == train_num:
                continue

            dist = boss_distance(test_bag, bag, best_dist)

            if dist < best_dist:
                best_dist = dist
                nn = self.class_vals[n]

        return nn

    def _shorten_bags(self, word_len):
        # build a new classifier that reuses the fitted transformer's
        # saved words, truncated to the shorter word length
        new_boss = IndividualBOSS(
            self.window_size,
            word_len,
            self.norm,
            self.alphabet_size,
            save_words=self.save_words,
            random_state=self.random_state,
        )
        new_boss.transformer = self.transformer
        sfa = self.transformer._shorten_bags(word_len)
        new_boss.transformed_data = sfa[0]
        new_boss.class_vals = self.class_vals
        new_boss.num_classes = self.num_classes
        new_boss.classes_ = self.classes_
        new_boss.class_dictionary = self.class_dictionary
        new_boss._is_fitted = True
        return new_boss

    def _clean(self):
        self.transformer.words = None
        self.transformer.save_words = False

    def _set_word_len(self, word_len):
        self.word_length = word_len
        self.transformer.word_length = word_len
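# Minimal usage sketch (illustrative, not part of the original source):
# fit IndividualBOSS with saved words, then derive a second classifier
# with a shorter word length via _shorten_bags, as the BOSS ensemble
# does when searching over word lengths. Shapes and parameters are
# assumptions for demonstration only.
def _demo_individual_boss():
    rng = np.random.default_rng(1)
    X_train = rng.normal(size=(16, 1, 60))  # 16 univariate series of length 60
    y_train = np.array([0, 1] * 8)

    clf = IndividualBOSS(window_size=16, word_length=8, save_words=True)
    clf.fit(X_train, y_train)

    shorter = clf._shorten_bags(6)  # reuse saved words at word length 6
    return shorter.predict(X_train)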