def _resample(self, X, y):
    """Build a minority training set by sampling the stored minority
    chunks with recency-weighted probabilities (newer chunks are more
    likely to be drawn from)."""
    y = np.array(y)
    X = np.array(X)
    minority, majority = minority_majority_split(X, y,
                                                 self.minority_name,
                                                 self.majority_name)

    # Remember this chunk's minority samples and its time stamp.
    self.minority_data.append(minority.tolist())
    self.chunk_time_stamp.append(self.time_stamp)

    # Sliding window: keep only the most recent `number_of_chunks` chunks.
    if len(self.minority_data) > self.number_of_chunks:
        del self.minority_data[0]
        del self.chunk_time_stamp[0]

    # Linearly increasing weights 1..k, normalised into probabilities.
    self.chunk_sample_proba = np.arange(len(self.minority_data)) + 1
    self.chunk_sample_proba = (self.chunk_sample_proba
                               / self.chunk_sample_proba.sum())

    target_count = len(majority) / self.number_of_classifiers
    drawn = np.random.choice(len(self.chunk_sample_proba),
                             int(target_count),
                             p=self.chunk_sample_proba)
    chunk_ids, chunk_counts = np.unique(drawn, return_counts=True)

    new_minority = []
    for cid, cnt in zip(chunk_ids, chunk_counts):
        stored = self.minority_data[cid]
        # Sample without replacement when the chunk is large enough,
        # otherwise take the whole chunk.
        if len(stored) > cnt:
            new_minority.extend(random.sample(stored, cnt))
        else:
            new_minority.extend(stored)
    return new_minority
def partial_fit(self, X, y, classes=None):
    """Incrementally fit the clustered, weight-pruned ensemble on a chunk.

    Fixes over the previous revision:
    * ``self.label_encodr`` typo (raised AttributeError on the second
      "positive" branch).
    * ``is "positive"`` string-identity checks replaced with ``==``
      equality (``is`` on literals is implementation dependent).

    Parameters
    ----------
    X : array-like
        Feature matrix of the current chunk.
    y : array-like
        Labels of the current chunk.
    classes : array-like, optional
        Full label set; when one of its entries is "positive" it is
        treated as the minority class.
    """
    # ________________________________________
    # Initial preparation
    if classes is None and self.classes is None:
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(y)
        self.classes = self.label_encoder.classes_
    elif self.classes is None:
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(classes)
        self.classes = classes
        if classes[0] == "positive":
            self.minority_name = self.label_encoder.transform(classes[0])
            self.majority_name = self.label_encoder.transform(classes[1])
        elif classes[1] == "positive":
            self.minority_name = self.label_encoder.transform(classes[1])
            self.majority_name = self.label_encoder.transform(classes[0])

    y = self.label_encoder.transform(y)

    if self.minority_name is None or self.majority_name is None:
        self.minority_name, self.majority_name = minority_majority_name(y)

    self.number_of_features = len(X[0])

    # ________________________________________
    # Prune minority models: every weight decays by one per chunk and
    # models whose weight has reached zero are removed (back-to-front so
    # the indices stay valid).
    to_delete = []
    for i, w in enumerate(self.weights_array_min):
        if w <= 0:
            to_delete.append(i)
        self.weights_array_min[i] -= 1
    to_delete.reverse()
    for i in to_delete:
        del self.weights_array_min[i]
        del self.classifier_array_min[i]

    # ________________________________________
    # Prune majority models the same way.
    to_delete = []
    for i, w in enumerate(self.weights_array_maj):
        if w <= 0:
            to_delete.append(i)
        self.weights_array_maj[i] -= 1
    to_delete.reverse()
    for i in to_delete:
        del self.weights_array_maj[i]
        del self.classifier_array_maj[i]

    # ________________________________________
    # Cluster each class and train one fresh model per cluster; new models
    # start with the maximum weight.
    minority, majority = minority_majority_split(X, y,
                                                 self.minority_name,
                                                 self.majority_name)

    samples, n_of_clust = self._best_number_of_clusters(minority, 10)
    for i in range(n_of_clust):
        # NOTE(review): the base model is fit on features only (no labels)
        # -- presumably a one-class / density estimator; confirm.
        self.classifier_array_min.append(
            clone(self.base_classifier).fit(samples[i]))
        self.weights_array_min.append(self.number_of_classifiers)

    samples, n_of_clust = self._best_number_of_clusters(majority, 10)
    for i in range(n_of_clust):
        self.classifier_array_maj.append(
            clone(self.base_classifier).fit(samples[i]))
        self.weights_array_maj.append(self.number_of_classifiers)
def _resample(self, X, y):
    """Oversample the chunk by re-adding stored minority samples that lie
    closest to the current data.

    Bug fix: the row-index column was previously inserted at position -1,
    i.e. *before the last distance column*, so ``distance[i][1]`` later
    read a neighbour distance instead of the stored-sample index.  The
    index is now inserted at column 1 (after the nearest distance),
    matching the companion implementation that sorts ``min_count`` the
    same way and reads the index from column 1.
    """
    y = np.array(y)
    X = np.array(X)
    minority, majority = minority_majority_split(X, y,
                                                 self.minority_name,
                                                 self.majority_name)

    # First chunk: nothing stored yet -- keep the data unchanged.
    if self.minority_data is None:
        self.minority_data = minority
        self.iterator += 1
        return X, y

    # Fraction of minority samples in the current chunk.
    ratio = len(minority[:, 0]) / float(len(X[:, 0]))
    if self.balance_ratio > ratio:
        if ((len(minority) + len(self.minority_data)) /
                float(len(X) + len(self.minority_data))) <= self.balance_ratio:
            # Using every stored sample still keeps us at/below the target
            # ratio -- add all of them.
            new_minority = np.concatenate((minority, self.minority_data),
                                          axis=0)
        else:
            # Rank stored samples by distance to the current chunk and add
            # the nearest ones until the target ratio is approached.
            knn = NearestNeighbors(n_neighbors=3).fit(X, y)
            distance, indicies = knn.kneighbors(self.minority_data)
            a = np.arange(0, len(distance))
            # Column layout after insert: [nearest_dist, row_index, ...].
            distance = np.insert(distance, 1, a, axis=1)
            distance = distance[distance[:, 0].argsort()]
            new_minority = minority
            for i in range(int(len(X) * 2 * (self.balance_ratio - ratio))):
                try:
                    new_minority = np.insert(
                        new_minority, -1,
                        self.minority_data[int(distance[i][1])],
                        axis=0)
                except IndexError:
                    # Fewer stored samples than requested -- stop early.
                    break
        res_X = np.concatenate((new_minority, majority), axis=0)
        res_y = np.concatenate(
            (np.full(len(new_minority), self.minority_name),
             np.full(len(majority), self.majority_name)), axis=0)
    else:
        res_X = X
        res_y = y

    # Accumulate this chunk's minority samples for future chunks.
    self.minority_data = np.concatenate((minority, self.minority_data),
                                        axis=0)
    self.iterator += 1
    return res_X, res_y
def _resample(self, X, y):
    """Oversample the chunk with SMOTE.

    Fixes: the ``minioty`` variable typo, and a crash when the chunk holds
    fewer than two minority samples -- SMOTE requires ``k_neighbors >= 1``,
    so such degenerate chunks are now returned unchanged instead of
    raising.

    Returns
    -------
    (res_X, res_y) : resampled feature matrix and labels.
    """
    X = np.array(X)
    y = np.array(y)
    minority, majority = minority_majority_split(X, y,
                                                 self.minority_name,
                                                 self.majority_name)

    if len(minority) < 2:
        # Not enough minority samples to interpolate new ones.
        return X, y

    if len(minority) > 6:
        # Enough samples for SMOTE's default k_neighbors=5.
        res_X, res_y = SMOTE().fit_sample(X, y)
    else:
        # Shrink the neighbourhood to what the minority class can support.
        res_X, res_y = SMOTE(k_neighbors=len(minority) - 1).fit_sample(X, y)
    return res_X, res_y
def _resample(self, X, y):
    """Balance the chunk by replacing the majority class with k-means
    cluster centres -- one centre per minority sample -- so both classes
    end up the same size."""
    y = np.array(y)
    X = np.array(X)
    minority, majority = minority_majority_split(X, y,
                                                 self.minority_name,
                                                 self.majority_name)

    # No minority samples: signal the caller to skip this chunk.
    if len(minority) == 0:
        return None, None

    # NOTE(review): the clustering is fit on the whole chunk X, not on the
    # majority samples alone -- confirm this is intentional.
    km = KMeans(n_clusters=len(minority)).fit(X)
    majority = km.cluster_centers_

    res_X = np.concatenate((majority, minority), axis=0)
    res_y = (len(majority) * [self.majority_name]
             + len(minority) * [self.minority_name])
    return res_X, res_y
def _new_sub_ensemble(self, X, y):
    """Train a sub-ensemble of T classifiers, each fit on all minority
    samples plus a fresh random majority subsample of floor(N/T) rows.

    Bug fix: the base classifier is now ``clone``d before fitting.
    Previously ``self.base_classifier.fit(...)`` refit the *same*
    estimator object on every iteration, so the returned list held T
    references to one model trained only on the last subsample (the
    sibling ``partial_fit`` implementations already use ``clone``).

    Returns
    -------
    list of fitted classifiers (length ``self.number_of_classifiers``).
    """
    y = np.array(y)
    X = np.array(X)
    minority, majority = minority_majority_split(X, y,
                                                 self.minority_name,
                                                 self.majority_name)
    T = self.number_of_classifiers
    N = len(X)
    sub_ensemble = []
    for k in range(T):
        number_of_instances = int(math.floor(N / float(T)))
        # Fresh random majority subsample for each ensemble member.
        df = pd.DataFrame(majority)
        sample = df.sample(number_of_instances)
        res_X = np.concatenate((sample, minority), axis=0)
        res_y = (len(sample) * [self.majority_name]
                 + len(minority) * [self.minority_name])
        # Independent model per member.
        new_classifier = clone(self.base_classifier).fit(res_X, res_y)
        sub_ensemble += [new_classifier]
    return sub_ensemble
def partial_fit(self, X, y, classes=None):
    """Incrementally rebuild the ensemble on one chunk: resample the
    minority class, split the majority class, and train one classifier
    per split.

    Fixes: ``self.label_encoder.classes`` -> ``classes_`` (the attribute
    sklearn's LabelEncoder actually exposes) and ``is "positive"``
    identity checks replaced with ``==`` equality.
    """
    if classes is None and self.classes is None:
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(y)
        self.classes = self.label_encoder.classes_
    elif self.classes is None:
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(classes)
        self.classes = classes
        if classes[0] == "positive":
            self.minority_name = self.label_encoder.transform(classes[0])
            self.majority_name = self.label_encoder.transform(classes[1])
        elif classes[1] == "positive":
            self.minority_name = self.label_encoder.transform(classes[1])
            self.majority_name = self.label_encoder.transform(classes[0])

    y = self.label_encoder.transform(y)

    if self.minority_name is None or self.majority_name is None:
        self.minority_name, self.majority_name = minority_majority_name(y)

    new_minority = self._resample(X, y)

    minority, majority = minority_majority_split(X, y,
                                                 self.minority_name,
                                                 self.majority_name)
    # Nothing to split the majority on -- skip this chunk entirely.
    if not majority.any():
        return

    # One classifier per majority slice, each balanced against the
    # resampled minority set.
    majority_split = np.array_split(majority, self.number_of_classifiers)
    self.classifier_array = []
    for m_s in majority_split:
        res_X = np.concatenate((m_s, new_minority), axis=0)
        res_y = (len(m_s) * [self.majority_name]
                 + len(new_minority) * [self.minority_name])
        new_classifier = clone(self.base_classifier).fit(res_X, res_y)
        self.classifier_array.append(new_classifier)

    self.time_stamp += 1
def partial_fit(self, X, y, classes=None):
    """Incrementally rebuild the ensemble on one chunk.

    Fixes: ``label_encoder.classes`` -> ``classes_``; ``is "positive"``
    identity checks -> ``==``; the base classifier is now ``clone``d per
    majority split so every ensemble member is an independent model
    (previously all entries referenced the same refit object, as the
    sibling implementation already avoids); typo in the diagnostic
    message ("majoirty" -> "majority").
    """
    warnings.filterwarnings(action='ignore', category=DeprecationWarning)

    if classes is None and self.classes is None:
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(y)
        self.classes = self.label_encoder.classes_
    elif self.classes is None:
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(classes)
        self.classes = classes
        if classes[0] == "positive":
            self.minority_name = self.label_encoder.transform(classes[0])
            self.majority_name = self.label_encoder.transform(classes[1])
        elif classes[1] == "positive":
            self.minority_name = self.label_encoder.transform(classes[1])
            self.majority_name = self.label_encoder.transform(classes[0])

    y = self.label_encoder.transform(y)

    if self.minority_name is None or self.majority_name is None:
        self.minority_name, self.majority_name = minority_majority_name(y)

    new_minority = self._resample(X, y)

    minority, majority = minority_majority_split(X, y,
                                                 self.minority_name,
                                                 self.majority_name)
    if not majority.any():
        print("majority empty")
        return

    # One classifier per majority slice, balanced with the resampled
    # minority set.
    majority_split = np.array_split(majority, self.number_of_classifiers)
    self.classifier_array = []
    for m_s in majority_split:
        res_X = np.concatenate((m_s, new_minority), axis=0)
        res_y = (len(m_s) * [self.majority_name]
                 + len(new_minority) * [self.minority_name])
        new_classifier = clone(self.base_classifier).fit(res_X, res_y)
        self.classifier_array.append(new_classifier)
def _resample(self, X, y):
    """Pool minority samples from the recent chunks, drawing at most
    len(majority)/number_of_classifiers instances from each stored
    chunk."""
    y = np.array(y)
    X = np.array(X)
    minority, majority = minority_majority_split(X, y,
                                                 self.minority_name,
                                                 self.majority_name)

    # Store this chunk's minority samples and its imbalance ratio.
    self.minority_data.append(minority.tolist())
    self.ratio_chunks.append(len(minority) / float(len(majority)))
    self.iterator += 1

    # Sliding window over the last `number_of_chunks` chunks.
    if len(self.minority_data) > self.number_of_chunks:
        del self.minority_data[0]
        del self.ratio_chunks[0]

    per_chunk = len(majority) / self.number_of_classifiers
    new_minority = []
    for stored in self.minority_data:
        # Sample without replacement when the chunk is large enough,
        # otherwise take the whole chunk.
        if per_chunk < len(stored):
            new_minority.extend(random.sample(stored, int(per_chunk)))
        else:
            new_minority.extend(stored)
    return new_minority
def partial_fit(self, X, y, classes=None):
    """Incrementally fit a single classifier on the current chunk plus a
    window of stored, undersampled past chunks; oversample when the pooled
    data falls below the target balance ratio.

    Fixes: ``label_encoder.classes`` -> ``classes_`` (the attribute
    sklearn's LabelEncoder actually exposes) and ``is "positive"``
    identity checks replaced with ``==`` equality.
    """
    # ________________________________________
    # Initial preparation
    if classes is None and self.classes is None:
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(y)
        self.classes = self.label_encoder.classes_
    elif self.classes is None:
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(classes)
        self.classes = classes
        if classes[0] == "positive":
            self.minority_name = self.label_encoder.transform(classes[0])
            self.majority_name = self.label_encoder.transform(classes[1])
        elif classes[1] == "positive":
            self.minority_name = self.label_encoder.transform(classes[1])
            self.majority_name = self.label_encoder.transform(classes[0])

    y = self.label_encoder.transform(y)

    if self.minority_name is None or self.majority_name is None:
        self.minority_name, self.majority_name = minority_majority_name(y)

    self.number_of_features = len(X[0])

    # ________________________________________
    # Pool the stored chunks with the current one.
    new_X, new_y = [], []
    for tmp_X, tmp_y in zip(self.stored_X, self.stored_y):
        new_X.extend(tmp_X)
        new_y.extend(tmp_y)
    new_X.extend(X)
    new_y.extend(y)
    new_X = np.array(new_X)
    new_y = np.array(new_y)

    # ________________________________________
    # Undersample and store the new chunk; keep a sliding window.
    und_X, und_y = self.undersampling.fit_resample(X, y)
    self.stored_X.append(und_X)
    self.stored_y.append(und_y)
    if len(self.stored_X) > self.number_of_chunks:
        del self.stored_X[0]
        del self.stored_y[0]

    # ________________________________________
    # Oversample the pooled data when below the target ratio.
    minority, majority = minority_majority_split(new_X, new_y,
                                                 self.minority_name,
                                                 self.majority_name)
    ratio = len(minority) / len(majority)
    if ratio < self.balance_ratio:
        new_X, new_y = self.oversampling.fit_resample(new_X, new_y)

    # ________________________________________
    # Train the classifier on the pooled (possibly rebalanced) data.
    self.clf = self.base_classifier.fit(new_X, new_y)
def _resample(self, X, y):
    """Oversample the chunk with stored minority samples whose current-chunk
    neighbourhoods contain the most minority points."""
    y = np.array(y)
    X = np.array(X)
    minority, majority = minority_majority_split(X, y,
                                                 self.minority_name,
                                                 self.majority_name)
    # First chunk: nothing stored yet -- keep the data unchanged.
    if self.minority_data is None:
        self.minority_data = minority
        self.iterator += 1
        return X, y
    # Fraction of minority samples in the current chunk.
    ratio = len(minority[:, 0]) / float(len(X[:, 0]))
    if self.balance_ratio > ratio:
        if ((len(minority) + len(self.minority_data)) /
                float(len(X) + len(self.minority_data))) <= self.balance_ratio:
            # All stored samples together still stay at/below the target
            # ratio -- add every one of them.
            new_minority = np.concatenate((minority, self.minority_data),
                                          axis=0)
        else:
            # For each stored sample, count how many of its 10 nearest
            # neighbours in the current chunk are minority.
            knn = NearestNeighbors(n_neighbors=10).fit(X)
            indices = knn.kneighbors(self.minority_data,
                                     return_distance=False)
            min_count = np.count_nonzero(y[indices] == self.minority_name,
                                         axis=1)
            # Pair each count with its row index (column 1), then sort
            # descending by count.
            a = np.arange(0, len(min_count))
            min_count = np.insert(np.expand_dims(min_count, axis=1), 1, a,
                                  axis=1)
            min_count = min_count[min_count[:, 0].argsort()]
            min_count = min_count[::-1]
            sorted_minority = min_count[:, 1].astype("int")
            # Number of samples needed to reach the balance ratio.
            n_instances = int((self.balance_ratio - ratio) * len(y))
            # NOTE(review): this branch builds new_minority from stored
            # samples only -- the current chunk's own minority rows are not
            # included in res_X; confirm that is intended.
            if n_instances > len(sorted_minority):
                new_minority = self.minority_data[sorted_minority]
            else:
                new_minority = self.minority_data[
                    sorted_minority[0:n_instances]]
        res_X = np.concatenate((new_minority, majority), axis=0)
        res_y = np.concatenate(
            (np.full(len(new_minority), self.minority_name),
             np.full(len(majority), self.majority_name)), axis=0)
    else:
        # Already at or above the target ratio -- leave the chunk as-is.
        res_X = X
        res_y = y
    # Accumulate this chunk's minority samples for future chunks.
    self.minority_data = np.concatenate((minority, self.minority_data),
                                        axis=0)
    self.iterator += 1
    return res_X, res_y
def partial_fit(self, X, y, classes=None):
    """Incrementally fit with drift detection, a window of stored
    undersampled chunks, and on-demand oversampling.

    Fixes: ``label_encoder.classes`` -> ``classes_``; ``is "positive"``
    identity checks -> ``==``; the drift score is compared against the
    running mean only when at least one score has been recorded, avoiding
    a nan comparison (and RuntimeWarning) on an empty metrics array.
    """
    # ________________________________________
    # Initial preparation
    if classes is None and self.classes is None:
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(y)
        self.classes = self.label_encoder.classes_
    elif self.classes is None:
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(classes)
        self.classes = classes
        if classes[0] == "positive":
            self.minority_name = self.label_encoder.transform(classes[0])
            self.majority_name = self.label_encoder.transform(classes[1])
        elif classes[1] == "positive":
            self.minority_name = self.label_encoder.transform(classes[1])
            self.majority_name = self.label_encoder.transform(classes[0])

    y = self.label_encoder.transform(y)

    if self.minority_name is None or self.majority_name is None:
        self.minority_name, self.majority_name = minority_majority_name(y)

    self.number_of_features = len(X[0])

    # ________________________________________
    # Drift detector: when the chunk score drops below 70% of the running
    # mean, reset the whole ensemble and its stored data.
    if self.drift_detector is not None:
        dd_pred = self.drift_detector.predict(X)
        # NOTE(review): geometric_mean_score conventionally takes
        # (y_true, y_pred); the prediction is passed first here -- confirm.
        score = geometric_mean_score(dd_pred, y)
        if self.metrics_array and score / np.mean(self.metrics_array) < 0.7:
            self.drift_detector = None
            self.metrics_array = []
            self.classifier_array = []
            self.stored_X = []
            self.stored_y = []
        else:
            self.metrics_array.append(score)

    # ________________________________________
    # Pool the stored chunks with the current one.
    new_X, new_y = [], []
    for tmp_X, tmp_y in zip(self.stored_X, self.stored_y):
        new_X.extend(tmp_X)
        new_y.extend(tmp_y)
    new_X.extend(X)
    new_y.extend(y)
    new_X = np.array(new_X)
    new_y = np.array(new_y)

    # ________________________________________
    # Undersample and store the new chunk.
    und_X, und_y = self.undersampling.fit_resample(X, y)
    self.stored_X.append(und_X)
    self.stored_y.append(und_y)

    # ________________________________________
    # Oversample the pooled data when below the target ratio.
    minority, majority = minority_majority_split(new_X, new_y,
                                                 self.minority_name,
                                                 self.majority_name)
    ratio = len(minority) / len(majority)
    if ratio < self.balance_ratio:
        new_X, new_y = self.oversampling.fit_resample(new_X, new_y)

    # ________________________________________
    # Train a new ensemble member; keep a sliding window of models and
    # stored chunks.
    self.classifier_array.append(
        clone(self.base_classifier).fit(new_X, new_y))
    if len(self.classifier_array) >= self.number_of_classifiers:
        del self.classifier_array[0]
        del self.stored_X[0]
        del self.stored_y[0]

    # Create the drift detector lazily, then keep it updated every chunk.
    if self.drift_detector is None:
        self.drift_detector = MLPClassifier((10))
    self.drift_detector.partial_fit(new_X, new_y, np.unique(new_y))

    self.iteration += 1