예제 #1
0
    def _resample(self, X, y):
        """Assemble a minority training set from the buffered chunks.

        The minority rows of the current chunk are appended to
        ``self.minority_data`` (and ``self.time_stamp`` to
        ``self.chunk_time_stamp``); the oldest entry of both buffers is
        dropped once more than ``self.number_of_chunks`` chunks are held.
        Buffered chunks are then drawn at random, with a probability
        proportional to their 1-based position in the buffer, and every drawn
        chunk contributes as many rows as it was drawn -- or all of its rows
        when it does not hold more than that.

        Parameters
        ----------
        X : array-like
            Feature rows of the current chunk.
        y : array-like
            Labels of the current chunk.

        Returns
        -------
        list
            The selected minority rows, each one a plain Python list.
        """
        labels = np.array(y)
        features = np.array(X)

        minority, majority = minority_majority_split(
            features, labels, self.minority_name, self.majority_name)

        # Buffer the new chunk, then evict the oldest one if over capacity.
        self.minority_data.append(minority.tolist())
        self.chunk_time_stamp.append(self.time_stamp)
        if len(self.minority_data) > self.number_of_chunks:
            del self.minority_data[0]
            del self.chunk_time_stamp[0]

        # Linearly increasing weights 1..n, normalised to sum to one.
        n_chunks = len(self.minority_data)
        rank = np.arange(n_chunks) + 1
        self.chunk_sample_proba = rank / rank.sum()

        # One draw per minority row wanted for a single classifier.
        n_draws = int(len(majority) / self.number_of_classifiers)
        drawn = np.random.choice(n_chunks, n_draws, p=self.chunk_sample_proba)

        selected = []
        for idx, wanted in zip(*np.unique(drawn, return_counts=True)):
            chunk = self.minority_data[idx]
            if len(chunk) <= wanted:
                selected.extend(chunk)
            else:
                selected.extend(random.sample(chunk, wanted))

        return selected
예제 #2
0
    def partial_fit(self, X, y, classes=None):
        """Update the minority and majority classifier pools with a new chunk.

        On the first call a ``LabelEncoder`` is fitted (on ``classes`` when
        given, otherwise on ``y``).  Classifiers whose weight has dropped to
        zero or below are removed and all remaining weights are decremented.
        The chunk is then split by class, each class is clustered through
        ``self._best_number_of_clusters(..., 10)`` and one clone of
        ``self.base_classifier`` is fitted per cluster and appended with an
        initial weight of ``self.number_of_classifiers``.

        Parameters
        ----------
        X : sequence of feature rows
            Samples of the current chunk.
        y : array-like
            Labels of the current chunk (un-encoded).
        classes : sequence, optional
            All class labels.  When omitted, the labels stored in
            ``self.classes`` are used to look for the ``"positive"`` label.
        """
        # Initial preparation
        if self.classes is None:
            self.label_encoder = LabelEncoder()
            if classes is None:
                self.label_encoder.fit(y)
                self.classes = self.label_encoder.classes_
            else:
                self.label_encoder.fit(classes)
                self.classes = classes

        # A label equal to "positive" among the first two class labels is
        # taken as the minority class.  Compare by value (not identity) and
        # tolerate ``classes=None`` by falling back to the stored labels.
        candidates = self.classes if classes is None else classes
        if len(candidates) >= 2 and "positive" in (candidates[0],
                                                   candidates[1]):
            pos, neg = (0, 1) if candidates[0] == "positive" else (1, 0)
            # ``transform`` expects a 1-D sequence, not a bare scalar.
            encoded = self.label_encoder.transform(
                [candidates[pos], candidates[neg]])
            self.minority_name, self.majority_name = encoded[0], encoded[1]

        y = self.label_encoder.transform(y)

        if self.minority_name is None or self.majority_name is None:
            self.minority_name, self.majority_name = minority_majority_name(y)
            # NOTE(review): the feature count is only recorded on this path,
            # exactly as before -- confirm nothing needs it when the class
            # names were resolved through the "positive" label instead.
            self.number_of_features = len(X[0])

        # Prune both pools: drop members whose weight is exhausted and age
        # every weight by one.
        pools = (
            (self.weights_array_min, self.classifier_array_min),
            (self.weights_array_maj, self.classifier_array_maj),
        )
        for weights, classifiers in pools:
            expired = [i for i, w in enumerate(weights) if w <= 0]
            for i in range(len(weights)):
                weights[i] -= 1
            # Delete from the back so earlier indexes stay valid.
            for i in reversed(expired):
                del weights[i]
                del classifiers[i]

        # Split data
        minority, majority = minority_majority_split(
            X, y, self.minority_name, self.majority_name)

        # Train one one-class model per cluster, minority first.
        targets = (
            (minority, self.classifier_array_min, self.weights_array_min),
            (majority, self.classifier_array_maj, self.weights_array_maj),
        )
        for data, classifiers, weights in targets:
            samples, n_of_clust = self._best_number_of_clusters(data, 10)
            for i in range(n_of_clust):
                classifiers.append(clone(self.base_classifier).fit(samples[i]))
                weights.append(self.number_of_classifiers)
예제 #3
0
    def _resample(self, X, y):
        """Rebalance the current chunk with archived minority rows.

        The first call only starts the archive (``self.minority_data``) and
        returns the chunk unchanged.  Afterwards, when the minority share of
        the chunk is below ``self.balance_ratio``, archived minority rows are
        added: all of them if the share stays within the target, otherwise
        only the rows closest to the current chunk (smallest distance to
        their nearest neighbour in ``X``).  The current minority rows are
        always added to the archive before returning.

        Parameters
        ----------
        X : array-like
            Feature rows of the current chunk.
        y : array-like
            Encoded labels of the current chunk.

        Returns
        -------
        tuple
            ``(res_X, res_y)`` -- the rebalanced rows and labels, or the
            unchanged chunk when no rebalancing was needed.
        """
        y = np.array(y)
        X = np.array(X)

        minority, majority = minority_majority_split(X, y, self.minority_name,
                                                     self.majority_name)

        # First chunk: nothing archived yet, so just start the archive.
        if self.minority_data is None:
            self.minority_data = minority
            self.iterator += 1
            return X, y

        ratio = len(minority) / float(len(X))

        if self.balance_ratio > ratio:
            n_stored = len(self.minority_data)
            share_with_all = ((len(minority) + n_stored) /
                              float(len(X) + n_stored))

            if share_with_all <= self.balance_ratio:
                new_minority = np.concatenate((minority, self.minority_data),
                                              axis=0)
            else:
                knn = NearestNeighbors(n_neighbors=3).fit(X)
                distance, _ = knn.kneighbors(self.minority_data)

                # Rank archived rows by the distance to their closest
                # neighbour.  The previous code read a distance column as if
                # it were the row index, so it picked arbitrary rows.
                nearest_first = np.argsort(distance[:, 0])

                n_extra = int(len(X) * 2 * (self.balance_ratio - ratio))
                # Slicing caps the request at the archive size.
                # assumes self.minority_data is a NumPy array (it is only
                # ever assigned split/concatenate results here) -- TODO confirm
                extra = self.minority_data[nearest_first[:n_extra]]
                new_minority = np.concatenate((minority, extra), axis=0)

            res_X = np.concatenate((new_minority, majority), axis=0)
            res_y = np.concatenate(
                (np.full(len(new_minority), self.minority_name),
                 np.full(len(majority), self.majority_name)),
                axis=0)

        else:
            res_X = X
            res_y = y

        self.minority_data = np.concatenate((minority, self.minority_data),
                                            axis=0)
        self.iterator += 1

        return res_X, res_y
예제 #4
0
    def _resample(self, X, y):
        """Oversample the chunk with SMOTE.

        The number of neighbours is SMOTE's default of five, reduced to
        ``len(minority) - 1`` when the chunk holds fewer than six minority
        rows.  With fewer than two minority rows there is no neighbour to
        interpolate with, so the chunk is returned unchanged.

        Parameters
        ----------
        X : array-like
            Feature rows of the current chunk.
        y : array-like
            Labels of the current chunk.

        Returns
        -------
        tuple
            ``(res_X, res_y)`` -- the oversampled rows and labels.
        """
        X = np.array(X)
        y = np.array(y)

        minority, _ = minority_majority_split(X, y, self.minority_name,
                                              self.majority_name)
        n_minority = len(minority)
        if n_minority < 2:
            # k_neighbors would be 0 or negative, which SMOTE rejects.
            return X, y

        # ``fit_resample`` is the current imbalanced-learn entry point; the
        # ``fit_sample`` alias has been removed from recent releases.
        k_neighbors = min(5, n_minority - 1)
        res_X, res_y = SMOTE(k_neighbors=k_neighbors).fit_resample(X, y)
        return res_X, res_y
예제 #5
0
    def _resample(self, X, y):
        """Undersample the majority class by replacing it with centroids.

        The majority rows are clustered with KMeans into as many clusters as
        there are minority rows (capped at the number of majority rows) and
        the cluster centres stand in for the majority class.

        Parameters
        ----------
        X : array-like
            Feature rows of the current chunk.
        y : array-like
            Labels of the current chunk.

        Returns
        -------
        tuple
            ``(res_X, res_y)`` with the centroids first and the minority rows
            after them; ``res_y`` is a plain list.  ``(None, None)`` when the
            chunk holds no minority rows.
        """
        y = np.array(y)
        X = np.array(X)

        minority, majority = minority_majority_split(X, y,
                                                     self.minority_name,
                                                     self.majority_name)

        if len(minority) == 0:
            return None, None

        if len(majority) == 0:
            # Nothing to undersample; clustering here would relabel minority
            # rows as majority.
            return minority, len(minority) * [self.minority_name]

        # Cluster the majority rows only: fitting on the whole chunk lets
        # minority rows pull the centroids that are then labelled majority.
        n_clusters = min(len(minority), len(majority))
        km = KMeans(n_clusters=n_clusters).fit(majority)
        centroids = km.cluster_centers_

        res_X = np.concatenate((centroids, minority), axis=0)
        res_y = (len(centroids) * [self.majority_name] +
                 len(minority) * [self.minority_name])

        return res_X, res_y
예제 #6
0
    def _new_sub_ensemble(self, X, y):
        """Train a sub-ensemble on random majority subsets plus all minority rows.

        ``self.number_of_classifiers`` classifiers are trained.  Each one
        sees a random sample of ``floor(len(X) / number_of_classifiers)``
        majority rows (capped at the number of majority rows available)
        together with every minority row of the chunk.

        Parameters
        ----------
        X : array-like
            Feature rows of the current chunk.
        y : array-like
            Labels of the current chunk.

        Returns
        -------
        list
            The fitted classifiers, one independent clone of
            ``self.base_classifier`` per subset.
        """
        y = np.array(y)
        X = np.array(X)

        minority, majority = minority_majority_split(X, y, self.minority_name,
                                                     self.majority_name)

        T = self.number_of_classifiers
        if T < 1:
            return []

        N = len(X)
        # DataFrame.sample raises when asked for more rows than exist, which
        # happened whenever N / T exceeded the majority size (e.g. T == 1).
        number_of_instances = min(int(math.floor(N / float(T))),
                                  len(majority))
        df = pd.DataFrame(majority)
        minority_labels = len(minority) * [self.minority_name]

        sub_ensemble = []
        for _ in range(T):
            sample = df.sample(number_of_instances)
            res_X = np.concatenate((sample, minority), axis=0)
            res_y = len(sample) * [self.majority_name] + minority_labels
            # Clone first: fitting base_classifier itself returns the same
            # object every time, so all members would be one model.
            sub_ensemble.append(clone(self.base_classifier).fit(res_X, res_y))
        return sub_ensemble
예제 #7
0
    def partial_fit(self, X, y, classes=None):
        """Rebuild the ensemble from the current chunk.

        On the first call a ``LabelEncoder`` is fitted (on ``classes`` when
        given, otherwise on ``y``).  The minority set returned by
        ``self._resample`` is combined with each of
        ``self.number_of_classifiers`` slices of the majority rows, and a
        fresh clone of ``self.base_classifier`` is trained on every
        combination.  ``self.classifier_array`` is replaced by the new
        classifiers and ``self.time_stamp`` is advanced.  Nothing is trained
        (and the time stamp is not advanced) when the chunk has no majority
        rows.

        Parameters
        ----------
        X : array-like
            Feature rows of the current chunk.
        y : array-like
            Labels of the current chunk (un-encoded).
        classes : sequence, optional
            All class labels.  When omitted, the labels stored in
            ``self.classes`` are used to look for the ``"positive"`` label.
        """
        if self.classes is None:
            self.label_encoder = LabelEncoder()
            if classes is None:
                self.label_encoder.fit(y)
                # The fitted labels live in ``classes_`` (trailing underscore).
                self.classes = self.label_encoder.classes_
            else:
                self.label_encoder.fit(classes)
                self.classes = classes

        # A label equal to "positive" among the first two class labels is
        # taken as the minority class.  Compare by value (not identity) and
        # tolerate ``classes=None`` by falling back to the stored labels.
        candidates = self.classes if classes is None else classes
        if len(candidates) >= 2 and "positive" in (candidates[0],
                                                   candidates[1]):
            pos, neg = (0, 1) if candidates[0] == "positive" else (1, 0)
            # ``transform`` expects a 1-D sequence, not a bare scalar.
            encoded = self.label_encoder.transform(
                [candidates[pos], candidates[neg]])
            self.minority_name, self.majority_name = encoded[0], encoded[1]

        y = self.label_encoder.transform(y)

        if self.minority_name is None or self.majority_name is None:
            self.minority_name, self.majority_name = minority_majority_name(y)

        new_minority = self._resample(X, y)
        _, majority = minority_majority_split(X, y, self.minority_name,
                                              self.majority_name)

        # Test for emptiness by length: ``majority.any()`` is also False for
        # a non-empty block whose feature values are all zero.
        if len(majority) == 0:
            return

        majority_split = np.array_split(majority, self.number_of_classifiers)
        minority_labels = len(new_minority) * [self.minority_name]

        self.classifier_array = []
        for m_s in majority_split:
            res_X = np.concatenate((m_s, new_minority), axis=0)
            res_y = len(m_s) * [self.majority_name] + minority_labels
            new_classifier = clone(self.base_classifier).fit(res_X, res_y)
            self.classifier_array.append(new_classifier)

        self.time_stamp += 1
예제 #8
0
    def partial_fit(self, X, y, classes=None):
        """Rebuild the ensemble from the current chunk.

        On the first call a ``LabelEncoder`` is fitted (on ``classes`` when
        given, otherwise on ``y``).  The minority set returned by
        ``self._resample`` is combined with each of
        ``self.number_of_classifiers`` slices of the majority rows, and an
        independent clone of ``self.base_classifier`` is trained on every
        combination; the results replace ``self.classifier_array``.  Nothing
        is trained when the chunk has no majority rows.

        Parameters
        ----------
        X : array-like
            Feature rows of the current chunk.
        y : array-like
            Labels of the current chunk (un-encoded).
        classes : sequence, optional
            All class labels.  When omitted, the labels stored in
            ``self.classes`` are used to look for the ``"positive"`` label.
        """
        # Silences DeprecationWarning process-wide, as before.
        warnings.filterwarnings(action='ignore', category=DeprecationWarning)

        if self.classes is None:
            self.label_encoder = LabelEncoder()
            if classes is None:
                self.label_encoder.fit(y)
                # The fitted labels live in ``classes_`` (trailing underscore).
                self.classes = self.label_encoder.classes_
            else:
                self.label_encoder.fit(classes)
                self.classes = classes

        # A label equal to "positive" among the first two class labels is
        # taken as the minority class.  Compare by value (not identity) and
        # tolerate ``classes=None`` by falling back to the stored labels.
        candidates = self.classes if classes is None else classes
        if len(candidates) >= 2 and "positive" in (candidates[0],
                                                   candidates[1]):
            pos, neg = (0, 1) if candidates[0] == "positive" else (1, 0)
            # ``transform`` expects a 1-D sequence, not a bare scalar.
            encoded = self.label_encoder.transform(
                [candidates[pos], candidates[neg]])
            self.minority_name, self.majority_name = encoded[0], encoded[1]

        y = self.label_encoder.transform(y)

        if self.minority_name is None or self.majority_name is None:
            self.minority_name, self.majority_name = minority_majority_name(y)

        new_minority = self._resample(X, y)
        _, majority = minority_majority_split(X, y, self.minority_name,
                                              self.majority_name)

        # Test for emptiness by length: ``majority.any()`` is also False for
        # a non-empty block whose feature values are all zero.
        if len(majority) == 0:
            print("majoirty empty")
            return

        majority_split = np.array_split(majority, self.number_of_classifiers)
        minority_labels = len(new_minority) * [self.minority_name]

        self.classifier_array = []
        for m_s in majority_split:
            res_X = np.concatenate((m_s, new_minority), axis=0)
            res_y = len(m_s) * [self.majority_name] + minority_labels
            # Clone first: fitting base_classifier itself returns the same
            # object every time, so all members would be one model.
            new_classifier = clone(self.base_classifier).fit(res_X, res_y)
            self.classifier_array.append(new_classifier)
예제 #9
0
    def _resample(self, X, y):
        """Collect minority rows from the buffered chunks.

        The minority rows of the current chunk and its minority/majority
        ratio are appended to ``self.minority_data`` and
        ``self.ratio_chunks``; the oldest entry of both is dropped once more
        than ``self.number_of_chunks`` chunks are held.  Every buffered chunk
        then contributes at most ``len(majority) / number_of_classifiers``
        rows (a random sample when it holds more than that).

        Parameters
        ----------
        X : array-like
            Feature rows of the current chunk.
        y : array-like
            Labels of the current chunk.

        Returns
        -------
        list
            The selected minority rows, each one a plain Python list.
        """
        y = np.array(y)
        X = np.array(X)

        minority, majority = minority_majority_split(X, y, self.minority_name,
                                                     self.majority_name)

        self.minority_data.append(minority.tolist())
        # A chunk without majority rows used to raise ZeroDivisionError here;
        # record an infinite ratio instead.
        # NOTE(review): confirm that consumers of ratio_chunks accept inf.
        if len(majority) > 0:
            chunk_ratio = len(minority) / float(len(majority))
        else:
            chunk_ratio = float("inf")
        self.ratio_chunks.append(chunk_ratio)
        self.iterator += 1

        if len(self.minority_data) > self.number_of_chunks:
            del self.minority_data[0]
            del self.ratio_chunks[0]

        number_of_instances = len(majority) / self.number_of_classifiers

        new_minority = []
        for md in self.minority_data:
            if number_of_instances < len(md):
                new_minority.extend(
                    random.sample(md, int(number_of_instances)))
            else:
                new_minority.extend(md)

        return new_minority
예제 #10
0
    def partial_fit(self, X, y, classes=None):
        """Retrain the classifier on the stored chunks plus the current one.

        On the first call a ``LabelEncoder`` is fitted (on ``classes`` when
        given, otherwise on ``y``).  The training set is every chunk in
        ``self.stored_X`` / ``self.stored_y`` followed by the full current
        chunk.  An undersampled copy of the current chunk is then stored for
        later calls (keeping at most ``self.number_of_chunks`` entries).  If
        the minority/majority ratio of the training set is below
        ``self.balance_ratio`` it is oversampled before
        ``self.base_classifier`` is fitted and kept as ``self.clf``.

        Parameters
        ----------
        X : sequence of feature rows
            Samples of the current chunk.
        y : array-like
            Labels of the current chunk (un-encoded).
        classes : sequence, optional
            All class labels.  When omitted, the labels stored in
            ``self.classes`` are used to look for the ``"positive"`` label.
        """
        # ________________________________________
        # Initial preparation

        if self.classes is None:
            self.label_encoder = LabelEncoder()
            if classes is None:
                self.label_encoder.fit(y)
                # The fitted labels live in ``classes_`` (trailing underscore).
                self.classes = self.label_encoder.classes_
            else:
                self.label_encoder.fit(classes)
                self.classes = classes

        # A label equal to "positive" among the first two class labels is
        # taken as the minority class.  Compare by value (not identity) and
        # tolerate ``classes=None`` by falling back to the stored labels.
        candidates = self.classes if classes is None else classes
        if len(candidates) >= 2 and "positive" in (candidates[0],
                                                   candidates[1]):
            pos, neg = (0, 1) if candidates[0] == "positive" else (1, 0)
            # ``transform`` expects a 1-D sequence, not a bare scalar.
            encoded = self.label_encoder.transform(
                [candidates[pos], candidates[neg]])
            self.minority_name, self.majority_name = encoded[0], encoded[1]

        y = self.label_encoder.transform(y)

        if self.minority_name is None or self.majority_name is None:
            self.minority_name, self.majority_name = minority_majority_name(y)
            # NOTE(review): the feature count is only recorded on this path,
            # exactly as before -- confirm nothing needs it otherwise.
            self.number_of_features = len(X[0])

        # ________________________________________
        # Get stored data

        new_X, new_y = [], []

        for tmp_X, tmp_y in zip(self.stored_X, self.stored_y):
            new_X.extend(tmp_X)
            new_y.extend(tmp_y)

        new_X.extend(X)
        new_y.extend(y)

        new_X = np.array(new_X)
        new_y = np.array(new_y)

        # ________________________________________
        # Undersample and store new data

        und_X, und_y = self.undersampling.fit_resample(X, y)

        self.stored_X.append(und_X)
        self.stored_y.append(und_y)

        if len(self.stored_X) > self.number_of_chunks:
            del self.stored_X[0]
            del self.stored_y[0]

        # ________________________________________
        # Oversample when below ratio

        minority, majority = minority_majority_split(
            new_X, new_y, self.minority_name, self.majority_name)

        # Without majority rows the ratio is undefined (it used to raise
        # ZeroDivisionError) and there is nothing to balance against.
        if len(majority) > 0:
            ratio = len(minority) / len(majority)
            if ratio < self.balance_ratio:
                new_X, new_y = self.oversampling.fit_resample(new_X, new_y)

        # ________________________________________
        # Train classifier

        self.clf = self.base_classifier.fit(new_X, new_y)
예제 #11
0
파일: rea.py 프로젝트: ibnoe/master-thesis
    def _resample(self, X, y):
        """Rebalance the current chunk with archived minority rows.

        The first call only starts the archive (``self.minority_data``) and
        returns the chunk unchanged.  Afterwards, when the minority share of
        the chunk is below ``self.balance_ratio``, archived minority rows are
        added: all of them if the share stays within the target, otherwise
        the rows whose nearest neighbours in the current chunk contain the
        most minority samples.  The current minority rows are always added to
        the archive before returning.

        Parameters
        ----------
        X : array-like
            Feature rows of the current chunk.
        y : array-like
            Encoded labels of the current chunk.

        Returns
        -------
        tuple
            ``(res_X, res_y)`` -- the rebalanced rows and labels, or the
            unchanged chunk when no rebalancing was needed.
        """
        y = np.array(y)
        X = np.array(X)

        minority, majority = minority_majority_split(X, y, self.minority_name,
                                                     self.majority_name)

        # First chunk: nothing archived yet, so just start the archive.
        if self.minority_data is None:
            self.minority_data = minority
            self.iterator += 1
            return X, y

        ratio = len(minority) / float(len(X))

        if self.balance_ratio > ratio:
            n_stored = len(self.minority_data)
            share_with_all = ((len(minority) + n_stored) /
                              float(len(X) + n_stored))

            if share_with_all <= self.balance_ratio:
                new_minority = np.concatenate((minority, self.minority_data),
                                              axis=0)
            else:
                # kneighbors cannot return more neighbours than fitted rows.
                knn = NearestNeighbors(n_neighbors=min(10, len(X))).fit(X)

                indices = knn.kneighbors(self.minority_data,
                                         return_distance=False)

                # Per archived row: how many of its neighbours in the current
                # chunk are minority samples.
                min_count = np.count_nonzero(y[indices] == self.minority_name,
                                             axis=1)

                # Archived row indexes, highest neighbour count first.
                ranked = np.argsort(min_count)[::-1]

                n_instances = int((self.balance_ratio - ratio) * len(y))

                # n_instances is the number of rows to ADD on top of the
                # current minority (it is derived from the current ratio), so
                # the current minority rows must stay in the result.  Slicing
                # caps the request at the archive size.
                selected = self.minority_data[ranked[:n_instances]]
                new_minority = np.concatenate((minority, selected), axis=0)

            res_X = np.concatenate((new_minority, majority), axis=0)
            res_y = np.concatenate(
                (np.full(len(new_minority), self.minority_name),
                 np.full(len(majority), self.majority_name)),
                axis=0)

        else:
            res_X = X
            res_y = y

        self.minority_data = np.concatenate((minority, self.minority_data),
                                            axis=0)
        self.iterator += 1

        return res_X, res_y
예제 #12
0
파일: dse.py 프로젝트: ibnoe/master-thesis
    def partial_fit(self, X, y, classes=None):
        """Add a classifier trained on the stored chunks plus the current one.

        On the first call a ``LabelEncoder`` is fitted (on ``classes`` when
        given, otherwise on ``y``).  If a drift detector exists, its
        geometric-mean score on the current chunk is compared with the mean
        of the recorded scores; a score below 70% of that mean resets the
        detector, the score history, the ensemble and the stored chunks.
        The training set is every stored chunk followed by the full current
        chunk, oversampled when its minority/majority ratio is below
        ``self.balance_ratio``.  A clone of ``self.base_classifier`` trained
        on it is appended to ``self.classifier_array``, the oldest classifier
        and stored chunk are dropped once the ensemble reaches
        ``self.number_of_classifiers``, and the drift detector is updated.

        Parameters
        ----------
        X : sequence of feature rows
            Samples of the current chunk.
        y : array-like
            Labels of the current chunk (un-encoded).
        classes : sequence, optional
            All class labels.  When omitted, the labels stored in
            ``self.classes`` are used to look for the ``"positive"`` label.
        """
        # ________________________________________
        # Initial preparation

        if self.classes is None:
            self.label_encoder = LabelEncoder()
            if classes is None:
                self.label_encoder.fit(y)
                # The fitted labels live in ``classes_`` (trailing underscore).
                self.classes = self.label_encoder.classes_
            else:
                self.label_encoder.fit(classes)
                self.classes = classes

        # A label equal to "positive" among the first two class labels is
        # taken as the minority class.  Compare by value (not identity) and
        # tolerate ``classes=None`` by falling back to the stored labels.
        candidates = self.classes if classes is None else classes
        if len(candidates) >= 2 and "positive" in (candidates[0],
                                                   candidates[1]):
            pos, neg = (0, 1) if candidates[0] == "positive" else (1, 0)
            # ``transform`` expects a 1-D sequence, not a bare scalar.
            encoded = self.label_encoder.transform(
                [candidates[pos], candidates[neg]])
            self.minority_name, self.majority_name = encoded[0], encoded[1]

        y = self.label_encoder.transform(y)

        if self.minority_name is None or self.majority_name is None:
            self.minority_name, self.majority_name = minority_majority_name(y)
            # NOTE(review): the feature count is only recorded on this path,
            # exactly as before -- confirm nothing needs it otherwise.
            self.number_of_features = len(X[0])

        # ________________________________________
        # Drift detector

        if self.drift_detector is not None:
            dd_pred = self.drift_detector.predict(X)
            # The metric takes (y_true, y_pred); the arguments were swapped.
            score = geometric_mean_score(y, dd_pred)
            # With no history (right after a reset) or a zero baseline there
            # is nothing to compare against, so the score is only recorded.
            # This matches the old outcome, where NaN/inf compared False.
            baseline = (np.mean(self.metrics_array)
                        if self.metrics_array else 0)
            if baseline and score / baseline < 0.7:
                self.drift_detector = None
                self.metrics_array = []
                self.classifier_array = []
                self.stored_X = []
                self.stored_y = []
            else:
                self.metrics_array.append(score)

        # ________________________________________
        # Get stored data

        new_X, new_y = [], []

        for tmp_X, tmp_y in zip(self.stored_X, self.stored_y):
            new_X.extend(tmp_X)
            new_y.extend(tmp_y)

        new_X.extend(X)
        new_y.extend(y)

        new_X = np.array(new_X)
        new_y = np.array(new_y)

        # ________________________________________
        # Undersample and store new data

        und_X, und_y = self.undersampling.fit_resample(X, y)

        self.stored_X.append(und_X)
        self.stored_y.append(und_y)

        # ________________________________________
        # Oversample when below ratio

        minority, majority = minority_majority_split(new_X, new_y,
                                                     self.minority_name,
                                                     self.majority_name)

        # Without majority rows the ratio is undefined (it used to raise
        # ZeroDivisionError) and there is nothing to balance against.
        if len(majority) > 0:
            ratio = len(minority) / len(majority)
            if ratio < self.balance_ratio:
                new_X, new_y = self.oversampling.fit_resample(new_X, new_y)

        # ________________________________________
        # Train new classifier

        self.classifier_array.append(
            clone(self.base_classifier).fit(new_X, new_y))
        if len(self.classifier_array) >= self.number_of_classifiers:
            del self.classifier_array[0]
            del self.stored_X[0]
            del self.stored_y[0]

        if self.drift_detector is None:
            # One hidden layer of ten units; ``(10)`` was just the int 10.
            self.drift_detector = MLPClassifier(hidden_layer_sizes=(10,))
        self.drift_detector.partial_fit(new_X, new_y, np.unique(new_y))

        self.iteration += 1