Example #1
class cdht(ClassifierMixin, BaseEstimator):
    def __init__(self, alpha=0.001, drift_detector="KSWIN"):
        self.classifier = HoeffdingTree()
        self.init_drift_detection = True
        self.drift_detector = drift_detector.upper()
        self.confidence = alpha
        self.n_detections = 0

    def partial_fit(self, X, y, classes=None):
        """
            Calls the MultinomialNB partial_fit from sklearn.
            ----------
            x : array-like, shape = [n_samples, n_features]
              Training vector, where n_samples in the number of samples and
              n_features is the number of features.
            y : array, shape = [n_samples]
              Target values (integers in classification, real numbers in
              regression)
            Returns
            --------
            """
        if self.concept_drift_detection(X, y):
            self.classifier.reset()

        self.classifier.partial_fit(X, y, classes)
        return self

    def predict(self, X):
        return self.classifier.predict(X)

    def concept_drift_detection(self, X, Y):
        if self.init_drift_detection:
            # one detector per feature, chosen by name
            if self.drift_detector == "KSWIN":
                self.cdd = [
                    KSWIN(w_size=100, stat_size=30, alpha=self.confidence)
                    for _ in X.T
                ]
            elif self.drift_detector == "ADWIN":
                self.cdd = [ADWIN() for _ in X.T]
            elif self.drift_detector == "DDM":
                self.cdd = [DDM() for _ in X.T]
            elif self.drift_detector == "EDDM":
                self.cdd = [EDDM() for _ in X.T]
            self.init_drift_detection = False
        self.drift_detected = False

        if not self.init_drift_detection:
            for elem, detector in zip(X.T, self.cdd):
                for e in elem:
                    detector.add_element(e)
                    if detector.detected_change():
                        self.drift_detected = True
                        self.n_detections = self.n_detections + 1

        return self.drift_detected


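A minimal usage sketch for the class above (hypothetical driver code; it assumes the skmultiflow imports the class relies on, i.e. HoeffdingTree, KSWIN, ADWIN, DDM, and EDDM, are in scope):

from skmultiflow.data import SEAGenerator

# Hypothetical prequential (test-then-train) loop on a synthetic stream.
stream = SEAGenerator(random_state=1)
stream.prepare_for_use()

model = cdht(alpha=0.001, drift_detector="KSWIN")
X, y = stream.next_sample(200)
model.partial_fit(X, y, classes=stream.target_values)

correct, n = 0, 0
while n < 5000 and stream.has_more_samples():
    X, y = stream.next_sample()
    correct += int(model.predict(X)[0] == y[0])  # test first ...
    model.partial_fit(X, y)                      # ... then train
    n += 1
print("accuracy:", correct / n, "drift detections:", model.n_detections)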
Example #2
def evaluation1():
    classifiers = [
        OzaBagging(base_estimator=HoeffdingTree()),
        OzaBaggingAdwin(base_estimator=HoeffdingTree())
    ]  # array of classification algorithms to be tested
    cv = CrossValidation(clfs=classifiers, max_samples=1000000, test_size=1)
    # initialize the stream generators from the scikit-multiflow package
    cv.streams = (cv.init_standard_streams() + cv.init_real_world() +
                  cv.init_reoccuring_streams())
    cv.test()
    cv.save_summary()
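Note: CrossValidation here appears to be a project-specific evaluation helper rather than a scikit-multiflow class; the snippet assumes it is importable from the surrounding project.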
Example #3

def filter_instance_to_leaves(self,
                              X,
                              y,
                              weight,
                              parent,
                              parent_branch,
                              update_splitter_counts=False,
                              found_nodes=None):
    if found_nodes is None:
        found_nodes = []
    if update_splitter_counts:
        try:
            # dictionary keyed by class value, accumulating observed weight
            self._observed_class_distribution[y] += weight
        except KeyError:
            self._observed_class_distribution[y] = weight
    child_index = self.instance_child_index(X)
    if child_index >= 0:
        child = self.get_child(child_index)
        if child is not None:
            child.filter_instance_to_leaves(X, y, weight, parent,
                                            parent_branch,
                                            update_splitter_counts,
                                            found_nodes)
        else:
            found_nodes.append(
                HoeffdingTree.FoundNode(None, self, child_index))
    if self._alternate_tree is not None:
        self._alternate_tree.filter_instance_to_leaves(
            X, y, weight, self, -999, update_splitter_counts, found_nodes)
Example #4

def filter_instance_to_leaves(self,
                              X,
                              y,
                              weight,
                              parent,
                              parent_branch,
                              update_splitter_counts,
                              found_nodes=None):
    if found_nodes is None:
        found_nodes = []
    # leaf node: record itself as the found node
    found_nodes.append(HoeffdingTree.FoundNode(self, parent, parent_branch))
Example #5
    def parameter_q_and_t(self):
        accuracy_of_combinations = []
        combination = []
        quantile_percent = [0.50, 0.75, 1.0]
        threshold = [0.5, 0.6, 0.7]
        test_X, test_y = get_data_batches(self.X_array, self.y_array)
        ensemble_clf = DecisionTreeClassifier()
        clf = HoeffdingTree()
        bootstrap_count = 100
        for q in quantile_percent:
            for t in threshold:
                Train_X = test_X[0]
                Train_y = test_y[0].flatten()
                clf = clf.fit(Train_X, Train_y)
                MPD3_detector = MPD3(bootstrap_count, q, t)
                ensemble = MPD3_detector.ensemble_bootstrap(Train_X, Train_y)
                batch_accuracy = []
                result = []
                for i in range(len(test_X) - 1):
                    index = i + 1
                    prediction = clf.predict(test_X[index])
                    batch_accuracy.append(
                        accuracy_score(test_y[index], prediction))
                    mpd_value = MPD3_detector.MPD_score(
                        test_X[index], ensemble)

                    if MPD3_detector.drift_check(mpd_value):
                        Train_X = test_X[index]
                        Train_y = test_y[index].flatten()
                        clf = clf.partial_fit(Train_X, Train_y)
                        ensemble = MPD3_detector.ensemble_bootstrap(
                            Train_X, Train_y)

                mean_accuracy = np.average(batch_accuracy)
                accuracy_of_combinations.append(mean_accuracy)
                combination.append([q, t])
        index_of_max_acc = np.argmax(accuracy_of_combinations)
        final_q, final_t = combination[index_of_max_acc]
        return final_q, final_t
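The method above performs a small grid search over the quantile q and drift threshold t of the MPD3 detector, keeping the (q, t) pair that yields the best mean batch accuracy of a Hoeffding tree that is retrained whenever the detector flags a drift.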
Example #6
def evaluation():
    classifiers = [
        GLVQ(prototypes_per_class=4),
        HoeffdingTree(),
        HAT(),
        KNN(),
        SAMKNN(),
        LeverageBagging(),
        KNNAdwin(max_window_size=1000)
    ]  # array of classification algorithms to be tested
    cv = CrossValidation(clfs=classifiers, max_samples=1000000, test_size=1)
    # initialize the stream generators from the scikit-multiflow package
    cv.streams = (cv.init_standard_streams() + cv.init_real_world() +
                  cv.init_reoccuring_streams())
    cv.test()
    cv.save_summary()
Example #7
def test_multi_output_learner():

    stream = MultilabelGenerator(n_samples=5150,
                                 n_features=15,
                                 n_targets=3,
                                 n_labels=4,
                                 random_state=112)
    stream.prepare_for_use()

    classifier = MultiOutputLearner(base_estimator=HoeffdingTree())

    X, y = stream.next_sample(150)
    classifier.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(classifier.predict(X)[0])
            true_labels.append(y[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        classifier.partial_fit(X, y)
        cnt += 1

    perf = hamming_score(true_labels, predictions)
    expected_predictions = [[1., 1., 1.], [1., 1., 1.], [1., 1., 1.],
                            [1., 1., 1.], [1., 1., 1.], [0., 1., 1.],
                            [1., 0., 1.], [1., 0., 1.], [1., 1., 1.],
                            [0., 0., 1.], [0., 1., 1.], [0., 1., 1.],
                            [1., 1., 1.], [0., 1., 1.], [1., 1., 0.],
                            [1., 1., 1.], [0., 1., 1.], [1., 0., 0.],
                            [1., 0., 1.], [1., 1., 1.], [1., 0., 1.],
                            [1., 1., 1.], [1., 1., 1.], [1., 1., 1.],
                            [1., 1., 1.], [1., 1., 1.], [1., 1., 1.],
                            [1., 0., 0.], [0., 1., 1.], [1., 1., 0.],
                            [1., 1., 1.], [0., 1., 1.], [1., 1., 1.],
                            [0., 1., 1.], [1., 0., 1.], [1., 0., 1.],
                            [0., 0., 1.], [0., 1., 1.], [1., 1., 0.],
                            [0., 1., 1.], [1., 1., 1.], [1., 1., 1.],
                            [1., 0., 1.], [1., 1., 1.], [1., 1., 1.],
                            [1., 0., 1.], [1., 1., 1.], [1., 1., 1.],
                            [0., 1., 1.]]
    expected_correct_predictions = 32

    expected_performance = 0.8503401360544217

    assert np.alltrue(np.array_equal(predictions, expected_predictions))
    assert np.isclose(expected_performance, perf)
    assert correct_predictions == expected_correct_predictions

    assert type(classifier.predict(X)) == np.ndarray
    assert type(classifier.predict_proba(X)) == np.ndarray
Example #8

    df.iloc[:, 0:df.shape[1] - 1] = scaler.fit_transform(
        df.iloc[:, 0:df.shape[1] - 1])
    return df


def check_true(y, y_hat):
    if (y == y_hat):
        return 1
    else:
        return 0


df = select_data(sys.argv[1])
stream = DataStream(df)
stream.prepare_for_use()
stream_clf = HoeffdingTree()
w = int(sys.argv[2])
rho = float(sys.argv[3])
auc = float(sys.argv[4])


D3_win = D3(w, rho, stream.n_features, auc)
stream_acc = []
stream_record = []
stream_true = 0

i = 0
start = time.time()
X, y = stream.next_sample(int(w * rho))
stream_clf.partial_fit(X, y, classes=stream.target_values)
Example #9
from skmultiflow.data import WaveformGenerator
from skmultiflow.trees.hoeffding_tree import HoeffdingTree
from skmultiflow.evaluation.evaluate_prequential import EvaluatePrequential

# 1. Create a stream
stream = WaveformGenerator()
stream.prepare_for_use()

# 2. Instantiate the HoeffdingTree classifier
ht = HoeffdingTree()

# 3. Setup the evaluator
evaluator = EvaluatePrequential(show_plot=False,
                                pretrain_size=200,
                                max_samples=20000)

# 4. Run evaluation
evaluator.evaluate(stream=stream, model=ht)
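Prequential (interleaved test-then-train) evaluation scores each incoming sample with the current model before using it for training, so no separate holdout set is needed.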
Example #10
        labels.columns = ['class']
        n_samples = XT.shape[0] - preparatory_size

        ######################## CURIE ###################

        lst_dim = [n_bins] * n_feats
        curie = CA_VonNeumann_Classifier(bins=[],
                                         bins_margin=bins_margin,
                                         dimensions=lst_dim,
                                         cells=empties(lst_dim))
        limits_automata = list(np.zeros(1))
        #ca_names=['CURIE']
        mutants_time = empty_mutant(curie.dimensions)

        ######################## LEARNERS ###################
        learners_ref = [HoeffdingTree(), KNN(), NaiveBayes()]
        ######################## DETECTORS ###################
        detectores_ref = [DDM(), EDDM(), ADWIN(), PageHinkley(), curie]

        n_pasos = len(datasets) * len(tipos) * len(learners_ref) * len(
            detectores_ref)

        SCORES_LER = []
        TIMES_LER = []
        RAMS_LER = []
        DETECTIONS_LER = []

        for ler in range(len(learners_ref)):

            learner = deepcopy(learners_ref[ler])
Example #11
def unsupervised_analysis(df, nu, size, percent):
    stream = DataStream(df)
    stream.prepare_for_use()
    stream_clf = HoeffdingTree()
    stream_acc = []
    stream_record = []
    stream_true = 0
    buffer = dataBuffer(size, stream.n_features, percent)
    clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma='auto')

    start = time.time()
    X, y = stream.next_sample(size)
    stream_clf.partial_fit(X, y, classes=stream.target_values)
    clf.fit(X)

    i = 0
    while stream.has_more_samples():
        X, y = stream.next_sample()
        if buffer.isEmpty():
            buffer.addInstance(X, y, clf.predict(X))
            y_hat = stream_clf.predict(X)
            stream_true = stream_true + check_true(y, y_hat)
            stream_clf.partial_fit(X, y)
            stream_acc.append(stream_true / (i + 1))
            stream_record.append(check_true(y, y_hat))
        else:
            if buffer.driftCheck():
                # print("concept drift detected at {}".format(i))
                # retrain the model on the buffered window
                stream_clf.reset()
                stream_clf.partial_fit(buffer.getCurrentData(),
                                       buffer.getCurrentLabels(),
                                       classes=stream.target_values)
                # update the one-class SVM
                clf.fit(buffer.getCurrentData())
            # evaluate and update the model
            y_hat = stream_clf.predict(X)
            stream_true = stream_true + check_true(y, y_hat)
            stream_clf.partial_fit(X, y)
            stream_acc.append(stream_true / (i + 1))
            stream_record.append(check_true(y, y_hat))
            # add the new sample to the window
            buffer.addInstance(X, y, clf.predict(X))
        i = i + 1

    elapsed = format(time.time() - start, '.4f')
    acc = format(stream_acc[-1] * 100, '.4f')
    final_accuracy = "Parameters: {}, {}, {}, Final accuracy: {}, Elapsed time: {}".format(
        nu, size, percent, acc, elapsed)
    return final_accuracy, stream_record
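Design note: the one-class SVM serves as an unsupervised drift signal here; when buffer.driftCheck() fires, the Hoeffding tree is reset and retrained on the buffered window rather than waiting for (possibly delayed) labels.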
Example #12
### Hoeffding Tree Online Classifier ###
### HT Online for RBF ###

HT = HoeffdingTree()
positive = 0
cnt = 1
temp_accuracy = []
itr = []
HT_RBF_prediction = []
for i in range(len(RBF_X)):
    tempx = np.array([RBF_X[i]])
    tempy = np.array([RBF_Y[i]])
    # prequential: predict first, then train on the same sample
    prediction = HT.predict(tempx)
    if tempy == prediction:
        positive += 1
    temp_accuracy.append(positive / cnt)
    HT_RBF_prediction.append(int(prediction[0]))  # np.int is deprecated
    HT.partial_fit(tempx, tempy)
    itr.append(i)
    cnt += 1
Example #13
def __init__(self, alpha=0.001, drift_detector="KSWIN"):
    self.classifier = HoeffdingTree()
    self.init_drift_detection = True
    self.drift_detector = drift_detector.upper()
    self.confidence = alpha
    self.n_detections = 0
Example #14
def hyperparametertuning_classifiers(classifiers, scoring, cv, X_init, y_init,
                                     max_iter, knn_max_window_size):

    for cl in range(len(classifiers)):

        cl_name = classifiers[cl].__class__.__name__

        if cl_name == 'PassiveAggressiveClassifier':
            #            print (cl_name,' tuning ...')

            PAC_grid = {
                'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0],
                'max_iter': [max_iter, 100, 200, 500]
            }

            grid_cv_PAC = RandomizedSearchCV(classifiers[cl],
                                             PAC_grid,
                                             cv=cv,
                                             scoring=scoring)
            grid_cv_PAC.fit(X_init, y_init)
            classifiers[cl] = grid_cv_PAC.best_estimator_

        elif cl_name == 'SGDClassifier':
            #            print (cl_name,' tuning ...')

            SGDC_grid = {
                'alpha':
                10.0**-np.arange(1, 7),
                'loss': [
                    'perceptron', 'hinge', 'log', 'modified_huber',
                    'squared_hinge'
                ],
                'learning_rate':
                ['constant', 'optimal', 'invscaling', 'adaptive'],
                'eta0': [0.1, 0.5, 1.0],
                'penalty': [None, 'l2', 'l1', 'elasticnet'],
                'max_iter': [max_iter, 100, 200, 500]
            }

            grid_cv_SGDC = RandomizedSearchCV(classifiers[cl],
                                              SGDC_grid,
                                              cv=cv,
                                              scoring=scoring)
            grid_cv_SGDC.fit(X_init, y_init)
            classifiers[cl] = grid_cv_SGDC.best_estimator_

        elif cl_name == 'MLPClassifier':
            #            print (cl_name,' tuning ...')

            MLPC_grid = {
                'hidden_layer_sizes': [(50, ), (100, ), (50, 50), (100, 100)],
                'activation': ['identity', 'logistic', 'tanh', 'relu'],
                'solver': ['sgd', 'adam'],
                'learning_rate': ['constant', 'invscaling', 'adaptive'],
                'learning_rate_init': [0.0005, 0.001, 0.005],
                'alpha': 10.0**-np.arange(1, 10),
                'batch_size': [1, 'auto'],
                'max_iter': [1, 100, 200, 500]
            }

            grid_cv_MLPC = RandomizedSearchCV(classifiers[cl],
                                              MLPC_grid,
                                              cv=cv,
                                              scoring=scoring)
            grid_cv_MLPC.fit(X_init, y_init)
            classifiers[cl] = grid_cv_MLPC.best_estimator_

        elif cl_name == 'MondrianTreeClassifier':
            #            print (cl_name,' tuning ...')

            MTC_grid = {
                'max_depth':
                [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110],
                'min_samples_split': [2, 5, 10]
            }

            grid_cv_MTC = RandomizedSearchCV(classifiers[cl],
                                             MTC_grid,
                                             cv=cv,
                                             scoring=scoring)
            grid_cv_MTC.fit(X_init, y_init)
            classifiers[cl] = grid_cv_MTC.best_estimator_

        elif cl_name == 'MondrianForestClassifier':
            #            print (cl_name,' tuning ...')

            MFR_grid = {
                'n_estimators': [5, 10, 25, 50, 100],
                'max_depth':
                [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110],
                'min_samples_split': [2, 5, 10]
            }

            grid_cv_MFC = RandomizedSearchCV(classifiers[cl],
                                             MFR_grid,
                                             cv=cv,
                                             scoring=scoring)
            grid_cv_MFC.fit(X_init, y_init)
            classifiers[cl] = grid_cv_MFC.best_estimator_

        elif cl_name == 'KNN':
            #            print (cl_name, ' No tuning yet! ')

            KNN_grid = {
                'n_neighbors': [5, 10, 15, 20],
                #                          'max_window_size': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                'leaf_size': [5, 10, 20, 30],
                'algorithm': ['auto'],
                'weights': ['uniform', 'distance']
            }

            grid_cv_KNN = RandomizedSearchCV(KNeighborsClassifier(),
                                             KNN_grid,
                                             cv=cv,
                                             scoring=scoring)
            grid_cv_KNN.fit(X_init, y_init)
            #            print('grid_cv_KNN.best_params_: ',grid_cv_KNN.best_params_)
            n_neighbors = grid_cv_KNN.best_params_['n_neighbors']
            leaf_size = grid_cv_KNN.best_params_['leaf_size']

            classifiers[cl] = lazy.KNN(n_neighbors=n_neighbors,
                                       max_window_size=knn_max_window_size,
                                       leaf_size=leaf_size)


#        elif cl_name=='VFDR':
#            print (cl_name, ' No tuning yet! ')

#            VFDR_grid = {'ordered_rules': [True,False],
#                          'rule_prediction': ['first_hit','weighted_max','weighted_sum'],
#                          'max_rules': [5,10,20,30,50],
#                          'drift_detector': [None],
#                          'expand_criterion': ['info_gain','hellinger','foil_gain']
#                          }

#            classifiers[cl]=VFDR()

        elif cl_name == 'HoeffdingTree':
            print(cl_name, ' No tuning yet! ')
            classifiers[cl] = HoeffdingTree()

        elif cl_name == 'GaussianNB':
            classifiers[cl] = GaussianNB()

    return classifiers
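A hypothetical call sketch for the function above (assumes scikit-learn is available and that the function's module already imports RandomizedSearchCV; the toy data below is illustrative only):

from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier

# Toy initial batch to tune on.
X0, y0 = make_classification(n_samples=200, n_features=10, random_state=0)
tuned = hyperparametertuning_classifiers(
    classifiers=[SGDClassifier(), PassiveAggressiveClassifier()],
    scoring='accuracy', cv=3, X_init=X0, y_init=y0,
    max_iter=50, knn_max_window_size=100)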
Example #15
    SCORES_sgdc = []
    SCORES_htc = []
    SCORES_mtc = []
    SCORES_pac = []
    SCORES_gnbc = []
    SCORES_knn = []
    SCORES_mlpc = []

    for ru in range(runs):

        print('-RUN=' + str(ru))

        # Define the stream learners
        SGDC = SGDClassifier()
        HTC = HoeffdingTree()
        MTC = MondrianTreeClassifier()
        PAC = PassiveAggressiveClassifier()
        GNBC = GaussianNB()
        KNN = lazy.KNN()
        MLPC = MLPClassifier()

        classifiers = [SGDC, HTC, PAC, KNN]

        #Data
        features = pd.DataFrame(XT)
        labels = pd.DataFrame(YT)
        features.columns = columns
        labels.columns = ['class']

        #Data slicing