Exemplo n.º 1
0
def test_page_hinkley(test_path):
    """
    Page-Hinkley drift detection regression test.

    Loads ``drift_stream.npy`` from ``test_path``, feeds every element to a
    ``PageHinkley`` detector built with default arguments, and checks that the
    positions at which ``detected_change()`` is true match the recorded list
    exactly.

    NOTE(review): the earlier note described the stream as integers from
    0 to 1 in its first half and integers from 0 to 7 from index 999 to 1999;
    that is a property of the data file and cannot be confirmed from here.
    """
    stream_path = os.path.join(test_path, 'drift_stream.npy')
    samples = np.load(stream_path)
    detector = PageHinkley()

    # Reference positions recorded for this stream with the default detector.
    expected = [
        28, 57, 86, 115, 145, 174, 203, 232, 262, 292,
        322, 352, 382, 411, 441, 471, 500, 530, 560, 589,
        618, 648, 678, 708, 737, 767, 796, 826, 856, 885,
        914, 943, 973, 1002, 1031, 1060, 1090, 1120, 1150, 1179,
        1208, 1237, 1266, 1295, 1325, 1354, 1383, 1413, 1443, 1472,
        1502, 1532, 1562, 1591, 1620, 1649, 1678, 1708, 1738, 1768,
        1798, 1828, 1857, 1887, 1916, 1946, 1975,
    ]

    change_positions = []
    for position in range(samples.size):
        detector.add_element(samples[position])
        if not detector.detected_change():
            continue
        change_positions.append(position)

    assert change_positions == expected
Exemplo n.º 2
0
def _new_drift_detector(detector):
    """Return a fresh detector for the name *detector*, or None if unknown."""
    if detector == "HDDM":
        return HDDM_W()
    if detector == "Page Hinkley":
        return PageHinkley()
    if detector == "ADWIN":
        return ADWIN()
    return None


def perform_drift_detection(predict_dataframe,
                            dataframe,
                            feature_names,
                            detector,
                            drift_notification,
                            token="") -> str:
    """Detect per-feature drift in ``predict_dataframe`` against a baseline.

    The rows of ``dataframe`` (baseline) followed by the rows of
    ``predict_dataframe`` are streamed, one feature column at a time, through
    a fresh detector instance. A change is recorded only when it is signalled
    at or after the first prediction row, and it is stored as a row position
    relative to the start of the prediction data.

    Args:
        predict_dataframe: data to check; its rows come after the baseline.
        dataframe: baseline data; its row count is the offset at which
            recorded positions start.
        feature_names: column names for the combined data; each one is
            checked independently.
        detector: one of "HDDM", "Page Hinkley" or "ADWIN". Any other value
            runs no detection, so no drift is reported.
        drift_notification: when truthy, a web notification is submitted for
            every feature in which drift was found.
        token: passed to the logging and notification helpers.

    Returns:
        JSON text mapping each drifted feature name to its list of drifted
        row positions (an empty JSON object when nothing drifted).
    """
    log("[INFO] Calling perform_drift_detection", token)
    log("[INFO] Selected data drift detection method: " + detector)
    baseline_data = dataframe.values.tolist()
    predict_data = predict_dataframe.values.tolist()
    # Baseline rows first, so the detector has seen them before any
    # prediction row arrives.
    overall_dataframe = pd.DataFrame(baseline_data + predict_data,
                                     columns=feature_names)
    drifts = dict()
    window = len(baseline_data)
    for feature in feature_names:
        detected_drifts_indices = list()
        # A new detector per feature keeps the columns independent.
        drift_detector = _new_drift_detector(detector)
        if drift_detector is not None:
            # Fetch the column once instead of once per element.
            for i, value in enumerate(overall_dataframe[feature]):
                drift_detector.add_element(float(value))
                if drift_detector.detected_change() and i >= window:
                    detected_drifts_indices.append(i - window)
        # Check for detected drifts
        if detected_drifts_indices:
            log("[INFO] Data drift detected in feature: " + feature)
            log("[INFO] The drifted rows are: " + str(detected_drifts_indices))
            drifts[feature] = detected_drifts_indices
            if drift_notification:
                log("[INFO] Sending a web notification", token)
                message = "MaaS data drift detected from " + get_token_user(
                    token) + " (" + token + ")"
                if submit_web_notification(message, token):
                    log("[INFO] Web notification sent!")
                else:
                    log("[ERROR] Error occurred while sending a web notification"
                        )
    return json.dumps(drifts, cls=NpEncoder)
Exemplo n.º 3
0
def test_page_hinkley(test_path):
    """
    Page-Hinkley drift detection regression test.

    Loads ``drift_stream.npy`` from ``test_path``, feeds every element to a
    ``PageHinkley`` detector built with default arguments, and checks that the
    positions at which ``detected_change()`` is true match the recorded list
    exactly.

    NOTE(review): the earlier note described the stream as integers from
    0 to 1 in its first half and integers from 0 to 7 from index 999 to 1999;
    that is a property of the data file and cannot be confirmed from here.
    """
    stream_path = os.path.join(test_path, 'drift_stream.npy')
    samples = np.load(stream_path)
    detector = PageHinkley()

    # Reference positions recorded for this stream with the default detector.
    expected = [1013, 1335, 1505, 1758]

    change_positions = []
    for position in range(samples.size):
        detector.add_element(samples[position])
        if not detector.detected_change():
            continue
        change_positions.append(position)

    assert change_positions == expected
def initialize_detectors(detector_type):
    """Build and return a new drift detector for ``detector_type``.

    Only the requested detector is constructed; the other configurations are
    kept as zero-argument factories and never run.

    Args:
        detector_type: one of 'HDDDM', 'HDDDM_diff', 'STEPD', 'MK',
            'MK_diff', 'ADWIN', 'PH' or 'PH_diff'.

    Returns:
        A freshly constructed detector with the configuration listed below.

    Raises:
        KeyError: if ``detector_type`` is not one of the names above.
    """
    # note PH test uses differenced raw data! [168,24]
    # note MK_diff test uses differenced raw data! [168,24]
    # Note: HDDDM_diff is configured exactly like "HDDDM", but it must keep a
    # distinct name because the retrain function looks for the "diff" term in
    # the name.

    # Shared warm-up / window size used by most detectors.
    # NOTE(review): presumably 3 * 4 weeks of 168 hourly samples - confirm.
    min_instances = 3*4*168

    detector_factories = {
        'HDDDM': lambda: HDDDM(min_instances, gamma=1.5),
        'HDDDM_diff': lambda: HDDDM(min_instances, gamma=1.5),
        'STEPD': lambda: STEPD(min_instances),
        'MK': lambda: MannKendall(min_instances=min_instances,
                                  instances_step=168,
                                  test_type='seasonal',
                                  alpha=0.01,
                                  period=52,
                                  slope_threshold=0.05),
        'MK_diff': lambda: MannKendall(min_instances=min_instances,
                                       instances_step=168,
                                       test_type='original_mk',
                                       alpha=0.05,
                                       slope_threshold=0.00),
        'ADWIN': lambda: ADWIN(delta=0.0007),
        'PH': lambda: PageHinkley(min_instances=min_instances,
                                  threshold=700,
                                  delta=900),
        'PH_diff': lambda: PageHinkley(min_instances=min_instances,
                                       threshold=1200,
                                       delta=1000),
    }

    return detector_factories[detector_type]()
Exemplo n.º 5
0
def skmultiflow_detector(drift_detector_type: str) -> BaseDriftDetector:
    """Create the drift detector selected by ``drift_detector_type``.

    Recognised names are "SKMULTIFLOW_EDDM", "SKMULTIFLOW_PageHinkley",
    "SKMULTIFLOW_DDM" and "SKMULTIFLOW_ADWIN"; each yields a new instance of
    the matching class built with default arguments.

    Raises:
        Exception: if the name is not one of the four above.
    """
    if drift_detector_type == "SKMULTIFLOW_EDDM":
        return EDDM()
    if drift_detector_type == "SKMULTIFLOW_PageHinkley":
        return PageHinkley()
    if drift_detector_type == "SKMULTIFLOW_DDM":
        return DDM()
    if drift_detector_type == "SKMULTIFLOW_ADWIN":
        return ADWIN()
    raise Exception("Drift detector %s not implemented" %
                    drift_detector_type)
Exemplo n.º 6
0
# Candidate Page-Hinkley settings, one per experiment round:
# ph_param1 is passed as `threshold`, ph_param2 as `delta` (see ph1 / ph2).
ph_param1 = [25, 50, 75]
ph_param2 = [0.005, 0.01, 0.02]

# One classifier instance is shared by all rounds and updated incrementally.
knn = KNNClassifier()

# Work on the first entry of driftStreams.
stream = driftStreams[0]

# Three rounds; round i uses the i-th value of every parameter list.
for i in range(0, 3):
    # Draw 2000 samples and update the classifier with them.
    trainX, trainY = stream.next_sample(2000)
    knn.partial_fit(trainX, trainY)

    # Fresh detectors for this round.
    # NOTE(review): adwin_param, ddm_param and ks_param1 are not defined in
    # this fragment - presumably three-element lists declared earlier; confirm.
    adwin = ADWIN(delta=adwin_param[i])
    ddm = DDM(out_control_level=ddm_param[i])
    kswin1 = KSWIN(window_size=ks_param1[i])
    # Disabled second KSWIN configuration:
    # kswin2 = KSWIN(stat_size=ks_param2[i])
    ph1 = PageHinkley(threshold=ph_param1[i])
    ph2 = PageHinkley(delta=ph_param2[i])

    # Per-detector result buffers, reset every round.
    # NOTE(review): kswin2_results is still created although the kswin2
    # detector above is commented out.
    adwin_results = []
    ddm_results = []
    kswin1_results = []
    kswin2_results = []
    ph1_results = []
    ph2_results = []

    # Counters for this round.
    n_samples = 0
    corrects = 0

    coldstartData = []
    # Pull samples from the stream while n_samples is below 2000.
    while n_samples < 2000:
        X, y = stream.next_sample()
Exemplo n.º 7
0
    return stream


def drift_flow(stream, method, name, beginning_stream, end_tables):
    """Run one detector over ``stream``, report its signals and plot them.

    Every element is passed to ``method.add_element``. Elements for which the
    detector reports a warning zone are collected; for the change signal a
    list as long as the stream is built, holding the element where a change
    was reported and ``None`` elsewhere. Per-element messages and final
    totals are printed, then ``plots`` is called with the stream, the change
    list, ``name``, ``beginning_stream`` and ``end_tables``.
    """
    change_markers = []
    warning_values = []
    change_count = 0
    for idx in range(len(stream)):
        value = stream[idx]
        method.add_element(value)
        if method.detected_warning_zone():
            print(f'Warning zone has been detected in data: {value} - of index: {idx}')
            warning_values.append(value)
        if not method.detected_change():
            # Keep the change list aligned with the stream.
            change_markers.append(None)
            continue
        change_markers.append(value)
        print(f'Change has been detected in data: {value} - of index: {idx}')
        change_count += 1
    print(f'{name} Detected changes: {change_count}')
    print(f'{name} Detected warning zones: {len(warning_values)}')
    plots(stream, change_markers, name, beginning_stream, end_tables)


stream = make_stream(PATH)

# Run each detector, built with default arguments, over the same stream.
# A new instance is created immediately before its own run.
for _detector_cls, _label in (
        (EDDM, 'EDDM'),
        (HDDM_A, 'HDDM_A'),
        (HDDM_W, 'HDDM_W'),
        (PageHinkley, 'PH'),
        (DDM, 'DDM'),
):
    drift_flow(stream, _detector_cls(), _label, 0, 500)
Exemplo n.º 8
0
elif DETECTOR == "HDDM_A":
    print ("HDDM_A")
    nama_model = nama_model+DETECTOR
    detector = HDDM_A()
elif DETECTOR == "HDDM_W":
    print ("HDDM_W")
    nama_model = nama_model+DETECTOR
    detector = HDDM_W()
elif DETECTOR == "KSWIN":
    print ("KSWIN")
    nama_model = nama_model+DETECTOR
    detector = KSWIN()
elif DETECTOR == "PageHinkley":
    print ("PageHinkley")
    nama_model = nama_model+DETECTOR
    detector = PageHinkley()
elif DETECTOR =="KD3":
    nama_model = nama_model+DETECTOR
    detector= KD3(window_size=args.window_size, 
            accumulative_threshold=args.p2, 
            detection_threshold=args.p1,bandwidth=0.75)
else:
    detector=None

# Encode every distinct label as its position in the list of unique labels,
# then rewrite the 'label' column in place with those codes.
labels = train_dataset['label'].unique().tolist()
mapping = {label: code for code, label in enumerate(labels)}
train_dataset.replace({'label': mapping}, inplace=True)

# Dataset name with every "final_800_" and ".pickle" occurrence removed.
ds = args.dataset.replace("final_800_", "").replace(".pickle", "")
Exemplo n.º 9
0
        def test_on_data_set(data_desc, D):
            """Score six drift detectors on one data set.

            ``D["data"]`` is unpacked as ``(X, Y)`` and ``D["drifts"]`` is
            passed to ``evaluate`` as the reference drifts. The first 200 rows
            are the initial training part; the rest is streamed to the
            detectors. HDDDM and SWIDD run unsupervised on the features with
            the label appended as a last column; EDDM, DDM, ADWIN and
            PageHinkley run supervised around a ``GaussianNB``-backed
            ``Classifier``.

            Returns a dict ``{data_desc: {detector_name: [scores]}}`` with one
            ``evaluate`` result appended per detector.
            """
            detector_names = ("HDDDM", "SWIDD", "EDDM", "DDM", "ADWIN", "PageHinkley")
            r = {data_desc: {detector_name: [] for detector_name in detector_names}}

            training_buffer_size = 100  # Size of training buffer of the drift detector
            n_train = 200   # Initial training set size

            concept_drifts = D["drifts"]
            X, Y = D["data"]
            data_stream = np.concatenate((X, Y.reshape(-1, 1)), axis=1)

            # Initial training part
            X0, Y0 = X[:n_train, :], Y[:n_train, :]
            data0 = data_stream[:n_train, :]

            # Remainder, streamed to the detectors
            X_next, Y_next = X[n_train:, :], Y[n_train:, :]
            data_next = data_stream[n_train:, :]

            def record(detector_name, changes_detected):
                # Evaluate the reported changes and store the scores.
                scores = evaluate(concept_drifts, changes_detected)
                r[data_desc][detector_name].append(scores)

            def run_unsupervised(detector_name, detector, batch_size):
                # Apply an unsupervised detector to the joint feature/label stream.
                dd = DriftDetectorUnsupervised(detector, batch_size=batch_size)
                record(detector_name, dd.apply_to_stream(data_next))

            # Unsupervised detectors
            run_unsupervised("HDDDM", HDDDM(data0, gamma=None, alpha=0.005), 50)
            run_unsupervised("SWIDD", SWIDD(max_window_size=300, min_window_size=100), 1)

            # Supervised detectors share one underlying model object.
            model = GaussianNB()

            def run_supervised(detector_name, drift_detector, flip_score):
                # Fit a classifier on the training part, then stream the rest.
                clf = Classifier(model)
                if flip_score:
                    clf.flip_score = True
                clf.fit(X0, Y0.ravel())

                dd = DriftDetectorSupervised(clf=clf, drift_detector=drift_detector, training_buffer_size=training_buffer_size)
                record(detector_name, dd.apply_to_stream(X_next, Y_next))

            run_supervised("EDDM", EDDM(), True)
            run_supervised("DDM", DDM(min_num_instances=30, warning_level=2.0, out_control_level=3.0), True)
            # ADWIN is the only run that leaves flip_score at the Classifier's own setting.
            run_supervised("ADWIN", ADWIN(delta=2.), False)
            run_supervised("PageHinkley", PageHinkley(), True)

            return r
# Feed the first 1000 stream elements to the HW detector one at a time and
# report, per element, whether it signals a drift and/or a warning zone.
for j in range(1000):
    HW.add_element(stream[j])
    if HW.detected_change():
        print(f'Concept drift detected in data: {stream[j]!s} - at index: {j}')
    if HW.detected_warning_zone():
        print(f'Warning detected in data: {stream[j]!s} - at index: {j}')

# page hinkley test
import numpy as np
from skmultiflow.drift_detection import PageHinkley

# Create a Page-Hinkley detector with its default arguments.
ph = PageHinkley()

# Seed NumPy's global random generator so the stream is reproducible.
np.random.seed(123)

# Simulate a data stream of 1000 values drawn from a normal distribution
# with mean 0 and standard deviation 0.1.
stream = np.random.normal(0, 0.1, 1000)

# Change the concept: overwrite indices 299 through 799 (inclusive) with
# random integers from 5 to 8 (`high` is exclusive), one draw per index.
for j in range(299, 800):
    stream[j] = np.random.randint(5, high=9)

# Adding stream elements to the PageHinkley drift detector and verifying if drift occurred
for j in range(1000):
    ph.add_element(stream[j])