예제 #1
0
def test_regression_hoeffding_tree_categorical_features(test_path):
    """Check the tree built by ``RegressionHoeffdingTree`` on nominal features.

    Loads ``ht_categorical_features_testcase.npy`` from ``test_path``, trims
    the columns that are not needed for single-target regression, fits the
    learner in a single ``partial_fit`` call with attributes 0-6 declared
    nominal, and requires the textual model description to be more than 90%
    similar (``SequenceMatcher.ratio``) to the recorded one.
    """
    raw = np.load(
        os.path.join(test_path, 'ht_categorical_features_testcase.npy')
    )

    # Drop the class-value column: only column indices 0..6 are kept.
    kept_columns = np.delete(np.arange(8), 7)
    raw = raw[:, kept_columns]
    # Drop the trailing column (used only in the multi-target regression case).
    raw = raw[:, :-1]
    # The last remaining column is the target; everything before it is input.
    X = raw[:, :-1]
    y = raw[:, -1]

    learner = RegressionHoeffdingTree(
        nominal_attributes=np.arange(7).tolist()
    )
    learner.partial_fit(X, y)

    expected_description = (
        "if Attribute 4 = 0.0:\n"
        "  Leaf = Statistics {0: 606.0000, 1: 1212.0000, 2: 3626.0000}\n"
        "if Attribute 4 = 1.0:\n"
        "  Leaf = Statistics {0: 551.0000, 1: 1128.0000, 2: 3400.0000}\n"
        "if Attribute 4 = 2.0:\n"
        "  Leaf = Statistics {0: 566.0000, 1: 1139.0000, 2: 3423.0000}\n"
        "if Attribute 4 = 3.0:\n"
        "  Leaf = Statistics {0: 577.0000, 1: 1138.0000, 2: 3374.0000}\n"
        "if Attribute 4 = 4.0:\n"
        "  Leaf = Statistics {0: 620.0000, 1: 1233.0000, 2: 3725.0000}\n"
        "if Attribute 4 = -3.0:\n"
        "  Leaf = Statistics {0: 80.0000, 1: 163.0000, 2: 483.0000}\n"
    )

    # Fuzzy comparison: the description only has to be a close match.
    matcher = SequenceMatcher(
        None, expected_description, learner.get_model_description()
    )
    assert matcher.ratio() > 0.9
예제 #2
0
def test_hoeffding_tree_coverage(test_path):
    """Smoke test: fit a mean-leaf tree with attributes 0-2 declared nominal.

    Reads ``X`` and ``y`` from ``regression_data.npz`` under ``test_path``.
    There are no assertions; the test passes when ``partial_fit`` completes
    without raising.
    """
    # Cover nominal attribute observer
    archive = np.load(os.path.join(test_path, 'regression_data.npz'))
    X, y = archive['X'], archive['y']

    learner = RegressionHoeffdingTree(
        leaf_prediction='mean',
        nominal_attributes=list(range(3)),
    )
    learner.partial_fit(X, y)
예제 #3
0
def InnerCycle_Train(X, y, inject_drift, perc_train):
    """Fit the teacher and student models on the head of the stream.

    Parameters
    ----------
    X, y
        Features and targets; wrapped in ``DataStream(X, y)``. ``X`` must
        expose ``.shape``.
    inject_drift
        When truthy, ``Driftpoints(X)`` selects a change point and
        ``Swapcols`` replaces ``X`` using it (both helpers are defined
        outside this function).
    perc_train
        Fraction of the rows used for training; the row count is
        ``int(perc_train * X.shape[0])``.

    Returns
    -------
    dict
        Keys ``"Teacher"``, ``"Student"``, ``"StudentRegression"``,
        ``"Driftpoints"``, ``"n"``, ``"Stream"`` and ``"Xtrain"``. The
        returned stream has already had the training rows consumed.
    """
    n_train_rows = int(perc_train * X.shape[0])

    if not inject_drift:
        # No drift: the "change point" sits at the very end of the data.
        drift_info = {"row": X.shape[0], "cols": 0}
    else:
        # pick a point between 0.7 and 0.9 of the stream
        drift_info = Driftpoints(X)
        drift_info["cleanrun"] = drift_info["row"] - n_train_rows

        # contaminate X after that point
        X = Swapcols(
            df=X,
            class_vec=y,
            ids=drift_info["cols"],
            t_change=drift_info["row"],
        )

    # Wrap the (possibly altered) data as a stream.
    data_stream = DataStream(X, y)
    data_stream.prepare_for_use()

    # Teacher: the main incremental classifier.
    teacher = ARF(n_estimators=25)

    # The first n_train_rows samples form the training batch.
    train_X, train_y = data_stream.next_sample(n_train_rows)
    known_classes = data_stream.target_values

    teacher.fit(train_X, train_y, classes=known_classes)
    teacher_labels = teacher.predict(train_X)
    # Note from the original author: "needs warnings".
    teacher_proba = teacher.predict_proba(train_X)
    teacher_confidence = np.array([np.max(row) for row in teacher_proba])

    # Student classifier: learns to reproduce the teacher's labels.
    student = ARF(n_estimators=25)
    student.fit(train_X, teacher_labels, classes=known_classes)

    # Student regressor: learns the teacher's top class probability.
    confidence_model = RHT()
    confidence_model.fit(train_X, teacher_confidence)

    return {
        "Teacher": teacher,
        "Student": student,
        "StudentRegression": confidence_model,
        "Driftpoints": drift_info,
        "n": n_train_rows,
        "Stream": data_stream,
        "Xtrain": train_X,
    }
def test_hoeffding_tree_perceptron():
    """Regression check for a perceptron-leaf ``RegressionHoeffdingTree``.

    Draws 500 samples one at a time from a seeded ``RegressionGenerator``.
    On every 10th sample (the very first one excluded) the learner predicts
    before it is trained on that sample. The collected predictions, their
    mean absolute error and the learner's ``get_info()`` string are then
    compared against recorded values.
    """
    stream = RegressionGenerator(
        n_samples=500, n_features=20, n_informative=15, random_state=1
    )
    stream.prepare_for_use()

    learner = RegressionHoeffdingTree(
        leaf_prediction='perceptron', random_state=1
    )

    max_samples = 500
    wait_samples = 10
    y_pred = array('d')
    y_true = array('d')

    for step in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples, always predicting before training.
        if step != 0 and step % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_true.append(y[0])
        learner.partial_fit(X, y)

    expected_predictions = array('d', [
        1198.4326121743168, 456.36607750881586, 927.9912160545144,
        1160.4797981899128, 506.50541829176535, -687.8187227095925,
        -677.8120094065415, 231.14888704761225, -284.46324039942937,
        -255.69195985557175, 47.58787439365423, -135.22494016284043,
        -10.351457437330152, 164.95903200643997, 360.72854984472383,
        193.30633911830088, -64.23638301570358, 587.9771578214296,
        649.8395655757931, 481.01214222804026, 305.4402728117724,
        266.2096493865043, -445.11447171009775, -567.5748694154349,
        -68.70070048021438, -446.79910655850153, -115.892348067663,
        -98.26862866231015, 71.04707905920286, -10.239274802165584,
        18.748731569441812, 4.971217265129857, 172.2223575990573,
        -655.2864976783711, -129.69921313686626, -114.01187375876822,
        -405.66166686550963, -215.1264381928009, -345.91020370426247,
        -80.49330468453074, 108.78958382083302, 134.95267043280126,
        -398.5273538477553, -157.1784910649728, 219.72541225645654,
        -100.91598162899217, 80.9768574308987, -296.8856956382453,
        251.9332271253148
    ])
    assert np.allclose(y_pred, expected_predictions)

    assert np.isclose(mean_absolute_error(y_true, y_pred), 362.98595964244623)

    expected_info = (
        'RegressionHoeffdingTree: max_byte_size: 33554432 - memory_estimate_period: 1000000 '
        '- grace_period: 200 - split_criterion: variance reduction - split_confidence: 1e-07 '
        '- tie_threshold: 0.05 - binary_split: False - stop_mem_management: False '
        '- remove_poor_atts: False - no_pre_prune: False - leaf_prediction: perceptron - nb_threshold: 0 '
        '- nominal_attributes: [] - '
    )
    assert learner.get_info() == expected_info

    # Type checks on the public outputs; X is the last sample drawn above.
    assert isinstance(learner.get_model_description(), str)
    assert type(learner.predict(X)) is np.ndarray
def test_hoeffding_tree():
    """Regression check for a mean-leaf ``RegressionHoeffdingTree``.

    Draws 500 samples one at a time from a seeded ``RegressionGenerator``.
    On every 10th sample (the very first one excluded) the learner predicts
    before it is trained on that sample. The collected predictions, their
    mean absolute error and the learner's ``get_info()`` string are then
    compared against recorded values.
    """
    stream = RegressionGenerator(
        n_samples=500, n_features=20, n_informative=15, random_state=1
    )
    stream.prepare_for_use()

    learner = RegressionHoeffdingTree(leaf_prediction='mean')

    max_samples = 500
    wait_samples = 10
    y_pred = array('d')
    y_true = array('d')

    for step in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples, always predicting before training.
        if step != 0 and step % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_true.append(y[0])
        learner.partial_fit(X, y)

    expected_predictions = array('d', [
        102.38946041769101, 55.6584574987656, 5.746076599168373,
        17.11797209372667, 2.566888222752787, 9.188247802192826,
        17.87894804676911, 15.940629626883966, 8.981172175448485,
        13.152624115190092, 11.106058099429399, 6.473195313058236,
        4.723621479590173, 13.825568609556493, 8.698873073880696,
        1.6452441811010252, 5.123496188584294, 6.34387187194982,
        5.9977733790395105, 6.874251577667707, 4.605348088338317,
        8.20112636572672, 9.032631648758098, 4.428189978974459,
        4.249801041367518, 9.983272668044492, 12.859518508979734,
        11.741395774380285, 11.230028410261868, 9.126921979081521,
        9.132146661688296, 7.750655625124709, 6.445145118245414,
        5.760928671876355, 4.041291302080659, 3.591837600560529,
        0.7640424010500604, 0.1738639840537784, 2.2068337802212286,
        -81.05302946841077, 96.17757415335177, -77.35894903819677,
        95.85568683733698, 99.1981674250886, 99.89327888035015,
        101.66673013734784, -79.1904234513751, -80.42952143783687,
        100.63954789983896
    ])
    assert np.allclose(y_pred, expected_predictions)

    assert np.isclose(mean_absolute_error(y_true, y_pred), 143.11351404083086)

    expected_info = (
        'RegressionHoeffdingTree: max_byte_size: 33554432 - memory_estimate_period: 1000000 '
        '- grace_period: 200 - split_criterion: variance reduction - split_confidence: 1e-07 '
        '- tie_threshold: 0.05 - binary_split: False - stop_mem_management: False '
        '- remove_poor_atts: False - no_pre_prune: False - leaf_prediction: mean - nb_threshold: 0 '
        '- nominal_attributes: [] - '
    )
    assert learner.get_info() == expected_info

    # Type checks on the public outputs; X is the last sample drawn above.
    assert isinstance(learner.get_model_description(), str)
    assert type(learner.predict(X)) is np.ndarray
예제 #6
0
def test_hoeffding_tree_coverage(test_path):
    """Coverage test that feeds the learner deliberately invalid options.

    The learner is built with a misspelled ``leaf_prediction`` and then
    given an unsupported ``split_criterion`` before fitting on the data in
    ``regression_data.npz``. Presumably this exercises the learner's
    handling of invalid settings (that handling is not visible here). The
    only assertion is on ``_estimator_type``.
    """
    # Cover nominal attribute observer
    archive = np.load(os.path.join(test_path, 'regression_data.npz'))
    X, y = archive['X'], archive['y']

    # Typo in leaf prediction (intentional)
    learner = RegressionHoeffdingTree(
        leaf_prediction='percptron',
        nominal_attributes=list(range(3)),
    )
    print(learner.split_criterion)

    # Invalid split_criterion (intentional)
    learner.split_criterion = 'VR'
    learner.partial_fit(X, y)

    assert learner._estimator_type == 'regressor'
예제 #7
0
def test_regression_hoeffding_tree_model_description():
    """Check the model description after one 500-sample batch fit.

    A mean-leaf ``RegressionHoeffdingTree`` is trained on 500 samples drawn
    in one call from a seeded ``RegressionGenerator``. Its description must
    be more than 90% similar (``SequenceMatcher.ratio``) to the recorded
    text.
    """
    max_samples = 500

    stream = RegressionGenerator(n_samples=500,
                                 n_features=20,
                                 n_informative=15,
                                 random_state=1)
    stream.prepare_for_use()

    learner = RegressionHoeffdingTree(leaf_prediction='mean')

    # Train on the whole batch at once.
    batch_X, batch_y = stream.next_sample(max_samples)
    learner.partial_fit(batch_X, batch_y)

    expected_description = (
        "if Attribute 6 <= 0.1394515530995348:\n"
        "  Leaf = Statistics {0: 276.0000, 1: -21537.4157, 2: 11399392.2187}\n"
        "if Attribute 6 > 0.1394515530995348:\n"
        "  Leaf = Statistics {0: 224.0000, 1: 22964.8868, 2: 10433581.2534}\n"
    )

    # Fuzzy comparison: the description only has to be a close match.
    matcher = SequenceMatcher(
        None, expected_description, learner.get_model_description()
    )
    assert matcher.ratio() > 0.9
예제 #8
0
def test_evaluate_regression_coverage(tmpdir):
    """Smoke test: run ``EvaluatePrequential`` on a regression stream.

    A simple coverage test with no assertions; tests for the metrics
    themselves are placed in the corresponding test module. The evaluator
    is configured to write its summary to ``prequential_summary.csv``
    inside ``tmpdir``.
    """
    from skmultiflow.data import RegressionGenerator
    from skmultiflow.trees import RegressionHoeffdingTree

    max_samples = 1000

    # Stream
    stream = RegressionGenerator(n_samples=max_samples)
    stream.prepare_for_use()

    # Learner
    htr = RegressionHoeffdingTree()

    # Evaluator, writing its summary into the temporary directory
    summary_path = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(
        max_samples=max_samples,
        metrics=['mean_square_error', 'mean_absolute_error'],
        output_file=summary_path,
    )

    evaluator.evaluate(stream=stream, model=htr, model_names=['HTR'])
예제 #9
0
def InnerCycle(X, y, inject_drift, perc_train, window, delta, pval,
               prob_instance, inst_delay):
    """Run one drift-detection experiment over a stream and collect alarms.

    The first ``int(perc_train * X.shape[0])`` samples train a teacher
    classifier (``ARF``), a student classifier fitted on the teacher's
    predicted labels, and a student regressor (``RHT``) fitted on the
    teacher's maximum predicted class probability. The remaining samples are
    then processed one at a time: on every step 18 drift detectors are fed
    with different signals, and the current sample index is recorded for
    every detector that reports a change.

    Parameters
    ----------
    X, y
        Features and targets, wrapped in ``DataStream(X, y)``. ``X`` must
        expose ``.shape``.
    inject_drift
        When truthy, ``Driftpoints(X)`` selects a change point and
        ``Swapcols`` replaces ``X`` using it (both helpers are defined
        outside this function).
    perc_train
        Fraction of the rows used for the initial training batch.
    window
        Passed as ``window`` to every ``HypothesisTestDetector``.
    delta
        Not used by the live code; it only appears in commented-out
        detector arguments.
    pval
        Passed as ``thr`` to every ``HypothesisTestDetector``.
    prob_instance
        Success probability of the per-step ``np.random.binomial(1, ...)``
        draw that gates the label-based updates of the WS and DWS models.
    inst_delay
        Number of steps by which labels are delayed for the DS and DWS
        models.

    Returns
    -------
    list
        ``[Driftmodels_alarms, dpoints]``: the per-detector lists of alarm
        indices (in the same order as ``Driftmodels``) and the drift-point
        dict.
    """

    # get number of training samples
    ntrain = int(perc_train * X.shape[0])

    if inject_drift:
        # pick a point between 0.7 and 0.9 of the stream
        dpoints = Driftpoints(X)
        # number of post-training samples before the injected change point
        dpoints["cleanrun"] = dpoints["row"] - ntrain

        # contaminate X after that point
        X = Swapcols(df=X,
                     class_vec=y,
                     ids=dpoints["cols"],
                     t_change=dpoints["row"])
    else:
        # no drift injected: "row" is set to the total number of rows
        dpoints = dict({"row": X.shape[0], "cols": 0})

    # cast data as DataStream class
    stream = DataStream(X, y)
    stream.prepare_for_use()
    # call incr model (main classifier, teacher model)
    stream_clf = ARF(n_estimators=25,
                     drift_detection_method=None,
                     warning_detection_method=None)

    # get training data... first ntrain rows
    Xtrain, ytrain = stream.next_sample(ntrain)

    # partial fit of the incre model using training data
    stream_clf.fit(Xtrain, ytrain, classes=stream.target_values)
    yhat_train = stream_clf.predict(Xtrain)
    yhat_train_prob = stream_clf.predict_proba(
        Xtrain)  ### needs warnings!!!!!!!!!
    # teacher's highest class probability for each training row
    yhat_tr_max_prob = np.array([np.max(x) for x in yhat_train_prob])

    # fit student model (trained on the teacher's predictions, not on ytrain)
    student_clf = ARF(n_estimators=25,
                      drift_detection_method=None,
                      warning_detection_method=None)
    student_clf.fit(Xtrain, yhat_train, classes=stream.target_values)

    # student regressor: learns the teacher's maximum class probability
    student_regr = RHT()
    student_regr.fit(Xtrain, yhat_tr_max_prob)

    ####### Call drift detectors
    # Every detector below is created with its default arguments; the
    # trailing comments show arguments that are currently disabled.

    ## Supervised
    # Supervised with ADWIN
    S_ADWIN = ADWIN()  #(delta=delta)
    S_ADWIN_alarms = []
    # Supervised with PHT
    S_PHT = PHT()  #(min_instances=window,delta=delta)
    S_PHT_alarms = []
    # Delayed Supervised with ADWIN
    DS_ADWIN = ADWIN()  #(delta=delta)
    DS_ADWIN_alarms = []
    # Delayed Supervised with PHT
    DS_PHT = PHT()  #(min_instances=window,delta=delta)
    DS_PHT_alarms = []

    ## Semi-supervised
    # Semi-Supervised with ADWIN
    WS_ADWIN = ADWIN()  #(delta=delta)
    WS_ADWIN_alarms = []
    # Semi-Supervised with PHT
    WS_PHT = PHT()  #(min_instances=window,delta=delta)
    WS_PHT_alarms = []
    # Delayed Semi-Supervised with ADWIN
    DWS_ADWIN = ADWIN()  #(delta=delta)
    DWS_ADWIN_alarms = []
    # Delayed Semi-Supervised with PHT
    DWS_PHT = PHT()  #(min_instances=window,delta=delta)
    DWS_PHT_alarms = []

    ##### Unsupervised
    # Student (classifier disagreement) with ADWIN
    U_ADWIN = ADWIN()  #(delta=delta)
    U_ADWIN_alarms = []
    # Student (classifier disagreement) with PHT
    U_PHT = PHT()  #(min_instances=window,delta=delta)
    U_PHT_alarms = []

    # Student regressor (probability error) with ADWIN
    UR_ADWIN = ADWIN()  #(delta=delta)
    UR_ADWIN_alarms = []
    # Student regressor (probability error) with PHT
    UR_PHT = PHT()  #(min_instances=window,delta=delta)
    UR_PHT_alarms = []

    # WRS with output
    WRS_Output = HypothesisTestDetector(method="wrs", window=window, thr=pval)
    WRS_Output_alarms = []
    # WRS with class prob
    WRS_Prob = HypothesisTestDetector(method="wrs", window=window, thr=pval)
    WRS_Prob_alarms = []
    # TT with output
    TT_Output = HypothesisTestDetector(method="tt", window=window, thr=pval)
    TT_Output_alarms = []
    # TT with class prob
    TT_Prob = HypothesisTestDetector(method="tt", window=window, thr=pval)
    TT_Prob_alarms = []
    # KS with output
    KS_Output = HypothesisTestDetector(method="ks", window=window, thr=pval)
    KS_Output_alarms = []
    # KS with class prob
    KS_Prob = HypothesisTestDetector(method="ks", window=window, thr=pval)
    KS_Prob_alarms = []

    # All detectors in a fixed order; the slices taken further down depend
    # on this exact ordering.
    Driftmodels = [
        S_ADWIN, S_PHT, DS_ADWIN, DS_PHT, WS_ADWIN, WS_PHT, DWS_ADWIN, DWS_PHT,
        U_ADWIN, U_PHT, UR_ADWIN, UR_PHT, WRS_Output, TT_Output, KS_Output,
        WRS_Prob, TT_Prob, KS_Prob
    ]

    # Alarm lists, position-aligned with Driftmodels.
    Driftmodels_alarms = [
        S_ADWIN_alarms, S_PHT_alarms, DS_ADWIN_alarms, DS_PHT_alarms,
        WS_ADWIN_alarms, WS_PHT_alarms, DWS_ADWIN_alarms, DWS_PHT_alarms,
        U_ADWIN_alarms, U_PHT_alarms, UR_ADWIN_alarms, UR_PHT_alarms,
        WRS_Output_alarms, TT_Output_alarms, KS_Output_alarms, WRS_Prob_alarms,
        TT_Prob_alarms, KS_Prob_alarms
    ]

    # Detector groups, one per input signal.
    S_driftmodels = Driftmodels[0:2]
    DS_driftmodels = Driftmodels[2:4]
    WS_driftmodels = Driftmodels[4:6]
    DWS_driftmodels = Driftmodels[6:8]
    Ustd_driftmodels = Driftmodels[8:10]
    Ustdreg_driftmodels = Driftmodels[10:12]
    Uoutput_driftmodels = Driftmodels[12:15]
    Uprob_driftmodels = Driftmodels[15:18]

    # Independent copies of the trained teacher, one per update policy.
    # always updated
    S_clf = copy.deepcopy(stream_clf)
    # always updated with delay
    DS_clf = copy.deepcopy(stream_clf)
    # updated immediately with some prob
    WS_clf = copy.deepcopy(stream_clf)
    # updated with delay with some prob
    DWS_clf = copy.deepcopy(stream_clf)
    # never updated
    U_clf = copy.deepcopy(stream_clf)

    # i: index of the current sample (starts at ntrain, right after the
    #    training batch); this is the value stored in the alarm lists.
    # k: number of samples processed so far in this loop; indexes the
    #    history lists below.
    i = ntrain
    k = 0
    # Histories used to apply delayed updates inst_delay steps later.
    DWS_yhat_hist = []
    DS_yhat_hist = []
    X_hist = []
    y_hist = []
    while (stream.has_more_samples()):
        print(i)
        #i=3000
        Xi, yi = stream.next_sample()

        y_hist.append(yi[0])
        X_hist.append(Xi)

        # Prepend the last 10 training rows to the new sample; only the last
        # prediction (the one for Xi) is kept from each call below.
        # NOTE(review): the reason for this padding is not evident from this
        # code — confirm whether it is still needed.
        ext_Xi = np.concatenate([Xtrain[-10:], Xi])

        # Predictions are all taken before any model is updated with Xi.
        U_prob = U_clf.predict_proba(ext_Xi)[-1]
        U_yhat = U_clf.predict(ext_Xi)[-1]
        S_yhat = S_clf.predict(ext_Xi)[-1]
        WS_yhat = WS_clf.predict(ext_Xi)[-1]
        DS_yhat = DS_clf.predict(ext_Xi)[-1]
        DWS_yhat = DWS_clf.predict(ext_Xi)[-1]

        DWS_yhat_hist.append(DWS_yhat)
        DS_yhat_hist.append(DS_yhat)

        # Reduce the probability vector to one scalar: the only entry when
        # there is one, the second entry when there are two, otherwise the
        # maximum.
        if len(U_prob) < 2:
            U_yhat_prob_i = U_prob[0]
        elif len(U_prob) == 2:
            U_yhat_prob_i = U_prob[1]
        else:
            U_yhat_prob_i = np.max(U_prob)

        # Student predictions for the new sample.
        y_meta_hat_i = student_clf.predict(ext_Xi)[-1]
        y_meta_prob = student_regr.predict(ext_Xi)[-1]

        # Updating student model (target is the frozen model's prediction)
        student_clf.partial_fit(Xi, [U_yhat])
        # Updating supervised model
        S_clf.partial_fit(Xi, yi)

        # Computing loss
        S_err_i = int(yi[0] != S_yhat)
        student_err_i = int(y_meta_hat_i != U_yhat)
        student_prob_err_i = U_yhat_prob_i - y_meta_prob

        # Feed each detector group with its own signal.
        for model in S_driftmodels:
            model.add_element(S_err_i)

        for model in Ustd_driftmodels:
            model.add_element(student_err_i)

        for model in Ustdreg_driftmodels:
            model.add_element(student_prob_err_i)

        for model in Uoutput_driftmodels:
            model.add_element(U_yhat)

        for model in Uprob_driftmodels:
            model.add_element(U_yhat_prob_i)

        # One Bernoulli draw per step; it gates both the WS update and the
        # DWS update below.
        put_i_available = np.random.binomial(1, prob_instance)

        # Delayed updates: once at least inst_delay samples have been seen,
        # use the sample from inst_delay steps ago.
        if k >= inst_delay:
            DS_err_i = int(
                y_hist[k - inst_delay] != DS_yhat_hist[k - inst_delay])
            DS_clf.partial_fit(X_hist[k - inst_delay],
                               [y_hist[k - inst_delay]])
            for model in DS_driftmodels:
                model.add_element(DS_err_i)

            if put_i_available > 0:
                DWS_err_i = int(
                    y_hist[k - inst_delay] != DWS_yhat_hist[k - inst_delay])
                DWS_clf.partial_fit(X_hist[k - inst_delay],
                                    [y_hist[k - inst_delay]])
                for model in DWS_driftmodels:
                    model.add_element(DWS_err_i)

        # Immediate update, only when the draw above succeeded.
        if put_i_available > 0:
            WS_err_i = int(yi[0] != WS_yhat)
            WS_clf.partial_fit(Xi, yi)
            for model in WS_driftmodels:
                model.add_element(WS_err_i)

        # detect changes: every detector is polled on every step, including
        # those that received no new element this step
        for j, model in enumerate(Driftmodels):
            has_change = model.detected_change()
            if has_change:
                Driftmodels_alarms[j].append(i)

        i += 1
        k += 1

    return ([Driftmodels_alarms, dpoints])
예제 #10
0
# Build the inputs and the target from `tdf`, which is defined outside this
# section (presumably a pandas DataFrame with a datetime index — TODO confirm).
# Inputs: mean of pressure, humidity and wind speed per "6H" resampling bin.
X = tdf[["Pressure (millibars)", "Humidity",
         "Wind Speed (km/h)"]].resample("6H").mean()
# Target: maximum temperature per "6H" resampling bin.
y = tdf[["Temperature (C)"]].resample("6H").max()

# Plot the three input series side by side, then the target series.
X.plot(subplots=True, layout=(1, 3))
y.plot()

#%%

# Reload the local module before importing from it, presumably so that edits
# made during an interactive session are picked up. The reload must stay
# ahead of the import.
reload(samknnreg)
from samknnreg import SAMKNNRegressor

# Three regressors to compare, all built with default arguments.
sam = SAMKNNRegressor()
hat = RegressionHAT()
rht = RegressionHoeffdingTree()
# Wrap the resampled data as a stream for the evaluator.
ds = DataStream(X, y=y)
ds.prepare_for_use()

# Prequential evaluator with live plotting enabled.
evaluator = EvaluatePrequential(
    show_plot=True,
    n_wait=730,
    batch_size=28,
    metrics=['mean_square_error', 'true_vs_predicted'])

#%%
evaluator.evaluate(stream=ds,
                   model=[sam, rht, hat],
                   model_names=[
                       "SAM", "Hoeffding Tree Regressor",
                       "Hoeffding Tree Regressor (Adaptive)"