示例#1
0
def unsupervised_analysis(df, nu, size, percent):
    stream = DataStream(df)
    stream.prepare_for_use()
    stream_clf = HoeffdingTree()
    stream_acc = []
    stream_record = []
    stream_true= 0
    buffer = dataBuffer(size, stream.n_features, percent)
    clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma='auto')
    
    #
    start = time.time()
    X,y = stream.next_sample(size)
    stream_clf.partial_fit(X,y, classes=stream.target_values)
    clf.fit(X)
    
    i=0
    while(stream.has_more_samples()): #stream.has_more_samples()
        X,y = stream.next_sample()
        if buffer.isEmpty():
            buffer.addInstance(X,y,clf.predict(X))
            y_hat = stream_clf.predict(X)
            stream_true = stream_true + check_true(y, y_hat)
            stream_clf.partial_fit(X,y)
            stream_acc.append(stream_true / (i+1))
            stream_record.append(check_true(y,y_hat))
            
        else:
            if buffer.driftCheck():             #detected
                #print("concept drift detected at {}".format(i))
                #retrain the model
                stream_clf.reset()
                #stream_clf = HoeffdingTree()
                stream_clf.partial_fit(buffer.getCurrentData(), buffer.getCurrentLabels(), classes=stream.target_values)
                #update one-class SVM
                clf.fit(buffer.getCurrentData())
                #evaluate and update the model
                y_hat = stream_clf.predict(X)
                stream_true = stream_true + check_true(y, y_hat)
                stream_clf.partial_fit(X,y)
                stream_acc.append(stream_true / (i+1))
                stream_record.append(check_true(y,y_hat))
                #add new sample to the window
                buffer.addInstance(X,y,clf.predict(X))
            else:
                #evaluate and update the model
                y_hat = stream_clf.predict(X)
                stream_true = stream_true + check_true(y, y_hat)
                stream_clf.partial_fit(X,y)
                stream_acc.append(stream_true / (i+1))
                stream_record.append(check_true(y,y_hat))
                #add new sample to the window
                buffer.addInstance(X,y,clf.predict(X))    
        i = i + 1
    #print(buffer.drift_count)
    
    elapsed = format(time.time() - start, '.4f')
    acc = format(stream_acc[-1] * 100, '.4f')
    final_accuracy = "Parameters: {}, {}, {}, Final accuracy: {}, Elapsed time: {}".format(nu,size,percent,acc,elapsed)
    return final_accuracy, stream_record
def test_data_stream(test_path, package_path):
    test_file = os.path.join(package_path, 'src/skmultiflow/data/datasets/sea_stream.csv')
    raw_data = pd.read_csv(test_file)
    stream = DataStream(raw_data, name='Test')

    assert not stream._Y_is_defined

    stream.prepare_for_use()

    assert stream.n_remaining_samples() == 40000

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == 'Test: 1 target(s), 2 classes'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.alltrue(X == X_expected)
    assert np.alltrue(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]

    assert 'stream' == stream._estimator_type

    expected_info = "DataStream(n_targets=-1, target_idx=1, cat_features=None, name='Test')"
    assert stream.get_info() == expected_info
def test_data_stream_X_y(test_path, package_path):
    test_file = os.path.join(package_path,
                             'src/skmultiflow/datasets/sea_stream.csv')
    raw_data = pd.read_csv(test_file)
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    stream = DataStream(X, y)

    assert stream._Y_is_defined == True

    stream.prepare_for_use()

    assert stream.n_remaining_samples() == 40000

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == '1 target(s), 2 target_values'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.alltrue(X == X_expected)
    assert np.alltrue(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]
示例#4
0
def NSE_run (dataset_name, batch, num_copy):
    data = load_arff(path, dataset_name, num_copy)
    # data transform
    stream = DataStream(data)
    #print(stream)

    # Setup variables to control loop and track performance
    n_samples = 0
    max_samples = data.shape[0]

    # Train the classifier with the samples provided by the data stream
    pred = np.empty(0)
    np.random.seed(0)
    
    model = LearnPPNSEClassifier()
    while n_samples < max_samples and stream.has_more_samples():
        X, y = stream.next_sample(batch)
        y_pred = model.predict(X)
        pred = np.hstack((pred,y_pred))
        model.partial_fit(X, y,stream.target_values)
        n_samples += batch

    # evaluate
    data = data.values
    Y = data[:,-1]
    acc = accuracy_score(Y[batch:], pred[batch:])
    f1 = f1_score(Y[batch:], pred[batch:], average='macro')
    #print (Y[batch:].shape, pred[batch:].shape)
    print("acc:",acc)
    print("f1:",f1)
    
    # save results
    result = np.zeros([pred[batch:].shape[0], 2])
    result[:, 0] = pred[batch:]
    result[:, 1] = Y[batch:]
示例#5
0
def InnerCycle_Train(X, y, inject_drift, perc_train):

    # get number of training samples
    ntrain = int(perc_train * X.shape[0])

    if inject_drift:
        # pick a point between 0.7 and 0.9 of the stream
        dpoints = Driftpoints(X)
        dpoints["cleanrun"] = dpoints["row"] - ntrain

        # contaminate X after that point
        X = Swapcols(df=X,
                     class_vec=y,
                     ids=dpoints["cols"],
                     t_change=dpoints["row"])
    else:
        dpoints = dict({"row": X.shape[0], "cols": 0})

    # cast data as DataStream class
    stream = DataStream(X, y)
    stream.prepare_for_use()
    # call incr model (main classifier, teacher model)
    stream_clf = ARF(n_estimators=25)  #,
    #drift_detection_method=None,
    #warning_detection_method=None
    #)

    # get training data... first ntrain rows
    Xtrain, ytrain = stream.next_sample(ntrain)

    # partial fit of the incre model using training data
    stream_clf.fit(Xtrain, ytrain, classes=stream.target_values)
    yhat_train = stream_clf.predict(Xtrain)
    yhat_train_prob = stream_clf.predict_proba(
        Xtrain)  ### needs warnings!!!!!!!!!
    yhat_tr_max_prob = np.array([np.max(x) for x in yhat_train_prob])

    # fit student model
    student_clf = ARF(n_estimators=25)  #,
    #drift_detection_method=None,
    #warning_detection_method=None)
    student_clf.fit(Xtrain, yhat_train, classes=stream.target_values)

    student_regr = RHT()
    student_regr.fit(Xtrain, yhat_tr_max_prob)

    results = dict()
    results["Teacher"] = stream_clf
    results["Student"] = student_clf
    results["StudentRegression"] = student_regr
    results["Driftpoints"] = dpoints
    results["n"] = ntrain
    results["Stream"] = stream
    results["Xtrain"] = Xtrain

    return (results)
示例#6
0
def ARF_run (dataset_name, batch, random_seeds):
    data = load_arff(path, dataset_name)
    #print (data.shape)
    # data transform
    stream = DataStream(data)
    #print(stream)

    # Setup variables to control loop and track performance
    n_samples = 0
    max_samples = data.shape[0]

    # Train the classifier with the samples provided by the data stream
    pred = np.empty(0)
    np.random.seed(0)
    
    model = AdaptiveRandomForestClassifier(n_estimators=24, random_state=random_seeds)
    while n_samples < max_samples and stream.has_more_samples():
        X, y = stream.next_sample(batch)
        y_pred = model.predict(X)
        pred = np.hstack((pred,y_pred))
        model.partial_fit(X, y,stream.target_values)
        n_samples += batch

    # evaluate
    data = data.values
    Y = data[:,-1]
    acc = accuracy_score(Y[batch:], pred[batch:])
    f1 = f1_score(Y[batch:], pred[batch:], average='macro')
    #print (Y[batch:].shape, pred[batch:].shape)
    print("acc:",acc)
    print("f1:",f1)
    
    # save results
    result = np.zeros([pred[batch:].shape[0], 2])
    result[:, 0] = pred[batch:]
    result[:, 1] = Y[batch:]
    np.savetxt(dataset_name +'_' + 'ARF' + str(random_seeds) +'.out', result, delimiter=',')
stream.prepare_for_use()
stream_clf = HoeffdingTree()
w = int(sys.argv[2])
rho = float(sys.argv[3])
auc = float(sys.argv[4])

# In[ ]:

D3_win = D3(w, rho, stream.n_features, auc)
stream_acc = []
stream_record = []
stream_true = 0

i = 0
start = time.time()
X, y = stream.next_sample(int(w * rho))
stream_clf.partial_fit(X, y, classes=stream.target_values)
while (stream.has_more_samples()):
    X, y = stream.next_sample()
    if D3_win.isEmpty():
        D3_win.addInstance(X, y)
        y_hat = stream_clf.predict(X)
        stream_true = stream_true + check_true(y, y_hat)
        stream_clf.partial_fit(X, y)
        stream_acc.append(stream_true / (i + 1))
        stream_record.append(check_true(y, y_hat))
    else:
        if D3_win.driftCheck():  #detected
            #print("concept drift detected at {}".format(i))
            #retrain the model
            stream_clf.reset()
示例#8
0
def InnerCycle(X, y, inject_drift, perc_train, window, delta, pval,
               prob_instance, inst_delay):

    # get number of training samples
    ntrain = int(perc_train * X.shape[0])

    if inject_drift:
        # pick a point between 0.7 and 0.9 of the stream
        dpoints = Driftpoints(X)
        dpoints["cleanrun"] = dpoints["row"] - ntrain

        # contaminate X after that point
        X = Swapcols(df=X,
                     class_vec=y,
                     ids=dpoints["cols"],
                     t_change=dpoints["row"])
    else:
        dpoints = dict({"row": X.shape[0], "cols": 0})

    # cast data as DataStream class
    stream = DataStream(X, y)
    stream.prepare_for_use()
    # call incr model (main classifier, teacher model)
    stream_clf = ARF(n_estimators=25,
                     drift_detection_method=None,
                     warning_detection_method=None)

    # get training data... first ntrain rows
    Xtrain, ytrain = stream.next_sample(ntrain)

    # partial fit of the incre model using training data
    stream_clf.fit(Xtrain, ytrain, classes=stream.target_values)
    yhat_train = stream_clf.predict(Xtrain)
    yhat_train_prob = stream_clf.predict_proba(
        Xtrain)  ### needs warnings!!!!!!!!!
    yhat_tr_max_prob = np.array([np.max(x) for x in yhat_train_prob])

    # fit student model
    student_clf = ARF(n_estimators=25,
                      drift_detection_method=None,
                      warning_detection_method=None)
    student_clf.fit(Xtrain, yhat_train, classes=stream.target_values)

    student_regr = RHT()
    student_regr.fit(Xtrain, yhat_tr_max_prob)

    ####### Call drift detectors

    ## Supervised
    # Supervised with ADWIN
    S_ADWIN = ADWIN()  #(delta=delta)
    S_ADWIN_alarms = []
    # Supervised with PHT
    S_PHT = PHT()  #(min_instances=window,delta=delta)
    S_PHT_alarms = []
    # Delayed Supervised with ADWIN
    DS_ADWIN = ADWIN()  #(delta=delta)
    DS_ADWIN_alarms = []
    # Delayed Supervised with PHT
    DS_PHT = PHT()  #(min_instances=window,delta=delta)
    DS_PHT_alarms = []

    ## Semi-supervised
    # Semi-Supervised with ADWIN
    WS_ADWIN = ADWIN()  #(delta=delta)
    WS_ADWIN_alarms = []
    # Supervised with PHT
    WS_PHT = PHT()  #(min_instances=window,delta=delta)
    WS_PHT_alarms = []
    # Delayed Supervised with ADWIN
    DWS_ADWIN = ADWIN()  #(delta=delta)
    DWS_ADWIN_alarms = []
    # Delayed Supervised with PHT
    DWS_PHT = PHT()  #(min_instances=window,delta=delta)
    DWS_PHT_alarms = []

    ##### Unsupervised
    # Student with ADWIN
    U_ADWIN = ADWIN()  #(delta=delta)
    U_ADWIN_alarms = []
    # Student with PHT
    U_PHT = PHT()  #(min_instances=window,delta=delta)
    U_PHT_alarms = []

    # Student with ADWIN
    UR_ADWIN = ADWIN()  #(delta=delta)
    UR_ADWIN_alarms = []
    # Student with PHT
    UR_PHT = PHT()  #(min_instances=window,delta=delta)
    UR_PHT_alarms = []

    # WRS with output
    WRS_Output = HypothesisTestDetector(method="wrs", window=window, thr=pval)
    WRS_Output_alarms = []
    # WRS with class prob
    WRS_Prob = HypothesisTestDetector(method="wrs", window=window, thr=pval)
    WRS_Prob_alarms = []
    # TT with output
    TT_Output = HypothesisTestDetector(method="tt", window=window, thr=pval)
    TT_Output_alarms = []
    # TT with class prob
    TT_Prob = HypothesisTestDetector(method="tt", window=window, thr=pval)
    TT_Prob_alarms = []
    # KS with output
    KS_Output = HypothesisTestDetector(method="ks", window=window, thr=pval)
    KS_Output_alarms = []
    # KS with class prob
    KS_Prob = HypothesisTestDetector(method="ks", window=window, thr=pval)
    KS_Prob_alarms = []

    Driftmodels = [
        S_ADWIN, S_PHT, DS_ADWIN, DS_PHT, WS_ADWIN, WS_PHT, DWS_ADWIN, DWS_PHT,
        U_ADWIN, U_PHT, UR_ADWIN, UR_PHT, WRS_Output, TT_Output, KS_Output,
        WRS_Prob, TT_Prob, KS_Prob
    ]

    Driftmodels_alarms = [
        S_ADWIN_alarms, S_PHT_alarms, DS_ADWIN_alarms, DS_PHT_alarms,
        WS_ADWIN_alarms, WS_PHT_alarms, DWS_ADWIN_alarms, DWS_PHT_alarms,
        U_ADWIN_alarms, U_PHT_alarms, UR_ADWIN_alarms, UR_PHT_alarms,
        WRS_Output_alarms, TT_Output_alarms, KS_Output_alarms, WRS_Prob_alarms,
        TT_Prob_alarms, KS_Prob_alarms
    ]

    S_driftmodels = Driftmodels[0:2]
    DS_driftmodels = Driftmodels[2:4]
    WS_driftmodels = Driftmodels[4:6]
    DWS_driftmodels = Driftmodels[6:8]
    Ustd_driftmodels = Driftmodels[8:10]
    Ustdreg_driftmodels = Driftmodels[10:12]
    Uoutput_driftmodels = Driftmodels[12:15]
    Uprob_driftmodels = Driftmodels[15:18]

    # always updated
    S_clf = copy.deepcopy(stream_clf)
    # always updated with delay
    DS_clf = copy.deepcopy(stream_clf)
    # updated immediately with some prob
    WS_clf = copy.deepcopy(stream_clf)
    # updated with delay with some prob
    DWS_clf = copy.deepcopy(stream_clf)
    # never updated
    U_clf = copy.deepcopy(stream_clf)

    i = ntrain
    k = 0
    DWS_yhat_hist = []
    DS_yhat_hist = []
    X_hist = []
    y_hist = []
    while (stream.has_more_samples()):
        print(i)
        #i=3000
        Xi, yi = stream.next_sample()

        y_hist.append(yi[0])
        X_hist.append(Xi)

        ext_Xi = np.concatenate([Xtrain[-10:], Xi])

        U_prob = U_clf.predict_proba(ext_Xi)[-1]
        U_yhat = U_clf.predict(ext_Xi)[-1]
        S_yhat = S_clf.predict(ext_Xi)[-1]
        WS_yhat = WS_clf.predict(ext_Xi)[-1]
        DS_yhat = DS_clf.predict(ext_Xi)[-1]
        DWS_yhat = DWS_clf.predict(ext_Xi)[-1]

        DWS_yhat_hist.append(DWS_yhat)
        DS_yhat_hist.append(DS_yhat)

        if len(U_prob) < 2:
            U_yhat_prob_i = U_prob[0]
        elif len(U_prob) == 2:
            U_yhat_prob_i = U_prob[1]
        else:
            U_yhat_prob_i = np.max(U_prob)

        y_meta_hat_i = student_clf.predict(ext_Xi)[-1]
        y_meta_prob = student_regr.predict(ext_Xi)[-1]

        # Updating student model
        student_clf.partial_fit(Xi, [U_yhat])
        # Updating supervised model
        S_clf.partial_fit(Xi, yi)

        # Computing loss
        S_err_i = int(yi[0] != S_yhat)
        student_err_i = int(y_meta_hat_i != U_yhat)
        student_prob_err_i = U_yhat_prob_i - y_meta_prob

        for model in S_driftmodels:
            model.add_element(S_err_i)

        for model in Ustd_driftmodels:
            model.add_element(student_err_i)

        for model in Ustdreg_driftmodels:
            model.add_element(student_prob_err_i)

        for model in Uoutput_driftmodels:
            model.add_element(U_yhat)

        for model in Uprob_driftmodels:
            model.add_element(U_yhat_prob_i)

        put_i_available = np.random.binomial(1, prob_instance)

        if k >= inst_delay:
            DS_err_i = int(
                y_hist[k - inst_delay] != DS_yhat_hist[k - inst_delay])
            DS_clf.partial_fit(X_hist[k - inst_delay],
                               [y_hist[k - inst_delay]])
            for model in DS_driftmodels:
                model.add_element(DS_err_i)

            if put_i_available > 0:
                DWS_err_i = int(
                    y_hist[k - inst_delay] != DWS_yhat_hist[k - inst_delay])
                DWS_clf.partial_fit(X_hist[k - inst_delay],
                                    [y_hist[k - inst_delay]])
                for model in DWS_driftmodels:
                    model.add_element(DWS_err_i)

        if put_i_available > 0:
            WS_err_i = int(yi[0] != WS_yhat)
            WS_clf.partial_fit(Xi, yi)
            for model in WS_driftmodels:
                model.add_element(WS_err_i)

        # detect changes
        for j, model in enumerate(Driftmodels):
            has_change = model.detected_change()
            if has_change:
                Driftmodels_alarms[j].append(i)

        i += 1
        k += 1

    return ([Driftmodels_alarms, dpoints])
示例#9
0
def test_data_stream_X_y(test_path):
    test_file = os.path.join(test_path, 'sea_stream_file.csv')
    raw_data = pd.read_csv(test_file)
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    stream = DataStream(X, y)

    assert stream._Y_is_defined

    assert stream.n_remaining_samples() == 40

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == '1 target(s), 2 classes'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.alltrue(X == X_expected)
    assert np.alltrue(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]

    # Ensure that the regression case is also covered
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    y = y.astype('float64')
    stream = DataStream(X, y, name='Test')

    assert stream.task_type == 'regression'
    assert stream.get_data_info() == 'Test: 1 target(s)'