Example #1
def NSE_run(dataset_name, batch, num_copy):
    data = load_arff(path, dataset_name, num_copy)
    # data transform
    stream = DataStream(data)
    #print(stream)

    # Setup variables to control loop and track performance
    n_samples = 0
    max_samples = data.shape[0]

    # Train the classifier with the samples provided by the data stream
    pred = np.empty(0)
    np.random.seed(0)
    
    model = LearnPPNSEClassifier()
    while n_samples < max_samples and stream.has_more_samples():
        X, y = stream.next_sample(batch)
        y_pred = model.predict(X)
        pred = np.hstack((pred, y_pred))
        model.partial_fit(X, y, classes=stream.target_values)
        n_samples += batch

    # evaluate
    data = data.values
    Y = data[:,-1]
    acc = accuracy_score(Y[batch:], pred[batch:])
    f1 = f1_score(Y[batch:], pred[batch:], average='macro')
    #print (Y[batch:].shape, pred[batch:].shape)
    print("acc:",acc)
    print("f1:",f1)
    
    # save results
    result = np.zeros([pred[batch:].shape[0], 2])
    result[:, 0] = pred[batch:]
    result[:, 1] = Y[batch:]
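
Example #1 stops after assembling `result`; a plausible completion, mirroring the `np.savetxt` call in Example #7 further down (the exact output filename is an assumption):

    # assumed completion: persist predictions vs. ground truth, as Example #7 does
    np.savetxt(dataset_name + '_' + 'NSE' + str(num_copy) + '.out', result, delimiter=',')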
Example #2
def InnerCycle_Train(X, y, inject_drift, perc_train):

    # get number of training samples
    ntrain = int(perc_train * X.shape[0])

    if inject_drift:
        # pick a point between 0.7 and 0.9 of the stream
        dpoints = Driftpoints(X)
        dpoints["cleanrun"] = dpoints["row"] - ntrain

        # contaminate X after that point
        X = Swapcols(df=X,
                     class_vec=y,
                     ids=dpoints["cols"],
                     t_change=dpoints["row"])
    else:
        dpoints = dict({"row": X.shape[0], "cols": 0})

    # cast data as DataStream class
    stream = DataStream(X, y)
    stream.prepare_for_use()
    # call incr model (main classifier, teacher model)
    stream_clf = ARF(n_estimators=25)  #,
    #drift_detection_method=None,
    #warning_detection_method=None
    #)

    # get training data... first ntrain rows
    Xtrain, ytrain = stream.next_sample(ntrain)

    # partial fit of the incre model using training data
    stream_clf.fit(Xtrain, ytrain, classes=stream.target_values)
    yhat_train = stream_clf.predict(Xtrain)
    yhat_train_prob = stream_clf.predict_proba(
        Xtrain)  ### needs warnings!!!!!!!!!
    yhat_tr_max_prob = np.array([np.max(x) for x in yhat_train_prob])

    # fit student model
    student_clf = ARF(n_estimators=25)  #,
    #drift_detection_method=None,
    #warning_detection_method=None)
    student_clf.fit(Xtrain, yhat_train, classes=stream.target_values)

    student_regr = RHT()
    student_regr.fit(Xtrain, yhat_tr_max_prob)

    results = dict()
    results["Teacher"] = stream_clf
    results["Student"] = student_clf
    results["StudentRegression"] = student_regr
    results["Driftpoints"] = dpoints
    results["n"] = ntrain
    results["Stream"] = stream
    results["Xtrain"] = Xtrain

    return (results)
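
A minimal usage sketch for the dictionary returned by `InnerCycle_Train` (the call and variables below are illustrative assumptions, not part of the original project):

# hypothetical follow-up: compare the fitted teacher and student on the training slice
out = InnerCycle_Train(X, y, inject_drift=True, perc_train=0.3)
teacher_preds = out["Teacher"].predict(out["Xtrain"])
student_preds = out["Student"].predict(out["Xtrain"])
agreement = np.mean(teacher_preds == student_preds)  # how closely the student mimics the teacher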
Example #3
def test_active_learning_window_extraction_with_delta():
    df = pd.read_csv(METADB_PATH)
    stream = DataStream(df)

    learner = ActiveLearner(0.1,
                            stream,
                            HoeffdingTreeClassifier(),
                            store_history=True)

    for i in range(1000):
        learner.next_data()

    new_curr1 = mean([x[2] for x in learner.history])
    old_last_window_acc1 = learner.last_window_acc
    expected_delta1 = new_curr1 - old_last_window_acc1
    wind1 = learner.get_last_window(delta_acc_summary_func="mean")

    for i in range(1000):
        learner.next_data()

    new_curr2 = max([x[2] for x in learner.history])
    old_last_window_acc2 = learner.last_window_acc
    expected_delta2 = new_curr2 - old_last_window_acc2
    wind2 = learner.get_last_window(n_classes=5, delta_acc_summary_func="max")

    print(wind1)
    print(wind2)

    assert wind1.shape[0] == 1
    assert wind1.shape[1] > 0
    assert wind2.shape[0] == 1
    assert wind2.shape[1] > 0
    assert expected_delta1 == wind1["window_acc_delta"].to_numpy()[0]
    assert expected_delta2 == wind2["window_acc_delta"].to_numpy()[0]
    assert old_last_window_acc2 == new_curr1
Example #4
def test_active_learning_window_extraction():
    df = pd.read_csv(METADB_PATH)
    stream = DataStream(df)

    learner = ActiveLearner(0.1,
                            stream,
                            HoeffdingTreeClassifier(),
                            store_history=True)

    for i in range(1000):
        learner.next_data()

    wind1 = learner.get_last_window()

    for i in range(1000):
        learner.next_data()

    wind2 = learner.get_last_window(n_classes=5)

    print(wind1)
    print(wind2)

    assert wind1.shape[0] == 1
    assert wind1.shape[1] > 0
    assert wind2.shape[0] == 1
    assert wind2.shape[1] > 0
Example #5
def unsupervised_analysis(df, nu, size, percent):
    stream = DataStream(df)
    stream.prepare_for_use()
    stream_clf = HoeffdingTree()
    stream_acc = []
    stream_record = []
    stream_true = 0
    buffer = dataBuffer(size, stream.n_features, percent)
    clf = svm.OneClassSVM(nu=nu, kernel="rbf", gamma='auto')
    
    #
    start = time.time()
    X,y = stream.next_sample(size)
    stream_clf.partial_fit(X,y, classes=stream.target_values)
    clf.fit(X)
    
    i=0
    while stream.has_more_samples():
        X,y = stream.next_sample()
        if buffer.isEmpty():
            buffer.addInstance(X,y,clf.predict(X))
            y_hat = stream_clf.predict(X)
            stream_true = stream_true + check_true(y, y_hat)
            stream_clf.partial_fit(X,y)
            stream_acc.append(stream_true / (i+1))
            stream_record.append(check_true(y,y_hat))
            
        else:
            if buffer.driftCheck():             #detected
                #print("concept drift detected at {}".format(i))
                #retrain the model
                stream_clf.reset()
                #stream_clf = HoeffdingTree()
                stream_clf.partial_fit(buffer.getCurrentData(), buffer.getCurrentLabels(), classes=stream.target_values)
                #update one-class SVM
                clf.fit(buffer.getCurrentData())
                #evaluate and update the model
                y_hat = stream_clf.predict(X)
                stream_true = stream_true + check_true(y, y_hat)
                stream_clf.partial_fit(X,y)
                stream_acc.append(stream_true / (i+1))
                stream_record.append(check_true(y,y_hat))
                #add new sample to the window
                buffer.addInstance(X,y,clf.predict(X))
            else:
                #evaluate and update the model
                y_hat = stream_clf.predict(X)
                stream_true = stream_true + check_true(y, y_hat)
                stream_clf.partial_fit(X,y)
                stream_acc.append(stream_true / (i+1))
                stream_record.append(check_true(y,y_hat))
                #add new sample to the window
                buffer.addInstance(X,y,clf.predict(X))    
        i = i + 1
    #print(buffer.drift_count)
    
    elapsed = format(time.time() - start, '.4f')
    acc = format(stream_acc[-1] * 100, '.4f')
    final_accuracy = "Parameters: {}, {}, {}, Final accuracy: {}, Elapsed time: {}".format(nu,size,percent,acc,elapsed)
    return final_accuracy, stream_record
Example #6
def test_check_data():
    # Test if data contains non-numeric values
    data = pd.DataFrame(
        np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10],
                  [11, 'invalid', 13, 14, 15]]))

    with pytest.raises(ValueError):
        DataStream(data=data, allow_nan=False)

    # Test if data contains NaN values
    data = pd.DataFrame(
        np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, np.nan, 13, 14,
                                                      15]]))

    with pytest.raises(ValueError):
        DataStream(data=data, allow_nan=False)

    # Test warning for NaN values

    with pytest.warns(UserWarning):
        DataStream(data=data, allow_nan=True)
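
For contrast, a small sketch (not part of the original test) of the happy path: a fully numeric frame passes the same validation.

# assumed complementary case: clean numeric data constructs without raising
clean = pd.DataFrame(np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]]))
stream = DataStream(data=clean, allow_nan=False)
assert stream.n_features == 4  # last column is treated as the target by default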
Example #7
def ARF_run(dataset_name, batch, random_seeds):
    data = load_arff(path, dataset_name)
    #print (data.shape)
    # data transform
    stream = DataStream(data)
    #print(stream)

    # Setup variables to control loop and track performance
    n_samples = 0
    max_samples = data.shape[0]

    # Train the classifier with the samples provided by the data stream
    pred = np.empty(0)
    np.random.seed(0)
    
    model = AdaptiveRandomForestClassifier(n_estimators=24, random_state=random_seeds)
    while n_samples < max_samples and stream.has_more_samples():
        X, y = stream.next_sample(batch)
        y_pred = model.predict(X)
        pred = np.hstack((pred, y_pred))
        model.partial_fit(X, y, classes=stream.target_values)
        n_samples += batch

    # evaluate
    data = data.values
    Y = data[:,-1]
    acc = accuracy_score(Y[batch:], pred[batch:])
    f1 = f1_score(Y[batch:], pred[batch:], average='macro')
    #print (Y[batch:].shape, pred[batch:].shape)
    print("acc:",acc)
    print("f1:",f1)
    
    # save results
    result = np.zeros([pred[batch:].shape[0], 2])
    result[:, 0] = pred[batch:]
    result[:, 1] = Y[batch:]
    np.savetxt(dataset_name +'_' + 'ARF' + str(random_seeds) +'.out', result, delimiter=',')
Example #8
def test_data_stream(test_path):
    test_file = os.path.join(test_path, 'data/data_n30000.csv')
    raw_data = pd.read_csv(test_file)
    stream = DataStream(raw_data, name='Test')
    normal_knn_learner = KNNClassifier(
        n_neighbors=8,
        max_window_size=2000,
        leaf_size=40,
    )
    weighted_knn_learner = WeightedKNNClassifier(n_neighbors=8,
                                                 max_window_size=2000,
                                                 leaf_size=40)
    standardize_knn_learner = KNNClassifier(n_neighbors=8,
                                            max_window_size=2000,
                                            leaf_size=40,
                                            standardize=True)
    nominal_attr_idx = [x for x in range(15, len(stream.feature_names))]

    hoeffding_learner = HoeffdingTreeClassifier(
        nominal_attributes=nominal_attr_idx)
    nb_learner = NaiveBayes()

    metrics = ['accuracy', 'kappa_m', 'kappa_t', 'recall']
    output_file = os.path.join(test_path, 'data/kkn_output.csv')
    evaluator = EvaluatePrequential(metrics=metrics, output_file=output_file)

    # Evaluate
    result = evaluator.evaluate(stream=stream,
                                model=[
                                    normal_knn_learner,
                                    weighted_knn_learner,
                                    standardize_knn_learner,
                                    hoeffding_learner,
                                    nb_learner,
                                ])
    mean_performance, current_performance = evaluator.get_measurements()
    assert 1 == 1
Example #9
def main():
    logging = set_logger()
    args = parser.parse_args()
    output_dir = create_output_dir(
        output_path=args.output if args.output else None)
    metadata = {
        "experimento": args.experiment or "",
        "command": " ".join(sys.argv),
        "date": time.strftime("%Y%m%d%H%M%S"),
    }

    lk_plot_data = []
    ld_plot_data = []
    ld_mae_plot_data = []

    if not args.dataset:
        print("Dataset not provided. Exiting.")
        sys.exit(0)

    #### DATASET ANALYSIS ######

    logging.info("Analyzing dataset %s", args.dataset)
    logging.info("Loading dataset: %s", args.dataset)
    x_stream, y_stream, _, label_names = load_given_dataset(args.dataset)
    data_stream = DataStream(data=x_stream.todense(),
                             y=y_stream.todense(),
                             name=args.dataset)
    labels = y_stream.shape[1]
    cardinality = sum(np.sum(y_stream.toarray(),
                             axis=1)) / y_stream.toarray().shape[0]
    density = cardinality / labels
    metadata["dataset"] = {
        "name": args.dataset,
        "instances": data_stream.n_remaining_samples(),
        "X_shape": x_stream.shape,
        "y_shape": y_stream.shape,
        "labels": labels,
        "cardinality": cardinality,
        "density": density,
        "label_names": [i[0] for i in label_names]
    }

    logging.info("Analyzing label relationship")
    priors, coocurrences, conditional_matrix = generate_labels_relationship(
        y_stream.toarray(),
        cardinalidad=cardinality,
    )
    save_labels_relationship(output_dir, args.dataset, priors, coocurrences,
                             conditional_matrix)
    labels_relationship_graph(plot_props={"data": conditional_matrix},
                              output=os.path.join(
                                  output_dir,
                                  filename_path("relationship_graph",
                                                args.dataset,
                                                output_dir,
                                                ext="png")))
    data_stream.restart()

    logging.info("Analyzing label skew")
    labels_skew_original = generate_labels_skew(y_stream.toarray())
    labels_skew_original.to_csv(
        os.path.join(output_dir, args.dataset + "_label_skew.csv"))
    lk_plot_data.append({
        "x": np.arange(1, SKEW_TOP_COMBINATIONS + 1),
        "y": labels_skew_original.values[:SKEW_TOP_COMBINATIONS],
        "color": "black",
        "join": True,
        "label": "Original"
    })

    logging.info("Analyzing label distribution")
    lbo_not_scaled, labels_distribution_original = generate_labels_distribution(
        y_stream.toarray())
    lbo_not_scaled.to_csv(
        os.path.join(output_dir, args.dataset + "_label_distribution.csv"))
    ld_plot_data.append({
        "x": labels_distribution_original.index.values,
        "y": labels_distribution_original.values,
        "color": "black",
        "join": True,
        "label": "Original"
    })
    # Mean absolute error - graph
    ld_mae_plot_data.append({
        "x": labels_distribution_original.index.values,
        "y": np.zeros(shape=len(labels_distribution_original)),
        "color": "black",
        "label": "Original",
        "join": True
    })

    # Free memory
    del x_stream, y_stream, data_stream

    #### END DATASET ANALYSIS ######

    #### STREAM ANALYSIS ######

    if args.streams:
        stream_names = args.streamsnames or []
        if len(stream_names) != len(args.streams):
            logging.error(
                "The number of streams and the number of stream names do not match."
            )
            sys.exit(1)
        metadata["syn_streams"] = []
        for idx, i in enumerate(args.streams):
            stream_path = to_absolute(i)
            stream_name = stream_names[idx]

            logging.info("Analyzing syn stream: %s", stream_name)

            logging.info("Loading syn stream to memory")
            _, y_syn, _, _ = load_moa_stream(stream_path, args.labels)

            labels = y_syn.shape[1]
            cardinality = sum(np.sum(y_syn.toarray(),
                                     axis=1)) / y_syn.toarray().shape[0]
            density = cardinality / labels

            logging.info("Analyzing label skew")
            labels_skew_syn = generate_labels_skew(y_syn.toarray())
            labels_skew_syn.to_csv(
                os.path.join(output_dir, stream_name + "_label_skew.csv"))
            lk_plot_data.append({
                "x": np.arange(1, SKEW_TOP_COMBINATIONS + 1),
                "y": labels_skew_syn.values[:SKEW_TOP_COMBINATIONS],
                "color": PLOT_COLORS[idx],
                "join": True,
                "label": stream_name
            })

            logging.info("Analyzing label distribution")
            lds_not_scaled, labels_distribution_syn = generate_labels_distribution(
                y_syn.toarray())
            ld_syn = labels_distribution_syn.reindex(
                np.arange(labels_distribution_original.index.min(),
                          labels_distribution_original.index.max() +
                          1)).fillna(0)
            ld_syn_not_scaled = lds_not_scaled.reindex(
                np.arange(labels_distribution_original.index.min(),
                          labels_distribution_original.index.max() +
                          1)).fillna(0)
            ld_plot_data.append({
                "x": ld_syn.index.values,
                "y": ld_syn.values,
                "color": PLOT_COLORS[idx],
                "join": True,
                "label": stream_name
            })
            ld_syn_not_scaled.to_csv(
                os.path.join(output_dir,
                             stream_name + "_label_distribution.csv"))
            mae = mean_absolute_error(labels_distribution_original.to_numpy(),
                                      ld_syn.to_numpy())
            # plot mae
            ld_mae_plot_data.append({
                "x": labels_distribution_original.index.values,
                "y": labels_distribution_original.to_numpy() - ld_syn.to_numpy(),
                "label": stream_name,
                "color": PLOT_COLORS[idx],
                "join": True
            })

            logging.info("Analyzing label relationship")
            priors, coocurrences, conditional_matrix = generate_labels_relationship(
                y_syn.toarray(),
                cardinalidad=cardinality,
            )
            save_labels_relationship(output_dir, stream_name, priors,
                                     coocurrences, conditional_matrix)
            labels_relationship_graph(plot_props={"data": conditional_matrix},
                                      output=os.path.join(
                                          output_dir,
                                          filename_path("relationship_graph",
                                                        stream_name,
                                                        output_dir,
                                                        ext="png")))

            metadata["syn_streams"].append({
                "stream_path":
                stream_path,
                "stream_name":
                stream_name,
                "y_shape":
                y_syn.shape,
                "labels":
                labels,
                "cardinality":
                cardinality,
                "density":
                density,
                "labels_distribution_mean_absolute_error":
                mae
            })

    #### END STREAM ANALYSIS ######

    logging.info("Plotting Label Skew")
    labels_skew_graph(lk_plot_data,
                      title="Label Skew\n{}".format(
                          metadata["dataset"]["name"].title()),
                      output=os.path.join(output_dir, "label_skew.png"))

    logging.info("Plotting Label Distribution")
    labels_distribution_graph(ld_plot_data,
                              title="Label Distribution\n{}".format(
                                  metadata["dataset"]["name"].title()),
                              output=os.path.join(output_dir,
                                                  "label_distribution.png"))
    labels_distribution_mae_graph(
        ld_mae_plot_data,
        title="Label Distribution - Mean Absolute Error\n{}".format(
            metadata["dataset"]["name"].title()),
        output=os.path.join(output_dir, "ld_mae.png"))

    logging.info("Saving metadata")
    with open(os.path.join(output_dir, 'metadata.json'), 'w') as fp:
        json.dump(metadata, fp, indent=4)
    logging.info("Files saved in %s", output_dir)
Example #10
File: b5.py  Project: viggates/MLTickets
#    kf = KFold(n_splits=10)
#    kf.get_n_splits(data)
    
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
    data = tfidf.fit_transform(dfTickets.Title)

    #tfidf_transformer = TfidfTransformer()
    #data = tfidf_transformer.fit_transform(data)

    from skmultiflow.data.data_stream import DataStream
    dataM=data.toarray()
    dff = pd.DataFrame(dataM)
    dff['Resolution'] = dfTickets['Resolution']
    stream = DataStream(dff)
    stream.prepare_for_use()



    kf = KFold(n_splits=10)
    kf.get_n_splits(stream.X)


    #from sklearn.neighbors import KNeighborsRegressor
    #from sklearn.neighbors import KNeighborsClassifier  
    # Create the knn model.
    # Look at the five closest neighbors.
    #knn = KNeighborsClassifier(n_neighbors=5, weights='distance')

    from skmultiflow.classification.lazy.knn import KNN
Example #11
            total_length = int(total_length)
            for data in response.iter_content(chunk_size=4096):
                dl += len(data)
                f.write(data)
                done = int(50 * dl / total_length)
                sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done)))
                sys.stdout.flush()
    data = np.load(file_name, allow_pickle=True)

    return data


# data = download_data()
#If dataset file is already downloaded
data = np.load(file_name, allow_pickle=True)

sam = SAMKNN()
arf = HoeffdingAdaptiveTreeClassifier()

stream = DataStream(data[:, 1:], data[:, 0].astype(int))
stream.prepare_for_use()

evaluator = EvaluatePrequential(max_samples=10000,
                                max_time=1000,
                                show_plot=True,
                                metrics=['accuracy', 'kappa'])

evaluator.evaluate(stream=stream,
                   model=[sam, arf],
                   model_names=['Sam', 'HAT'])
Example #12
def test_data_stream_X_y(test_path, package_path):
    test_file = os.path.join(package_path,
                             'src/skmultiflow/data/datasets/sea_stream.csv')
    raw_data = pd.read_csv(test_file)
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    stream = DataStream(X, y)

    assert stream._Y_is_defined

    stream.prepare_for_use()

    assert stream.n_remaining_samples() == 40000

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == '1 target(s), 2 classes'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.alltrue(X == X_expected)
    assert np.alltrue(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]
Example #13
    df = pd.read_csv(x)
    scaler = MinMaxScaler()
    df.iloc[:, 0:df.shape[1] - 1] = scaler.fit_transform(
        df.iloc[:, 0:df.shape[1] - 1])
    return df


def check_true(y, y_hat):
    if (y == y_hat):
        return 1
    else:
        return 0


df = select_data(sys.argv[1])
stream = DataStream(df)
stream.prepare_for_use()
stream_clf = HoeffdingTree()
w = int(sys.argv[2])
rho = float(sys.argv[3])
auc = float(sys.argv[4])

# In[ ]:

D3_win = D3(w, rho, stream.n_features, auc)
stream_acc = []
stream_record = []
stream_true = 0

i = 0
start = time.time()
Example #14
def main():
    usedSynthData = [[
        "synthData/cess_data.csv", "synthData/cess_targets.csv"
    ], ["synthData/move_square_data.csv", "synthData/move_square_targets.csv"],
                     ["synthData/sea_data.csv", "synthData/sea_targets.csv"]]

    #Name of the datastreams
    synthDataStreams_names = [
        "Cess_data",
        "Move_squares",
        "Sea_data",
    ]

    realDataFiles = [
        ["realData/electric_data.csv", "realData/electric_targets.csv"],
        ["realData/poker_data.csv", "realData/poker_targets.csv"],
        ["realData/weather_data.csv", "realData/weather_targets.csv"],
        ["realData/rialto_data.csv", "realData/rialto_targets.csv"]
    ]

    #Name of the datastreams
    realDataStreams_names = ["Electric", "Poker", "Weather", "Rialto"]

    #fix the poker dataset
    #dfX=pd.read_csv("realData/poker_data_broken.csv")
    #dfY=pd.read_csv(realTargetFiles[1])
    #print(dfX.dtypes)

    #remove the false columns
    #dfX = dfX.drop(columns = ['feat_11', 'feat_12'])
    #print(dfX.dtypes)

    #save fixed data as csv
    #dfX.to_csv(r'realData/poker_data.csv', index = None, header=True)

    #check if saved correctly
    #X=pd.read_csv(realDataFiles[1])
    #print(X.dtypes)

    #fix electric dataset
    #dfX=pd.read_csv("realData/electric_data_broken.csv")
    #print(dfX.dtypes)

    #remove the false columns
    #dfX = dfX.drop(columns = ['feat_1', 'feat_2'])
    #print(dfX.dtypes)
    #dfX.to_csv(r'realData/electric_data.csv', index = None, header=True)

    #check if saved correctly
    #X=pd.read_csv(realDataFiles[0])
    #print(X.dtypes)

    #Streams with synthetic data from generators, synthetic data streams used in other works, and real data streams
    synthDataStreams = [
        [AGRAWALGenerator(random_state=112, perturbation=0.1), "Agrawal"],
        [
            ConceptDriftStream(stream=AGRAWALGenerator(random_state=112),
                               drift_stream=AGRAWALGenerator(random_state=112,
                                                             perturbation=0.1),
                               position=40000,
                               width=10000), "Agrawal_drift"
        ],
        [
            HyperplaneGenerator(mag_change=0.001, noise_percentage=0.1),
            "Hyperplane"
        ],
        [
            ConceptDriftStream(stream=HyperplaneGenerator(),
                               drift_stream=HyperplaneGenerator(),
                               position=40000,
                               width=10000), "Hyperplane_drift"
        ], [SineGenerator(random_state=112), "Sine"],
        [
            ConceptDriftStream(stream=SineGenerator(random_state=112),
                               drift_stream=SineGenerator(random_state=112),
                               position=40000,
                               width=10000), "Sine_drift"
        ]
    ]

    synthDataStreamsUsed = []
    for i in range(len(usedSynthData)):
        synthDataStreamsUsed.append([
            DataStream(pd.read_csv(usedSynthData[i][0]),
                       pd.read_csv(usedSynthData[i][1])),
            synthDataStreams_names[i]
        ])

    realDataStreams = []
    for i in range(len(realDataFiles)):
        realDataStreams.append([
            DataStream(pd.read_csv(realDataFiles[i][0]),
                       pd.read_csv(realDataFiles[i][1])),
            realDataStreams_names[i]
        ])

    clfs = [[RSLVQSgd(), 'RSLVQ_SGD'], [RSLVQAdadelta(), 'RSLVQ_Adadelta'],
            [RSLVQRMSprop(), 'RSLVQ_RMSprop'], [RSLVQAdam(), 'RSLVQ_Adam']]

    max_items = 40000

    #insert the dataset array that should be evaluated; if the reform exception occurs, set the dataset
    #that is affected by it as the first one in the array and run again
    for i in range(len(synthDataStreams)):
        for j in range(len(clfs)):
            #print('bla')
            #custom_evaluation(synthDataStreams[i], clfs[j], max_items, False)
            custom_evaluation(synthDataStreams[i], clfs[j], max_items, True)
Example #15
def main():
    args = parser.parse_args()
    logging = set_logger(args.verbose)
    if not valid_args(args):
        sys.exit(0)

    datasets = args.datasets
    models = [i.lower() for i in args.models]
    copies = [int(i) for i in args.copies]

    dir_path = os.path.dirname(os.path.realpath(__file__))
    to_absolute = curry(to_absolute_path)(dir_path)

    metadata = {
        "experimento": args.experiment or "",
        "command": " ".join(sys.argv),
        "date": time.strftime("%Y%m%d%H%M%S"),
        "models": models,
        "copies": copies,
        "datasets": []
    }
    logging.debug(metadata)

    # DATASET CLASSIFICATION ######
    all_train_data = []
    true_vs_pred = []
    logging.debug(datasets)
    for idx, dataset in enumerate(datasets):
        logging.info("Classifying dataset %s", dataset)
        logging.debug("Loading dataset: %s", dataset)
        x_stream, y_stream, _, label_names = load_given_dataset(dataset)
        logging.debug("Copies per instance: %s", copies[idx])
        x_stream, y_stream = repeatInstances(
            x_stream.todense(), y_stream.todense(), copies=copies[idx])

        data_stream = DataStream(data=x_stream,
                                 y=y_stream, name=dataset)
        cardinality = sum(np.sum(y_stream, axis=1)
                          ) / y_stream.shape[0]
        dataset_metadata = {
            "name": dataset,
            "instances": data_stream.n_remaining_samples(),
            "x_shape": x_stream.shape,
            "y_shape": y_stream.shape,
            "cardinality": cardinality,
            "label_names": [i[0] for i in label_names],
            "copies": copies[idx]
        }
        logging.debug(dataset_metadata)

        for model_id in models:
            model = SUPPORTED_MODELS[model_id]
            logging.info(model["name"])
            train_data = {"model": model["name"], "model_id": model_id,
                          "stream": data_stream.name, "copies": copies[idx]}
            train_stats, true_labels, predictions = evaluar(
                data_stream,
                model["model"](data_stream),
                pretrain_size=args.pretrainsize,
                ensemble=model["ensemble"],
                catch_errors=args.catch,
                logging=logging,
                train_logs_max=100000,
                window_size=20
            )
            eval_stats = {}
            if true_labels and predictions:
                logging.info("Evaluating...")
                eval_stats = evaluation_metrics(
                    true_labels,
                    predictions,
                    train_stats["start_time"],
                    train_stats["end_time"]
                )
                true_vs_pred.append({
                    "model": model_id,
                    "dataset": dataset,
                    "true": true_labels,
                    "pred": predictions
                })
            train_data.update(train_stats)
            train_data.update(eval_stats)
            all_train_data.append(train_data)
            data_stream.restart()

        metadata["datasets"].append(dataset_metadata)
        # Free memory
        del x_stream, y_stream, data_stream

    # END DATASET CLASSIFICATION ######

    # STREAM ANALYSIS ######

    if args.streams:
        print("Stream classification. Not yet implemented.")
        sys.exit(0)
        stream_names = args.streamsnames or []
        if len(stream_names) != len(args.streams):
            logging.error(
                "The number of streams and the number of stream names do not match."
            )
            sys.exit(1)
            metadata["syn_streams"] = []
            for idx, i in enumerate(args.streams):
                stream_path = to_absolute(i)
                stream_name = stream_names[idx]

                logging.info("Classifying syn stream: %s", stream_name)

                logging.info("Loading syn stream to memory")
                _, y_syn, _, _ = load_moa_stream(stream_path, args.labels)

                cardinality = sum(
                    np.sum(y_syn.toarray(), axis=1)
                ) / y_syn.toarray().shape[0]

                metadata["syn_streams"].append({
                    "labels": args.labels,
                    "stream_path": stream_path,
                    "stream_name": stream_name,
                    "y_shape": y_syn.shape,
                    "cardinality": cardinality,
                })

                # END STREAM ANALYSIS ######

    default_output_path = "experiments/"
    dest_dir = "{}_classification".format(
        time.strftime(TIME_STR)
    )
    output_rel = os.path.join(
        args.output if args.output else default_output_path,
        dest_dir
    )
    output_dir = pipe(
        output_rel,
        to_absolute,
        create_path_if_not_exists
    )

    logging.info("Saving results in a csv...")
    pd.DataFrame.from_dict(all_train_data).to_csv(
        os.path.join(
            output_dir, "results.csv"
        )
    )

    logging.info("Saving true_vs_pred in a csv...")
    for i in true_vs_pred:
        true_file = '{}_{}_true.csv'.format(i["dataset"], i["model"])
        pred_file = '{}_{}_predicted.csv'.format(i["dataset"], i["model"])
        np.savetxt(os.path.join(output_dir, true_file),
                   i["true"], delimiter=',')
        np.savetxt(os.path.join(output_dir, pred_file),
                   i["pred"], delimiter=',')

    logging.info("Saving metadata")
    with open(os.path.join(output_dir, 'metadata.json'), 'w') as f_p:
        json.dump(metadata, f_p, indent=4)

    logging.info("Files saved in %s", output_dir)
Example #16
def InnerCycle(X, y, inject_drift, perc_train, window, delta, pval,
               prob_instance, inst_delay):

    # get number of training samples
    ntrain = int(perc_train * X.shape[0])

    if inject_drift:
        # pick a point between 0.7 and 0.9 of the stream
        dpoints = Driftpoints(X)
        dpoints["cleanrun"] = dpoints["row"] - ntrain

        # contaminate X after that point
        X = Swapcols(df=X,
                     class_vec=y,
                     ids=dpoints["cols"],
                     t_change=dpoints["row"])
    else:
        dpoints = dict({"row": X.shape[0], "cols": 0})

    # cast data as DataStream class
    stream = DataStream(X, y)
    stream.prepare_for_use()
    # call incr model (main classifier, teacher model)
    stream_clf = ARF(n_estimators=25,
                     drift_detection_method=None,
                     warning_detection_method=None)

    # get training data... first ntrain rows
    Xtrain, ytrain = stream.next_sample(ntrain)

    # partial fit of the incre model using training data
    stream_clf.fit(Xtrain, ytrain, classes=stream.target_values)
    yhat_train = stream_clf.predict(Xtrain)
    yhat_train_prob = stream_clf.predict_proba(
        Xtrain)  ### needs warnings!!!!!!!!!
    yhat_tr_max_prob = np.array([np.max(x) for x in yhat_train_prob])

    # fit student model
    student_clf = ARF(n_estimators=25,
                      drift_detection_method=None,
                      warning_detection_method=None)
    student_clf.fit(Xtrain, yhat_train, classes=stream.target_values)

    student_regr = RHT()
    student_regr.fit(Xtrain, yhat_tr_max_prob)

    ####### Call drift detectors

    ## Supervised
    # Supervised with ADWIN
    S_ADWIN = ADWIN()  #(delta=delta)
    S_ADWIN_alarms = []
    # Supervised with PHT
    S_PHT = PHT()  #(min_instances=window,delta=delta)
    S_PHT_alarms = []
    # Delayed Supervised with ADWIN
    DS_ADWIN = ADWIN()  #(delta=delta)
    DS_ADWIN_alarms = []
    # Delayed Supervised with PHT
    DS_PHT = PHT()  #(min_instances=window,delta=delta)
    DS_PHT_alarms = []

    ## Semi-supervised
    # Semi-Supervised with ADWIN
    WS_ADWIN = ADWIN()  #(delta=delta)
    WS_ADWIN_alarms = []
    # Semi-supervised with PHT
    WS_PHT = PHT()  #(min_instances=window,delta=delta)
    WS_PHT_alarms = []
    # Delayed semi-supervised with ADWIN
    DWS_ADWIN = ADWIN()  #(delta=delta)
    DWS_ADWIN_alarms = []
    # Delayed semi-supervised with PHT
    DWS_PHT = PHT()  #(min_instances=window,delta=delta)
    DWS_PHT_alarms = []

    ##### Unsupervised
    # Student with ADWIN
    U_ADWIN = ADWIN()  #(delta=delta)
    U_ADWIN_alarms = []
    # Student with PHT
    U_PHT = PHT()  #(min_instances=window,delta=delta)
    U_PHT_alarms = []

    # Student regression with ADWIN
    UR_ADWIN = ADWIN()  #(delta=delta)
    UR_ADWIN_alarms = []
    # Student regression with PHT
    UR_PHT = PHT()  #(min_instances=window,delta=delta)
    UR_PHT_alarms = []

    # WRS with output
    WRS_Output = HypothesisTestDetector(method="wrs", window=window, thr=pval)
    WRS_Output_alarms = []
    # WRS with class prob
    WRS_Prob = HypothesisTestDetector(method="wrs", window=window, thr=pval)
    WRS_Prob_alarms = []
    # TT with output
    TT_Output = HypothesisTestDetector(method="tt", window=window, thr=pval)
    TT_Output_alarms = []
    # TT with class prob
    TT_Prob = HypothesisTestDetector(method="tt", window=window, thr=pval)
    TT_Prob_alarms = []
    # KS with output
    KS_Output = HypothesisTestDetector(method="ks", window=window, thr=pval)
    KS_Output_alarms = []
    # KS with class prob
    KS_Prob = HypothesisTestDetector(method="ks", window=window, thr=pval)
    KS_Prob_alarms = []

    Driftmodels = [
        S_ADWIN, S_PHT, DS_ADWIN, DS_PHT, WS_ADWIN, WS_PHT, DWS_ADWIN, DWS_PHT,
        U_ADWIN, U_PHT, UR_ADWIN, UR_PHT, WRS_Output, TT_Output, KS_Output,
        WRS_Prob, TT_Prob, KS_Prob
    ]

    Driftmodels_alarms = [
        S_ADWIN_alarms, S_PHT_alarms, DS_ADWIN_alarms, DS_PHT_alarms,
        WS_ADWIN_alarms, WS_PHT_alarms, DWS_ADWIN_alarms, DWS_PHT_alarms,
        U_ADWIN_alarms, U_PHT_alarms, UR_ADWIN_alarms, UR_PHT_alarms,
        WRS_Output_alarms, TT_Output_alarms, KS_Output_alarms, WRS_Prob_alarms,
        TT_Prob_alarms, KS_Prob_alarms
    ]

    S_driftmodels = Driftmodels[0:2]
    DS_driftmodels = Driftmodels[2:4]
    WS_driftmodels = Driftmodels[4:6]
    DWS_driftmodels = Driftmodels[6:8]
    Ustd_driftmodels = Driftmodels[8:10]
    Ustdreg_driftmodels = Driftmodels[10:12]
    Uoutput_driftmodels = Driftmodels[12:15]
    Uprob_driftmodels = Driftmodels[15:18]

    # always updated
    S_clf = copy.deepcopy(stream_clf)
    # always updated with delay
    DS_clf = copy.deepcopy(stream_clf)
    # updated immediately with some prob
    WS_clf = copy.deepcopy(stream_clf)
    # updated with delay with some prob
    DWS_clf = copy.deepcopy(stream_clf)
    # never updated
    U_clf = copy.deepcopy(stream_clf)

    i = ntrain
    k = 0
    DWS_yhat_hist = []
    DS_yhat_hist = []
    X_hist = []
    y_hist = []
    while (stream.has_more_samples()):
        print(i)
        #i=3000
        Xi, yi = stream.next_sample()

        y_hist.append(yi[0])
        X_hist.append(Xi)

        ext_Xi = np.concatenate([Xtrain[-10:], Xi])

        U_prob = U_clf.predict_proba(ext_Xi)[-1]
        U_yhat = U_clf.predict(ext_Xi)[-1]
        S_yhat = S_clf.predict(ext_Xi)[-1]
        WS_yhat = WS_clf.predict(ext_Xi)[-1]
        DS_yhat = DS_clf.predict(ext_Xi)[-1]
        DWS_yhat = DWS_clf.predict(ext_Xi)[-1]

        DWS_yhat_hist.append(DWS_yhat)
        DS_yhat_hist.append(DS_yhat)

        if len(U_prob) < 2:
            U_yhat_prob_i = U_prob[0]
        elif len(U_prob) == 2:
            U_yhat_prob_i = U_prob[1]
        else:
            U_yhat_prob_i = np.max(U_prob)

        y_meta_hat_i = student_clf.predict(ext_Xi)[-1]
        y_meta_prob = student_regr.predict(ext_Xi)[-1]

        # Updating student model
        student_clf.partial_fit(Xi, [U_yhat])
        # Updating supervised model
        S_clf.partial_fit(Xi, yi)

        # Computing loss
        S_err_i = int(yi[0] != S_yhat)
        student_err_i = int(y_meta_hat_i != U_yhat)
        student_prob_err_i = U_yhat_prob_i - y_meta_prob

        for model in S_driftmodels:
            model.add_element(S_err_i)

        for model in Ustd_driftmodels:
            model.add_element(student_err_i)

        for model in Ustdreg_driftmodels:
            model.add_element(student_prob_err_i)

        for model in Uoutput_driftmodels:
            model.add_element(U_yhat)

        for model in Uprob_driftmodels:
            model.add_element(U_yhat_prob_i)

        put_i_available = np.random.binomial(1, prob_instance)

        if k >= inst_delay:
            DS_err_i = int(
                y_hist[k - inst_delay] != DS_yhat_hist[k - inst_delay])
            DS_clf.partial_fit(X_hist[k - inst_delay],
                               [y_hist[k - inst_delay]])
            for model in DS_driftmodels:
                model.add_element(DS_err_i)

            if put_i_available > 0:
                DWS_err_i = int(
                    y_hist[k - inst_delay] != DWS_yhat_hist[k - inst_delay])
                DWS_clf.partial_fit(X_hist[k - inst_delay],
                                    [y_hist[k - inst_delay]])
                for model in DWS_driftmodels:
                    model.add_element(DWS_err_i)

        if put_i_available > 0:
            WS_err_i = int(yi[0] != WS_yhat)
            WS_clf.partial_fit(Xi, yi)
            for model in WS_driftmodels:
                model.add_element(WS_err_i)

        # detect changes
        for j, model in enumerate(Driftmodels):
            has_change = model.detected_change()
            if has_change:
                Driftmodels_alarms[j].append(i)

        i += 1
        k += 1

    return ([Driftmodels_alarms, dpoints])
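
A hedged sketch of how `InnerCycle`'s output might be summarized; the argument values and the delay computation below are assumptions for illustration only:

# hypothetical post-processing: first alarm of each detector relative to the injected drift point
alarms, dpoints = InnerCycle(X, y, inject_drift=True, perc_train=0.3,
                             window=100, delta=0.002, pval=0.01,
                             prob_instance=0.2, inst_delay=500)
for detector_alarms in alarms:
    delay = detector_alarms[0] - dpoints["row"] if detector_alarms else None
    print(delay)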
Example #17
def test_data_stream_X_y(test_path):
    test_file = os.path.join(test_path, 'sea_stream_file.csv')
    raw_data = pd.read_csv(test_file)
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    stream = DataStream(X, y)

    assert stream._Y_is_defined

    assert stream.n_remaining_samples() == 40

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == '1 target(s), 2 classes'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.alltrue(X == X_expected)
    assert np.alltrue(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]

    # Ensure that the regression case is also covered
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    y = y.astype('float64')
    stream = DataStream(X, y, name='Test')

    assert stream.task_type == 'regression'
    assert stream.get_data_info() == 'Test: 1 target(s)'
Example #18
"""
Data source: https://github.com/alipsgh/data_streams
"""
#data, X, y = read_data_arff('./data/stagger_w_50_n_0.1_103.arff')
#data, X, y = read_data_arff('./data/led_w_500_n_0.1_104.arff')

# 1.c Load and preprocessing data
"""
Data source: https://github.com/scikit-multiflow/streaming-datasets
"""
#data, X, y = read_data_csv('./data/streaming-datasets-master/elec.csv')
#data, X, y = read_data_csv('./data/streaming-datasets-master/airlines.csv')
#data, X, y = read_data_csv('./data/streaming-datasets-master/agr_a.csv')
#data, X, y = read_data_csv('./data/streaming-datasets-master/covtype.csv')

stream = DataStream(X, y)

stream.prepare_for_use()

# 2a. Models initialization
nb = NaiveBayes()
ht = HoeffdingTreeClassifier()
aw = AccuracyWeightedEnsembleClassifier()
dw = DynamicWeightedMajorityClassifier()
ob = OnlineBoostingClassifier()
oz = OzaBaggingClassifier()

# 2b. Initialization of DDCW model for comparison tests
dwc = DiversifiedDynamicClassWeightedClassifier(
    period=100,
    base_estimators=[NaiveBayes(), HoeffdingTreeClassifier()],
Example #19
def test_data_stream(test_path):
    test_file = os.path.join(test_path, 'sea_stream_file.csv')
    raw_data = pd.read_csv(test_file)
    stream = DataStream(raw_data, name='Test')

    assert stream.n_remaining_samples() == 40

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == 'Test: 1 target(s), 2 classes'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.alltrue(X[0] == X_expected[0])
    assert np.alltrue(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.alltrue(X == X_expected)
    assert np.alltrue(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]

    assert 'stream' == stream._estimator_type

    expected_info = "DataStream(n_targets=-1, target_idx=1, cat_features=None, name='Test')"
    assert stream.get_info() == expected_info
Example #20
        temp = sum(x[low_index:high_index])/N
        w_avg.append(temp)
        low_index = low_index + N
        high_index = high_index + N
    return w_avg


# MAIN CODE
warnings.warn = warn
warnings.simplefilter(action='ignore', category=FutureWarning)

df = select_data(sys.argv[1])
nu = float(sys.argv[2])
size = int(sys.argv[3])
percent = float(sys.argv[4])
stream = DataStream(df)
final_acc, st_rec = unsupervised_analysis(df,nu,size,percent)
print(final_acc)


# PLOT CODE
temp=int((len(st_rec))/30)
st_rec2 = window_average(st_rec, temp)
x = np.linspace(0, 100, len(st_rec2), endpoint=True)

f = plt.figure()
plt.plot(x, st_rec2, 'r', label='OCDD', marker="*")
plt.xlabel('Percentage of data', fontsize=10)
plt.ylabel('Accuracy', fontsize=10)
plt.grid(True)
plt.legend(loc='lower left')
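
The snippet ends before the figure is written out; a plausible completion (the output filename is an assumption):

# assumed completion: save and/or display the accuracy plot
f.savefig('ocdd_accuracy.png', bbox_inches='tight')
plt.show()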