def test_adaptive_forest():
    test_data_directory = os.path.join(TEST_DIRECTORY, 'data')
    test_file = os.path.join(
        test_data_directory,
        'test_data/weather.csv'
    )
    raw_data = pd.read_csv(test_file)
    stream1 = DataStream(raw_data, name='Test')
    stream2 = DataStream(raw_data, name='Test')
#    learner = ExtendedHoeffdingAdaptiveTree()
#    learner1 = AdaptiveHoeffdingTreeEnsemble(n_estimators=4)
    # stream1_learner = calculate_accuracy(learner, stream1, stream1.n_samples)
#    stream2_learner = calculate_accuracy(learner1, stream2, stream2.n_samples)
#    stream1_learner = calculate_accuracy(learner, stream1, stream1.n_samples)
    learner3 = AdaptiveRandomForestClassifier(n_estimators=10)
    stream3_learner = calculate_accuracy(learner3, stream1, stream1.n_samples)
#    learner4 = StreamingRandomPatchesClassifier(n_estimators=3)
#    stream4_learner = calculate_accuracy(learner4, stream1, stream1.n_samples)
#    learner5 = DeepStreamLearner(classes=stream1.target_values)
#    stream5_learner = calculate_accuracy(learner5, stream1, stream1.n_samples)

#    print(stream2_learner.base_estimator.accuracy)
    with open(
            os.path.join(test_data_directory, 'test_data/adaptive_test_result.txt'),
            'w'
    ) as f:
        f.write('stream3 learner: {}\n'.format(stream3_learner))

    assert stream3_learner is not None
Example #2
def partial_fit(self, data: pd.DataFrame) -> None:
    """Most basic working version for now.

    TODO: improve, report accuracy incrementally, maybe add the possibility
    to learn in batches.
    """
    stream = DataStream(self.prepare_data(data))
    n = stream.n_remaining_samples()
    for _ in range(n):
        x, y = stream.next_sample()
        if self.model.predict(x)[0] == y[0]:
            self.correct_predictions += 1
        self.model.partial_fit(x, y)
    self.predictions += n
    self.accuracy = self.correct_predictions / self.predictions
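# A minimal sketch of the batched variant the TODO above mentions. It assumes the
# same attributes as partial_fit (self.prepare_data, self.model and the prediction
# counters); the method name and batch_size parameter are illustrative additions.
def partial_fit_batched(self, data: pd.DataFrame, batch_size: int = 100) -> None:
    """Consume the stream in fixed-size batches instead of one sample at a time."""
    stream = DataStream(self.prepare_data(data))
    while stream.has_more_samples():
        x, y = stream.next_sample(batch_size)  # the final batch may be shorter
        if y is None or len(y) == 0:
            break
        self.correct_predictions += int((self.model.predict(x) == y).sum())
        self.model.partial_fit(x, y)
        self.predictions += len(y)
    self.accuracy = self.correct_predictions / self.predictions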
Example #3

def test_hoeffding_tree_ensemble():
    test_data_directory = os.path.join(TEST_DIRECTORY, 'data')
    test_file = os.path.join(
        test_data_directory,
        'test_data/airlines.csv'
    )
    test_file2 = os.path.join(
        test_data_directory,
        'test_data/weather.csv'
    )
    raw_data = pd.read_csv(test_file)
    raw_data2 = pd.read_csv(test_file2)
    stream3 = DataStream(raw_data2, name='Test2')
    stream1 = DataStream(raw_data, name='Test')
    stream2 = RandomTreeGenerator(
        tree_random_state=23, sample_random_state=12, n_classes=4,
        n_cat_features=2, n_num_features=5, n_categories_per_cat_feature=5,
        max_tree_depth=6, min_leaf_depth=3, fraction_leaves_per_level=0.15
    )
#    learner = HoeffdingTreeClassifier(
#        leaf_prediction='nb',
#        classes=stream.target_values
#    )

#    learner = HoeffdingTreeEnsemble(
#        n_estimators=3,
#        classes=stream.target_values)
#    learner1 = DeepStreamLearner(classes=stream1.target_values)
#    stream1_learner = calculate_accuracy(learner1, stream1, stream1.n_samples)
    learner2 = DeepStreamLearner(classes=stream2.target_values)
    stream2_learner = calculate_accuracy(learner2, stream2, 100000)
    learner3 = DeepStreamLearner(classes=stream3.target_values)
    stream3_learner = calculate_accuracy(learner3, stream3, stream3.n_samples)
    with open(
        os.path.join(test_data_directory, 'test_data/test_result.txt'),
        'w'
    ) as f:
#        f.write('stream1 accuracy: {} \n'.format(stream1_learner.accuracy[-1]))
#        f.write('stream1 first_layer_accuracy: {} \n'.format(stream1_learner.first_layer_cascade.accuracy_per_sample[-1]))
#        f.write('stream1 average_accuracy: {} \n'.format(sum(stream1_learner.accuracy)/stream1_learner.number_of_samples))
#        f.write('stream1 first_layer_average_accuracy: {} \n \n'.format(sum(stream1_learner.first_layer_cascade.accuracy_per_sample)/stream1_learner.number_of_samples))
#        f.write('stream2 accuracy: {} \n'.format(stream2_learner.accuracy[-1]))
#        f.write('stream2 first_layer_accuracy: {} \n'.format(stream2_learner.first_layer_cascade.accuracy_per_sample[-1]))
        f.write('stream2 average_accuracy: {} \n'.format(sum(stream2_learner.accuracy)/stream2_learner.number_of_samples))
        f.write('stream2 first_layer_average_accuracy: {} \n \n'.format(sum(stream2_learner.first_layer_cascade.accuracy_per_sample)/stream2_learner.number_of_samples))
        f.write('stream3 accuracy: {} \n'.format(stream3_learner.accuracy[-1]))
        f.write('stream3 first_layer_accuracy: {} \n'.format(stream3_learner.first_layer_cascade.accuracy_per_sample[-1]))
        f.write('stream3 average_accuracy: {} \n'.format(sum(stream3_learner.accuracy)/stream3_learner.number_of_samples))
        f.write('stream3 first_layer_average_accuracy: {} \n \n'.format(sum(stream3_learner.first_layer_cascade.accuracy_per_sample)/stream3_learner.number_of_samples))

    assert stream2_learner is not None
    assert stream3_learner is not None
Example #4
def test_pipeline(test_path):
    n_categories = 5

    # Load test data generated using:
    # RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
    #                     n_cat_features=n_categories, n_num_features=0)
    test_file = os.path.join(test_path, 'data-one-hot.npz')
    data = np.load(test_file)
    X = data['X']
    y = data['y']
    stream = DataStream(data=X, y=y)
    stream.prepare_for_use()

    # Setup transformer
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_categories * n_categories, n_categories)
                   ]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNAdwin(n_neighbors=2, max_window_size=50, leaf_size=40)
    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer), ('KNNAdwin', classifier)])
    # Setup the evaluator
    evaluator = EvaluatePrequential(show_plot=False,
                                    pretrain_size=10,
                                    max_samples=100)
    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

    metrics = evaluator.get_mean_measurements()

    expected_accuracy = 0.5555555555555556
    assert np.isclose(expected_accuracy, metrics[0].accuracy_score())

    expected_kappa = 0.11111111111111116
    assert np.isclose(expected_kappa, metrics[0].kappa_score())
    print(pipe.get_info())
    expected_info = "Pipeline:\n" \
                    "[OneHotToCategorical(categorical_list=[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9],\n" \
                    "                                      [10, 11, 12, 13, 14],\n" \
                    "                                      [15, 16, 17, 18, 19],\n" \
                    "                                      [20, 21, 22, 23, 24]])\n" \
                    "KNNAdwin(leaf_size=40, max_window_size=50, n_neighbors=2,\n" \
                    "         nominal_attributes=None)]"
    assert pipe.get_info() == expected_info
Example #5
def get_error_hoeffdingtree(data, pre_train_size, **hf_kwargs):
    orig_X = data[:, :-1]
    orig_y = data[:, -1].astype(int)
    stream = DataStream(orig_X, orig_y)
    hf = HoeffdingTreeClassifier(**hf_kwargs)

    pretrainX, pretrainy = stream.next_sample(pre_train_size)

    # Pre-train
    hf.partial_fit(pretrainX, pretrainy, classes=stream.target_values)

    evaluations = []
    while stream.has_more_samples():
        X, y = stream.next_sample()

        # Evaluation
        y_hat = hf.predict(X)
        evaluations.append(int(y_hat[0] == y[0]))

        # Train
        hf.partial_fit(X, y, classes=stream.target_values)

    return evaluations
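
# A hedged usage sketch: the synthetic data from sklearn's make_classification and
# the grace_period value are illustrative, not part of the original snippet.
import numpy as np
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=1)
demo_data = np.hstack([X_demo, y_demo.reshape(-1, 1)])
evals = get_error_hoeffdingtree(demo_data, pre_train_size=50, grace_period=50)
print("prequential accuracy:", np.mean(evals))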
Example #6
def train(name, clusters, window, normalize=False):
    input_csv = '{}{}_clusters={}_window={}_prepared.csv'.format(
        DATA_LOCATION, name, clusters, window)
    data = pd.read_csv(input_csv, index_col=0)

    if normalize:
        states = data.filter(['current_state', 'next_state'])
        sensors = data.drop(columns=['current_state', 'next_state'])
        scaler = StandardScaler()
        data = pd.DataFrame(data=scaler.fit_transform(X=sensors),
                            index=data.index,
                            columns=sensors.columns)
        data = pd.concat([data, states], axis='columns')

    stream = DataStream(data)

    hf = HoeffdingTreeClassifier()
    sgd = SGDClassifier()

    evaluator = EvaluatePrequential()
    evaluator.evaluate(stream=stream, model=[hf, sgd])
    # print('---------------------------------------------')
    # measurements = evaluator.get_mean_measurements()[0]
    # print(measurements.confusion_matrix)
    # print(measurements.accuracy_score())
    data = []
    for i, measurements in enumerate(evaluator.get_mean_measurements()):
        data.append([
            name, clusters, window, MODEL_NAMES[i], normalize,
            measurements.accuracy_score(),
            measurements.precision_score(),
            measurements.recall_score(),
            measurements.f1_score()
        ])
    return pd.DataFrame(data=data,
                        columns=[
                            'name', 'clusters', 'window', 'model',
                            'normalized', 'accuracy', 'precision', 'recall',
                            'f1'
                        ])
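
# Hedged usage sketch: DATA_LOCATION, MODEL_NAMES and the dataset name are
# hypothetical; MODEL_NAMES must follow the order of the model list passed to
# the evaluator above ([hf, sgd]).
DATA_LOCATION = './prepared/'
MODEL_NAMES = ['HoeffdingTree', 'SGDClassifier']

results = train('kitchen', clusters=8, window=10, normalize=True)
print(results[['model', 'accuracy', 'f1']])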
Example #7
def read_data(filename):
    df = pd.read_csv(filename, comment='#')
    y = df['Target'].values
    anoms = (y == "'Anomaly'")
    normal = (y == "'Normal'")
    y[anoms] = 1
    y[normal] = 0
    X = df.drop(["Target"], axis=1)
    return X, y


n_samples = 5000

X, y = generate_data(n_samples)
# data_file="data/mulcross.csv"
# X,y=read_data(data_file)
stream = DataStream(data=X, y=y)
n_clusters = 3
random_state = 3

# 2. Prepare for use
stream.prepare_for_use()

kmeans1 = KMeans(n_clusters=n_clusters,
                 random_state=random_state,
                 init="k-means++")
kmeans2 = KMeans(n_clusters=n_clusters,
                 random_state=random_state,
                 init="k-means++")

coreset_size = 100
Example #8
            precisions = []
            recalls = []
            f1s = []
            tprs = []
            aucs = []
            mean_fpr = np.linspace(0, 1, 100)

            for fold, split in enumerate(cross_validation.split(X_train, y_train)):
                fold_train_indexes, fold_test_indexes = split
                fold_X_train = X_train.iloc[fold_train_indexes]
                fold_y_train = y_train.iloc[fold_train_indexes]
                fold_X_test = X_train.iloc[fold_test_indexes]
                fold_y_test = y_train.iloc[fold_test_indexes]

                if (classifier_name == 'hoeffding'):
                    stream = DataStream(X, y.values.ravel())
                    stream.prepare_for_use()
                    evaluator = EvaluatePrequential(
                        show_plot=False, pretrain_size=200, metrics=['accuracy'])
                    model = evaluator.evaluate(
                        stream=stream, model=classifier)[0]
                    model.fit(fold_X_train, fold_y_train.values.ravel())

                # elif (classifier_name == 'cn2'):
                #     model = CrossValidation(
                #         table_from_frame(data), [CN2Learner()], k=5)

                else:
                    model = classifier.fit(
                        fold_X_train, fold_y_train.values.ravel())
                    y_pred = model.predict(fold_X_test)
Example #9
# Global variable
TRAINING_SIZE = 1
grace = 1000
ignore = 0

elec_data = arff.load("elecNormNew.arff")
elec_df = pandas.DataFrame(elec_data)
elec_df.columns = ['date', 'day', 'period', 'nswprice', 'nswdemand', 'vicprice', 'vicdemand', 'transfer', 'class']
mapping = {"day":{"1":1, "2":2, "3":3, "4":4, "5":5, "6":6, "7":7}, "class": {"UP": 0, "DOWN": 1}}
elec_df = elec_df.replace(mapping)

elec_full_df = pandas.concat([elec_df] * 200)

STREAM_SIZE = elec_full_df.shape[0]

elec_stream = DataStream(elec_full_df, name="elec")
elec_stream.prepare_for_use()

X_train, y_train = elec_stream.next_sample(TRAINING_SIZE)

ht = HoeffdingTreeClassifier()

ht.partial_fit(X_train, y_train)

n_global = ignore + TRAINING_SIZE  # Cumulative Number of observations
d_ddm = 0
w_ddm = 0
TP_ddm = []
FP_ddm = []
RT_ddm = []
DIST_ddm = []
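
# A hedged sketch of the detection loop these counters typically feed; the DDM
# import and loop body are assumptions, not part of the original snippet, and the
# TP/FP/RT/DIST bookkeeping (which needs the known drift positions) is omitted.
from skmultiflow.drift_detection import DDM

ddm = DDM()
while elec_stream.has_more_samples():
    X, y = elec_stream.next_sample()
    y_hat = ht.predict(X)
    ddm.add_element(int(y_hat[0] != y[0]))  # feed the 0/1 error indicator
    if ddm.detected_warning_zone():
        w_ddm += 1
    if ddm.detected_change():
        d_ddm += 1
    ht.partial_fit(X, y)
    n_global += 1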
Example #10
from skmultiflow.evaluation import EvaluatePrequential
from forget_tree import ForgetHATT
from skmultiflow.data import DataStream
import pandas as pd
from skmultiflow.trees import HATT

data_filepath = "../datasets/transient_chess.data"
labels_filepath = "../datasets/transient_chess.labels"

data = pd.read_csv(data_filepath, delimiter=" ")
labels = pd.read_csv(labels_filepath, delimiter=" ")
labels['y'] = labels['y'].astype('category')

stream = DataStream(data=data, y=labels)
stream.prepare_for_use()

evaluator = EvaluatePrequential(output_file="log.log",
                                show_plot=False,
                                metrics=['accuracy'],
                                max_time=60)

models = [
    ForgetHATT(data_filepath,
               labels_filepath,
               forget_percentage=0,
               delimiter=" "),
    ForgetHATT(data_filepath,
               labels_filepath,
               forget_percentage=0.1,
               delimiter=" "),
    # The original snippet is truncated here; the closing entry below assumes a
    # third forget_percentage value for illustration.
    ForgetHATT(data_filepath,
               labels_filepath,
               forget_percentage=0.5,
               delimiter=" "),
]
Example #11
def start_run(options):
    if not os.path.exists(options.experiment_directory):
        print('No Directory')
        return
    name = '-'.join([options.moa_learner, str(options.concept_limit), 'py'])
    print(name)
    datastream_filename = None
    datastream_pickle_filename = None
    fns = glob.glob(os.sep.join([options.experiment_directory, "*.ARFF"]))
    print(fns)
    for fn in fns:
        if fn.split('.')[-1] == 'ARFF':
            actual_fn = fn.split(os.sep)[-1]
            fn_path = os.sep.join(fn.split(os.sep)[:-1])
            print(actual_fn)
            print(fn_path)
            pickle_fn = f"{actual_fn.split('.')[0]}_concept_chain.pickle"
            pickle_full_fn = os.sep.join([fn_path, pickle_fn])
            csv_fn = f"{name}.csv"
            csv_full_fn = os.sep.join([fn_path, csv_fn])
            print(csv_full_fn)
            if os.path.exists(pickle_full_fn):
                skip_file = False
                if os.path.exists(csv_full_fn):
                    if os.path.getsize(csv_full_fn) > 2000:
                        skip_file = True
                if not skip_file:
                    datastream_filename = fn
                    datastream_pickle_filename = pickle_full_fn
                    break
                else:
                    print('csv exists')
    if datastream_filename is None:
        print('No datastream file')
        return
    print(datastream_filename)

    bat_filename = f"{options.experiment_directory}{os.sep}{name}.{'bat' if not options.using_linux else 'sh'}"
    if not os.path.exists(bat_filename) or True:
        with open(f'{datastream_pickle_filename}', 'rb') as f:
            concept_chain = pickle.load(f)
        print(concept_chain)
        concepts = sorted(list(concept_chain.keys()))
        num_examples = concepts[-1] + (concepts[-1] - concepts[-2])
        stream_string = moaLink.get_moa_stream_from_filename(
            os.sep.join(datastream_filename.split(os.sep)[:-1]),
            datastream_filename.split(os.sep)[-1])
        moa_string = moaLink.make_moa_command(stream_string,
                                              options.moa_learner,
                                              options.concept_limit,
                                              'int',
                                              num_examples,
                                              config.report_window_length,
                                              options.experiment_directory,
                                              is_bat=not options.using_linux)
        moaLink.save_moa_bat(moa_string, bat_filename, not options.using_linux)
        # datastream = None
    t_start = process_time()
    command = f"{bat_filename} {options.moa_location}"
    print(command)
    print(options.moa_learner)
    if options.moa_learner != 'arf':
        if options.using_linux:

            subprocess.run(['chmod', '+x', bat_filename])
            subprocess.run([bat_filename, options.moa_location])
        else:
            subprocess.run(command)
    else:
        datastream_filename = f"{os.sep.join(datastream_filename.split(os.sep)[:-1])}{os.sep}{datastream_filename.split(os.sep)[-1]}"
        data = arff.loadarff(datastream_filename)
        df = pd.DataFrame(data[0], dtype='float64')
        df['y0'] = df['y0'].astype('int64')
        # df["y0"] = df["y0"].astype('category')
        print(df.info())
        datastream = DataStream(df)
        datastream.prepare_for_use()

        print(datastream.target_values)
        learner = AdaptiveRandomForest(n_estimators=int(options.concept_limit))
        right = 0
        wrong = 0
        overall_log = []
        while datastream.has_more_samples():
            X, y = datastream.next_sample()
            prediction = learner.predict(X)
            is_correct = prediction[0] == y[0]
            if is_correct:
                right += 1
            else:
                wrong += 1
            learner.partial_fit(X, y)
            if (right + wrong) > 0 and (right + wrong) % 200 == 0:
                overall_log.append((right + wrong, right / (right + wrong)))
                print(f'ex: {right + wrong}, Acc: {right / (right + wrong)}\r',
                      end="")
        overall = pd.DataFrame(overall_log, columns=['ex', 'overall_accuracy'])
        overall.to_csv(f"{options.experiment_directory}{os.sep}{name}.csv")
        print("")
        print(f'Accuracy: {right / (right + wrong)}')
    #fsm, system_stats, concept_chain, ds, stream_examples =  fsmsys.run_fsm(datastream, options, suppress = True, name = name, save_checkpoint=True)
    t_stop = process_time()
    print("")
    print("Elapsed time during the whole program in seconds:",
          t_stop - t_start)
Example #12
def start_run(options):
    if not os.path.exists(options.experiment_directory):
        print('No Directory')
        return
    datastream_filename = None
    datastream_pickle_filename = None
    fns = glob.glob(os.sep.join([options.experiment_directory, "*.ARFF"]))
    print(fns)
    # mm_options = ['rA', 'age', 'LRU', 'acc'] if options.memory_management == 'all' else [options.memory_management]
    for fn in fns:
        save_mm = options.memory_management
        mm_options = [options.memory_management
                      ] if options.memory_management != 'all' else [
                          "score", "rA", 'auc', "age", "LRU", 'acc', 'div'
                      ]
        mm_options = mm_options if options.memory_management != 'mine' else [
            'auc', "score", "rA"
        ]
        for mm in mm_options:
            print(mm)
            options.memory_management = mm
            sys_name = 'system'
            if options.drift_detector != 'adwin':
                sys_name += f"{options.drift_detector}"
            name = '-'.join([
                sys_name,
                str(options.noise),
                str(options.concept_limit),
                str(options.memory_management),
                str(options.sensitivity),
                str(options.window),
                str(options.optimal_selection),
                str(options.learner_str),
                str(options.poisson),
                str(options.seed),
                str(options.optimal_drift),
                str(options.similarity_measure),
                str(options.merge_strategy),
                str(options.merge_similarity)
            ])
            name_no_seed = '-'.join([
                sys_name,
                str(options.noise),
                str(options.concept_limit),
                str(options.memory_management),
                str(options.sensitivity),
                str(options.window),
                str(options.optimal_selection),
                str(options.learner_str),
                str(options.poisson), "*",
                str(options.optimal_drift),
                str(options.similarity_measure),
                str(options.merge_strategy),
                str(options.merge_similarity)
            ])
            print(name)
            if fn.split('.')[-1] == 'ARFF':
                actual_fn = fn.split(os.sep)[-1]
                fn_path = os.sep.join(fn.split(os.sep)[:-1])
                print(actual_fn)
                print(fn_path)
                pickle_fn = f"{actual_fn.split('.')[0]}_concept_chain.pickle"
                pickle_full_fn = os.sep.join([fn_path, pickle_fn])
                csv_fn = f"{name}.csv"
                csv_full_fn = os.sep.join([fn_path, csv_fn])
                print(f"checking {csv_full_fn}")

                concept_chain_exists = os.path.exists(pickle_full_fn)

                if not options.no_chain and not concept_chain_exists:
                    print("No concept chain pickle file")
                    continue

                skip_file = False

                existing_matches = glob.glob(
                    os.sep.join([fn_path, f"{name_no_seed}.csv"]))
                if len(existing_matches):
                    if any(
                        [os.path.getsize(x) > 2000 for x in existing_matches]):
                        skip_file = True
                if not skip_file:
                    datastream_filename = fn
                    datastream_pickle_filename = pickle_full_fn
                else:
                    print(f'{csv_full_fn} exists')

            if datastream_filename is None:
                print('No datastream file')
                continue
            print(datastream_filename)

            if not options.no_chain:
                with open(f'{datastream_pickle_filename}', 'rb') as f:
                    concept_chain = pickle.load(f)
            else:
                concept_chain = None

            with open(f"{options.experiment_directory}{os.sep}{name}_info.txt",
                      "w") as f:
                f.write(
                    json.dumps(options.__dict__,
                               default=lambda o: '<not serializable>'))
                f.write(
                    f"\n Git Commit: {subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).strip()}"
                )

            try:
                data = arff.loadarff(datastream_filename)
                df = pd.DataFrame(data[0])
            except Exception as e:
                print(e)
                print("trying csv")
                df = pd.read_csv(datastream_filename, header=None)

            for c_i, c in enumerate(df.columns):

                if pd.api.types.is_string_dtype(df[c]):
                    print(f"Factoizing {c}")
                    print(pd.factorize(df[c])[0].shape)
                    df[c] = pd.factorize(df[c])[0]

                # print(f"{c_i}: {len(df.columns) - 1}")
                # if c_i == len(df.columns) - 1:
                #     print(f"converting {c}")
                #     df[c] = df[c].astype('category')

            print(df.info())

            datastream = DataStream(df)
            datastream.concept_chain = concept_chain
            print(concept_chain)
            datastream.prepare_for_use()
            t_start = process_time()
            print(options.__dict__)
            classifier = FSMClassifier(
                concept_limit=options.concept_limit,
                memory_management=options.memory_management,
                learner=options.learner,
                window=options.window,
                sensitivity=options.sensitivity,
                concept_chain=concept_chain,
                optimal_selection=options.optimal_selection,
                optimal_drift=options.optimal_drift,
                rand_weights=options.rand_weights,
                poisson=options.poisson,
                similarity_measure=options.similarity_measure,
                merge_strategy=options.merge_strategy,
                use_clean=options.use_clean,
                merge_similarity=options.merge_similarity,
                drift_detector=options.drift_detector)
            avg_memory, max_memory = evaluate_prequential.evaluate_prequential(
                datastream,
                classifier,
                directory=options.experiment_directory,
                name=name,
                noise=options.noise,
                seed=options.seed)
            t_stop = process_time()
            print("")
            print("Elapsed time during the whole program in seconds:",
                  t_stop - t_start)
            with open(
                    f"{options.experiment_directory}{os.sep}{name}_timer.txt",
                    "w") as f:
                f.write(
                    f"Elapsed time during the whole program in seconds: {t_stop-t_start}"
                )
            with open(
                    f"{options.experiment_directory}{os.sep}{name}_memory.txt",
                    "w") as f:
                f.write(f"Average: {avg_memory}\n")
                f.write(f"Max: {max_memory}")
            options.memory_management = save_mm
        options.memory_management = save_mm
Example #13
X = tdf[["Pressure (millibars)", "Humidity",
         "Wind Speed (km/h)"]].resample("6H").mean()
y = tdf[["Temperature (C)"]].resample("6H").max()

X.plot(subplots=True, layout=(1, 3))
y.plot()

#%%

reload(samknnreg)
from samknnreg import SAMKNNRegressor

sam = SAMKNNRegressor()
hat = RegressionHAT()
rht = RegressionHoeffdingTree()
ds = DataStream(X, y=y)
ds.prepare_for_use()

evaluator = EvaluatePrequential(
    show_plot=True,
    n_wait=730,
    batch_size=28,
    metrics=['mean_square_error', 'true_vs_predicted'])

#%%
evaluator.evaluate(stream=ds,
                   model=[sam, rht, hat],
                   model_names=[
                       "SAM", "Hoeffding Tree Regressor",
                       "Hoeffding Tree Regressor (Adaptive)"
                   ])
Example #14

def start_run(options):
    if not os.path.exists(options.experiment_directory):
        print('No Directory')
        return
    name = '-'.join([
        options.moa_learner,
        str(options.concept_limit), 'pyn',
        str(options.seed)
    ])
    print(name)
    datastream_filename = None
    datastream_pickle_filename = None
    fns = glob.glob(os.sep.join([options.experiment_directory, "*.ARFF"]))
    print(fns)
    for fn in fns:
        if fn.split('.')[-1] == 'ARFF':
            actual_fn = fn.split(os.sep)[-1]
            fn_path = os.sep.join(fn.split(os.sep)[:-1])
            print(actual_fn)
            print(fn_path)
            csv_fn = f"{name}.csv"
            csv_full_fn = os.sep.join([fn_path, csv_fn])
            print(csv_full_fn)
            skip_file = False
            if os.path.exists(csv_full_fn):
                if os.path.getsize(csv_full_fn) > 2000:
                    skip_file = True
            if not skip_file:
                datastream_filename = fn
                break
            else:
                print('csv exists')
    if datastream_filename is None:
        print('No datastream file')
        return
    print(datastream_filename)

    datastream_filename = f"{os.sep.join(datastream_filename.split(os.sep)[:-1])}{os.sep}{datastream_filename.split(os.sep)[-1]}"
    data = arff.loadarff(datastream_filename)
    df = pd.DataFrame(data[0])
    print(df.tail())
    for c in df.columns:
        print(f"Factoizing {c}")
        if pd.api.types.is_string_dtype(df[c]):
            print(pd.factorize(df[c])[0].shape)
            df[c] = pd.factorize(df[c])[0]
    print(df.tail())

    bat_filename = f"{options.experiment_directory}{os.sep}{name}.{'bat' if not options.using_linux else 'sh'}"
    if not os.path.exists(bat_filename) or True:
        num_examples = df.shape[0]
        stream_string = moaLink.get_moa_stream_from_filename(
            os.sep.join(datastream_filename.split(os.sep)[:-1]),
            datastream_filename.split(os.sep)[-1])
        moa_string = moaLink.make_moa_command(stream_string,
                                              options.moa_learner,
                                              options.concept_limit,
                                              'int',
                                              num_examples,
                                              config.report_window_length,
                                              options.experiment_directory,
                                              is_bat=not options.using_linux,
                                              name=name,
                                              num_features=len(df.columns) - 1)
        moaLink.save_moa_bat(moa_string, bat_filename, not options.using_linux)
        # datastream = None
    t_start = process_time()
    command = f'{bat_filename} "{options.moa_location}"'
    print(command)
    print(options.moa_learner)
    if options.moa_learner != 'arf' or options.use_moa:
        if options.using_linux:

            subprocess.run(['chmod', '+x', bat_filename])
            subprocess.run([bat_filename, options.moa_location])
        else:
            subprocess.run(command)
    else:

        # df['y0'] = df['y0'].astype('int64')
        # df["y0"] = df["y0"].astype('category')
        print(df.info())
        datastream = DataStream(df)
        datastream.prepare_for_use()

        print(datastream.target_values)
        learner = AdaptiveRandomForest(n_estimators=int(options.concept_limit))
        avg_memory, max_memory = evaluate_prequential(
            datastream=datastream,
            classifier=learner,
            directory=options.experiment_directory,
            name=name)
        # right = 0
        # wrong = 0
        # overall_log = []
        # while datastream.has_more_samples():
        #     X,y = datastream.next_sample()
        #     prediction = learner.predict(X)
        #     is_correct = prediction[0] == y[0]
        #     if is_correct:
        #         right += 1
        #     else:
        #         wrong += 1
        #     learner.partial_fit(X, y)
        #     if (right + wrong) > 0 and (right + wrong) % 200 == 0:
        #         overall_log.append((right+ wrong, right / (right + wrong)))
        #         print(f'ex: {right + wrong}, Acc: {right / (right + wrong)}\r', end = "")
        # overall = pd.DataFrame(overall_log, columns = ['ex', 'overall_accuracy'])
        # overall.to_csv(f"{options.experiment_directory}{os.sep}{name}.csv")
        # print("")
        # print(f'Accuracy: {right / (right + wrong)}')
    #fsm, system_stats, concept_chain, ds, stream_examples =  fsmsys.run_fsm(datastream, options, suppress = True, name = name, save_checkpoint=True)
    t_stop = process_time()
    print("")
    print("Elapsed time during the whole program in seconds:",
          t_stop - t_start)
    with open(f"{options.experiment_directory}{os.sep}{name}_timer.txt",
              "w") as f:
        f.write(
            f"Elapsed time during the whole program in seconds: {t_stop-t_start}"
        )
Example #15
warnings.filterwarnings('ignore')
plt.style.use("seaborn-whitegrid")

# Global variable
TRAINING_SIZE = 1
grace = 1000
ignore = 0

weather_data = arff.load('weatherAUS.arff')
weather_df = pandas.DataFrame(weather_data)

weather_full_df = pandas.concat([weather_df] * 150)

STREAM_SIZE = weather_full_df.shape[0]

weather_stream = DataStream(weather_full_df, name="weather")
weather_stream.prepare_for_use()

X_train, y_train = weather_stream.next_sample(TRAINING_SIZE)

ht = HoeffdingTreeClassifier()

ht.partial_fit(X_train, y_train)

n_global = ignore + TRAINING_SIZE  # Cumulative Number of observations
d_ddm = 0
w_ddm = 0
TP_ddm = []
FP_ddm = []
RT_ddm = []
DIST_ddm = []
Example #16
def make_stream(path):
    data = prepare_data(path)
    stream = DataStream(data, y=None, target_idx=-1, n_targets=1, cat_features=None, name=None, allow_nan=False)
    # Note: as written this returns only the stream's target values, not the DataStream itself.
    stream = stream.y
    return stream
Example #17
def test_regressor_chains():
    X_reg, y_reg = make_regression(random_state=112,
                                   n_targets=3,
                                   n_samples=5150)
    stream = DataStream(X_reg, y_reg)

    estimator = SGDRegressor(random_state=112, max_iter=10)
    learner = RegressorChain(base_estimator=estimator, random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(list(learner.predict(X)[0]))
            true_labels.append(y[0])

        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = [
        [-21.932581119953333, 1265662295936.5574, 7.5406725414072326e+22],
        [-97.17297744582125, 5438576501559.791, -1.1370581201037737e+24],
        [-60.06308622605051, 26421144038311.047, 1.3207650552720094e+25],
        [-285.32687352244847, 8881551118262.033, -1.1322856827798374e+24],
        [-115.80322693771457, -24997431307818.508, 2.85747306174037e+24],
        [-12.184193815918672, 3510562166726.0283, -4.8590562435597834e+23],
        [-94.99008392491476, 4794062761133.606, -1.8849188211946465e+24],
        [66.35576182871232, -8147485653396.883, -7.492944375995595e+23],
        [-52.145505628056995, -1013810481101.9043, -4.5310283013446384e+23],
        [16.715060622072958, 562391244392.6193, 3.3789644409962397e+22],
        [96.32219400190282, -20397346086007.85, 1.558245298240083e+24],
        [-281.8168065846582, 118681520215938.52, 4.815807486956294e+25],
        [-135.62679760307105, 20260866750185.832, 1.605753540523006e+24],
        [0.07932047636460954, -708539394047.3298, -3.61482684929158e+22],
        [-292.1646176261883, -11162615183157.55, -8.674643964570704e+23],
        [-176.92746747754094, -29231218161585.13, 1.411600743825668e+24],
        [-348.0498644784687, -100615393132365.25, 9.759683002046948e+23],
        [30.948974669258675, -1199287119275.6328, 2.0866927007519847e+23],
        [214.0020659569134, -24437173206276.543, 9.450880718880671e+23],
        [153.98931593720746, 32675842205528.723, -1.7246747286222668e+24],
        [99.39074016354951, -11385065116243.611, 1.0770253102805811e+24],
        [127.81660709796127, 16929726964275.697, 7.14820947257164e+24],
        [40.45505653639006, -14311951591200.725, -9.33193290094133e+23],
        [117.52219878440611, 17952367624051.36, 4.5651719663788677e+23],
        [75.53942801239991, -9231543699137.594, 3.2317133158453914e+24],
        [31.795193207760704, -4084783706153.4004, -4.188095047309216e+23],
        [68.5318978502461, 5735810247065.921, 1.7284713503779943e+24],
        [65.18438567482129, -13298743450357.943, -1.4367047198923567e+24],
        [-116.63952028337805, -344127767223.9295, 2.3925104169428623e+22],
        [-76.81599010889556, 8711205431447.733, -1.1575305916673031e+24],
        [263.1077717649874, 32146618104196.434, -7.240279466740839e+24],
        [-94.07597099457413, -8216681977657.527, 2.3785728690780553e+24],
        [-175.78429788635424, -368856885004.46, -5.7200993095587195e+22],
        [59.648477499483285, -1752783828320.242, 2.1429953624557326e+23],
        [71.68447202426032, -27151271800666.492, 9.367463190825582e+24],
        [-189.96629636835922, -27090727476080.18, -3.8659883994544866e+24],
        [-240.7920206809074, 15406047062899.537, 2.0609123388035027e+24],
        [-105.80996634043589, -1518636404558.1646, -1.4166487855869706e+23],
        [-164.02527753963858, -61386039046571.125, -2.179071650432624e+25],
        [52.451759456657975, -988509747123.6125, -7.334899319683594e+22],
        [68.37044139814127, -7434200892467.581, -7.535677215142279e+23],
        [164.9457843624521, -9474550940989.51, -1.3512944635293625e+24],
        [189.34401690407307, -14349556896444.508, 1.0732760415617274e+24],
        [0.8944005517286119, 463945767759.78735, -1.9938544157612443e+22],
        [71.7856433565235, -9804063257174.584, 4.7874862540754335e+23],
        [-5.450502769025279, 281585481223.33276, 2.1974700575843552e+22],
        [248.00190755589915, -81874135462745.58, -2.6532557110860303e+25],
        [-113.86249490223707, 2634310697909.643, 1.580428629322546e+23],
        [-35.92856878407447, -5410985463428.589, 2.522168862637753e+23]
    ]

    print(predictions)
    assert np.allclose(np.array(predictions), np.array(expected_predictions))
    assert isinstance(learner.predict(X), np.ndarray)

    expected_info = "RegressorChain(base_estimator=SGDRegressor(max_iter=10, random_state=112), " \
                    "order=None, random_state=112)"

    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
Example #18
def increment_model(ht_regressor):
    try:
        start_time = time.time()
        # val_df = pd.read_sql(engine.execute("select * from consumption where integrated = 0 limit 0,10").statement,session.bind)
        logging.info("[ML - modIncrement] Loading data... Time: " +
                     str(round(time.time() - start_time, 2)))
        val_df = pd.read_sql(
            session.query(Consumption).filter(
                Consumption.integrated == False).limit(2000000).statement,
            session.bind)
        logging.info("[ML - modIncrement] Data loaded... Time: " +
                     str(round(time.time() - start_time, 2)))
        n_samples = 0
        cnter = 0
        client_ids = []
        logging.info(
            "[ML - modIncrement] Starting model incremental fitting... Time: "
            + str(round(time.time() - start_time, 2)))
        client_id_max = max(val_df.client_id.unique())
        client_id_min = min(val_df.client_id.unique())
        df = val_df.drop(
            columns=['id', 'client_id', 'year', 'month', 'integrated'])

        stream = DataStream(data=df, target_idx=0)

        plr = []
        plprev_ht = []
        while stream.has_more_samples():
            X, y = stream.next_sample()
            if (cnter % 7000 == 0):
                y_prev = ht_regressor.predict(X)
                plr.append(y)
                plprev_ht.append(y_prev)
            ht_regressor.partial_fit(X, y)
            if (cnter % 10000 == 0):
                logging.info("[ML - modIncrement] Extracting element #" +
                             str(cnter) + " Time: " +
                             str(round(time.time() - start_time, 2)))
            n_samples += 1
            cnter += 1

        fig, ax = plt.subplots(figsize=(15, 6))
        plt.plot(range(len(plr)), plr, 'b-', label='Real')
        plt.plot(range(len(plprev_ht)),
                 plprev_ht,
                 'g--',
                 label='HoeffdingTreeRegressor')
        plt.legend()
        mse = mean_squared_error(plr, plprev_ht)
        r2 = r2_score(plr, plprev_ht)
        plt.suptitle(client_id_max, fontsize=12)
        plt.title("R2: " + str(r2) + " MSE: " + str(mse))
        filename = "images/predictionHT12F" + str(r2) + ".png"
        plt.savefig(filename)
        plt.close()
        #Updating

        logging.info("[ML - modIncrement] Execution %d --- %s seconds ---" %
                     (cnter, round(time.time() - start_time, 2)))
        return ht_regressor, client_id_min, client_id_max
    except Exception:
        logging.exception("[ML - modIncrement] Stopping...")