Exemplo n.º 1
0
def get_error_hoeffdingtree(data, pre_train_size, **hf_kwargs):
    """Evaluate a Hoeffding tree on *data* in test-then-train order.

    Args:
        data: 2-D array; every column but the last is used as features and
            the last column, cast to ``int``, is used as the class label.
        pre_train_size: number of leading samples drawn from the stream and
            used to fit the tree before any evaluation happens.
        **hf_kwargs: forwarded unchanged to ``HoeffdingTreeClassifier``.

    Returns:
        A list with one entry per sample after the pre-training batch:
        ``1`` when the prediction matched the label, ``0`` otherwise.
        Despite the function name, these are hit indicators, not errors.
    """
    features, labels = data[:, :-1], data[:, -1].astype(int)
    stream = DataStream(features, labels)
    tree = HoeffdingTreeClassifier(**hf_kwargs)

    # Warm-up: fit on the first `pre_train_size` samples without scoring.
    warmup_X, warmup_y = stream.next_sample(pre_train_size)
    tree.partial_fit(warmup_X, warmup_y, classes=stream.target_values)

    hits = []
    while stream.has_more_samples():
        sample_X, sample_y = stream.next_sample()

        # Score the sample before the model is allowed to learn from it.
        predicted = tree.predict(sample_X)
        hits.append(int(predicted[0] == sample_y[0]))

        tree.partial_fit(sample_X, sample_y, classes=stream.target_values)

    return hits
Exemplo n.º 2
0
# Bookkeeping containers; each name is bound to its own fresh list.
# None of them is appended to in this excerpt.
mine_std, mine_alpha, pr_min, std_min = [], [], [], []
pi, mine_x_mean, mine_sum, mine_threshold = [], [], [], []
pred_grace_ht, pred_grace_ht_p = [], []
ht_p = None
# Running count of correct predictions (a count, not a ratio).
ML_accuracy = 0

ddm = DDM()
h = hpy()  # not used in this excerpt
# n_global, elec_stream, ht, RT_ddm, grace_end, detect_end and grace are
# expected to be defined before this point.
while elec_stream.has_more_samples():
    n_global += 1

    X_test, y_test = elec_stream.next_sample()
    y_predict = ht.predict(X_test)

    # The timed window covers the detector update and the hit counter.
    ddm_start_time = time.time()
    ddm.add_element(y_test != y_predict)
    if y_test == y_predict:
        ML_accuracy += 1
    ddm_running_time = time.time() - ddm_start_time
    RT_ddm.append(ddm_running_time)

    # Report only past both thresholds and once a second model exists.
    if n_global > grace_end and n_global > detect_end and ht_p is not None:
        drift_point = detect_end - 2 * grace
        print("Accuracy of ht: " + str(np.mean(pred_grace_ht)))
def _find_pending_datastream(experiment_directory, name):
    """Return the first ARFF stream in the directory that still needs a run.

    A stream is pending when its ``<stem>_concept_chain.pickle`` sits next to
    it and ``<name>.csv`` in the same folder is missing or at most 2000 bytes.

    Returns:
        ``(arff_path, pickle_path)`` for the first pending stream, or
        ``(None, None)`` when there is none.
    """
    fns = glob.glob(os.sep.join([experiment_directory, "*.ARFF"]))
    print(fns)
    for fn in fns:
        # Case-sensitive suffix check, kept in addition to the glob pattern.
        if fn.split('.')[-1] != 'ARFF':
            continue
        fn_path, _, actual_fn = fn.rpartition(os.sep)
        print(actual_fn)
        print(fn_path)
        pickle_fn = f"{actual_fn.split('.')[0]}_concept_chain.pickle"
        pickle_full_fn = os.sep.join([fn_path, pickle_fn])
        csv_full_fn = os.sep.join([fn_path, f"{name}.csv"])
        print(csv_full_fn)
        if not os.path.exists(pickle_full_fn):
            continue
        # A results file above 2000 bytes is treated as a finished run.
        already_done = (os.path.exists(csv_full_fn)
                        and os.path.getsize(csv_full_fn) > 2000)
        if not already_done:
            return fn, pickle_full_fn
        print('csv exists')
    return None, None


def _run_arf_prequential(datastream_filename, options, name):
    """Evaluate an AdaptiveRandomForest test-then-train on an ARFF stream.

    Writes ``<experiment_directory>/<name>.csv`` holding the running accuracy
    sampled every 200 examples, and prints the final accuracy.
    """
    data = arff.loadarff(datastream_filename)
    df = pd.DataFrame(data[0], dtype='float64')
    df['y0'] = df['y0'].astype('int64')
    print(df.info())
    datastream = DataStream(df)
    datastream.prepare_for_use()

    print(datastream.target_values)
    learner = AdaptiveRandomForest(n_estimators=int(options.concept_limit))
    right = 0
    seen = 0
    overall_log = []
    while datastream.has_more_samples():
        X, y = datastream.next_sample()
        # Predict first so the sample is scored before it is learned.
        prediction = learner.predict(X)
        if prediction[0] == y[0]:
            right += 1
        seen += 1
        learner.partial_fit(X, y)
        if seen % 200 == 0:
            overall_log.append((seen, right / seen))
            print(f'ex: {seen}, Acc: {right / seen}\r', end="")
    overall = pd.DataFrame(overall_log, columns=['ex', 'overall_accuracy'])
    overall.to_csv(f"{options.experiment_directory}{os.sep}{name}.csv")
    print("")
    # An empty stream used to raise ZeroDivisionError here.
    accuracy = right / seen if seen else float('nan')
    print(f'Accuracy: {accuracy}')


def start_run(options):
    """Run one experiment on the first pending ARFF stream of a directory.

    Reads ``experiment_directory``, ``moa_learner``, ``concept_limit``,
    ``using_linux`` and ``moa_location`` from *options*. The MOA launcher
    script (``.sh`` on Linux, ``.bat`` otherwise) is regenerated on every
    call. When ``moa_learner`` is ``'arf'`` the stream is evaluated in-process
    with AdaptiveRandomForest; otherwise the launcher script is executed.

    Returns:
        None. Returns early, after printing a message, when the directory
        does not exist or no pending stream is found.
    """
    if not os.path.exists(options.experiment_directory):
        print('No Directory')
        return
    name = '-'.join([options.moa_learner, str(options.concept_limit), 'py'])
    print(name)

    datastream_filename, datastream_pickle_filename = _find_pending_datastream(
        options.experiment_directory, name)
    if datastream_filename is None:
        print('Not datastream file')
        return
    print(datastream_filename)

    script_ext = 'sh' if options.using_linux else 'bat'
    bat_filename = f"{options.experiment_directory}{os.sep}{name}.{script_ext}"

    # SECURITY: pickle.load can execute arbitrary code; only point this at
    # experiment directories whose contents are trusted.
    with open(datastream_pickle_filename, 'rb') as f:
        concept_chain = pickle.load(f)
    print(concept_chain)
    # Assumes at least two numeric keys; the last segment is taken to be as
    # long as the one before it. TODO confirm against the pickle producer.
    concepts = sorted(concept_chain.keys())
    num_examples = concepts[-1] + (concepts[-1] - concepts[-2])
    stream_dir, _, stream_file = datastream_filename.rpartition(os.sep)
    stream_string = moaLink.get_moa_stream_from_filename(stream_dir,
                                                         stream_file)
    moa_string = moaLink.make_moa_command(stream_string,
                                          options.moa_learner,
                                          options.concept_limit,
                                          'int',
                                          num_examples,
                                          config.report_window_length,
                                          options.experiment_directory,
                                          is_bat=not options.using_linux)
    moaLink.save_moa_bat(moa_string, bat_filename, not options.using_linux)

    # NOTE(review): process_time() measures this process's CPU time, so time
    # spent in the launcher subprocess is presumably not counted - confirm
    # whether wall-clock time was intended.
    t_start = process_time()
    command = f"{bat_filename} {options.moa_location}"
    print(command)
    print(options.moa_learner)
    if options.moa_learner != 'arf':
        if options.using_linux:
            subprocess.run(['chmod', '+x', bat_filename])
        # Argument list instead of a joined string, so paths containing
        # spaces reach the launcher intact on every platform.
        subprocess.run([bat_filename, options.moa_location])
    else:
        _run_arf_prequential(datastream_filename, options, name)
    t_stop = process_time()
    print("")
    print("Elapsed time during the whole program in seconds:",
          t_stop - t_start)
Exemplo n.º 4
0
# Bookkeeping containers; each name is bound to its own fresh list.
# None of them is appended to in this excerpt.
mine_std, mine_alpha, pr_min, std_min = [], [], [], []
pi, mine_x_mean, mine_sum, mine_threshold = [], [], [], []
pred_grace_ht, pred_grace_ht_p = [], []
ht_p = None
# Running count of correct predictions (a count, not a ratio).
ML_accuracy = 0

ddm = DDM()
h = hpy()  # not used in this excerpt
# n_global, weather_stream, ht, RT_ddm, grace_end, detect_end and grace are
# expected to be defined before this point.
while weather_stream.has_more_samples():
    n_global += 1

    X_test, y_test = weather_stream.next_sample()
    y_predict = ht.predict(X_test)

    # The timed window covers the detector update and the hit counter.
    ddm_start_time = time.time()
    ddm.add_element(y_test != y_predict)
    if y_test == y_predict:
        ML_accuracy += 1
    ddm_running_time = time.time() - ddm_start_time
    RT_ddm.append(ddm_running_time)

    # Report only past both thresholds and once a second model exists.
    if n_global > grace_end and n_global > detect_end and ht_p is not None:
        drift_point = detect_end - 2 * grace
        print("Accuracy of ht: " + str(np.mean(pred_grace_ht)))
Exemplo n.º 5
0
def increment_model(ht_regressor):
    """Incrementally fit *ht_regressor* on not-yet-integrated consumption rows.

    Loads up to 2,000,000 ``Consumption`` rows whose ``integrated`` flag is
    false, drops the bookkeeping columns, and streams the remaining columns
    through ``ht_regressor.partial_fit`` one sample at a time, with column 0
    as the target. Every 7000th sample is predicted before being learned;
    those predictions are plotted against the real values and saved under
    ``images/`` with the R2 score in the file name.

    Args:
        ht_regressor: model exposing ``predict`` and ``partial_fit``.

    Returns:
        ``(ht_regressor, client_id_min, client_id_max)`` on success, or
        ``None`` when any step fails; the failure is logged with traceback.
    """
    try:
        start_time = time.time()

        def elapsed():
            # Seconds since the call started, rounded for the log lines.
            return round(time.time() - start_time, 2)

        logging.info("[ML - modIncrement] Loading data... Time: %s",
                     elapsed())
        # `== False` is intentional: it builds the SQL filter expression.
        val_df = pd.read_sql(
            session.query(Consumption).filter(
                Consumption.integrated == False).limit(2000000).statement,  # noqa: E712
            session.bind)
        logging.info("[ML - modIncrement] Data loaded... Time: %s",
                     elapsed())
        logging.info(
            "[ML - modIncrement] Starting model incremental fitting... "
            "Time: %s", elapsed())
        client_id_max = max(val_df.client_id.unique())
        client_id_min = min(val_df.client_id.unique())
        df = val_df.drop(
            columns=['id', 'client_id', 'year', 'month', 'integrated'])

        stream = DataStream(data=df, target_idx=0)

        plr = []
        plprev_ht = []
        cnter = 0
        while stream.has_more_samples():
            X, y = stream.next_sample()
            if cnter % 7000 == 0:
                # Predict before fitting so the plotted point is unseen.
                plr.append(y)
                plprev_ht.append(ht_regressor.predict(X))
            ht_regressor.partial_fit(X, y)
            if cnter % 10000 == 0:
                logging.info(
                    "[ML - modIncrement] Extracting element #%s Time: %s",
                    cnter, elapsed())
            cnter += 1

        plt.subplots(figsize=(15, 6))
        try:
            plt.plot(range(len(plr)), plr, 'b-', label='Real')
            plt.plot(range(len(plprev_ht)),
                     plprev_ht,
                     'g--',
                     label='HoeffdingTreeRegressor')
            plt.legend()
            mse = mean_squared_error(plr, plprev_ht)
            r2 = r2_score(plr, plprev_ht)
            plt.suptitle(client_id_max, fontsize=12)
            plt.title("R2: " + str(r2) + " MSE: " + str(mse))
            filename = "images/predictionHT12F" + str(r2) + ".png"
            plt.savefig(filename)
        finally:
            # Close the figure even when scoring or saving fails.
            plt.close()

        logging.info("[ML - modIncrement] Execution %d --- %s seconds ---",
                     cnter, elapsed())
        return ht_regressor, client_id_min, client_id_max
    except Exception:
        # Still best-effort, but KeyboardInterrupt/SystemExit now propagate
        # and the traceback is kept in the log.
        logging.exception("[ML - modIncrement] Stopping...")
        return None