Example #1
def partial_fit(self, data: pd.DataFrame) -> None:
    """The most basic working version for now.

    TODO: improve, calculate accuracy, maybe add the possibility to learn in batches?
    """
    stream = DataStream(self.prepare_data(data))
    n = stream.n_remaining_samples()
    for _ in range(n):
        x, y = stream.next_sample()
        # Test-then-train: score each incoming sample before learning from it.
        if self.model.predict(x)[0] == y[0]:
            self.correct_predictions += 1
        self.model.partial_fit(x, y)
    self.predictions += n
    self.accuracy = self.correct_predictions / self.predictions
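The method references a surrounding class that the excerpt omits. A minimal sketch of what that host class could look like, assuming skmultiflow's HoeffdingTreeClassifier as the model and a no-op prepare_data; both are placeholders, not the original's choices:

import pandas as pd
from skmultiflow.data import DataStream
from skmultiflow.trees import HoeffdingTreeClassifier

class StreamLearner:
    # Hypothetical host class: only the attribute names come from the method above.
    def __init__(self):
        self.model = HoeffdingTreeClassifier()  # assumed; any model with partial_fit/predict works
        self.correct_predictions = 0
        self.predictions = 0
        self.accuracy = 0.0

    def prepare_data(self, data: pd.DataFrame) -> pd.DataFrame:
        # Assumed no-op; the original's preprocessing is not shown.
        return data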
Example #2
from skmultiflow.data import DataStream
from skmultiflow.trees import HoeffdingTreeClassifier


def get_error_hoeffdingtree(data, pre_train_size, **hf_kwargs):
    orig_X = data[:, :-1]
    orig_y = data[:, -1].astype(int)
    stream = DataStream(orig_X, orig_y)
    hf = HoeffdingTreeClassifier(**hf_kwargs)

    pretrainX, pretrainy = stream.next_sample(pre_train_size)

    # Pre-train
    hf.partial_fit(pretrainX, pretrainy, classes=stream.target_values)

    evaluations = []
    while stream.has_more_samples():
        X, y = stream.next_sample()

        # Evaluation
        y_hat = hf.predict(X)
        evaluations.append(int(y_hat[0] == y[0]))

        # Train
        hf.partial_fit(X, y, classes=stream.target_values)

    return evaluations
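A hedged usage sketch for the function above; the synthetic dataset is an assumption chosen only so the snippet runs end-to-end. Since each entry of the returned list is 1 for a correct prediction, its mean is the prequential accuracy:

import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=2000, random_state=1)
data = np.hstack([X, y.reshape(-1, 1)])  # labels in the last column, as the function expects
hits = get_error_hoeffdingtree(data, pre_train_size=200, grace_period=50)
print('prequential accuracy:', np.mean(hits))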
Example #3
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
from skmultiflow.data import DataStream
from skmultiflow.meta import RegressorChain


def test_regressor_chains():
    X_reg, y_reg = make_regression(random_state=112,
                                   n_targets=3,
                                   n_samples=5150)
    stream = DataStream(X_reg, y_reg)

    estimator = SGDRegressor(random_state=112, max_iter=10)
    learner = RegressorChain(base_estimator=estimator, random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(list(learner.predict(X)[0]))
            true_labels.append(y[0])

        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = [
        [-21.932581119953333, 1265662295936.5574, 7.5406725414072326e+22],
        [-97.17297744582125, 5438576501559.791, -1.1370581201037737e+24],
        [-60.06308622605051, 26421144038311.047, 1.3207650552720094e+25],
        [-285.32687352244847, 8881551118262.033, -1.1322856827798374e+24],
        [-115.80322693771457, -24997431307818.508, 2.85747306174037e+24],
        [-12.184193815918672, 3510562166726.0283, -4.8590562435597834e+23],
        [-94.99008392491476, 4794062761133.606, -1.8849188211946465e+24],
        [66.35576182871232, -8147485653396.883, -7.492944375995595e+23],
        [-52.145505628056995, -1013810481101.9043, -4.5310283013446384e+23],
        [16.715060622072958, 562391244392.6193, 3.3789644409962397e+22],
        [96.32219400190282, -20397346086007.85, 1.558245298240083e+24],
        [-281.8168065846582, 118681520215938.52, 4.815807486956294e+25],
        [-135.62679760307105, 20260866750185.832, 1.605753540523006e+24],
        [0.07932047636460954, -708539394047.3298, -3.61482684929158e+22],
        [-292.1646176261883, -11162615183157.55, -8.674643964570704e+23],
        [-176.92746747754094, -29231218161585.13, 1.411600743825668e+24],
        [-348.0498644784687, -100615393132365.25, 9.759683002046948e+23],
        [30.948974669258675, -1199287119275.6328, 2.0866927007519847e+23],
        [214.0020659569134, -24437173206276.543, 9.450880718880671e+23],
        [153.98931593720746, 32675842205528.723, -1.7246747286222668e+24],
        [99.39074016354951, -11385065116243.611, 1.0770253102805811e+24],
        [127.81660709796127, 16929726964275.697, 7.14820947257164e+24],
        [40.45505653639006, -14311951591200.725, -9.33193290094133e+23],
        [117.52219878440611, 17952367624051.36, 4.5651719663788677e+23],
        [75.53942801239991, -9231543699137.594, 3.2317133158453914e+24],
        [31.795193207760704, -4084783706153.4004, -4.188095047309216e+23],
        [68.5318978502461, 5735810247065.921, 1.7284713503779943e+24],
        [65.18438567482129, -13298743450357.943, -1.4367047198923567e+24],
        [-116.63952028337805, -344127767223.9295, 2.3925104169428623e+22],
        [-76.81599010889556, 8711205431447.733, -1.1575305916673031e+24],
        [263.1077717649874, 32146618104196.434, -7.240279466740839e+24],
        [-94.07597099457413, -8216681977657.527, 2.3785728690780553e+24],
        [-175.78429788635424, -368856885004.46, -5.7200993095587195e+22],
        [59.648477499483285, -1752783828320.242, 2.1429953624557326e+23],
        [71.68447202426032, -27151271800666.492, 9.367463190825582e+24],
        [-189.96629636835922, -27090727476080.18, -3.8659883994544866e+24],
        [-240.7920206809074, 15406047062899.537, 2.0609123388035027e+24],
        [-105.80996634043589, -1518636404558.1646, -1.4166487855869706e+23],
        [-164.02527753963858, -61386039046571.125, -2.179071650432624e+25],
        [52.451759456657975, -988509747123.6125, -7.334899319683594e+22],
        [68.37044139814127, -7434200892467.581, -7.535677215142279e+23],
        [164.9457843624521, -9474550940989.51, -1.3512944635293625e+24],
        [189.34401690407307, -14349556896444.508, 1.0732760415617274e+24],
        [0.8944005517286119, 463945767759.78735, -1.9938544157612443e+22],
        [71.7856433565235, -9804063257174.584, 4.7874862540754335e+23],
        [-5.450502769025279, 281585481223.33276, 2.1974700575843552e+22],
        [248.00190755589915, -81874135462745.58, -2.6532557110860303e+25],
        [-113.86249490223707, 2634310697909.643, 1.580428629322546e+23],
        [-35.92856878407447, -5410985463428.589, 2.522168862637753e+23]
    ]

    print(predictions)
    # Compare the arrays element-wise; calling .all() first collapses each
    # array to a scalar boolean and makes the assertion vacuous.
    assert np.allclose(np.array(predictions), np.array(expected_predictions))
    assert isinstance(learner.predict(X), np.ndarray)

    expected_info = "RegressorChain(base_estimator=SGDRegressor(max_iter=10, random_state=112), " \
                    "order=None, random_state=112)"

    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
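Why the assertion needed fixing: np.array(x).all() reduces the whole array to a single boolean, so the original form compared two scalars (True vs. True for any nonzero data) and could never fail. A tiny illustration:

import numpy as np

a = np.array([1.0, 2.0])
b = np.array([9.9, 9.9])
np.allclose(a.all(), b.all())  # True: both sides collapse to the scalar True
np.allclose(a, b)              # False: the element-wise check actually compares values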
Example #4
ignore = 0

elec_data = arff.load("elecNormNew.arff")
elec_df = pandas.DataFrame(elec_data)
elec_df.columns = ['date', 'day', 'period', 'nswprice', 'nswdemand', 'vicprice', 'vicdemand', 'transfer', 'class']
mapping = {"day":{"1":1, "2":2, "3":3, "4":4, "5":5, "6":6, "7":7}, "class": {"UP": 0, "DOWN": 1}}
elec_df = elec_df.replace(mapping)

elec_full_df = pandas.concat([elec_df] * 200)

STREAM_SIZE = elec_full_df.shape[0]

elec_stream = DataStream(elec_full_df, name="elec")
elec_stream.prepare_for_use()

X_train, y_train = elec_stream.next_sample(TRAINING_SIZE)  # TRAINING_SIZE comes from the script's globals (see Example #6)

ht = HoeffdingTreeClassifier()

ht.partial_fit(X_train, y_train)

n_global = ignore + TRAINING_SIZE  # Cumulative Number of observations
d_ddm = 0
w_ddm = 0
TP_ddm = []
FP_ddm = []
RT_ddm = []
DIST_ddm = []
mem_ddm = []
retrain = False
grace_end = n_global
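The counters above are bookkeeping for a drift-detection run. A hedged sketch of the prequential loop they could feed, assuming skmultiflow's DDM detector; the reset-on-drift policy is illustrative, not the original's:

from skmultiflow.drift_detection import DDM

ddm = DDM()
while elec_stream.has_more_samples():
    X, y = elec_stream.next_sample()
    correct = int(ht.predict(X)[0] == y[0])
    ddm.add_element(1 - correct)  # DDM consumes the error signal (1 = misclassified)
    if ddm.detected_warning_zone():
        w_ddm += 1
    if ddm.detected_change():
        d_ddm += 1
        RT_ddm.append(n_global)  # record where the drift was flagged
        ht = HoeffdingTreeClassifier()  # naive response: start a fresh tree
    ht.partial_fit(X, y)
    n_global += 1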
Example #5
def start_run(options):
    if not os.path.exists(options.experiment_directory):
        print('No Directory')
        return
    name = '-'.join([options.moa_learner, str(options.concept_limit), 'py'])
    print(name)
    datastream_filename = None
    datastream_pickle_filename = None
    fns = glob.glob(os.sep.join([options.experiment_directory, "*.ARFF"]))
    print(fns)
    for fn in fns:
        if fn.split('.')[-1] == 'ARFF':
            actual_fn = fn.split(os.sep)[-1]
            fn_path = os.sep.join(fn.split(os.sep)[:-1])
            print(actual_fn)
            print(fn_path)
            pickle_fn = f"{actual_fn.split('.')[0]}_concept_chain.pickle"
            pickle_full_fn = os.sep.join([fn_path, pickle_fn])
            csv_fn = f"{name}.csv"
            csv_full_fn = os.sep.join([fn_path, csv_fn])
            print(csv_full_fn)
            if os.path.exists(pickle_full_fn):
                skip_file = False
                if os.path.exists(csv_full_fn):
                    if os.path.getsize(csv_full_fn) > 2000:
                        skip_file = True
                if not skip_file:
                    datastream_filename = fn
                    datastream_pickle_filename = pickle_full_fn
                    break
                else:
                    print('csv exists')
    if datastream_filename is None:
        print('No datastream file')
        return
    print(datastream_filename)

    bat_filename = f"{options.experiment_directory}{os.sep}{name}.{'bat' if not options.using_linux else 'sh'}"
    if not os.path.exists(bat_filename) or True:  # 'or True' makes this unconditional: the script is always regenerated
        with open(f'{datastream_pickle_filename}', 'rb') as f:
            concept_chain = pickle.load(f)
        print(concept_chain)
        concepts = sorted(list(concept_chain.keys()))
        num_examples = concepts[-1] + (concepts[-1] - concepts[-2])
        stream_string = moaLink.get_moa_stream_from_filename(
            os.sep.join(datastream_filename.split(os.sep)[:-1]),
            datastream_filename.split(os.sep)[-1])
        moa_string = moaLink.make_moa_command(stream_string,
                                              options.moa_learner,
                                              options.concept_limit,
                                              'int',
                                              num_examples,
                                              config.report_window_length,
                                              options.experiment_directory,
                                              is_bat=not options.using_linux)
        moaLink.save_moa_bat(moa_string, bat_filename, not options.using_linux)
        # datastream = None
    t_start = process_time()
    command = f"{bat_filename} {options.moa_location}"
    print(command)
    print(options.moa_learner)
    if options.moa_learner != 'arf':
        if options.using_linux:
            subprocess.run(['chmod', '+x', bat_filename])
            subprocess.run([bat_filename, options.moa_location])
        else:
            subprocess.run(command)
    else:
        datastream_filename = f"{os.sep.join(datastream_filename.split(os.sep)[:-1])}{os.sep}{datastream_filename.split(os.sep)[-1]}"
        data = arff.loadarff(datastream_filename)
        df = pd.DataFrame(data[0], dtype='float64')
        df['y0'] = df['y0'].astype('int64')
        # df["y0"] = df["y0"].astype('category')
        df.info()  # info() prints its summary itself; wrapping it in print() would also emit a stray "None"
        datastream = DataStream(df)
        datastream.prepare_for_use()

        print(datastream.target_values)
        learner = AdaptiveRandomForest(n_estimators=int(options.concept_limit))
        right = 0
        wrong = 0
        overall_log = []
        while datastream.has_more_samples():
            X, y = datastream.next_sample()
            prediction = learner.predict(X)
            is_correct = prediction[0] == y[0]
            if is_correct:
                right += 1
            else:
                wrong += 1
            learner.partial_fit(X, y)
            if (right + wrong) > 0 and (right + wrong) % 200 == 0:
                overall_log.append((right + wrong, right / (right + wrong)))
                print(f'ex: {right + wrong}, Acc: {right / (right + wrong)}\r',
                      end="")
        overall = pd.DataFrame(overall_log, columns=['ex', 'overall_accuracy'])
        overall.to_csv(f"{options.experiment_directory}{os.sep}{name}.csv")
        print("")
        print(f'Accuracy: {right / (right + wrong)}')
    #fsm, system_stats, concept_chain, ds, stream_examples =  fsmsys.run_fsm(datastream, options, suppress = True, name = name, save_checkpoint=True)
    t_stop = process_time()
    print("")
    print("Elapsed time during the whole program in seconds:",
          t_stop - t_start)
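A minimal usage sketch for start_run, assuming only the option names the function reads; types.SimpleNamespace stands in for whatever argparse namespace the original script builds, and every value below is a placeholder:

from types import SimpleNamespace

options = SimpleNamespace(
    experiment_directory='experiments/run1',  # must exist and contain *.ARFF and *_concept_chain.pickle files
    moa_learner='arf',                        # 'arf' takes the in-Python AdaptiveRandomForest branch above
    concept_limit=10,
    using_linux=True,
    moa_location='moa',                       # placeholder path to the MOA install
)
start_run(options)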
Example #6
# Global variable
TRAINING_SIZE = 1
grace = 1000
ignore = 0

weather_data = arff.load('weatherAUS.arff')
weather_df = pandas.DataFrame(weather_data)

weather_full_df = pandas.concat([weather_df] * 150)

STREAM_SIZE = weather_full_df.shape[0]

weather_stream = DataStream(weather_full_df, name="weather")
weather_stream.prepare_for_use()

X_train, y_train = weather_stream.next_sample(TRAINING_SIZE)

ht = HoeffdingTreeClassifier()

ht.partial_fit(X_train, y_train)

n_global = ignore + TRAINING_SIZE  # Cumulative Number of observations
d_ddm = 0
w_ddm = 0
TP_ddm = []
FP_ddm = []
RT_ddm = []
DIST_ddm = []
mem_ddm = []
retrain = False
grace_end = n_global
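The TP_ddm/FP_ddm/RT_ddm/DIST_ddm lists above suggest detections are later scored against known drift positions. A hedged sketch of such scoring; the helper, its name, and the tolerance are all assumptions, not part of the original:

def score_detections(detections, true_drifts, tolerance=1000):
    # Hypothetical helper: a detection within `tolerance` samples after a
    # true drift counts as a true positive, anything else as a false positive.
    tp, fp = [], []
    for d in detections:
        if any(0 <= d - t <= tolerance for t in true_drifts):
            tp.append(d)
        else:
            fp.append(d)
    return tp, fp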
Example #7
def increment_model(ht_regressor):
    try:
        start_time = time.time()
        # val_df = pd.read_sql(engine.execute("select * from consumption where integrated = 0 limit 0,10").statement,session.bind)
        logging.info("[ML - modIncrement] Loading data... Time: " +
                     str(round(time.time() - start_time, 2)))
        val_df = pd.read_sql(
            session.query(Consumption).filter(
                Consumption.integrated == False).limit(2000000).statement,
            session.bind)
        logging.info("[ML - modIncrement] Data loaded... Time: " +
                     str(round(time.time() - start_time, 2)))
        n_samples = 0
        cnter = 0
        client_ids = []
        logging.info(
            "[ML - modIncrement] Starting model incremental fitting... Time: "
            + str(round(time.time() - start_time, 2)))
        client_id_max = max(val_df.client_id.unique())
        client_id_min = min(val_df.client_id.unique())
        df = val_df.drop(
            columns=['id', 'client_id', 'year', 'month', 'integrated'])

        stream = DataStream(data=df, target_idx=0)

        plr = []
        plprev_ht = []
        while stream.has_more_samples():
            X, y = stream.next_sample()
            if (cnter % 7000 == 0):
                y_prev = ht_regressor.predict(X)
                plr.append(y)
                plprev_ht.append(y_prev)
            ht_regressor.partial_fit(X, y)
            if (cnter % 10000 == 0):
                logging.info("[ML - modIncrement] Extracting element #" +
                             str(cnter) + " Time: " +
                             str(round(time.time() - start_time, 2)))
            n_samples += 1
            cnter += 1

        fig, ax = plt.subplots(figsize=(15, 6))
        plt.plot(range(len(plr)), plr, 'b-', label='Real')
        plt.plot(range(len(plprev_ht)),
                 plprev_ht,
                 'g--',
                 label='HoeffdingTreeRegressor')
        plt.legend()
        mse = mean_squared_error(plr, plprev_ht)
        r2 = r2_score(plr, plprev_ht)
        plt.suptitle(client_id_max, fontsize=12)
        plt.title("R2: " + str(r2) + " MSE: " + str(mse))
        filename = "images/predictionHT12F" + str(r2) + ".png"
        plt.savefig(filename)
        plt.close()
        #Updating

        logging.info("[ML - modIncrement] Execution %d --- %s seconds ---" %
                     (cnter, round(time.time() - start_time, 2)))
        return ht_regressor, client_id_min, client_id_max
    except Exception:
        # A bare `except:` would also swallow SystemExit/KeyboardInterrupt;
        # logging.exception records the traceback along with the message.
        logging.exception("[ML - modIncrement] Stopping...")
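A short usage sketch, assuming skmultiflow's HoeffdingTreeRegressor and that the module-level session used inside the function is already configured; note the function implicitly returns None when the except branch is hit:

from skmultiflow.trees import HoeffdingTreeRegressor

ht = HoeffdingTreeRegressor()
result = increment_model(ht)
if result is not None:
    ht, client_id_min, client_id_max = result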