def test_adaptive_forest():
    test_data_directory = os.path.join(TEST_DIRECTORY, 'data')
    test_file = os.path.join(test_data_directory, 'test_data/weather.csv')
    raw_data = pd.read_csv(test_file)
    stream1 = DataStream(raw_data, name='Test')
    stream2 = DataStream(raw_data, name='Test')

    # learner = ExtendedHoeffdingAdaptiveTree()
    # learner1 = AdaptiveHoeffdingTreeEnsemble(n_estimators=4)
    # stream1_learner = calculate_accuracy(learner, stream1, stream1.n_samples)
    # stream2_learner = calculate_accuracy(learner1, stream2, stream2.n_samples)

    learner3 = AdaptiveRandomForestClassifier(n_estimators=10)
    stream3_learner = calculate_accuracy(learner3, stream1, stream1.n_samples)

    # learner4 = StreamingRandomPatchesClassifier(n_estimators=3)
    # stream4_learner = calculate_accuracy(learner4, stream1, stream1.n_samples)
    # learner5 = DeepStreamLearner(classes=stream1.target_values)
    # stream5_learner = calculate_accuracy(learner5, stream1, stream1.n_samples)

    # print(stream2_learner.base_estimator.accuracy)
    with open(
        os.path.join(test_data_directory, 'test_data/adaptive_test_result.txt'),
        'w+'
    ) as f:
        f.write('stream2 average_accuracy:')

    assert 1 == 1
def partial_fit(self, data: pd.DataFrame) -> None:
    """
    The most basic working version for now.
    TODO: improve, calculate accuracy, maybe add possibility to learn in batches?
    """
    stream = DataStream(self.prepare_data(data))
    n = stream.n_remaining_samples()
    for i in range(n):
        x, y = stream.next_sample()
        if self.model.predict(x)[0] == y[0]:
            self.correct_predictions += 1
        self.model.partial_fit(x, y)
    self.predictions += n
    self.accuracy = self.correct_predictions / self.predictions
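The method above follows the prequential (test-then-train) idiom: predict each sample before learning from it, then keep a running accuracy. A minimal standalone sketch of the same idiom, assuming scikit-multiflow is installed and a pandas DataFrame whose last column is the target; the function name and the choice of HoeffdingTreeClassifier are illustrative, not part of the original code:

import pandas as pd
from skmultiflow.data import DataStream
from skmultiflow.trees import HoeffdingTreeClassifier

def prequential_accuracy(df: pd.DataFrame) -> float:
    """Test-then-train over a DataFrame; the last column is the target."""
    stream = DataStream(df)
    model = HoeffdingTreeClassifier()
    correct, seen = 0, 0
    while stream.has_more_samples():
        X, y = stream.next_sample()
        # Skip the very first prediction, since the model has not been fit yet.
        if seen > 0 and model.predict(X)[0] == y[0]:
            correct += 1
        model.partial_fit(X, y, classes=stream.target_values)
        seen += 1
    return correct / max(seen - 1, 1)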
def test_hoeffding_tree_ensemble():
    test_data_directory = os.path.join(TEST_DIRECTORY, 'data')
    test_file = os.path.join(test_data_directory, 'test_data/airlines.csv')
    test_file2 = os.path.join(test_data_directory, 'test_data/weather.csv')
    raw_data = pd.read_csv(test_file)
    raw_data2 = pd.read_csv(test_file2)
    stream3 = DataStream(raw_data2, name='Test2')
    stream1 = DataStream(raw_data, name='Test')
    stream2 = RandomTreeGenerator(
        tree_random_state=23, sample_random_state=12, n_classes=4,
        n_cat_features=2, n_num_features=5, n_categories_per_cat_feature=5,
        max_tree_depth=6, min_leaf_depth=3, fraction_leaves_per_level=0.15
    )

    # learner = HoeffdingTreeClassifier(
    #     leaf_prediction='nb',
    #     classes=stream.target_values
    # )
    # learner = HoeffdingTreeEnsemble(
    #     n_estimators=3,
    #     classes=stream.target_values)
    # learner1 = DeepStreamLearner(classes=stream1.target_values)
    # stream1_learner = calculate_accuracy(learner1, stream1, stream1.n_samples)

    learner2 = DeepStreamLearner(classes=stream2.target_values)
    stream2_learner = calculate_accuracy(learner2, stream2, 100000)
    learner3 = DeepStreamLearner(classes=stream3.target_values)
    stream3_learner = calculate_accuracy(learner3, stream3, stream3.n_samples)

    with open(
        os.path.join(test_data_directory, 'test_data/test_result.txt'),
        'w+'
    ) as f:
        # f.write('stream1 accuracy: {} \n'.format(stream1_learner.accuracy[-1]))
        # f.write('stream1 first_layer_accuracy: {} \n'.format(stream1_learner.first_layer_cascade.accuracy_per_sample[-1]))
        # f.write('stream1 average_accuracy: {} \n'.format(sum(stream1_learner.accuracy)/stream1_learner.number_of_samples))
        # f.write('stream1 first_layer_average_accuracy: {} \n \n'.format(sum(stream1_learner.first_layer_cascade.accuracy_per_sample)/stream1_learner.number_of_samples))
        # f.write('stream2 accuracy: {} \n'.format(stream2_learner.accuracy[-1]))
        # f.write('stream2 first_layer_accuracy: {} \n'.format(stream2_learner.first_layer_cascade.accuracy_per_sample[-1]))
        f.write('stream2 average_accuracy: {} \n'.format(
            sum(stream2_learner.accuracy) / stream2_learner.number_of_samples))
        f.write('stream2 first_layer_average_accuracy: {} \n \n'.format(
            sum(stream2_learner.first_layer_cascade.accuracy_per_sample)
            / stream2_learner.number_of_samples))
        f.write('stream3 accuracy: {} \n'.format(stream3_learner.accuracy[-1]))
        f.write('stream3 first_layer_accuracy: {} \n'.format(
            stream3_learner.first_layer_cascade.accuracy_per_sample[-1]))
        f.write('stream3 average_accuracy: {} \n'.format(
            sum(stream3_learner.accuracy) / stream3_learner.number_of_samples))
        f.write('stream3 first_layer_average_accuracy: {} \n \n'.format(
            sum(stream3_learner.first_layer_cascade.accuracy_per_sample)
            / stream3_learner.number_of_samples))

    assert 1 == 1
def test_pipeline(test_path):
    n_categories = 5

    # Load test data generated using:
    # RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
    #                     n_cat_features=n_categories, n_num_features=0)
    test_file = os.path.join(test_path, 'data-one-hot.npz')
    data = np.load(test_file)
    X = data['X']
    y = data['y']
    stream = DataStream(data=X, y=y)
    stream.prepare_for_use()

    # Setup transformer
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_categories * n_categories, n_categories)]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNAdwin(n_neighbors=2, max_window_size=50, leaf_size=40)

    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer), ('KNNAdwin', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(show_plot=False, pretrain_size=10, max_samples=100)

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

    metrics = evaluator.get_mean_measurements()

    expected_accuracy = 0.5555555555555556
    assert np.isclose(expected_accuracy, metrics[0].accuracy_score())

    expected_kappa = 0.11111111111111116
    assert np.isclose(expected_kappa, metrics[0].kappa_score())

    print(pipe.get_info())
    expected_info = "Pipeline:\n" \
                    "[OneHotToCategorical(categorical_list=[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9],\n" \
                    " [10, 11, 12, 13, 14],\n" \
                    " [15, 16, 17, 18, 19],\n" \
                    " [20, 21, 22, 23, 24]])\n" \
                    "KNNAdwin(leaf_size=40, max_window_size=50, n_neighbors=2,\n" \
                    " nominal_attributes=None)]"
    assert pipe.get_info() == expected_info
def get_error_hoeffdingtree(data, pre_train_size, **hf_kwargs):
    orig_X = data[:, :-1]
    orig_y = data[:, -1].astype(int)
    stream = DataStream(orig_X, orig_y)
    hf = HoeffdingTreeClassifier(**hf_kwargs)

    # Pre-train
    pretrainX, pretrainy = stream.next_sample(pre_train_size)
    hf.partial_fit(pretrainX, pretrainy, classes=stream.target_values)

    evaluations = []
    while stream.has_more_samples():
        X, y = stream.next_sample()
        # Evaluation
        y_hat = hf.predict(X)
        evaluations.append(int(y_hat[0] == y[0]))
        # Train
        hf.partial_fit(X, y, classes=stream.target_values)
    return evaluations
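A hedged usage sketch for the helper above, on synthetic data rather than the original dataset; the column layout (features followed by an integer class label in the last column) matches what the function expects, and the `grace_period` keyword is simply forwarded to `HoeffdingTreeClassifier`:

import numpy as np

# Synthetic data: 500 rows, 3 features plus a binary label in the last column.
rng = np.random.RandomState(42)
X = rng.normal(size=(500, 3))
y = (X[:, 0] + X[:, 1] > 0).astype(int)
data = np.hstack([X, y.reshape(-1, 1)])

evaluations = get_error_hoeffdingtree(data, pre_train_size=100, grace_period=50)
# Mean prequential accuracy over the samples seen after the pre-training window.
print(sum(evaluations) / len(evaluations))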
def train(name, clusters, window, normalize=False):
    input_csv = '{}{}_clusters={}_window={}_prepared.csv'.format(
        DATA_LOCATION, name, clusters, window)
    data = pd.read_csv(input_csv, index_col=0)

    if normalize:
        states = data.filter(['current_state', 'next_state'])
        sensors = data.drop(columns=['current_state', 'next_state'])
        scaler = StandardScaler()
        data = pd.DataFrame(data=scaler.fit_transform(X=sensors),
                            index=data.index,
                            columns=sensors.columns)
        data = pd.concat([data, states], axis='columns')

    stream = DataStream(data)
    hf = HoeffdingTreeClassifier()
    sgd = SGDClassifier()
    evaluator = EvaluatePrequential()
    evaluator.evaluate(stream=stream, model=[hf, sgd])

    # print('---------------------------------------------')
    # measurements = evaluator.get_mean_measurements()[0]
    # print(measurements.confusion_matrix)
    # print(measurements.accuracy_score())

    data = []
    for i, measurements in enumerate(evaluator.get_mean_measurements()):
        data.append([
            name, clusters, window, MODEL_NAMES[i], normalize,
            measurements.accuracy_score(), measurements.precision_score(),
            measurements.recall_score(), measurements.f1_score()
        ])

    return pd.DataFrame(data=data, columns=[
        'name', 'clusters', 'window', 'model', 'normalized',
        'accuracy', 'precision', 'recall', 'f1'
    ])
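A hedged usage sketch for the function above; the dataset name, cluster count, and window below are placeholders and assume the corresponding prepared CSV exists under DATA_LOCATION:

# Hypothetical invocation; 'machine_a', 5 clusters and a window of 10 are illustrative only.
results = train('machine_a', clusters=5, window=10, normalize=True)
print(results[['model', 'accuracy', 'f1']])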
def read_data(filename):
    df = pd.read_csv(filename, comment='#')
    y = df['Target'].values
    anoms = (y == "'Anomaly'")
    normal = (y == "'Normal'")
    y[anoms] = 1
    y[normal] = 0
    X = df.drop(["Target"], axis=1)
    return X, y


n_samples = 5000
X, y = generate_data(n_samples)
# data_file = "data/mulcross.csv"
# X, y = read_data(data_file)

stream = DataStream(data=X, y=y)
n_clusters = 3
random_state = 3

# 2. Prepare for use
stream.prepare_for_use()

kmeans1 = KMeans(n_clusters=n_clusters, random_state=random_state, init="k-means++")
kmeans2 = KMeans(n_clusters=n_clusters, random_state=random_state, init="k-means++")
coreset_size = 100
precisions = []
recalls = []
f1s = []
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

for fold, split in enumerate(cross_validation.split(X_train, y_train)):
    fold_train_indexes, fold_test_indexes = split
    fold_X_train = X_train.iloc[fold_train_indexes]
    fold_y_train = y_train.iloc[fold_train_indexes]
    fold_X_test = X_train.iloc[fold_test_indexes]
    fold_y_test = y_train.iloc[fold_test_indexes]

    if classifier_name == 'hoeffding':
        stream = DataStream(X, y.values.ravel())
        stream.prepare_for_use()
        evaluator = EvaluatePrequential(
            show_plot=False, pretrain_size=200, metrics=['accuracy'])
        model = evaluator.evaluate(stream=stream, model=classifier)[0]
        model.fit(fold_X_train, fold_y_train.values.ravel())
    # elif classifier_name == 'cn2':
    #     model = CrossValidation(
    #         table_from_frame(data), [CN2Learner()], k=5)
    else:
        model = classifier.fit(fold_X_train, fold_y_train.values.ravel())

    y_pred = model.predict(fold_X_test)
# Global variable
TRAINING_SIZE = 1
grace = 1000
ignore = 0

elec_data = arff.load("elecNormNew.arff")
elec_df = pandas.DataFrame(elec_data)
elec_df.columns = ['date', 'day', 'period', 'nswprice', 'nswdemand',
                   'vicprice', 'vicdemand', 'transfer', 'class']
mapping = {"day": {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7},
           "class": {"UP": 0, "DOWN": 1}}
elec_df = elec_df.replace(mapping)
elec_full_df = pandas.concat([elec_df] * 200)

STREAM_SIZE = elec_full_df.shape[0]
elec_stream = DataStream(elec_full_df, name="elec")
elec_stream.prepare_for_use()

X_train, y_train = elec_stream.next_sample(TRAINING_SIZE)
ht = HoeffdingTreeClassifier()
ht.partial_fit(X_train, y_train)

n_global = ignore + TRAINING_SIZE  # Cumulative Number of observations
d_ddm = 0
w_ddm = 0
TP_ddm = []
FP_ddm = []
RT_ddm = []
DIST_ddm = []
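The counters set up above (d_ddm, w_ddm, TP_ddm, ...) imply a DDM-monitored test-then-train loop that is not shown in this snippet. A sketch of how such a loop could proceed, assuming scikit-multiflow's DDM detector; the reset-on-drift policy and the counters updated here are assumptions, not the original logic:

from skmultiflow.drift_detection import DDM

ddm = DDM()
while elec_stream.has_more_samples():
    X, y = elec_stream.next_sample()
    y_hat = ht.predict(X)
    # DDM consumes the 0/1 error of each prediction.
    ddm.add_element(int(y_hat[0] != y[0]))
    if ddm.detected_warning_zone():
        w_ddm += 1
    if ddm.detected_change():
        d_ddm += 1
        ht = HoeffdingTreeClassifier()  # reset the learner after a detected drift
    ht.partial_fit(X, y)
    n_global += 1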
from skmultiflow.evaluation import EvaluatePrequential
from forget_tree import ForgetHATT
from skmultiflow.data import DataStream
import pandas as pd
from skmultiflow.trees import HATT

data_filepath = "../datasets/transient_chess.data"
labels_filepath = "../datasets/transient_chess.labels"

data = pd.read_csv(data_filepath, delimiter=" ")
labels = pd.read_csv(labels_filepath, delimiter=" ")
labels['y'] = labels['y'].astype('category')

stream = DataStream(data=data, y=labels)
stream.prepare_for_use()

evaluator = EvaluatePrequential(output_file="log.log",
                                show_plot=False,
                                metrics=['accuracy'],
                                max_time=60)
models = [
    ForgetHATT(data_filepath, labels_filepath, forget_percentage=0, delimiter=" "),
    ForgetHATT(data_filepath, labels_filepath, forget_percentage=0.1, delimiter=" "),
    ForgetHATT(data_filepath,
def start_run(options):
    if not os.path.exists(options.experiment_directory):
        print('No Directory')
        return
    name = '-'.join([options.moa_learner, str(options.concept_limit), 'py'])
    print(name)
    datastream_filename = None
    datastream_pickle_filename = None
    fns = glob.glob(os.sep.join([options.experiment_directory, "*.ARFF"]))
    print(fns)
    for fn in fns:
        if fn.split('.')[-1] == 'ARFF':
            actual_fn = fn.split(os.sep)[-1]
            fn_path = os.sep.join(fn.split(os.sep)[:-1])
            print(actual_fn)
            print(fn_path)
            pickle_fn = f"{actual_fn.split('.')[0]}_concept_chain.pickle"
            pickle_full_fn = os.sep.join([fn_path, pickle_fn])
            csv_fn = f"{name}.csv"
            csv_full_fn = os.sep.join([fn_path, csv_fn])
            print(csv_full_fn)
            if os.path.exists(pickle_full_fn):
                skip_file = False
                if os.path.exists(csv_full_fn):
                    if os.path.getsize(csv_full_fn) > 2000:
                        skip_file = True
                if not skip_file:
                    datastream_filename = fn
                    datastream_pickle_filename = pickle_full_fn
                    break
                else:
                    print('csv exists')
    if datastream_filename is None:
        print('No datastream file')
        return
    print(datastream_filename)

    bat_filename = f"{options.experiment_directory}{os.sep}{name}.{'bat' if not options.using_linux else 'sh'}"
    if not os.path.exists(bat_filename) or True:
        with open(f'{datastream_pickle_filename}', 'rb') as f:
            concept_chain = pickle.load(f)
        print(concept_chain)
        concepts = sorted(list(concept_chain.keys()))
        num_examples = concepts[-1] + (concepts[-1] - concepts[-2])
        stream_string = moaLink.get_moa_stream_from_filename(
            os.sep.join(datastream_filename.split(os.sep)[:-1]),
            datastream_filename.split(os.sep)[-1])
        moa_string = moaLink.make_moa_command(stream_string,
                                              options.moa_learner,
                                              options.concept_limit,
                                              'int',
                                              num_examples,
                                              config.report_window_length,
                                              options.experiment_directory,
                                              is_bat=not options.using_linux)
        moaLink.save_moa_bat(moa_string, bat_filename, not options.using_linux)
    # datastream = None
    t_start = process_time()
    command = f"{bat_filename} {options.moa_location}"
    print(command)
    print(options.moa_learner)
    if options.moa_learner != 'arf':
        if options.using_linux:
            subprocess.run(['chmod', '+x', bat_filename])
            subprocess.run([bat_filename, options.moa_location])
        else:
            subprocess.run(command)
    else:
        datastream_filename = f"{os.sep.join(datastream_filename.split(os.sep)[:-1])}{os.sep}{datastream_filename.split(os.sep)[-1]}"
        data = arff.loadarff(datastream_filename)
        df = pd.DataFrame(data[0], dtype='float64')
        df['y0'] = df['y0'].astype('int64')
        # df["y0"] = df["y0"].astype('category')
        print(df.info())
        datastream = DataStream(df)
        datastream.prepare_for_use()
        print(datastream.target_values)
        learner = AdaptiveRandomForest(n_estimators=int(options.concept_limit))
        right = 0
        wrong = 0
        overall_log = []
        while datastream.has_more_samples():
            X, y = datastream.next_sample()
            prediction = learner.predict(X)
            is_correct = prediction[0] == y[0]
            if is_correct:
                right += 1
            else:
                wrong += 1
            learner.partial_fit(X, y)
            if (right + wrong) > 0 and (right + wrong) % 200 == 0:
                overall_log.append((right + wrong, right / (right + wrong)))
                print(f'ex: {right + wrong}, Acc: {right / (right + wrong)}\r', end="")
        overall = pd.DataFrame(overall_log, columns=['ex', 'overall_accuracy'])
        overall.to_csv(f"{options.experiment_directory}{os.sep}{name}.csv")
        print("")
        print(f'Accuracy: {right / (right + wrong)}')
    # fsm, system_stats, concept_chain, ds, stream_examples = fsmsys.run_fsm(
    #     datastream, options, suppress=True, name=name, save_checkpoint=True)
    t_stop = process_time()
    print("")
    print("Elapsed time during the whole program in seconds:", t_stop - t_start)
def start_run(options):
    if not os.path.exists(options.experiment_directory):
        print('No Directory')
        return
    datastream_filename = None
    datastream_pickle_filename = None
    fns = glob.glob(os.sep.join([options.experiment_directory, "*.ARFF"]))
    print(fns)
    # mm_options = ['rA', 'age', 'LRU', 'acc'] if options.memory_management == 'all' else [options.memory_management]
    for fn in fns:
        save_mm = options.memory_management
        mm_options = [options.memory_management] if options.memory_management != 'all' else [
            "score", "rA", 'auc', "age", "LRU", 'acc', 'div']
        mm_options = mm_options if options.memory_management != 'mine' else ['auc', "score", "rA"]
        for mm in mm_options:
            print(mm)
            options.memory_management = mm
            sys_name = 'system'
            if options.drift_detector != 'adwin':
                sys_name += f"{options.drift_detector}"
            name = '-'.join([
                sys_name,
                str(options.noise),
                str(options.concept_limit),
                str(options.memory_management),
                str(options.sensitivity),
                str(options.window),
                str(options.optimal_selection),
                str(options.learner_str),
                str(options.poisson),
                str(options.seed),
                str(options.optimal_drift),
                str(options.similarity_measure),
                str(options.merge_strategy),
                str(options.merge_similarity),
            ])
            name_no_seed = '-'.join([
                sys_name,
                str(options.noise),
                str(options.concept_limit),
                str(options.memory_management),
                str(options.sensitivity),
                str(options.window),
                str(options.optimal_selection),
                str(options.learner_str),
                str(options.poisson),
                "*",
                str(options.optimal_drift),
                str(options.similarity_measure),
                str(options.merge_strategy),
                str(options.merge_similarity),
            ])
            print(name)
            if fn.split('.')[-1] == 'ARFF':
                actual_fn = fn.split(os.sep)[-1]
                fn_path = os.sep.join(fn.split(os.sep)[:-1])
                print(actual_fn)
                print(fn_path)
                pickle_fn = f"{actual_fn.split('.')[0]}_concept_chain.pickle"
                pickle_full_fn = os.sep.join([fn_path, pickle_fn])
                csv_fn = f"{name}.csv"
                csv_full_fn = os.sep.join([fn_path, csv_fn])
                print(f"checking {csv_full_fn}")
                concept_chain_exists = os.path.exists(pickle_full_fn)
                if not options.no_chain and not concept_chain_exists:
                    print("No concept chain pickle file")
                    continue
                skip_file = False
                existing_matches = glob.glob(os.sep.join([fn_path, f"{name_no_seed}.csv"]))
                if len(existing_matches):
                    if any([os.path.getsize(x) > 2000 for x in existing_matches]):
                        skip_file = True
                if not skip_file:
                    datastream_filename = fn
                    datastream_pickle_filename = pickle_full_fn
                else:
                    print(f'{csv_full_fn} exists')
            if datastream_filename is None:
                print('No datastream file')
                continue
            print(datastream_filename)
            if not options.no_chain:
                with open(f'{datastream_pickle_filename}', 'rb') as f:
                    concept_chain = pickle.load(f)
            else:
                concept_chain = None
            with open(f"{options.experiment_directory}{os.sep}{name}_info.txt", "w") as f:
                f.write(json.dumps(options.__dict__,
                                   default=lambda o: '<not serializable>'))
                f.write(f"\n Git Commit: {subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).strip()}")
            try:
                data = arff.loadarff(datastream_filename)
                df = pd.DataFrame(data[0])
            except Exception as e:
                print(e)
                print("trying csv")
                df = pd.read_csv(datastream_filename, header=None)
            for c_i, c in enumerate(df.columns):
                if pd.api.types.is_string_dtype(df[c]):
                    print(f"Factorizing {c}")
                    print(pd.factorize(df[c])[0].shape)
                    df[c] = pd.factorize(df[c])[0]
                # print(f"{c_i}: {len(df.columns) - 1}")
                # if c_i == len(df.columns) - 1:
                #     print(f"converting {c}")
                #     df[c] = df[c].astype('category')
            print(df.info())
            datastream = DataStream(df)
            datastream.concept_chain = concept_chain
            print(concept_chain)
            datastream.prepare_for_use()
            t_start = process_time()
            print(options.__dict__)
            classifier = FSMClassifier(
                concept_limit=options.concept_limit,
                memory_management=options.memory_management,
                learner=options.learner,
                window=options.window,
                sensitivity=options.sensitivity,
                concept_chain=concept_chain,
                optimal_selection=options.optimal_selection,
                optimal_drift=options.optimal_drift,
                rand_weights=options.rand_weights,
                poisson=options.poisson,
                similarity_measure=options.similarity_measure,
                merge_strategy=options.merge_strategy,
                use_clean=options.use_clean,
                merge_similarity=options.merge_similarity,
                drift_detector=options.drift_detector)
            avg_memory, max_memory = evaluate_prequential.evaluate_prequential(
                datastream,
                classifier,
                directory=options.experiment_directory,
                name=name,
                noise=options.noise,
                seed=options.seed)
            t_stop = process_time()
            print("")
            print("Elapsed time during the whole program in seconds:", t_stop - t_start)
            with open(f"{options.experiment_directory}{os.sep}{name}_timer.txt", "w") as f:
                f.write(f"Elapsed time during the whole program in seconds: {t_stop - t_start}")
            with open(f"{options.experiment_directory}{os.sep}{name}_memory.txt", "w") as f:
                f.write(f"Average: {avg_memory}\n")
                f.write(f"Max: {max_memory}")
        options.memory_management = save_mm
X = tdf[["Pressure (millibars)", "Humidity", "Wind Speed (km/h)"]].resample("6H").mean()
y = tdf[["Temperature (C)"]].resample("6H").max()

X.plot(subplots=True, layout=(1, 3))
y.plot()

#%%
reload(samknnreg)
from samknnreg import SAMKNNRegressor

sam = SAMKNNRegressor()
hat = RegressionHAT()
rht = RegressionHoeffdingTree()
ds = DataStream(X, y=y)
ds.prepare_for_use()

evaluator = EvaluatePrequential(
    show_plot=True,
    n_wait=730,
    batch_size=28,
    metrics=['mean_square_error', 'true_vs_predicted'])

#%%
evaluator.evaluate(
    stream=ds,
    model=[sam, rht, hat],
    model_names=["SAM", "Hoeffding Tree Regressor", "Hoeffding Tree Regressor (Adaptive)"])
def start_run(options):
    if not os.path.exists(options.experiment_directory):
        print('No Directory')
        return
    name = '-'.join([
        options.moa_learner,
        str(options.concept_limit),
        'pyn',
        str(options.seed)
    ])
    print(name)
    datastream_filename = None
    datastream_pickle_filename = None
    fns = glob.glob(os.sep.join([options.experiment_directory, "*.ARFF"]))
    print(fns)
    for fn in fns:
        if fn.split('.')[-1] == 'ARFF':
            actual_fn = fn.split(os.sep)[-1]
            fn_path = os.sep.join(fn.split(os.sep)[:-1])
            print(actual_fn)
            print(fn_path)
            csv_fn = f"{name}.csv"
            csv_full_fn = os.sep.join([fn_path, csv_fn])
            print(csv_full_fn)
            skip_file = False
            if os.path.exists(csv_full_fn):
                if os.path.getsize(csv_full_fn) > 2000:
                    skip_file = True
            if not skip_file:
                datastream_filename = fn
                break
            else:
                print('csv exists')
    if datastream_filename is None:
        print('No datastream file')
        return
    print(datastream_filename)

    datastream_filename = f"{os.sep.join(datastream_filename.split(os.sep)[:-1])}{os.sep}{datastream_filename.split(os.sep)[-1]}"
    data = arff.loadarff(datastream_filename)
    df = pd.DataFrame(data[0])
    print(df.tail())
    for c in df.columns:
        print(f"Factorizing {c}")
        if pd.api.types.is_string_dtype(df[c]):
            print(pd.factorize(df[c])[0].shape)
            df[c] = pd.factorize(df[c])[0]
    print(df.tail())

    bat_filename = f"{options.experiment_directory}{os.sep}{name}.{'bat' if not options.using_linux else 'sh'}"
    if not os.path.exists(bat_filename) or True:
        num_examples = df.shape[0]
        stream_string = moaLink.get_moa_stream_from_filename(
            os.sep.join(datastream_filename.split(os.sep)[:-1]),
            datastream_filename.split(os.sep)[-1])
        moa_string = moaLink.make_moa_command(stream_string,
                                              options.moa_learner,
                                              options.concept_limit,
                                              'int',
                                              num_examples,
                                              config.report_window_length,
                                              options.experiment_directory,
                                              is_bat=not options.using_linux,
                                              name=name,
                                              num_features=len(df.columns) - 1)
        moaLink.save_moa_bat(moa_string, bat_filename, not options.using_linux)
    # datastream = None
    t_start = process_time()
    command = f'{bat_filename} "{options.moa_location}"'
    print(command)
    print(options.moa_learner)
    if options.moa_learner != 'arf' or options.use_moa:
        if options.using_linux:
            subprocess.run(['chmod', '+x', bat_filename])
            subprocess.run([bat_filename, options.moa_location])
        else:
            subprocess.run(command)
    else:
        # df['y0'] = df['y0'].astype('int64')
        # df["y0"] = df["y0"].astype('category')
        print(df.info())
        datastream = DataStream(df)
        datastream.prepare_for_use()
        print(datastream.target_values)
        learner = AdaptiveRandomForest(n_estimators=int(options.concept_limit))
        avg_memory, max_memory = evaluate_prequential(
            datastream=datastream,
            classifier=learner,
            directory=options.experiment_directory,
            name=name)
    # fsm, system_stats, concept_chain, ds, stream_examples = fsmsys.run_fsm(
    #     datastream, options, suppress=True, name=name, save_checkpoint=True)
    t_stop = process_time()
    print("")
seconds:", t_stop - t_start) with open(f"{options.experiment_directory}{os.sep}{name}_timer.txt", "w") as f: f.write( f"Elapsed time during the whole program in seconds: {t_stop-t_start}" )
warnings.filterwarnings('ignore')
plt.style.use("seaborn-whitegrid")

# Global variable
TRAINING_SIZE = 1
grace = 1000
ignore = 0

weather_data = arff.load('weatherAUS.arff')
weather_df = pandas.DataFrame(weather_data)
weather_full_df = pandas.concat([weather_df] * 150)

STREAM_SIZE = weather_full_df.shape[0]
weather_stream = DataStream(weather_full_df, name="weather")
weather_stream.prepare_for_use()

X_train, y_train = weather_stream.next_sample(TRAINING_SIZE)
ht = HoeffdingTreeClassifier()
ht.partial_fit(X_train, y_train)

n_global = ignore + TRAINING_SIZE  # Cumulative Number of observations
d_ddm = 0
w_ddm = 0
TP_ddm = []
FP_ddm = []
RT_ddm = []
DIST_ddm = []
def make_stream(path):
    data = prepare_data(path)
    stream = DataStream(data,
                        y=None,
                        target_idx=-1,
                        n_targets=1,
                        cat_features=None,
                        name=None,
                        allow_nan=False)
    # Note: despite the name, this returns the stream's target array, not the DataStream itself.
    return stream.y
def test_regressor_chains():
    X_reg, y_reg = make_regression(random_state=112, n_targets=3, n_samples=5150)
    stream = DataStream(X_reg, y_reg)

    estimator = SGDRegressor(random_state=112, max_iter=10)
    learner = RegressorChain(base_estimator=estimator, random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(list(learner.predict(X)[0]))
            true_labels.append(y[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = [
        [-21.932581119953333, 1265662295936.5574, 7.5406725414072326e+22],
        [-97.17297744582125, 5438576501559.791, -1.1370581201037737e+24],
        [-60.06308622605051, 26421144038311.047, 1.3207650552720094e+25],
        [-285.32687352244847, 8881551118262.033, -1.1322856827798374e+24],
        [-115.80322693771457, -24997431307818.508, 2.85747306174037e+24],
        [-12.184193815918672, 3510562166726.0283, -4.8590562435597834e+23],
        [-94.99008392491476, 4794062761133.606, -1.8849188211946465e+24],
        [66.35576182871232, -8147485653396.883, -7.492944375995595e+23],
        [-52.145505628056995, -1013810481101.9043, -4.5310283013446384e+23],
        [16.715060622072958, 562391244392.6193, 3.3789644409962397e+22],
        [96.32219400190282, -20397346086007.85, 1.558245298240083e+24],
        [-281.8168065846582, 118681520215938.52, 4.815807486956294e+25],
        [-135.62679760307105, 20260866750185.832, 1.605753540523006e+24],
        [0.07932047636460954, -708539394047.3298, -3.61482684929158e+22],
        [-292.1646176261883, -11162615183157.55, -8.674643964570704e+23],
        [-176.92746747754094, -29231218161585.13, 1.411600743825668e+24],
        [-348.0498644784687, -100615393132365.25, 9.759683002046948e+23],
        [30.948974669258675, -1199287119275.6328, 2.0866927007519847e+23],
        [214.0020659569134, -24437173206276.543, 9.450880718880671e+23],
        [153.98931593720746, 32675842205528.723, -1.7246747286222668e+24],
        [99.39074016354951, -11385065116243.611, 1.0770253102805811e+24],
        [127.81660709796127, 16929726964275.697, 7.14820947257164e+24],
        [40.45505653639006, -14311951591200.725, -9.33193290094133e+23],
        [117.52219878440611, 17952367624051.36, 4.5651719663788677e+23],
        [75.53942801239991, -9231543699137.594, 3.2317133158453914e+24],
        [31.795193207760704, -4084783706153.4004, -4.188095047309216e+23],
        [68.5318978502461, 5735810247065.921, 1.7284713503779943e+24],
        [65.18438567482129, -13298743450357.943, -1.4367047198923567e+24],
        [-116.63952028337805, -344127767223.9295, 2.3925104169428623e+22],
        [-76.81599010889556, 8711205431447.733, -1.1575305916673031e+24],
        [263.1077717649874, 32146618104196.434, -7.240279466740839e+24],
        [-94.07597099457413, -8216681977657.527, 2.3785728690780553e+24],
        [-175.78429788635424, -368856885004.46, -5.7200993095587195e+22],
        [59.648477499483285, -1752783828320.242, 2.1429953624557326e+23],
        [71.68447202426032, -27151271800666.492, 9.367463190825582e+24],
        [-189.96629636835922, -27090727476080.18, -3.8659883994544866e+24],
        [-240.7920206809074, 15406047062899.537, 2.0609123388035027e+24],
        [-105.80996634043589, -1518636404558.1646, -1.4166487855869706e+23],
        [-164.02527753963858, -61386039046571.125, -2.179071650432624e+25],
        [52.451759456657975, -988509747123.6125, -7.334899319683594e+22],
        [68.37044139814127, -7434200892467.581, -7.535677215142279e+23],
        [164.9457843624521, -9474550940989.51, -1.3512944635293625e+24],
        [189.34401690407307, -14349556896444.508, 1.0732760415617274e+24],
        [0.8944005517286119, 463945767759.78735, -1.9938544157612443e+22],
        [71.7856433565235, -9804063257174.584, 4.7874862540754335e+23],
        [-5.450502769025279, 281585481223.33276, 2.1974700575843552e+22],
        [248.00190755589915, -81874135462745.58, -2.6532557110860303e+25],
        [-113.86249490223707, 2634310697909.643, 1.580428629322546e+23],
        [-35.92856878407447, -5410985463428.589, 2.522168862637753e+23]
    ]
    print(predictions)
    assert np.allclose(np.array(predictions), np.array(expected_predictions))

    assert type(learner.predict(X)) == np.ndarray

    expected_info = "RegressorChain(base_estimator=SGDRegressor(max_iter=10, random_state=112), " \
                    "order=None, random_state=112)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
def increment_model(ht_regressor):
    try:
        start_time = time.time()
        # val_df = pd.read_sql(engine.execute("select * from consumption where integrated = 0 limit 0,10").statement, session.bind)
        logging.info("[ML - modIncrement] Loading data... Time: " +
                     str(round(time.time() - start_time, 2)))
        val_df = pd.read_sql(
            session.query(Consumption).filter(
                Consumption.integrated == False).limit(2000000).statement,
            session.bind)
        logging.info("[ML - modIncrement] Data loaded... Time: " +
                     str(round(time.time() - start_time, 2)))

        n_samples = 0
        cnter = 0
        client_ids = []
        logging.info("[ML - modIncrement] Starting model incremental fitting... Time: " +
                     str(round(time.time() - start_time, 2)))
        client_id_max = max(val_df.client_id.unique())
        client_id_min = min(val_df.client_id.unique())

        df = val_df.drop(columns=['id', 'client_id', 'year', 'month', 'integrated'])
        stream = DataStream(data=df, target_idx=0)

        plr = []
        plprev_ht = []
        while stream.has_more_samples():
            X, y = stream.next_sample()
            if cnter % 7000 == 0:
                y_prev = ht_regressor.predict(X)
                plr.append(y)
                plprev_ht.append(y_prev)
            ht_regressor.partial_fit(X, y)
            if cnter % 10000 == 0:
                logging.info("[ML - modIncrement] Extracting element #" + str(cnter) +
                             " Time: " + str(round(time.time() - start_time, 2)))
            n_samples += 1
            cnter += 1

        fig, ax = plt.subplots(figsize=(15, 6))
        plt.plot(range(len(plr)), plr, 'b-', label='Real')
        plt.plot(range(len(plprev_ht)), plprev_ht, 'g--', label='HoeffdingTreeRegressor')
        plt.legend()
        mse = mean_squared_error(plr, plprev_ht)
        r2 = r2_score(plr, plprev_ht)
        plt.suptitle(client_id_max, fontsize=12)
        plt.title("R2: " + str(r2) + " MSE: " + str(mse))
        filename = "images/predictionHT12F" + str(r2) + ".png"
        plt.savefig(filename)
        plt.close()

        # Updating
        logging.info("[ML - modIncrement] Execution %d --- %s seconds ---" %
                     (cnter, round(time.time() - start_time, 2)))
        return ht_regressor, client_id_min, client_id_max
    except:
        logging.error("[ML - modIncrement] Stopping...")