def test_adaptive_random_forests_nb():
    """Regression test for ARF with Naive Bayes leaves (``leaf_prediction='nb'``).

    The learner is warmed up on 150 samples of a seeded two-class
    ``RandomTreeGenerator`` stream, then run test-then-train over 5000 single
    samples. One prediction is recorded every 100 samples (49 in total) and
    compared against a snapshot, together with the ``get_info()`` string.
    """
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112, n_classes=2)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3, random_state=112, leaf_prediction='nb')

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Record a prediction every `wait_samples` samples (never on the first one),
        # always *before* the learner has seen the sample's label.
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(int(learner.predict(X)[0]))
        learner.partial_fit(X, y)

    last_version_predictions = [
        1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1
    ]

    # The predictive performance itself is not guaranteed. This snapshot check
    # exists so that any change that alters predictions is caught by the unit
    # test, which helps prevent accidental behaviour changes.
    assert isinstance(learner.predict(X), np.ndarray)
    # Both sides are plain Python lists of ints, so `==` is already a single bool.
    assert predictions == last_version_predictions

    # NOTE(review): the whitespace at the start of each continuation piece must
    # match the output of get_info() exactly - confirm against the estimator.
    expected_info = (
        "AdaptiveRandomForest(binary_split=False, disable_weighted_vote=False,\n"
        " drift_detection_method=ADWIN(delta=0.001), grace_period=50,\n"
        " lambda_value=6, leaf_prediction='nb',\n"
        " max_byte_size=33554432, max_features=5,\n"
        " memory_estimate_period=2000000, n_estimators=3,\n"
        " nb_threshold=0, no_preprune=False, nominal_attributes=None,\n"
        " performance_metric='acc', random_state=112,\n"
        " remove_poor_atts=False, split_confidence=0.01,\n"
        " split_criterion='info_gain', stop_mem_management=False,\n"
        " tie_threshold=0.05,\n"
        " warning_detection_method=ADWIN(delta=0.01))"
    )
    assert learner.get_info() == expected_info
def test_adaptive_random_forests_labels_given():
    """Regression test for ARF ``predict_proba`` when ``classes`` is given up front.

    The learner is warmed up on 150 samples with ``classes=[0, 1]``, then run
    test-then-train over 5000 single samples. Class probabilities are recorded
    every 100 samples (49 rows); they must sum to 1 and their argmax must match
    the snapshot of the last known-good version.
    """
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112, n_classes=2)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3, random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y, classes=[0, 1])

    max_samples = 5000
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Record probabilities every `wait_samples` samples (never on the first
        # one), before the learner has seen the sample's label.
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    assert all(
        np.isclose(y_proba.sum(), 1) for y_proba in predictions
    ), "Probabilities should sum to 1."

    class_probabilities = np.asarray(predictions).squeeze()
    assert class_probabilities.shape == (49, 2)

    predictions = class_probabilities.argmax(axis=1)
    last_version_predictions = [
        1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0
    ]

    # array_equal also returns False (instead of broadcasting or warning) if the
    # number of recorded predictions ever drifts from the snapshot length.
    assert np.array_equal(predictions, last_version_predictions)
def test_adaptive_random_forests_batch_predict_proba():
    """Regression test for ARF ``predict_proba`` on batches of 5 samples.

    The learner is warmed up on 150 samples with ``classes=[0, 1]``, then fed
    500 batches of 5 samples. Every 100 batches the probabilities for the
    incoming batch are recorded (4 batches, 20 rows); each recorded batch must
    have shape ``(5, 2)``, rows must sum to 1, and the argmax must match the
    snapshot of the last known-good version.
    """
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112, n_classes=2)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3, random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y, classes=[0, 1])

    max_samples = 500
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample(5)
        # Record probabilities every `wait_samples` batches (never on the first
        # one), before the learner has seen the batch's labels.
        if cnt != 0 and cnt % wait_samples == 0:
            p = learner.predict_proba(X)
            assert p.shape == (5, 2)
            predictions.append(p)
        learner.partial_fit(X, y)

    all_predictions = np.concatenate(predictions)

    assert all(
        np.isclose(y_proba.sum(), 1) for y_proba in all_predictions
    ), "Probabilities should sum to 1."
    assert all_predictions.shape == (4 * 5, 2)

    last_version_predictions = [
        1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1
    ]

    assert isinstance(learner.predict_proba(X), np.ndarray)
    assert np.array_equal(all_predictions.argmax(axis=1), last_version_predictions)
def test_adaptive_random_forests():
    """Regression test for ARF with default leaf prediction on a seeded stream.

    The learner is warmed up on 150 samples, then run test-then-train over 5000
    single samples. One prediction is recorded every 100 samples and compared
    against the snapshot of the last known-good version.
    """
    stream = RandomTreeGenerator(tree_random_state=112, sample_random_state=112)
    stream.prepare_for_use()

    learner = AdaptiveRandomForest(n_estimators=3, random_state=112)

    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    max_samples = 5000
    wait_samples = 100
    predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Record a prediction every `wait_samples` samples (never on the first one),
        # always *before* the learner has seen the sample's label.
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(int(learner.predict(X)[0]))
        learner.partial_fit(X, y)

    last_version_predictions = [
        1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
        1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0
    ]

    # The predictive performance itself is not guaranteed. This snapshot check
    # exists so that any change that alters predictions is caught by the unit
    # test. If it fails, make sure that what is being worked on *should* change
    # the predictions of ARF.
    if sys.version_info[:2] >= (3, 6):
        # Temporarily disabled on older interpreters: pre-3.6 gives different
        # predictions than 3.6+.
        # Both sides are plain Python lists of ints, so `==` is a single bool.
        assert predictions == last_version_predictions

    assert isinstance(learner.predict(X), np.ndarray)
def _find_pending_datastream(experiment_directory, name):
    """Return ``(arff_path, pickle_path)`` for the first stream that still needs a run.

    A ``*.ARFF`` file qualifies when its ``<stem>_concept_chain.pickle`` sibling
    exists and the result file ``<name>.csv`` in the same directory is missing
    or no larger than 2000 bytes. Returns ``(None, None)`` when nothing qualifies.
    """
    fns = glob.glob(os.sep.join([experiment_directory, "*.ARFF"]))
    print(fns)
    for fn in fns:
        # Re-check the extension with exact case; skips anything else glob returned.
        if fn.split('.')[-1] != 'ARFF':
            continue
        *parent_parts, actual_fn = fn.split(os.sep)
        fn_path = os.sep.join(parent_parts)
        print(actual_fn)
        print(fn_path)
        pickle_full_fn = os.sep.join(
            [fn_path, f"{actual_fn.split('.')[0]}_concept_chain.pickle"])
        csv_full_fn = os.sep.join([fn_path, f"{name}.csv"])
        print(csv_full_fn)
        if not os.path.exists(pickle_full_fn):
            continue
        # A results file above 2000 bytes is treated as a finished run.
        already_done = (os.path.exists(csv_full_fn)
                        and os.path.getsize(csv_full_fn) > 2000)
        if not already_done:
            return fn, pickle_full_fn
        print('csv exists')
    return None, None


def _run_arf_prequential(datastream_filename, n_estimators, csv_filename):
    """Run a test-then-train loop with ``AdaptiveRandomForest`` over an ARFF stream.

    Every sample is first predicted, then used for training. The running
    accuracy is logged every 200 samples and the log is written to
    ``csv_filename`` with columns ``ex`` and ``overall_accuracy``.
    """
    data = arff.loadarff(datastream_filename)
    df = pd.DataFrame(data[0], dtype='float64')
    # The label column must be integral for the stream/learner.
    df['y0'] = df['y0'].astype('int64')
    print(df.info())
    datastream = DataStream(df)
    datastream.prepare_for_use()
    print(datastream.target_values)

    learner = AdaptiveRandomForest(n_estimators=n_estimators)
    right = 0
    seen = 0
    overall_log = []
    while datastream.has_more_samples():
        X, y = datastream.next_sample()
        prediction = learner.predict(X)
        if prediction[0] == y[0]:
            right += 1
        seen += 1
        learner.partial_fit(X, y)
        if seen % 200 == 0:
            accuracy = right / seen
            overall_log.append((seen, accuracy))
            print(f'ex: {seen}, Acc: {accuracy}\r', end="")

    overall = pd.DataFrame(overall_log, columns=['ex', 'overall_accuracy'])
    overall.to_csv(csv_filename)
    print("")
    # Guard against an empty stream, which would otherwise divide by zero.
    if seen:
        print(f'Accuracy: {right / seen}')
    else:
        print('Accuracy: n/a (stream contained no samples)')


def start_run(options):
    """Run one experiment for the first unfinished data stream in a directory.

    Reads from ``options``: ``experiment_directory``, ``moa_learner``,
    ``concept_limit``, ``using_linux`` and ``moa_location``.

    The MOA launch script (``.bat`` or ``.sh``) is always (re)generated. For
    any learner other than ``'arf'`` that script is executed with
    ``moa_location`` as its argument; for ``'arf'`` the stream is evaluated
    in-process with ``AdaptiveRandomForest`` and the accuracy log is written to
    ``<experiment_directory>/<name>.csv``.

    Returns ``None``. Returns early, after printing a message, when the
    directory does not exist or no pending data stream is found.
    """
    if not os.path.exists(options.experiment_directory):
        print('No Directory')
        return

    name = '-'.join([options.moa_learner, str(options.concept_limit), 'py'])
    print(name)

    datastream_filename, datastream_pickle_filename = _find_pending_datastream(
        options.experiment_directory, name)
    if datastream_filename is None:
        print('Not datastream file')
        return
    print(datastream_filename)

    is_bat = not options.using_linux
    bat_filename = f"{options.experiment_directory}{os.sep}{name}.{'bat' if is_bat else 'sh'}"

    # The launch script is regenerated on every run, even if it already exists.
    # SECURITY: pickle.load executes arbitrary code from the file; only use
    # concept-chain pickles produced by a trusted source.
    with open(datastream_pickle_filename, 'rb') as f:
        concept_chain = pickle.load(f)
    print(concept_chain)
    concepts = sorted(concept_chain.keys())
    # Extrapolate the stream length by assuming the last concept is as long as
    # the one before it. Needs at least two keys in the concept chain.
    num_examples = concepts[-1] + (concepts[-1] - concepts[-2])
    stream_string = moaLink.get_moa_stream_from_filename(
        os.sep.join(datastream_filename.split(os.sep)[:-1]),
        datastream_filename.split(os.sep)[-1])
    moa_string = moaLink.make_moa_command(stream_string,
                                          options.moa_learner,
                                          options.concept_limit,
                                          'int',
                                          num_examples,
                                          config.report_window_length,
                                          options.experiment_directory,
                                          is_bat=is_bat)
    moaLink.save_moa_bat(moa_string, bat_filename, is_bat)

    # NOTE: process_time() counts CPU time of this Python process only, so time
    # spent inside the MOA subprocess is not included in the figure printed below.
    t_start = process_time()
    command = f"{bat_filename} {options.moa_location}"
    print(command)
    print(options.moa_learner)
    if options.moa_learner != 'arf':
        if options.using_linux:
            subprocess.run(['chmod', '+x', bat_filename])
        # Pass an argument list on every platform so that paths containing
        # spaces are quoted correctly instead of being split.
        subprocess.run([bat_filename, options.moa_location])
    else:
        _run_arf_prequential(
            datastream_filename,
            int(options.concept_limit),
            f"{options.experiment_directory}{os.sep}{name}.csv")

    t_stop = process_time()
    print("")
    print("Elapsed time during the whole program in seconds:",
          t_stop - t_start)
class ARFPredictor(AbstractPredictor):
    """Predictor backed by an incrementally trained ``AdaptiveRandomForest``.

    The classifier is kept between calls to :meth:`predict`; each call only
    trains on the samples that were not seen by a previous call.
    """

    def __init__(self, n_estimators=10, max_features='auto', grace_period=50):
        """Create the predictor and its underlying classifier (``random_state=42``)."""
        super().__init__()
        self._reset_classifier(n_estimators, max_features, grace_period)

    def _reset_classifier(self, n_estimators, max_features, grace_period):
        """Replace the classifier with a fresh, untrained one and reset the sample counter."""
        self._n_estimators = n_estimators
        self._max_features = max_features
        self._classifier = AdaptiveRandomForest(
            n_estimators=self._n_estimators,
            max_features=self._max_features,
            grace_period=grace_period,
            random_state=42)
        # Number of leading training rows the classifier has already been fitted on.
        self._trained_samples = 0

    def predict(self, training_data, training_labels, todays_features,
                training_years, trading_days_per_year) -> int:
        """Train on not-yet-seen samples, then predict the label for ``todays_features``.

        ``training_data`` / ``training_labels`` are expected to grow by
        appending between calls: only the rows from index
        ``self._trained_samples`` onwards are passed to ``partial_fit``.
        ``training_years`` and ``trading_days_per_year`` are accepted but not
        used by this implementation.

        Returns the last element of the classifier's prediction for the single
        row ``[todays_features]``.
        """
        new_data = training_data[self._trained_samples:]
        new_labels = training_labels[self._trained_samples:]
        self._classifier.partial_fit(new_data, new_labels,
                                     np.unique(training_labels))
        self._trained_samples += len(new_labels)
        prediction = self._classifier.predict([todays_features])
        return prediction[-1]

    def _walk_forward(self, X, y, first_trading_day, trading_frequency,
                      training_years, trading_days_per_year):
        """Predict every day from ``first_trading_day`` to the end of ``y``.

        Before each prediction the classifier is trained on all rows up to (not
        including) the previously predicted day; for the very first prediction
        that bound is ``first_trading_day - trading_frequency``.
        """
        last_trading_day = first_trading_day - trading_frequency
        total_days = len(y)
        preds = []
        for trading_day in range(first_trading_day, total_days):
            sys.stdout.write('\r%i/%i' % (trading_day, total_days))
            sys.stdout.flush()
            preds.append(
                self.predict(X.iloc[:last_trading_day].values,
                             y.iloc[:last_trading_day].values,
                             X.iloc[trading_day], training_years,
                             trading_days_per_year))
            last_trading_day = trading_day
        return preds

    def tune(self, stock_data, symbols, num_features, measure='f1',
             trading_frequency=10, training_years=3,
             trading_days_per_year=246):
        """Grid-search ``grace_period`` and ``n_estimators`` over all ``symbols``.

        For each parameter combination a fresh classifier is evaluated per
        stock with a walk-forward loop; accuracy and F1 are averaged over the
        stocks. Feature columns are named ``I1`` .. ``I<num_features>`` and the
        label column is ``target``.

        Note that this replaces ``self._classifier``; afterwards the instance
        holds the *last evaluated* configuration, not the best one.

        Args:
            measure: ``'accuracy'`` or ``'f1'``; selects the best combination.

        Returns:
            ``[n_estimators, max_features]`` of the best combination (the
            chosen grace period is only printed).

        Raises:
            NotImplementedError: if ``measure`` is not supported. This is
                checked up front so no grid search is wasted.
        """
        if measure not in ('accuracy', 'f1'):
            raise NotImplementedError(
                'Unsupported measure: {!r}'.format(measure))

        gp_space = [10, 15, 20]
        n_est_space = [50, 100]
        max_f_space = [round(sqrt(num_features))]

        feature_columns = ['I{}'.format(x) for x in range(1, num_features + 1)]
        first_trading_day = training_years * trading_days_per_year + trading_frequency

        accuracies = []
        f_scores = []
        params = []
        for gp in gp_space:
            for n_estimators in n_est_space:
                for max_features in max_f_space:
                    accs = []
                    fs = []
                    for stock in symbols:
                        print(stock)
                        # Every stock is evaluated with an untrained classifier.
                        self._reset_classifier(n_estimators, max_features, gp)
                        X = stock_data[stock][feature_columns]
                        y = stock_data[stock]['target']
                        preds = self._walk_forward(
                            X, y, first_trading_day, trading_frequency,
                            training_years, trading_days_per_year)
                        print('')
                        preds = pd.Series(preds, name='Predictions')
                        actual = y.iloc[first_trading_day:]
                        accs.append(metrics.accuracy_score(actual, preds))
                        fs.append(metrics.f1_score(actual, preds))

                    mean_acc = np.array(accs).mean()
                    accuracies.append(mean_acc)
                    mean_f_score = np.array(fs).mean()
                    f_scores.append(mean_f_score)
                    params.append((n_estimators, max_features, gp))
                    print('\nARF n_estimators=%i max_features=%i gp=%i' %
                          (n_estimators, max_features, gp))
                    print('ARF Accuracy: %.3f' % mean_acc)
                    print('ARF F1 Score: %.3f' % mean_f_score)

        accuracies = np.array(accuracies)
        f_scores = np.array(f_scores)

        print('\nARF FINAL RESULTS')
        if measure == 'accuracy':
            (n_estimators, max_features, gp) = params[accuracies.argmax()]
            print(
                'ARF Best result: accuracy %.3f (n_estimators=%i max_features=%i gp=%i)'
                % (accuracies.max(), n_estimators, max_features, gp))
        else:  # measure == 'f1', validated above
            (n_estimators, max_features, gp) = params[f_scores.argmax()]
            print(
                'ARF Best result: f score %.3f (n_estimators=%i max_features=%i gp=%i)'
                % (f_scores.max(), n_estimators, max_features, gp))

        return [n_estimators, max_features]