def test_hoeffding_adaptive_tree_categorical_features(test_path):
    data_path = os.path.join(test_path, 'ht_categorical_features_testcase.npy')
    stream = np.load(data_path)
    # Remove the last two columns (regression targets)
    stream = stream[:, :-2]
    X, y = stream[:, :-1], stream[:, -1]

    nominal_attr_idx = np.arange(7).tolist()
    learner = HAT(nominal_attributes=nominal_attr_idx)

    learner.partial_fit(X, y, classes=np.unique(y))

    expected_description = "if Attribute 0 = -15.0:\n" \
                           " Leaf = Class 2 | {2: 475.0}\n" \
                           "if Attribute 0 = 0.0:\n" \
                           " Leaf = Class 0 | {0: 560.0, 1: 345.0}\n" \
                           "if Attribute 0 = 1.0:\n" \
                           " Leaf = Class 1 | {0: 416.0, 1: 464.0}\n" \
                           "if Attribute 0 = 2.0:\n" \
                           " Leaf = Class 1 | {0: 335.0, 1: 504.0}\n" \
                           "if Attribute 0 = 3.0:\n" \
                           " Leaf = Class 1 | {0: 244.0, 1: 644.0}\n" \
                           "if Attribute 0 = -30.0:\n" \
                           " Leaf = Class 3.0 | {3.0: 65.0, 4.0: 55.0}\n"

    assert learner.get_model_description() == expected_description
def test_HAT(test_path):
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12, n_classes=4,
                                 n_cat_features=2, n_num_features=5,
                                 n_categories_per_cat_feature=5, max_tree_depth=6,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    nominal_attr_idx = [x for x in range(5, stream.n_features)]
    learner = HAT(nominal_attributes=nominal_attr_idx)

    cnt = 0
    max_samples = 5000
    predictions = array('d')
    proba_predictions = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('d', [2.0, 1.0, 1.0, 1.0, 0.0, 3.0, 0.0, 1.0, 1.0, 2.0,
                                       0.0, 2.0, 1.0, 1.0, 2.0, 1.0, 3.0, 0.0, 1.0, 1.0,
                                       1.0, 1.0, 0.0, 3.0, 1.0, 2.0, 1.0, 1.0, 3.0, 2.0,
                                       1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 0.0, 1.0, 2.0,
                                       0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 3.0, 2.0])

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree.npy')
    data = np.load(test_file)

    assert np.alltrue(predictions == expected_predictions)
    assert np.allclose(proba_predictions, data)

    expected_info = 'HAT: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200' \
                    ' - split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05' \
                    ' - binary_split: False - stop_mem_management: False - remove_poor_atts: False' \
                    ' - no_pre_prune: False - leaf_prediction: nba - nb_threshold: 0' \
                    ' - nominal_attributes: [5, 6, 7, 8, 9, 10, 11, 12, 13, 14] - '
    assert learner.get_info() == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 1367.3628584299263, 1.0: 1702.2738590243584,' \
                       ' 2.0: 952.1668539501372, 3.0: 822.1964285955778}\n'
    expected_model_2 = 'Leaf = Class 1.0 | {1.0: 1702.2738590243584, 2.0: 952.1668539501372,' \
                       ' 0.0: 1367.3628584299263, 3.0: 822.1964285955778}\n'
    expected_model_3 = 'Leaf = Class 1.0 | {1.0: 1702.2738590243584, 2.0: 952.16685395013724, ' \
                       '0.0: 1367.3628584299263, 3.0: 822.1964285955778}\n'  # Python 3.6
    expected_model_4 = 'Leaf = Class 1.0 | {0.0: 1367.3628584299263, 1.0: 1702.2738590243584,' \
                       ' 2.0: 952.16685395013724, 3.0: 822.1964285955778}\n'  # Python 3.4

    assert (learner.get_model_description() == expected_model_1) \
        or (learner.get_model_description() == expected_model_2) \
        or (learner.get_model_description() == expected_model_3) \
        or (learner.get_model_description() == expected_model_4)
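# The test-then-train loop above recurs throughout these tests. A minimal
# sketch of the pattern as a reusable helper, assuming only the stream/learner
# API already used here (next_sample, predict, predict_proba, partial_fit);
# the helper name and signature are illustrative, not part of the test suite.
from array import array


def prequential_snapshot(stream, learner, max_samples=5000, wait_samples=100):
    """Record a prediction every `wait_samples` samples, then immediately
    train on the same sample (prequential, i.e. test-then-train, evaluation)."""
    predictions = array('d')
    proba_predictions = []
    for cnt in range(max_samples):
        X, y = stream.next_sample()
        if cnt % wait_samples == 0 and cnt != 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
    return predictions, proba_predictions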
def _choose_classifier(job: Job):
    method, config = get_method_config(job)
    config.pop('classification_method', None)
    logger.info("Using method {} with config {}".format(method, config))

    if method == ClassificationMethods.KNN.value:
        classifier = KNeighborsClassifier(**config)
    elif method == ClassificationMethods.RANDOM_FOREST.value:
        classifier = RandomForestClassifier(**config)
    elif method == ClassificationMethods.DECISION_TREE.value:
        classifier = DecisionTreeClassifier(**config)
    elif method == ClassificationMethods.XGBOOST.value:
        classifier = XGBClassifier(**config)
    elif method == ClassificationMethods.MULTINOMIAL_NAIVE_BAYES.value:
        classifier = MultinomialNB(**config)
    elif method == ClassificationMethods.ADAPTIVE_TREE.value:
        classifier = HAT(**config)
    elif method == ClassificationMethods.HOEFFDING_TREE.value:
        classifier = HoeffdingTree(**config)
    elif method == ClassificationMethods.SGDCLASSIFIER.value:
        classifier = SGDClassifier(**config)
    elif method == ClassificationMethods.PERCEPTRON.value:
        classifier = Perceptron(**config)
    elif method == ClassificationMethods.NN.value:
        config['encoding'] = job.encoding.value_encoding
        config['is_binary_classifier'] = _check_is_binary_classifier(job.labelling.type)
        classifier = NNClassifier(**config)
    else:
        raise ValueError("Unexpected classification method {}".format(method))
    return classifier
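# The if/elif chain above grows with every new method. A hypothetical
# dict-based dispatch (the CLASSIFIER_FACTORIES name and the helper below are
# illustrative, not existing code) keeps the mapping declarative; NN retains
# its special-case config handling.
CLASSIFIER_FACTORIES = {
    ClassificationMethods.KNN.value: KNeighborsClassifier,
    ClassificationMethods.RANDOM_FOREST.value: RandomForestClassifier,
    ClassificationMethods.DECISION_TREE.value: DecisionTreeClassifier,
    ClassificationMethods.XGBOOST.value: XGBClassifier,
    ClassificationMethods.MULTINOMIAL_NAIVE_BAYES.value: MultinomialNB,
    ClassificationMethods.ADAPTIVE_TREE.value: HAT,
    ClassificationMethods.HOEFFDING_TREE.value: HoeffdingTree,
    ClassificationMethods.SGDCLASSIFIER.value: SGDClassifier,
    ClassificationMethods.PERCEPTRON.value: Perceptron,
}


def _choose_classifier_by_lookup(job: Job):
    method, config = get_method_config(job)
    config.pop('classification_method', None)
    if method == ClassificationMethods.NN.value:
        config['encoding'] = job.encoding.value_encoding
        config['is_binary_classifier'] = _check_is_binary_classifier(job.labelling.type)
        return NNClassifier(**config)
    try:
        return CLASSIFIER_FACTORIES[method](**config)
    except KeyError:
        raise ValueError("Unexpected classification method {}".format(method))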
def test_grid():
    clfs = [AdaptiveRandomForest(), SAMKNN(), HAT()]
    cv = CrossValidation(clfs=clfs, max_samples=1000000, test_size=1)
    cv.streams = cv.init_real_world() + cv.init_standard_streams() \
        + cv.init_reoccuring_standard_streams()
    cv.test()
    cv.save_summary()
def _choose_classifier(job: Job):
    if job.type == JobTypes.UPDATE.value:
        classifier = _load_model(job.incremental_train)
        # TODO: check if this instruction still makes sense
        # Are we updating a predictive_model with its own methods?
        assert classifier[0].__class__.__name__ == job.method
    else:
        method, config = get_method_config(job)
        config.pop('classification_method', None)
        print("Using method {} with config {}".format(method, config))
        if method == ClassificationMethods.KNN.value:
            classifier = KNeighborsClassifier(**config)
        elif method == ClassificationMethods.RANDOM_FOREST.value:
            classifier = RandomForestClassifier(**config)
        elif method == ClassificationMethods.DECISION_TREE.value:
            classifier = DecisionTreeClassifier(**config)
        elif method == ClassificationMethods.XGBOOST.value:
            classifier = XGBClassifier(**config)
        elif method == ClassificationMethods.MULTINOMIAL_NAIVE_BAYES.value:
            classifier = MultinomialNB(**config)
        elif method == ClassificationMethods.ADAPTIVE_TREE.value:
            classifier = HAT(**config)
        elif method == ClassificationMethods.HOEFFDING_TREE.value:
            classifier = HoeffdingTree(**config)
        elif method == ClassificationMethods.SGDCLASSIFIER.value:
            classifier = SGDClassifier(**config)
        elif method == ClassificationMethods.PERCEPTRON.value:
            classifier = Perceptron(**config)
        elif method == ClassificationMethods.NN.value:
            config['encoding'] = job.encoding.value_encoding
            config['is_binary_classifier'] = _check_is_binary_classifier(job.labelling.type)
            classifier = NNClassifier(**config)
        else:
            raise ValueError("Unexpected classification method {}".format(method))
    return classifier
def test_hat_nb(test_path):
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1, noise_percentage=0.05),
                                drift_stream=SEAGenerator(random_state=2,
                                                          classification_function=2,
                                                          noise_percentage=0.05),
                                random_state=1, position=250, width=10)
    stream.prepare_for_use()

    learner = HAT(leaf_prediction='nb')

    cnt = 0
    max_samples = 1000
    y_pred = array('i')
    y_proba = []
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
                                       0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
                                       1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
                                       1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
                                       0, 1, 1, 1, 1, 1, 0, 1, 1])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_nb.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = "HAT(binary_split=False, grace_period=200, leaf_prediction='nb',\n" \
                    " max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0,\n" \
                    " no_preprune=False, nominal_attributes=None, remove_poor_atts=False,\n" \
                    " split_confidence=1e-07, split_criterion='info_gain',\n" \
                    " stop_mem_management=False, tie_threshold=0.05)"
    assert learner.get_info() == expected_info

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
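# The HAT tests in this section exercise the tree's leaf prediction
# strategies: 'mc' (majority class), 'nb' (naive Bayes) and 'nba' (naive
# Bayes adaptive, the default, which chooses mc or nb per leaf according to
# which has been more accurate so far). A minimal sketch using only the
# constructor argument these tests already rely on:
learners = {mode: HAT(leaf_prediction=mode) for mode in ('mc', 'nb', 'nba')}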
def __init__(self, regression=False):
    if regression:
        model = self._model = RegressionHAT(
            # leaf_prediction='mc'
        )
    else:
        model = HAT(
            # leaf_prediction='mc',
            # nominal_attributes=[4],
        )
    super().__init__(_model=model)
def __init__(self, regression=False):
    if regression:
        model_initializer = lambda: RegressionHAT(
            # leaf_prediction='mc'
        )
    else:
        model_initializer = lambda: HAT(
            # leaf_prediction='mc',
            # nominal_attributes=[4],
        )
    super().__init__(_model_initializer=model_initializer)
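# The two constructors above differ in a subtle but important way: the first
# binds a single model instance for the wrapper's lifetime, while the second
# passes a factory so the wrapper can build a fresh, untrained tree on demand
# (e.g. on reset). A minimal illustration, assuming nothing beyond the HAT
# constructor:
make_model = lambda: HAT()

model_a = make_model()  # fresh, untrained tree
model_b = make_model()  # an independent fresh tree, sharing no learned state
assert model_a is not model_b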
def test_hat_nba(test_path):
    stream = HyperplaneGenerator(mag_change=0.001, noise_percentage=0.1, random_state=2)
    stream.prepare_for_use()

    learner = HAT(leaf_prediction='nba')

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
                                       0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
                                       1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
                                       1, 0, 0, 1, 0, 1, 1, 1, 0])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_nba.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = "HAT(binary_split=False, bootstrap_sampling=True, grace_period=200,\n" \
                    " leaf_prediction='nba', max_byte_size=33554432,\n" \
                    " memory_estimate_period=1000000, nb_threshold=0, no_preprune=False,\n" \
                    " nominal_attributes=None, remove_poor_atts=False, split_confidence=1e-07,\n" \
                    " split_criterion='info_gain', stop_mem_management=False, tie_threshold=0.05)"
    assert learner.get_info() == expected_info

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
def test_hat_nba(test_path):
    stream = HyperplaneGenerator(mag_change=0.001, noise_percentage=0.1, random_state=2)
    stream.prepare_for_use()

    learner = HAT(leaf_prediction='nba')

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
                                       0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
                                       1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
                                       1, 0, 1, 1, 0, 1, 1, 1, 0])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_nba.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = 'HAT: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200' \
                    ' - split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05' \
                    ' - binary_split: False - stop_mem_management: False - remove_poor_atts: False' \
                    ' - no_pre_prune: False - leaf_prediction: nba - nb_threshold: 0' \
                    ' - nominal_attributes: [] - '
    assert learner.get_info() == expected_info

    assert type(learner.predict(X)) == np.ndarray
    assert type(learner.predict_proba(X)) == np.ndarray
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = (X_train.to_numpy(), X_test.to_numpy(),
                                    y_train.to_numpy(), y_test.to_numpy())
print(type(X_train))
# print(type(X_train.to_numpy()))

# ### Train the classifier (model)

# In[2]:

clasificador = HAT()
print(clasificador.get_info())

print("start training")
clasificador.fit(X_train, y_train, classes=None, sample_weight=None)
print("end training")

# In[3]:

print("start predict")
predict = clasificador.predict(X_test)
print("end predict")

# In[4]:

print("shape_predict")
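# Since HAT is an incremental learner, the batch-style fit() call above
# iterates over the training set internally. A sketch of an equivalent
# chunked loop, which also allows progress reporting; the chunk size is
# arbitrary, and `classes` follows the skmultiflow convention of passing all
# labels on the first partial_fit call.
import numpy as np

clasificador = HAT()
classes = np.unique(y_train)  # tell the tree about all labels up front

chunk_size = 1000
for start in range(0, len(X_train), chunk_size):
    end = start + chunk_size
    clasificador.partial_fit(X_train[start:end], y_train[start:end], classes=classes)
    print("trained on {} samples".format(min(end, len(X_train))))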
def test_hat_mc(test_path):
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1, noise_percentage=0.05),
                                drift_stream=SEAGenerator(random_state=2,
                                                          classification_function=2,
                                                          noise_percentage=0.05),
                                random_state=1, position=250, width=10)
    stream.prepare_for_use()

    learner = HAT(leaf_prediction='mc')

    cnt = 0
    max_samples = 1000
    y_pred = array('i')
    y_proba = []
    wait_samples = 20

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
                                       0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
                                       1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
                                       1, 1, 1, 0, 1, 0, 1, 0, 1])
    assert np.alltrue(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_mc.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = 'HAT: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200' \
                    ' - split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05' \
                    ' - binary_split: False - stop_mem_management: False - remove_poor_atts: False' \
                    ' - no_pre_prune: False - leaf_prediction: mc - nb_threshold: 0' \
                    ' - nominal_attributes: [] - '
    assert learner.get_info() == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 0.005295278636481529, 1.0: 1.9947047213635185}\n'
    expected_model_2 = 'Leaf = Class 1.0 | {0.0: 0.0052952786364815294, 1.0: 1.9947047213635185}\n'
    expected_model_3 = 'Leaf = Class 1.0 | {1.0: 1.9947047213635185, 0.0: 0.0052952786364815294}\n'

    assert (learner.get_model_description() == expected_model_1) \
        or (learner.get_model_description() == expected_model_2) \
        or (learner.get_model_description() == expected_model_3)

    stream.restart()
    X, y = stream.next_sample(5000)

    learner = HAT(max_byte_size=30, leaf_prediction='mc', grace_period=10)
    learner.partial_fit(X, y)
def test_led():
    led_a = ConceptDriftStream(
        stream=LEDGeneratorDrift(has_noise=False, noise_percentage=0.0, n_drift_features=3),
        drift_stream=LEDGeneratorDrift(has_noise=False, noise_percentage=0.0, n_drift_features=7),
        random_state=None,
        alpha=90.0,  # angle of change, in degrees (0 - 90)
        position=250000,
        width=1)
    led_a.name = "led_a"

    led_g = ConceptDriftStream(
        stream=LEDGeneratorDrift(has_noise=False, noise_percentage=0.0, n_drift_features=3),
        drift_stream=LEDGeneratorDrift(has_noise=False, noise_percentage=0.0, n_drift_features=7),
        random_state=None,
        position=250000,
        width=50000)
    led_g.name = "led_g"

    led_fa = ReoccuringDriftStream(
        stream=LEDGeneratorDrift(has_noise=False, noise_percentage=0.0, n_drift_features=3),
        drift_stream=LEDGeneratorDrift(has_noise=False, noise_percentage=0.0, n_drift_features=7),
        random_state=None,
        alpha=90.0,  # angle of change, in degrees (0 - 90)
        position=2000,
        width=1)

    led_fg = ReoccuringDriftStream(
        stream=LEDGeneratorDrift(has_noise=False, noise_percentage=0.0, n_drift_features=3),
        drift_stream=LEDGeneratorDrift(has_noise=False, noise_percentage=0.0, n_drift_features=7),
        random_state=None,
        position=2000,
        width=1000)

    # Renamed from `np` to avoid shadowing the usual numpy alias
    n_prototypes = 2
    sigma = 3
    clfs = [
        ARSLVQ(prototypes_per_class=n_prototypes, sigma=sigma,
               confidence=0.0001, window_size=1500),
        OzaBaggingAdwin(),
        AdaptiveRandomForest(),
        HAT(),
        RSLVQ(prototypes_per_class=n_prototypes, sigma=sigma),
        SAMKNN()
    ]

    cv = CrossValidation(clfs=clfs, parallel=1)
    cv.streams = [led_a, led_g, led_fa, led_fg]
    cv.search()
    cv.save_summary()
def cargaClassifiers(params, n_classes):
    gamma = params[0][0]
    n_gaussianRF = params[0][1]
    window_size = params[1][0]
    vecinos = params[1][1]    # number of neighbors
    hoja_size = params[1][2]  # leaf size

    # KNN and GRF_KNN
    clf_1 = KNN(n_neighbors=vecinos, leaf_size=hoja_size, max_window_size=window_size)
    clf_2 = GRF_KNN(n_neighbors=vecinos, leaf_size=hoja_size, max_window_size=window_size)
    clf_2.gamma = gamma
    clf_2.n_gaussianRF = n_gaussianRF

    # HoeffdingTree and GRF_HoeffdingTree
    clf_3 = HoeffdingTree()
    clf_4 = GRF_HoeffdingTree()
    clf_4.gamma = gamma
    clf_4.n_gaussianRF = n_gaussianRF

    # HoeffdingAdaptiveTree and GRF_HoeffdingAdaptiveTree
    clf_5 = HAT()
    clf_6 = GRF_HoeffdingAdaptiveTree()
    clf_6.gamma = gamma
    clf_6.n_gaussianRF = n_gaussianRF

    # NaiveBayes and GRF_NaiveBayes
    # clf_7 = NaiveBayes()
    # clf_8 = GRF_NaiveBayes()
    # clf_8.gamma = gamma
    # clf_8.n_gaussianRF = n_gaussianRF

    # GNB and GRF_GNB
    clf_9 = GaussianNB()
    clf_10 = GRF_GaussianNB()
    clf_10.gamma = gamma
    clf_10.n_gaussianRF = n_gaussianRF

    # SGDClassifier and GRF_SGDClassifier
    clf_11 = SGDClassifier(max_iter=1)
    clf_12 = GRF_SGDClassifier(max_iter=1)
    clf_12.gamma = gamma
    clf_12.n_gaussianRF = n_gaussianRF

    # Perceptron and GRF_Perceptron
    clf_13 = SGDClassifier(loss='perceptron', eta0=1, learning_rate='constant',
                           penalty=None, max_iter=1)
    clf_14 = GRF_SGDClassifier(loss='perceptron', eta0=1, learning_rate='constant',
                               penalty=None, max_iter=1)
    clf_14.gamma = gamma
    clf_14.n_gaussianRF = n_gaussianRF

    # PassiveAggressiveClassifier and GRF_PassiveAggressiveClassifier
    clf_15 = PassiveAggressiveClassifier(max_iter=1)
    clf_16 = GRF_PassiveAggressiveClassifier(max_iter=1)
    clf_16.gamma = gamma
    clf_16.n_gaussianRF = n_gaussianRF

    # MLPClassifier and GRF_MLPClassifier
    clf_17 = MLPClassifier(batch_size=1, max_iter=1, hidden_layer_sizes=(100,))
    clf_18 = GRF_MLPClassifier(batch_size=1, max_iter=1, hidden_layer_sizes=(100,))
    clf_18.gamma = gamma
    clf_18.n_gaussianRF = n_gaussianRF

    classifiers = [clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_9, clf_10,
                   clf_11, clf_12, clf_13, clf_14, clf_15, clf_16, clf_17, clf_18]
    classifiers_init = [clf_1, clf_2, clf_3, clf_4, clf_5, clf_6, clf_9, clf_10,
                        clf_11, clf_12, clf_13, clf_14, clf_15, clf_16, clf_17, clf_18]
    # classifiers = [clf_1, clf_2]
    # classifiers_init = [clf_1, clf_2]

    # Derive a short display name for each classifier from fixed offsets
    # into its repr()
    names = []
    for c in range(len(classifiers)):
        classifier = classifiers[c]
        class_name = ''
        if str(classifier)[26:33] == 'GRF_KNN':
            class_name = str(classifier)[26:33]
        elif str(classifier)[22:25] == 'KNN':
            class_name = str(classifier)[22:25]
        elif str(classifier)[34:47] == 'HoeffdingTree':
            class_name = 'HT'
        elif str(classifier)[38:55] == 'GRF_HoeffdingTree':
            class_name = 'GRF_HT'
        elif str(classifier)[43:46] == 'HAT':
            class_name = str(classifier)[43:46]
        elif str(classifier)[47:72] == 'GRF_HoeffdingAdaptiveTree':
            class_name = 'GRF_HAT'
        # elif str(classifier)[31:41] == 'NaiveBayes':
        #     class_name = 'MNB'
        # elif str(classifier)[35:49] == 'GRF_NaiveBayes':
        #     class_name = 'GRF_MNB'
        elif str(classifier)[0:10] == 'GaussianNB':
            class_name = 'GNB'
        elif str(classifier)[0:14] == 'GRF_GaussianNB':
            class_name = 'GRF_GNB'
        elif str(classifier)[0:13] == 'SGDClassifier' and classifier.loss == 'hinge':
            class_name = 'SGD'
        elif str(classifier)[0:17] == 'GRF_SGDClassifier' and classifier.loss == 'hinge':
            class_name = 'GRF_SGD'
        elif str(classifier)[0:13] == 'SGDClassifier' and classifier.loss == 'perceptron':
            class_name = 'Perceptron'
        elif str(classifier)[0:17] == 'GRF_SGDClassifier' and classifier.loss == 'perceptron':
            class_name = 'GRF_Perceptron'
        elif str(classifier)[0:27] == 'PassiveAggressiveClassifier':
            class_name = 'PA'
        elif str(classifier)[0:31] == 'GRF_PassiveAggressiveClassifier':
            class_name = 'GRF_PA'
        elif str(classifier)[0:13] == 'MLPClassifier':
            class_name = 'MLP'
        elif str(classifier)[0:17] == 'GRF_MLPClassifier':
            class_name = 'GRF_MLP'
        # elif str(classifier)[0:9] == 'OnlineGRF':
        #     class_name = str(classifier)[0:9]
        names.append(class_name)

    return classifiers, names, classifiers_init
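# The repr()-slicing above is brittle: the magic offsets break whenever a
# default parameter's repr changes. A sketch of a sturdier alternative keyed
# on the class name (the SHORT_NAMES mapping and short_name helper are
# illustrative, not existing code):
SHORT_NAMES = {
    'KNN': 'KNN', 'GRF_KNN': 'GRF_KNN',
    'HoeffdingTree': 'HT', 'GRF_HoeffdingTree': 'GRF_HT',
    'HAT': 'HAT', 'GRF_HoeffdingAdaptiveTree': 'GRF_HAT',
    'GaussianNB': 'GNB', 'GRF_GaussianNB': 'GRF_GNB',
    'PassiveAggressiveClassifier': 'PA', 'GRF_PassiveAggressiveClassifier': 'GRF_PA',
    'MLPClassifier': 'MLP', 'GRF_MLPClassifier': 'GRF_MLP',
}


def short_name(classifier):
    cls = type(classifier).__name__
    if cls in ('SGDClassifier', 'GRF_SGDClassifier'):
        # The offsets were distinguishing hinge-loss SGD from the perceptron
        # configuration; the loss attribute does that directly.
        base = 'SGD' if classifier.loss == 'hinge' else 'Perceptron'
        return 'GRF_' + base if cls.startswith('GRF_') else base
    return SHORT_NAMES.get(cls, cls)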