def demo():
    # The classifiers we will use (other options: SAMKNNClassifier, LeveragingBaggingClassifier, SGD)
    h1 = [HoeffdingTreeClassifier(), SAMKNNClassifier(),
          LeveragingBaggingClassifier(random_state=1), SGDClassifier()]
    h2 = [HoeffdingTreeClassifier(), SAMKNNClassifier(),
          LeveragingBaggingClassifier(random_state=1), SGDClassifier()]
    h3 = [HoeffdingTreeClassifier(), SAMKNNClassifier(),
          LeveragingBaggingClassifier(random_state=1), SGDClassifier()]
    model_names = ['HT', 'SAMKNNClassifier', 'LBkNN', 'SGDC']

    # Demo 1 -- plot should not fail
    demo_parameterized(h1, model_names=model_names)

    # Demo 2 -- csv output should look nice
    demo_parameterized(h2, "sea_stream.csv", False, model_names)

    # Demo 3 -- should not give "'NoneType' object is not iterable" error
    demo_parameterized(h3, "covtype.csv", False, model_names)
def test_hoeffding_tree_categorical_features(test_path):
    data_path = os.path.join(test_path, 'ht_categorical_features_testcase.npy')
    stream = np.load(data_path)
    # Remove the last two columns (regression targets)
    stream = stream[:, :-2]
    X, y = stream[:, :-1], stream[:, -1]

    nominal_attr_idx = np.arange(7).tolist()
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    learner.partial_fit(X, y, classes=np.unique(y))

    expected_description = "if Attribute 0 = -15.0:\n" \
                           " Leaf = Class 2 | {2: 350.0}\n" \
                           "if Attribute 0 = 0.0:\n" \
                           " Leaf = Class 0 | {0: 420.0, 1: 252.0}\n" \
                           "if Attribute 0 = 1.0:\n" \
                           " Leaf = Class 1 | {0: 312.0, 1: 332.0}\n" \
                           "if Attribute 0 = 2.0:\n" \
                           " Leaf = Class 1 | {0: 236.0, 1: 383.0}\n" \
                           "if Attribute 0 = 3.0:\n" \
                           " Leaf = Class 1 | {0: 168.0, 1: 459.0}\n" \
                           "if Attribute 0 = -30.0:\n" \
                           " Leaf = Class 3.0 | {3.0: 46.0, 4.0: 42.0}\n"

    assert learner.get_model_description() == expected_description
def test_hoeffding_tree_model_information():
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = stream.next_sample(5000)

    nominal_attr_idx = [x for x in range(5, stream.n_features)]
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    learner.partial_fit(X, y, classes=stream.target_values)

    expected_info = {
        'Tree size (nodes)': 5,
        'Tree size (leaves)': 3,
        'Active learning nodes': 3,
        'Tree depth': 2,
        'Active leaf byte size estimate': 0.0,
        'Inactive leaf byte size estimate': 0.0,
        'Byte size estimate overhead': 1.0
    }

    observed_info = learner.get_model_measurements
    for k in expected_info:
        assert k in observed_info
        assert expected_info[k] == observed_info[k]

    expected_description = "if Attribute 0 <= 4.549969620513424:\n" \
                           " if Attribute 1 <= 5.440182925299016:\n" \
                           "  Leaf = Class 0 | {0: 345.54817975126275, 1: 44.43855503614928}\n" \
                           " if Attribute 1 > 5.440182925299016:\n" \
                           "  Leaf = Class 1 | {0: 54.451820248737235, 1: 268.5614449638507}\n" \
                           "if Attribute 0 > 4.549969620513424:\n" \
                           " Leaf = Class 1 | {0: 390.5845685762964, 1: 2372.3747376855454}\n"

    assert expected_description == learner.get_model_description()
def __init__(self, window_size):
    self.window_size = window_size
    self.model = HoeffdingTreeClassifier()
    self.history = None
    self.accuracy = None
    # Counts of all predictions and of correct predictions, used to compute accuracy
    self.predictions = 0
    self.correct_predictions = 0
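# A minimal sketch (not part of the original class) of how the attributes
# above could drive a prequential test-then-train step. The method name
# `update` and the accuracy bookkeeping are assumptions for illustration;
# only `predict`/`partial_fit` are the real scikit-multiflow API.
def update(self, X, y):
    # Test first: score the incoming samples before learning from them
    y_pred = self.model.predict(X)
    self.predictions += len(y)
    self.correct_predictions += int((y_pred == y).sum())
    self.accuracy = self.correct_predictions / self.predictions
    # Then train on the same samples
    self.model.partial_fit(X, y)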
def test_pprint():
    learner = HoeffdingTreeClassifier()

    expected_string = "binary_split=False, grace_period=200, leaf_prediction='nba',\n" \
                      " max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0,\n" \
                      " no_preprune=False, nominal_attributes=None, remove_poor_atts=False,\n" \
                      " split_confidence=1e-07, split_criterion='info_gain',\n" \
                      " stop_mem_management=False, tie_threshold=0.05"
    assert _pprint(learner.get_params()) == expected_string
def test_get_tags():
    classifier = HoeffdingTreeClassifier()
    regressor = HoeffdingTreeRegressor()
    multi_output_regressor = iSOUPTreeRegressor()

    classifier_tags = classifier._get_tags()
    expected_tags = {
        'X_types': ['2darray'],
        '_skip_test': False,
        'allow_nan': False,
        'multilabel': False,
        'multioutput': False,
        'multioutput_only': False,
        'no_validation': False,
        'non_deterministic': False,
        'poor_score': False,
        'requires_positive_data': False,
        'stateless': False
    }
    assert classifier_tags == expected_tags

    regressor_tags = regressor._get_tags()
    expected_tags = {
        'X_types': ['2darray'],
        '_skip_test': False,
        'allow_nan': False,
        'multilabel': False,
        'multioutput': False,
        'multioutput_only': False,
        'no_validation': False,
        'non_deterministic': False,
        'poor_score': False,
        'requires_positive_data': False,
        'stateless': False
    }
    assert regressor_tags == expected_tags

    multi_output_regressor_tags = multi_output_regressor._get_tags()
    expected_tags = {
        'X_types': ['2darray'],
        '_skip_test': False,
        'allow_nan': False,
        'multilabel': False,
        'multioutput': True,
        'multioutput_only': True,
        'no_validation': False,
        'non_deterministic': False,
        'poor_score': False,
        'requires_positive_data': False,
        'stateless': False
    }
    assert multi_output_regressor_tags == expected_tags
def main():
    # start agent network server
    agentNetwork = AgentNetwork()

    # init agents
    gen_agent = agentNetwork.add_agent(agentType=DataStreamAgent)
    trainer_agent = agentNetwork.add_agent(agentType=Trainer)
    predictor_agent = agentNetwork.add_agent(agentType=Predictor)
    evaluator_agent = agentNetwork.add_agent(agentType=Evaluator)
    monitor_agent_1 = agentNetwork.add_agent(agentType=MonitorAgent)
    monitor_agent_2 = agentNetwork.add_agent(agentType=MonitorAgent)

    gen_agent.init_parameters(stream=SineGenerator(), pretrain_size=1000, batch_size=1)
    trainer_agent.init_parameters(ml_model=HoeffdingTreeClassifier())

    # connect agents: we can connect multiple agents to any particular agent,
    # but the receiving agent needs to handle multiple input types
    agentNetwork.bind_agents(gen_agent, trainer_agent)
    agentNetwork.bind_agents(gen_agent, predictor_agent)
    agentNetwork.bind_agents(trainer_agent, predictor_agent)
    agentNetwork.bind_agents(predictor_agent, evaluator_agent)
    agentNetwork.bind_agents(evaluator_agent, monitor_agent_1)
    agentNetwork.bind_agents(predictor_agent, monitor_agent_2)

    # set all agents' states to "Running"
    agentNetwork.set_running_state()

    # allow for shutting down the network after execution
    return agentNetwork
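# A minimal sketch of how the network above might be launched and torn down
# from the command line; the `shutdown()` call is an assumption based on the
# agentMET4FOF-style API suggested by `AgentNetwork`, not part of the
# original snippet.
if __name__ == "__main__":
    agent_network = main()
    input("Press Enter to stop the agent network...")
    agent_network.shutdown()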
def demo():
    """ _test_pipeline

    This demo demonstrates the Pipeline structure working seamlessly as a
    learner, while being passed as parameter to an EvaluatePrequential object.

    """
    # Setup the stream
    # stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
    #                     "master/covtype.csv")
    # If used for Hoeffding Trees then need to pass indices for Nominal attributes

    # Test with RandomTreeGenerator
    # stream = RandomTreeGenerator(n_classes=2, n_numerical_attributes=5)

    # Test with WaveformGenerator
    stream = WaveformGenerator()

    # Setup the classifier
    # classifier = PerceptronMask()
    # classifier = NaiveBayes()
    # classifier = PassiveAggressiveClassifier()
    classifier = HoeffdingTreeClassifier()

    # Setup the pipeline
    pipe = Pipeline([('Hoeffding Tree', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(show_plot=True, pretrain_size=1000, max_samples=100000)

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)
def test_evaluate_classification_coverage(tmpdir):
    # A simple coverage test. Tests for metrics are placed in the corresponding test module.
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=2, n_cat_features=2, n_num_features=5,
                                 n_categories_per_cat_feature=5, max_tree_depth=6,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)

    # Learner
    nominal_attr_idx = [x for x in range(15, len(stream.feature_names))]
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    max_samples = 1000
    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    metrics = ['accuracy', 'kappa', 'kappa_t', 'kappa_m', 'f1', 'precision',
               'recall', 'gmean', 'true_vs_predicted']
    evaluator = EvaluatePrequential(max_samples=max_samples,
                                    metrics=metrics,
                                    output_file=output_file)

    # Evaluate
    evaluator.evaluate(stream=stream, model=learner)

    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    expected_current_accuracy = 0.685
    assert np.isclose(current_performance.accuracy_score(), expected_current_accuracy)
def test_active_learning_window_extraction():
    df = pd.read_csv(METADB_PATH)
    stream = DataStream(df)
    learner = ActiveLearner(0.1, stream, HoeffdingTreeClassifier(), store_history=True)

    for _ in range(1000):
        learner.next_data()
    wind1 = learner.get_last_window()

    for _ in range(1000):
        learner.next_data()
    wind2 = learner.get_last_window(n_classes=5)

    print(wind1)
    print(wind2)

    assert wind1.shape[0] == 1
    assert wind1.shape[1] > 0
    assert wind2.shape[0] == 1
    assert wind2.shape[1] > 0
def test_active_learning_window_extraction_with_delta():
    df = pd.read_csv(METADB_PATH)
    stream = DataStream(df)
    learner = ActiveLearner(0.1, stream, HoeffdingTreeClassifier(), store_history=True)

    for _ in range(1000):
        learner.next_data()
    new_curr1 = mean([x[2] for x in learner.history])
    old_last_window_acc1 = learner.last_window_acc
    expected_delta1 = new_curr1 - old_last_window_acc1
    wind1 = learner.get_last_window(delta_acc_summary_func="mean")

    for _ in range(1000):
        learner.next_data()
    new_curr2 = max([x[2] for x in learner.history])
    old_last_window_acc2 = learner.last_window_acc
    expected_delta2 = new_curr2 - old_last_window_acc2
    wind2 = learner.get_last_window(n_classes=5, delta_acc_summary_func="max")

    print(wind1)
    print(wind2)

    assert wind1.shape[0] == 1
    assert wind1.shape[1] > 0
    assert wind2.shape[0] == 1
    assert wind2.shape[1] > 0
    assert expected_delta1 == wind1["window_acc_delta"].to_numpy()[0]
    assert expected_delta2 == wind2["window_acc_delta"].to_numpy()[0]
    assert old_last_window_acc2 == new_curr1
def test_hoeffding_tree_coverage():
    # Cover memory management
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = stream.next_sample(5000)
    learner = HoeffdingTreeClassifier(max_byte_size=30,
                                      memory_estimate_period=100,
                                      grace_period=10,
                                      leaf_prediction='mc')
    learner.partial_fit(X, y, classes=stream.target_values)
    learner.reset()

    # Cover nominal attribute observer
    stream = RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
                                 n_num_features=0, n_categories_per_cat_feature=2)
    X, y = stream.next_sample(1000)
    learner = HoeffdingTreeClassifier(leaf_prediction='mc',
                                      nominal_attributes=[i for i in range(10)])
    learner.partial_fit(X, y, classes=stream.target_values)
def test_evaluate_holdout_classifier(tmpdir, test_path):
    # Setup file stream
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=4, n_cat_features=2, n_num_features=5,
                                 n_categories_per_cat_feature=5, max_tree_depth=6,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)

    # Setup learner
    nominal_attr_idx = [x for x in range(15, len(stream.feature_names))]
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    # Setup evaluator
    n_wait = 200
    max_samples = 1000
    metrics = ['accuracy', 'kappa', 'kappa_t']
    output_file = os.path.join(str(tmpdir), "holdout_summary.csv")
    evaluator = EvaluateHoldout(n_wait=n_wait,
                                max_samples=max_samples,
                                test_size=50,
                                metrics=metrics,
                                output_file=output_file)

    # Evaluate
    result = evaluator.evaluate(stream=stream, model=learner)
    result_learner = result[0]

    assert isinstance(result_learner, HoeffdingTreeClassifier)
    assert learner.get_model_measurements == result_learner.get_model_measurements

    expected_file = os.path.join(test_path, 'holdout_summary.csv')
    compare_files(output_file, expected_file)

    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    expected_mean_accuracy = 0.344000
    assert np.isclose(mean_performance.accuracy_score(), expected_mean_accuracy)

    expected_mean_kappa = 0.135021
    assert np.isclose(mean_performance.kappa_score(), expected_mean_kappa)

    expected_mean_kappa_t = 0.180000
    assert np.isclose(mean_performance.kappa_t_score(), expected_mean_kappa_t)

    expected_current_accuracy = 0.360000
    assert np.isclose(current_performance.accuracy_score(), expected_current_accuracy)

    expected_current_kappa = 0.152542
    assert np.isclose(current_performance.kappa_score(), expected_current_kappa)

    expected_current_kappa_t = 0.200000
    assert np.isclose(current_performance.kappa_t_score(), expected_current_kappa_t)

    expected_info = "EvaluateHoldout(batch_size=1, dynamic_test_set=False, max_samples=1000,\n" \
                    " max_time=inf, metrics=['accuracy', 'kappa', 'kappa_t'],\n" \
                    " n_wait=200,\n" \
                    " output_file='holdout_summary.csv',\n" \
                    " restart_stream=True, show_plot=False, test_size=50)"
    assert evaluator.get_info() == expected_info
def _init_ensemble_member(self):
    # Randomize the Hoeffding tree's three key parameters
    grace_period = (random.randint(0, 20) + 1) * 10
    split_confidence = (random.randint(0, 20) + 1) * 0.05
    tie_threshold = (random.randint(0, 20) + 1) * 0.05
    return MyEnsembleBaseLearner(
        classifier=HoeffdingTreeClassifier(grace_period=grace_period,
                                           split_confidence=split_confidence,
                                           tie_threshold=tie_threshold))
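# A minimal sketch of how randomized members like the one above might be
# collected into an ensemble; the names `_init_ensemble`, `n_estimators`,
# and `self.ensemble` are assumptions for illustration, not part of the
# original snippet.
def _init_ensemble(self, n_estimators):
    self.ensemble = [self._init_ensemble_member() for _ in range(n_estimators)]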
def test_evaluate_prequential_classifier(tmpdir, test_path):
    # Setup file stream
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=4, n_cat_features=2, n_num_features=5,
                                 n_categories_per_cat_feature=5, max_tree_depth=6,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)

    # Setup learner
    nominal_attr_idx = [x for x in range(15, len(stream.feature_names))]
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    # Setup evaluator
    max_samples = 1000
    metrics = ['accuracy', 'kappa', 'kappa_t']
    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(max_samples=max_samples,
                                    metrics=metrics,
                                    output_file=output_file)

    # Evaluate
    result = evaluator.evaluate(stream=stream, model=[learner])
    result_learner = result[0]

    assert isinstance(result_learner, HoeffdingTreeClassifier)
    assert learner.model_measurements == result_learner.model_measurements

    expected_file = os.path.join(test_path, 'prequential_summary.csv')
    compare_files(output_file, expected_file)

    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    # Simple test. Tests for metrics are placed in the corresponding test module.
    expected_mean_accuracy = 0.436250
    assert np.isclose(mean_performance.accuracy_score(), expected_mean_accuracy)

    expected_mean_kappa = 0.231791
    assert np.isclose(mean_performance.kappa_score(), expected_mean_kappa)

    expected_mean_kappa_t = 0.236887
    assert np.isclose(mean_performance.kappa_t_score(), expected_mean_kappa_t)

    expected_current_accuracy = 0.430000
    assert np.isclose(current_performance.accuracy_score(), expected_current_accuracy)

    expected_current_kappa = 0.223909
    assert np.isclose(current_performance.kappa_score(), expected_current_kappa)

    expected_current_kappa_t = 0.240000
    assert np.isclose(current_performance.kappa_t_score(), expected_current_kappa_t)

    expected_info = "EvaluatePrequential(batch_size=1, data_points_for_classification=False,\n" \
                    " max_samples=1000, max_time=inf,\n" \
                    " metrics=['accuracy', 'kappa', 'kappa_t'], n_wait=200,\n" \
                    " output_file='prequential_summary.csv',\n" \
                    " pretrain_size=200, restart_stream=True, show_plot=False)"
    assert evaluator.get_info() == expected_info
def demo():
    # The classifier we will use (other options: SAMKNNClassifier, LeveragingBaggingClassifier, SGD)
    h = HoeffdingTreeClassifier()

    # Setup Stream
    stream = FileStream("../data/datasets/sea_stream.csv")

    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain,
                                    output_file='test_filestream.csv',
                                    max_samples=10000,
                                    batch_size=1,
                                    n_wait=1000,
                                    show_plot=True)
    evaluator.evaluate(stream=stream, model=h)
def __init__(self,
             base_estimator=HoeffdingTreeClassifier(grace_period=50, split_confidence=0.01),
             n_estimators: int = 100,
             subspace_mode: str = "percentage",
             subspace_size: int = 60,
             training_method: str = "randompatches",
             lam: float = 6.0,
             drift_detection_method: BaseDriftDetector = ADWIN(delta=1e-5),
             warning_detection_method: BaseDriftDetector = ADWIN(delta=1e-4),
             disable_weighted_vote: bool = False,
             disable_drift_detection: bool = False,
             disable_background_learner: bool = False,
             nominal_attributes=None,
             random_state=None):
    self.base_estimator = base_estimator   # Not restricted to a specific base estimator.
    self.n_estimators = n_estimators
    if subspace_mode not in {self._FEATURES_SQRT, self._FEATURES_SQRT_INV,
                             self._FEATURES_PERCENT, self._FEATURES_M}:
        raise ValueError("Invalid subspace_mode: {}.\n"
                         "Valid options are: {}".format(subspace_mode,
                                                        {self._FEATURES_M, self._FEATURES_SQRT,
                                                         self._FEATURES_SQRT_INV,
                                                         self._FEATURES_PERCENT}))
    self.subspace_mode = subspace_mode
    self.subspace_size = subspace_size
    if training_method not in {self._TRAIN_RESAMPLING, self._TRAIN_RANDOM_PATCHES,
                               self._TRAIN_RANDOM_SUBSPACES}:
        raise ValueError("Invalid training_method: {}.\n"
                         "Valid options are: {}".format(training_method,
                                                        {self._TRAIN_RANDOM_PATCHES,
                                                         self._TRAIN_RANDOM_SUBSPACES,
                                                         self._TRAIN_RESAMPLING}))
    self.training_method = training_method
    self.lam = lam
    self.drift_detection_method = drift_detection_method
    self.warning_detection_method = warning_detection_method
    self.disable_weighted_vote = disable_weighted_vote
    self.disable_drift_detection = disable_drift_detection
    self.disable_background_learner = disable_background_learner
    # Single option (accuracy) for drift detection criteria. Could be extended in the future.
    self.drift_detection_criteria = 'accuracy'
    self.nominal_attributes = nominal_attributes if nominal_attributes else []
    self.random_state = random_state
    # self._random_state is the actual object used internally
    self._random_state = check_random_state(self.random_state)
    self.ensemble = None
    self._n_samples_seen = 0
    self._subspaces = None
    self._base_performance_evaluator = ClassificationPerformanceEvaluator()
    self._base_learner_class = StreamingRandomPatchesBaseLearner
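# A minimal usage sketch, assuming the __init__ above belongs to
# scikit-multiflow's StreamingRandomPatchesClassifier (inferred from
# `StreamingRandomPatchesBaseLearner`; the class name is an assumption).
from skmultiflow.data import SEAGenerator
from skmultiflow.meta import StreamingRandomPatchesClassifier

stream = SEAGenerator(random_state=1)
srp = StreamingRandomPatchesClassifier(n_estimators=10, random_state=1)
X, y = stream.next_sample(1000)
srp.partial_fit(X, y, classes=stream.target_values)
X_test, _ = stream.next_sample(10)
print(srp.predict(X_test))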
def test_hoeffding_tree_coverage():
    # Cover memory management
    max_samples = 5000
    max_size_kb = 50
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=10, n_cat_features=2, n_num_features=5,
                                 n_categories_per_cat_feature=5, max_tree_depth=15,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)
    nominal_attr_idx = [x for x in range(5, stream.n_features)]

    # The unconstrained model takes over 72 kB
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx,
                                      leaf_prediction='mc',
                                      memory_estimate_period=100,
                                      max_byte_size=max_size_kb * 2**10)

    X, y = stream.next_sample(max_samples)
    learner.partial_fit(X, y)

    assert calculate_object_size(learner, 'kB') <= max_size_kb
    learner.reset()
def init_parameters(self, mode="prequential", ml_model=HoeffdingTreeClassifier(), split_type=None):
    self.mode = mode
    self.ml_model = ml_model
    self.results = []
    if split_type is not None:
        self.split_type = split_type
    else:
        self.split_type = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
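# A minimal sketch of the test-then-train loop that mode="prequential"
# implies; the helper name `prequential_step` is an assumption for
# illustration and not part of the original agent code.
def prequential_step(ml_model, X, y):
    y_pred = ml_model.predict(X)   # test on the samples first...
    ml_model.partial_fit(X, y)     # ...then train on the same samples
    return y_pred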
def test_evaluate_delayed_classification_single_time_delay(tmpdir):
    # Test using a single delay by time
    data = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                               n_classes=2, n_cat_features=2, n_num_features=5,
                               n_categories_per_cat_feature=5, max_tree_depth=6,
                               min_leaf_depth=3, fraction_leaves_per_level=0.15)

    # Number of samples to use
    max_samples = 1000

    # Get X and y
    X, y = data.next_sample(max_samples)
    y = y.astype(int)
    time = generate_random_dates(seed=1, samples=max_samples)

    # Setup temporal stream
    stream = TemporalDataStream(X, y, time, sample_delay=np.timedelta64(30, "D"), ordered=False)

    # Setup learner
    nominal_attr_idx = [x for x in range(15, len(data.feature_names))]
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    output_file = os.path.join(str(tmpdir), "prequential_delayed_summary.csv")
    metrics = ['accuracy', 'kappa', 'kappa_t', 'kappa_m', 'f1', 'precision',
               'recall', 'gmean', 'true_vs_predicted']
    evaluator = EvaluatePrequentialDelayed(max_samples=max_samples,
                                           metrics=metrics,
                                           output_file=output_file)

    # Evaluate
    evaluator.evaluate(stream=stream, model=learner)

    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    expected_current_accuracy = 0.715
    assert np.isclose(current_performance.accuracy_score(), expected_current_accuracy)
def demo():
    # The classifier we will use (other options: SAMKNNClassifier, LeveragingBaggingClassifier, SGD)
    h = HoeffdingTreeClassifier()

    # Setup Stream
    stream = FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                        "master/sea_stream.csv")

    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain,
                                    output_file='test_filestream.csv',
                                    max_samples=10000,
                                    batch_size=1,
                                    n_wait=1000,
                                    show_plot=True)
    evaluator.evaluate(stream=stream, model=h)
def demo(output_file=None, instances=40000):
    """ _test_comparison_holdout

    This demo will test a holdout evaluation task when more than one learner is
    evaluated, which makes it a comparison task.

    Parameters
    ----------
    output_file: string, optional
        If passed, this parameter indicates the output file name. If left blank,
        no output file will be generated.

    instances: int (Default: 40000)
        The evaluation's maximum number of instances.

    """
    # Setup the File Stream
    # stream = FileStream("../data/datasets/covtype.csv", -1, 1)
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifiers
    clf_one = HoeffdingTreeClassifier()
    # clf_two = KNNADWINClassifier(n_neighbors=8, max_window_size=2000)
    # classifier = PassiveAggressiveClassifier()
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the model list
    classifier = [clf_one]

    # Setup the evaluator
    evaluator = EvaluateHoldout(test_size=500,
                                dynamic_test_set=True,
                                max_samples=instances,
                                batch_size=1,
                                n_wait=5000,
                                max_time=1000,
                                output_file=output_file,
                                show_plot=True,
                                metrics=['kappa'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=classifier)
def test_set_params():
    learner = HoeffdingTreeClassifier()
    original_info = learner.get_info()

    params = learner.get_params()
    params.update(leaf_prediction='nb', split_criterion='gini', remove_poor_atts=True)

    learner.set_params(**params)
    updated_info = learner.get_info()

    assert original_info != updated_info

    expected_info = "HoeffdingTreeClassifier(binary_split=False, grace_period=200, leaf_prediction='nb', " \
                    "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, " \
                    "nominal_attributes=None, remove_poor_atts=True, split_confidence=1e-07, split_criterion='gini', " \
                    "stop_mem_management=False, tie_threshold=0.05)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
def __init__(self, min_estimators=5, max_estimators=20,
             base_estimators=[NaiveBayes(), HoeffdingTreeClassifier()],
             period=1000, alpha=0.002, beta=1.5, theta=0.05,
             enable_diversity=True):
    """ Creates a new instance of DiversifiedDynamicClassWeightedClassifier. """
    super().__init__()

    self.enable_diversity = enable_diversity
    self.min_estimators = min_estimators
    self.max_estimators = max_estimators
    self.base_estimators = base_estimators

    self.alpha = alpha
    self.beta = beta
    self.theta = theta
    self.period = period

    self.p = -1
    self.n_estimators = max_estimators
    self.epochs = None
    self.num_classes = None
    self.experts = None
    self.div = []

    self.window_size = None
    self.X_batch = None
    self.y_batch = None
    self.y_batch_experts = None

    # custom measurements attributes
    self.custom_measurements = []
    self.custom_time = []

    self.reset()
def demo(output_file=None, instances=40000):
    """ _test_prequential_bagging

    This demo shows the evaluation process of a LeveragingBaggingClassifier,
    initialized with different base estimators.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    # stream = FileStream("../data/datasets/sea_big.csv", -1, 1)
    # stream = SEAGenerator(classification_function=2, noise_percentage=0.0)
    # stream.prepare_for_use()
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    # classifier = OzaBaggingADWINClassifier(base_estimator=KNNClassifier(n_neighbors=8, max_window_size=2000,
    #                                                                     leaf_size=30))
    # classifier = LeveragingBaggingClassifier(base_estimator=KNNClassifier(n_neighbors=8, max_window_size=2000,
    #                                                                       leaf_size=30),
    #                                          n_estimators=1)
    pipe = LeveragingBaggingClassifier(base_estimator=HoeffdingTreeClassifier(), n_estimators=2)

    # Setup the pipeline
    # pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=2000,
                                    max_samples=instances,
                                    output_file=output_file,
                                    show_plot=False)

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)
def test_hoeffding_tree_nb(test_path):
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12,
                                 n_classes=4, n_cat_features=2, n_num_features=5,
                                 n_categories_per_cat_feature=5, max_tree_depth=6,
                                 min_leaf_depth=3, fraction_leaves_per_level=0.15)

    nominal_attr_idx = [x for x in range(5, stream.n_features)]
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx,
                                      leaf_prediction='nb')

    cnt = 0
    max_samples = 5000
    predictions = array('i')
    proba_predictions = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('i', [0, 1, 3, 0, 0, 3, 0, 1, 1, 2,
                                       0, 2, 1, 1, 2, 1, 3, 0, 1, 1,
                                       1, 1, 0, 3, 1, 2, 1, 1, 3, 2,
                                       1, 2, 2, 2, 1, 1, 1, 0, 1, 2,
                                       0, 2, 0, 0, 0, 0, 1, 3, 2])
    assert np.alltrue(predictions == expected_predictions)

    expected_info = "HoeffdingTreeClassifier(binary_split=False, grace_period=200, leaf_prediction='nb', " \
                    "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, " \
                    "nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14], remove_poor_atts=False, " \
                    "split_confidence=1e-07, split_criterion='info_gain', stop_mem_management=False, " \
                    "tie_threshold=0.05)"
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info
def run_one_test(test_idx):
    # create test files:
    create_all_test_files(test_idx)

    # create the tree:
    ht = HoeffdingTreeClassifier(binary_split=True,
                                 grace_period=200,
                                 split_confidence=0.0000001,
                                 tie_threshold=0.05,
                                 split_criterion="hellinger")

    # pretrain on 200 pos and 1000 neg:
    train_path = Path("dataset") / "htru" / "train_200_1000.csv"
    train_tree(train_path, ht)
    print("\n")

    # create 4 copies of the tree:
    trees = [copy.deepcopy(ht) for _ in range(4)]

    # run the tests and collect the f-score and g-mean:
    results = []
    results.append(get_tree_results(trees[0], 10))
    results.append(get_tree_results(trees[1], 100))
    results.append(get_tree_results(trees[2], 1000))
    results.append(get_tree_results(trees[3], 10000))

    # save results:
    new_file_content = ""
    for row in results:
        new_line = str(row)[1:-1]
        new_file_content += new_line + "\n"

    result_file_path = Path("results") / "htru" / "HDVFDT" / str("results_" + str(test_idx) + ".csv")
    with open(result_file_path, "w") as file:
        file.write(new_file_content)

    return results
def train(name, clusters, window, normalize=False):
    input_csv = '{}{}_clusters={}_window={}_prepared.csv'.format(DATA_LOCATION, name, clusters, window)
    data = pd.read_csv(input_csv, index_col=0)

    if normalize:
        states = data.filter(['current_state', 'next_state'])
        sensors = data.drop(columns=['current_state', 'next_state'])
        scaler = StandardScaler()
        data = pd.DataFrame(data=scaler.fit_transform(X=sensors),
                            index=data.index,
                            columns=sensors.columns)
        data = pd.concat([data, states], axis='columns')

    stream = DataStream(data)

    hf = HoeffdingTreeClassifier()
    sgd = SGDClassifier()

    evaluator = EvaluatePrequential()
    evaluator.evaluate(stream=stream, model=[hf, sgd])

    # print('---------------------------------------------')
    # measurements = evaluator.get_mean_measurements()[0]
    # print(measurements.confusion_matrix)
    # print(measurements.accuracy_score())

    data = []
    for i, measurements in enumerate(evaluator.get_mean_measurements()):
        data.append([name, clusters, window, MODEL_NAMES[i], normalize,
                     measurements.accuracy_score(), measurements.precision_score(),
                     measurements.recall_score(), measurements.f1_score()])

    return pd.DataFrame(data=data,
                        columns=['name', 'clusters', 'window', 'model', 'normalized',
                                 'accuracy', 'precision', 'recall', 'f1'])
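# A minimal usage sketch for the `train` function above; the dataset name
# 'machine_a' and the window values are assumptions for illustration, not
# values from the original code.
results = pd.concat([train('machine_a', clusters=5, window=w, normalize=True)
                     for w in (10, 50, 100)],
                    ignore_index=True)
print(results.sort_values('f1', ascending=False).head())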
def test_data_stream(test_path):
    test_file = os.path.join(test_path, 'data/data_n30000.csv')
    raw_data = pd.read_csv(test_file)
    stream = DataStream(raw_data, name='Test')

    normal_knn_learner = KNNClassifier(n_neighbors=8,
                                       max_window_size=2000,
                                       leaf_size=40)
    weighted_knn_learner = WeightedKNNClassifier(n_neighbors=8,
                                                 max_window_size=2000,
                                                 leaf_size=40)
    standardize_knn_learner = KNNClassifier(n_neighbors=8,
                                            max_window_size=2000,
                                            leaf_size=40,
                                            standardize=True)
    nominal_attr_idx = [x for x in range(15, len(stream.feature_names))]
    hoeffding_learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)
    nb_learner = NaiveBayes()

    metrics = ['accuracy', 'kappa_m', 'kappa_t', 'recall']
    output_file = os.path.join(test_path, 'data/kkn_output.csv')
    evaluator = EvaluatePrequential(metrics=metrics, output_file=output_file)

    # Evaluate
    result = evaluator.evaluate(stream=stream,
                                model=[normal_knn_learner,
                                       weighted_knn_learner,
                                       standardize_knn_learner,
                                       hoeffding_learner,
                                       nb_learner])
    mean_performance, current_performance = evaluator.get_measurements()

    # Smoke test: reaching this point without raising is the actual check
    assert 1 == 1
def main():
    global agentNetwork
    # start agent network
    agentNetwork = AgentNetwork()

    # add agents
    data_stream_agent_1 = agentNetwork.add_agent(agentType=DataStreamAgent)
    ml_agent_hoeffdingTree = agentNetwork.add_agent(agentType=ML_Model)
    ml_agent_naiveBayes = agentNetwork.add_agent(agentType=ML_Model)
    monitor_agent_1 = agentNetwork.add_agent(agentType=MonitorAgent)

    # init parameters
    data_stream_agent_1.init_parameters(stream=WaveformGenerator(),
                                        pretrain_size=1000,
                                        batch_size=100)
    ml_agent_hoeffdingTree.init_parameters(ml_model=HoeffdingTreeClassifier())
    ml_agent_naiveBayes.init_parameters(ml_model=NaiveBayes())

    # connect agents
    agentNetwork.bind_agents(data_stream_agent_1, ml_agent_hoeffdingTree)
    agentNetwork.bind_agents(data_stream_agent_1, ml_agent_naiveBayes)
    agentNetwork.bind_agents(ml_agent_hoeffdingTree, monitor_agent_1)
    agentNetwork.bind_agents(ml_agent_naiveBayes, monitor_agent_1)
    agentNetwork.set_running_state()

    # allow for shutting down the network after execution
    return agentNetwork