def demo():
    """Run three parameterized demos, each on its own set of classifiers.

    Every demo receives a separately constructed list of models, so no
    model instance is shared between the three runs.
    """
    def build_models():
        # One new instance of each classifier under comparison.
        return [
            HoeffdingTreeClassifier(),
            SAMKNNClassifier(),
            LeveragingBaggingClassifier(random_state=1),
            SGDClassifier(),
        ]

    # All three model sets are created up front, before any demo runs.
    first_models, second_models, third_models = (build_models() for _ in range(3))
    labels = ['HT', 'SAMKNNClassifier', 'LBkNN', 'SGDC']

    # Demo 1 -- plot should not fail
    demo_parameterized(first_models, model_names=labels)

    # Demo 2 -- csv output should look nice
    demo_parameterized(second_models, "sea_stream.csv", False, labels)

    # Demo 3 -- should not give "'NoneType' object is not iterable" error
    demo_parameterized(third_models, "covtype.csv", False, labels)
def test_hoeffding_tree_categorical_features(test_path):
    """Check the tree description learned with attributes 0-6 declared nominal."""
    dataset = np.load(os.path.join(test_path, 'ht_categorical_features_testcase.npy'))
    # Drop the last two columns (regression targets); of what remains, the
    # final column is used as the class label and the rest as features.
    dataset = dataset[:, :-2]
    X = dataset[:, :-1]
    y = dataset[:, -1]

    learner = HoeffdingTreeClassifier(nominal_attributes=list(range(7)))
    learner.partial_fit(X, y, classes=np.unique(y))

    expected_description = (
        "if Attribute 0 = -15.0:\n"
        "  Leaf = Class 2 | {2: 350.0}\n"
        "if Attribute 0 = 0.0:\n"
        "  Leaf = Class 0 | {0: 420.0, 1: 252.0}\n"
        "if Attribute 0 = 1.0:\n"
        "  Leaf = Class 1 | {0: 312.0, 1: 332.0}\n"
        "if Attribute 0 = 2.0:\n"
        "  Leaf = Class 1 | {0: 236.0, 1: 383.0}\n"
        "if Attribute 0 = 3.0:\n"
        "  Leaf = Class 1 | {0: 168.0, 1: 459.0}\n"
        "if Attribute 0 = -30.0:\n"
        "  Leaf = Class 3.0 | {3.0: 46.0, 4.0: 42.0}\n"
    )

    assert learner.get_model_description() == expected_description
def test_hoeffding_tree_model_information():
    """Check the measurements and description of a tree trained on SEA samples."""
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = stream.next_sample(5000)

    learner = HoeffdingTreeClassifier(
        nominal_attributes=list(range(5, stream.n_features)))
    learner.partial_fit(X, y, classes=stream.target_values)

    expected_info = {
        'Tree size (nodes)': 5,
        'Tree size (leaves)': 3,
        'Active learning nodes': 3,
        'Tree depth': 2,
        'Active leaf byte size estimate': 0.0,
        'Inactive leaf byte size estimate': 0.0,
        'Byte size estimate overhead': 1.0,
    }

    # Read as an attribute (not called), exactly as the learner exposes it here.
    observed_info = learner.get_model_measurements
    for key, expected_value in expected_info.items():
        assert key in observed_info
        assert expected_value == observed_info[key]

    expected_description = (
        "if Attribute 0 <= 4.549969620513424:\n"
        "  if Attribute 1 <= 5.440182925299016:\n"
        "    Leaf = Class 0 | {0: 345.54817975126275, 1: 44.43855503614928}\n"
        "  if Attribute 1 > 5.440182925299016:\n"
        "    Leaf = Class 1 | {0: 54.451820248737235, 1: 268.5614449638507}\n"
        "if Attribute 0 > 4.549969620513424:\n"
        "  Leaf = Class 1 | {0: 390.5845685762964, 1: 2372.3747376855454}\n"
    )

    assert expected_description == learner.get_model_description()
예제 #4
0
 def __init__(self, window_size):
     """Store the window size and create the initial learner state.

     Parameters
     ----------
     window_size
         Value kept on the instance as ``window_size``.
     """
     self.window_size = window_size
     self.model = HoeffdingTreeClassifier()
     # Both start out unset.
     self.history = self.accuracy = None
     # Counters of all predictions and of correct predictions, used for
     # calculating accuracy.
     self.predictions = self.correct_predictions = 0
예제 #5
0
def test_pprint():
    """Check the pretty-printed parameters of a default Hoeffding tree."""
    learner = HoeffdingTreeClassifier()

    expected_string = (
        "binary_split=False, grace_period=200, leaf_prediction='nba',\n"
        " max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0,\n"
        " no_preprune=False, nominal_attributes=None, remove_poor_atts=False,\n"
        " split_confidence=1e-07, split_criterion='info_gain',\n"
        " stop_mem_management=False, tie_threshold=0.05"
    )
    rendered = _pprint(learner.get_params())
    assert rendered == expected_string
예제 #6
0
def test_get_tags():
    """Check the estimator tags reported by three tree-based learners."""
    classifier = HoeffdingTreeClassifier()
    regressor = HoeffdingTreeRegressor()
    multi_output_regressor = iSOUPTreeRegressor()

    # Tags expected from both the classifier and the single-output regressor.
    single_output_tags = {
        'X_types': ['2darray'],
        '_skip_test': False,
        'allow_nan': False,
        'multilabel': False,
        'multioutput': False,
        'multioutput_only': False,
        'no_validation': False,
        'non_deterministic': False,
        'poor_score': False,
        'requires_positive_data': False,
        'stateless': False,
    }
    # The multi-output regressor differs only in its two multi-output flags.
    multi_output_tags = dict(single_output_tags,
                             multioutput=True,
                             multioutput_only=True)

    assert classifier._get_tags() == single_output_tags
    assert regressor._get_tags() == single_output_tags
    assert multi_output_regressor._get_tags() == multi_output_tags
예제 #7
0
def main():
    """Build, wire up and start the streaming agent network.

    Returns
    -------
    The created ``AgentNetwork``, so the caller can shut it down after
    execution.
    """
    # start agent network server
    network = AgentNetwork()

    # init agents
    gen_agent = network.add_agent(agentType=DataStreamAgent)
    trainer_agent = network.add_agent(agentType=Trainer)
    predictor_agent = network.add_agent(agentType=Predictor)
    evaluator_agent = network.add_agent(agentType=Evaluator)
    monitor_agent_1 = network.add_agent(agentType=MonitorAgent)
    monitor_agent_2 = network.add_agent(agentType=MonitorAgent)

    gen_agent.init_parameters(stream=SineGenerator(),
                              pretrain_size=1000,
                              batch_size=1)
    trainer_agent.init_parameters(ml_model=HoeffdingTreeClassifier())

    # connect agents : We can connect multiple agents to any particular agent
    # However the agent needs to implement handling multiple input types
    connections = (
        (gen_agent, trainer_agent),
        (gen_agent, predictor_agent),
        (trainer_agent, predictor_agent),
        (predictor_agent, evaluator_agent),
        (evaluator_agent, monitor_agent_1),
        (predictor_agent, monitor_agent_2),
    )
    for source_agent, target_agent in connections:
        network.bind_agents(source_agent, target_agent)

    # set all agents states to "Running"
    network.set_running_state()

    return network
예제 #8
0
def demo():
    """ _test_pipeline

    Demonstrates a Pipeline being used as if it were a learner: a one-step
    Pipeline wrapping a Hoeffding tree is handed to an EvaluatePrequential
    object as the model to evaluate.

    """
    # Alternative streams that were tried here:
    #   FileStream("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
    #              "master/covtype.csv")
    #     (if used for Hoeffding Trees, indices for nominal attributes must be passed)
    #   RandomTreeGenerator(n_classes=2, n_numerical_attributes=5)
    stream = WaveformGenerator()

    # Alternative classifiers: PerceptronMask(), NaiveBayes(),
    # PassiveAggressiveClassifier()
    tree = HoeffdingTreeClassifier()

    # A pipeline with the classifier as its only step
    pipe = Pipeline([('Hoeffding Tree', tree)])

    evaluator = EvaluatePrequential(show_plot=True,
                                    pretrain_size=1000,
                                    max_samples=100000)
    evaluator.evaluate(stream=stream, model=pipe)
def test_evaluate_classification_coverage(tmpdir):
    """Coverage test: prequential evaluation requesting many classification metrics.

    Tests for the metrics themselves are placed in the corresponding test module.
    """
    stream = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=2,
        n_cat_features=2,
        n_num_features=5,
        n_categories_per_cat_feature=5,
        max_tree_depth=6,
        min_leaf_depth=3,
        fraction_leaves_per_level=0.15,
    )

    # Learner
    learner = HoeffdingTreeClassifier(
        nominal_attributes=list(range(15, len(stream.feature_names))))

    evaluator = EvaluatePrequential(
        max_samples=1000,
        metrics=['accuracy', 'kappa', 'kappa_t', 'kappa_m', 'f1', 'precision',
                 'recall', 'gmean', 'true_vs_predicted'],
        output_file=os.path.join(str(tmpdir), "prequential_summary.csv"),
    )

    # Evaluate
    evaluator.evaluate(stream=stream, model=learner)
    _, current_performance = evaluator.get_measurements(model_idx=0)

    assert np.isclose(current_performance.accuracy_score(), 0.685)
예제 #10
0
def test_active_learning_window_extraction():
    """Check that each extracted window has one row and at least one column."""
    stream = DataStream(pd.read_csv(METADB_PATH))
    learner = ActiveLearner(0.1,
                            stream,
                            HoeffdingTreeClassifier(),
                            store_history=True)

    def advance(steps):
        # Feed the learner `steps` more items from the stream.
        for _ in range(steps):
            learner.next_data()

    advance(1000)
    first_window = learner.get_last_window()

    advance(1000)
    second_window = learner.get_last_window(n_classes=5)

    print(first_window)
    print(second_window)

    for window in (first_window, second_window):
        assert window.shape[0] == 1
        assert window.shape[1] > 0
예제 #11
0
def test_active_learning_window_extraction_with_delta():
    """Check the accuracy delta reported with each extracted window.

    The expected delta is the summarised third field of the history entries
    minus ``last_window_acc``, both read before the window is extracted.
    """
    stream = DataStream(pd.read_csv(METADB_PATH))
    learner = ActiveLearner(0.1,
                            stream,
                            HoeffdingTreeClassifier(),
                            store_history=True)

    def advance(steps):
        # Feed the learner `steps` more items from the stream.
        for _ in range(steps):
            learner.next_data()

    # First window: history summarised with the mean.
    advance(1000)
    mean_acc = mean([entry[2] for entry in learner.history])
    previous_acc_1 = learner.last_window_acc
    first_window = learner.get_last_window(delta_acc_summary_func="mean")

    # Second window: history summarised with the maximum.
    advance(1000)
    max_acc = max([entry[2] for entry in learner.history])
    previous_acc_2 = learner.last_window_acc
    second_window = learner.get_last_window(n_classes=5, delta_acc_summary_func="max")

    print(first_window)
    print(second_window)

    for window in (first_window, second_window):
        assert window.shape[0] == 1
        assert window.shape[1] > 0

    assert mean_acc - previous_acc_1 == first_window["window_acc_delta"].to_numpy()[0]
    assert max_acc - previous_acc_2 == second_window["window_acc_delta"].to_numpy()[0]
    # The accuracy remembered before the second extraction is the first summary.
    assert previous_acc_2 == mean_acc
def test_hoeffding_tree_coverage():
    """Exercise memory management and the nominal attribute observer."""
    # Cover memory management
    sea_stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = sea_stream.next_sample(5000)

    constrained_tree = HoeffdingTreeClassifier(max_byte_size=30,
                                               memory_estimate_period=100,
                                               grace_period=10,
                                               leaf_prediction='mc')
    constrained_tree.partial_fit(X, y, classes=sea_stream.target_values)
    constrained_tree.reset()

    # Cover nominal attribute observer
    tree_stream = RandomTreeGenerator(tree_random_state=1,
                                      sample_random_state=1,
                                      n_num_features=0,
                                      n_categories_per_cat_feature=2)
    X, y = tree_stream.next_sample(1000)
    nominal_tree = HoeffdingTreeClassifier(leaf_prediction='mc',
                                           nominal_attributes=list(range(10)))
    nominal_tree.partial_fit(X, y, classes=tree_stream.target_values)
def test_evaluate_holdout_classifier(tmpdir, test_path):
    """Run a holdout evaluation of a Hoeffding tree and check its outputs.

    Verifies the returned learner, the summary file written to ``tmpdir``
    against the reference file in ``test_path``, the mean/current scores and
    the evaluator's textual description.
    """
    # Setup stream
    stream = RandomTreeGenerator(
        tree_random_state=23, sample_random_state=12, n_classes=4,
        n_cat_features=2, n_num_features=5, n_categories_per_cat_feature=5,
        max_tree_depth=6, min_leaf_depth=3, fraction_leaves_per_level=0.15)

    # Setup learner
    learner = HoeffdingTreeClassifier(
        nominal_attributes=list(range(15, len(stream.feature_names))))

    # Setup evaluator
    output_file = os.path.join(str(tmpdir), "holdout_summary.csv")
    evaluator = EvaluateHoldout(n_wait=200,
                                max_samples=1000,
                                test_size=50,
                                metrics=['accuracy', 'kappa', 'kappa_t'],
                                output_file=output_file)

    # Evaluate
    result_learner = evaluator.evaluate(stream=stream, model=learner)[0]

    assert isinstance(result_learner, HoeffdingTreeClassifier)
    assert learner.get_model_measurements == result_learner.get_model_measurements

    compare_files(output_file, os.path.join(test_path, 'holdout_summary.csv'))

    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    # (score getter, expected value), checked in order.
    expected_scores = (
        (mean_performance.accuracy_score, 0.344000),
        (mean_performance.kappa_score, 0.135021),
        (mean_performance.kappa_t_score, 0.180000),
        (current_performance.accuracy_score, 0.360000),
        (current_performance.kappa_score, 0.152542),
        (current_performance.kappa_t_score, 0.200000),
    )
    for score, expected in expected_scores:
        assert np.isclose(score(), expected)

    expected_info = (
        "EvaluateHoldout(batch_size=1, dynamic_test_set=False, max_samples=1000,\n"
        "                max_time=inf, metrics=['accuracy', 'kappa', 'kappa_t'],\n"
        "                n_wait=200,\n"
        "                output_file='holdout_summary.csv',\n"
        "                restart_stream=True, show_plot=False, test_size=50)"
    )
    assert evaluator.get_info() == expected_info
 def _init_ensemble_member(self):
     """Create one ensemble member around a randomly configured Hoeffding tree.

     Three tree parameters (grace period, split confidence, tie threshold)
     are each drawn independently from the module-level ``random`` generator.
     """
     def draw_step():
         # randint's upper bound is inclusive, so this yields 1..21.
         return random.randint(0, 20) + 1

     # NOTE(review): the largest draw gives grace_period=210 and a split
     # confidence / tie threshold of about 1.05 -- confirm the inclusive
     # upper bound is intended.
     # Keyword arguments are evaluated in order, so the draws happen as
     # grace period, then split confidence, then tie threshold.
     tree = HoeffdingTreeClassifier(
         grace_period=draw_step() * 10,
         split_confidence=draw_step() * 0.05,
         tie_threshold=draw_step() * 0.05)
     return MyEnsembleBaseLearner(classifier=tree)
예제 #15
0
def test_evaluate_prequential_classifier(tmpdir, test_path):
    """Run a prequential evaluation of a Hoeffding tree and check its outputs.

    Verifies the returned learner, the summary file written to ``tmpdir``
    against the reference file in ``test_path``, the mean/current scores and
    the evaluator's textual description.
    """
    # Setup stream
    stream = RandomTreeGenerator(
        tree_random_state=23, sample_random_state=12, n_classes=4,
        n_cat_features=2, n_num_features=5, n_categories_per_cat_feature=5,
        max_tree_depth=6, min_leaf_depth=3, fraction_leaves_per_level=0.15)

    # Setup learner
    learner = HoeffdingTreeClassifier(
        nominal_attributes=list(range(15, len(stream.feature_names))))

    # Setup evaluator
    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(max_samples=1000,
                                    metrics=['accuracy', 'kappa', 'kappa_t'],
                                    output_file=output_file)

    # Evaluate (the model is passed as a one-element list)
    result_learner = evaluator.evaluate(stream=stream, model=[learner])[0]

    assert isinstance(result_learner, HoeffdingTreeClassifier)
    assert learner.model_measurements == result_learner.model_measurements

    compare_files(output_file, os.path.join(test_path, 'prequential_summary.csv'))

    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    # Simple test. Tests for metrics are placed in the corresponding test module.
    # (score getter, expected value), checked in order.
    expected_scores = (
        (mean_performance.accuracy_score, 0.436250),
        (mean_performance.kappa_score, 0.231791),
        (mean_performance.kappa_t_score, 0.236887),
        (current_performance.accuracy_score, 0.430000),
        (current_performance.kappa_score, 0.223909),
        (current_performance.kappa_t_score, 0.240000),
    )
    for score, expected in expected_scores:
        assert np.isclose(score(), expected)

    expected_info = (
        "EvaluatePrequential(batch_size=1, data_points_for_classification=False,\n"
        "                    max_samples=1000, max_time=inf,\n"
        "                    metrics=['accuracy', 'kappa', 'kappa_t'], n_wait=200,\n"
        "                    output_file='prequential_summary.csv',\n"
        "                    pretrain_size=200, restart_stream=True, show_plot=False)"
    )
    assert evaluator.get_info() == expected_info
def demo():
    """Prequentially evaluate a Hoeffding tree on the local SEA stream file."""
    # The classifier we will use (other options: SAMKNNClassifier, LeverageBaggingClassifier, SGD)
    model = HoeffdingTreeClassifier()

    # Setup Stream
    stream = FileStream("../data/datasets/sea_stream.csv")

    evaluator = EvaluatePrequential(
        pretrain_size=100,
        output_file='test_filestream.csv',
        max_samples=10000,
        batch_size=1,
        n_wait=1000,
        show_plot=True,
    )
    evaluator.evaluate(stream=stream, model=model)
    def __init__(self, base_estimator=HoeffdingTreeClassifier(grace_period=50,
                                                              split_confidence=0.01),
                 n_estimators: int = 100,
                 subspace_mode: str = "percentage",
                 subspace_size: int = 60,
                 training_method: str = "randompatches",
                 lam: float = 6.0,
                 drift_detection_method: BaseDriftDetector = ADWIN(delta=1e-5),
                 warning_detection_method: BaseDriftDetector = ADWIN(delta=1e-4),
                 disable_weighted_vote: bool = False,
                 disable_drift_detection: bool = False,
                 disable_background_learner: bool = False,
                 nominal_attributes=None,
                 random_state=None):
        """Validate and store the hyper-parameters, then set up internal state.

        Raises
        ------
        ValueError
            If ``subspace_mode`` or ``training_method`` is not one of the
            options defined by the class constants.
        """
        # NOTE(review): the estimator and detector defaults in the signature
        # are instances created once, when the function is defined -- confirm
        # they are copied before being trained.
        self.base_estimator = base_estimator   # Not restricted to a specific base estimator.
        self.n_estimators = n_estimators

        supported_modes = {self._FEATURES_SQRT, self._FEATURES_SQRT_INV,
                           self._FEATURES_PERCENT, self._FEATURES_M}
        if subspace_mode not in supported_modes:
            mode_options = {self._FEATURES_M, self._FEATURES_SQRT,
                            self._FEATURES_SQRT_INV,
                            self._FEATURES_PERCENT}
            raise ValueError("Invalid subspace_mode: {}.\n"
                             "Valid options are: {}".format(subspace_mode, mode_options))
        self.subspace_mode = subspace_mode
        self.subspace_size = subspace_size

        supported_methods = {self._TRAIN_RESAMPLING, self._TRAIN_RANDOM_PATCHES,
                             self._TRAIN_RANDOM_SUBSPACES}
        if training_method not in supported_methods:
            method_options = {self._TRAIN_RANDOM_PATCHES,
                              self._TRAIN_RANDOM_SUBSPACES,
                              self._TRAIN_RESAMPLING}
            raise ValueError("Invalid training_method: {}.\n"
                             "Valid options are: {}".format(training_method, method_options))
        self.training_method = training_method

        self.lam = lam
        self.drift_detection_method = drift_detection_method
        self.warning_detection_method = warning_detection_method
        self.disable_weighted_vote = disable_weighted_vote
        self.disable_drift_detection = disable_drift_detection
        self.disable_background_learner = disable_background_learner
        # Single option (accuracy) for drift detection criteria. Could be extended in the future.
        self.drift_detection_criteria = 'accuracy'
        # Any falsy value (None, empty list, ...) is stored as a new empty list.
        self.nominal_attributes = nominal_attributes or []
        self.random_state = random_state
        # self._random_state is the actual object used internally
        self._random_state = check_random_state(self.random_state)
        self.ensemble = None

        self._n_samples_seen = 0
        self._subspaces = None

        self._base_performance_evaluator = ClassificationPerformanceEvaluator()
        self._base_learner_class = StreamingRandomPatchesBaseLearner
def test_hoeffding_tree_coverage():
    """Cover memory management: the trained tree must fit in the size budget."""
    size_budget_kb = 50
    n_samples = 5000
    stream = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=10,
        n_cat_features=2,
        n_num_features=5,
        n_categories_per_cat_feature=5,
        max_tree_depth=15,
        min_leaf_depth=3,
        fraction_leaves_per_level=0.15,
    )

    # Unconstrained model has over 72 kB
    learner = HoeffdingTreeClassifier(
        nominal_attributes=list(range(5, stream.n_features)),
        leaf_prediction='mc',
        memory_estimate_period=100,
        max_byte_size=size_budget_kb * 2**10,  # budget expressed in bytes
    )

    X, y = stream.next_sample(n_samples)
    learner.partial_fit(X, y)

    assert calculate_object_size(learner, 'kB') <= size_budget_kb

    learner.reset()
예제 #19
0
파일: coupled_ML.py 프로젝트: Met4FoF/Code
 def init_parameters(self,
                     mode="prequential",
                     ml_model=None,
                     split_type=None):
     """Store the evaluation mode, the model and the splitter on the agent.

     Parameters
     ----------
     mode
         Stored as ``self.mode``. Defaults to ``"prequential"``.
     ml_model
         Model stored as ``self.ml_model``. When ``None`` (the default) a
         new ``HoeffdingTreeClassifier()`` is created for this call.
     split_type
         Splitter stored as ``self.split_type``. When ``None`` (the default)
         a ``StratifiedKFold(n_splits=5, shuffle=True, random_state=0)`` is
         used.
     """
     self.mode = mode
     # A model instance written in the signature would be built once, when
     # the function is defined, and then shared by every call relying on the
     # default. Building it here gives each such call its own model.
     self.ml_model = HoeffdingTreeClassifier() if ml_model is None else ml_model
     self.results = []
     if split_type is None:
         split_type = StratifiedKFold(n_splits=5,
                                      shuffle=True,
                                      random_state=0)
     self.split_type = split_type
예제 #20
0
def test_evaluate_delayed_classification_single_time_delay(tmpdir):
    """Coverage test: delayed prequential evaluation with a single time delay."""
    generator = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=2,
        n_cat_features=2,
        n_num_features=5,
        n_categories_per_cat_feature=5,
        max_tree_depth=6,
        min_leaf_depth=3,
        fraction_leaves_per_level=0.15,
    )
    # Number of samples to use
    max_samples = 1000

    # Get X and y
    X, y = generator.next_sample(max_samples)
    y = y.astype(int)
    timestamps = generate_random_dates(seed=1, samples=max_samples)

    # Setup temporal stream; every sample carries the same 30-day delay
    stream = TemporalDataStream(X,
                                y,
                                timestamps,
                                sample_delay=np.timedelta64(30, "D"),
                                ordered=False)

    # Setup learner
    learner = HoeffdingTreeClassifier(
        nominal_attributes=list(range(15, len(generator.feature_names))))

    evaluator = EvaluatePrequentialDelayed(
        max_samples=max_samples,
        metrics=['accuracy', 'kappa', 'kappa_t', 'kappa_m', 'f1', 'precision',
                 'recall', 'gmean', 'true_vs_predicted'],
        output_file=os.path.join(str(tmpdir), "prequential_delayed_summary.csv"),
    )

    # Evaluate
    evaluator.evaluate(stream=stream, model=learner)
    _, current_performance = evaluator.get_measurements(model_idx=0)

    assert np.isclose(current_performance.accuracy_score(), 0.715)
예제 #21
0
def demo():
    """Prequentially evaluate a Hoeffding tree on the remote SEA stream file."""
    # The classifier we will use (other options: SAMKNNClassifier, LeveragingBaggingClassifier, SGD)
    model = HoeffdingTreeClassifier()

    # Setup Stream
    dataset_url = ("https://raw.githubusercontent.com/scikit-multiflow/streaming-datasets/"
                   "master/sea_stream.csv")
    stream = FileStream(dataset_url)

    evaluator = EvaluatePrequential(
        pretrain_size=100,
        output_file='test_filestream.csv',
        max_samples=10000,
        batch_size=1,
        n_wait=1000,
        show_plot=True,
    )
    evaluator.evaluate(stream=stream, model=model)
def demo(output_file=None, instances=40000):
    """ _test_comparison_holdout

    Runs a holdout evaluation in which the learners are supplied as a list,
    the form used for comparing several learners (a single Hoeffding tree is
    active here).

    Parameters
    ----------
    output_file: string, optional
        Name of the output file, passed to the evaluator. Defaults to None.

    instances: int (Default: 40000)
        The evaluation's maximum number of instances.

    """
    # Setup the stream (alternative: FileStream("../data/datasets/covtype.csv", -1, 1))
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the learners. Other candidates that were tried:
    # KNNADWINClassifier(n_neighbors=8, max_window_size=2000),
    # PassiveAggressiveClassifier(), SGDRegressor(), PerceptronMask()
    learners = [HoeffdingTreeClassifier()]

    # Setup the evaluator
    evaluator = EvaluateHoldout(
        test_size=500,
        dynamic_test_set=True,
        max_samples=instances,
        batch_size=1,
        n_wait=5000,
        max_time=1000,
        output_file=output_file,
        show_plot=True,
        metrics=['kappa'],
    )

    # Evaluate
    evaluator.evaluate(stream=stream, model=learners)
예제 #23
0
def test_set_params():
    """Check that set_params changes the learner's reported configuration."""
    learner = HoeffdingTreeClassifier()
    info_before = learner.get_info()

    # Start from the current parameters and override three of them.
    new_params = learner.get_params()
    new_params.update({
        'leaf_prediction': 'nb',
        'split_criterion': 'gini',
        'remove_poor_atts': True,
    })
    learner.set_params(**new_params)

    info_after = learner.get_info()
    assert info_before != info_after

    expected_info = (
        "HoeffdingTreeClassifier(binary_split=False, grace_period=200, leaf_prediction='nb', "
        "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, "
        "nominal_attributes=None, remove_poor_atts=True, split_confidence=1e-07, split_criterion='gini', "
        "stop_mem_management=False, tie_threshold=0.05)"
    )
    # Collapse all whitespace (including line breaks) to single spaces.
    tokens = learner.get_info().split()
    info = " ".join(token.strip() for token in tokens)
    assert info == expected_info
예제 #24
0
    def __init__(self,
                 min_estimators=5,
                 max_estimators=20,
                 base_estimators=None,
                 period=1000,
                 alpha=0.002,
                 beta=1.5,
                 theta=0.05,
                 enable_diversity=True):
        """
        Creates a new instance of DiversifiedDynamicClassWeightedClassifier.

        Parameters
        ----------
        min_estimators, max_estimators
            Stored on the instance; ``n_estimators`` starts at
            ``max_estimators``.
        base_estimators
            List of estimator instances stored as ``self.base_estimators``.
            When ``None`` (the default), a new
            ``[NaiveBayes(), HoeffdingTreeClassifier()]`` list is built for
            this instance.
        period, alpha, beta, theta, enable_diversity
            Stored on the instance unchanged.
        """
        super().__init__()

        # A list literal in the signature would be created once and shared
        # (together with the estimator objects inside it) by every instance
        # built with the default. Build a separate list per instance instead.
        if base_estimators is None:
            base_estimators = [NaiveBayes(), HoeffdingTreeClassifier()]

        self.enable_diversity = enable_diversity
        self.min_estimators = min_estimators
        self.max_estimators = max_estimators
        self.base_estimators = base_estimators

        self.alpha = alpha
        self.beta = beta
        self.theta = theta
        self.period = period

        self.p = -1

        self.n_estimators = max_estimators
        self.epochs = None
        self.num_classes = None
        self.experts = None
        self.div = []

        self.window_size = None
        self.X_batch = None
        self.y_batch = None
        self.y_batch_experts = None

        # custom measurements attributes
        self.custom_measurements = []
        self.custom_time = []

        self.reset()
def demo(output_file=None, instances=40000):
    """ _test_prequential_bagging

    Prequentially evaluates a LeverageBaggingClassifier built on Hoeffding
    tree base estimators, using a WaveformGenerator stream.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the stream. Alternatives that were tried:
    #   FileStream("../data/datasets/sea_big.csv", -1, 1)
    #   SEAGenerator(classification_function=2, noise_percentage=0.0)
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier. Alternatives that were tried:
    #   OzaBaggingADWINClassifier(base_estimator=KNNClassifier(n_neighbors=8, max_window_size=2000,
    #                                                          leaf_size=30))
    #   LeverageBaggingClassifier(base_estimator=KNNClassifier(n_neighbors=8, max_window_size=2000,
    #                                                          leaf_size=30),
    #                             n_estimators=1)
    # The ensemble is evaluated directly rather than wrapped in a Pipeline
    # (Pipeline([('Classifier', classifier)])).
    ensemble = LeverageBaggingClassifier(base_estimator=HoeffdingTreeClassifier(),
                                         n_estimators=2)

    # Setup the evaluator
    evaluator = EvaluatePrequential(
        pretrain_size=2000,
        max_samples=instances,
        output_file=output_file,
        show_plot=False,
    )

    # Evaluate
    evaluator.evaluate(stream=stream, model=ensemble)
def test_hoeffding_tree_nb(test_path):
    """Check HoeffdingTreeClassifier with ``leaf_prediction='nb'`` against stored outputs.

    The learner is trained one sample at a time on a seeded
    RandomTreeGenerator stream. Every ``wait_samples`` samples (sample 0
    excluded) it predicts the incoming sample *before* learning from it.
    The collected class predictions and the learner's ``get_info()`` string
    are then compared with the expected values recorded below.

    Parameters
    ----------
    test_path
        Not used by this test (presumably a shared pytest fixture).
    """
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=4,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)

    # Features from index 5 onwards are declared nominal to the learner.
    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx,
                                      leaf_prediction='nb')

    cnt = 0
    max_samples = 5000
    predictions = array('i')
    proba_predictions = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples (test-then-train: predict before partial_fit)
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            # Collected only to exercise predict_proba; not compared below.
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1
    expected_predictions = array('i', [
        0, 1, 3, 0, 0, 3, 0, 1, 1, 2, 0, 2, 1, 1, 2, 1, 3, 0, 1, 1, 1, 1, 0, 3,
        1, 2, 1, 1, 3, 2, 1, 2, 2, 2, 1, 1, 1, 0, 1, 2, 0, 2, 0, 0, 0, 0, 1, 3,
        2
    ])

    # np.all replaces np.alltrue, a deprecated alias removed in NumPy 2.0.
    assert np.all(predictions == expected_predictions)

    expected_info = "HoeffdingTreeClassifier(binary_split=False, grace_period=200, leaf_prediction='nb', " \
                    "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, " \
                    "nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14], remove_poor_atts=False, " \
                    "split_confidence=1e-07, split_criterion='info_gain', stop_mem_management=False, " \
                    "tie_threshold=0.05)"
    # Collapse every whitespace run (including line breaks) to a single space.
    info = " ".join(learner.get_info().split())
    assert info == expected_info
def run_one_test(test_idx):
    """Run one test round and store its results as a csv file.

    Creates the test files for ``test_idx``, pretrains a Hellinger-criterion
    Hoeffding tree on ``dataset/htru/train_200_1000.csv``, evaluates four
    independent copies of that tree via ``get_tree_results`` and writes one
    line per result to ``results/htru/HDVFDT/results_<test_idx>.csv``.

    Parameters
    ----------
    test_idx
        Identifier of the test round; forwarded to ``create_all_test_files``
        and used in the name of the result file.

    Returns
    -------
    list
        The values returned by ``get_tree_results``, in evaluation order.
    """
    # Prepare the test files for this round.
    create_all_test_files(test_idx)

    # Base tree, pretrained on 200 pos and 1000 neg.
    base_tree = HoeffdingTreeClassifier(binary_split=True,
                                        grace_period=200,
                                        split_confidence=0.0000001,
                                        tie_threshold=0.05,
                                        split_criterion="hellinger")
    train_tree(Path("dataset") / "htru" / "train_200_1000.csv", base_tree)
    print("\n")

    # One independent copy of the pretrained tree per evaluation setting.
    # NOTE(review): the meaning of these values is defined by
    # get_tree_results, which is not visible here.
    settings = (10, 100, 1000, 10000)
    tree_copies = [copy.deepcopy(base_tree) for _ in settings]

    # Do tests and get fscore and gm.
    results = [
        get_tree_results(tree, setting)
        for tree, setting in zip(tree_copies, settings)
    ]

    # Each result is written as its str() form minus the first and last
    # character, followed by a newline.
    csv_text = "".join(str(row)[1:-1] + "\n" for row in results)

    target = Path("results") / "htru" / "HDVFDT" / "results_{!s}.csv".format(
        test_idx)
    with open(target, "w") as handle:
        handle.write(csv_text)

    return results
# Example #28
# 0
def train(name, clusters, window, normalize=False):
    """Evaluate a Hoeffding tree and an SGD classifier on a prepared csv.

    Loads ``<DATA_LOCATION><name>_clusters=<clusters>_window=<window>_prepared.csv``,
    optionally standardizes every column except ``current_state`` and
    ``next_state``, runs a prequential evaluation of both models on the
    resulting stream and collects their mean measurements.

    Parameters
    ----------
    name, clusters, window
        Used to build the input file name and copied into each result row.
    normalize: bool
        When True, the non-state columns are scaled with StandardScaler
        before the stream is built.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``evaluator.get_mean_measurements()``, with
        columns name, clusters, window, model, normalized, accuracy,
        precision, recall and f1.
    """
    csv_path = '{loc}{name}_clusters={clusters}_window={window}_prepared.csv'.format(
        loc=DATA_LOCATION, name=name, clusters=clusters, window=window)
    frame = pd.read_csv(csv_path, index_col=0)

    if normalize:
        state_columns = ['current_state', 'next_state']
        states = frame.filter(state_columns)
        sensors = frame.drop(columns=state_columns)
        # Scale only the sensor columns, then re-attach the state columns.
        scaled = pd.DataFrame(data=StandardScaler().fit_transform(X=sensors),
                              index=frame.index,
                              columns=sensors.columns)
        frame = pd.concat([scaled, states], axis='columns')

    stream = DataStream(frame)
    models = [HoeffdingTreeClassifier(), SGDClassifier()]

    evaluator = EvaluatePrequential()
    evaluator.evaluate(stream=stream, model=models)

    # MODEL_NAMES is indexed by position in the evaluator's measurements.
    rows = [
        [
            name, clusters, window, MODEL_NAMES[position], normalize,
            measurement.accuracy_score(),
            measurement.precision_score(),
            measurement.recall_score(),
            measurement.f1_score(),
        ]
        for position, measurement in enumerate(
            evaluator.get_mean_measurements())
    ]
    header = [
        'name', 'clusters', 'window', 'model', 'normalized',
        'accuracy', 'precision', 'recall', 'f1',
    ]
    return pd.DataFrame(data=rows, columns=header)
def test_data_stream(test_path):
    """Smoke test: prequential evaluation of five learners on a DataStream.

    Reads ``data/data_n30000.csv`` under ``test_path``, wraps it in a
    DataStream and evaluates three KNN variants, a Hoeffding tree and a
    naive Bayes learner, writing the evaluator output to
    ``data/kkn_output.csv``. No measurement values are checked; the test
    passes when the evaluation and the retrieval of the measurements
    complete without raising.

    Parameters
    ----------
    test_path
        Directory that contains the ``data`` folder used for input and output.
    """
    raw_data = pd.read_csv(os.path.join(test_path, 'data/data_n30000.csv'))
    stream = DataStream(raw_data, name='Test')

    # Settings shared by all three KNN-based learners.
    knn_settings = dict(n_neighbors=8, max_window_size=2000, leaf_size=40)
    learners = [
        KNNClassifier(**knn_settings),
        WeightedKNNClassifier(**knn_settings),
        KNNClassifier(standardize=True, **knn_settings),
        # Features from index 15 onwards are declared nominal.
        HoeffdingTreeClassifier(
            nominal_attributes=list(range(15, len(stream.feature_names)))),
        NaiveBayes(),
    ]

    evaluator = EvaluatePrequential(
        metrics=['accuracy', 'kappa_m', 'kappa_t', 'recall'],
        output_file=os.path.join(test_path, 'data/kkn_output.csv'))

    # Evaluate
    evaluator.evaluate(stream=stream, model=learners)

    # The two-way unpack fails if get_measurements() does not yield exactly
    # two items; the values themselves are not inspected.
    _mean_performance, _current_performance = evaluator.get_measurements()
# Example #30
# 0
# File: coupled_ML.py  Project: Met4FoF/Code
def main():
    """Build and start an agent network with one stream, two models and a monitor.

    A DataStreamAgent fed by a WaveformGenerator is bound to two ML_Model
    agents (one wrapping a HoeffdingTreeClassifier, one wrapping NaiveBayes),
    and both model agents are bound to a single MonitorAgent. The network is
    stored in the module-level ``agentNetwork`` and switched to its running
    state.

    Returns
    -------
    The created AgentNetwork, so the caller can shut it down after execution.
    """
    global agentNetwork

    # Start the agent network.
    agentNetwork = AgentNetwork()
    network = agentNetwork

    # Add the agents.
    stream_agent = network.add_agent(agentType=DataStreamAgent)
    tree_agent = network.add_agent(agentType=ML_Model)
    bayes_agent = network.add_agent(agentType=ML_Model)
    monitor_agent = network.add_agent(agentType=MonitorAgent)

    # Initialise the agents' parameters.
    stream_agent.init_parameters(stream=WaveformGenerator(),
                                 pretrain_size=1000,
                                 batch_size=100)
    tree_agent.init_parameters(ml_model=HoeffdingTreeClassifier())
    bayes_agent.init_parameters(ml_model=NaiveBayes())

    # Connect stream -> models first, then models -> monitor.
    model_agents = (tree_agent, bayes_agent)
    for model_agent in model_agents:
        network.bind_agents(stream_agent, model_agent)
    for model_agent in model_agents:
        network.bind_agents(model_agent, monitor_agent)

    network.set_running_state()

    # Returned so the network can be shut down after execution.
    return network