예제 #1
0
def test_hoeffding_tree(test_path):
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12, n_classes=4, n_cat_features=2,
                                 n_num_features=5, n_categories_per_cat_feature=5, max_tree_depth=6, min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    nominal_attr_idx = [x for x in range(5, stream.n_features)]
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    cnt = 0
    max_samples = 5000
    predictions = array('d')
    proba_predictions = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('d', [0.0, 0.0, 1.0, 3.0, 0.0, 0.0, 3.0, 0.0, 1.0, 1.0,
                                       2.0, 0.0, 2.0, 1.0, 1.0, 2.0, 1.0, 3.0, 0.0, 1.0,
                                       1.0, 1.0, 1.0, 0.0, 3.0, 1.0, 2.0, 1.0, 1.0, 3.0,
                                       2.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 0.0, 1.0,
                                       2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 3.0, 2.0])

    test_file = os.path.join(test_path, 'test_hoeffding_tree.npz')

    data = np.load(test_file)
    expected_proba_predictions_0 = data["a"]
    expected_proba_predictions_1 = data["b"]

    assert np.alltrue(predictions == expected_predictions)

    assert np.alltrue(proba_predictions == expected_proba_predictions_0) or \
           np.alltrue(proba_predictions == expected_proba_predictions_1)
    assert np.alltrue(predictions == expected_predictions)

    expected_info = 'HoeffdingTree: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200 ' \
                    '- split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05 ' \
                    '- binary_split: False - stop_mem_management: False - remove_poor_atts: False ' \
                    '- no_pre_prune: False - leaf_prediction: nba - nb_threshold: 0 - nominal_attributes: [5, 6, 7,' \
                    ' 8, 9, 10, 11, 12, 13, 14] - '
    assert learner.get_info() == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 1423.0, 1.0: 1745.0, 2.0: 978.0, 3.0: 854.0}\n'
    expected_model_2 = 'Leaf = Class 1.0 | {1.0: 1745.0, 2.0: 978.0, 0.0: 1423.0, 3.0: 854.0}\n'
    assert (learner.get_model_description() == expected_model_1) \
           or (learner.get_model_description() == expected_model_2)
 def filter_instance_to_leaves(self,
                               X,
                               y,
                               weight,
                               parent,
                               parent_branch,
                               update_splitter_counts=False,
                               found_nodes=None):
     if found_nodes is None:
         found_nodes = []
     if update_splitter_counts:
         try:
             self._observed_class_distribution[
                 y] += weight  # Dictionary (class_value, weight)
         except KeyError:
             self._observed_class_distribution[y] = weight
     child_index = self.instance_child_index(X)
     if child_index >= 0:
         child = self.get_child(child_index)
         if child is not None:
             child.filter_instance_to_leaves(X, y, weight, parent,
                                             parent_branch,
                                             update_splitter_counts,
                                             found_nodes)
         else:
             found_nodes.append(
                 HoeffdingTree.FoundNode(None, self, child_index))
     if self._alternate_tree is not None:
         self._alternate_tree.filter_instance_to_leaves(
             X, y, weight, self, -999, update_splitter_counts,
             found_nodes)
def test_evaluate_prequential_classifier(tmpdir, test_path):
    # Setup file stream
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=4,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Setup learner
    nominal_attr_idx = [x for x in range(15, len(stream.feature_names))]
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    # Setup evaluator
    max_samples = 1000
    metrics = ['kappa', 'kappa_t', 'performance']
    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(max_samples=max_samples,
                                    metrics=metrics,
                                    output_file=output_file)

    # Evaluate
    result = evaluator.evaluate(stream=stream, model=learner)
    result_learner = result[0]

    assert isinstance(result_learner, HoeffdingTree)

    assert learner.get_model_measurements == result_learner.get_model_measurements

    expected_file = os.path.join(test_path, 'prequential_summary.csv')
    compare_files(output_file, expected_file)
 def filter_instance_to_leaves(self,
                               X,
                               split_parent,
                               parent_branch,
                               update_splitter_counts,
                               found_nodes=None):
     if found_nodes is None:
         found_nodes = []
     found_nodes.append(
         HoeffdingTree.FoundNode(self, split_parent, parent_branch))
def demo():

    # The classifier we will use (other options: SAMKNN, LeverageBagging, SGD)
    h = [HoeffdingTree(), SAMKNN(), LeverageBagging(), SGDClassifier()]

    # Demo 1 -- plot should not fail
    demo_parameterized(h)

    # Demo 2 -- csv output should look nice
    demo_parameterized(h, "sea_stream.csv", False)

    # Demo 3 -- should not give "'NoneType' object is not iterable" error
    demo_parameterized(h, "covtype.csv", False)
예제 #6
0
def test_hoeffding_tree():
    stream = RandomTreeGenerator(tree_seed=23,
                                 instance_seed=12,
                                 n_classes=4,
                                 n_nominal_attributes=2,
                                 n_numerical_attributes=5,
                                 n_values_per_nominal=5,
                                 max_depth=6,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    nominal_attr_idx = [
        x for x in range(15, len(stream.get_attributes_header()))
    ]
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    cnt = 0
    max_samples = 5000
    predictions = array('d')
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_instance()
        # Test every n samples
        if cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('d', [
        0.0, 3.0, 2.0, 1.0, 1.0, 2.0, 0.0, 2.0, 0.0, 3.0, 3.0, 0.0, 1.0, 0.0,
        0.0, 1.0, 1.0, 1.0, 0.0, 3.0, 0.0, 1.0, 1.0, 0.0, 2.0, 2.0, 1.0, 2.0,
        0.0, 0.0, 0.0, 2.0, 1.0, 0.0, 2.0, 0.0, 2.0, 2.0, 0.0, 1.0, 1.0, 3.0,
        1.0, 0.0, 3.0, 0.0, 1.0, 1.0, 0.0, 0.0
    ])
    assert np.alltrue(predictions == expected_predictions)

    expected_info = 'HoeffdingTree: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200 ' \
                    '- split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05 ' \
                    '- binary_split: False - stop_mem_management: False - remove_poor_atts: False ' \
                    '- no_pre_prune: False - leaf_prediction: nba - nb_threshold: 0 - nominal_attributes: [] - '
    assert learner.get_info() == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 1384.0, 1.0: 1720.0, 2.0: 1005.0, 3.0: 891.0}\n'
    expected_model_2 = 'Leaf = Class 1.0 | {1.0: 1720.0, 2.0: 1005.0, 0.0: 1384.0, 3.0: 891.0}\n'
    assert (learner.get_model_description() == expected_model_1) \
           or (learner.get_model_description() == expected_model_2)
def demo(output_file=None, instances=40000):
    """ _test_prequential_bagging
    
    This demo shows the evaluation process of a LeverageBagging classifier, 
    initialized with KNN classifiers.
    
    Parameters
    ----------
    output_file: string
        The name of the csv output file
    
    instances: int
        The evaluation's max number of instances
    
    """
    # Setup the File Stream
    # opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_big.csv", "CSV", False)
    # stream = FileStream(opt, -1, 1)
    stream = SEAGenerator(classification_function=2,
                          instance_seed=755437,
                          noise_percentage=0.0)
    stream.prepare_for_use()

    # Setup the classifier
    #classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    #classifier = LeverageBagging(h=KNN(k=8, max_window_size=2000, leaf_size=30), ensemble_length=1)
    classifier = LeverageBagging(h=HoeffdingTree(), ensemble_length=2)

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    eval = EvaluatePrequential(
        pretrain_size=2000,
        max_instances=instances,
        batch_size=1,
        n_wait=200,
        max_time=1000,
        output_file=output_file,
        task_type='classification',
        show_plot=False,
        plot_options=['kappa', 'kappa_t', 'performance'])

    # Evaluate
    eval.eval(stream=stream, classifier=pipe)
예제 #8
0
def demo():

    # The classifier we will use (other options: SAMKNN, LeverageBagging, SGD)
    h = HoeffdingTree()

    # Setup Stream
    stream = FileStream("../datasets/sea_stream.csv", -1, 1)
    stream.prepare_for_use()

    pretrain = 100
    evaluator = EvaluatePrequential(pretrain_size=pretrain,
                                    output_file='output.csv',
                                    max_samples=10000,
                                    batch_size=1,
                                    n_wait=1000,
                                    show_plot=True,
                                    metrics=['performance'])
    evaluator.evaluate(stream=stream, model=h)
예제 #9
0
def demo():

    # The classifier we will use (other options: SAMKNN, LeverageBagging, SGD)
    h = HoeffdingTree()

    # Setup Stream
    opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_stream.csv", "CSV",
                     False)
    stream = FileStream(opt, -1, 1)
    stream.prepare_for_use()

    T_init = 100
    eval = EvaluatePrequential(pretrain_size=T_init,
                               output_file='output.csv',
                               max_instances=10000,
                               batch_size=1,
                               n_wait=1000,
                               task_type='classification',
                               show_plot=True,
                               plot_options=['performance'])
    eval.eval(stream=stream, classifier=h)
def demo(output_file=None, instances=40000):
    """ _test_prequential_bagging
    
    This demo shows the evaluation process of a LeverageBagging classifier, 
    initialized with KNN classifiers.
    
    Parameters
    ----------
    output_file: string
        The name of the csv output file
    
    instances: int
        The evaluation's max number of instances
    
    """
    # Setup the File Stream
    # stream = FileStream("../datasets/sea_big.csv", -1, 1)
    #stream = SEAGenerator(classification_function=2, noise_percentage=0.0)
    #stream.prepare_for_use()
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    #classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    #classifier = LeverageBagging(h=KNN(k=8, max_window_size=2000, leaf_size=30), ensemble_length=1)
    pipe = LeverageBagging(h=HoeffdingTree(), ensemble_length=2)

    # Setup the pipeline
    #pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=2000,
                                    max_samples=instances,
                                    output_file=output_file,
                                    show_plot=False)

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)
예제 #11
0
def demo():
    """ _test_pipeline
    
    This demo demonstrates the Pipeline structure seemingly working as a 
    learner, while being passed as parameter to an EvaluatePrequential 
    object.
     
    """
    # # Setup the stream
    # opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    # stream = FileStream(opt, -1, 1)
    # stream.prepare_for_use()
    # # If used for Hoeffding Trees then need to pass indices for Nominal attributes

    # Test with RandomTreeGenerator
    # stream = RandomTreeGenerator(n_classes=2, n_numerical_attributes=5)
    # stream.prepare_for_use()

    # Test with WaveformGenerator
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    #classifier = PerceptronMask()
    #classifier = NaiveBayes()
    #classifier = PassiveAggressiveClassifier()
    classifier = HoeffdingTree()

    # Setup the pipeline
    pipe = Pipeline([('Hoeffding Tree', classifier)])

    # Setup the evaluator
    eval = EvaluatePrequential(show_plot=True,
                               pretrain_size=1000,
                               max_instances=100000)

    # Evaluate
    eval.eval(stream=stream, classifier=pipe)
        def filter_instance_to_leaves(self,
                                      X,
                                      parent,
                                      parent_branch,
                                      update_splitter_counts,
                                      found_nodes=None):
            if found_nodes is None:
                found_nodes = []

            child_index = self.instance_child_index(X)

            if child_index >= 0:
                child = self.get_child(child_index)

                if child is not None:
                    child.filter_instance_to_leaves(X, parent, parent_branch,
                                                    update_splitter_counts,
                                                    found_nodes)
                else:
                    found_nodes.append(
                        HoeffdingTree.FoundNode(None, self, child_index))
            if self._alternate_tree is not None:
                self._alternate_tree.filter_instance_to_leaves(
                    X, self, -999, update_splitter_counts, found_nodes)
from skmultiflow.options.file_option import FileOption

from my_classifier import BatchClassifier

dataset = "elec"

# 1. Create a stream
opt = FileOption("FILE", "OPT_NAME", "./data/" + dataset + ".csv", "CSV",
                 False)
stream = FileStream(opt, -1, 1)
# 2. Prepare for use
stream.prepare_for_use()
# 2. Instantiate the HoeffdingTree classifier
h = [
    KNN(k=10, max_window_size=100, leaf_size=30),
    HoeffdingTree(),
    BatchClassifier(window_size=100, max_models=10),
]
# 3. Setup the evaluator
eval = EvaluatePrequential(pretrain_size=1000,
                           output_file='result_' + dataset + '.csv',
                           max_instances=10000,
                           batch_size=1,
                           n_wait=500,
                           max_time=1000000000,
                           task_type='classification',
                           show_plot=True,
                           plot_options=['performance'])
# 4. Run
eval.eval(stream=stream, classifier=h)
예제 #14
0
from skmultiflow.core.pipeline import Pipeline
from skmultiflow.data.file_stream import FileStream
from skmultiflow.options.file_option import FileOption
from skmultiflow.evaluation.evaluate_prequential import EvaluatePrequential
from skmultiflow.classification.trees.hoeffding_tree import HoeffdingTree

# Setup the File Stream
#opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
#opt = FileOption("FILE", "OPT_NAME", "../datasets/movingSquares.csv", "CSV", False)
opt = FileOption("FILE", "OPT_NAME", "../datasets/sea_stream.csv", "CSV",
                 False)
stream = FileStream(opt, -1, 1)
stream.prepare_for_use()

# Setup the classifiers
clf_one = HoeffdingTree()
clf_two = AdaptiveRandomForest()

# Setup the pipeline for clf_one
pipe = Pipeline([('Classifier', clf_one)])

# Create the list to hold both classifiers
classifier = [pipe, clf_two]

# Setup the evaluator
eval = EvaluatePrequential(pretrain_size=200,
                           max_instances=100000,
                           batch_size=1,
                           max_time=1000,
                           output_file='comparison_Hoeffding_ADFH_Preq.csv',
                           task_type='classification',