예제 #1
0
def test_hoeffding_tree_coverage():
    """Smoke-test two HoeffdingTree code paths; passes if nothing raises."""
    # Memory management: a very small max_byte_size with a short
    # memory_estimate_period and grace_period.
    sea_stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    sea_stream.prepare_for_use()
    X, y = sea_stream.next_sample(5000)

    memory_options = {
        'max_byte_size': 30,
        'memory_estimate_period': 100,
        'grace_period': 10,
        'leaf_prediction': 'mc',
    }
    tree = HoeffdingTree(**memory_options)
    tree.partial_fit(X, y, classes=sea_stream.target_values)
    tree.reset()

    # Nominal attribute observer: a stream without numeric features, with
    # the first ten attribute indices declared nominal.
    tree_stream = RandomTreeGenerator(tree_random_state=1,
                                      sample_random_state=1,
                                      n_num_features=0,
                                      n_categories_per_cat_feature=2)
    tree_stream.prepare_for_use()
    X, y = tree_stream.next_sample(1000)

    tree = HoeffdingTree(leaf_prediction='mc',
                         nominal_attributes=list(range(10)))
    tree.partial_fit(X, y, classes=tree_stream.target_values)
예제 #2
0
def test_hoeffding_tree_model_information():
    """Check the measurements and the textual description of a fitted tree."""
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    stream.prepare_for_use()
    X, y = stream.next_sample(5000)

    # Every index from 5 up to the number of features is declared nominal.
    learner = HoeffdingTree(
        nominal_attributes=list(range(5, stream.n_features)))
    learner.partial_fit(X, y, classes=stream.target_values)

    expected_info = {
        'Tree size (nodes)': 5,
        'Tree size (leaves)': 3,
        'Active learning nodes': 3,
        'Tree depth': 2,
        'Active leaf byte size estimate': 0.0,
        'Inactive leaf byte size estimate': 0.0,
        'Byte size estimate overhead': 1.0
    }

    # Accessed without a call, exactly as the test has always done.
    observed_info = learner.get_model_measurements
    for key, expected_value in expected_info.items():
        assert key in observed_info
        assert expected_value == observed_info[key]

    expected_description = (
        "if Attribute 0 <= 4.549969620513424:\n"
        "  if Attribute 1 <= 5.440182925299016:\n"
        "    Leaf = Class 0 | {0: 345.54817975126275, 1: 44.43855503614928}\n"
        "  if Attribute 1 > 5.440182925299016:\n"
        "    Leaf = Class 1 | {0: 54.451820248737235, 1: 268.5614449638507}\n"
        "if Attribute 0 > 4.549969620513424:\n"
        "  Leaf = Class 1 | {0: 390.5845685762964, 1: 2372.3747376855454}\n"
    )

    assert expected_description == learner.get_model_description()
def demo():
    """Run demo_parameterized three times, each with its own set of models."""

    def fresh_models():
        # A new list of new instances per run, so no run reuses a model
        # that a previous run has already been handed.
        return [
            HoeffdingTree(),
            SAMKNN(),
            LeverageBagging(random_state=1),
            SGDClassifier(),
        ]

    # All three sets are created up front, before any run starts.
    first_run = fresh_models()
    second_run = fresh_models()
    third_run = fresh_models()
    model_names = ['HT', 'SAMKNN', 'LBkNN', 'SGDC']

    # Demo 1 -- plot should not fail
    demo_parameterized(first_run, model_names=model_names)

    # Demo 2 -- csv output should look nice
    demo_parameterized(second_run, "sea_stream.csv", False, model_names)

    # Demo 3 -- should not give "'NoneType' object is not iterable" error
    demo_parameterized(third_run, "covtype.csv", False, model_names)
예제 #4
0
def test_hoeffding_tree_categorical_features(test_path):
    """Fit a tree on stored categorical data and compare its description."""
    data = np.load(
        os.path.join(test_path, 'ht_categorical_features_testcase.npy'))
    # Drop the last two columns (regression targets); of the remaining
    # columns the final one is the class label, the rest are features.
    data = data[:, :-2]
    X = data[:, :-1]
    y = data[:, -1]

    learner = HoeffdingTree(nominal_attributes=np.arange(7).tolist())
    learner.partial_fit(X, y, classes=np.unique(y))

    expected_description = (
        "if Attribute 0 = -15.0:\n"
        "  Leaf = Class 2 | {2: 350.0}\n"
        "if Attribute 0 = 0.0:\n"
        "  Leaf = Class 0 | {0: 420.0, 1: 252.0}\n"
        "if Attribute 0 = 1.0:\n"
        "  Leaf = Class 1 | {0: 312.0, 1: 332.0}\n"
        "if Attribute 0 = 2.0:\n"
        "  Leaf = Class 1 | {0: 236.0, 1: 383.0}\n"
        "if Attribute 0 = 3.0:\n"
        "  Leaf = Class 1 | {0: 168.0, 1: 459.0}\n"
        "if Attribute 0 = -30.0:\n"
        "  Leaf = Class 3.0 | {3.0: 46.0, 4.0: 42.0}\n"
    )

    assert learner.get_model_description() == expected_description
예제 #5
0
 def __init__(self,
              grace_period=200,
              split_confidence=0.5,
              leaf_prediction='nba',
              split_criterion='info_gain'):
     """Create the wrapped HoeffdingTree from the given hyper-parameters.

     All four arguments are forwarded unchanged, by keyword, to the
     HoeffdingTree constructor; the tree is stored as ``self.clf``.
     """
     super().__init__()
     tree_options = {
         'split_confidence': split_confidence,
         'grace_period': grace_period,
         'leaf_prediction': leaf_prediction,
         'split_criterion': split_criterion,
     }
     self.clf = HoeffdingTree(**tree_options)
def test_hoeffding_tree(test_path):
    """Test-then-train a HoeffdingTree and compare against stored results.

    Every 100th sample (except the first) is predicted before it is used
    for training. The collected class predictions are compared with a
    hard-coded list and the probabilities with 'test_hoeffding_tree.npy'
    under ``test_path``. The learner's info string and model description
    are checked as well.
    """
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=4,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx)

    cnt = 0
    max_samples = 5000
    predictions = array('d')
    proba_predictions = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = array('d', [
        0.0, 1.0, 3.0, 0.0, 0.0, 3.0, 0.0, 1.0, 1.0, 2.0, 0.0, 2.0, 1.0, 1.0,
        2.0, 1.0, 3.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 3.0, 1.0, 2.0, 1.0, 1.0,
        3.0, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 0.0, 1.0, 2.0, 0.0, 2.0,
        0.0, 0.0, 0.0, 0.0, 1.0, 3.0, 2.0
    ])

    test_file = os.path.join(test_path, 'test_hoeffding_tree.npy')

    data = np.load(test_file)

    # np.all replaces np.alltrue, which was removed in NumPy 2.0; np.all is
    # available in every NumPy release.
    assert np.all(predictions == expected_predictions)
    assert np.allclose(proba_predictions, data)

    expected_info = 'HoeffdingTree: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200 ' \
                    '- split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05 ' \
                    '- binary_split: False - stop_mem_management: False - remove_poor_atts: False ' \
                    '- no_pre_prune: False - leaf_prediction: nba - nb_threshold: 0 - nominal_attributes: [5, 6, 7,' \
                    ' 8, 9, 10, 11, 12, 13, 14] - '
    assert learner.get_info() == expected_info

    # Two accepted renderings that differ only in the order of the class
    # counts inside the braces.
    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 1423.0, 1.0: 1745.0, 2.0: 978.0, 3.0: 854.0}\n'
    expected_model_2 = 'Leaf = Class 1.0 | {1.0: 1745.0, 2.0: 978.0, 0.0: 1423.0, 3.0: 854.0}\n'
    # Build the description once rather than once per comparison.
    model_description = learner.get_model_description()
    assert model_description in (expected_model_1, expected_model_2)
예제 #7
0
def _choose_classifier(job: Job):
    """Return the classifier object to use for ``job``.

    For update jobs the previously stored model is loaded through
    ``_load_model``. Otherwise a new estimator is built from the method and
    configuration returned by ``get_method_config``.

    Raises
    ------
    ValueError
        If the configured method matches none of the known classification
        methods.
    """
    if job.type == JobTypes.UPDATE.value:
        classifier = _load_model(job.incremental_train)
        # TODO: check if this instruction still makes sense
        # are we updating a predictive_model with its own methods?
        assert classifier[0].__class__.__name__ == job.method
        return classifier

    method, config = get_method_config(job)
    config.pop('classification_method', None)
    print("Using method {} with config {}".format(method, config))

    # Methods whose estimator is built straight from the config, listed in
    # the order in which they are tried.
    plain_estimators = (
        (ClassificationMethods.KNN, KNeighborsClassifier),
        (ClassificationMethods.RANDOM_FOREST, RandomForestClassifier),
        (ClassificationMethods.DECISION_TREE, DecisionTreeClassifier),
        (ClassificationMethods.XGBOOST, XGBClassifier),
        (ClassificationMethods.MULTINOMIAL_NAIVE_BAYES, MultinomialNB),
        (ClassificationMethods.ADAPTIVE_TREE, HAT),
        (ClassificationMethods.HOEFFDING_TREE, HoeffdingTree),
        (ClassificationMethods.SGDCLASSIFIER, SGDClassifier),
        (ClassificationMethods.PERCEPTRON, Perceptron),
    )
    for member, estimator_cls in plain_estimators:
        if method == member.value:
            return estimator_cls(**config)

    if method == ClassificationMethods.NN.value:
        # The neural network needs two extra settings taken from the job.
        config['encoding'] = job.encoding.value_encoding
        config['is_binary_classifier'] = _check_is_binary_classifier(job.labelling.type)
        return NNClassifier(**config)

    raise ValueError("Unexpected classification method {}".format(method))
 def filter_instance_to_leaves(self,
                               X,
                               y,
                               weight,
                               parent,
                               parent_branch,
                               update_splitter_counts=False,
                               found_nodes=None):
     """Route an instance through this node and collect the nodes it reaches.

     When ``update_splitter_counts`` is true, ``weight`` is first added to
     this node's observed class distribution under key ``y``. The instance
     is then passed to the child selected by ``instance_child_index``; if
     that branch has no child, a ``FoundNode`` for the empty branch is
     appended to ``found_nodes``. Finally, if this node has an alternate
     tree, the instance is passed to it as well, with this node as parent
     and ``-999`` as the branch.

     Nothing is returned: results are appended to ``found_nodes``, so a
     caller that wants them has to pass its own list.
     """
     nodes = [] if found_nodes is None else found_nodes

     if update_splitter_counts:
         class_dist = self._observed_class_distribution
         try:
             class_dist[y] += weight  # Dictionary (class_value, weight)
         except KeyError:
             class_dist[y] = weight

     branch = self.instance_child_index(X)
     if branch >= 0:
         subtree = self.get_child(branch)
         if subtree is None:
             nodes.append(HoeffdingTree.FoundNode(None, self, branch))
         else:
             subtree.filter_instance_to_leaves(X, y, weight, parent,
                                               parent_branch,
                                               update_splitter_counts,
                                               nodes)

     alternate = self._alternate_tree
     if alternate is not None:
         alternate.filter_instance_to_leaves(X, y, weight, self, -999,
                                             update_splitter_counts, nodes)
예제 #9
0
def test_evaluate_classification_coverage(tmpdir):
    """Run a prequential evaluation with many metrics enabled at once.

    A simple coverage test. Tests for metrics are placed in the
    corresponding test module; only the current accuracy is checked here.
    """
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=2,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Learner
    learner = HoeffdingTree(
        nominal_attributes=list(range(15, len(stream.feature_names))))

    # Evaluator writing its summary into the temporary directory
    evaluator = EvaluatePrequential(
        max_samples=1000,
        metrics=[
            'accuracy', 'kappa', 'kappa_t', 'kappa_m', 'f1', 'precision',
            'recall', 'gmean', 'true_vs_predicted'
        ],
        output_file=os.path.join(str(tmpdir), "prequential_summary.csv"))

    # Evaluate
    evaluator.evaluate(stream=stream, model=learner)
    _, current_performance = evaluator.get_measurements(model_idx=0)

    observed_accuracy = current_performance.accuracy_score()
    assert np.isclose(observed_accuracy, 0.685)
예제 #10
0
def _choose_classifier(job: Job):
    """Build a new classifier from the method configured for ``job``.

    The method and its configuration come from ``get_method_config``; the
    'classification_method' entry is dropped from the configuration before
    the remainder is passed to the estimator constructor.

    Raises
    ------
    ValueError
        If the configured method matches none of the known classification
        methods.
    """
    method, config = get_method_config(job)
    config.pop('classification_method', None)
    logger.info("Using method {} with config {}".format(method, config))

    # Methods whose estimator is built straight from the config, listed in
    # the order in which they are tried.
    plain_estimators = (
        (ClassificationMethods.KNN, KNeighborsClassifier),
        (ClassificationMethods.RANDOM_FOREST, RandomForestClassifier),
        (ClassificationMethods.DECISION_TREE, DecisionTreeClassifier),
        (ClassificationMethods.XGBOOST, XGBClassifier),
        (ClassificationMethods.MULTINOMIAL_NAIVE_BAYES, MultinomialNB),
        (ClassificationMethods.ADAPTIVE_TREE, HAT),
        (ClassificationMethods.HOEFFDING_TREE, HoeffdingTree),
        (ClassificationMethods.SGDCLASSIFIER, SGDClassifier),
        (ClassificationMethods.PERCEPTRON, Perceptron),
    )
    for member, estimator_cls in plain_estimators:
        if method == member.value:
            return estimator_cls(**config)

    if method == ClassificationMethods.NN.value:
        # The neural network needs two extra settings taken from the job.
        config['encoding'] = job.encoding.value_encoding
        config['is_binary_classifier'] = _check_is_binary_classifier(
            job.labelling.type)
        return NNClassifier(**config)

    raise ValueError("Unexpected classification method {}".format(method))
예제 #11
0
def test_evaluate_prequential_classifier(tmpdir, test_path):
    """Run a prequential evaluation and compare its summary file.

    The evaluator must hand back the HoeffdingTree it was given, and the
    csv it writes must match 'prequential_summary.csv' under ``test_path``.
    """
    # Setup stream
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=4,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Setup learner
    learner = HoeffdingTree(
        nominal_attributes=list(range(15, len(stream.feature_names))))

    # Setup evaluator
    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(max_samples=1000,
                                    metrics=['kappa', 'kappa_t', 'performance'],
                                    output_file=output_file)

    # Evaluate; the first entry of the result is the evaluated learner
    result_learner = evaluator.evaluate(stream=stream, model=learner)[0]

    assert isinstance(result_learner, HoeffdingTree)
    assert learner.get_model_measurements == result_learner.get_model_measurements

    compare_files(output_file,
                  os.path.join(test_path, 'prequential_summary.csv'))
예제 #12
0
def main():
    """Build and start the demo agent network, then return it.

    A data-stream agent feeds a trainer and a predictor, the trainer also
    feeds the predictor, the predictor feeds an evaluator, and two monitor
    agents watch the evaluator and the predictor respectively.
    """
    # start agent network server
    network = AgentNetwork()

    # init agents
    generator = network.add_agent(agentType=DataStreamAgent)
    trainer = network.add_agent(agentType=Trainer)
    predictor = network.add_agent(agentType=Predictor)
    evaluator = network.add_agent(agentType=Evaluator)
    evaluator_monitor = network.add_agent(agentType=MonitorAgent)
    predictor_monitor = network.add_agent(agentType=MonitorAgent)

    generator.init_parameters(stream=SineGenerator(), pretrain_size=1000,
                              batch_size=1)
    trainer.init_parameters(ml_model=HoeffdingTree())

    # connect agents : We can connect multiple agents to any particular agent
    # However the agent needs to implement handling multiple input types
    connections = (
        (generator, trainer),
        (generator, predictor),
        (trainer, predictor),
        (predictor, evaluator),
        (evaluator, evaluator_monitor),
        (predictor, predictor_monitor),
    )
    for source, sink in connections:
        network.bind_agents(source, sink)

    # set all agents states to "Running"
    network.set_running_state()

    # allow for shutting down the network after execution
    return network
예제 #13
0
class VFDT(IncrementalClassifier):
    """Incremental classifier that delegates to a HoeffdingTree."""

    def __init__(self,
                 grace_period=200,
                 split_confidence=0.5,
                 leaf_prediction='nba',
                 split_criterion='info_gain'):
        """Create the wrapped tree; all arguments are forwarded to it."""
        super().__init__()
        tree_options = {
            'split_confidence': split_confidence,
            'grace_period': grace_period,
            'leaf_prediction': leaf_prediction,
            'split_criterion': split_criterion,
        }
        self.clf = HoeffdingTree(**tree_options)

    def partial_fit(self, one_row):
        """Train on a single example.

        ``one_row[0]`` is used as the feature vector and ``one_row[1]`` as
        its label; each is wrapped in a one-element list for the tree.
        """
        features = one_row[0]
        label = one_row[1]
        self.clf.partial_fit([features], [label])

    def predict(self, x):
        """Return the wrapped tree's prediction for ``x``."""
        prediction = self.clf.predict(x)
        return prediction
예제 #14
0
def test_evaluate_prequential_classifier(tmpdir, test_path):
    """Run a prequential evaluation and check file, metrics and info string."""
    # Setup stream
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=4,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Setup learner
    learner = HoeffdingTree(
        nominal_attributes=list(range(15, len(stream.feature_names))))

    # Setup evaluator
    output_file = os.path.join(str(tmpdir), "prequential_summary.csv")
    evaluator = EvaluatePrequential(max_samples=1000,
                                    metrics=['accuracy', 'kappa', 'kappa_t'],
                                    output_file=output_file)

    # Evaluate; the first entry of the result is the evaluated learner
    result_learner = evaluator.evaluate(stream=stream, model=learner)[0]

    assert isinstance(result_learner, HoeffdingTree)
    assert learner.get_model_measurements == result_learner.get_model_measurements

    compare_files(output_file,
                  os.path.join(test_path, 'prequential_summary.csv'))

    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    # (getter, expected value) pairs, checked in this order.
    expected_metrics = (
        (mean_performance.get_accuracy, 0.436250),
        (mean_performance.get_kappa, 0.231791),
        (mean_performance.get_kappa_t, 0.236887),
        (current_performance.get_accuracy, 0.430000),
        (current_performance.get_kappa, 0.223909),
        (current_performance.get_kappa_t, 0.240000),
    )
    for getter, expected_value in expected_metrics:
        assert np.isclose(getter(), expected_value)

    expected_info = (
        "EvaluatePrequential(batch_size=1, data_points_for_classification=False,\n"
        "                    max_samples=1000, max_time=inf,\n"
        "                    metrics=['accuracy', 'kappa', 'kappa_t'], n_wait=200,\n"
        "                    output_file='prequential_summary.csv',\n"
        "                    pretrain_size=200, restart_stream=True, show_plot=False)"
    )
    assert evaluator.get_info() == expected_info
예제 #15
0
def run_indefinetly(input_topic, output_topic, target_index, model=None):
    """Consume CSV samples from Kafka and predict or train on each one.

    Every message on ``input_topic`` is parsed as header-less CSV. Samples
    containing object-typed (text or malformed) columns are skipped. For
    each remaining sample the columns before ``target_index`` are the
    features and the column at ``target_index`` is the label.

    The model's prediction is compared with the label and the running
    accuracy and prediction count are published to
    ``output_topic + '__accuracy'`` and ``output_topic + '__pred_count'``.
    Then, if the label is missing, a prediction is published to
    ``output_topic``; otherwise the model is trained on the sample.

    Parameters
    ----------
    input_topic : str
        Topic to read samples from.
    output_topic : str
        Topic to publish predictions to; also the prefix of the metric
        topics.
    target_index : int
        Column position of the label.
    model : object, optional
        Incremental model providing ``predict`` and ``partial_fit``. When
        omitted, a new ``HoeffdingTree`` is created for this call.
    """
    # Create the default here rather than in the signature: a default
    # instance in the signature is built once and shared by every call.
    if model is None:
        model = HoeffdingTree()

    print(f'Running AutoML for input_topic={input_topic}, output_topic={output_topic}, target_index={target_index} and broker={BOOTSTRAP_SERVERS}.')
    consumer = KafkaConsumer(
            input_topic,
            bootstrap_servers=BOOTSTRAP_SERVERS,
            group_id=None,
            auto_offset_reset='earliest',
            value_deserializer=lambda x: x.decode('utf-8')
        )
    producer = KafkaProducer(bootstrap_servers=BOOTSTRAP_SERVERS, value_serializer=lambda x: x.encode('utf-8'))

    i = 0
    total_predictions = 0
    correct_predictions = 0
    accuracy = 0

    for message in consumer:
        sample = pd.read_csv(StringIO(message.value), header=None)
        i += 1

        if any(sample.dtypes == 'object'):
            print('Streamed sample contains text or malformatted data.')
            continue

        X = sample.iloc[:, :target_index]
        y = sample.iloc[:, target_index]

        # Collect metrics
        # NOTE(review): samples without a label are scored here too and can
        # never match, which lowers the reported accuracy -- confirm whether
        # that is intended.
        try:
            prediction = model.predict(X)
            total_predictions += 1
            if prediction[0] == y[0]:
                correct_predictions += 1
            accuracy = correct_predictions / total_predictions
            print(f'Accuracy at sample {i}: {accuracy}')
            producer.send(output_topic + '__accuracy', str(accuracy))
            producer.send(output_topic + '__pred_count', str(total_predictions))
            producer.flush()
        except Exception as e:
            # Metrics stay best-effort, but the failure is reported instead
            # of being dropped silently.
            print('An exception occured during metric collection', e)

        if y.isnull().any():
            # Predict
            try:
                y_pred = pd.DataFrame(model.predict(X))
                producer.send(output_topic, y_pred.to_csv(header=False, index=False))
                producer.flush()
            except Exception as e:
                print('An exception occured during prediction', e)
        else:
            # Train
            try:
                model.partial_fit(X, y)
            except Exception as e:
                print('An exception occured during training', e)
def test_evaluate_holdout_classifier(tmpdir, test_path):
    """Run a holdout evaluation and check its summary file and metrics."""
    # Setup stream
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=4,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Setup learner
    learner = HoeffdingTree(
        nominal_attributes=list(range(15, len(stream.feature_names))))

    # Setup evaluator
    output_file = os.path.join(str(tmpdir), "holdout_summary.csv")
    evaluator = EvaluateHoldout(n_wait=200,
                                max_samples=1000,
                                test_size=50,
                                metrics=['accuracy', 'kappa', 'kappa_t'],
                                output_file=output_file)

    # Evaluate; the first entry of the result is the evaluated learner
    result_learner = evaluator.evaluate(stream=stream, model=learner)[0]

    assert isinstance(result_learner, HoeffdingTree)
    assert learner.get_model_measurements == result_learner.get_model_measurements

    compare_files(output_file, os.path.join(test_path, 'holdout_summary.csv'))

    mean_performance, current_performance = evaluator.get_measurements(
        model_idx=0)

    # (getter, expected value) pairs, checked in this order.
    expected_metrics = (
        (mean_performance.get_accuracy, 0.344000),
        (mean_performance.get_kappa, 0.135021),
        (mean_performance.get_kappa_t, 0.180000),
        (current_performance.get_accuracy, 0.360000),
        (current_performance.get_kappa, 0.152542),
        (current_performance.get_kappa_t, 0.200000),
    )
    for getter, expected_value in expected_metrics:
        assert np.isclose(getter(), expected_value)
예제 #17
0
 def __init__(self,
              estimator=None,
              weight_mc=10,
              weight_inv=0.3,
              max_session_size=20):
     """Initialise the object's estimator, weights and bookkeeping.

     Parameters
     ----------
     estimator : object, optional
         Estimator stored as ``self.ht``. When omitted, a new
         ``HoeffdingTree(leaf_prediction='nb')`` is created for this
         instance.
     weight_mc : number
         Stored as ``self.w_mc``.
     weight_inv : number
         Stored as ``self.w_inv``.
     max_session_size : int
         Stored as ``self.max_session_size``.
     """
     super().__init__()
     # Create the default here rather than in the signature: a default
     # instance in the signature is built once and shared by every object
     # constructed without an explicit estimator.
     self.ht = HoeffdingTree(leaf_prediction='nb') if estimator is None else estimator
     self.w_mc = weight_mc
     self.w_inv = weight_inv
     self.counter = Counter()
     self.max_session_size = max_session_size
     self._rec_tracker = defaultdict(list)
 def filter_instance_to_leaves(self,
                               X,
                               y,
                               weight,
                               parent,
                               parent_branch,
                               update_splitter_counts,
                               found_nodes=None):
     """Record this node as a node reached by the instance.

     A ``FoundNode`` built from this node, ``parent`` and ``parent_branch``
     is appended to ``found_nodes``. ``X``, ``y``, ``weight`` and
     ``update_splitter_counts`` are accepted but not used here. Nothing is
     returned, so a caller that wants the result has to pass its own list.
     """
     nodes = [] if found_nodes is None else found_nodes
     reached = HoeffdingTree.FoundNode(self, parent, parent_branch)
     nodes.append(reached)
예제 #19
0
 def init_parameters(self,
                     mode="prequential",
                     ml_model=None,
                     split_type=None):
     """Store the evaluation mode, the model and the splitting strategy.

     Parameters
     ----------
     mode : str
         Stored as ``self.mode``.
     ml_model : object, optional
         Model stored as ``self.ml_model``. When omitted, a new
         ``HoeffdingTree`` is created for this call.
     split_type : object, optional
         Splitter stored as ``self.split_type``. When omitted, a
         ``StratifiedKFold`` with 5 shuffled splits and ``random_state=0``
         is used.

     ``self.results`` is reset to an empty list.
     """
     self.mode = mode
     # Create the default here rather than in the signature: a default
     # instance in the signature is built once and shared by every call
     # that omits the model.
     self.ml_model = HoeffdingTree() if ml_model is None else ml_model
     self.results = []
     if split_type is not None:
         self.split_type = split_type
     else:
         self.split_type = StratifiedKFold(n_splits=5,
                                           shuffle=True,
                                           random_state=0)
예제 #20
0
    def __init__(self, model=None, random_state=None):
        """Initialise the internal state.

        Parameters
        ----------
        model : object, optional
            Model stored as ``self._model``. When omitted, a new
            ``HoeffdingTree`` is created for this instance.
        random_state : object, optional
            Kept as ``self._original_random_state``; ``self.random_state``
            itself starts as ``None``.
        """
        super().__init__()

        self.classes = None
        self._alpha = 0
        # Create the default here rather than in the signature: a default
        # instance in the signature is built once and shared by every
        # object constructed without an explicit model.
        self._model = HoeffdingTree() if model is None else model
        self._majority_cutoff = 1
        self._training_set_X = []
        self._training_set_y = []
        self._batch_num = 1

        self._past_instances = {}

        self._original_random_state = random_state
        self.random_state = None
예제 #21
0
    def __init__(self, classes, model=None, random_state=None):
        """Initialise the internal state for the given classes.

        Parameters
        ----------
        classes : sized iterable
            Class labels. ``self._alpha`` is set to ``1 / len(classes)``
            and a per-class instance counter is started at 0 for each.
        model : object, optional
            Model stored as ``self._model``. When omitted, a new
            ``HoeffdingTree`` is created for this instance.
        random_state : object, optional
            Kept as ``self._original_random_state``; ``self.random_state``
            itself starts as ``None``.

        Raises
        ------
        ZeroDivisionError
            If ``classes`` is empty.
        """
        super().__init__()

        self._classes = classes
        self._alpha = 1 / len(self._classes)
        # Create the default here rather than in the signature: a default
        # instance in the signature is built once and shared by every
        # object constructed without an explicit model.
        self._model = HoeffdingTree() if model is None else model
        self._majority_cutoff = 1
        self._training_set_X = []
        self._training_set_y = []
        # One counter per class, all starting at zero.
        self._num_instance_per_class = dict.fromkeys(self._classes, 0)
        self._batch_num = 1
        self._original_random_state = random_state
        self.random_state = None
def demo():
    """Prequentially evaluate a HoeffdingTree on the SEA csv file stream."""
    # Setup Stream
    stream = FileStream("../data/datasets/sea_stream.csv")
    stream.prepare_for_use()

    # The classifier we will use (other options: SAMKNN, LeverageBagging, SGD)
    classifier = HoeffdingTree()

    evaluator_options = {
        'pretrain_size': 100,
        'output_file': 'test_filestream.csv',
        'max_samples': 10000,
        'batch_size': 1,
        'n_wait': 1000,
        'show_plot': True,
    }
    evaluator = EvaluatePrequential(**evaluator_options)
    evaluator.evaluate(stream=stream, model=classifier)
예제 #23
0
def test_evaluate_classification_metrics():
    """Check f1, precision, recall and g-mean after a prequential run."""
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=2,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Setup learner
    learner = HoeffdingTree(
        nominal_attributes=list(range(15, len(stream.feature_names))))

    evaluator = EvaluatePrequential(
        max_samples=1000,
        metrics=['f1', 'precision', 'recall', 'gmean'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=learner)
    mean_performance, current_performance = evaluator.get_measurements(model_idx=0)

    # Print the observed values: mean first, then current.
    for performance in (mean_performance, current_performance):
        print(performance.get_g_mean())
        print(performance.get_recall())
        print(performance.get_precision())
        print(performance.get_f1_score())

    # (getter, expected value) pairs, checked in this order.
    expected_metrics = (
        (current_performance.get_f1_score, 0.7096774193548387),
        (current_performance.get_precision, 0.6814159292035398),
        (current_performance.get_recall, 0.7403846153846154),
        (current_performance.get_g_mean, 0.6802502367624613),
        (mean_performance.get_f1_score, 0.7009803921568628),
        (mean_performance.get_precision, 0.7185929648241206),
        (mean_performance.get_recall, 0.6842105263157895),
        (mean_performance.get_g_mean, 0.6954166367760247),
    )
    for getter, expected_value in expected_metrics:
        assert np.isclose(getter(), expected_value)
def demo(output_file=None, instances=40000):
    """ _test_comparison_holdout

    Run a holdout evaluation on a WaveformGenerator stream. The models are
    handed to the evaluator as a list, which currently holds a single
    HoeffdingTree.

    Parameters
    ----------
    output_file: string, optional
        Passed to the evaluator as its output file name. Defaults to None.

    instances: int (Default: 40000)
        The evaluation's maximum number of instances.

    """
    # Setup the stream
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the models to evaluate
    models = [HoeffdingTree()]

    # Setup the evaluator
    evaluator_options = {
        'test_size': 500,
        'dynamic_test_set': True,
        'max_samples': instances,
        'batch_size': 1,
        'n_wait': 5000,
        'max_time': 1000,
        'output_file': output_file,
        'show_plot': True,
        'metrics': ['kappa'],
    }
    evaluator = EvaluateHoldout(**evaluator_options)

    # Evaluate
    evaluator.evaluate(stream=stream, model=models)
예제 #25
0
def test_hoeffding_tree_nb(test_path):
    """Check a HoeffdingTree with ``leaf_prediction='nb'`` on a seeded stream.

    The learner is trained one sample at a time on a RandomTreeGenerator
    stream. Every 100 samples (skipping sample 0) the incoming sample is
    predicted before being learned. The collected predictions and the
    learner's ``get_info()`` string are compared with stored references.

    Parameters
    ----------
    test_path
        Not used in this test body; kept so the signature is unchanged.
    """
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=4,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)
    stream.prepare_for_use()

    # Features from index 5 onwards are declared nominal
    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTree(nominal_attributes=nominal_attr_idx,
                            leaf_prediction='nb')

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')
    # Collected so predict_proba is exercised; not asserted on below
    proba_predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test-then-train: predict every `wait_samples` samples, never on
        # the very first sample (the learner has seen nothing yet)
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        0, 1, 3, 0, 0, 3, 0, 1, 1, 2, 0, 2, 1, 1, 2, 1, 3, 0, 1, 1, 1, 1, 0, 3,
        1, 2, 1, 1, 3, 2, 1, 2, 2, 2, 1, 1, 1, 0, 1, 2, 0, 2, 0, 0, 0, 0, 1, 3,
        2
    ])

    # np.all replaces np.alltrue, which was removed in NumPy 2.0
    assert np.all(predictions == expected_predictions)

    expected_info = "HoeffdingTree(binary_split=False, grace_period=200, leaf_prediction='nb',\n" \
                    "              max_byte_size=33554432, memory_estimate_period=1000000,\n" \
                    "              nb_threshold=0, no_preprune=False,\n" \
                    "              nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14],\n" \
                    "              remove_poor_atts=False, split_confidence=1e-07,\n" \
                    "              split_criterion='info_gain', stop_mem_management=False,\n" \
                    "              tie_threshold=0.05)"
    assert learner.get_info() == expected_info
def demo():
    """ _test_pipeline

    Wraps a Hoeffding tree in a single-step Pipeline and passes the
    pipeline, in place of a plain learner, as the model of an
    EvaluatePrequential run on a waveform stream.

    """
    # Data source
    waveform_stream = WaveformGenerator()
    waveform_stream.prepare_for_use()

    # One-step pipeline holding the learner
    steps = [('Hoeffding Tree', HoeffdingTree())]
    pipe = Pipeline(steps)

    # Prequential evaluator
    prequential = EvaluatePrequential(
        show_plot=True,
        pretrain_size=1000,
        max_samples=100000,
    )

    # Run the evaluation with the pipeline as the model
    prequential.evaluate(stream=waveform_stream, model=pipe)
예제 #27
0
def simulation():
    """
    Simulation webpage

    Resets the module-level thread and stream events, reads the run
    settings from the request's query arguments, starts ``spc_method`` as
    a background task and renders the simulation template with its plot.

    Returns
    -------
    webpage: string.
        Html of the simulation webpage.
    """
    global thread, stream_stop_event, stream_pause_event

    # Fresh thread handle and events; the stop event starts out set
    thread = Thread()
    stream_stop_event = Event()
    stream_stop_event.set()
    stream_pause_event = Event()

    query = request.args

    dataset = query.get('dataset') + ".data"
    print("DATASET:", dataset)

    # Any model name other than the two listed falls back to HoeffdingTree
    model_name = query.get('model')
    if model_name == "VFDR":
        model = VFDR(ordered_rules=False,
                     rule_prediction="weighted_sum",
                     drift_detector=None)
    elif model_name == "NaiveBayes":
        model = NaiveBayes()
    else:
        model = HoeffdingTree()

    freq = query.get('freq')
    alpha = query.get('alpha')
    beta = query.get('beta')
    buffer = query.get('buffer') == "on"

    # Plot x-range: number of rows in the dataset file plus one
    row_count = pd.read_csv(BASE_DIR + dataset).shape[0]
    xmax = row_count + 1

    thread = socketio.start_background_task(
        spc_method, dataset, model, int(alpha), int(beta), buffer, int(freq)
    )

    plot = create_plot(model_name, xmax)
    return render_template('simulation.html', plot=plot)
예제 #28
0
def main():
    """Build, wire up and start the agent network.

    Creates one data-stream agent, two ML-model agents (a HoeffdingTree
    and a NaiveBayes model) and one monitor agent. The data-stream agent
    is bound to both model agents, and each model agent is bound to the
    monitor agent.

    Returns
    -------
    The started agent network, so the caller can shut it down later.
    """
    global agentNetwork

    # Start the agent network
    agentNetwork = AgentNetwork()

    # Add the agents
    stream_agent = agentNetwork.add_agent(agentType=DataStreamAgent)
    tree_agent = agentNetwork.add_agent(agentType=ML_Model)
    bayes_agent = agentNetwork.add_agent(agentType=ML_Model)
    monitor_agent = agentNetwork.add_agent(agentType=MonitorAgent)

    # Initialise each agent's parameters
    stream_agent.init_parameters(stream=WaveformGenerator(),
                                 pretrain_size=1000,
                                 batch_size=100)
    tree_agent.init_parameters(ml_model=HoeffdingTree())
    bayes_agent.init_parameters(ml_model=NaiveBayes())

    # Bind the agents pairwise, in this order
    bindings = (
        (stream_agent, tree_agent),
        (stream_agent, bayes_agent),
        (tree_agent, monitor_agent),
        (bayes_agent, monitor_agent),
    )
    for first, second in bindings:
        agentNetwork.bind_agents(first, second)

    agentNetwork.set_running_state()

    # Hand the network back so it can be shut down after execution
    return agentNetwork
예제 #29
0
from strlearn.evaluators import TestThenTrain
from sklearn.naive_bayes import GaussianNB
from strlearn.metrics import (balanced_accuracy_score, f1_score,
                              geometric_mean_score_1, precision, recall,
                              specificity)
import sys
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from skmultiflow.trees import HoeffdingTree

# Pick the data streams and build the ensemble methods
streams = h.realstreams()
print(len(streams))

# Each ensemble gets its own HoeffdingTree with the Hellinger split criterion
ob = OnlineBagging(
    n_estimators=20,
    base_estimator=HoeffdingTree(split_criterion='hellinger'),
)
oob = OOB(
    n_estimators=20,
    base_estimator=HoeffdingTree(split_criterion='hellinger'),
)
uob = UOB(
    n_estimators=20,
    base_estimator=HoeffdingTree(split_criterion='hellinger'),
)
ros_knorau2 = SEA(
    base_estimator=StratifiedBagging(
        base_estimator=HoeffdingTree(split_criterion='hellinger'),
        random_state=42,
        oversampler="ROS",
    ),
    oversampled="ROS",
    des="KNORAU2",
)
cnn_knorau2 = SEA(base_estimator=StratifiedBagging(
    base_estimator=HoeffdingTree(split_criterion='hellinger'),
    random_state=42,
    oversampler="CNN"),
                  oversampled="CNN",
예제 #30
0
 def init_parameters(self, ml_model=None):
     """Store the learning model on this instance.

     Parameters
     ----------
     ml_model: optional
         Model to keep as ``self.ml_model``. When omitted (None), a new
         HoeffdingTree is created for this call.
     """
     # A default written as ``ml_model=HoeffdingTree()`` is built once, at
     # definition time, and would be shared by every caller relying on it;
     # create a fresh model per call instead.
     if ml_model is None:
         ml_model = HoeffdingTree()
     self.ml_model = ml_model