def test_hat_mc(test_path):
    """Check HAT with majority-class ('mc') leaves on a drifting SEA stream.

    The stream starts as a SEA generator (seed 1, 5% noise) and drifts to a
    second SEA generator (seed 2, classification function 2) around
    position 250 with a drift width of 10.

    Parameters
    ----------
    test_path : str
        Directory that holds the reference file
        ``test_hoeffding_adaptive_tree_mc.npy`` with the expected class
        probabilities.
    """
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1,
                                                    noise_percentage=0.05),
                                drift_stream=SEAGenerator(
                                    random_state=2,
                                    classification_function=2,
                                    noise_percentage=0.05),
                                random_state=1,
                                position=250,
                                width=10)
    stream.prepare_for_use()

    learner = HAT(leaf_prediction='mc')

    max_samples = 1000
    wait_samples = 20
    y_pred = array('i')
    y_proba = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test-then-train: predict on every `wait_samples`-th sample (the
        # very first sample is skipped) before the learner has seen it.
        if cnt != 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1
    ])
    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    assert np.all(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_mc.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = "HAT(binary_split=False, grace_period=200, leaf_prediction='mc',\n" \
                    "    max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0,\n" \
                    "    no_preprune=False, nominal_attributes=None, remove_poor_atts=False,\n" \
                    "    split_confidence=1e-07, split_criterion='info_gain',\n" \
                    "    stop_mem_management=False, tie_threshold=0.05)"

    assert learner.get_info() == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 398.0, 1.0: 1000.0}\n'

    assert learner.get_model_description() == expected_model_1

    # X still holds the last sample drawn in the loop above.
    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)

    stream.restart()
    X, y = stream.next_sample(5000)

    # Smoke test only (no assertion): train a learner configured with a
    # very small max_byte_size on a large batch and make sure it completes.
    learner = HAT(max_byte_size=30, leaf_prediction='mc', grace_period=10)
    learner.partial_fit(X, y)
def test_hat_mc(test_path):
    """Check HAT with majority-class ('mc') leaves on a drifting SEA stream.

    The stream starts as a SEA generator (seed 1, 5% noise) and drifts to a
    second SEA generator (seed 2, classification function 2) around
    position 250 with a drift width of 10.

    Parameters
    ----------
    test_path : str
        Directory that holds the reference file
        ``test_hoeffding_adaptive_tree_mc.npy`` with the expected class
        probabilities.
    """
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1,
                                                    noise_percentage=0.05),
                                drift_stream=SEAGenerator(
                                    random_state=2,
                                    classification_function=2,
                                    noise_percentage=0.05),
                                random_state=1,
                                position=250,
                                width=10)
    stream.prepare_for_use()

    learner = HAT(leaf_prediction='mc')

    max_samples = 1000
    wait_samples = 20
    y_pred = array('i')
    y_proba = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test-then-train: predict on every `wait_samples`-th sample (the
        # very first sample is skipped) before the learner has seen it.
        if cnt != 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,
        1
    ])
    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    assert np.all(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_mc.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = 'HAT: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200' \
                    ' - split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05' \
                    ' - binary_split: False - stop_mem_management: False - remove_poor_atts: False' \
                    ' - no_pre_prune: False - leaf_prediction: mc - nb_threshold: 0' \
                    ' - nominal_attributes: [] - '

    assert learner.get_info() == expected_info

    # Several renderings are accepted: they differ only in how the float is
    # printed and in the order of the class entries.
    expected_models = (
        'Leaf = Class 1.0 | {0.0: 0.005295278636481529, 1.0: 1.9947047213635185}\n',
        'Leaf = Class 1.0 | {0.0: 0.0052952786364815294, 1.0: 1.9947047213635185}\n',
        'Leaf = Class 1.0 | {1.0: 1.9947047213635185, 0.0: 0.0052952786364815294}\n',
    )
    # Build the description once instead of once per accepted variant.
    assert learner.get_model_description() in expected_models

    stream.restart()
    X, y = stream.next_sample(5000)

    # Smoke test only (no assertion): train a learner configured with a
    # very small max_byte_size on a large batch and make sure it completes.
    learner = HAT(max_byte_size=30, leaf_prediction='mc', grace_period=10)
    learner.partial_fit(X, y)
Пример #3
0
def test_concept_drift_stream(test_path):
    """Check metadata and generated samples of a default ConceptDriftStream.

    Parameters
    ----------
    test_path : str
        Directory that holds the reference file ``concept_drift_stream.npz``
        with the expected ``X`` and ``y`` arrays.
    """
    stream = ConceptDriftStream(random_state=1, position=20, width=5)
    stream.prepare_for_use()

    assert stream.n_remaining_samples() == -1

    expected_names = [
        "salary", "commission", "age", "elevel", "car", "zipcode", "hvalue",
        "hyears", "loan"
    ]
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['target']

    assert stream.n_features == 9

    assert stream.n_cat_features == 3

    assert stream.n_num_features == 6

    assert stream.n_targets == 1

    assert stream.get_info() == 'ConceptDriftStream: ' \
                                'First Stream: AGRAWALGenerator - ' \
                                'Drift Stream: AGRAWALGenerator - ' \
                                'alpha: 0.0 - position: 20 - width: 5'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load the reference samples this stream is expected to reproduce.
    test_file = os.path.join(test_path, 'concept_drift_stream.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    X, y = stream.next_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # last_sample() must hand back the sample that was just generated.
    X, y = stream.last_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # After a restart the stream must replay the same sequence.
    stream.restart()
    X, y = stream.next_sample(30)
    assert np.all(X == X_expected)
    assert np.all(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]

    assert 'stream' == stream.get_class_type()
Пример #4
0
def test_knn_adwin():
    """Check KNNADWINClassifier predictions on a drifting SEA stream.

    The stream starts as a SEA generator (seed 1) and drifts to a second SEA
    generator (seed 2, classification function 2) around position 250 with a
    drift width of 10. The test covers incremental training, ``reset``,
    ``get_info`` and batch ``fit``/``predict``.
    """
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1),
                                drift_stream=SEAGenerator(
                                    random_state=2, classification_function=2),
                                random_state=1,
                                position=250,
                                width=10)
    stream.prepare_for_use()
    learner = KNNADWINClassifier(n_neighbors=8,
                                 leaf_size=40,
                                 max_window_size=200)

    max_samples = 1000
    wait_samples = 20
    predictions = array('i')
    correct_predictions = 0

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test-then-train: predict on every `wait_samples`-th sample (the
        # very first sample is skipped) before the learner has seen it.
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            if y[0] == predictions[-1]:
                correct_predictions += 1
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
        1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
        1
    ])
    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    assert np.all(predictions == expected_predictions)

    expected_correct_predictions = 46
    assert correct_predictions == expected_correct_predictions

    learner.reset()
    assert learner.window.n_samples == 0

    expected_info = 'KNNADWINClassifier(leaf_size=40, max_window_size=200, n_neighbors=8, nominal_attributes=None)'
    # Collapse all whitespace (including line breaks) to single spaces;
    # str.split() already yields tokens without surrounding whitespace.
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    stream.restart()

    X, y = stream.next_sample(max_samples)
    # NOTE(review): the model is fitted on X[:950] but evaluated from
    # X[951:], so the sample at index 950 is never used. The expected count
    # below depends on this slicing; confirm it is intentional before
    # changing it.
    learner.fit(X[:950], y[:950])
    predictions = learner.predict(X[951:])

    correct_predictions = np.sum(np.array(predictions) == y[951:])
    expected_correct_predictions = 47
    assert correct_predictions == expected_correct_predictions

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_hat_nb(test_path):
    """Check HAT with naive Bayes ('nb') leaves on a drifting SEA stream.

    The stream starts as a SEA generator (seed 1, 5% noise) and drifts to a
    second SEA generator (seed 2, classification function 2) around
    position 250 with a drift width of 10.

    Parameters
    ----------
    test_path : str
        Directory that holds the reference file
        ``test_hoeffding_adaptive_tree_nb.npy`` with the expected class
        probabilities.
    """
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1,
                                                    noise_percentage=0.05),
                                drift_stream=SEAGenerator(
                                    random_state=2,
                                    classification_function=2,
                                    noise_percentage=0.05),
                                random_state=1,
                                position=250,
                                width=10)
    stream.prepare_for_use()

    learner = HAT(leaf_prediction='nb')

    max_samples = 1000
    wait_samples = 20
    y_pred = array('i')
    y_proba = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test-then-train: predict on every `wait_samples`-th sample (the
        # very first sample is skipped) before the learner has seen it.
        if cnt != 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
        1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
        1
    ])
    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    assert np.all(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_nb.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = 'HAT: max_byte_size: 33554432 - memory_estimate_period: 1000000 - grace_period: 200' \
                    ' - split_criterion: info_gain - split_confidence: 1e-07 - tie_threshold: 0.05' \
                    ' - binary_split: False - stop_mem_management: False - remove_poor_atts: False' \
                    ' - no_pre_prune: False - leaf_prediction: nb - nb_threshold: 0' \
                    ' - nominal_attributes: [] - '

    assert learner.get_info() == expected_info
    # X still holds the last sample drawn in the loop above.
    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_hoeffding_adaptive_tree_nb(test_path):
    """Check HoeffdingAdaptiveTreeClassifier with naive Bayes ('nb') leaves.

    The stream starts as a SEA generator (seed 1, 5% noise) and drifts to a
    second SEA generator (seed 2, classification function 2) around
    position 250 with a drift width of 10.

    Parameters
    ----------
    test_path : str
        Directory that holds the reference file
        ``test_hoeffding_adaptive_tree_nb.npy`` with the expected class
        probabilities.
    """
    stream = ConceptDriftStream(stream=SEAGenerator(random_state=1, noise_percentage=0.05),
                                drift_stream=SEAGenerator(random_state=2, classification_function=2,
                                                          noise_percentage=0.05),
                                random_state=1, position=250, width=10)
    stream.prepare_for_use()

    learner = HoeffdingAdaptiveTreeClassifier(leaf_prediction='nb')

    max_samples = 1000
    wait_samples = 20
    y_pred = array('i')
    y_proba = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test-then-train: predict on every `wait_samples`-th sample (the
        # very first sample is skipped) before the learner has seen it.
        if cnt != 0 and cnt % wait_samples == 0:
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
                                       0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
                                       1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
                                       1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
                                       0, 1, 1, 1, 1, 1, 0, 1, 1])
    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    assert np.all(y_pred == expected_predictions)

    test_file = os.path.join(test_path, 'test_hoeffding_adaptive_tree_nb.npy')
    data = np.load(test_file)
    assert np.allclose(y_proba, data)

    expected_info = "HoeffdingAdaptiveTreeClassifier(binary_split=False, bootstrap_sampling=True, grace_period=200, " \
                    "leaf_prediction='nb', max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, " \
                    "no_preprune=False, nominal_attributes=None, remove_poor_atts=False, split_confidence=1e-07, " \
                    "split_criterion='info_gain', stop_mem_management=False, tie_threshold=0.05)"
    # Collapse all whitespace (including line breaks) to single spaces;
    # str.split() already yields tokens without surrounding whitespace.
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    # X still holds the last sample drawn in the loop above.
    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
class RecurringConceptGradualStream(RecurringConceptStream):
    """ A stream featuring gradual drift between given concepts.
    Uses the scikit-multiflow concept drift stream to blend concepts over
    a window.

    Parameters
    ----------

    rctype: RCStreamType
        An enum describing the type of stream

    num_samples: int
        The number of samples in the stream

    noise: float
        The probability that noise will happen in the generation. At each
        new sample generated, the sample will be perturbed by the amount of
        perturbation.
        Values go from 0.0 to 1.0.

    concept_chain: list<int> or dict
        A dict with key observation number and value
        the concept beginning at that observation
        or
        A list of concept ids. A dict will be generated
        with each concept lasting its length given in desc
        or uniform length.

    window_size: int
        The number of observations each gradual drift is
        spread over.

    seed: int
        Random seed.

    desc: dict<int><conceptOccurence>
        A map of concept ID to options

    boost_first_occurance: bool
        If true, double the observations drawn from
        the first occurrence of a concept. Allows
        a better model to be built and stored.

    Examples
    --------

    >>> # An example stream using the STAGGER Generator.
    >>> # Starts using generating function 0, then at
    >>> # observation 5000 transitions to generating function
    >>> # 1 then at 10000 transitions back to 0.
    >>> from skika.data.synthetic.reccurring_concept_stream import RCStreamType, RecurringConceptGradualStream, conceptOccurence
    >>> concept_chain = {0: 0, 5000: 1, 10000: 0}
    >>> num_samples = 15000
    >>> # init concept
    >>> concept_0 = conceptOccurence(id = 0, difficulty = 2, noise = 0,
    ...                     appearences = 2, examples_per_appearence = 5000)
    >>> concept_1 = conceptOccurence(id = 1, difficulty = 3, noise = 0,
    ...                     appearences = 1, examples_per_appearence = 5000)
    >>> desc = {0: concept_0, 1: concept_1}
    >>> datastream = RecurringConceptGradualStream(
    ...                     rctype = RCStreamType.STAGGER,
    ...                     num_samples =num_samples,
    ...                     noise = 0,
    ...                     concept_chain = concept_chain,
    ...                     window_size = 1000,
    ...                     seed = 42,
    ...                     desc = desc,
    ...                     boost_first_occurance = False)
    >>> datastream.has_more_samples()
    True
    >>> datastream.get_drift_info()
    {0: 0, 5000: 1, 10000: 0}
    >>> datastream.n_remaining_samples()
    15000
    >>> datastream.get_stream_info()
    {0: 0, 5000: 1, 10000: 0}
    0 - 5000: STAGGERGenerator(balance_classes=False, classification_function=0,
                    random_state=42)
    5000 - 10000: STAGGERGenerator(balance_classes=False, classification_function=1,
                    random_state=43)
    10000 - 15000: STAGGERGenerator(balance_classes=False, classification_function=0,
                    random_state=42)
    >>> datastream.get_moa_stream_info()
    {0: 0, 5000: 1, 10000: 0}
    '(ConceptDriftStream -s (generators.STAGGERGenerator -f 1 -i 42) -d (ConceptDriftStream -s (generators.STAGGERGenerator -f 2 -i 43) -d (generators.STAGGERGenerator -f 1 -i 42) -p 5000 -w 1) -p 5000 -w 1)'
    >>> datastream.get_supplementary_info()
    >>> datastream.next_sample()
    (array([[2., 0., 2.]]), array([0]))
    >>> datastream.n_remaining_samples()
    14999
    >>> datastream.next_sample()
    (array([[2., 0., 0.]]), array([0]))
    >>> datastream.n_remaining_samples()
    14998
    """
    def __init__(self,
                 rctype,
                 num_samples,
                 noise,
                 concept_chain,
                 window_size=1000,
                 seed=None,
                 desc=None,
                 boost_first_occurance=True):
        """Set up the gradual-drift state, then delegate to the parent class.

        See the class docstring for the meaning of each parameter.
        """
        # Drift bookkeeping is initialised before the parent constructor runs.
        # True while samples are drawn from the blended transition stream.
        self.in_drift = False
        # True from the start of a transition until its switch point is hit.
        self.drift_switch = False
        self.window_size = window_size
        # ConceptDriftStream blending two concepts; built when a drift starts.
        self.transition_stream = None
        super().__init__(rctype,
                         num_samples,
                         noise,
                         concept_chain,
                         seed=seed,
                         desc=desc,
                         boost_first_occurance=boost_first_occurance)

    def next_sample(self, batch_size=1):
        """Return the next sample and advance the gradual-drift state.

        Parameters
        ----------
        batch_size: int
            Number of samples to draw. Only 1 is supported; for larger
            values a message is printed and ``None`` is returned.

        Returns
        -------
        The value returned by ``next_sample`` of the current concept's
        stream, or of the transition stream while a drift is in progress.

        Notes
        -----
        ``self.concepts``, ``self.current_concept``, ``self.concept_chain``,
        ``self.example_count`` and ``self.num_samples`` are not assigned in
        this class; presumably the parent constructor sets them
        (TODO confirm).
        """
        if batch_size > 1:
            print("Only batch size of 1 for now")
            return None

        # Outside a drift, draw from the active concept; inside a drift,
        # draw from the stream that blends the old and new concepts.
        if not self.in_drift:
            samples = self.concepts[self.current_concept].next_sample(
                batch_size)
        else:
            samples = self.transition_stream.next_sample(batch_size)

        # Defaults used when no switch point lies on one side of the
        # current position.
        last_switch_point = 0 - self.window_size // 2
        next_switch_point = self.num_samples + self.window_size
        self.example_count += batch_size
        # Find the latest switch point at or before the current count and
        # the first one at or after it. When the count sits exactly on a
        # switch point, both variables refer to that same point.
        for concept_switch_index in sorted(self.concept_chain.keys()):
            if (concept_switch_index <= self.example_count):
                last_switch_point = concept_switch_index
            if concept_switch_index >= self.example_count:
                next_switch_point = concept_switch_index
                break

        # `drifted` is True only on the call that lands on a switch point.
        self.drifted = False
        if not self.in_drift:
            # Start the transition half a window before the next switch point.
            if self.example_count >= next_switch_point - self.window_size // 2:
                self.in_drift = True
                self.drift_switch = True
                # Blend the outgoing concept into the incoming one. The
                # transition stream is fresh, so the drift is positioned
                # half a window into it.
                self.transition_stream = ConceptDriftStream(
                    stream=self.concepts[self.concept_chain[last_switch_point]]
                    .get_datastream(),
                    drift_stream=self.concepts[self.concept_chain[
                        next_switch_point]].get_datastream(),
                    position=self.window_size // 2,
                    width=self.window_size)
                self.transition_stream.prepare_for_use()
        else:
            # At the switch point the new concept becomes the current one.
            if self.example_count == next_switch_point:
                self.current_concept = self.concept_chain[next_switch_point]
                self.drifted = True
                self.drift_switch = False
            # Leave the transition half a window after the switch point,
            # but only once the switch itself has been taken.
            if self.example_count >= (last_switch_point + self.window_size //
                                      2) and not self.drift_switch:
                self.in_drift = False

        return samples
Пример #8
0
class RecurringConceptGradualStream(RecurringConceptStream):
    """A recurring-concept stream whose concept changes happen gradually.

    Around each switch point in ``concept_chain`` the samples are drawn from
    a ``ConceptDriftStream`` built from the outgoing and incoming concepts,
    with ``window_size`` passed as its width, instead of switching abruptly.
    """
    def __init__(self,
                 rctype,
                 num_samples,
                 noise,
                 concept_chain,
                 window_size=1000,
                 seed=None,
                 desc=None,
                 boost_first_occurance=True):
        """Set up the gradual-drift state, then delegate to the parent class.

        ``window_size`` is stored on the instance; every other argument is
        passed through unchanged to ``RecurringConceptStream.__init__``.
        """
        # Drift bookkeeping is initialised before the parent constructor runs.
        # True while samples are drawn from the blended transition stream.
        self.in_drift = False
        # True from the start of a transition until its switch point is hit.
        self.drift_switch = False
        self.window_size = window_size
        # ConceptDriftStream blending two concepts; built when a drift starts.
        self.transition_stream = None
        super().__init__(rctype,
                         num_samples,
                         noise,
                         concept_chain,
                         seed=seed,
                         desc=desc,
                         boost_first_occurance=boost_first_occurance)

    def next_sample(self, batch_size=1):
        """Return the next sample and advance the gradual-drift state.

        Parameters
        ----------
        batch_size: int
            Number of samples to draw. Only 1 is supported; for larger
            values a message is printed and ``None`` is returned.

        Returns
        -------
        The value returned by ``next_sample`` of the current concept's
        stream, or of the transition stream while a drift is in progress.

        Notes
        -----
        ``self.concepts``, ``self.current_concept``, ``self.concept_chain``,
        ``self.example_count`` and ``self.num_samples`` are not assigned in
        this class; presumably the parent constructor sets them
        (TODO confirm).
        """
        if batch_size > 1:
            print("Only batch size of 1 for now")
            return None

        # Outside a drift, draw from the active concept; inside a drift,
        # draw from the stream that blends the old and new concepts.
        if not self.in_drift:
            samples = self.concepts[self.current_concept].next_sample(
                batch_size)
        else:
            samples = self.transition_stream.next_sample(batch_size)

        # Defaults used when no switch point lies on one side of the
        # current position.
        last_switch_point = 0 - self.window_size // 2
        next_switch_point = self.num_samples + self.window_size
        self.example_count += batch_size
        # Find the latest switch point at or before the current count and
        # the first one at or after it. When the count sits exactly on a
        # switch point, both variables refer to that same point.
        for concept_switch_index in sorted(self.concept_chain.keys()):
            if (concept_switch_index <= self.example_count):
                last_switch_point = concept_switch_index
            if concept_switch_index >= self.example_count:
                next_switch_point = concept_switch_index
                break

        # `drifted` is True only on the call that lands on a switch point.
        self.drifted = False
        if not self.in_drift:
            # Start the transition half a window before the next switch point.
            if self.example_count >= next_switch_point - self.window_size // 2:
                # Gradual drift starts here: from the concept at
                # last_switch_point to the concept at next_switch_point.
                self.in_drift = True
                self.drift_switch = True
                # The transition stream is fresh, so the drift is positioned
                # half a window into it.
                self.transition_stream = ConceptDriftStream(
                    stream=self.concepts[self.concept_chain[last_switch_point]]
                    .get_datastream(),
                    drift_stream=self.concepts[self.concept_chain[
                        next_switch_point]].get_datastream(),
                    position=self.window_size // 2,
                    width=self.window_size)
                self.transition_stream.prepare_for_use()
        else:
            # At the switch point the new concept becomes the current one.
            if self.example_count == next_switch_point:
                self.current_concept = self.concept_chain[next_switch_point]
                self.drifted = True
                self.drift_switch = False
            # Leave the transition half a window after the switch point,
            # but only once the switch itself has been taken.
            if self.example_count >= (last_switch_point + self.window_size //
                                      2) and not self.drift_switch:
                # Gradual drift ends here; later samples come from the
                # current concept's own stream again.
                self.in_drift = False

        return samples
    random_state=None,
    alpha=0.0)
# Third stream: AGRAWAL generator with classification function 1 (seed 11)
# drifting to classification function 2 (seed 12), positioned at sample 6000
# with a drift width of 500.
stream_3 = ConceptDriftStream(
    stream=AGRAWALGenerator(balance_classes=False,
                            classification_function=1,
                            perturbation=0.0,
                            random_state=11),
    drift_stream=AGRAWALGenerator(balance_classes=False,
                                  classification_function=2,
                                  perturbation=0.0,
                                  random_state=12),
    position=6000,
    width=500,
    random_state=None,
    alpha=0.0)
# prepare_for_use() is called on every stream before any sample is drawn.
stream_1.prepare_for_use()
stream_2.prepare_for_use()
stream_3.prepare_for_use()

# Sample budget and running counter.
# NOTE(review): presumably consumed by a processing loop further down the
# file; confirm against the code that follows.
instances_num = 10000
instances_counter = 0
# NOTE(review): 'av' presumably selects one of the ensemble variants hinted
# at by the accuracies_3_mv / _av / _goowe lists below; confirm where
# ENSEMBLE_TYPE is read.
ENSEMBLE_TYPE = 'av'

# Lists for storing accuracy values per stream; stream 3 has one list per
# variant (mv, av, goowe).
accuracies_1 = []
accuracies_2 = []
accuracies_3_mv = []
accuracies_3_av = []
accuracies_3_goowe = []

# Feature count taken from stream_1.
num_features = stream_1.n_features