Python HoeffdingTreeClassifier.partial_fit示例，skmultiflow.trees.HoeffdingTreeClassifier.partial_fit Python示例

示例#1

0

显示文件

文件： test_hoeffding_tree.py 项目： sayanddude/scikit-multiflow

def test_hoeffding_tree_categorical_features(test_path):
    """Fit a Hoeffding tree with attributes 0-6 declared nominal and check its description.

    ``test_path`` is the directory that contains
    ``ht_categorical_features_testcase.npy``.
    """
    raw = np.load(os.path.join(test_path, 'ht_categorical_features_testcase.npy'))
    # The two trailing columns are regression targets and are discarded; of
    # what remains, the last column is the class label.
    classification_part = raw[:, :-2]
    X = classification_part[:, :-1]
    y = classification_part[:, -1]

    learner = HoeffdingTreeClassifier(nominal_attributes=list(range(7)))
    learner.partial_fit(X, y, classes=np.unique(y))

    expected_description = (
        "if Attribute 0 = -15.0:\n"
        "  Leaf = Class 2 | {2: 350.0}\n"
        "if Attribute 0 = 0.0:\n"
        "  Leaf = Class 0 | {0: 420.0, 1: 252.0}\n"
        "if Attribute 0 = 1.0:\n"
        "  Leaf = Class 1 | {0: 312.0, 1: 332.0}\n"
        "if Attribute 0 = 2.0:\n"
        "  Leaf = Class 1 | {0: 236.0, 1: 383.0}\n"
        "if Attribute 0 = 3.0:\n"
        "  Leaf = Class 1 | {0: 168.0, 1: 459.0}\n"
        "if Attribute 0 = -30.0:\n"
        "  Leaf = Class 3.0 | {3.0: 46.0, 4.0: 42.0}\n"
    )

    assert learner.get_model_description() == expected_description

示例#2

0

显示文件

文件： test_hoeffding_tree.py 项目： houcembenmakhlouf/GHVFDT

def test_hoeffding_tree_coverage():
    """Cover memory management: a size-capped tree must stay within its budget."""
    size_limit_kb = 50
    n_samples = 5000
    generator_settings = dict(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=10,
        n_cat_features=2,
        n_num_features=5,
        n_categories_per_cat_feature=5,
        max_tree_depth=15,
        min_leaf_depth=3,
        fraction_leaves_per_level=0.15,
    )
    stream = RandomTreeGenerator(**generator_settings)

    # Every attribute from index 5 onwards is declared nominal.
    nominal_attr_idx = list(range(5, stream.n_features))
    # Unconstrained model has over 72 kB
    learner = HoeffdingTreeClassifier(
        nominal_attributes=nominal_attr_idx,
        leaf_prediction='mc',
        memory_estimate_period=100,
        max_byte_size=size_limit_kb * 2**10,
    )

    X, y = stream.next_sample(n_samples)
    learner.partial_fit(X, y)

    assert calculate_object_size(learner, 'kB') <= size_limit_kb

    learner.reset()

示例#3

0

显示文件

文件： test_hoeffding_tree.py 项目： sayanddude/scikit-multiflow

def test_hoeffding_tree_model_information():
    """Check the measurements and the textual description of a tree fitted on SEA data."""
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = stream.next_sample(5000)

    learner = HoeffdingTreeClassifier(
        nominal_attributes=list(range(5, stream.n_features)))
    learner.partial_fit(X, y, classes=stream.target_values)

    expected_info = {
        'Tree size (nodes)': 5,
        'Tree size (leaves)': 3,
        'Active learning nodes': 3,
        'Tree depth': 2,
        'Active leaf byte size estimate': 0.0,
        'Inactive leaf byte size estimate': 0.0,
        'Byte size estimate overhead': 1.0
    }

    # Accessed as an attribute (no call), then compared key by key.
    observed_info = learner.get_model_measurements
    for key, expected_value in expected_info.items():
        assert key in observed_info
        assert expected_value == observed_info[key]

    expected_description = (
        "if Attribute 0 <= 4.549969620513424:\n"
        "  if Attribute 1 <= 5.440182925299016:\n"
        "    Leaf = Class 0 | {0: 345.54817975126275, 1: 44.43855503614928}\n"
        "  if Attribute 1 > 5.440182925299016:\n"
        "    Leaf = Class 1 | {0: 54.451820248737235, 1: 268.5614449638507}\n"
        "if Attribute 0 > 4.549969620513424:\n"
        "  Leaf = Class 1 | {0: 390.5845685762964, 1: 2372.3747376855454}\n"
    )

    assert expected_description == learner.get_model_description()

示例#4

0

显示文件

文件： test_hoeffding_tree.py 项目： sayanddude/scikit-multiflow

def test_hoeffding_tree_coverage():
    """Smoke test: run the tree under a tiny byte budget, then on all-nominal data."""
    # Cover memory management
    sea_stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = sea_stream.next_sample(5000)

    constrained_settings = dict(
        max_byte_size=30,
        memory_estimate_period=100,
        grace_period=10,
        leaf_prediction='mc',
    )
    learner = HoeffdingTreeClassifier(**constrained_settings)
    learner.partial_fit(X, y, classes=sea_stream.target_values)
    learner.reset()

    # Cover nominal attribute observer
    tree_stream = RandomTreeGenerator(
        tree_random_state=1,
        sample_random_state=1,
        n_num_features=0,
        n_categories_per_cat_feature=2,
    )
    X, y = tree_stream.next_sample(1000)
    learner = HoeffdingTreeClassifier(
        leaf_prediction='mc',
        nominal_attributes=list(range(10)),
    )
    learner.partial_fit(X, y, classes=tree_stream.target_values)

示例#5

0

显示文件

文件： test_hoeffding_tree.py 项目： houcembenmakhlouf/GHVFDT

def test_hoeffding_tree_nb(test_path):
    """Check the predictions of a Hoeffding tree with ``leaf_prediction='nb'``.

    The learner is trained one sample at a time (test-then-train). Every
    ``wait_samples`` samples the incoming sample is predicted *before* it is
    learned, and the collected predictions are compared with the recorded
    ones. ``test_path`` is accepted but not used by this test.
    """
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=4,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=6,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)

    nominal_attr_idx = [x for x in range(5, stream.n_features)]
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx,
                                      leaf_prediction='nb')

    max_samples = 5000
    predictions = array('i')
    proba_predictions = []
    wait_samples = 100

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (never on the very first one, when the model
        # has not seen any data yet).
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        0, 1, 3, 0, 0, 3, 0, 1, 1, 2, 0, 2, 1, 1, 2, 1, 3, 0, 1, 1, 1, 1, 0, 3,
        1, 2, 1, 1, 3, 2, 1, 2, 2, 2, 1, 1, 1, 0, 1, 2, 0, 2, 0, 0, 0, 0, 1, 3,
        2
    ])

    # np.alltrue was deprecated and later removed from NumPy (2.0); np.all is
    # the supported spelling.
    assert np.all(predictions == expected_predictions)

    expected_info = "HoeffdingTreeClassifier(binary_split=False, grace_period=200, leaf_prediction='nb', " \
                    "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, " \
                    "nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14], remove_poor_atts=False, " \
                    "split_confidence=1e-07, split_criterion='info_gain', stop_mem_management=False, " \
                    "tie_threshold=0.05)"
    # Collapse all whitespace runs so that line wrapping in get_info() does
    # not affect the comparison.
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

示例#6

0

显示文件

文件： learner.py 项目： Vini7x/meta_act

def get_error_hoeffdingtree(data, pre_train_size, **hf_kwargs):
    """Evaluate a Hoeffding tree on ``data`` in a test-then-train loop.

    The last column of ``data`` is used as the integer class label and all
    other columns as features. The first ``pre_train_size`` samples are used
    for pre-training only; every later sample is first predicted and then
    learned. ``hf_kwargs`` are forwarded to ``HoeffdingTreeClassifier``.

    Returns a list with one entry per evaluated sample: 1 when the prediction
    matched the label, 0 otherwise.
    """
    features = data[:, :-1]
    labels = data[:, -1].astype(int)
    stream = DataStream(features, labels)
    hf = HoeffdingTreeClassifier(**hf_kwargs)

    # Pre-train
    warmup_X, warmup_y = stream.next_sample(pre_train_size)
    hf.partial_fit(warmup_X, warmup_y, classes=stream.target_values)

    outcomes = []
    while stream.has_more_samples():
        X, y = stream.next_sample()

        # Evaluate on the sample before the model is allowed to learn from it.
        predicted = hf.predict(X)
        outcomes.append(int(predicted[0] == y[0]))

        # Train
        hf.partial_fit(X, y, classes=stream.target_values)

    return outcomes

示例#7

0

显示文件

# Name the columns of the electricity data frame.
elec_df.columns = [
    'date', 'day', 'period', 'nswprice', 'nswdemand',
    'vicprice', 'vicdemand', 'transfer', 'class',
]
# Replace the string day codes "1".."7" by integers and the class labels
# by 0 (UP) / 1 (DOWN).
mapping = {
    "day": {str(day): day for day in range(1, 8)},
    "class": {"UP": 0, "DOWN": 1},
}
elec_df = elec_df.replace(mapping)

# Repeat the data set 200 times to obtain a longer stream.
elec_full_df = pandas.concat([elec_df] * 200)

STREAM_SIZE = elec_full_df.shape[0]

elec_stream = DataStream(elec_full_df, name="elec")
elec_stream.prepare_for_use()

# Initial training batch for the classifier.
X_train, y_train = elec_stream.next_sample(TRAINING_SIZE)

ht = HoeffdingTreeClassifier()
ht.partial_fit(X_train, y_train)

n_global = ignore + TRAINING_SIZE  # Cumulative Number of observations

# Bookkeeping for the "ddm" run -- presumably a drift detection method;
# confirm against the rest of the script.
d_ddm = w_ddm = 0
TP_ddm, FP_ddm, RT_ddm, DIST_ddm, mem_ddm = [], [], [], [], []
retrain = False
grace_end = detect_end = n_global
mine_pr, mine_std, mine_alpha = [], [], []

示例#8

0

显示文件

文件： test_hoeffding_tree.py 项目： sayanddude/scikit-multiflow

def test_hoeffding_tree_nba(test_path):
    """Check predictions, model info, description and rules of a default Hoeffding tree.

    The learner is trained one sample at a time (test-then-train). Every
    ``wait_samples`` samples the incoming sample is predicted *before* it is
    learned. Class predictions are compared with recorded values and the
    probabilities with ``test_hoeffding_tree.npy`` found in ``test_path``.
    Afterwards the split criterion is switched to ``'hellinger'``, 20000 more
    samples are learned and the extracted rules are checked.
    """
    stream = RandomTreeGenerator(tree_random_state=23, sample_random_state=12, n_classes=4, n_cat_features=2,
                                 n_num_features=5, n_categories_per_cat_feature=5, max_tree_depth=6, min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)

    nominal_attr_idx = [x for x in range(5, stream.n_features)]
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    max_samples = 5000
    predictions = array('i')
    proba_predictions = []
    wait_samples = 100

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (never on the very first one, when the model
        # has not seen any data yet).
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [0, 1, 3, 0, 0, 3, 0, 1, 1, 2,
                                       0, 2, 1, 1, 2, 1, 3, 0, 1, 1,
                                       1, 1, 0, 3, 1, 2, 1, 1, 3, 2,
                                       1, 2, 2, 2, 1, 1, 1, 0, 1, 2,
                                       0, 2, 0, 0, 0, 0, 1, 3, 2])

    test_file = os.path.join(test_path, 'test_hoeffding_tree.npy')

    data = np.load(test_file)

    # np.alltrue was deprecated and later removed from NumPy (2.0); np.all is
    # the supported spelling.
    assert np.all(predictions == expected_predictions)
    assert np.allclose(proba_predictions, data)

    expected_info = "HoeffdingTreeClassifier(binary_split=False, grace_period=200, leaf_prediction='nba', " \
                    "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, " \
                    "nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14], remove_poor_atts=False, " \
                    "split_confidence=1e-07, split_criterion='info_gain', stop_mem_management=False, " \
                    "tie_threshold=0.05)"
    # Collapse all whitespace runs so that line wrapping in get_info() does
    # not affect the comparison.
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 1423.0, 1.0: 1745.0, 2.0: 978.0, 3.0: 854.0}\n'

    assert (learner.get_model_description() == expected_model_1)
    # X still holds the last sample drawn in the loop above.
    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)

    # Continue training with a different split criterion and check the rules.
    X, y = stream.next_sample(20000)
    learner.split_criterion = 'hellinger'
    learner.partial_fit(X, y)

    expected_rules = 'Att (5) == 0.000 and Att (12) == 0.000 | class: 1\n' + \
        'Att (5) == 0.000 and Att (12) == 1.000 | class: 1\n' + \
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) <= 0.550 and Att (3) <= 0.730 | class: 0\n' +\
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) <= 0.550 and Att (3) > 0.730 | class: 2\n' + \
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) <= 0.800 | class: 0\n' + \
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) > 0.800 and Att (14) == 0.000' \
        ' | class: 0\n' + \
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) > 0.800 and Att (14) == 1.000' \
        ' | class: 1\n' + \
        'Att (5) == 1.000 and Att (13) == 1.000 and Att (3) <= 0.730 | class: 1\n' + \
        'Att (5) == 1.000 and Att (13) == 1.000 and Att (3) > 0.730 | class: 0\n'
    assert expected_rules == learner.get_rules_description()

示例#9

0

显示文件

文件： audio_train.py 项目： david-richter42/Hoeffding-Tree-Training

y shape: (len(X) // 3, 1)
For the targets, it has to be a numpy array of single-element integer numpy arrays.
Example y sample: np.array([0.5])
"""

import os
import numpy as np

from skmultiflow.trees import HoeffdingTreeClassifier

# Random data: 30000 values grouped into samples of shape (1, 3), each
# paired with a random 0/1 target of shape (1,).
num_samples = 30000
samples = np.random.rand(num_samples).reshape(num_samples // 3, 1, 3)
targets = np.random.randint(2, size=len(samples)).reshape(len(samples), 1)

ht = HoeffdingTreeClassifier()

# Test-then-train: predict each sample first, then learn from it.
correct = 0
max_samples = num_samples // 3
for i in range(max_samples):
    X = samples[i]
    y = targets[i]
    y_pred = ht.predict(X)
    correct += int(y[0] == y_pred[0])
    ht = ht.partial_fit(X, y)

    # This is just here so I knew it was working
    # print(f"processed sample: {i}")

print(f"Hoeffding Tree accuracy: {correct / len(samples)}")

示例#10

0

显示文件

class TransitionModel:
    """
    Online model that predicts the next state from rolling sensor statistics.

    !!! It assumes fully imputed and evenly timed data. !!!

    Data received must be DataFrame with raw measurements and column 'label'. Timestamp must be index of DataFrame and
    it must be in same format as for stream story. Label is an integer denoting state of the measurement. Function fit
    from StateGraph returns data in right format.

    For each feature it calculates average and delta (slope of the least squares linear fit) on last window_size
    measurements. Every time before model is fitted, it is tested on the input.

    Attributes:
        window_size (int): Size of window for rolling average and delta
        model (HoeffdingTreeClassifier): The incrementally trained classifier
        history (DataFrame of shape (window_size, n_features)): Last window_size rows of data
        accuracy (float): Accuracy of the model; None until at least one prediction was evaluated
        predictions (int): Number of predictions evaluated so far
        correct_predictions (int): Number of those predictions that were correct
    """
    def __init__(self, window_size):
        self.window_size = window_size
        self.model = HoeffdingTreeClassifier()
        self.history = None
        self.accuracy = None
        # Number of all predictions and correct predictions for calculating accuracy
        self.predictions = 0
        self.correct_predictions = 0

    def delta(self, y: pd.Series) -> np.float64:
        """ Return slope of least squares linear fit.

        The values in y are fitted against their positions 0, 1, ..., len(y) - 1. """
        x = np.arange(len(y))
        return np.polyfit(x, y, 1)[0]

    def check_data(self):
        """ Placeholder; currently does nothing. """
        pass

    def prepare_data(self,
                     data: pd.DataFrame,
                     drop_last_row: bool = True,
                     use_history: bool = True) -> pd.DataFrame:
        """ Take raw data and return data stream with running average and running delta. For the last row there is no
            next state, so drop_last_row should be True for learning, but False for predicting. If use_history is set
            to true function will add history to the data and update history.

            The result has columns '<feature>_mean' and '<feature>_delta' for every non-label column, followed by
            'current_state' and 'next_state'. The first window_size - 1 rows are removed because their rolling
            window is incomplete."""
        if use_history:
            data = pd.concat([self.history, data])
            # Update self.history to have last self.window_size measurements
            self.history = data.tail(self.window_size)

        sensor_values = data.drop(columns='label')
        labels = data['label']

        prepared_data = pd.DataFrame()
        for col in sensor_values.columns:
            rolling_window = sensor_values[col].rolling(window=self.window_size)
            prepared_data[col + '_mean'] = rolling_window.mean()
            prepared_data[col + '_delta'] = rolling_window.apply(self.delta,
                                                                 raw=True)
        prepared_data['current_state'] = labels
        # The next state of a row is the label of the following row; the last row gets the placeholder -1.
        prepared_data['next_state'] = labels.shift(periods=-1, fill_value=-1)

        # Drop last row, because we don't know next state yet
        if drop_last_row:
            prepared_data.drop(prepared_data.tail(1).index, inplace=True)
        # Drop the leading rows whose rolling window is not full yet
        prepared_data.drop(prepared_data.head(self.window_size - 1).index,
                           inplace=True)
        return prepared_data

    def partial_fit(self, data: pd.DataFrame) -> None:
        """ Test the model on every prepared sample, then train on it, and update the accuracy.

        The most basic working version for now.
        TODO: improve, maybe add possibility to learn in batches? """
        # NOTE(review): presumably DataStream takes the last column ('next_state') as target - confirm.
        stream = DataStream(self.prepare_data(data))
        n = stream.n_remaining_samples()
        for _ in range(n):
            x, y = stream.next_sample()
            if self.model.predict(x)[0] == y[0]:
                self.correct_predictions += 1
            self.model.partial_fit(x, y)
        self.predictions += n
        # Without any evaluated prediction there is nothing to divide by; keep the previous accuracy
        # (None on a fresh model) instead of raising ZeroDivisionError.
        if self.predictions > 0:
            self.accuracy = self.correct_predictions / self.predictions

    def predict(self,
                data: pd.DataFrame = pd.DataFrame(),
                use_history: bool = True) -> List[int]:
        """ Argument data is a DataFrame with shape (n_samples, n_features).
        use_history tells whether or not history will be included in data before making prediction. If use_history is
        set to true, function will make n_samples+1 predictions, otherwise n_samples-window_size+1 predictions.

        Raises RuntimeError when use_history is False and data has fewer than window_size rows."""
        # The default DataFrame is created once, at definition time. That is safe here because neither this
        # method nor prepare_data modifies the data argument in place.
        if not use_history and data.shape[0] < self.window_size:
            raise RuntimeError("Not enough measurements to make a prediction.")
        prepared_data = self.prepare_data(data,
                                          drop_last_row=False,
                                          use_history=use_history)
        # 'next_state' is the target column and must not be fed to the model as a feature
        prepared_data.drop(columns="next_state", inplace=True)
        return self.model.predict(prepared_data.values)

    def save_model(self):
        """ Placeholder; currently does nothing. """
        pass

示例#11

0

显示文件

class OnlineTrainingServer(
        sea_generator_rpcdesign_pb2_grpc.SEAOnlineTrainingServicer):
    """gRPC servicer that trains, queries and persists one online Hoeffding tree.

    Events arrive as raw bytes in ``request.ndarray`` and are decoded with
    ``np.frombuffer`` using its default dtype.
    """

    def __init__(self):
        self._model = HoeffdingTreeClassifier()
        # Number of events learned so far and how many of them were predicted
        # correctly before being learned.
        self._num_event_accumulation = 0
        self._correct_pred = 0

    def test_connection(self, request, context):
        """Print the client's test string and answer with a fixed message."""
        inputTestingRequestString = request.testString
        print('[gRPC Server] connection testing, request message: ',
              inputTestingRequestString)
        return sea_generator_rpcdesign_pb2.testConnectString(
            testString="Message send back to Client")

    def learn_one(self, request, context):
        """Predict one labelled event, then train the model on it once.

        The decoded buffer holds the feature values followed by the label as
        its last element. The running accuracy is based on the prediction
        made *before* the event is learned.

        :raises ValueError: if the payload decodes to an empty array.
        """
        decode_data = np.frombuffer(request.ndarray)
        if decode_data.size == 0:
            raise ValueError("Incoming event is empty, nothing to learn.")

        self._num_event_accumulation += 1

        # Copy into fresh arrays: all but the last value are the features
        # (shape (1, n_features)), the last value is the label, cast to int64.
        dummy_data = np.array(decode_data[:-1], dtype=np.float64).reshape(
            1, decode_data.size - 1)
        dummy_label = np.array([decode_data[-1]]).astype(np.int64)

        print("check out type")
        print(type(dummy_data[0]))

        prior_y_pred = self._model.predict(dummy_data)
        if prior_y_pred[0] == dummy_label[0]:
            self._correct_pred += 1

        # Train exactly once per event. (A second, identical partial_fit call
        # used to follow the log output and made every event count twice.)
        self._model.partial_fit(dummy_data, dummy_label)
        post_y_pred = self._model.predict(dummy_data)
        print(
            "Update model with event: ", str(dummy_data), "Ground Truth:",
            str(dummy_label), "  prior pred: ", str(prior_y_pred),
            "  post pred: ", str(post_y_pred), "  Accuracy: ",
            '{:.2f}'.format(self._correct_pred / self._num_event_accumulation))

        return sea_generator_rpcdesign_pb2.learnOneReply(
            isLearnOneSuccess=True)

    def predict_one(self, request, context):
        """
        prediction rpc, the API provided for client to invoke the model which hold by model sever

        :param request: message whose ``ndarray`` field holds the feature values as bytes
        :param context: gRPC context (unused)
        :return: reply carrying the model's prediction in ``y_pred``
        :raises ValueError: if the model cannot predict on the incoming data
        """
        decode_data = np.frombuffer(request.ndarray)
        dummy_data = np.array(decode_data, dtype=np.float64).reshape(
            1, decode_data.size)

        try:
            y_pred = self._model.predict(dummy_data)
        except Exception as err:
            # Report the offending input (the prediction does not exist yet)
            # and keep the original error as the cause.
            raise ValueError(
                "Incoming data {} is not fit with model. ".format(
                    dummy_data)) from err

        # NOTE(review): y_pred is whatever model.predict returns - confirm it
        # matches the type of the y_pred field in predictOneReply.
        return sea_generator_rpcdesign_pb2.predictOneReply(
            isPredictSuccess=True, y_pred=y_pred)

    def flush_model(self, request, context):
        """Pickle the current model to ``request.modelPersistDir`` or, if empty, to the default path."""
        flush_dir = "../modelPersist/online_hoeffding_tree_persist.pkl"

        request_model_persist_dir = request.modelPersistDir
        if len(request_model_persist_dir) > 0:
            flush_dir = request_model_persist_dir

        is_successful = False
        print("check out the flush directory: ", flush_dir)
        with open(flush_dir, 'wb') as flush_model_processor:
            pickle.dump(self._model, flush_model_processor,
                        pickle.HIGHEST_PROTOCOL)
            is_successful = True

        return sea_generator_rpcdesign_pb2.flushModelReply(
            isSuccess=is_successful)

    def extract_model(self, request, context):
        """Replace the current model by one unpickled from ``request.modelExtractionDir`` or the default path."""
        extract_dir = "../modelPersist/online_hoeffding_tree_persist.pkl"

        request_model_extract_dir = request.modelExtractionDir
        if len(request_model_extract_dir) > 0:
            extract_dir = request_model_extract_dir

        is_successful = False
        print("check out the reading directory: ", extract_dir)
        # SECURITY: pickle.load can execute arbitrary code, and the path is
        # chosen by the client. Only expose this RPC to trusted clients and
        # only load files this server wrote itself.
        with open(extract_dir, 'rb') as read_model_processor:
            self._model = pickle.load(read_model_processor)
            is_successful = True

        return sea_generator_rpcdesign_pb2.extractModelReply(
            isSuccess=is_successful)