def test_hoeffding_tree_categorical_features(test_path):
    """Fit a Hoeffding tree on all-nominal features and check the printed model.

    ``test_path`` is the directory that contains
    ``ht_categorical_features_testcase.npy``.
    """
    raw = np.load(os.path.join(test_path, 'ht_categorical_features_testcase.npy'))

    # The two trailing columns are regression targets; discard them.
    raw = raw[:, :-2]
    # What is left is the feature matrix followed by one class-label column.
    X = raw[:, :-1]
    y = raw[:, -1]

    # Attributes 0..6 are declared nominal.
    learner = HoeffdingTreeClassifier(nominal_attributes=list(range(7)))
    learner.partial_fit(X, y, classes=np.unique(y))

    # NOTE(review): the comparison below is exact, so the whitespace inside
    # these literals must match get_model_description() character by character.
    expected_description = (
        "if Attribute 0 = -15.0:\n"
        " Leaf = Class 2 | {2: 350.0}\n"
        "if Attribute 0 = 0.0:\n"
        " Leaf = Class 0 | {0: 420.0, 1: 252.0}\n"
        "if Attribute 0 = 1.0:\n"
        " Leaf = Class 1 | {0: 312.0, 1: 332.0}\n"
        "if Attribute 0 = 2.0:\n"
        " Leaf = Class 1 | {0: 236.0, 1: 383.0}\n"
        "if Attribute 0 = 3.0:\n"
        " Leaf = Class 1 | {0: 168.0, 1: 459.0}\n"
        "if Attribute 0 = -30.0:\n"
        " Leaf = Class 3.0 | {3.0: 46.0, 4.0: 42.0}\n"
    )
    assert learner.get_model_description() == expected_description
def test_hoeffding_tree_coverage():
    """Exercise memory management: the fitted tree must respect its byte budget."""
    max_samples = 5000
    max_size_kb = 50

    stream = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=10,
        n_cat_features=2,
        n_num_features=5,
        n_categories_per_cat_feature=5,
        max_tree_depth=15,
        min_leaf_depth=3,
        fraction_leaves_per_level=0.15,
    )

    # Every feature from index 5 onwards is treated as nominal.
    nominal_attr_idx = list(range(5, stream.n_features))

    # Without the cap the model grows past 72 kB, so the limit is really hit.
    learner = HoeffdingTreeClassifier(
        nominal_attributes=nominal_attr_idx,
        leaf_prediction='mc',
        memory_estimate_period=100,
        max_byte_size=max_size_kb * 2**10,
    )

    X, y = stream.next_sample(max_samples)
    learner.partial_fit(X, y)

    assert calculate_object_size(learner, 'kB') <= max_size_kb

    learner.reset()
def test_hoeffding_tree_model_information():
    """Check the reported tree measurements and textual model on a SEA stream."""
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = stream.next_sample(5000)

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)
    learner.partial_fit(X, y, classes=stream.target_values)

    expected_info = {
        'Tree size (nodes)': 5,
        'Tree size (leaves)': 3,
        'Active learning nodes': 3,
        'Tree depth': 2,
        'Active leaf byte size estimate': 0.0,
        'Inactive leaf byte size estimate': 0.0,
        'Byte size estimate overhead': 1.0,
    }

    # Read as an attribute (no call) and then used like a mapping.
    # NOTE(review): presumably a property on the classifier - confirm against
    # the installed scikit-multiflow version.
    observed_info = learner.get_model_measurements
    for key, expected_value in expected_info.items():
        assert key in observed_info
        assert expected_value == observed_info[key]

    # NOTE(review): compared exactly, so the leading whitespace of each line
    # must match get_model_description() character by character.
    expected_description = (
        "if Attribute 0 <= 4.549969620513424:\n"
        " if Attribute 1 <= 5.440182925299016:\n"
        " Leaf = Class 0 | {0: 345.54817975126275, 1: 44.43855503614928}\n"
        " if Attribute 1 > 5.440182925299016:\n"
        " Leaf = Class 1 | {0: 54.451820248737235, 1: 268.5614449638507}\n"
        "if Attribute 0 > 4.549969620513424:\n"
        " Leaf = Class 1 | {0: 390.5845685762964, 1: 2372.3747376855454}\n"
    )
    assert expected_description == learner.get_model_description()
def test_hoeffding_tree_coverage():
    """Smoke-test two code paths: memory management and nominal observers.

    Nothing is asserted; the test passes when both fits run without raising.
    """
    # --- Memory management: a 30-byte budget with frequent size estimates ---
    sea_stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = sea_stream.next_sample(5000)

    learner = HoeffdingTreeClassifier(
        max_byte_size=30,
        memory_estimate_period=100,
        grace_period=10,
        leaf_prediction='mc',
    )
    learner.partial_fit(X, y, classes=sea_stream.target_values)
    learner.reset()

    # --- Nominal attribute observer: no numeric features at all ---
    tree_stream = RandomTreeGenerator(
        tree_random_state=1,
        sample_random_state=1,
        n_num_features=0,
        n_categories_per_cat_feature=2,
    )
    X, y = tree_stream.next_sample(1000)

    learner = HoeffdingTreeClassifier(
        leaf_prediction='mc',
        nominal_attributes=list(range(10)),
    )
    learner.partial_fit(X, y, classes=tree_stream.target_values)
def test_hoeffding_tree_nb(test_path):
    """Check Naive Bayes leaf predictions of a Hoeffding tree.

    The learner is trained one sample at a time on a random-tree stream and
    is queried every 100 samples (before training on that sample). The
    collected predictions and the learner's ``get_info()`` text are compared
    with stored expectations.

    ``test_path`` is part of the signature but is not read by this test.
    """
    stream = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=4,
        n_cat_features=2,
        n_num_features=5,
        n_categories_per_cat_feature=5,
        max_tree_depth=6,
        min_leaf_depth=3,
        fraction_leaves_per_level=0.15,
    )

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx,
                                      leaf_prediction='nb')

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (never on the very first one).
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            # Called for coverage only; the probabilities are not asserted here.
            learner.predict_proba(X)
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        0, 1, 3, 0, 0, 3, 0, 1, 1, 2,
        0, 2, 1, 1, 2, 1, 3, 0, 1, 1,
        1, 1, 0, 3, 1, 2, 1, 1, 3, 2,
        1, 2, 2, 2, 1, 1, 1, 0, 1, 2,
        0, 2, 0, 0, 0, 0, 1, 3, 2,
    ])
    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(predictions == expected_predictions)

    expected_info = (
        "HoeffdingTreeClassifier(binary_split=False, grace_period=200, leaf_prediction='nb', "
        "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, "
        "nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14], remove_poor_atts=False, "
        "split_confidence=1e-07, split_criterion='info_gain', stop_mem_management=False, "
        "tie_threshold=0.05)"
    )
    # Collapse every run of whitespace to one space; split() already strips
    # each token, so no extra strip() is needed.
    info = " ".join(learner.get_info().split())
    assert info == expected_info
def get_error_hoeffdingtree(data, pre_train_size, **hf_kwargs):
    """Run a test-then-train evaluation of a Hoeffding tree over ``data``.

    Parameters
    ----------
    data
        2-D array; all columns but the last are features, the last column is
        the label (cast to ``int``).
    pre_train_size
        Number of leading samples used to pre-train the model before any
        evaluation happens.
    **hf_kwargs
        Forwarded unchanged to ``HoeffdingTreeClassifier``.

    Returns
    -------
    list of int
        One entry per sample after the pre-training block: ``1`` when the
        prediction made before training on that sample matched its label,
        ``0`` otherwise.
    """
    features = data[:, :-1]
    labels = data[:, -1].astype(int)

    stream = DataStream(features, labels)
    model = HoeffdingTreeClassifier(**hf_kwargs)

    # Warm the model up on the first block of samples.
    warm_X, warm_y = stream.next_sample(pre_train_size)
    model.partial_fit(warm_X, warm_y, classes=stream.target_values)

    hits = []
    while stream.has_more_samples():
        X, y = stream.next_sample()
        # Predict first, so the sample is still unseen when it is scored ...
        predicted = model.predict(X)
        hits.append(int(predicted[0] == y[0]))
        # ... and only then learn from it.
        model.partial_fit(X, y, classes=stream.target_values)

    return hits
# Script fragment: prepare the electricity data frame as a stream, pre-train a
# Hoeffding tree on it, and initialise the bookkeeping used further down.
# `elec_df`, `pandas`, `TRAINING_SIZE` and `ignore` are defined outside this
# fragment.

# Give the columns explicit names.
elec_df.columns = ['date', 'day', 'period', 'nswprice', 'nswdemand', 'vicprice', 'vicdemand', 'transfer', 'class']

# Recode the string day numbers as ints and the UP/DOWN class as 0/1.
mapping = {"day":{"1":1, "2":2, "3":3, "4":4, "5":5, "6":6, "7":7}, "class": {"UP": 0, "DOWN": 1}}
elec_df = elec_df.replace(mapping)

# Stack 200 copies of the frame on top of each other to get a longer stream.
elec_full_df = pandas.concat([elec_df] * 200)
STREAM_SIZE = elec_full_df.shape[0]

elec_stream = DataStream(elec_full_df, name="elec")
elec_stream.prepare_for_use()

# Pre-train the classifier on one batch of TRAINING_SIZE samples.
X_train, y_train = elec_stream.next_sample(TRAINING_SIZE)
ht = HoeffdingTreeClassifier()
ht.partial_fit(X_train, y_train)

n_global = ignore + TRAINING_SIZE # Cumulative Number of observations

# Counters and result lists with the `_ddm` suffix.
# NOTE(review): presumably DDM is the drift detection method and TP/FP/RT/DIST
# are true positives, false positives, run time and distance - confirm against
# the code that fills them.
d_ddm = 0
w_ddm = 0
TP_ddm = []
FP_ddm = []
RT_ddm = []
DIST_ddm = []
mem_ddm = []

retrain = False
# Both windows start at the current observation count.
grace_end = n_global
detect_end = n_global

# NOTE(review): these look like per-step statistics (pr, std, alpha) collected
# later in the script - verify where they are appended.
mine_pr = []
mine_std = []
mine_alpha = []
def test_hoeffding_tree_nba(test_path):
    """Check a Hoeffding tree built with the default leaf prediction.

    The learner is trained one sample at a time and queried every 100
    samples. Class predictions are compared with a hard-coded list and the
    class probabilities with ``test_hoeffding_tree.npy`` from ``test_path``.
    The test then checks the ``get_info()`` text, the model description and
    the return types of ``predict`` / ``predict_proba``. Finally it switches
    the split criterion to ``'hellinger'``, trains on 20000 more samples and
    checks the extracted rules.
    """
    stream = RandomTreeGenerator(
        tree_random_state=23,
        sample_random_state=12,
        n_classes=4,
        n_cat_features=2,
        n_num_features=5,
        n_categories_per_cat_feature=5,
        max_tree_depth=6,
        min_leaf_depth=3,
        fraction_leaves_per_level=0.15,
    )

    nominal_attr_idx = list(range(5, stream.n_features))
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx)

    max_samples = 5000
    wait_samples = 100
    predictions = array('i')
    proba_predictions = []

    for cnt in range(max_samples):
        X, y = stream.next_sample()
        # Test every n samples (never on the very first one).
        if cnt != 0 and cnt % wait_samples == 0:
            predictions.append(learner.predict(X)[0])
            proba_predictions.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y)

    expected_predictions = array('i', [
        0, 1, 3, 0, 0, 3, 0, 1, 1, 2,
        0, 2, 1, 1, 2, 1, 3, 0, 1, 1,
        1, 1, 0, 3, 1, 2, 1, 1, 3, 2,
        1, 2, 2, 2, 1, 1, 1, 0, 1, 2,
        0, 2, 0, 0, 0, 0, 1, 3, 2,
    ])

    test_file = os.path.join(test_path, 'test_hoeffding_tree.npy')
    data = np.load(test_file)

    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(predictions == expected_predictions)
    assert np.allclose(proba_predictions, data)

    expected_info = (
        "HoeffdingTreeClassifier(binary_split=False, grace_period=200, leaf_prediction='nba', "
        "max_byte_size=33554432, memory_estimate_period=1000000, nb_threshold=0, no_preprune=False, "
        "nominal_attributes=[5, 6, 7, 8, 9, 10, 11, 12, 13, 14], remove_poor_atts=False, "
        "split_confidence=1e-07, split_criterion='info_gain', stop_mem_management=False, "
        "tie_threshold=0.05)"
    )
    # Collapse every run of whitespace to one space; split() already strips
    # each token, so no extra strip() is needed.
    info = " ".join(learner.get_info().split())
    assert info == expected_info

    expected_model_1 = 'Leaf = Class 1.0 | {0.0: 1423.0, 1.0: 1745.0, 2.0: 978.0, 3.0: 854.0}\n'
    assert learner.get_model_description() == expected_model_1

    # X is the last sample drawn in the loop above.
    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)

    # Continue training with a different split criterion and check the rules.
    X, y = stream.next_sample(20000)
    learner.split_criterion = 'hellinger'
    learner.partial_fit(X, y)

    expected_rules = (
        'Att (5) == 0.000 and Att (12) == 0.000 | class: 1\n'
        'Att (5) == 0.000 and Att (12) == 1.000 | class: 1\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) <= 0.550 and Att (3) <= 0.730 | class: 0\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) <= 0.550 and Att (3) > 0.730 | class: 2\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) <= 0.800 | class: 0\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) > 0.800 and Att (14) == 0.000'
        ' | class: 0\n'
        'Att (5) == 1.000 and Att (13) == 0.000 and Att (1) > 0.550 and Att (1) > 0.800 and Att (14) == 1.000'
        ' | class: 1\n'
        'Att (5) == 1.000 and Att (13) == 1.000 and Att (3) <= 0.730 | class: 1\n'
        'Att (5) == 1.000 and Att (13) == 1.000 and Att (3) > 0.730 | class: 0\n'
    )
    assert expected_rules == learner.get_rules_description()
y shape: (len(X) // 3, 1) For the targets, it has to be a numpy array of single-element integer numpy arrays. Example y sample: np.array([0.5]) """ import os import numpy as np from skmultiflow.trees import HoeffdingTreeClassifier num_samples = 30000 samples = np.random.rand(num_samples) samples = samples.reshape(len(samples) // 3, 1, 3) targets = np.random.randint(2, size=len(samples)) targets = targets.reshape(len(samples), 1) ht = HoeffdingTreeClassifier() correct = 0 max_samples = num_samples // 3 for i in range(max_samples): X, y = samples[i], targets[i] y_pred = ht.predict(X) if y[0] == y_pred[0]: correct += 1 ht = ht.partial_fit(X, y) # This is just here so I knew it was working # print(f"processed sample: {i}") print(f"Hoeffding Tree accuracy: {correct / len(samples)}")
class TransitionModel:
    """Online next-state predictor built on a Hoeffding tree.

    !!! It assumes fully imputed and evenly timed data. !!!

    Data received must be a DataFrame with raw measurements and a column
    'label'. Timestamp must be the index of the DataFrame and it must be in
    the same format as for stream story. Label is an integer denoting the
    state of the measurement. Function fit from StateGraph returns data in
    the right format.

    For each feature it calculates the average and the delta (slope of the
    least squares linear fit) over the last window_size measurements. Every
    time before the model is fitted, it is tested on the input.

    Attributes:
        window_size (int): Size of window for rolling average and delta
        history (DataFrame of shape (window_size, n_features)): Last
            window_size rows of data
        accuracy (float): Accuracy of the model; None until at least one
            sample has been evaluated
    """

    def __init__(self, window_size):
        self.window_size = window_size
        self.model = HoeffdingTreeClassifier()
        self.history = None
        self.accuracy = None
        # Number of all predictions and correct predictions for calculating
        # accuracy.
        self.predictions = 0
        self.correct_predictions = 0

    def delta(self, y: pd.Series) -> np.float64:
        """Return the slope of the least squares linear fit of ``y``."""
        x = np.arange(len(y))
        return np.polyfit(x, y, 1)[0]

    def check_data(self):
        pass

    def prepare_data(self, data: pd.DataFrame, drop_last_row: bool = True,
                     use_history: bool = True) -> pd.DataFrame:
        """Turn raw data into rolling-mean / rolling-delta features.

        For every column other than 'label' the result has a ``<col>_mean``
        and a ``<col>_delta`` column computed over ``window_size`` rows, plus
        'current_state' (the label) and 'next_state' (the label of the
        following row, -1 for the last row).

        The first ``window_size - 1`` rows have no complete window and are
        removed. For the last row there is no next state, so
        ``drop_last_row`` should be True for learning, but False for
        predicting. If ``use_history`` is true, the stored history is
        prepended to ``data`` and then replaced by the last ``window_size``
        rows of the combined frame.

        Rows are trimmed by position, so the result is correct even when the
        index contains repeated labels (for example when history and new data
        both use a default integer index).
        """
        if use_history:
            # pd.concat silently skips the None history of the first call.
            data = pd.concat([self.history, data])
            # Update self.history to have last self.window_size measurements
            self.history = data.tail(self.window_size)

        sensor_values = data.drop(columns='label')
        labels = data['label']

        prepared_data = pd.DataFrame()
        for col in sensor_values.columns:
            window = sensor_values[col].rolling(window=self.window_size)
            prepared_data[col + '_mean'] = window.mean()
            prepared_data[col + '_delta'] = window.apply(self.delta, raw=True)
        prepared_data['current_state'] = labels
        prepared_data['next_state'] = labels.shift(periods=-1, fill_value=-1)

        # Positional bounds: skip the incomplete leading windows and, when
        # asked, the last row whose next state is not known yet.
        n_rows = len(prepared_data)
        stop = max(n_rows - 1, 0) if drop_last_row else n_rows
        start = self.window_size - 1
        # copy() so callers get a frame that owns its data.
        return prepared_data.iloc[start:stop].copy()

    def partial_fit(self, data: pd.DataFrame) -> None:
        """Test the model on each prepared sample, then train on it.

        Updates ``predictions``, ``correct_predictions`` and ``accuracy``.
        When ``data`` yields no usable sample (not enough rows for a full
        window) only the history is updated.

        TODO: maybe add possibility to learn in batches?
        """
        prepared = self.prepare_data(data)
        if prepared.empty:
            # Nothing to evaluate or learn; also avoids dividing by zero
            # when no prediction has been made so far.
            return

        stream = DataStream(prepared)
        n = stream.n_remaining_samples()
        for _ in range(n):
            x, y = stream.next_sample()
            if self.model.predict(x)[0] == y[0]:
                self.correct_predictions += 1
            self.model.partial_fit(x, y)

        self.predictions += n
        if self.predictions:
            self.accuracy = self.correct_predictions / self.predictions

    # The default frame is created once at definition time; it is only read
    # here and in prepare_data, never modified.
    def predict(self, data: pd.DataFrame = pd.DataFrame(),
                use_history: bool = True) -> List[int]:
        """Predict the next state for each row that has a complete window.

        Argument data is a DataFrame with shape (n_samples, n_features).
        use_history tells whether or not history will be included in data
        before making the prediction. If use_history is true, the function
        makes n_samples+1 predictions (given a full history), otherwise
        n_samples-window_size+1 predictions.

        Raises:
            RuntimeError: if use_history is false and data has fewer than
                window_size rows.
        """
        if not use_history and data.shape[0] < self.window_size:
            raise RuntimeError("Not enough measurements to make a prediction.")

        prepared_data = self.prepare_data(data, drop_last_row=False,
                                          use_history=use_history)
        features = prepared_data.drop(columns="next_state")
        return self.model.predict(features.values)

    def save_model(self):
        pass
class OnlineTrainingServer(
        sea_generator_rpcdesign_pb2_grpc.SEAOnlineTrainingServicer):
    """gRPC servicer that trains and serves one Hoeffding tree online.

    Requests carry a sample as raw bytes that are decoded with
    ``np.frombuffer`` (default dtype, i.e. float64). For ``learn_one`` the
    last decoded value is the label and the rest are features; for
    ``predict_one`` every value is a feature.
    """

    # Used by flush_model / extract_model when the request names no path.
    _DEFAULT_MODEL_PATH = "../modelPersist/online_hoeffding_tree_persist.pkl"

    def __init__(self):
        self._model = HoeffdingTreeClassifier()
        # Events handled by learn_one, and how many of them the model
        # predicted correctly before being trained on them.
        self._num_event_accumulation = 0
        self._correct_pred = 0

    def test_connection(self, request, context):
        """Log the request's test string and answer with a fixed message."""
        inputTestingRequestString = request.testString
        print('[gRPC Server] connection testing, request message: ',
              inputTestingRequestString)
        return sea_generator_rpcdesign_pb2.testConnectString(
            testString="Message send back to Client")

    def learn_one(self, request, context):
        """Test the model on one labelled event, then train on it once.

        Raises:
            ValueError: if the payload decodes to no values at all.
        """
        decode_data = np.frombuffer(request.ndarray)
        if decode_data.size == 0:
            raise ValueError("learn_one received an empty payload.")
        self._num_event_accumulation += 1

        # frombuffer returns a read-only view of the request bytes; copy it
        # into fresh arrays shaped (1, n_features) and (1,).
        n_features = len(decode_data) - 1
        dummy_data = np.array(decode_data[:-1],
                              dtype=np.float64).reshape(1, n_features)
        # The label arrives as a float; astype truncates it to an integer.
        dummy_label = np.array([decode_data[-1]]).astype(np.int64)

        print("check out type")
        print(type(dummy_data[0]))

        prior_y_pred = self._model.predict(dummy_data)
        if prior_y_pred[0] == dummy_label[0]:
            self._correct_pred += 1

        # Train exactly once per event. Fitting the same sample a second
        # time (as was done before) double-weights every observation.
        self._model.partial_fit(dummy_data, dummy_label)
        post_y_pred = self._model.predict(dummy_data)

        print(
            "Update model with event: ", str(dummy_data),
            "Ground Truth:", str(dummy_label),
            " prior pred: ", str(prior_y_pred),
            " post pred: ", str(post_y_pred),
            " Accuracy: ",
            '{:.2f}'.format(self._correct_pred / self._num_event_accumulation))

        return sea_generator_rpcdesign_pb2.learnOneReply(
            isLearnOneSuccess=True)

    def predict_one(self, request, context):
        """
        prediction rpc, the API provided for client to invoke the model
        which hold by model sever

        :param request: message whose ``ndarray`` bytes hold the features
        :param context: gRPC context (unused)
        :return: predictOneReply carrying the model's prediction
        :raises ValueError: if the model cannot predict on the incoming data
        """
        decode_data = np.frombuffer(request.ndarray)
        dummy_data = np.array(decode_data,
                              dtype=np.float64).reshape(1, len(decode_data))

        try:
            y_pred = self._model.predict(dummy_data)
        except Exception as err:
            # Report the offending input (the prediction does not exist yet)
            # and keep the original failure as the cause.
            raise ValueError(
                "Incoming data {} is not fit with model. ".format(
                    dummy_data)) from err

        # NOTE(review): y_pred is whatever predict() returns (an array, as
        # learn_one indexes it); confirm the reply field accepts that type.
        return sea_generator_rpcdesign_pb2.predictOneReply(
            isPredictSuccess=True, y_pred=y_pred)

    def flush_model(self, request, context):
        """Pickle the current model to the requested (or default) path.

        File and pickling errors are not caught; they propagate to gRPC.
        """
        flush_dir = request.modelPersistDir or self._DEFAULT_MODEL_PATH
        print("check out the flush directory: ", flush_dir)

        with open(flush_dir, 'wb') as flush_model_processor:
            pickle.dump(self._model, flush_model_processor,
                        pickle.HIGHEST_PROTOCOL)

        return sea_generator_rpcdesign_pb2.flushModelReply(isSuccess=True)

    def extract_model(self, request, context):
        """Replace the current model with one unpickled from disk.

        File and unpickling errors are not caught; they propagate to gRPC.
        """
        extract_dir = request.modelExtractionDir or self._DEFAULT_MODEL_PATH
        print("check out the reading directory: ", extract_dir)

        # SECURITY: the path comes from the client, and unpickling executes
        # arbitrary code from the file. Only expose this RPC to trusted
        # callers, or restrict the directories that may be read.
        with open(extract_dir, 'rb') as read_model_processor:
            self._model = pickle.load(read_model_processor)

        return sea_generator_rpcdesign_pb2.extractModelReply(isSuccess=True)