def test_hoeffding_tree_regressor_coverage():
    """Exercise memory management and invalid-option handling of
    HoeffdingTreeRegressor."""
    n_samples = 1000
    size_limit_mb = 2
    generator = RegressionGenerator(n_samples=n_samples, n_features=10,
                                    n_informative=7, n_targets=1,
                                    random_state=42)
    X, y = generator.next_sample(n_samples)

    # Exercise the memory-management code path.
    regressor = HoeffdingTreeRegressor(leaf_prediction='mean',
                                       grace_period=100,
                                       memory_estimate_period=100,
                                       max_byte_size=size_limit_mb * 2 ** 20)
    regressor.partial_fit(X, y)
    # Without memory management enabled the tree grows beyond 3 MB.
    assert calculate_object_size(regressor, 'MB') <= size_limit_mb

    # Misspelled leaf prediction option (intentional typo).
    regressor = HoeffdingTreeRegressor(leaf_prediction='percptron',
                                       grace_period=100,
                                       memory_estimate_period=100,
                                       max_byte_size=size_limit_mb * 2 ** 20)
    # Invalid split criterion (intentional).
    regressor.split_criterion = 'VR'
    regressor.partial_fit(X, y)
    assert calculate_object_size(regressor, 'MB') <= size_limit_mb

    regressor.reset()
    assert regressor._estimator_type == 'regressor'
def test_hoeffding_tree_coverage():
    """Coverage checks for StackedSingleTargetHoeffdingTreeRegressor
    memory management."""
    n_samples = 1000
    size_limit_mb = 2
    generator = RegressionGenerator(n_samples=n_samples, n_features=10,
                                    n_informative=7, n_targets=3,
                                    random_state=42)
    X, y = generator.next_sample(n_samples)

    # Emits a warning concerning the invalid leaf prediction option.
    model = StackedSingleTargetHoeffdingTreeRegressor(
        leaf_prediction='mean', grace_period=200,
        memory_estimate_period=100,
        max_byte_size=size_limit_mb * 2 ** 20)
    # Predict before any call to partial_fit.
    model.predict(X[0])
    model.partial_fit(X, y)
    # Without memory management enabled the tree grows beyond 3 MB.
    assert calculate_object_size(model, 'MB') <= size_limit_mb

    model = StackedSingleTargetHoeffdingTreeRegressor(
        leaf_prediction='adaptive', grace_period=200,
        memory_estimate_period=100,
        max_byte_size=size_limit_mb * 2 ** 20,
        learning_ratio_const=False)
    model.partial_fit(X, y)
    assert calculate_object_size(model, 'MB') <= size_limit_mb
def _estimate_model_byte_size(self):
    """ Calculate the size of the model and trigger tracker function
    if the actual model size exceeds the max size in the configuration.

    Re-measures the byte size of the current leaves, refreshes the running
    per-leaf size estimates and the overhead fraction, then enforces the
    ``max_byte_size`` limit when the measured model size exceeds it.
    """
    learning_nodes = self._find_learning_nodes()
    total_active_size = 0
    total_inactive_size = 0
    # Accumulate the measured byte size of active and inactive leaves
    # separately.
    for found_node in learning_nodes:
        if not found_node.node.is_leaf():
            # Safety check for non-trivial tree structures
            continue
        if isinstance(found_node.node, ActiveLeaf):
            total_active_size += calculate_object_size(found_node.node)
        else:
            total_inactive_size += calculate_object_size(found_node.node)
    # Only refresh an estimate when at least one leaf of that kind was
    # observed, so a previous estimate is never overwritten with zero.
    if total_active_size > 0:
        self._active_leaf_byte_size_estimate = total_active_size / self._active_leaf_node_cnt
    if total_inactive_size > 0:
        self._inactive_leaf_byte_size_estimate = total_inactive_size \
            / self._inactive_leaf_node_cnt
    actual_model_size = calculate_object_size(self)
    # Size predicted from leaf counts times the per-leaf estimates.
    estimated_model_size = (self._active_leaf_node_cnt
                            * self._active_leaf_byte_size_estimate
                            + self._inactive_leaf_node_cnt
                            * self._inactive_leaf_byte_size_estimate)
    # Ratio of real size to leaf-based estimate: captures whatever the leaf
    # estimates miss (internal nodes, bookkeeping).
    self._byte_size_estimate_overhead_fraction = actual_model_size / estimated_model_size
    if actual_model_size > self.max_byte_size:
        # Over budget: trigger the memory-management tracker.
        self._enforce_tracker_limit()
def test_hoeffding_tree_coverage():
    """Cover the memory-management path of HoeffdingTreeClassifier."""
    n_samples = 5000
    size_limit_kb = 50
    generator = RandomTreeGenerator(tree_random_state=23,
                                    sample_random_state=12,
                                    n_classes=10, n_cat_features=2,
                                    n_num_features=5,
                                    n_categories_per_cat_feature=5,
                                    max_tree_depth=15, min_leaf_depth=3,
                                    fraction_leaves_per_level=0.15)
    nominal_attr_idx = list(range(5, generator.n_features))

    # An unconstrained model exceeds 72 kB.
    classifier = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx,
                                         leaf_prediction='mc',
                                         memory_estimate_period=100,
                                         max_byte_size=size_limit_kb * 2 ** 10)

    X, y = generator.next_sample(n_samples)
    classifier.partial_fit(X, y)

    assert calculate_object_size(classifier, 'kB') <= size_limit_kb

    classifier.reset()
def test_extremely_fast_decision_tree_coverage():
    """Cover memory management and the nominal attribute observer of
    ExtremelyFastDecisionTreeClassifier."""
    # Memory-management code path.
    size_limit_kb = 20
    generator = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = get_next_n_samples(generator, 5000)

    # An unconstrained model exceeds 50 kB.
    classifier = ExtremelyFastDecisionTreeClassifier(
        leaf_prediction='mc', memory_estimate_period=200,
        max_byte_size=size_limit_kb * 2 ** 10, min_samples_reevaluate=2500)
    classifier.partial_fit(X, y, classes=[0, 1])
    assert calculate_object_size(classifier, 'kB') <= size_limit_kb
    classifier.reset()

    # Nominal attribute observer code path.
    generator = RandomTreeGenerator(tree_random_state=23,
                                    sample_random_state=12,
                                    n_classes=2, n_cat_features=2,
                                    n_categories_per_cat_feature=4,
                                    n_num_features=1, max_tree_depth=30,
                                    min_leaf_depth=10,
                                    fraction_leaves_per_level=0.45)
    X, y = get_next_n_samples(generator, 5000)
    classifier = ExtremelyFastDecisionTreeClassifier(
        leaf_prediction='nba', nominal_attributes=list(range(1, 9)))
    classifier.partial_fit(X, y, classes=[0, 1])
def train():
    """Pre-train the model (directly or via an ensemble wrapper), stream-train
    it batch by batch, and record run statistics.

    NOTE(review): relies on names from an enclosing scope not visible here —
    ``stream``, ``stats``, ``model``, ``ensemble``, ``predictions``,
    ``true_labels``, ``log_every_iterations`` — presumably a closure; verify
    against the outer definition.
    """
    # Pre training the classifier
    X, y = stream.next_sample(stats["pretrain_size"])
    # Skip pre-training entirely when the pretrain draw returned no samples.
    do_pretraining = X.shape[0] > 0
    if ensemble:
        if isinstance(model, list):
            # A list of base models: pre-train each, then wrap in the ensemble.
            if do_pretraining:
                logging.info("Pre-training models in ensemble...")
                [m.partial_fit(X, y, classes=stream.target_values[0])
                 for m in model]
                model_pretrained = ensemble(model, stream)
            else:
                model_pretrained = ensemble(model, stream)
        elif type(ensemble(model, stream)).__name__ == 'OzaBaggingMLClassifier':
            # OzaBagging is pre-trained AFTER wrapping (the wrapper itself
            # exposes partial_fit). NOTE(review): ensemble(model, stream) is
            # invoked twice on this path — once for the type probe, once for
            # the real wrapper.
            model_pretrained = ensemble(model, stream)
            if do_pretraining:
                logging.info("Pre-training oza...")
                model_pretrained.partial_fit(
                    X, y, classes=stream.target_values[0])
        else:
            # Other ensembles: pre-train the base model first, then wrap it.
            if do_pretraining:
                logging.info("Pre-training model in ensemble...")
                model.partial_fit(X, y, classes=stream.target_values[0])
            model_pretrained = ensemble(model, stream)
    else:
        # No ensemble: train the model directly.
        if do_pretraining:
            logging.info("Pre-training model...")
            model.partial_fit(X, y, classes=stream.target_values[0])
        model_pretrained = model

    # Keeping track of sample count, true labels and predictions to later
    # compute the classifier's hamming score
    iterations = 0
    logging.info("Training...")
    while stream.has_more_samples():
        # Prequential scheme: predict on the batch first, then train on it.
        X, y = stream.next_sample(stats["batch_size"])
        y_pred = model_pretrained.predict(X)
        model_pretrained.partial_fit(X, y, classes=stream.target_values[0])
        predictions.extend(y_pred)
        true_labels.extend(y)
        if iterations % log_every_iterations == 0:
            logging.info("%s / %s trained samples.",
                         (iterations + 1) * stats["batch_size"],
                         stats["train_size"])
        iterations += 1
    end_time = time.time()
    logging.info("All samples trained successfully")
    # Record outcome and timing in the shared stats dict.
    stats["success"] = True
    stats["error"] = False
    stats["end_time"] = end_time
    stats["time_seconds"] = end_time - stats["start_time"]
    stats["model_size_kb"] = calculate_object_size(model_pretrained, "kB")
def measure_byte_size(self):
    """ Compute the in-memory footprint of the tree.

    Returns
    -------
    int
        Size of the tree in bytes.
    """
    byte_size = calculate_object_size(self)
    return byte_size
def get_model_measurements(self):
    """Collect metrics corresponding to the current status of the model.

    Returns
    -------
    dict
        Mapping with the number of rules in the rule set and the model
        size in bytes.
    """
    return {
        'Number of rules: ': len(self.rule_set),
        'model_size in bytes': calculate_object_size(self),
    }
def test_isoup_tree_coverage():
    """Coverage checks for iSOUPTreeRegressor memory management."""
    n_samples = 1000
    size_limit_mb = 2
    generator = RegressionGenerator(n_samples=n_samples, n_features=10,
                                    n_informative=7, n_targets=3,
                                    random_state=42)

    # Memory management with mean leaves.
    model = iSOUPTreeRegressor(leaf_prediction='mean', grace_period=200,
                               memory_estimate_period=100,
                               max_byte_size=size_limit_mb * 2 ** 20)
    # Invalid split criterion (intentional).
    model.split_criterion = 'ICVR'
    X, y = generator.next_sample(n_samples)
    model.partial_fit(X, y)
    # Without memory management enabled the tree grows beyond 3 MB.
    assert calculate_object_size(model, 'MB') <= size_limit_mb

    # Memory management with perceptron leaves (intentional typo in
    # leaf_prediction).
    model = iSOUPTreeRegressor(leaf_prediction='PERCEPTRON', grace_period=200,
                               memory_estimate_period=100,
                               max_byte_size=size_limit_mb * 2 ** 20)
    model.partial_fit(X, y)
    assert calculate_object_size(model, 'MB') <= size_limit_mb

    # Memory management with adaptive leaves.
    model = iSOUPTreeRegressor(leaf_prediction='adaptive', grace_period=200,
                               memory_estimate_period=100,
                               max_byte_size=size_limit_mb * 2 ** 20)
    model.partial_fit(X, y)
    assert calculate_object_size(model, 'MB') <= size_limit_mb
def test_label_combination_hoeffding_tree_coverage():
    """Cover memory management in LabelCombinationHoeffdingTreeClassifier."""
    n_samples = 10000
    size_limit_kb = 50
    generator = MultilabelGenerator(n_samples=10000, n_features=15,
                                    n_targets=3, n_labels=4,
                                    random_state=112)

    # An unconstrained model exceeds 62 kB.
    classifier = LabelCombinationHoeffdingTreeClassifier(
        n_labels=3, leaf_prediction='mc', memory_estimate_period=200,
        max_byte_size=size_limit_kb * 2 ** 10)

    X, y = generator.next_sample(n_samples)
    classifier.partial_fit(X, y)

    assert calculate_object_size(classifier, 'kB') <= size_limit_kb
def measure_model_size(self, unit='byte'):
    """Return the size of the model expressed in the given unit
    (default: bytes)."""
    model_size = calculate_object_size(self, unit)
    return model_size
def _update_metrics(self):
    """ Updates the metrics of interest.

    This function updates the evaluation data buffer which is used to track
    performance during evaluation. The content of the buffer depends on the
    evaluation task type and metrics selected.

    If more than one model/learner is evaluated at once, data is stored as
    lists inside the buffer.

    Raises
    ------
    ValueError
        If an unknown metric identifier is found in ``self.metrics``.
    """
    shift = 0
    if self._method == 'prequential':
        shift = -self.batch_size  # Adjust index due to training after testing
    sample_id = self.global_sample_count + shift

    for metric in self.metrics:
        # values[0] holds the 'mean' measurements and values[1] the
        # 'current' ones, one entry per model. Some metrics below replace
        # `values` with a differently-shaped structure.
        values = [[], []]
        if metric == constants.ACCURACY:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].accuracy_score())
                values[1].append(self.current_eval_measurements[i].accuracy_score())
        elif metric == constants.KAPPA:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].kappa_score())
                values[1].append(self.current_eval_measurements[i].kappa_score())
        elif metric == constants.KAPPA_T:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].kappa_t_score())
                values[1].append(self.current_eval_measurements[i].kappa_t_score())
        elif metric == constants.KAPPA_M:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].kappa_m_score())
                values[1].append(self.current_eval_measurements[i].kappa_m_score())
        elif metric == constants.HAMMING_SCORE:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].hamming_score())
                values[1].append(self.current_eval_measurements[i].hamming_score())
        elif metric == constants.HAMMING_LOSS:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].hamming_loss_score())
                values[1].append(self.current_eval_measurements[i].hamming_loss_score())
        elif metric == constants.EXACT_MATCH:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].exact_match_score())
                values[1].append(self.current_eval_measurements[i].exact_match_score())
        elif metric == constants.J_INDEX:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].jaccard_score())
                values[1].append(self.current_eval_measurements[i].jaccard_score())
        elif metric == constants.MSE:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].get_mean_square_error())
                values[1].append(self.current_eval_measurements[i].get_mean_square_error())
        elif metric == constants.MAE:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].get_average_error())
                values[1].append(self.current_eval_measurements[i].get_average_error())
        elif metric == constants.AMSE:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].get_average_mean_square_error())
                values[1].append(self.current_eval_measurements[i].get_average_mean_square_error())
        elif metric == constants.AMAE:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].get_average_absolute_error())
                values[1].append(self.current_eval_measurements[i].get_average_absolute_error())
        elif metric == constants.ARMSE:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].get_average_root_mean_square_error())
                values[1].append(self.current_eval_measurements[i].get_average_root_mean_square_error())
        elif metric == constants.F1_SCORE:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].f1_score())
                values[1].append(self.current_eval_measurements[i].f1_score())
        elif metric == constants.PRECISION:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].precision_score())
                values[1].append(self.current_eval_measurements[i].precision_score())
        elif metric == constants.RECALL:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].recall_score())
                values[1].append(self.current_eval_measurements[i].recall_score())
        elif metric == constants.GMEAN:
            for i in range(self.n_models):
                values[0].append(self.mean_eval_measurements[i].geometric_mean_score())
                values[1].append(self.current_eval_measurements[i].geometric_mean_score())
        elif metric == constants.TRUE_VS_PREDICTED:
            y_true = -1
            y_pred = []
            for i in range(self.n_models):
                t, p = self.mean_eval_measurements[i].get_last()
                y_true = t  # We only need to keep one true value
                y_pred.append(p)
            values[0] = y_true
            for i in range(self.n_models):
                values[1].append(y_pred[i])
        elif metric == constants.DATA_POINTS:
            target_values = self.stream.target_values
            features = {}  # Dictionary containing feature values, using index as key
            y_pred, p = self.mean_eval_measurements[0].get_last()  # Only track one model (first) by default
            X = self.stream.current_sample_x
            idx_1 = 0  # TODO let the user choose the feature indices of interest
            idx_2 = 1
            features[idx_1] = X[0][idx_1]
            features[idx_2] = X[0][idx_2]
            values = [None, None, None]
            values[0] = features
            values[1] = target_values
            values[2] = y_pred
        elif metric == constants.RUNNING_TIME:
            values = [[], [], []]
            for i in range(self.n_models):
                values[0].append(self.running_time_measurements[i].get_current_training_time())
                values[1].append(self.running_time_measurements[i].get_current_testing_time())
                values[2].append(self.running_time_measurements[i].get_current_total_running_time())
        elif metric == constants.MODEL_SIZE:
            values = []
            for i in range(self.n_models):
                values.append(calculate_object_size(self.model[i], 'kB'))
        else:
            raise ValueError('Unknown metric {}'.format(metric))

        # Update buffer
        if metric == constants.TRUE_VS_PREDICTED:
            self._data_buffer.update_data(sample_id=sample_id, metric_id=metric,
                                          data_id=constants.Y_TRUE, value=values[0])
            self._data_buffer.update_data(sample_id=sample_id, metric_id=metric,
                                          data_id=constants.Y_PRED, value=values[1])
        elif metric == constants.DATA_POINTS:
            self._data_buffer.update_data(sample_id=sample_id, metric_id=metric,
                                          data_id='X', value=values[0])
            self._data_buffer.update_data(sample_id=sample_id, metric_id=metric,
                                          data_id='target_values', value=values[1])
            self._data_buffer.update_data(sample_id=sample_id, metric_id=metric,
                                          data_id='predictions', value=values[2])
        elif metric == constants.RUNNING_TIME:
            self._data_buffer.update_data(sample_id=sample_id, metric_id=metric,
                                          data_id='training_time', value=values[0])
            self._data_buffer.update_data(sample_id=sample_id, metric_id=metric,
                                          data_id='testing_time', value=values[1])
            self._data_buffer.update_data(sample_id=sample_id, metric_id=metric,
                                          data_id='total_running_time', value=values[2])
        elif metric == constants.MODEL_SIZE:
            self._data_buffer.update_data(sample_id=sample_id, metric_id=metric,
                                          data_id='model_size', value=values)
        else:
            # Default case, 'mean' and 'current' performance
            self._data_buffer.update_data(sample_id=sample_id, metric_id=metric,
                                          data_id=constants.MEAN, value=values[0])
            self._data_buffer.update_data(sample_id=sample_id, metric_id=metric,
                                          data_id=constants.CURRENT, value=values[1])

    # `sample_id` already holds the prequential-shifted index computed above;
    # reuse it rather than recomputing `shift` and re-adding it (the original
    # duplicated that computation here).
    self._update_outputs(sample_id)