def learn_from_instance(self, X, y, weight, rht):
    """Update the node statistics and its perceptron with one instance.

    Parameters
    ----------
    X: numpy.ndarray
        Instance attributes.
    y:
        Instance target(s).
    weight: float
        Instance weight. The perceptron is updated ``int(weight)`` times.
    rht:
        Tree object providing ``learning_ratio_const``,
        ``learning_ratio_perceptron`` and ``learning_ratio_decay``; it is
        also forwarded to ``update_weights``.
    """
    if self.perceptron_weight is None:
        # First instance seen by the node: draw the initial weights with one
        # row per target and one column per feature plus one extra column.
        n_targets = get_dimensions(y)[1]
        n_features = get_dimensions(X)[1]
        self.perceptron_weight = self.random_state.uniform(
            -1, 1, (n_targets, n_features + 1))
        self.normalize_perceptron_weights()

    # Entry 0 accumulates the total weight observed by the node.
    try:
        self._observed_class_distribution[0] += weight
    except KeyError:
        self._observed_class_distribution[0] = weight

    learning_ratio = (
        rht.learning_ratio_perceptron
        if rht.learning_ratio_const
        else rht.learning_ratio_perceptron / (
            1 + self._observed_class_distribution[0] * rht.learning_ratio_decay)
    )

    # Entries 1 and 2 accumulate the weighted sum of the targets and the
    # weighted sum of their squares.
    weighted_y = weight * y
    weighted_y_squared = weighted_y * y
    try:
        self._observed_class_distribution[1] += weighted_y
        self._observed_class_distribution[2] += weighted_y_squared
    except KeyError:
        self._observed_class_distribution[1] = weighted_y
        self._observed_class_distribution[2] = weighted_y_squared

    for _ in range(int(weight)):
        self.update_weights(X, y, learning_ratio, rht)
def learn_from_instance(self, X, y, weight, rht):
    """Update the node with the provided instance.

    Parameters
    ----------
    X: numpy.ndarray of length equal to the number of features.
        Instance attributes for updating the node.
    y: numpy.ndarray of length equal to the number of targets.
        Instance targets.
    weight: float
        Instance weight. The perceptrons are updated ``int(weight)`` times.
    rht: RegressionHoeffdingTree
        Regression Hoeffding Tree to update.
    """
    if self.perceptron_weight is None:
        self.perceptron_weight = {}
        n_targets = get_dimensions(y)[1]
        n_features = get_dimensions(X)[1]
        # Entry 0: base perceptrons (features plus one extra column).
        self.perceptron_weight[0] = self.random_state.uniform(
            -1.0, 1.0, (n_targets, n_features + 1))
        # Entry 1: cascade stacking layer fed with the base predictions
        # (targets plus one extra column).
        self.perceptron_weight[1] = self.random_state.uniform(
            -1.0, 1.0, (n_targets, n_targets + 1))
        self.normalize_perceptron_weights()

    # Entry 0 of the statistics accumulates the total observed weight.
    try:
        self._observed_class_distribution[0] += weight
    except KeyError:
        self._observed_class_distribution[0] = weight

    learning_ratio = (
        rht.learning_ratio_perceptron
        if rht.learning_ratio_const
        else rht.learning_ratio_perceptron / (
            1 + self._observed_class_distribution[0] * rht.learning_ratio_decay)
    )

    # Entries 1 and 2 accumulate the weighted targets and their squares.
    weighted_y = weight * y
    weighted_y_squared = weighted_y * y
    try:
        self._observed_class_distribution[1] += weighted_y
        self._observed_class_distribution[2] += weighted_y_squared
    except KeyError:
        self._observed_class_distribution[1] = weighted_y
        self._observed_class_distribution[2] = weighted_y_squared

    for _ in range(int(weight)):
        self.update_weights(X, y, learning_ratio, rht)

    for idx, value in enumerate(X):
        try:
            observer = self._attribute_observers[idx]
        except KeyError:
            # No observer for this attribute yet: create one matching the
            # attribute kind declared by the tree.
            is_nominal = (rht.nominal_attributes is not None
                          and idx in rht.nominal_attributes)
            observer = (NominalAttributeRegressionObserver() if is_nominal
                        else NumericAttributeRegressionObserverMultiTarget())
            self._attribute_observers[idx] = observer
        observer.observe_attribute_class(value, y, weight)
def update_weights(self, X, y, learning_ratio, rht):
    """Update the perceptron weights and the faded errors.

    Parameters
    ----------
    X: numpy.ndarray of length equal to the number of features.
        Instance attributes for updating the node.
    y: numpy.ndarray of length equal to the number of targets.
        Targets values.
    learning_ratio: float
        perceptron learning ratio
    rht: RegressionHoeffdingTree
        Regression Hoeffding Tree to update.
    """
    normalized_sample = rht.normalize_sample(X)
    normalized_base_pred = self._predict_base(normalized_sample)
    normalized_target_value = rht.normalized_target_value(y)

    # Base layer: scaled outer product of the prediction error and the
    # normalized sample.
    self.perceptron_weight[0] += learning_ratio * \
        (normalized_target_value - normalized_base_pred)[:, None] @ \
        normalized_sample[None, :]

    # Add bias term
    normalized_base_pred = np.append(normalized_base_pred, 1.0)
    normalized_meta_pred = self._predict_meta(normalized_base_pred)

    # Stacked layer: same rule, using the base predictions as input.
    self.perceptron_weight[1] += learning_ratio * \
        (normalized_target_value - normalized_meta_pred)[:, None] @ \
        normalized_base_pred[None, :]

    self.normalize_perceptron_weights()

    # Update faded errors for the predictors.
    # The considered errors are normalized, since they are based on
    # mean centered and sd scaled values. Each error is faded by 0.95
    # before the new absolute error is added.
    normalized_mean = rht.normalized_target_value(
        self._observed_class_distribution[1] /
        self._observed_class_distribution[0])
    self.fMAE_M = 0.95 * self.fMAE_M + np.absolute(
        normalized_target_value - normalized_mean)

    # Ignore added bias term in the comparison
    self.fMAE_P = 0.95 * self.fMAE_P + np.absolute(
        normalized_target_value - normalized_base_pred[:-1])

    self.fMAE_SP = 0.95 * self.fMAE_SP + np.absolute(
        normalized_target_value - normalized_meta_pred)
def normalize_sample(self, X):
    """Standardize a sample using the running attribute statistics.

    Parameters
    ----------
    X: np.array
        features.

    Returns
    -------
    np.array:
        The standardized features followed by one extra trailing entry.
        While at most one example has been seen, an all-zero vector of
        that length is returned instead.
    """
    n_seen = self.examples_seen
    if n_seen <= 1:
        _, n_features = get_dimensions(X)
        return np.zeros((n_features + 1), dtype=np.float64)

    value_sum = self.sum_of_attribute_values
    mean = value_sum / n_seen
    variance = (self.sum_of_attribute_squares - (value_sum ** 2) / n_seen) \
        / (n_seen - 1)
    # Entries with negative variance keep a standard deviation of zero.
    sd = np.sqrt(variance, out=np.zeros_like(variance),
                 where=variance >= 0.0)

    result = np.zeros(X.shape[0] + 1, dtype=np.float64)
    # Features whose standard deviation is zero are left at 0.
    np.divide(X - mean, sd, where=sd != 0, out=result[:-1])
    # Augments sample with the bias input signal (or y intercept for
    # each target)
    result[-1] = 1.0
    return result
def transform(self, X):
    """Unpack the raw bytes of X into individual bits.

    Parameters
    ----------
    X: numpy.ndarray
        Data to convert. NOTE(review): the 8-byte record view presumably
        expects 8-byte elements (e.g. 64-bit values) -- confirm with callers.

    Returns
    -------
    numpy.ndarray
        Array of bits reshaped to ``(rows, cols * 8 * 8)``.
    """
    n_rows, n_cols = get_dimensions(X)
    # View the buffer as records of eight unsigned bytes so that the bytes
    # can be handed to np.unpackbits.
    byte_record = np.dtype([('bytes', np.uint8, 8)])
    raw_bytes = X.view(byte_record)['bytes']
    bits = np.unpackbits(raw_bytes, axis=1)
    return bits.reshape(n_rows, n_cols * 8 * 8)
def init_ensemble(self, X):
    """Create the base learners of the ensemble.

    Parameters
    ----------
    X: numpy.ndarray
        Sample(s) whose number of columns is passed to
        ``_set_max_features``.
    """
    self.ensemble = [None] * self.n_estimators
    self._set_max_features(get_dimensions(X)[1])

    def build_learner(index):
        # Every learner wraps a tree configured from the ensemble settings.
        tree = ARFHoeffdingTree(
            max_byte_size=self.max_byte_size,
            memory_estimate_period=self.memory_estimate_period,
            grace_period=self.grace_period,
            split_criterion=self.split_criterion,
            split_confidence=self.split_confidence,
            tie_threshold=self.tie_threshold,
            binary_split=self.binary_split,
            stop_mem_management=self.stop_mem_management,
            remove_poor_atts=self.remove_poor_atts,
            no_preprune=self.no_preprune,
            leaf_prediction=self.leaf_prediction,
            nb_threshold=self.nb_threshold,
            nominal_attributes=self.nominal_attributes,
            max_features=self.max_features,
            random_state=self._init_random_state)
        return ARFBaseLearner(index, tree, self.instances_seen,
                              self.drift_detection_method,
                              self.warning_detection_method, False)

    for index in range(self.n_estimators):
        self.ensemble[index] = build_learner(index)
def learn_from_instance(self, X, y, weight, ht):
    """Update the node with the provided instance.

    Parameters
    ----------
    X: numpy.ndarray of length equal to the number of features.
        Instance attributes for updating the node.
    y: int
        Instance class.
    weight: float
        Instance weight.
    ht: HoeffdingTree
        Hoeffding Tree to update.
    """
    try:
        self._observed_class_distribution[y] += weight
    except KeyError:
        self._observed_class_distribution[y] = weight

    # The subset of features tracked by this node is drawn on first use.
    if self.list_attributes.size == 0:
        self.list_attributes = self._sample_features(get_dimensions(X)[1])

    for i in self.list_attributes:
        try:
            obs = self._attribute_observers[i]
        except KeyError:
            # Treat a missing nominal attribute list as "no nominal
            # attributes" instead of failing on the membership test.
            nominal_attributes = ht.nominal_attributes
            if nominal_attributes is not None and i in nominal_attributes:
                obs = NominalAttributeClassObserver()
            else:
                obs = NumericAttributeClassObserverGaussian()
            self._attribute_observers[i] = obs
        obs.observe_attribute_class(X[i], int(y), weight)
def _partial_fit(self, X):
    """Accumulate the running attribute statistics with the rows of X.

    Every row counts with unit weight. For each row, ``samples_seen`` is
    incremented and the row (and its element-wise square) is added to
    ``sum_of_attribute_values`` (``sum_of_attribute_squares``).

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        Samples used to update the statistics.
    """
    row_cnt, _ = get_dimensions(X)
    # Unit weights; indexing the array keeps the accumulated values as
    # NumPy scalars/arrays.
    sample_weight = np.ones(row_cnt)
    for i in range(row_cnt):
        weight = sample_weight[i]
        self.samples_seen += weight
        weighted_values = np.multiply(weight, X[i])
        weighted_squares = np.multiply(weight, np.power(X[i], 2))
        try:
            self.sum_of_attribute_values = np.add(
                self.sum_of_attribute_values, weighted_values)
            self.sum_of_attribute_squares = np.add(
                self.sum_of_attribute_squares, weighted_squares)
        except ValueError:
            # The accumulators cannot be added to the row (e.g. they do not
            # have a compatible shape yet): restart them from this row.
            self.sum_of_attribute_values = weighted_values
            self.sum_of_attribute_squares = weighted_squares
def predict(self, X):
    """ predict

    Predict, for every sample in X, the index with the highest value in
    the output of ``predict_proba``.

    Parameters
    ----------
    X: Numpy.ndarray of shape (n_samples, n_features)
        A matrix of the samples we want to predict.

    Returns
    -------
    numpy.ndarray or None
        The predicted label for all the samples in X, or None when
        ``predict_proba`` returns None.

    """
    n_rows, _ = get_dimensions(X)
    proba = self.predict_proba(X)
    if proba is None:
        return None
    return np.asarray([np.argmax(proba[row]) for row in range(n_rows)])
def run(X, y, hyperParams):
    """ run

    Test function for SAMKNN, not integrated with evaluation modules.
    Every sample is first predicted and then used for training; progress
    and the final error rate are logged.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        The feature's matrix, coded as 64 bits.

    y: numpy.array of size n_samples
        The labels for all the samples in X coded as 8 bits.

    hyperParams: dict
        A dictionary containing the __init__ params for the SAMKNN.

    """
    r, _ = get_dimensions(X)
    classifier = SAMKNN(n_neighbors=hyperParams['nNeighbours'],
                        max_window_size=hyperParams['maxSize'],
                        weighting=hyperParams['knnWeights'],
                        stm_size_option=hyperParams['STMSizeAdaption'],
                        use_ltm=hyperParams['useLTM'])
    logging.info('applying model on dataset')
    predicted_labels = []
    true_labels = []
    # Progress is logged roughly every 5% of the stream. The step is clamped
    # to 1 so that streams with fewer than 20 samples do not cause a modulo
    # by zero.
    progress_step = max(r // 20, 1)
    for i in range(r):
        pred = classifier.predict(np.asarray([X[i]]))
        predicted_labels.append(pred[0])
        true_labels.append(y[i])
        classifier = classifier.partial_fit(np.asarray([X[i]]),
                                            np.asarray([y[i]]), None)
        if i % progress_step == 0:
            logging.info(str((i // (r / 20))*5) + "%")
    accuracy = accuracy_score(true_labels, predicted_labels)
    logging.info('error rate %.2f%%' % (100-100*accuracy))
def predict_proba(self, X):
    """ Estimate the probability of X belonging to each class-labels.

    Parameters
    ----------
    X: Numpy.ndarray of shape (n_samples, n_features)

    Returns
    -------
    numpy.ndarray
        A 2D array of shape (n_samples, n_classes). Each row holds, per
        class label, the fraction of neighbors voting for that label. If
        the window is missing or holds fewer than ``n_neighbors`` samples,
        an array of zeros with shape (n_samples, 1) is returned.

    """
    r, _ = get_dimensions(X)
    if self.data_window is None or self.data_window.size < self.n_neighbors:
        # The model is empty, defaulting to zero
        return np.zeros(shape=(r, 1))

    # The builtin ``int`` replaces the ``np.int`` alias, which no longer
    # exists in current NumPy releases.
    self.classes = list(set().union(
        self.classes,
        np.unique(self.data_window.targets_buffer.astype(int))))

    _, new_ind = self._get_neighbors(X)
    n_labels = int(max(self.classes) + 1)
    proba = []
    for i in range(r):
        votes = [0.0 for _ in range(n_labels)]
        for index in new_ind[i]:
            # Each neighbor contributes an equal share of the vote.
            votes[int(self.data_window.targets_buffer[index])] += \
                1. / len(new_ind[i])
        proba.append(votes)
    return np.asarray(proba)
def transform(self, X):
    """ transform

    Transform one hot features in the X matrix into int coded
    categorical features.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        The sample or set of samples that should be transformed.

    Returns
    -------
    numpy.ndarray
        The transformed data.

    """
    r, c = get_dimensions(X)
    # Every group of one-hot columns collapses into a single column.
    new_width = c - sum(len(group) - 1 for group in self.categorical_list)

    # Collect the transformed rows and concatenate once instead of growing
    # the result row by row. The empty seed block keeps the result dtype
    # and the (0, new_width) shape for empty input.
    blocks = [np.zeros((0, new_width), dtype=X.dtype)]
    blocks.extend(self._transform(X[i, :], new_width) for i in range(r))
    return np.concatenate(blocks, axis=0)
def partial_fit(self, X, y, sample_weight=None):
    """Incrementally trains the model.

    Train samples (instances) are composed of X attributes and their
    corresponding targets y. Nothing is done when y is None.

    Tasks performed before training:

    * Verify instance weight. if not provided, uniform weights (1.0) are
      assumed.
    * If more than one instance is passed, loop through X and pass
      instances one at a time.
    * Update weight seen by model.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        Instance attributes.
    y: numpy.ndarray of shape (n_samples, n_targets)
        Target values.
    sample_weight: float or array-like
        Samples weight. A scalar is applied to every sample. If not
        provided, uniform weights are assumed.

    Raises
    ------
    ValueError
        If the number of weights differs from the number of instances.
    """
    if y is None:
        return

    # Set the number of targets once
    if not self._n_targets_set:
        _, self._n_targets = get_dimensions(y)
        self._n_targets_set = True

    row_cnt, _ = get_dimensions(X)
    if sample_weight is None:
        sample_weight = np.ones(row_cnt)
    elif np.isscalar(sample_weight):
        # A single weight, as allowed by the documented contract, is
        # applied to every instance.
        sample_weight = [sample_weight] * row_cnt
    if row_cnt != len(sample_weight):
        raise ValueError(
            'Inconsistent number of instances ({}) and weights ({}).'.
            format(row_cnt, len(sample_weight)))

    for i in range(row_cnt):
        # Instances with zero weight are skipped.
        if sample_weight[i] != 0.0:
            self._train_weight_seen_by_model += sample_weight[i]
            self._partial_fit(X[i], y[i], sample_weight[i])
def partial_fit(self, X, y, classes=None, sample_weight=None):
    """ Incrementally trains the model.

    Train samples (instances) are composed of X attributes and their
    corresponding targets y. Nothing is trained when y is None.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        Instance attributes.
    y: array_like
        Classes (targets) for all samples in X.
    classes: list or numpy.array
        Contains the class values in the stream. Stored on the first call
        that provides it.
    sample_weight: float or array-like
        Samples weight. If not provided, uniform weights are assumed. A
        scalar, or a sequence whose length differs from the number of
        samples, is broadcast (using its first element) to every sample.

    Notes
    -----
    Tasks performed before training:

    * Verify instance weight. if not provided, uniform weights (1.0) are
      assumed.
    * If more than one instance is passed, loop through X and pass
      instances one at a time.
    * Update weight seen by model.
    """
    if self.classes is None and classes is not None:
        self.classes = classes
    if y is None:
        return

    row_cnt, _ = get_dimensions(X)
    if sample_weight is None:
        sample_weight = np.ones(row_cnt)
    elif np.isscalar(sample_weight):
        sample_weight = [sample_weight] * row_cnt
    elif len(sample_weight) != row_cnt:
        # Legacy behaviour: a weight sequence of a different length is
        # replaced by its first element repeated for every sample.
        sample_weight = [sample_weight[0]] * row_cnt
    # The length is compared with len() rather than get_dimensions(), which
    # reports a single row for 1-D input and would therefore discard
    # per-sample weights.

    for i in range(row_cnt):
        if sample_weight[i] != 0.0:
            self._train_weight_seen_by_model += sample_weight[i]
            self._partial_fit(X[i], y[i], sample_weight[i])
def predict(self, X):
    """Predict the label of every sample in X.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        Samples to predict.

    Returns
    -------
    numpy.ndarray
        For each sample, the index of the largest value in the
        corresponding row returned by ``predict_proba``.
    """
    n_rows, _ = get_dimensions(X)
    y_proba = self.predict_proba(X)
    return np.array([np.argmax(y_proba[row]) for row in range(n_rows)])
def predict_proba(self, X):
    """ Estimate the probability of X belonging to each class-label.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        The matrix of samples to predict the class probabilities for.

    Returns
    -------
    A numpy.ndarray of shape (n_samples, n_labels), in which each outer
    entry is associated with the X entry of the same index. An array of
    zeros with shape (n_samples, 1) is returned when a ValueError or a
    TypeError occurs while combining the estimators.

    Notes
    -----
    When ``enable_code_matrix`` is set, the result of
    ``predict_binary_proba`` is returned. Otherwise the class probabilities
    of the first ``actual_n_estimators`` ensemble members are summed and
    passed to ``_normalize_probabilities``.

    A ValueError is raised internally when a base learner reports more
    classes than the ensemble knows about, but it is caught by the same
    handler, so callers receive the zero array in that case.

    """
    if self.enable_code_matrix:
        return self.predict_binary_proba(X)

    n_rows, _ = get_dimensions(X)
    summed = []
    try:
        for est_idx in range(self.actual_n_estimators):
            member_proba = self.ensemble[est_idx].predict_proba(X)
            if len(member_proba[0]) > max(self.classes) + 1:
                raise ValueError("The number of classes in the base learner is larger than in"
                                 " the ensemble.")
            if not summed:
                # First estimator: start every row with zeros.
                for row in range(n_rows):
                    summed.append([0.0] * len(member_proba[row]))
            for row in range(n_rows):
                for cls in range(len(member_proba[row])):
                    try:
                        summed[row][cls] += member_proba[row][cls]
                    except IndexError:
                        # This estimator knows a class not seen before.
                        summed[row].append(member_proba[row][cls])
    except (ValueError, TypeError):
        return np.zeros((n_rows, 1))

    return self._normalize_probabilities(rows=n_rows, y_proba=summed)
def predict(self, X):
    """Return, for each sample in X, the index of its most probable class.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        Samples to predict.

    Returns
    -------
    numpy.ndarray
        The argmax of every row returned by ``predict_proba``.
    """
    n_samples, _ = get_dimensions(X)
    probabilities = self.predict_proba(X)
    best = [np.argmax(probabilities[k]) for k in range(n_samples)]
    return np.array(best)
def partial_fit(self, X, y, classes=None, weight=1.0):
    """Incrementally train the model, one instance at a time.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        Instance attributes.
    y: array_like
        Targets for the samples in X. Nothing is trained when None.
    classes: list or numpy.array
        Known class values; stored on the first call that provides them.
    weight: float or array-like
        Instance weight(s), expanded by ``check_weights`` to one entry per
        row. Instances with zero weight are skipped.
    """
    if self.classes is None and classes is not None:
        self.classes = classes
    if y is None:
        return

    n_rows, _ = get_dimensions(X)
    weight = check_weights(weight, expand_length=n_rows)
    for row in range(n_rows):
        row_weight = weight[row]
        if row_weight != 0.0:
            self._train_weight_seen_by_model += row_weight
            self._partial_fit(X[row], y[row], self.classes, row_weight)
def transform(self, X):
    """Update the running statistics with X and return X normalized.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        Samples to transform.

    Returns
    -------
    numpy.ndarray
        One normalized sample per row of X, or an empty array when no
        sample has been seen.
    """
    self._partial_fit(X)
    normalized = []
    if self.samples_seen > 0:
        n_rows, _ = get_dimensions(X)
        normalized = [self.normalize_sample(X[row]) for row in range(n_rows)]
    return np.array(normalized)
def predict(self, X):
    """Predict a label for every sample in X.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        Samples to predict.

    Returns
    -------
    list
        One prediction per sample, as returned by ``predictFct``.
    """
    n_rows, n_cols = get_dimensions(X)
    if self._STMSamples is None:
        # No memory yet: start with empty sample sets of the right width.
        self._STMSamples = np.empty(shape=(0, n_cols))
        self._LTMSamples = np.empty(shape=(0, n_cols))
    return [
        self.predictFct(
            X[row], None, SAMKNN.get_distances(X[row], self._STMSamples))
        for row in range(n_rows)
    ]
def partial_fit(self, X, y, classes=None, weight=None):
    """Process new samples one at a time.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        Samples to learn from.
    y: array-like
        Labels of the samples in X.
    classes: not used (default=None)
    weight: not used (default=None)

    Returns
    -------
    self
    """
    n_rows, n_cols = get_dimensions(X)
    if self._STMSamples is None:
        # No memory yet: start with empty sample sets of the right width.
        self._STMSamples = np.empty(shape=(0, n_cols))
        self._LTMSamples = np.empty(shape=(0, n_cols))
    for row in range(n_rows):
        self._partial_fit(X[row, :], y[row])
    return self
def test_get_dimensions():
    """Check get_dimensions on flat and nested lists and ndarrays.

    One-dimensional containers are expected to be reported as one row.
    """
    n_rows, n_cols = 5, 5

    flat_list = [None] * n_cols
    nested_list = [flat_list] * n_rows
    flat_array = np.ndarray(n_cols)
    matrix = np.ndarray((n_rows, n_cols))

    cases = (
        (flat_list, 1),
        (nested_list, n_rows),
        (flat_array, 1),
        (matrix, n_rows),
    )
    for container, expected_rows in cases:
        rows, cols = get_dimensions(container)
        assert rows == expected_rows
        assert cols == n_cols
def partial_fit(self, X, y, classes=None, sample_weight=None):
    """ Partially (incrementally) fit the model.

    Parameters
    ----------
    X: Numpy.ndarray of shape (n_samples, n_features)
        The data upon which the algorithm will create its model.

    y: Array-like
        An array-like containing the classification targets for all
        samples in X.

    classes: numpy.ndarray, optional (default=None)
        Array with all possible/known classes.

    sample_weight: Not used.

    Returns
    -------
    KNNADWINClassifier
        self

    Notes
    -----
    Every sample is added to the window and ADWIN is updated with whether
    the sample was classified correctly (0 while the window holds fewer
    than ``n_neighbors`` samples). If ADWIN detects a change, the oldest
    samples are dropped until the window is as small as the ADWIN width.

    """
    n_rows, _ = get_dimensions(X)
    if classes is not None:
        self.classes = list(set().union(self.classes, classes))

    for row in range(n_rows):
        self.data_window.add_sample(X[row], y[row])
        if self.data_window.size >= self.n_neighbors:
            hit = self.predict(np.asarray([X[row]])) == y[row]
            self.adwin.add_element(1 if hit else 0)
        else:
            self.adwin.add_element(0)

    if (self.data_window.size >= self.n_neighbors
            and self.adwin.detected_change()
            and self.adwin.width < self.data_window.size):
        for _ in range(self.data_window.size, self.adwin.width, -1):
            self.data_window.delete_oldest_sample()
    return self
def update_weights(self, X, y, learning_ratio, rht):
    """Update the perceptron weights

    Parameters
    ----------
    X: numpy.ndarray of length equal to the number of features.
        Instance attributes for updating the node.
    y: numpy.ndarray of length equal to the number of targets.
        Targets values.
    learning_ratio: float
        perceptron learning ratio
    rht: RegressionHoeffdingTree
        Regression Hoeffding Tree to update.
    """
    normalized_sample = rht.normalize_sample(X)
    normalized_base_pred = self._predict_base(normalized_sample)
    normalized_target_value = rht.normalized_target_value(y)

    # Base layer: scaled outer product of the prediction error and the
    # normalized sample.
    self.perceptron_weight[0] += learning_ratio * \
        (normalized_target_value - normalized_base_pred)[:, None] @ \
        normalized_sample[None, :]

    # Add bias term
    normalized_base_pred = np.append(normalized_base_pred, 1.0)
    normalized_meta_pred = self._predict_meta(normalized_base_pred)

    # Stacked layer: same rule, using the base predictions as input.
    self.perceptron_weight[1] += learning_ratio * \
        (normalized_target_value - normalized_meta_pred)[:, None] @ \
        normalized_base_pred[None, :]

    self.normalize_perceptron_weights()
def partial_fit(self, X, y, classes=None, sample_weight=None):
    """Incrementally train the model, one instance at a time.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        Instance attributes.
    y: array_like
        Targets for the samples in X. Nothing is trained when None.
    classes: list or numpy.array
        Known class values; stored on the first call that provides them.
    sample_weight: array-like
        One weight per sample. If not provided, uniform weights are
        assumed. Samples with zero weight are skipped.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the number of weights differs from the number of instances.
    """
    if self.classes is None and classes is not None:
        self.classes = classes
    if y is None:
        return self

    n_rows, _ = get_dimensions(X)
    if sample_weight is None:
        sample_weight = np.ones(n_rows)
    if n_rows != len(sample_weight):
        raise ValueError(
            'Inconsistent number of instances ({}) and weights ({}).'.
            format(n_rows, len(sample_weight)))

    for row in range(n_rows):
        row_weight = sample_weight[row]
        if row_weight != 0.0:
            self._train_weight_seen_by_model += row_weight
            self._partial_fit(X[row], y[row], row_weight)
    return self
def partial_fit(self, X, y, classes=None, sample_weight=None):
    """ Partially (incrementally) fit the model.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        The features to train the model.

    y: numpy.ndarray of shape (n_samples)
        An array-like with the class labels of all samples in X.

    classes: numpy.ndarray, optional (default=None)
        Array with all possible/known class labels.

    sample_weight: not used (default=None)

    Raises
    ------
    ValueError
        If no classes are known yet and none are passed, or if the passed
        classes differ (as a set) from the ones stored by an earlier call.

    Returns
    -------
    LeveragingBaggingClassifier
        self

    """
    if self.classes is None:
        if classes is None:
            raise ValueError("The first partial_fit call should pass all the classes.")
        self.classes = classes
    elif classes is not None and set(self.classes) != set(classes):
        raise ValueError(
            "The classes passed to the partial_fit function differ from those passed in "
            "an earlier call.")

    n_rows, _ = get_dimensions(X)
    for row in range(n_rows):
        self.__partial_fit(X[row], y[row])
    return self
def transform(self, X):
    """ transform

    Replace, in place, every entry of X found in ``missing_value`` with
    the substitute for its column.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        The sample or set of samples that should be transformed.

    Returns
    -------
    numpy.ndarray
        The same X object, after the replacements.

    """
    n_rows, n_cols = get_dimensions(X)
    for row in range(n_rows):
        for col in range(n_cols):
            if X[row][col] in self.missing_value:
                X[row][col] = self._get_substitute(col)
    return X
def transform(self, X):
    """ transform

    Replace, in place, every missing entry of X with the substitute for
    its column. An entry is missing when it is found in ``missing_value``
    or is NaN.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        The sample or set of samples that should be transformed.

    Returns
    -------
    numpy.ndarray
        The same X object, after the replacements.

    """
    n_rows, n_cols = get_dimensions(X)
    uses_window = self.strategy in ['mean', 'median', 'mode']
    for row in range(n_rows):
        if uses_window:
            # The sample is added to the window before its own missing
            # entries are replaced.
            self.window.add_element([X[row][:]])
        for col in range(n_cols):
            value = X[row][col]
            if value in self.missing_value or np.isnan(value):
                X[row][col] = self._get_substitute(col)
    return X
def predict_proba(self, X):
    """Estimate the class probabilities for every sample in X.

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        Samples to predict.

    Returns
    -------
    numpy.ndarray
        One entry per sample. A sample without votes yields ``[0]``;
        otherwise the entry is indexed by class value and holds the
        (normalized, when their sum is non-zero) votes.
    """
    n_rows, _ = get_dimensions(X)
    predictions = []
    for row in range(n_rows):
        votes = self.get_votes_for_instance(X[row]).copy()
        if votes == {}:
            # Tree is empty, all classes equal, default to zero
            predictions.append([0])
            continue

        # The votes are iterated as a collection of mappings and merged
        # into one dictionary; later mappings win on duplicate keys.
        merged = {key: part[key] for part in votes for key in part}
        if sum(merged.values()) != 0:
            normalize_values_in_dict(merged)

        if self.classes is not None:
            n_labels = int(max(self.classes)) + 1
        else:
            n_labels = int(max(merged.keys())) + 1
        y_proba = np.zeros(n_labels)
        for key, value in merged.items():
            y_proba[int(key)] = value
        predictions.append(y_proba)
    return np.array(predictions)
def predict(self, X):
    """Predicts the label of the X instance(s)

    Parameters
    ----------
    X: numpy.ndarray of shape (n_samples, n_features)
        Samples for which we want to predict the labels.

    Returns
    -------
    numpy.ndarray
        Predicted labels for all instances in X: the key with the largest
        vote, or 0 for an instance without votes.

    """
    n_rows, _ = get_dimensions(X)
    labels = []
    for row in range(n_rows):
        votes = self.get_votes_for_instance(X[row])
        # Ensemble is empty, all classes equal, default to zero
        labels.append(0 if votes == {} else max(votes, key=votes.get))
    return np.asarray(labels)