def test_build_error():
    """Checks that a `BuildError` instance can be raised and caught by its own type."""

    # Instantiating the error beforehand, so construction and raising are exercised separately
    error = exception.BuildError('error')

    try:
        raise error

    except exception.BuildError:
        # Being caught by the exact class is the only expectation of this test
        return
def fit(self, X_train, Y_train=None):
    """Clusters the training data and marks the resulting subgraph as trained.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.

    Raises:
        BuildError: If pre-computed distances are enabled and the matrix is not
            `n_nodes x n_nodes`.

    """

    logger.info('Clustering with classifier ...')

    # Timestamp used to report how long the clustering took
    started_at = time.time()

    # Every call starts from a brand new k-nn subgraph
    self.subgraph = KNNSubgraph(X_train, Y_train)

    if self.pre_computed_distance:
        n_nodes = self.subgraph.n_nodes
        shape = self.pre_distances.shape

        # The second dimension is only inspected when the first one already matches
        is_square_fit = shape[0] == n_nodes and shape[1] == n_nodes

        if not is_square_fit:
            raise e.BuildError(
                'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
            )

    # Searches the best minimum cut within the `[min_k, max_k]` bounds
    self._best_minimum_cut(self.min_k, self.max_k)

    # Clusters with whatever `k` the search above stored in the subgraph
    self._clustering(self.subgraph.best_k)

    # From now on the subgraph is flagged as trained
    self.subgraph.trained = True

    elapsed = time.time() - started_at

    logger.info('Classifier has been clustered with.')
    logger.info(f'Number of clusters: {self.subgraph.n_clusters}.')
    logger.info(f'Clustering time: {elapsed} seconds.')
def _learn(self, X_train, Y_train, I_train, X_val, Y_val, I_val):
    """Learns the best `k` value over the validation set.

    Every `k` in `[1, max_k]` is evaluated: arcs and p.d.f. are computed, the
    subgraph is clustered, and the accuracy over the validation set is measured.
    The `k` with the strictly highest accuracy is stored in `subgraph.best_k`.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        I_train (np.array): Array of training indexes.
        X_val (np.array): Array of validation features.
        Y_val (np.array): Array of validation labels.
        I_val (np.array): Array of validation indexes.

    Raises:
        BuildError: If pre-computed distances are enabled and the matrix is not
            `n_nodes x n_nodes`.

    """

    logger.info('Learning best `k` value ...')

    # Creating a subgraph
    self.subgraph = KNNSubgraph(X_train, Y_train, I_train)

    # Checks if it is supposed to use pre-computed distances
    if self.pre_computed_distance:
        # Checks if its size is the same as the subgraph's amount of nodes
        if (self.pre_distances.shape[0] != self.subgraph.n_nodes
                or self.pre_distances.shape[1] != self.subgraph.n_nodes):
            # If not, raises an error
            raise e.BuildError(
                'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
            )

    # Defining initial maximum accuracy as 0
    max_acc = 0.0

    # Falls back to the smallest candidate, so `best_k` is always bound even when
    # no `k` scores above zero (previously an `UnboundLocalError` after the loop)
    best_k = 1

    # For every possible `k` value
    for k in range(1, self.max_k + 1):
        # Gathers current `k` as subgraph's best `k`, as `predict` reads it from there
        self.subgraph.best_k = k

        # Calculate the arcs using the current `k` value
        self.subgraph.create_arcs(k, self.distance_fn,
                                  self.pre_computed_distance,
                                  self.pre_distances)

        # Calculate the p.d.f. using the current `k` value
        self.subgraph.calculate_pdf(k, self.distance_fn,
                                    self.pre_computed_distance,
                                    self.pre_distances)

        # Clusters the subgraph
        self._clustering()

        # Calculate the predictions over the validation set
        preds = self.predict(X_val, I_val)

        # Calculating the accuracy
        acc = g.opf_accuracy(Y_val, preds)

        # Only a strictly better accuracy replaces the current best, so ties keep the smaller `k`
        if acc > max_acc:
            max_acc = acc
            best_k = k

        logger.info('Accuracy over k = %d: %s', k, acc)

        # Arcs are rebuilt from scratch for the next candidate
        self.subgraph.destroy_arcs()

    # Applying the best `k` to the subgraph's property
    self.subgraph.best_k = best_k
def predict(self, X_val, I_val=None):
    """Labels new samples from their `best_k` nearest nodes of the clustered subgraph.

    Args:
        X_val (np.array): Array of validation features.
        I_val (np.array): Array of validation indexes.

    Returns:
        A tuple `(preds, clusters)` holding, for each record of the data, its
        predicted label and its cluster label.

    Raises:
        BuildError: If there is no subgraph or it has not been flagged as trained.

    """

    # Nothing can be predicted without a subgraph
    if not self.subgraph:
        raise e.BuildError('KNNSubgraph has not been properly created')

    # Nor with a subgraph that was never clustered
    if not self.subgraph.trained:
        raise e.BuildError('Classifier has not been properly clustered')

    logger.info('Predicting data ...')

    tic = time.time()

    # Samples to be predicted live in their own subgraph
    pred_subgraph = KNNSubgraph(X_val, I=I_val)

    best_k = self.subgraph.best_k

    # Both buffers carry one extra slot (position `best_k`) where each new
    # candidate is staged before being bubbled into the sorted prefix
    distances = np.zeros(best_k + 1)
    neighbours_idx = np.zeros(best_k + 1)

    for i in range(pred_subgraph.n_nodes):
        # Any real cost will beat this initial value
        best_cost = -c.FLOAT_MAX

        # Resets the buffer, so unfilled slots can be recognized later
        distances.fill(c.FLOAT_MAX)

        for j in range(self.subgraph.n_nodes):
            # Nodes sharing the same position are not compared
            if j == i:
                continue

            if self.pre_computed_distance:
                # Looks the distance up in the matrix, addressed by the nodes' indexes
                candidate = self.pre_distances[pred_subgraph.nodes[i].idx][
                    self.subgraph.nodes[j].idx]
            else:
                # Measures the distance between nodes `i` and `j`
                candidate = self.distance_fn(pred_subgraph.nodes[i].features,
                                             self.subgraph.nodes[j].features)

            # Stages the candidate in the extra slot
            distances[best_k] = candidate
            neighbours_idx[best_k] = j

            # Bubbles the candidate towards the front while it is closer than its predecessor
            slot = best_k
            while slot > 0 and distances[slot] < distances[slot - 1]:
                prev = slot - 1

                distances[slot], distances[prev] = distances[prev], distances[slot]
                neighbours_idx[slot], neighbours_idx[prev] = (
                    neighbours_idx[prev], neighbours_idx[slot])

                slot = prev

        # Accumulates the density over the `best_k` closest distances
        density = 0.0
        for k in range(best_k):
            density += np.exp(-distances[k] / self.subgraph.constant)

        # Gather its mean value
        density /= best_k

        # Scale the density between minimum and maximum values
        density_span = (self.subgraph.max_density - self.subgraph.min_density
                        + c.EPSILON)
        density = ((c.MAX_DENSITY - 1)
                   * (density - self.subgraph.min_density) / density_span) + 1

        for k in range(best_k):
            # Slots still holding the fill value never received a neighbour
            if distances[k] == c.FLOAT_MAX:
                continue

            neighbour = int(neighbours_idx[k])
            trained_node = self.subgraph.nodes[neighbour]

            # The offered cost is bounded by the sample's own density
            temp_cost = np.minimum(trained_node.cost, density)

            # The neighbour offering the highest cost conquers the sample
            if temp_cost > best_cost:
                best_cost = temp_cost

                pred_subgraph.nodes[i].predicted_label = trained_node.predicted_label
                pred_subgraph.nodes[i].cluster_label = trained_node.cluster_label

    # Collecting the outputs, one entry per predicted node
    preds = [node.predicted_label for node in pred_subgraph.nodes]
    clusters = [node.cluster_label for node in pred_subgraph.nodes]

    predict_time = time.time() - tic

    logger.info('Data has been predicted.')
    logger.info('Prediction time: %s seconds.', predict_time)

    return preds, clusters
def predict(self, X_val, I_val=None):
    """Labels new samples by letting the fitted nodes compete for each of them.

    Args:
        X_val (np.array): Array of validation or test features.
        I_val (np.array): Array of validation or test indexes.

    Returns:
        A list of predictions for each record of the data.

    Raises:
        BuildError: If there is no subgraph or it has not been flagged as trained.

    """

    if not self.subgraph:
        raise e.BuildError('Subgraph has not been properly created')

    if not self.subgraph.trained:
        raise e.BuildError('Classifier has not been properly fitted')

    logger.info('Predicting data ...')

    started_at = time.time()

    # Samples to be predicted live in their own subgraph
    pred_subgraph = Subgraph(X_val, I=I_val)

    def arc_weight(trained_idx, sample_idx):
        """Weight of the arc between a fitted node and a sample, looked up or computed."""

        if self.pre_computed_distance:
            return self.pre_distances[self.subgraph.nodes[trained_idx].idx][
                pred_subgraph.nodes[sample_idx].idx]

        return self.distance_fn(self.subgraph.nodes[trained_idx].features,
                                pred_subgraph.nodes[sample_idx].features)

    for i in range(pred_subgraph.n_nodes):
        # No fitted node has taken over from the first one yet
        conqueror = -1

        # Position inside the ordered list of fitted nodes
        j = 0

        # The first node of the ordered list makes the opening offer
        first = self.subgraph.idx_nodes[j]
        weight = arc_weight(first, i)

        # An offer is the maximum between the node's cost and the arc's weight
        min_cost = np.maximum(self.subgraph.nodes[first].cost, weight)
        current_label = self.subgraph.nodes[first].predicted_label

        while j < (self.subgraph.n_nodes - 1):
            candidate = self.subgraph.idx_nodes[j + 1]

            # Stops as soon as the next node's own cost cannot beat the current offer
            # NOTE(review): presumably relies on `idx_nodes` being ordered by cost - confirm
            if not min_cost > self.subgraph.nodes[candidate].cost:
                break

            weight = arc_weight(candidate, i)
            temp_min_cost = np.maximum(self.subgraph.nodes[candidate].cost, weight)

            # A strictly cheaper offer takes the sample over
            if temp_min_cost < min_cost:
                min_cost = temp_min_cost
                conqueror = candidate
                current_label = self.subgraph.nodes[candidate].predicted_label

            j += 1

        # The sample inherits the label of whoever made the cheapest offer
        pred_subgraph.nodes[i].predicted_label = current_label

        # Only nodes that replaced the opening offer are reported through `mark_nodes`
        if conqueror > -1:
            self.subgraph.mark_nodes(conqueror)

    # Creating the list of predictions
    preds = [node.predicted_label for node in pred_subgraph.nodes]

    predict_time = time.time() - started_at

    logger.info('Data has been predicted.')
    logger.info('Prediction time: %s seconds.', predict_time)

    return preds
def _learn(self, X_train, Y_train, X_val, Y_val):
    """Learns the best `k` value over the validation set.

    Every `k` in `[1, max_k]` is evaluated: arcs and p.d.f. are computed, the
    subgraph is clustered, and the accuracy over the validation set is measured.
    The `k` with the strictly highest accuracy is stored in `subgraph.best_k`.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        X_val (np.array): Array of validation features.
        Y_val (np.array): Array of validation labels.

    Raises:
        BuildError: If pre-computed distances are enabled and the matrix is not
            `n_nodes x n_nodes`.

    """

    logger.info('Learning best `k` value ...')

    # Creating a subgraph
    self.subgraph = KNNSubgraph(X_train, Y_train)

    # Checks if it is supposed to use pre-computed distances
    if self.pre_computed_distance:
        # Checks if its size is the same as the subgraph's amount of nodes
        if (self.pre_distances.shape[0] != self.subgraph.n_nodes
                or self.pre_distances.shape[1] != self.subgraph.n_nodes):
            # If not, raises an error
            raise e.BuildError(
                'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
            )

    # Defining initial maximum accuracy as 0
    max_acc = 0.0

    # Falls back to the smallest candidate, so `best_k` is always bound even when
    # no `k` scores above zero (previously an `UnboundLocalError` after the loop)
    best_k = 1

    # For every possible `k` value
    for k in range(1, self.max_k + 1):
        # Gathers current `k` as subgraph's best `k`, as `predict` reads it from there
        self.subgraph.best_k = k

        # Calculate the arcs using the current `k` value
        self.subgraph.create_arcs(k, self.distance_fn,
                                  self.pre_computed_distance,
                                  self.pre_distances)

        # Calculate the p.d.f. using the current `k` value
        self.subgraph.calculate_pdf(k, self.distance_fn,
                                    self.pre_computed_distance,
                                    self.pre_distances)

        # Clusters the subgraph
        self._clustering()

        # Calculate the predictions over the validation set
        preds = self.predict(X_val)

        # Calculating the accuracy
        acc = g.opf_accuracy(Y_val, preds)

        # Only a strictly better accuracy replaces the current best, so ties keep the smaller `k`
        if acc > max_acc:
            # Replaces the maximum accuracy value
            max_acc = acc

            # Defines current `k` as the best `k` value
            best_k = k

        logger.info(f'Accuracy over k = {k}: {acc}')

        # Arcs are rebuilt from scratch for the next candidate
        self.subgraph.destroy_arcs()

    # Applying the best `k` to the subgraph's property
    self.subgraph.best_k = best_k
def fit(self, X_train, Y_train, X_unlabeled):
    """Fits the semi-supervised classifier over labeled and unlabeled samples.

    Prototypes are found over the labeled samples only; afterwards the unlabeled
    samples are appended as nodes and every node is conquered through a
    cost-propagation loop driven by a minimum heap.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        X_unlabeled (np.array): Array of unlabeled features.

    Raises:
        BuildError: If pre-computed distances are enabled and the matrix is not
            `n_nodes x n_nodes`.

    """

    logger.info('Fitting semi-supervised classifier ...')

    tic = time.time()

    # The subgraph starts with the labeled samples only
    self.subgraph = Subgraph(X_train, Y_train)

    # Prototypes are chosen before any unlabeled sample joins the subgraph
    self._find_prototypes()

    # Unlabeled nodes are numbered right after the labeled ones
    offset = self.subgraph.n_nodes

    for position, features in enumerate(X_unlabeled):
        # `1` is a placeholder label, overwritten once the node gets conquered
        unlabeled_node = Node(offset + position, 1, features)
        self.subgraph.nodes.append(unlabeled_node)

    if self.pre_computed_distance:
        n_nodes = self.subgraph.n_nodes
        shape = self.pre_distances.shape

        # The second dimension is only inspected when the first one already matches
        is_square_fit = shape[0] == n_nodes and shape[1] == n_nodes

        if not is_square_fit:
            raise e.BuildError(
                'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
            )

    # Creating a minimum heap
    h = Heap(size=self.subgraph.n_nodes)

    for i in range(self.subgraph.n_nodes):
        node = self.subgraph.nodes[i]

        if node.status == c.PROTOTYPE:
            # Prototypes are roots: no predecessor, own label, zero cost
            node.pred = c.NIL
            node.predicted_label = node.label
            h.cost[i] = 0

            # Only prototypes are inserted up front
            h.insert(i)
            continue

        # Every other node starts with the maximum possible cost
        h.cost[i] = c.FLOAT_MAX

    while not h.is_empty():
        # Takes the next node out of the heap
        p = h.remove()

        # Records the order in which nodes left the heap, along with their final cost
        self.subgraph.idx_nodes.append(p)
        self.subgraph.nodes[p].cost = h.cost[p]

        for q in range(self.subgraph.n_nodes):
            # A node does not compete with itself
            if p == q:
                continue

            # `p` can only improve nodes that are currently more expensive than itself
            if not h.cost[p] < h.cost[q]:
                continue

            if self.pre_computed_distance:
                # Looks the distance up in the matrix, addressed by the nodes' indexes
                weight = self.pre_distances[self.subgraph.nodes[p].idx][
                    self.subgraph.nodes[q].idx]
            else:
                # Calls the corresponding distance function
                weight = self.distance_fn(self.subgraph.nodes[p].features,
                                          self.subgraph.nodes[q].features)

            # The offered cost is the maximum between `p` cost and the arc's weight
            current_cost = np.maximum(h.cost[p], weight)

            if current_cost < h.cost[q]:
                conquered = self.subgraph.nodes[q]

                # `q` is conquered by `p` and inherits its predicted label
                conquered.pred = p
                conquered.predicted_label = self.subgraph.nodes[p].predicted_label

                # Unlabeled nodes have no true label, so it mirrors the predicted one
                conquered.label = conquered.predicted_label

                # Updates the heap `q` node and the current cost
                h.update(q, current_cost)

    # From now on the subgraph is flagged as trained
    self.subgraph.trained = True

    train_time = time.time() - tic

    logger.info('Semi-supervised classifier has been fitted.')
    logger.info(f'Training time: {train_time} seconds.')
def predict(self, X_val, I_val=None):
    """Labels new samples from the `best_k` neighbours returned by the ANN search.

    Args:
        X_val (np.array): Array of validation features.
        I_val (np.array): Array of validation indexes.

    Returns:
        A tuple `(preds, clusters)` holding, for each record of the data, its
        predicted label and its cluster label.

    Raises:
        BuildError: If there is no subgraph or it has not been flagged as trained.

    """

    # Nothing can be predicted without a subgraph
    if not self.subgraph:
        raise e.BuildError('ANNSubgraph has not been properly created')

    # Nor with a subgraph that was never clustered
    if not self.subgraph.trained:
        raise e.BuildError('Classifier has not been properly clustered')

    logger.info('Predicting data ...')

    tic = time.time()

    # Samples to be predicted live in their own subgraph
    pred_subgraph = ANNSubgraph(X_val, I=I_val)

    best_k = self.subgraph.best_k

    for i in range(pred_subgraph.n_nodes):
        # Asks the search structure for the `best_k` neighbours of the sample
        neighbors_idx, distances = self.ann_search.query(
            pred_subgraph.nodes[i].features, best_k)

        # Mean of the kernel applied over the neighbours' distances
        kernel = np.exp(-np.array(distances) / self.subgraph.constant)
        density = np.sum(kernel) / best_k

        # Scale the density between minimum and maximum values
        density_span = (self.subgraph.max_density - self.subgraph.min_density
                        + c.EPSILON)
        density = ((c.MAX_DENSITY - 1)
                   * (density - self.subgraph.min_density) / density_span) + 1

        neighbor_costs = [self.subgraph.nodes[idx].cost for idx in neighbors_idx]

        # Each neighbour's offer is bounded by the sample's density; the highest offer wins
        winner_pos = np.argmax(np.minimum(neighbor_costs, [density]))
        winner = int(neighbors_idx[winner_pos])

        # Propagates both labels from the winning neighbour
        pred_subgraph.nodes[i].predicted_label = self.subgraph.nodes[
            winner].predicted_label
        pred_subgraph.nodes[i].cluster_label = self.subgraph.nodes[
            winner].cluster_label

        # Per-sample temporaries are dropped before the next iteration
        del neighbor_costs, winner

    # Collecting the outputs, one entry per predicted node
    preds = [node.predicted_label for node in pred_subgraph.nodes]
    clusters = [node.cluster_label for node in pred_subgraph.nodes]

    # The prediction time is also kept on the instance
    self.pred_time = time.time() - tic

    logger.info('Data has been predicted.')
    logger.info(f'Prediction time: {self.pred_time : .4f} seconds.')

    return preds, clusters