def predict(self, x: spa.csr_matrix):
    """Return a matrix with the predicted ratings.

    The stored values of ``x`` are log-transformed (``log(value + 1)``) to
    avoid underflow and to take into account that the probability of
    appearance of a word increases if the same word has already appeared
    before.  Uses only matrix operations.

    The transform is applied to a copy, so ``x`` itself is left untouched
    (same values, same dtype).

    Parameters
    ----------
    x : spa.csr_matrix
        Sparse matrix to classify; one prediction is produced per row.

    Returns
    -------
    Row of predicted ratings, computed as the per-row ``argmax`` of the
    log-scores plus one (presumably ratings are 1-based -- confirm against
    ``train``).  If the classifier has not been trained, an explanatory
    message string is returned instead of a prediction.

    Notes
    -----
    As a side effect ``self.log_cond_prob_matrix`` is (re)built by
    horizontally stacking ``self.log_cond_prob_trans``.
    """
    if not self.is_trained:
        # Kept as a returned string (not an exception) so existing callers
        # that inspect the return value keep working.
        return ("The Classifier has not been trained. Please use "
                "train(train_data: spa.csr_matrix, scores: np.ndarray, "
                "Laplace_alpha) to train the Classifier.")

    # Work on a copy: transforming x.data in place and undoing it afterwards
    # with exp() leaves round-off in the caller's matrix, changes an integer
    # matrix to float, and leaves x corrupted if anything below raises.
    log_x = x.copy()
    log_x.data = np.log(log_x.data + 1)

    self.log_cond_prob_matrix = spa.hstack(self.log_cond_prob_trans)
    log_freq = np.log(self.fractions)
    pre_final_result = log_x.dot(self.log_cond_prob_matrix) + log_freq
    return (pre_final_result.argmax(axis=1) + 1).transpose()
def weight_matrix(self, dataMatrix: sps.csr_matrix, feature_data):
    """Scale the stored values of ``dataMatrix`` by inverse log-weights.

    Each stored value is multiplied by ``1 / log1p(feature_data)``.  The
    matrix is updated in place (its ``data`` attribute is replaced) and the
    same object is returned.

    ``feature_data`` must broadcast against ``dataMatrix.data``
    (presumably one weight per stored value -- confirm against callers).
    """
    inverse_log_weights = 1 / np.log1p(feature_data)
    dataMatrix.data = dataMatrix.data * inverse_log_weights
    return dataMatrix
def weight_matrix(self, dataMatrix: sps.csr_matrix, feature_data):
    """Scale the stored values of ``dataMatrix`` by log-damped weights.

    Weights greater than 1 are replaced by their natural logarithm; weights
    less than or equal to 1 are used unchanged.  Each stored value of
    ``dataMatrix`` is then multiplied by its weight.  The matrix is updated
    in place (its ``data`` attribute is replaced) and the same object is
    returned.

    ``feature_data`` is never modified: the damped weights are computed in
    a private floating-point copy, so repeated calls with the same array
    give the same result and integer weights are not truncated.

    ``feature_data`` must broadcast against ``dataMatrix.data``
    (presumably one weight per stored value -- confirm against callers).
    """
    # Private copy: writing the logs back into the caller's array would
    # compound on every call and truncate to int for integer input.
    weights = np.array(feature_data)
    if not np.issubdtype(weights.dtype, np.floating):
        weights = weights.astype(np.float64)

    above_one = weights > 1
    weights[above_one] = np.log(weights[above_one])

    dataMatrix.data = dataMatrix.data * weights
    return dataMatrix
def weight_matrix(self, dataMatrix: sps.csr_matrix, feature_data):
    """Multiply the stored values of ``dataMatrix`` by ``feature_data``.

    The matrix is updated in place (its ``data`` attribute is replaced by
    the element-wise product) and the same object is returned.

    ``feature_data`` must broadcast against ``dataMatrix.data``
    (presumably one weight per stored value -- confirm against callers).
    """
    scaled_values = dataMatrix.data * feature_data
    dataMatrix.data = scaled_values
    return dataMatrix
def _reweight_values(self, doc_term_matrix: sp.csr_matrix) -> sp.csr_matrix: """ Re-weight values in a doc-term matrix according to parameters specified in :class:`Vectorizer` initialization: binary or tf-idf weighting, sublinear term-frequency, document-normalized weights. Args: doc_term_matrix Returns: Reweighted doc-term matrix. """ # re-weight the local components (term freqs) if self.tf_type == "binary": doc_term_matrix.data.fill(1) elif self.tf_type == "bm25": if not self.dl_type: doc_term_matrix.data = (doc_term_matrix.data * (BM25_K1 + 1.0) / (BM25_K1 + doc_term_matrix.data)) else: dls = get_doc_lengths(doc_term_matrix, type_=self.dl_type) length_norm = (1 - BM25_B) + (BM25_B * (dls / self._avg_doc_length)) doc_term_matrix = doc_term_matrix.tocoo(copy=False) doc_term_matrix.data = ( doc_term_matrix.data * (BM25_K1 + 1.0) / (doc_term_matrix.data + (BM25_K1 * length_norm[doc_term_matrix.row]))) doc_term_matrix = doc_term_matrix.tocsr(copy=False) elif self.tf_type == "sqrt": _ = np.sqrt(doc_term_matrix.data, doc_term_matrix.data, casting="unsafe") elif self.tf_type == "log": _ = np.log(doc_term_matrix.data, doc_term_matrix.data, casting="unsafe") doc_term_matrix.data += 1.0 elif self.tf_type == "linear": pass # tfs are already linear else: # this should never raise, i'm just being a worrywart raise ValueError( errors.value_invalid_msg( "tf_type", self.tf_type, {"binary", "bm25", "sqrt", "log", "linear"})) # apply the global component (idfs), column-wise if self.idf_type: doc_term_matrix = doc_term_matrix * self._idf_diag # apply normalizations, row-wise # unless we've already handled it for bm25-style tf if self.dl_type and self.tf_type != "bm25": n_docs, _ = doc_term_matrix.shape dls = get_doc_lengths(doc_term_matrix, type_=self.dl_type) dl_diag = sp.spdiags(1.0 / dls, diags=0, m=n_docs, n=n_docs, format="csr") doc_term_matrix = dl_diag * doc_term_matrix if self.norm is not None: doc_term_matrix = normalize_mat(doc_term_matrix, norm=self.norm, axis=1, 
copy=False) return doc_term_matrix
def _eliminate_subzeroes(self, m: sparse.csr_matrix, epsilon): m.data = np.where(abs(m.data) < epsilon, 0, m.data) m.eliminate_zeros()
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    * Graphs
    * Digraphs

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Raises
    ------
    ValueError
        If the graph has fewer than two nodes or no positive total edge weight.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.52

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)

    n = adjacency.shape[0]
    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    total_weight = adjacency.data.sum()
    if total_weight <= 0:
        raise ValueError('The graph must contain at least one edge.')

    # Rescale the edge weights so that they sum to one.
    adjacency.data = adjacency.data / total_weight

    (aggregate_graph, height, cluster_weight, edge_sampling,
     weights_row, weights_col) = _instanciate_vars(adjacency, weights)
    node_sampling = np.zeros(n - 1)

    for t in range(n - 1):
        merge = dendrogram[t]
        first = int(merge[0])
        second = int(merge[1])
        merge_height = merge[2]

        # When one of the merged clusters was itself created at the same
        # height, this merge takes over the mass accumulated by that cluster.
        if first >= n and height[first - n] == merge_height:
            inherited = first - n
        elif second >= n and height[second - n] == merge_height:
            inherited = second - n
        else:
            inherited = None

        if inherited is not None:
            edge_sampling[t] = edge_sampling[inherited]
            edge_sampling[inherited] = 0
            node_sampling[t] = node_sampling[inherited]

        neighbors_of_first = aggregate_graph.neighbors[first]
        if second in neighbors_of_first:
            edge_sampling[t] += neighbors_of_first[second]

        out_weights = aggregate_graph.cluster_out_weights
        in_weights = aggregate_graph.cluster_in_weights
        node_sampling[t] += (out_weights[first] * in_weights[second]
                             + out_weights[second] * in_weights[first])

        height[t] = merge_height
        aggregate_graph.merge(first, second)

    # Only merges with non-zero edge mass contribute to the divergence.
    index = np.nonzero(edge_sampling)[0]
    edge_mass = edge_sampling[index]
    score = edge_mass.dot(np.log(edge_mass / node_sampling[index]))

    if normalized:
        out_scaling = sparse.diags(weights_row, shape=(n, n), format='csr')
        out_scaling.data = 1 / out_scaling.data
        in_scaling = sparse.diags(weights_col, shape=(n, n), format='csr')
        in_scaling.data = 1 / in_scaling.data
        sampling_ratio = out_scaling.dot(adjacency.dot(in_scaling))

        # Reuse the two diagonal matrices with unit entries so the edge
        # probabilities go through the same sequence of products as the ratio.
        out_scaling.data = np.ones(out_scaling.data.size)
        in_scaling.data = np.ones(in_scaling.data.size)
        edge_probs = out_scaling.dot(adjacency.dot(in_scaling))

        mutual_information = edge_probs.data.dot(np.log(sampling_ratio.data))
        score /= mutual_information

    return score
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """
    Tree sampling divergence of a hierarchy (quality metric). The higher the score, the better.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized by the mutual information of the graph.

    Returns
    -------
    score : float
        The tree sampling divergence of the hierarchy.
        If normalized, returns a value between 0 and 1.

    Raises
    ------
    ValueError
        If the adjacency matrix is not square, the graph has fewer than two
        nodes, or the total edge weight is not positive.

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square.')

    n = adjacency.shape[0]
    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    total_weight = adjacency.data.sum()
    if total_weight <= 0:
        raise ValueError('The graph must contain at least one edge.')

    # Rescale the edge weights so that they sum to one.
    adjacency.data = adjacency.data / total_weight

    out_weights = check_probs(weights, adjacency)
    in_weights = check_probs(weights, adjacency.T)
    aggregate_graph = AggregateGraph(adjacency + adjacency.T, out_weights, in_weights)

    height = np.zeros(n - 1)
    edge_sampling = np.zeros(n - 1)
    node_sampling = np.zeros(n - 1)

    for t in range(n - 1):
        merge = dendrogram[t]
        node1 = int(merge[0])
        node2 = int(merge[1])
        merge_height = merge[2]

        # When one of the merged clusters was itself created at the same
        # height, this merge takes over the mass accumulated by that cluster.
        if node1 >= n and height[node1 - n] == merge_height:
            inherited = node1 - n
        elif node2 >= n and height[node2 - n] == merge_height:
            inherited = node2 - n
        else:
            inherited = None

        if inherited is not None:
            edge_sampling[t] = edge_sampling[inherited]
            edge_sampling[inherited] = 0
            node_sampling[t] = node_sampling[inherited]

        neighbors_of_node1 = aggregate_graph.neighbors[node1]
        if node2 in neighbors_of_node1:
            edge_sampling[t] += neighbors_of_node1[node2]

        cluster_out = aggregate_graph.cluster_out_weights
        cluster_in = aggregate_graph.cluster_in_weights
        node_sampling[t] += (cluster_out[node1] * cluster_in[node2]
                             + cluster_out[node2] * cluster_in[node1])

        height[t] = merge_height
        aggregate_graph.merge(node1, node2)

    # Only merges with non-zero edge mass contribute to the divergence.
    index = np.nonzero(edge_sampling)[0]
    edge_mass = edge_sampling[index]
    score = edge_mass.dot(np.log(edge_mass / node_sampling[index]))

    if normalized:
        out_scaling = sparse.diags(out_weights, shape=(n, n), format='csr')
        out_scaling.data = 1 / out_scaling.data
        in_scaling = sparse.diags(in_weights, shape=(n, n), format='csr')
        in_scaling.data = 1 / in_scaling.data
        sampling_ratio = out_scaling.dot(adjacency.dot(in_scaling))

        # Reuse the two diagonal matrices with unit entries so the edge
        # probabilities go through the same sequence of products as the ratio.
        out_scaling.data = np.ones(out_scaling.data.size)
        in_scaling.data = np.ones(in_scaling.data.size)
        edge_probs = out_scaling.dot(adjacency.dot(in_scaling))

        mutual_information = edge_probs.data.dot(np.log(sampling_ratio.data))
        score /= mutual_information

    return score