def page_rank_nibble(self, g, ref_node, vol, phi = 0.5, algorithm = 'fista', epsilon = 1.0e-2, max_iter = 10000, max_time = 100, cpp = True):
    """
    DESCRIPTION
    -----------

    PageRank-Nibble local clustering around a reference node.

    An approximate personalized PageRank vector is computed with the
    selected solver and then rounded to a cluster with a sweep cut.
    For details please refer to:
    R. Andersen, F. Chung and K. Lang. Local Graph Partitioning using
    PageRank Vectors.
    link: http://www.cs.cmu.edu/afs/cs/user/glmiller/public/Scientific-Computing/F-11/RelatedWork/local_partitioning_full.pdf

    The solver parameters are derived from the inputs as follows, with
    m = g.A.count_nonzero()/2 and B = log2(m):

        b     = min(1 + log2(vol), B)      (vol == 0 is treated as 1)
        alpha = phi**2 / (225 * ln(100 * sqrt(m)))
        rho   = (1 / 2**b) * (1 / (48 * B))

    PARAMETERS (mandatory)
    ----------------------

    g:         graph object
               Must expose the adjacency matrix as ``g.A``; the C++ FISTA
               path additionally reads ``g.d``, ``g.d_sqrt`` and
               ``g.dn_sqrt``.

    ref_node:  integer
               The reference node, i.e., node of interest around which we
               are looking for a target cluster.

    vol:       float, double
               Lower bound for the volume of the output cluster. Must be
               non-negative.

    PARAMETERS (optional)
    ---------------------

    phi:       float, double
               default == 0.5
               Target conductance for the output cluster.

    algorithm: string
               default == 'fista'
               Solver to use. Options: 'fista', 'ista', 'acl'.

    epsilon:   float, double
               default == 1.0e-2
               Termination tolerance; forwarded to the FISTA and ISTA
               solvers only.

    max_iter:  integer
               default == 10000
               Maximum number of iterations of FISTA, ISTA or ACL.

    max_time:  float, double
               default == 100
               Maximum time in seconds.

    cpp:       boolean
               default == True
               Selects the C++ FISTA solver when algorithm == 'fista', and
               selects the C++ sweep cut for every algorithm.

    RETURNS
    -------

    Nothing on success; results are stored on the calling object.

    Always set:
        node_embedding_nibble   -- the vector returned by the solver
        best_cluster_nibble     -- sweep.best_cluster
        best_conductance_nibble -- sweep.best_conductance

    Set only when cpp = False:
        sweep_profile_nibble    -- sweep.sweep_profile; entry [0] holds
                                   per-cluster values and entry [1] the
                                   matching lists of node indices
        volume_profile_nibble   -- sweep.volume_profile, same layout

    Error paths (a message is printed, nothing is stored):
        vol < 0            -> returns a tuple of five empty lists
        unknown algorithm  -> returns a tuple of four empty lists
    NOTE(review): the two error paths return tuples of different length;
    kept as-is because callers may depend on it.
    """
    num_nodes = g.A.shape[0]
    node_ids = range(num_nodes)

    num_edges = g.A.count_nonzero()/2
    log_edges = np.log2(num_edges)

    if vol < 0:
        print("The input volume must be non-negative")
        return [], [], [], [], []

    # A zero volume would make log2 blow up, so it is bumped to one.
    effective_vol = 1 if vol == 0 else vol
    exponent = min(1 + np.log2(effective_vol), log_edges)

    alpha = (phi**2)/(225*np.log(100*np.sqrt(num_edges)))
    rho = (1/(2**exponent))*(1/(48*log_edges))

    if algorithm not in ('fista', 'ista', 'acl'):
        print("There is no such algorithm provided")
        return [], [], [], []

    if algorithm == 'acl':
        p = acl_list(ref_node, g, alpha = alpha, rho = rho,
                     max_iter = max_iter, max_time = max_time)
    elif algorithm == 'ista':
        p = ista_dinput_dense(ref_node, g, alpha = alpha, rho = rho,
                              epsilon = epsilon, max_iter = max_iter,
                              max_time = max_time)
    elif cpp:
        # Only the third element of the C++ solver's result is used here.
        _, _, p = proxl1PRaccel(np.uint32(g.A.indptr), np.uint32(g.A.indices),
                                g.A.data, ref_node, g.d, g.d_sqrt, g.dn_sqrt,
                                alpha = alpha, rho = rho, epsilon = epsilon,
                                maxiter = max_iter, max_time = max_time)
    else:
        p = fista_dinput_dense(ref_node, g, alpha = alpha, rho = rho,
                               epsilon = epsilon, max_iter = max_iter,
                               max_time = max_time)

    sweep = sweepCut()

    if cpp:
        sweep.sweep_cut_cpp(p, g)
        self.node_embedding_nibble = p
        self.best_cluster_nibble = sweep.best_cluster
        self.best_conductance_nibble = sweep.best_conductance
        return

    sweep.sweep_normalized(p, g, vol)

    # Rewrite every stored cluster through the node-id range, in place.
    for profile in (sweep.sweep_profile, sweep.volume_profile):
        for k in range(len(profile[0])):
            profile[1][k] = [node_ids[j] for j in profile[1][k]]
    sweep.best_cluster = [node_ids[j] for j in sweep.best_cluster]

    self.node_embedding_nibble = p
    self.best_cluster_nibble = sweep.best_cluster
    self.best_conductance_nibble = sweep.best_conductance
    self.sweep_profile_nibble = sweep.sweep_profile
    self.volume_profile_nibble = sweep.volume_profile
def fista(self, ref_node, g, alpha = 0.15, rho = 1.0e-5, epsilon = 1.0e-6, max_iter = 10000, vol_G = -1, max_time = 100, cpp = True):
    """
    DESCRIPTION
    -----------

    Fast Iterative Soft Thresholding Algorithm (FISTA) followed by a
    sweep cut.

    FISTA is an accelerated version of ISTA for the l1-regularized
    personalized PageRank problem

        min rho*||p||_1 + <c,p> + <p,Q*p>

    where p is the PageRank vector, ||p||_1 is the l1-norm of p, rho is
    the regularization parameter of the l1-norm, c is the right hand side
    of the personalized PageRank linear system and Q is the symmetrized
    personalized PageRank matrix. For details please refer to:
    K. Fountoulakis, F. Roosta-Khorasani, J. Shun, X. Cheng and M. Mahoney.
    Variational Perspective on Local Graph Clustering. arXiv:1602.01886, 2017.
    arXiv link: https://arxiv.org/abs/1602.01886

    PARAMETERS (mandatory)
    ----------------------

    ref_node:  integer
               The reference node, i.e., node of interest around which we
               are looking for a target cluster.

    g:         graph object
               The C++ path reads ``g.A`` (indptr, indices, data), ``g.d``,
               ``g.d_sqrt`` and ``g.dn_sqrt``.

    PARAMETERS (optional)
    ---------------------

    alpha:     float, double
               default == 0.15
               Teleportation parameter of the personalized PageRank linear
               system. The smaller, the more global the personalized
               PageRank vector is.

    rho:       float, double
               default == 1.0e-5
               Regularization parameter for the l1-norm of the model.

    epsilon:   float, double
               default == 1.0e-6
               Tolerance for FISTA.

    max_iter:  integer
               default == 10000
               Maximum number of iterations of FISTA.

    vol_G:     default == -1
               Accepted for interface compatibility; not used by this
               method.

    max_time:  float, double
               default == 100
               Maximum time in seconds.

    cpp:       boolean
               default == True
               Use the C++ solver and C++ sweep cut instead of the Python
               ones.

    RETURNS
    -------

    Nothing; results are stored on the calling object.

    Always set:
        node_embedding_fista   -- the vector returned by the solver
        best_cluster_fista     -- sweep.best_cluster
        best_conductance_fista -- sweep.best_conductance

    Set only when cpp = False:
        sweep_profile_fista    -- sweep.sweep_profile
    """
    rounding = sweepCut()

    if cpp:
        # Only the third element of the C++ solver's result is kept.
        _, _, embedding = proxl1PRaccel(np.uint32(g.A.indptr),
                                        np.uint32(g.A.indices),
                                        g.A.data, ref_node,
                                        g.d, g.d_sqrt, g.dn_sqrt,
                                        alpha = alpha, rho = rho,
                                        epsilon = epsilon,
                                        maxiter = max_iter,
                                        max_time = max_time)
        self.node_embedding_fista = embedding
        rounding.sweep_cut_cpp(embedding, g)
    else:
        embedding = fista_dinput_dense(ref_node, g, alpha = alpha, rho = rho,
                                       epsilon = epsilon, max_iter = max_iter,
                                       max_time = max_time)
        self.node_embedding_fista = embedding
        rounding.sweep_normalized(embedding, g)

    self.best_cluster_fista = rounding.best_cluster
    self.best_conductance_fista = rounding.best_conductance
    if not cpp:
        # The sweep profile is only published by the Python sweep path.
        self.sweep_profile_fista = rounding.sweep_profile
def page_rank_nibble_algo(g, ref_node, vol, phi=0.5, algorithm='fista', epsilon=1.0e-2, max_iter=10000, max_time=100, cpp=True):
    """
    Compute the PageRank-Nibble diffusion vector around a reference node.

    For details please refer to:
    R. Andersen, F. Chung and K. Lang. Local Graph Partitioning using
    PageRank Vectors.
    link: http://www.cs.cmu.edu/afs/cs/user/glmiller/public/Scientific-Computing/F-11/RelatedWork/local_partitioning_full.pdf

    The solver parameters are derived from the inputs, with
    m = g.adjacency_matrix.count_nonzero() / 2 and B = log2(m):

        b     = min(1 + log2(vol), B)      (vol == 0 is treated as 1)
        alpha = phi**2 / (225 * ln(100 * sqrt(m)))
        rho   = (1 / 2**b) * (1 / (48 * B))

    Parameters (mandatory)
    ----------------------

    g: graph object
        Must expose ``adjacency_matrix``; the C++ FISTA path additionally
        reads ``d``, ``d_sqrt``, ``dn_sqrt`` and ``lib``.

    ref_node: integer
        The reference node, i.e., node of interest around which we are
        looking for a target cluster.

    vol: float, double
        Lower bound for the volume of the output cluster. Must be
        non-negative.

    Parameters (optional)
    ---------------------

    phi: float64
        Default == 0.5
        Target conductance for the output cluster.

    algorithm: string
        Default == 'fista'
        Solver to use. Options: 'fista', 'ista', 'acl'.

    epsilon: float64
        Default == 1.0e-2
        Termination tolerance; forwarded to the FISTA and ISTA solvers only.

    max_iter: int
        Default == 10000
        Maximum number of iterations of FISTA, ISTA or ACL.

    max_time: float64
        Default == 100
        Maximum time in seconds.

    cpp: bool
        Default == True
        Use the C++ version of FISTA. Only consulted when
        algorithm == 'fista'.

    Returns
    -------

    The vector ``p`` produced by the selected solver. On the C++ FISTA
    path the element-wise absolute value of the solver output is returned.

    If ``vol < 0`` a message is printed and a tuple of five empty lists is
    returned instead.

    Raises
    ------

    Exception
        If ``algorithm`` is not one of 'fista', 'ista', 'acl'.
    """
    m = g.adjacency_matrix.count_nonzero() / 2
    B = np.log2(m)

    if vol < 0:
        print("The input volume must be non-negative")
        return [], [], [], [], []

    # A zero volume would make log2 blow up, so it is bumped to one.
    vol_user = 1 if vol == 0 else vol
    b = min(1 + np.log2(vol_user), B)

    alpha = (phi**2) / (225 * np.log(100 * np.sqrt(m)))
    rho = (1 / (2**b)) * (1 / (48 * B))

    if algorithm == 'fista':
        if not cpp:
            p = fista_dinput_dense(ref_node, g, alpha=alpha, rho=rho,
                                   epsilon=epsilon, max_iter=max_iter,
                                   max_time=max_time)
        else:
            uint_indptr = np.uint32(g.adjacency_matrix.indptr)
            uint_indices = np.uint32(g.adjacency_matrix.indices)
            # NOTE(review): g.lib is passed positionally right after
            # dn_sqrt -- confirm this matches proxl1PRaccel's signature.
            (_not_converged, _grad, p) = proxl1PRaccel(
                uint_indptr, uint_indices, g.adjacency_matrix.data,
                ref_node, g.d, g.d_sqrt, g.dn_sqrt, g.lib,
                alpha=alpha, rho=rho, epsilon=epsilon,
                maxiter=max_iter, max_time=max_time)
            p = np.abs(p)
    elif algorithm == 'ista':
        p = ista_dinput_dense(ref_node, g, alpha=alpha, rho=rho,
                              epsilon=epsilon, max_iter=max_iter,
                              max_time=max_time)
    elif algorithm == 'acl':
        p = acl_list(ref_node, g, alpha=alpha, rho=rho,
                     max_iter=max_iter, max_time=max_time)
    else:
        raise Exception("There is no such algorithm provided")

    return p
def produce(self, inputs: Sequence[Input], ref_nodes: Sequence[int], ys: Sequence[Sequence[float]] = None, timeout: float = 100, iterations: int = 1000, alpha: float = 0.15, rho: float = 1.0e-6, epsilon: float = 1.0e-2, cpp: bool = True) -> Sequence[Output]:
    """
    Computes an l1-regularized PageRank vector for each input graph.

    Uses the Fast Iterative Soft Thresholding Algorithm (FISTA), which
    solves the l1-regularized personalized PageRank problem

        min rho*||p||_1 + <c,p> + <p,Q*p>

    where p is the PageRank vector, ||p||_1 is the l1-norm of p, rho is
    the regularization parameter of the l1-norm, c is the right hand side
    of the personalized PageRank linear system and Q is the symmetrized
    personalized PageRank matrix. For details please refer to:
    K. Fountoulakis, F. Roosta-Khorasani, J. Shun, X. Cheng and M. Mahoney.
    Variational Perspective on Local Graph Clustering. arXiv:1602.01886, 2017.
    arXiv link: https://arxiv.org/abs/1602.01886

    Parameters
    ----------

    inputs: Sequence[Graph]
        Graphs to process. The C++ path reads ``adjacency_matrix``, ``d``,
        ``d_sqrt`` and ``dn_sqrt`` from each graph.

    ref_nodes: Sequence[int]
        A sequence of reference nodes, i.e., nodes of interest around which
        we are looking for a target cluster. ``ref_nodes[i]`` is used with
        ``inputs[i]``.

    Parameters (optional)
    ---------------------

    ys: Sequence[Sequence[float]]
        Default == None
        Initial solutions for the l1-regularized PageRank algorithm;
        ``ys[i]`` is forwarded for ``inputs[i]``. Only used by the C++
        version of FISTA. When None, no initial solution is forwarded.

    timeout: float
        Default == 100
        Maximum time in seconds.

    iterations: int
        Default == 1000
        Maximum number of iterations of the FISTA algorithm.

    alpha: float
        Default == 0.15
        Teleportation parameter of the personalized PageRank linear system.
        The smaller, the more global the personalized PageRank vector is.

    rho: float
        Default == 1.0e-6
        Regularization parameter for the l1-norm of the model.

    epsilon: float
        Default == 1.0e-2
        Tolerance for FISTA.

    cpp: bool
        Default == True
        Use the C++ version of FISTA or not.

    Returns
    -------

    A list with one entry per graph in ``inputs``: the value returned by
    ``fista_dinput_dense`` when ``cpp`` is False, otherwise element [2] of
    the result of ``proxl1PRaccel``. Empty when ``inputs`` is empty.
    """
    if not cpp:
        return [
            fista_dinput_dense(ref_nodes[i], graph, alpha=alpha, rho=rho,
                               epsilon=epsilon, max_iter=iterations,
                               max_time=timeout)
            for i, graph in enumerate(inputs)
        ]

    embeddings = []
    for i, graph in enumerate(inputs):
        adjacency = graph.adjacency_matrix
        # Identity test, not ``== None``: comparing a NumPy array with ``==``
        # is element-wise and cannot be used as an ``if`` condition.
        # NOTE(review): the initial solution is passed positionally right
        # after dn_sqrt -- confirm against proxl1PRaccel's signature.
        warm_start = () if ys is None else (ys[i],)
        result = proxl1PRaccel(
            np.uint32(adjacency.indptr),
            np.uint32(adjacency.indices),
            adjacency.data,
            ref_nodes[i],
            graph.d,
            graph.d_sqrt,
            graph.dn_sqrt,
            *warm_start,
            alpha=alpha,
            rho=rho,
            epsilon=epsilon,
            maxiter=iterations,
            max_time=timeout)
        embeddings.append(result[2])
    return embeddings
def multiclass_label_prediction_algo(labels, g, alpha = 0.15, rho = 1.0e-10, epsilon = 1.0e-2, max_iter = 10000, max_time = 100, cpp = True):
    """
    Predict a class for every node from per-class seed sets.

    One l1-regularized personalized PageRank diffusion is run per class,
    seeded with that class's labelled nodes. Within each diffusion the
    nodes are ranked by decreasing value (rank 0 is the largest value), and
    every node is assigned to the class in which its rank is smallest.
    Ties go to the class that appears first in ``labels``.

    For details refer to [1].

    Parameters (mandatory)
    ----------------------

    labels: list of lists
        ``labels[i]`` is a list of indices of nodes that are assumed to
        belong to class i.

    g: graph object
        Must expose ``adjacency_matrix``; the C++ path additionally reads
        ``d``, ``d_sqrt`` and ``dn_sqrt``.

    Parameters (optional)
    ---------------------

    alpha: float, double
        Default == 0.15
        Teleportation parameter of the personalized PageRank linear system.
        The smaller, the more global the personalized PageRank vector is.

    rho: float, double
        Default == 1.0e-10
        Regularization parameter for the l1-norm of the model.

    epsilon: float, double
        Default == 1.0e-2
        Tolerance for FISTA.

    max_iter: integer
        Default == 10000
        Maximum number of iterations of FISTA.

    max_time: float, double
        Default == 100
        Maximum time in seconds.

    cpp: bool
        Default == True
        Use the C++ version of FISTA or not.

    Returns
    -------

    A list of three lists.

    output[0]: one numpy array per class, holding the diffusion vector
               computed for that class.

    output[1]: one integer numpy array per class; entry i is the rank of
               node i in that class's diffusion vector.

    output[2]: one entry per node; entry i is the index of the class
               predicted for node i. When ``labels`` is empty every entry
               is ``len(labels) + 1``.

    [1] D. Gleich and M. Mahoney. Using Local Spectral Methods to Robustify
        Graph-Based Learning Algorithms. SIGKDD 2015.
        https://www.stat.berkeley.edu/~mmahoney/pubs/robustifying-kdd15.pdf
    """
    n = g.adjacency_matrix.shape[0]
    output = [[], [], []]

    if cpp:
        # The index arrays do not depend on the class; convert them once.
        uint_indptr = np.uint32(g.adjacency_matrix.indptr)
        uint_indices = np.uint32(g.adjacency_matrix.indices)

    for labels_i in labels:
        if not cpp:
            output_fista = fista_dinput_dense(labels_i, g, alpha = alpha,
                                              rho = rho, epsilon = epsilon,
                                              max_iter = max_iter,
                                              max_time = max_time)
        else:
            (_not_converged, _grad, output_fista) = proxl1PRaccel(
                uint_indptr, uint_indices, g.adjacency_matrix.data,
                labels_i, g.d, g.d_sqrt, g.dn_sqrt,
                alpha = alpha, rho = rho, epsilon = epsilon,
                maxiter = max_iter, max_time = max_time)

        # Element-wise copy into a fresh float array of length n; the
        # container type returned by the solvers is not visible here.
        p = np.zeros(n)
        for i in range(n):
            p[i] = output_fista[i]
        output[0].append(p)

        # order[k] is the node with the k-th largest value; inverting the
        # permutation gives each node's rank.
        order = (-p).argsort(axis=0)
        rank = np.empty(n, int)
        rank[order] = np.arange(n)
        output[1].append(rank)

    l_labels = len(labels)
    if l_labels == 0:
        # No class to choose from: keep the sentinel class id.
        classes = [l_labels + 1] * n
    else:
        # argmin returns the first minimum along the class axis, which
        # matches a strict '<' scan over the classes in order.
        classes = np.argmin(np.vstack(output[1]), axis=0).tolist()
    output[2].extend(classes)

    return output