def testProblemStiffness2(self):
    width = 4
    height = 4
    [nodes, boundary_nodes, tris] = generateRectangularMesh((width, height), (0, 0), (1, 1))
    p = Problem(nodes, boundary_nodes, tris)
    K = p.getStiffnessMatrix(lambda x, y: x * y)
    self.assertTrue((sparse.triu(K, 1).T.toarray() == sparse.tril(K, -1).toarray()).all())

    width = 5
    height = 5
    [nodes, boundary_nodes, tris] = generateRectangularMesh((width, height), (0, 0), (1, 1))
    p = Problem(nodes, boundary_nodes, tris)
    K = p.getStiffnessMatrix(lambda x, y: x * y)
    self.assertTrue((sparse.triu(K, 1).T.toarray() == sparse.tril(K, -1).toarray()).all())
def r_perturbR(g, R):
    '''Random perturbation with per-edge parameters (the matrix R).'''
    A = nx.to_scipy_sparse_matrix(g)
    B = sparse.triu(A).toarray()
    n = len(g)
    i = 0
    ts = 0
    while i < n:
        j = i + 1
        while j < n:
            if B[i, j] == 1:
                if R[i, j] < 1:
                    B[i, j] = stats.bernoulli.rvs(R[i, j])  # Bernoulli trial with success probability p
                else:
                    B[i, j] = stats.bernoulli.rvs(1)  # always 1; this branch could be dropped
                ts = ts + 1
            else:
                if R[i, j] < 1:
                    B[i, j] = stats.bernoulli.rvs(R[i, j])  # Bernoulli trial with success probability q
                else:
                    B[i, j] = stats.bernoulli.rvs(0)  # always 0; this branch could be dropped
                ts = ts + 1
            j = j + 1
        i = i + 1
    return nx.from_numpy_matrix(B, create_using=nx.Graph())  # rebuild and return a Graph object
def r_perturbSa(g, p=None):
    '''Random perturbation with a fixed parameter; p is the Bernoulli success probability.'''
    A = nx.to_scipy_sparse_matrix(g)
    B = sparse.triu(A).toarray()
    n = len(g)
    e_num = len(g.edges())  # number of edges present in the graph
    n_pairs = n * (n - 1) // 2  # integer division so the sizes below stay ints
    q = e_num * (1 - p) / (n_pairs - e_num)
    listp = stats.bernoulli.rvs(p, size=e_num).tolist()
    listq = stats.bernoulli.rvs(q, size=n_pairs - e_num).tolist()
    i = 0
    while i < n:
        j = i + 1  # skip the zeros on the diagonal
        while j < n:
            if B[i, j] == 1:
                B[i, j] = listp.pop()  # Bernoulli trial with success probability p
            else:
                B[i, j] = listq.pop()  # Bernoulli trial with success probability q
            j = j + 1
        i = i + 1
    return nx.from_numpy_matrix(B, create_using=nx.Graph())  # rebuild and return a Graph object
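# A minimal usage sketch for the perturbation helpers above (illustrative,
# not from the original source). It assumes networkx as nx and scipy.stats
# as stats are imported as the functions require, and an older networkx
# (< 3.0) that still provides to_scipy_sparse_matrix / from_numpy_matrix.
g = nx.erdos_renyi_graph(30, 0.2, seed=1)
g_pert = r_perturbSa(g, p=0.9)  # keep each existing edge with probability ~0.9
print(len(g.edges()), len(g_pert.edges()))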
def rewire(Adj, p):
    """
    Rewiring takes an existing UNDIRECTED network with adjacency matrix
    given by Adj and returns a matrix with the same number of bonds but
    with a scrambled connectivity.

    The nodes are iterated through in order. At each node n_i, all bonds
    (n_i, n_j) with j > i are rewired with probability p. In rewiring, the
    bond to n_j is reconnected to a new node n_k, with k selected uniformly
    from the nodes not currently connected to i.
    """
    # first pull the existing bonds in the network
    rows, cols = sparse.triu(Adj, k=1).nonzero()
    A = Adj.tolil()  # LIL matrices are cheaper to rewire

    # rewire each bond with probability p
    for i, j in zip(rows, cols):
        if np.random.rand() < p:
            # pull list of candidate nodes to be reconnected to
            A[i, i] = 1  # as a placeholder for the moment
            temp, disconnected_nodes = (A[i, :] == 0).nonzero()
            # draw the new node
            new_node = np.random.choice(disconnected_nodes)
            A[i, i] = 0  # remove self-link
            A[i, j] = 0  # remove old link
            A[j, i] = 0
            A[i, new_node] = 1  # replace with new link
            A[new_node, i] = 1
    return A.tocsr()
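# A minimal usage sketch for rewire (an assumption, not part of the original
# source): scramble 30% of the bonds of a small ring lattice and check that
# the total bond count is preserved.
import numpy as np
import scipy.sparse as sparse

ring = sparse.csr_matrix(np.roll(np.eye(10, dtype=int), 1, axis=1))
ring = ring + ring.T  # undirected ring lattice
rewired = rewire(ring, p=0.3)
assert sparse.triu(ring, k=1).nnz == sparse.triu(rewired, k=1).nnz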
def graphml2mat(ingraph, outgraph, prune=False):
    ing = Graph.Read_GraphML(ingraph)

    if sum(ing.es()[:]['weight']) < 500000:
        print('bad graph? ecount=', sum(ing.es()[:]['weight']))
        print('filename=', ingraph)
        return

    # currently being done in graphgen so don't need to delete vertex 0
    # ing.vs[0].delete()

    if prune:
        # delete zero-degree nodes
        # GK TODO: be smarter
        i = list()
        for n, v in enumerate(ing.vs):
            if v.degree() == 0:
                i.append(n)
        ing.vs[i].delete()

    outg = lil_matrix((ing.vcount(), ing.vcount()))

    for e in ing.es:
        outg[e.source, e.target] = e['weight']
        outg[e.target, e.source] = e['weight']  # since edges are undirected, add both ways

    outg = triu(outg)
    mat_dict = {"graph": outg}
    savemat(outgraph, mat_dict)
def sor(A, b, x0=None, w=1., maxiter=200, tol=1E-6, direction='forward'):
    '''
    SOR iteration has M = L + D/w, N = (1/w - 1)*D - U for forward and
    M = U + D/w, N = (1/w - 1)*D - L for backward.
    '''
    L, D, U = tril(A, k=-1), diags(A.diagonal(), 0), triu(A, k=1)
    if direction == 'forward':
        M = L + D/w
        N = (1/w - 1)*D - U
    else:
        M = U + D/w
        N = (1/w - 1)*D - L

    # Start from 0 initial guess
    if x0 is None:
        x0 = np.zeros(A.shape[1])

    r = b - A.dot(x0)
    residuals = [np.linalg.norm(r)]

    count = 0
    while residuals[-1] > tol and count < maxiter:
        # Update
        x0 = spsolve(M, N.dot(x0) + b)
        # Error
        r = b - A.dot(x0)
        residuals.append(np.linalg.norm(r))
        # Count
        count += 1

    converged = residuals[-1] < tol
    n_iters = len(residuals) - 1
    data = {'status': converged, 'iter count': n_iters, 'residuals': residuals}
    return x0, data
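# Illustrative driver for the SOR solver above (not from the original
# source): solve a small 1-D Poisson system, where SOR with w=1.5 converges
# comfortably within the iteration budget.
import numpy as np
from scipy.sparse import diags

A_poisson = diags([-1.0, 2.0, -1.0], [-1, 0, 1], shape=(20, 20), format='csr')
b_rhs = np.ones(20)
x_sol, info = sor(A_poisson, b_rhs, w=1.5, maxiter=500)
print(info['status'], info['iter count'])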
def ssor(A, b, x0=None, w=1., maxiter=200, tol=1E-6):
    '''For symmetric matrices combine forward and backward SOR.'''
    assert is_symmetric(A, tol=1E-6)
    L, D, U = tril(A, k=-1), diags(A.diagonal(), 0), triu(A, k=1)
    # Forward
    MF = L + D/w
    NF = (1/w - 1)*D - U
    # Backward
    MB = U + D/w
    NB = (1/w - 1)*D - L

    # Start from 0 initial guess
    if x0 is None:
        x0 = np.zeros(A.shape[1])

    r = b - A.dot(x0)
    residuals = [np.linalg.norm(r)]

    count = 0
    while residuals[-1] > tol and count < maxiter:
        # Update: one forward sweep followed by one backward sweep
        x0 = spsolve(MF, NF.dot(x0) + b)
        x0 = spsolve(MB, NB.dot(x0) + b)
        # Error
        r = b - A.dot(x0)
        residuals.append(np.linalg.norm(r))
        # Count
        count += 1

    converged = residuals[-1] < tol
    n_iters = len(residuals) - 1
    data = {'status': converged, 'iter count': n_iters, 'residuals': residuals}
    return x0, data
def __init__(self, G, external_voltages, I_threshold, G_OFF=1, G_ON=100):
    rnets.ResistorNetwork.__init__(self, G, external_voltages)
    self.I_threshold = I_threshold
    self.G_OFF = G_OFF
    self.G_ON = G_ON
    self.rows_G, self.cols_G = sparse.triu(self.G).nonzero()
    self.currents = None
def path_lengthsSPARSE(G):
    """Compute array of all shortest path lengths for the given graph.

    XXX - implementation using scipy.sparse. This might be faster for very
    sparse graphs, but so far for our cases the overhead of handling the
    sparse matrices doesn't seem to be worth it. We're leaving it in for
    now, in case we revisit this later and it proves useful.

    The length of the output array is the number of unique pairs of nodes
    that have a connecting path, so in general it is not known in advance.

    This assumes the graph is undirected, as for any pair of reachable
    nodes, once we've seen the pair we do not keep the path length value
    for the inverse path.

    Parameters
    ----------
    G : an undirected graph object.
    """
    assert_no_selfloops(G)

    # dict() handles networkx >= 2, where this returns an iterator
    length = dict(nx.all_pairs_shortest_path_length(G))

    nnod = G.number_of_nodes()
    paths_mat = sparse.dok_matrix((nnod, nnod))

    for src, targets in length.items():
        for targ, val in targets.items():
            paths_mat[src, targ] = val

    return sparse.triu(paths_mat, 1).data
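# Illustrative call (an assumption, not from the original source: it relies
# on assert_no_selfloops being defined elsewhere in the same module, plus
# networkx as nx and scipy.sparse as sparse imported as above):
G = nx.cycle_graph(6)
print(path_lengthsSPARSE(G))  # one length per unordered pair of connected nodes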
def getStiffnessMatrix(self, k_function):
    K = sparse.lil_matrix((len(self.free_nodes), len(self.free_nodes)))
    for t in self.triangles:
        coords = t.getCornerCoords()
        basis_gradient = self._calcBasisGradient(coords)
        triarea = t.getArea()
        centroid = t.getCentroid()
        intensity = triarea * k_function(centroid[0], centroid[1])
        for i in range(0, 3):
            if t.nodes[i].isBoundary():
                continue
            for j in range(0, i + 1):
                if t.nodes[j].isBoundary():
                    continue
                idx1 = t.nodes[i].free_node_index
                idx2 = t.nodes[j].free_node_index
                if idx2 < idx1:
                    idx1, idx2 = idx2, idx1
                K[idx1, idx2] += basis_gradient[i, j] * intensity
    # only the upper triangle was assembled; mirror it to get the full matrix
    K = K + sparse.triu(K, 1).T
    return K.tocsr()
def avg_edge_length(self):
    """Average length of all edges in the surface."""
    adj = self.adj
    tadj = sparse.triu(adj, 1)  # only entries above main diagonal, in coo format
    edgelens = np.sqrt(((self.pts[tadj.row] - self.pts[tadj.col]) ** 2).sum(1))
    return edgelens.mean()
def load_pdata(dataset_str):
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("./data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            # 'latin1' handles pickles written by Python 2
            objects.append(pkl.load(f, encoding='latin1'))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("./data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # citeseer has isolated test nodes; pad features/labels with zero rows
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))

    train_mask = sample_mask(idx_train, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    train_out = []
    for i in idx_train:
        ll = y_train[i].tolist()
        ll = ll.index(1) + 1
        train_out.append([i, ll])
    train_out = np.array(train_out)
    np.random.shuffle(train_out)

    test_out = []
    for i in idx_test:
        ll = y_test[i].tolist()
        ll = ll.index(1) + 1
        test_out.append([i, ll])
    test_out = np.array(test_out)

    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()
    # Check that diag is zero:
    assert np.diag(adj.todense()).sum() == 0

    adj_triu = sp.triu(adj)
    adj_tuple = sparse_to_tuple(adj_triu)
    edges = adj_tuple[0]
    edges_all = sparse_to_tuple(adj)[0]
    num_mask = int(np.floor(edges.shape[0] / 10.))

    return graph, features, train_out, test_out
def circle_tear(self, spanning_tree='mst', cycle_len_thresh=5, spt_idx=None, copy=True):
    '''Circular graph tearing.

    spanning_tree: one of {'mst', 'spt'}
    cycle_len_thresh: int, length of longest allowable cycle
    spt_idx: int, start vertex for shortest_path_subtree, random if None

    From "How to project 'circular' manifolds using geodesic distances?"
    by Lee & Verleysen, ESANN 2004.

    See also: shortest_path_subtree, minimum_spanning_subtree
    '''
    # make the initial spanning tree graph
    if spanning_tree == 'mst':
        tree = self.minimum_spanning_subtree().matrix()
    elif spanning_tree == 'spt':
        if spt_idx is None:
            spt_idx = np.random.choice(self.num_vertices())
        tree = self.shortest_path_subtree(spt_idx, directed=False).matrix()

    # find edges in self but not in the tree
    potential_edges = np.argwhere(ss.triu(self.matrix() - tree))

    # remove edges that induce large cycles
    ii, jj = _find_cycle_inducers(tree, potential_edges, cycle_len_thresh)
    return self.remove_edges(ii, jj, symmetric=True, copy=copy)
def test_directLower_1_python(self):
    from pymatsolver import _BackwardSolver
    AUinv = _BackwardSolver(sp.triu(self.A))
    X = AUinv * self.rhsU
    x = AUinv * self.rhsU[:, 0]
    self.assertLess(np.linalg.norm(self.sol - X, np.inf), TOL)
    self.assertLess(np.linalg.norm(self.sol[:, 0] - x, np.inf), TOL)
def setUp(self):
    n = 50
    nrhs = 20
    self.A = sp.rand(n, n, 0.4) + sp.identity(n)
    self.sol = np.ones((n, nrhs))
    self.rhsU = sp.triu(self.A) * self.sol
    self.rhsL = sp.tril(self.A) * self.sol
def sparse_power_iteration(P, x, tol=10e-16, maxiter=200):
    """Preconditioned power iteration for a sparse stochastic matrix

    Parameters
    ----------
    P : array, shape (n, n), sparse
        transition matrix of a Markov Chain
    x : array, shape (n, )
        On entry, the initial guess. On exit, the final solution.
    """
    t = 0
    eps = tol + 1
    n = P.shape[0]

    # ILU factorization
    LU = ilu0_factor(P)
    L = sparse.tril(LU)
    U = sparse.triu(LU)

    # New matrix Q
    Q = P.copy()
    Q.setdiag(1 - Q.diagonal())
    Q *= -1
    Q = Q.T

    info = -1
    t = -1
    for t in range(maxiter):
        # dot() is matrix multiplication (the old .matvec() API was removed from scipy)
        dx = spla.spsolve(U, spla.spsolve(L, Q.dot(x)))
        x -= dx
        relres = tvnorm(dx)
        if relres < tol:
            info = 0
            break
    t += 1
    return (info, t, relres)
def prepare_preferential_attachment(self):
    self.repeated_nodes = hstack(
        sparse.triu(self.matrix, format='coo').nonzero())
    self.repeated_nodes = append(self.repeated_nodes,
                                 fromiter(iter(self.nodes), dtype=np.int32))
    self._initialized_preferential_attachment = True
def view_laplacian_off_terms(non_normalized_Laplacian):
    normalized_Laplacian = Lapl_normalize(non_normalized_Laplacian)
    triag_u = lil_matrix(triu(normalized_Laplacian))
    triag_u.setdiag(0)
    pre_arr = -triag_u[triag_u.nonzero()].toarray().flatten()
    arr = np.log10(pre_arr)
    plt.hist(arr, bins=100, log=True, histtype='step')
    plt.show()
def train_test_split(adjacency):
    n_nodes = adjacency.shape[0]
    coo_adjacency = sp.coo_matrix(adjacency)
    coo_adjacency_upper = sp.triu(coo_adjacency, k=1)
    sp_adjacency = dense_to_sparse(coo_adjacency_upper)
    edges = sp_adjacency[0]

    num_test = int(np.floor(edges.shape[0] / 10.))
    num_val = int(np.floor(edges.shape[0] / 10.))

    idx_all = list(range(edges.shape[0]))
    np.random.shuffle(idx_all)
    idx_test = idx_all[:num_test]
    idx_val = idx_all[num_test:(num_val + num_test)]

    test_edges_pos = edges[idx_test]
    val_edges_pos = edges[idx_val]
    train_edges = np.delete(edges, np.hstack([idx_test, idx_val]), axis=0)

    test_edges_neg = []
    val_edges_neg = []
    edge_to_add = [0, 0]
    while len(test_edges_neg) < len(test_edges_pos):
        n1 = np.random.randint(0, n_nodes)
        n2 = np.random.randint(0, n_nodes)
        if n1 == n2:
            continue
        if n1 < n2:
            edge_to_add = [n1, n2]
        else:
            edge_to_add = [n2, n1]
        if any((edges[:] == edge_to_add).all(1)):
            continue
        test_edges_neg.append(edge_to_add)

    while len(val_edges_neg) < len(val_edges_pos):
        n1 = np.random.randint(0, n_nodes)
        n2 = np.random.randint(0, n_nodes)
        if n1 == n2:
            continue
        if n1 < n2:
            edge_to_add = [n1, n2]
        else:
            edge_to_add = [n2, n1]
        if any((edges[:] == edge_to_add).all(1)):
            continue
        val_edges_neg.append(edge_to_add)

    row = []
    col = []
    data = []
    for edge in train_edges:
        row.extend([edge[0], edge[1]])
        col.extend([edge[1], edge[0]])
        data.extend([1, 1])
    train_adjacency = sp.coo_matrix((data, (row, col)), shape=(n_nodes, n_nodes))

    return train_adjacency, test_edges_pos, test_edges_neg, val_edges_pos, val_edges_neg
def _generator(self, byres, chromsizes, bin_cumnums):
    for i in range(chromsizes.size):
        for j in range(i, chromsizes.size):
            c1, c2 = chromsizes.index[i], chromsizes.index[j]
            if self.onlyIntra:
                if c1 != c2:
                    continue
            if (c1, c2) in byres:
                ci, cj = i, j
            else:
                if (c2, c1) in byres:
                    c1, c2 = c2, c1
                    ci, cj = j, i
                else:
                    continue

            if type(byres[(c1, c2)]) == str:
                data = np.loadtxt(byres[(c1, c2)], dtype=self._intertype)
            else:
                # Make it compatible with TADLib and old versions of runHiC
                if c1 != c2:
                    data = byres[(c1, c2)][(c1, c2)]
                else:
                    if c1 in byres[(c1, c2)].files:
                        data = byres[(c1, c2)][c1]
                    else:
                        data = byres[(c1, c2)][(c1, c2)]

            x, y = data['bin1'], data['bin2']
            # Fast guarantee of a triu matrix
            if ci > cj:
                x, y = y, x
                ci, cj = cj, ci
            xLen = x.max() + 1
            yLen = y.max() + 1
            if ci != cj:
                tmp = sparse.csr_matrix((data['IF'], (x, y)), shape=(xLen, yLen))
            else:
                Len = max(xLen, yLen)
                tmp = sparse.csr_matrix((data['IF'], (x, y)), shape=(Len, Len))
                tmp = sparse.lil_matrix(tmp)
                tmp[y, x] = tmp[x, y]
                tmp = sparse.triu(tmp)

            x, y = tmp.nonzero()
            if ci > 0:
                x = x + bin_cumnums[ci - 1]
            if cj > 0:
                y = y + bin_cumnums[cj - 1]

            data = tmp.data

            current = pd.DataFrame({'bin1_id': x, 'bin2_id': y, 'count': data},
                                   columns=['bin1_id', 'bin2_id', 'count'])

            yield current
def edge_weights(self, copy=False, directed=True):
    if not directed:
        ii, jj = ss.triu(self._adj).nonzero()
        return np.asarray(self._adj[ii, jj]).ravel()
    # XXX: assumes correct internal ordering and no explicit zeros
    w = self._adj.data.ravel()
    if copy:
        return w.copy()
    return w
def aggregate_partitions(G, nodeCommArray, N, tau=None, connectStrayNodes=True):
    # generate core communities
    # N = len(G['imputation_batches'])
    neighborsMatrix = nodeCommArray * nodeCommArray.T
    # create a copy of the neighbors matrix for evaluating thresholds
    scoringMatrix = neighborsMatrix.copy()
    neighborsMatrix = sp.triu(neighborsMatrix, format="csr")

    if tau is None:
        # compute optimal tau
        tau, connectedComponents = compute_optimal_threshold(neighborsMatrix, scoringMatrix, N)
        coreCommunities = [x for x in connectedComponents if len(x) > MIN_CORE_COMM_SIZE]
    else:
        neighborsMatrix.data[neighborsMatrix.data < tau] = 0
        neighborsMatrix.eliminate_zeros()
        connectedComponents = sp.csgraph.connected_components(neighborsMatrix, directed=False)[1]
        connectedComponents = ig.Clustering(connectedComponents)
        coreCommunities = [x for x in connectedComponents if len(x) > MIN_CORE_COMM_SIZE]

    # in the rare (usually degenerate) case of no core communities, output the CCs as the final answer
    if len(coreCommunities) < 1:
        return connectedComponents

    # output core communities only; this will leave some nodes missing from the final partition
    if not connectStrayNodes:
        return coreCommunities

    # merge stray nodes with core communities
    coreNodes = reduce(lambda x, y: x + y, coreCommunities)
    strayNodes = [v.index for v in G.vs if v.index not in coreNodes]
    finalCommunities = deepcopy(coreCommunities)

    # initialize array to store distances
    commDistanceMatrix = np.zeros([len(coreCommunities), G.vcount()])

    # compute distances of stray nodes to each core community
    for commIndex, comm in enumerate(coreCommunities):
        commMatrix = scoringMatrix[comm]
        commMatrix = commMatrix.astype(np.float16)
        commMatrix = commMatrix.mean(axis=0)
        commDistanceMatrix[commIndex, :] = commMatrix
    maxCommIds = np.argmax(commDistanceMatrix, axis=0)

    # add stray nodes to the "closest" core community
    for strayNode in strayNodes:
        finalCommunities[maxCommIds[strayNode]].append(strayNode)

    return finalCommunities
def is_tri(X):
    diag = X.diagonal().sum()
    if sparse.issparse(X):
        if not (sparse.tril(X).sum() - diag) or \
                not (sparse.triu(X).sum() - diag):
            return True
        return False
    if not np.triu(X, 1).sum() or not np.tril(X, -1).sum():
        return True
    return False
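# Quick illustrative check of is_tri on sparse and dense inputs (assumes
# numpy as np and scipy.sparse as sparse, as the function above does):
import numpy as np
from scipy import sparse

A = sparse.triu(sparse.random(5, 5, density=0.5, format='csr'))
print(is_tri(A))                # True: nothing below the diagonal
print(is_tri(np.ones((3, 3))))  # False: both triangles populated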
def get_current_matrix(conductivity_laplacian, node_potentials):
    """
    Recovers the current matrix based on the conductivity Laplacian and the
    voltage in each node.

    :param conductivity_laplacian:
    :param node_potentials:
    :return: the full current matrix, where M[i,j] = current intensity from
        i to j (positive if current flows from j to i, negative otherwise),
        together with its upper triangular part; the full matrix is
        asymmetric.
    :rtype: scipy.sparse.lil_matrix
    """
    # both branches currently build the same diagonal voltage matrix
    if switch_to_splu:
        diag_voltages = lil_matrix(diags(node_potentials.toarray().T.tolist()[0], 0))
    else:
        diag_voltages = lil_matrix(diags(node_potentials.toarray().T.tolist()[0], 0))

    corr_conductance_matrix = conductivity_laplacian - \
        lil_matrix(diags(conductivity_laplacian.diagonal(), 0))

    # true currents
    currents = diag_voltages.dot(corr_conductance_matrix) - \
        corr_conductance_matrix.dot(diag_voltages)

    # we want them to be fully positive (so that the direction of flow doesn't matter)
    abs_current = sparse_abs(currents)

    # and symmetric, so that the upper triangular matrix contains all the data
    currents = abs_current + abs_current.T

    # NB: we can't take the triu of the raw flow matrix because it is not symmetric
    return currents, triu(currents)
def second_deg_poly_features(X):
    """Append second-degree (pairwise product) features to a sparse matrix X."""
    from scipy.sparse import lil_matrix
    D = X.shape[1]
    D2 = (D ** 2 + D) // 2 + D  # original D columns + upper triangle of the outer product
    X2 = lil_matrix((X.shape[0], D2))
    X2[:, :D] = X
    for i, row in enumerate(X):
        # upper triangle (incl. diagonal) of the outer product row.T * row,
        # flattened into a single feature vector
        r = triu(row.T.dot(row)).toarray()[np.triu_indices(D)]
        X2[i, D:] = r
    return X2.tocsr()
def triangular_upper(self, k=0):
    """
    Returns the upper triangular portion of this matrix.

    :param k:
        - k = 0 corresponds to the main diagonal
        - k > 0 is above the main diagonal
        - k < 0 is below the main diagonal

    TODO: Add unit tests
    """
    return self._new_instance(sp.triu(self.matrix, k=k))
def __call__(self, x0, lagrange, obj_factor, flag, user_data=None):
    if flag:
        return (self.rind, self.cind)
    else:
        x = np.hstack([x0, lagrange, obj_factor])
        result = adolc.hessian(lID, x)
        result1 = result[:nvar, :nvar]
        result = None
        result = sps.triu(result1, format='coo')
        return result.data
def analyze_eigvects(non_normalized_Laplacian, num_first_eigvals_to_analyse, index_chars,
                     permutations_limiter=10000000, fudge=10e-10):
    # normalize the laplacian
    print("analyzing the laplacian with %s items and %s non-zero elts" % (
        non_normalized_Laplacian.shape[0] ** 2,
        len(non_normalized_Laplacian.nonzero()[0])))
    t = time()
    init = time()
    normalized_Laplacian = Lapl_normalize(non_normalized_Laplacian)
    print(time() - t)
    t = time()
    # compute the eigenvalues and store them
    true_eigenvals, true_eigenvects = eigsh(normalized_Laplacian, num_first_eigvals_to_analyse)
    print(time() - t)
    t = time()
    # randomly permute the off-diagonal terms
    triag_u = lil_matrix(triu(normalized_Laplacian))
    triag_u.setdiag(0)
    tnz = triag_u.nonzero()
    print("reassigning the indexes for %s items, with %s non-zero elts" % (
        triag_u.shape[0] ** 2, len(tnz[0])))
    eltsuite = list(zip(tnz[0].tolist(), tnz[1].tolist()))
    shuffle(eltsuite)
    if len(eltsuite) > permutations_limiter:
        # NB: ideally this would sample the permuted pairs with replacement
        eltsuite = eltsuite[:permutations_limiter]
    print(time() - t)
    t = time()
    # take a nonzero pair of indexes
    for i, j in eltsuite:
        # select a random pair of indexes and swap with it
        k = randrange(1, triag_u.shape[0] - 1)
        l = randrange(k + 1, triag_u.shape[0])
        triag_u[i, j], triag_u[k, l] = (triag_u[k, l], triag_u[i, j])
    print(time() - t)
    t = time()
    # recompute the diagonal terms
    fullmat = triag_u + triag_u.T
    diagterms = [-item for sublist in fullmat.sum(axis=0).tolist() for item in sublist]
    fullmat.setdiag(diagterms)
    print(time() - t)
    t = time()
    # recompute the normalized matrix
    normalized_rand = Lapl_normalize(fullmat)
    # recompute the eigenvalues
    rand_eigenvals, rand_eigenvects = eigsh(normalized_rand, num_first_eigvals_to_analyse)
    print(time() - t)
    t = time()
    show_eigenvals_and_eigenvects(true_eigenvals, true_eigenvects, 20, "true laplacian", index_chars)
    show_eigenvals_and_eigenvects(rand_eigenvals, rand_eigenvects, 20, "random")
    print("final", time() - t, time() - init)
def voltage_drop_abs(self):
    """
    Return a sparse matrix in CSR form containing the voltage drop between
    nodes i and j. Requires that self.solve() have been called to populate
    self.voltages.
    """
    rows, cols = sparse.triu(self.G).nonzero()

    # fill in the entries of the voltage drop matrix
    voltage_drop = sparse.lil_matrix(self.G.shape)
    for node_i, node_j in zip(rows, cols):
        voltage_drop[node_i, node_j] = abs(self.voltages[node_j] - self.voltages[node_i])
        voltage_drop[node_j, node_i] = voltage_drop[node_i, node_j]

    return voltage_drop.tocsr()
def test_sparse_ICE_normalization_triu():
    n = 100
    X = np.random.random((n, n))
    thres = (np.random.random((n, n)) > 0.5).astype(bool)
    X[thres] = 0
    X = X + X.T
    sparse_X = sparse.triu(X)

    true_normed_X = ICE_normalization(X, eps=1e-10, max_iter=10)
    true_normed_X = np.triu(true_normed_X)

    X = np.triu(X)
    normed_X = ICE_normalization(sparse_X, eps=1e-10, max_iter=10)

    assert_array_almost_equal(X, sparse_X.todense())
    assert_array_almost_equal(true_normed_X, np.array(normed_X.todense()))
def mask_test_edges(adj):
    # Function to build test set with 10% positive links
    # NOTE: Splits are randomized and results might slightly deviate from reported numbers in the paper.
    # TODO: Clean up.

    # Remove diagonal elements
    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()
    # Check that diag is zero:
    assert np.diag(adj.todense()).sum() == 0

    adj_triu = sp.triu(adj)
    adj_tuple = sparse_to_tuple(adj_triu)
    edges = adj_tuple[0]
    edges_all = sparse_to_tuple(adj)[0]
    num_test = int(np.floor(edges.shape[0] / 10.))
    num_val = int(np.floor(edges.shape[0] / 20.))

    all_edge_idx = list(range(edges.shape[0]))
    np.random.shuffle(all_edge_idx)
    val_edge_idx = all_edge_idx[:num_val]
    test_edge_idx = all_edge_idx[num_val:(num_val + num_test)]
    test_edges = edges[test_edge_idx]
    val_edges = edges[val_edge_idx]
    train_edges = np.delete(edges, np.hstack([test_edge_idx, val_edge_idx]), axis=0)

    def ismember(a, b, tol=5):
        rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1)
        return np.any(rows_close)

    test_edges_false = []
    while len(test_edges_false) < len(test_edges):
        idx_i = np.random.randint(0, adj.shape[0])
        idx_j = np.random.randint(0, adj.shape[0])
        if idx_i == idx_j:
            continue
        if ismember([idx_i, idx_j], edges_all):
            continue
        if test_edges_false:
            if ismember([idx_j, idx_i], np.array(test_edges_false)):
                continue
            if ismember([idx_i, idx_j], np.array(test_edges_false)):
                continue
        test_edges_false.append([idx_i, idx_j])

    val_edges_false = []
    while len(val_edges_false) < len(val_edges):
        idx_i = np.random.randint(0, adj.shape[0])
        idx_j = np.random.randint(0, adj.shape[0])
        if idx_i == idx_j:
            continue
        if ismember([idx_i, idx_j], train_edges):
            continue
        if ismember([idx_j, idx_i], train_edges):
            continue
        if ismember([idx_i, idx_j], val_edges):
            continue
        if ismember([idx_j, idx_i], val_edges):
            continue
        if val_edges_false:
            if ismember([idx_j, idx_i], np.array(val_edges_false)):
                continue
            if ismember([idx_i, idx_j], np.array(val_edges_false)):
                continue
        val_edges_false.append([idx_i, idx_j])

    assert ~ismember(test_edges_false, edges_all)
    assert ~ismember(val_edges_false, edges_all)
    assert ~ismember(val_edges, train_edges)
    assert ~ismember(test_edges, train_edges)
    assert ~ismember(val_edges, test_edges)

    data = np.ones(train_edges.shape[0])

    # Re-build adj matrix
    adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
    adj_train = adj_train + adj_train.T

    # NOTE: these edge lists only contain single direction of edge!
    return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false
def check_network_health(self):
    r"""
    This method checks the network topological health by checking for:

        (1) Isolated pores
        (2) Islands or isolated clusters of pores
        (3) Duplicate throats
        (4) Bidirectional throats (ie. symmetrical adjacency matrix)
        (5) Headless throats

    Returns
    -------
    health : dict
        A dictionary containing the offending pores or throat numbers
        under each named key.

    Notes
    -----
    It also returns a list of which pores and throats should be trimmed
    from the network to restore health.  This list is a suggestion only,
    and is based on keeping the largest cluster and trimming the others.

    - Does not yet check for duplicate pores
    - Does not yet suggest which throats to remove
    - This is just a 'check' and does not 'fix' the problems it finds
    """
    import scipy.sparse.csgraph as csg
    import scipy.sparse as sprs

    health = HealthDict()
    health['disconnected_clusters'] = []
    health['isolated_pores'] = []
    health['trim_pores'] = []
    health['duplicate_throats'] = []
    health['bidirectional_throats'] = []
    health['headless_throats'] = []
    health['looped_throats'] = []

    net = self.network

    # Check for headless throats
    hits = np.where(net['throat.conns'] > net.Np - 1)[0]
    if np.size(hits) > 0:
        health['headless_throats'] = np.unique(hits)
        return health

    # Check for throats that loop back onto the same pore
    P12 = net['throat.conns']
    hits = np.where(P12[:, 0] == P12[:, 1])[0]
    if np.size(hits) > 0:
        health['looped_throats'] = hits

    # Check for individual isolated pores
    Ps = net.num_neighbors(net.pores())
    if np.sum(Ps == 0) > 0:
        health['isolated_pores'] = np.where(Ps == 0)[0]

    # Check for separated clusters of pores
    temp = []
    am = net.create_adjacency_matrix(fmt='coo', triu=True)
    Cs = csg.connected_components(am, directed=False)[1]
    if np.unique(Cs).size > 1:
        for i in np.unique(Cs):
            temp.append(np.where(Cs == i)[0])
        b = np.array([len(item) for item in temp])
        c = np.argsort(b)[::-1]
        for i in range(0, len(c)):
            health['disconnected_clusters'].append(temp[c[i]])
            if i > 0:
                health['trim_pores'].extend(temp[c[i]])

    # Check for duplicate throats
    am = net.create_adjacency_matrix(fmt='csr', triu=True).tocoo()
    hits = np.where(am.data > 1)[0]
    if len(hits):
        mergeTs = []
        hits = np.vstack((am.row[hits], am.col[hits])).T
        ihits = hits[:, 0] + 1j * hits[:, 1]
        conns = net['throat.conns']
        iconns = conns[:, 0] + 1j * conns[:, 1]  # Convert to imaginary
        for item in ihits:
            mergeTs.append(np.where(iconns == item)[0])
        health['duplicate_throats'] = mergeTs

    # Check for bidirectional throats
    adjmat = net.create_adjacency_matrix(fmt='coo')
    num_full = adjmat.sum()
    temp = sprs.triu(adjmat, k=1)
    num_upper = temp.sum()
    if num_full > num_upper:
        biTs = np.where(net['throat.conns'][:, 0] > net['throat.conns'][:, 1])[0]
        health['bidirectional_throats'] = biTs.tolist()

    return health
def main(data, a=1, b=1, gamma=0.4, stepm=25, rtype=1, maxiter=1000, verbose=True):
    S = data['S']
    li = data['li']
    lj = data['lj']
    w = data['w']

    setup, m, n = bmw.bipartite_setup(li, lj, w)

    S = sps.csr_matrix(S, dtype=float)
    U = sps.csr_matrix(S.shape)

    xbest = np.zeros(len(w))
    flower = 0.0
    fupper = np.inf
    next_reduction_iteration = stepm

    if verbose:
        print('{:5s} {:>4s} {:>8s} {:>7s} {:>7s} {:>7s} {:>7s} {:>7s} {:>7s} {:>7s}'
              .format('best', 'iter', 'norm-u', 'lower', 'upper', 'cur', 'obj',
                      'weight', 'card', 'overlap'))

    for it in range(1, maxiter + 1):
        q, SM = maxrowmatch((b / 2) * S + U - U.T, li, lj, m, n)
        x = a * w + q
        f, matchval, card, overlap, val, mi = bmw.round_messages(x, S, w, a, b, setup, m, n)

        if val < fupper:
            fupper = val
            next_reduction_iteration = it + stepm
        if f > flower:
            flower = f
            itermark = '*'
            xbest = mi
        else:
            itermark = ' '

        if rtype == 1:
            pass
        elif rtype == 2:
            mw = S * x
            mw = a * w + b / 2 * mw
            f, matchval, card, overlap, _, mx = bmw.round_messages(mw, S, w, a, b, setup, m, n)
            if f > flower:
                flower = f
                itermark = '**'
                mi = mx
                xbest = mw

        if verbose:
            print('{:5s} {:4d} {:8.1e} {:7.2f} {:7.2f} {:7.2f} {:7.2f} {:7.2f} {:7d} {:7d}'
                  .format(itermark, it, np.linalg.norm(U.data, 1), flower, fupper,
                          val, f, matchval, card, overlap))

        if it == next_reduction_iteration:
            gamma = gamma * 0.5
            if verbose:
                print(f'{"":5s} {"":4s} reducing step to {gamma}')
            if gamma < 1e-24:
                break
            next_reduction_iteration = it + stepm

        if (fupper - flower) < 1e-2:
            break

        GM = sps.diags(gamma * mi, format="csr")
        U = U - GM * sps.triu(SM) + sps.tril(SM).T * GM
        U.data = U.data.clip(-0.5, 0.5)

    return sps.csr_matrix((xbest, (li, lj)))
def convert_to_obs_exp_matrix(self, maxdepth=None, zscore=False, perchr=False):
    """
    Converts a corrected counts matrix into an obs / expected matrix or
    z-scores fast. The caveat is that the obs/exp or z-score are only
    computed for non-zero values, although zero values that are not part
    of the sparse matrix are considered.

    For each diagonal the mean (and std when computing z-scores) are
    calculated and then each non-zero value of the sparse matrix is
    replaced by the obs/exp or z-score.

    Parameters
    ----------
    maxdepth: maximum distance from the diagonal to consider. All contacts
              beyond this distance will not be considered.
    zscore: if a zscore wants to be returned instead of obs/exp

    Returns
    -------
    observed / expected sparse matrix

    nans occur where the standard deviation is zero
    """
    binsize = self.getBinSize()
    max_depth_in_bins = None

    if maxdepth:
        if maxdepth < binsize:
            raise Exception("Please specify a maxDepth larger than bin size ({})".format(binsize))

        max_depth_in_bins = int(float(maxdepth * 1.5) / binsize)
        # work only with the upper matrix
        # and remove all pixels that are beyond max_depth_in_bins
        # (this is done by subtracting a second sparse matrix
        # that contains only the upper matrix that wants to be removed)
        self.matrix = triu(self.matrix, k=0, format='csr') - \
            triu(self.matrix, k=max_depth_in_bins, format='csr')
    else:
        self.matrix = triu(self.matrix, k=0, format='csr')

    self.matrix.eliminate_zeros()
    depth = None
    if zscore is True:
        from scipy.sparse import diags
        m_size = self.matrix.shape[0]
        if max_depth_in_bins is not None:
            depth = max_depth_in_bins
        else:
            depth = m_size
            estimated_size_dense_matrix = m_size ** 2 * 8
            if estimated_size_dense_matrix > 100e6:
                log.info("To compute z-scores a dense matrix is required. This will use \n"
                         "{} Mb of memory.\n To reduce memory use the maxdepth option."
                         "".format(estimated_size_dense_matrix / 1e6))

        # to compute zscore the zero values need to be accounted and the matrix
        # needs to become dense. This is only practical if only up to a certain
        # distance wants to be evaluated, otherwise the dense matrix is too large.
        # To make the matrix dense and keep the same computations as when
        # the matrix is sparse the following is done:
        # A sparse diagonal matrix of shape = matrix.shape is created with ones
        # (only the upper triangle contains diagonals up to maxdepth).
        # This sparse matrix is then added to self.matrix;
        # then, -1 is subtracted from self.matrix.data, thus effectively
        # adding zeros.
        diag_mat_ones = diags(np.repeat([1], m_size * depth).reshape(depth, m_size),
                              list(range(depth)))
        self.matrix += diag_mat_ones

    from scipy.sparse import lil_matrix
    trasf_matrix = lil_matrix(self.matrix.shape)

    chr_submatrix = OrderedDict()
    cut_intervals = OrderedDict()
    chrom_sizes = OrderedDict()
    chrom_range = OrderedDict()
    if perchr:
        for chrname in self.getChrNames():
            chr_range = self.getChrBinRange(chrname)
            chr_submatrix[chrname] = self.matrix[chr_range[0]:chr_range[1],
                                                 chr_range[0]:chr_range[1]].tocoo()
            cut_intervals[chrname] = [self.cut_intervals[x]
                                      for x in range(chr_range[0], chr_range[1])]
            chrom_sizes[chrname] = [chr_submatrix[chrname].shape[0]]
            chrom_range[chrname] = (chr_range[0], chr_range[1])
    else:
        chr_submatrix['all'] = self.matrix.tocoo()
        cut_intervals['all'] = self.cut_intervals
        chrom_sizes['all'] = np.array([v[1] - v[0] for k, v in self.chrBinBoundaries.items()])
        chrom_range['all'] = (0, self.matrix.shape[0])

    for chrname, submatrix in chr_submatrix.items():
        log.info("processing chromosome {}\n".format(chrname))
        if zscore is True:
            # this step has to be done after tocoo()
            submatrix.data -= 1

        dist_list, chrom_list = self.getDistList(submatrix.row, submatrix.col,
                                                 hiCMatrix.fit_cut_intervals(cut_intervals[chrname]))

        # to get the sum of all values at a given distance I use np.bincount which
        # is quite fast. However, the input of bincount is positive integers. Moreover
        # it returns the sum for every consecutive integer, even if this is not on the list.
        # Thus, dist_list, which contains the distance in bp between any two bins, is
        # converted to bin distance.

        # Because positive integers are needed we add +1 to all bin distances
        # such that the value of -1 (which means different chromosomes) can now be used
        dist_list[dist_list == -1] = -binsize
        # divide by binsize to get a list of bin distances and add +1 to remove negative values
        dist_list = (np.array(dist_list).astype(float) / binsize).astype(int) + 1

        # for each distance, return the sum of all values
        sum_counts = np.bincount(dist_list, weights=submatrix.data)
        distance_len = np.bincount(dist_list)

        # compute the average for each distance
        mat_size = submatrix.shape[0]
        mu = {}
        std = {}
        # compute mean value for each distance
        for bin_dist_plus_one, sum_value in enumerate(sum_counts):
            if maxdepth and bin_dist_plus_one == 0:
                # this is for intra chromosomal counts:
                # when max depth is set, the computation of total_intra is
                # not accurate and it is safer to output np.nan
                mu[bin_dist_plus_one] = np.nan
                std[bin_dist_plus_one] = np.nan
                continue

            if bin_dist_plus_one == 0:
                total_intra = mat_size ** 2 - sum([size ** 2 for size in chrom_sizes[chrname]])
                diagonal_length = int(total_intra / 2)
            else:
                # to compute the average counts per distance we take sum_counts and divide
                # by the number of values on the respective diagonal,
                # which is equal to the size of each chromosome - the diagonal offset (for those
                # chromosomes larger than the offset).
                # In the following example with two chromosomes
                # the first (main) diagonal has a size equal to the matrix (6),
                # while the next has 1 value less for each chromosome (4) and the last one has only 2 values
                #
                # 0 1 2 . . .
                # - 0 1 . . .
                # - - 0 . . .
                # . . . 0 1 2
                # . . . - 0 1
                # . . . - - 0
                #
                # idx - 1 because earlier the values were shifted
                diagonal_length = sum([size - (bin_dist_plus_one - 1)
                                       for size in chrom_sizes[chrname]
                                       if size > (bin_dist_plus_one - 1)])
                log.debug("Type of diagonal_length {}".format(type(diagonal_length)))

            # the diagonal length should contain the number of values at a certain distance.
            # If the matrix is dense, distance_len[bin_dist_plus_one] correctly contains
            # the number of values.
            # If the matrix is equally spaced, then the diagonal_length as computed before
            # is accurate.
            # But, if the matrix is both sparse and with unequal bins, then neither of the
            # above methods is accurate, but the diagonal_length as computed before will be closer.
            diagonal_length = max(diagonal_length, distance_len[bin_dist_plus_one])
            log.debug("Type of diagonal_length {}".format(type(diagonal_length)))

            if diagonal_length == 0:
                mu[bin_dist_plus_one] = np.nan
            else:
                mu[bin_dist_plus_one] = np.float64(sum_value) / diagonal_length

            if np.isnan(sum_value):
                log.info("nan value found for distance {}\n".format((bin_dist_plus_one - 1) * binsize))

            # if zscore is needed, compute standard deviation: std = sqrt(mean(abs(x - x.mean())**2))
            if zscore:
                values_sqrt_diff = \
                    np.abs((submatrix.data[dist_list == bin_dist_plus_one] - mu[bin_dist_plus_one]) ** 2)
                # the standard deviation is the sum of the differences with mu squared (value variable)
                # plus all zeros that are not included in the sparse matrix,
                # for which the standard deviation is
                # (0 - mu)**2 = (mu)**2
                # The number of zeros is the diagonal length - the length of the non-zero values
                zero_values_sqrt_diff_sum = (diagonal_length - len(values_sqrt_diff)) * \
                    mu[bin_dist_plus_one] ** 2

                _std = np.sqrt((values_sqrt_diff.sum() + zero_values_sqrt_diff_sum) / diagonal_length)
                std[bin_dist_plus_one] = _std

        # use the expected values to compute obs/exp
        transf_ma = np.zeros(len(submatrix.data))
        for idx, value in enumerate(submatrix.data):
            if depth is not None and dist_list[idx] > depth + 1:
                continue
            if zscore:
                if std[dist_list[idx]] == 0:
                    transf_ma[idx] = np.nan
                else:
                    transf_ma[idx] = (value - mu[dist_list[idx]]) / std[dist_list[idx]]
            else:
                transf_ma[idx] = value / mu[dist_list[idx]]

        submatrix.data = transf_ma
        trasf_matrix[chrom_range[chrname][0]:chrom_range[chrname][1],
                     chrom_range[chrname][0]:chrom_range[chrname][1]] = submatrix.tolil()

    self.matrix = trasf_matrix.tocsr()

    return self.matrix
def solve_via_data(self, data, warm_start, verbose, solver_opts, solver_cache=None):
    import osqp
    P = data[s.P]
    q = data[s.Q]
    A = sp.vstack([data[s.A], data[s.F]]).tocsc()
    data['Ax'] = A
    uA = np.concatenate((data[s.B], data[s.G]))
    data['u'] = uA
    lA = np.concatenate([data[s.B], -np.inf * np.ones(data[s.G].shape)])
    data['l'] = lA

    # Overwrite defaults eps_abs=eps_rel=1e-3, max_iter=4000
    solver_opts['eps_abs'] = solver_opts.get('eps_abs', 1e-5)
    solver_opts['eps_rel'] = solver_opts.get('eps_rel', 1e-5)
    solver_opts['max_iter'] = solver_opts.get('max_iter', 10000)

    if solver_cache is not None and self.name() in solver_cache:
        # Use cached data.
        solver, old_data, results = solver_cache[self.name()]
        same_pattern = (P.shape == old_data[s.P].shape and
                        all(P.indptr == old_data[s.P].indptr) and
                        all(P.indices == old_data[s.P].indices)) and \
                       (A.shape == old_data['Ax'].shape and
                        all(A.indptr == old_data['Ax'].indptr) and
                        all(A.indices == old_data['Ax'].indices))
    else:
        same_pattern = False

    # If sparsity pattern differs need to do setup.
    if warm_start and same_pattern:
        new_args = {}
        for key in ['q', 'l', 'u']:
            if any(data[key] != old_data[key]):
                new_args[key] = data[key]
        factorizing = False
        if any(P.data != old_data[s.P].data):
            P_triu = sp.triu(P).tocsc()
            new_args['Px'] = P_triu.data
            factorizing = True
        if any(A.data != old_data['Ax'].data):
            new_args['Ax'] = A.data
            factorizing = True

        if new_args:
            solver.update(**new_args)
        # Map OSQP statuses back to CVXPY statuses
        status = self.STATUS_MAP.get(results.info.status_val, s.SOLVER_ERROR)
        if status == s.OPTIMAL:
            solver.warm_start(results.x, results.y)
        # Polish if factorizing.
        solver_opts['polish'] = solver_opts.get('polish', factorizing)
        solver.update_settings(verbose=verbose, **solver_opts)
    else:
        # Initialize and solve problem
        solver_opts['polish'] = solver_opts.get('polish', True)
        solver = osqp.OSQP()
        solver.setup(P, q, A, lA, uA, verbose=verbose, **solver_opts)

    results = solver.solve()

    if solver_cache is not None:
        solver_cache[self.name()] = (solver, data, results)
    return results
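# Standalone sketch of the raw OSQP API that solve_via_data drives above
# (illustrative problem data, not from the original source; assumes the
# osqp 0.6.x Python interface). OSQP only stores the upper-triangular part
# of P, which is why the warm-start path above passes
# sp.triu(P).tocsc().data as the 'Px' update.
import numpy as np
import scipy.sparse as sp
import osqp

P = sp.triu(sp.csc_matrix([[4.0, 1.0], [1.0, 2.0]])).tocsc()
q = np.array([1.0, 1.0])
A = sp.csc_matrix([[1.0, 1.0], [1.0, 0.0], [0.0, 1.0]])
l = np.array([1.0, 0.0, 0.0])
u = np.array([1.0, 0.7, 0.7])

solver = osqp.OSQP()
solver.setup(P, q, A, l, u, verbose=False)
res = solver.solve()
print(res.info.status, res.x)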
def get_edges(sparse_matrix, is_triu=True):
    coo = sp.coo_matrix(sparse_matrix)
    if is_triu:
        coo = sp.triu(coo, 1)
    return np.vstack((coo.row, coo.col)).transpose()  # .tolist()
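# Quick illustrative call (assumes numpy as np and scipy.sparse as sp,
# matching the function above):
import numpy as np
import scipy.sparse as sp

adj = sp.csr_matrix(np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 0, 0]]))
print(get_edges(adj))  # [[0 1], [0 2]]: each undirected edge listed once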
def check_network_health(self):
    r"""
    This method checks the network topological health by checking for:

        (1) Isolated pores
        (2) Islands or isolated clusters of pores
        (3) Duplicate throats
        (4) Bidirectional throats (ie. symmetrical adjacency matrix)
        (5) Headless throats

    Returns
    -------
    A dictionary containing the offending pores or throat numbers under
    each named key.

    It also returns a list of which pores and throats should be trimmed
    from the network to restore health.  This list is a suggestion only,
    and is based on keeping the largest cluster and trimming the others.

    Notes
    -----
    - Does not yet check for duplicate pores
    - Does not yet suggest which throats to remove
    - This is just a 'check' method and does not 'fix' the problems it finds
    """
    health = Tools.HealthDict()
    health['disconnected_clusters'] = []
    health['isolated_pores'] = []
    health['trim_pores'] = []
    health['duplicate_throats'] = []
    health['bidirectional_throats'] = []
    health['headless_throats'] = []
    health['looped_throats'] = []

    # Check for headless throats
    hits = sp.where(self['throat.conns'] > self.Np - 1)[0]
    if sp.size(hits) > 0:
        health['headless_throats'] = sp.unique(hits)
        logger.warning('Health check cannot complete due to connectivity '
                       'errors. Please correct existing errors & recheck.')
        return health

    # Check for throats that loop back onto the same pore
    P12 = self['throat.conns']
    hits = sp.where(P12[:, 0] == P12[:, 1])[0]
    if sp.size(hits) > 0:
        health['looped_throats'] = hits

    # Check for individual isolated pores
    Ps = self.num_neighbors(self.pores())
    if sp.sum(Ps == 0) > 0:
        logger.warning(str(sp.sum(Ps == 0)) + ' pores have no neighbors')
        health['isolated_pores'] = sp.where(Ps == 0)[0]

    # Check for separated clusters of pores
    temp = []
    Cs = self.find_clusters(self.tomask(throats=self.throats('all')))
    if sp.shape(sp.unique(Cs))[0] > 1:
        logger.warning('Isolated clusters exist in the network')
        for i in sp.unique(Cs):
            temp.append(sp.where(Cs == i)[0])
        b = sp.array([len(item) for item in temp])
        c = sp.argsort(b)[::-1]
        for i in range(0, len(c)):
            health['disconnected_clusters'].append(temp[c[i]])
            if i > 0:
                health['trim_pores'].extend(temp[c[i]])

    # Check for duplicate throats
    i = self['throat.conns'][:, 0]
    j = self['throat.conns'][:, 1]
    v = sp.array(self['throat.all'], dtype=int)
    adjmat = sprs.coo_matrix((v, (i, j)), [self.Np, self.Np])
    temp = adjmat.tolil()  # Convert to lil to combine duplicates
    # Compile lists of which specific throats are duplicates
    # Be VERY careful here, as throats are not in order
    mergeTs = []
    for i in range(0, self.Np):
        if sp.any(sp.array(temp.data[i]) > 1):
            ind = sp.where(sp.array(temp.data[i]) > 1)[0]
            P = sp.array(temp.rows[i])[ind]
            Ts = self.find_connecting_throat(P1=i, P2=P)[0]
            mergeTs.append(Ts)
    health['duplicate_throats'] = mergeTs

    # Check for bidirectional throats
    num_full = adjmat.sum()
    temp = sprs.triu(adjmat, k=1)
    num_upper = temp.sum()
    if num_full > num_upper:
        biTs = sp.where(self['throat.conns'][:, 0] >
                        self['throat.conns'][:, 1])[0]
        health['bidirectional_throats'] = biTs.tolist()

    return health
import pandas as pd, os, sys
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse as sps

syn = pd.read_csv("../doc/synthetic.txt", names=['a', 'b'], sep=" ")
data = np.array(syn)

from sklearn.metrics.pairwise import euclidean_distances
X = euclidean_distances(data, data)
X2 = X.copy()
# filter out large values / distances so the matrix can be sparse
X2[X > 2000] = 0.0
X3 = sps.lil_matrix(X2)
X4 = sps.triu(X3)
print('non-zero items', len(X4.nonzero()[0]))
print(X4.shape)

from scipy.io import mmwrite, mmread
mmwrite('/tmp/syndist', X4)
os.system("../felzclust/felzclust /tmp/syndist.mtx 20000 100 > /tmp/out")
df = pd.read_csv('/tmp/out', sep=';')
syn['cluster'] = df['cluster']
print(syn[:5])

import matplotlib.cm as cm
def compute_distance_mean(hicmat, maxdepth=None, perchr=False):
    """
    Converts a corrected counts matrix into an obs / expected matrix or
    z-scores fast. The caveat is that the obs/exp or z-score are only
    computed for non-zero values, although zero values that are not part
    of the sparse matrix are considered.

    For each diagonal the mean (and std when computing z-scores) are
    calculated and then each non-zero value of the sparse matrix is
    replaced by the obs/exp or z-score.

    Parameters
    ----------
    hicmat: HiCMatrix object
    maxdepth: maximum distance from the diagonal to consider.
              All contacts beyond this distance will not be considered.
    perchr: bool to indicate if computations should be performed per chromosome

    Returns
    -------
    observed / expected sparse matrix

    >>> from scipy.sparse import csr_matrix, dia_matrix
    >>> row, col = np.triu_indices(5)
    >>> cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
    ...                  ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
    >>> hic = HiCMatrix.hiCMatrix()
    >>> hic.nan_bins = []
    >>> matrix = np.array([
    ... [ 1,  8,  5, 3, 0],
    ... [ 0,  4, 15, 5, 1],
    ... [ 0,  0,  0, 7, 2],
    ... [ 0,  0,  0, 0, 1],
    ... [ 0,  0,  0, 0, 0]])

    >>> hic.matrix = csr_matrix(matrix)
    >>> hic.setMatrix(hic.matrix, cut_intervals)
    >>> hic.convert_to_obs_exp_matrix().todense()
    matrix([[ 1. ,  0.8,  1. ,  1. ,  0. ],
            [ 0. ,  4. ,  1.5,  1. ,  1. ],
            [ 0. ,  0. ,  0. ,  0.7,  2. ],
            [ 0. ,  0. ,  0. ,  0. ,  1. ],
            [ 0. ,  0. ,  0. ,  0. ,  0. ]])

    >>> hic.matrix = csr_matrix(matrix)
    >>> hic.convert_to_obs_exp_matrix(maxdepth=20).todense()
    matrix([[ 1. ,  0.8,  1. ,  0. ,  0. ],
            [ 0. ,  4. ,  1.5,  1. ,  0. ],
            [ 0. ,  0. ,  0. ,  0.7,  nan],
            [ 0. ,  0. ,  0. ,  0. ,  nan],
            [ 0. ,  0. ,  0. ,  0. ,  0. ]])

    >>> hic.matrix = csr_matrix(matrix)
    >>> hic.convert_to_obs_exp_matrix(zscore=True).todense()
    matrix([[ 0.        , -0.56195149,         nan,         nan, -1.41421356],
            [ 0.        ,  1.93649167,  1.40487872,         nan,  0.        ],
            [ 0.        ,  0.        , -0.64549722, -0.84292723,  1.41421356],
            [ 0.        ,  0.        ,  0.        , -0.64549722,  0.        ],
            [ 0.        ,  0.        ,  0.        ,  0.        , -0.64549722]])

    nans occur where the standard deviation is zero
    """
    binsize = hicmat.getBinSize()

    if maxdepth:
        if maxdepth < binsize:
            exit("Please specify a maxDepth larger than bin size ({})".format(binsize))

        max_depth_in_bins = int(float(maxdepth * 1.5) / binsize)
        # work only with the upper matrix
        # and remove all pixels that are beyond max_depth_in_bins
        # (this is done by subtracting a second sparse matrix
        # that contains only the upper matrix that wants to be removed)
        hicmat.matrix = triu(hicmat.matrix, k=0, format='csr') - \
            triu(hicmat.matrix, k=max_depth_in_bins, format='csr')
    else:
        hicmat.matrix = triu(hicmat.matrix, k=0, format='csr')

    hicmat.matrix.eliminate_zeros()

    chr_submatrix = OrderedDict()
    cut_intervals = OrderedDict()
    chrom_sizes = OrderedDict()
    chrom_range = OrderedDict()
    if perchr:
        for chrname in hicmat.getChrNames():
            chr_range = hicmat.getChrBinRange(chrname)
            chr_submatrix[chrname] = hicmat.matrix[chr_range[0]:chr_range[1],
                                                   chr_range[0]:chr_range[1]].tocoo()
            cut_intervals[chrname] = [hicmat.cut_intervals[x]
                                      for x in range(chr_range[0], chr_range[1])]
            chrom_sizes[chrname] = [chr_submatrix[chrname].shape[0]]
            chrom_range[chrname] = (chr_range[0], chr_range[1])
    else:
        chr_submatrix['all'] = hicmat.matrix.tocoo()
        cut_intervals['all'] = hicmat.cut_intervals
        chrom_sizes['all'] = np.array([v[1] - v[0]
                                       for k, v in iteritems(hicmat.chrBinBoundaries)])
        chrom_range['all'] = (0, hicmat.matrix.shape[0])

    mean_dict = {}
    for chrname, submatrix in iteritems(chr_submatrix):
        log.info("processing chromosome {}\n".format(chrname))

        dist_list, chrom_list = hicmat.getDistList(
            submatrix.row, submatrix.col,
            HiCMatrix.hiCMatrix.fit_cut_intervals(cut_intervals[chrname]))

        # to get the sum of all values at a given distance I use np.bincount which
        # is quite fast. However, the input of bincount is positive integers. Moreover
        # it returns the sum for every consecutive integer, even if this is not on the list.
        # Thus, dist_list, which contains the distance in bp between any two bins, is
        # converted to bin distance.

        # Because positive integers are needed we add +1 to all bin distances
        # such that the value of -1 (which means different chromosomes) can now be used
        dist_list[dist_list == -1] = -binsize
        # divide by binsize to get a list of bin distances and add +1 to remove negative values
        dist_list = (np.array(dist_list).astype(float) / binsize).astype(int) + 1

        # for each distance, return the sum of all values
        sum_counts = np.bincount(dist_list, weights=submatrix.data)
        distance_len = np.bincount(dist_list)

        # compute the average for each distance
        mat_size = submatrix.shape[0]

        # compute mean value for each distance
        mu = {}
        zero_value_bins = []
        for bin_dist_plus_one, sum_value in enumerate(sum_counts):
            if maxdepth and bin_dist_plus_one == 0:
                # this is for intra chromosomal counts:
                # when max depth is set, the computation of total_intra is
                # not accurate and it is safer to output np.nan
                mu[bin_dist_plus_one] = np.nan
                continue

            if bin_dist_plus_one == 0:
                total_intra = mat_size ** 2 - sum([size ** 2 for size in chrom_sizes[chrname]])
                diagonal_length = total_intra / 2
            else:
                # to compute the average counts per distance we take sum_counts and divide
                # by the number of values on the respective diagonal,
                # which is equal to the size of each chromosome - the diagonal offset (for those
                # chromosomes larger than the offset).
                # In the following example with two chromosomes
                # the first (main) diagonal has a size equal to the matrix (6),
                # while the next has 1 value less for each chromosome (4) and the last one has only 2 values
                #
                # 0 1 2 . . .
                # - 0 1 . . .
                # - - 0 . . .
                # . . . 0 1 2
                # . . . - 0 1
                # . . . - - 0
                #
                # idx - 1 because earlier the values were shifted
                diagonal_length = sum([size - (bin_dist_plus_one - 1)
                                       for size in chrom_sizes[chrname]
                                       if size > (bin_dist_plus_one - 1)])

            # the diagonal length should contain the number of values at a certain distance.
            # If the matrix is dense, distance_len[bin_dist_plus_one] correctly contains
            # the number of values.
            # If the matrix is equally spaced, then the diagonal_length as computed before
            # is accurate.
            # But, if the matrix is both sparse and with unequal bins, then neither of the
            # above methods is accurate, but the diagonal_length as computed before will be closer.
            diagonal_length = max(diagonal_length, distance_len[bin_dist_plus_one])

            if diagonal_length == 0:
                mu[bin_dist_plus_one] = np.nan
            else:
                mu[bin_dist_plus_one] = np.float64(sum_value) / diagonal_length

            if sum_value == 0:
                zero_value_bins.append(bin_dist_plus_one)
                log.info("zero value for {}, diagonal len: {}\n".format(
                    bin_dist_plus_one, diagonal_length))
            if len(zero_value_bins) > 10:
                diff = np.diff(zero_value_bins)
                if len(diff[diff == 1]) > 10:
                    # if too many consecutive bins with zero are found, probably no
                    # further counts will be found
                    log.info("skipping rest of chromosome {}. Too many empty diagonals\n"
                             .format(chrname))
                    break

            if np.isnan(sum_value):
                log.info("nan value found for distance {}\n".format(
                    (bin_dist_plus_one - 1) * binsize))

        if maxdepth is None:
            maxdepth = np.inf
        mean_dict[chrname] = OrderedDict([((k - 1) * binsize, v) for k, v in iteritems(mu)
                                          if k > 0 and (k - 1) * binsize <= maxdepth])
        # mean_dict[chrname]['intra_chr'] = mu[0]

    return mean_dict
pickle.dump(gp, open('pz/dNdz_gp_%s_%s.p' % (band, depth), 'wb'))

zs = np.arange(0.0, midz.max(), 0.01).reshape(-1, 1)
ys, sigma = gp.predict(zs, return_std=True)

pl.plot(zs, ys, '-', alpha=0.5, color=colors[ii])
plt.fill(np.concatenate([zs, zs[::-1]]),
         np.concatenate([ys - 1.9600 * sigma, (ys + 1.9600 * sigma)[::-1]]),
         alpha=.2, fc=colors[ii], ec='None', label='')

## sL = sparse.csr_matrix(gp.L_)
sL = sparse.triu(gp.L_, k=-1)

print(gp.L_)
print('----------------------------------')
print(sL)

## plt.imshow(sL.todense())

pl.xlim(0.0, 6.00)
pl.ylim(0.0, 1.25)

pl.legend(ncol=2, loc=2, frameon=False)

pl.xlabel(r'$z$', fontsize=14)
pl.ylabel(r'$p(z)$')

plt.tight_layout()

ax = pl.gca()
def plot_delaunay(cells, labels=None, color=None, style='-',
                  centroid_style='g+', negative=None, axes=None,
                  linewidth=1, individual=False, fallback_color='gray'):
    """
    Delaunay plot.

    Arguments:

        cells (Partition): full partition

        labels (numpy.ndarray): numerical labels for cell adjacency relationship

        color (str): single-character colours in a string, e.g. 'rrrbgy'

        style (str): line style

        centroid_style (str): marker style of the cell centers

        negative (any): if ``None``, do not plot edges corresponding to
            negative adjacency labels; if '*voronoi*', plot the corresponding
            Voronoi edge instead, for edges with negative labels

        axes (matplotlib.axes.Axes): axes where to plot

        linewidth (int): line width

        individual (bool): plot each edge independently; this generates a
            lot of handles and takes time

        fallback_color (str): colour for unexpected labels

    Returns:

        tuple: list of handles of the plotted edges, handle of the plotted centroids
    """
    if axes is None:
        import matplotlib.pyplot as plt
        axes = plt
    try:
        tessellation = cells.tessellation
    except AttributeError:
        tessellation = cells

    vertices = tessellation.cell_centers
    if negative == 'voronoi':
        voronoi = tessellation.cell_vertices

    labels, color = _graph_theme(tessellation, labels, color, negative)

    # if asymmetric, can be either triu or tril
    A = sparse.triu(tessellation.cell_adjacency, format='coo')
    I, J, K = A.row, A.col, A.data
    if not I.size:
        A = sparse.tril(tessellation.cell_adjacency, format='coo')
        I, J, K = A.row, A.col, A.data

    if not individual:
        by_color = defaultdict(list)

    edge_handles, centroid_handle = [], None  # handles

    # plot delaunay
    for i, j, k in zip(I, J, K):
        x, y = zip(vertices[i], vertices[j])
        if labels is None:
            c = 0
        else:
            label = tessellation.adjacency_label[k]
            try:
                c = labels.index(label)
            except ValueError:
                continue
            if label <= 0:
                if negative == 'voronoi':
                    try:
                        vert_ids = set(tessellation.cell_vertices.get(i, [])) & \
                                   set(tessellation.cell_vertices.get(j, []))
                        x, y = voronoi[vert_ids].T
                    except ValueError:
                        continue
        if individual:
            h = axes.plot(x, y, style, color=color[c], linewidth=linewidth)
            assert not h[1:]
            edge_handles.append(h)
        else:
            by_color[c].append((x, y))

    if not individual:
        if not color[1:]:
            _clr = color[0]
        for c in by_color:
            xy = by_color[c]
            X = np.zeros((len(xy) * 3,))
            Y = np.empty((len(xy) * 3,))
            Y[:] = np.nan
            i = 0
            for x, y in xy:
                I = slice(i * 3, i * 3 + 2)
                X[I], Y[I] = x, y
                i += 1
            if color[1:]:
                try:
                    _clr = color[c]
                except IndexError:
                    import warnings
                    warnings.warn('too few specified colours; at least {:d} needed'.format(c),
                                  RuntimeWarning)
                    _clr = fallback_color
            h = axes.plot(X, Y, style, color=_clr, linewidth=linewidth)
            assert not h[1:]
            edge_handles.append(h[0])

    # plot cell centers
    if centroid_style:
        h = axes.plot(vertices[:, 0], vertices[:, 1], centroid_style)
        assert not h[1:]
        centroid_handle = h[0]

    # resize window
    try:
        axes.axis(cells.bounding_box[['x', 'y']].values.flatten('F'))
    except AttributeError:
        pass
    except ValueError:
        print(traceback.format_exc())

    return edge_handles, centroid_handle
def visualizeLaplaceWeights(mesh, quantile=.01, weights=None, cmap='seismic',
                            viewer=None, **kwargs):
    """Visualize Laplacian weights.

    Requires ``navis`` to be installed.

    Parameters
    ----------
    mesh :      trimesh.Trimesh
                Mesh to plot the weights for.
    quantile :  float [0-1]
                The vast majority of weights will be close to the mean while
                the interesting outliers will be very few. By default we are
                showing the top and bottom 0.1 quantile (i.e. the 10% highest
                and lowest values).
    weights :   np.ndarray, optional
                Laplacian weights. If not provided, will be computed.
    """
    mesh = make_trimesh(mesh, validate=False)

    try:
        import navis
        import vispy as vp
        import matplotlib.pyplot as plt
    except ImportError:
        raise ImportError('This function requires navis to be installed:\n'
                          '  pip3 install navis')

    if not isinstance(weights, np.ndarray):
        weights = laplacian_cotangent(mesh,
                                      # symmetric=False,
                                      normalized=True)

    if not isinstance(weights, spsp.coo_matrix):
        weights = spsp.coo_matrix(weights)

    # Get data (upper triangle only -> is supposed to be symmetrical)
    # Also removes diagonal (k=1)
    triu = spsp.triu(weights, k=1)
    row, col, data = triu.row, triu.col, triu.data

    if quantile:
        top = data >= np.quantile(data, 1 - quantile)
        bottom = data <= np.quantile(data, quantile)
        row = row[top | bottom]
        col = col[top | bottom]
        data = data[top | bottom]

    # Weights are computed per edge
    co1, co2 = mesh.vertices[row], mesh.vertices[col]
    segments = np.hstack((co1, co2)).reshape(co1.shape[0] * 2, 3)

    # Generate colors
    cmap = plt.get_cmap(cmap)
    weights_norm = (data - data.min()) / (data.max() - data.min())
    colors = cmap(weights_norm)
    alpha = np.clip(np.fabs(weights_norm - .5) * 2, a_min=0.01, a_max=1)

    # We need to provide one color per vertex
    colors = np.hstack((colors, colors)).reshape(colors.shape[0] * 2, 4)
    # alpha = np.hstack((alpha, alpha)).reshape(alpha.shape[0] * 2, 1)

    # Combine color with alpha
    # colors = np.hstack((colors[:, :3], alpha))

    t = vp.scene.visuals.Line(pos=segments,
                              color=colors,
                              # width can only be used with method 'agg'
                              width=kwargs.get('linewidth', 1),
                              connect='segments',
                              antialias=kwargs.get('antialias', True),
                              method=kwargs.get('method', 'gl'))

    if not viewer:
        viewer = navis.get_viewer()
    if not viewer:
        viewer = navis.Viewer()

    viewer.add(t)

    return t
def mask_test_edges(adj, test_frac=.1, val_frac=.05, prevent_disconnect=True, verbose=False):
    # NOTE: Splits are randomized and results might slightly deviate from reported numbers in the paper.

    # Remove diagonal elements
    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()
    # Check that diag is zero:
    assert np.diag(adj.todense()).sum() == 0

    g = nx.from_scipy_sparse_matrix(adj)
    orig_num_cc = nx.number_connected_components(g)

    adj_triu = sp.triu(adj)  # upper triangular portion of adj matrix
    adj_tuple = sparse_to_tuple(adj_triu)  # (coords, values, shape), edges only 1 way
    edges = adj_tuple[0]  # all edges, listed only once (not 2 ways)
    # edges_all = sparse_to_tuple(adj)[0]  # ALL edges (includes both ways)
    num_test = int(np.floor(edges.shape[0] * test_frac))  # controls how large the test set should be
    num_val = int(np.floor(edges.shape[0] * val_frac))  # controls how large the validation set should be

    # Store edges in list of ordered tuples (node1, node2) where node1 < node2
    edge_tuples = [(min(edge[0], edge[1]), max(edge[0], edge[1])) for edge in edges]
    all_edge_tuples = set(edge_tuples)
    train_edges = set(edge_tuples)  # initialize train_edges to have all edges
    test_edges = set()
    val_edges = set()

    # Iterate over shuffled edges, add to train/val sets
    np.random.shuffle(edge_tuples)
    counter = 0
    for edge in edge_tuples:
        counter += 1
        if counter % 100 == 0:
            print("processed:" + str(counter))
        # print edge
        node1 = edge[0]
        node2 = edge[1]

        # If removing edge would disconnect a connected component, backtrack and move on
        g.remove_edge(node1, node2)
        if prevent_disconnect:
            if nx.number_connected_components(g) > orig_num_cc:
                g.add_edge(node1, node2)
                continue

        # Fill test_edges first
        if len(test_edges) < num_test:
            test_edges.add(edge)
            train_edges.remove(edge)
        # Then, fill val_edges
        elif len(val_edges) < num_val:
            val_edges.add(edge)
            train_edges.remove(edge)
        # Both edge lists full --> break loop
        elif len(test_edges) == num_test and len(val_edges) == num_val:
            break

    if len(val_edges) < num_val or len(test_edges) < num_test:
        print("WARNING: not enough removable edges to perform full train-test split!")
        print("Num. (test, val) edges requested: (", num_test, ", ", num_val, ")")
        print("Num. (test, val) edges returned: (", len(test_edges), ", ", len(val_edges), ")")

    if prevent_disconnect:
        assert nx.number_connected_components(g) == orig_num_cc

    if verbose:
        print('creating false test edges...')

    test_edges_false = set()
    while len(test_edges_false) < num_test:
        idx_i = np.random.randint(0, adj.shape[0])
        idx_j = np.random.randint(0, adj.shape[0])
        if idx_i == idx_j:
            continue
        false_edge = (min(idx_i, idx_j), max(idx_i, idx_j))
        # Make sure false_edge is not an actual edge, and not a repeat
        if false_edge in all_edge_tuples:
            continue
        if false_edge in test_edges_false:
            continue
        test_edges_false.add(false_edge)

    if verbose:
        print('creating false val edges...')

    val_edges_false = set()
    while len(val_edges_false) < num_val:
        idx_i = np.random.randint(0, adj.shape[0])
        idx_j = np.random.randint(0, adj.shape[0])
        if idx_i == idx_j:
            continue
        false_edge = (min(idx_i, idx_j), max(idx_i, idx_j))
        # Make sure false_edge is not an actual edge, not in test_edges_false, not a repeat
        if false_edge in all_edge_tuples or false_edge in test_edges_false or false_edge in val_edges_false:
            continue
        val_edges_false.add(false_edge)

    if verbose:
        print('creating false train edges...')

    train_edges_false = set()
    while len(train_edges_false) < len(train_edges):
        idx_i = np.random.randint(0, adj.shape[0])
        idx_j = np.random.randint(0, adj.shape[0])
        if idx_i == idx_j:
            continue
        false_edge = (min(idx_i, idx_j), max(idx_i, idx_j))
        # Make sure false_edge is not an actual edge, not in test_edges_false,
        # not in val_edges_false, not a repeat
        if false_edge in all_edge_tuples or false_edge in test_edges_false or \
                false_edge in val_edges_false or false_edge in train_edges_false:
            continue
        train_edges_false.add(false_edge)

    if verbose:
        print('final checks for disjointness...')

    # assert: false_edges are actually false (not in all_edge_tuples)
    assert test_edges_false.isdisjoint(all_edge_tuples)
    assert val_edges_false.isdisjoint(all_edge_tuples)
    assert train_edges_false.isdisjoint(all_edge_tuples)

    # assert: test, val, train false edges disjoint
    assert test_edges_false.isdisjoint(val_edges_false)
    assert test_edges_false.isdisjoint(train_edges_false)
    assert val_edges_false.isdisjoint(train_edges_false)

    # assert: test, val, train positive edges disjoint
    assert val_edges.isdisjoint(train_edges)
    assert test_edges.isdisjoint(train_edges)
    assert val_edges.isdisjoint(test_edges)

    if verbose:
        print('creating adj_train...')

    # Re-build adj matrix using remaining graph
    adj_train = nx.adjacency_matrix(g)

    # Convert edge-lists to numpy arrays
    train_edges = np.array([list(edge_tuple) for edge_tuple in train_edges])
    train_edges_false = np.array([list(edge_tuple) for edge_tuple in train_edges_false])
    val_edges = np.array([list(edge_tuple) for edge_tuple in val_edges])
    val_edges_false = np.array([list(edge_tuple) for edge_tuple in val_edges_false])
    test_edges = np.array([list(edge_tuple) for edge_tuple in test_edges])
    test_edges_false = np.array([list(edge_tuple) for edge_tuple in test_edges_false])

    # NOTE: these edge lists only contain single direction of edge!
    return adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false
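# Minimal usage sketch for mask_test_edges, assuming an older networkx
# (< 3.0, where from_scipy_sparse_matrix still exists) and the
# sparse_to_tuple helper this function relies on:
import networkx as nx

g = nx.karate_club_graph()
adj = nx.adjacency_matrix(g).astype(float)
splits = mask_test_edges(adj, test_frac=0.1, val_frac=0.05, prevent_disconnect=True)
adj_train, train_e, train_f, val_e, val_f, test_e, test_f = splits
print(adj_train.shape, len(train_e), len(val_e), len(test_e))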
def plotMatrix(matrixinputfile, imageoutputfile, regionindex1, regionindex2, comparematrix, title, bigwig):
    if not checkExtension(matrixinputfile, '.cool'):
        msg = "input matrix must be in cooler format (.cool)"
        raise SystemExit(msg)
    if comparematrix and not checkExtension(comparematrix, ".cool"):
        msg = "if specified, compare matrix must be in cooler format (.cool)"
        raise SystemExit(msg)
    if not imageoutputfile:
        imageoutputfile = os.path.splitext(matrixinputfile)[0] + '.png'
    elif imageoutputfile and not checkExtension(imageoutputfile, ".png"):
        imageoutputfile = os.path.splitext(imageoutputfile)[0] + ".png"

    # get the full matrix first to extract the desired region
    ma = hm.hiCMatrix(matrixinputfile)
    cuts = ma.cut_intervals
    chromosome = cuts[0][0]
    maxIndex = len(cuts) - 1

    # check indices and get the region if ok
    if regionindex1 > maxIndex:
        msg = "invalid start region. Allowed is 0 to {0:d} (0 to {1:d})".format(maxIndex, cuts[maxIndex][1])
        raise SystemExit(msg)
    if regionindex2 < regionindex1:
        msg = "region index 2 must not be smaller than region index 1"
        raise SystemExit(msg)
    if regionindex2 > maxIndex:
        regionindex2 = maxIndex
        print("region index 2 clamped to max. value {0:d}".format(maxIndex))
    region = str(chromosome) + ":" + str(cuts[regionindex1][1]) + "-" + str(cuts[regionindex2][1])

    # now get the data for the input matrix, restricted to the desired region
    upperHiCMatrix = hm.hiCMatrix(matrixinputfile, pChrnameList=[region])
    upperMatrix = triu(upperHiCMatrix.matrix, k=1, format="csr")

    # if set, get data from the same region also for the compare matrix
    # there's no compatibility check so far
    lowerHiCMatrix = None
    lowerMatrix = None
    if comparematrix:
        lowerHiCMatrix = hm.hiCMatrix(comparematrix)
        if chromosome not in [row[0] for row in lowerHiCMatrix.cut_intervals]:
            msg = "compare matrix must contain the same chromosome as the input matrix"
            raise SystemExit(msg)
        lowerHiCMatrix = hm.hiCMatrix(comparematrix, pChrnameList=[region])
        lowerMatrix = tril(lowerHiCMatrix.matrix, k=0, format="csr")
        if lowerMatrix.get_shape() != upperMatrix.get_shape():
            msg = "shapes of input matrix and compare matrix do not match. Check resolutions"
            raise SystemExit(msg)

    # arguments for plotting
    plotArgs = Namespace(bigwig=bigwig, chromosomeOrder=None, clearMaskedBins=False,
                         colorMap='RdYlBu_r', disable_tight_layout=False, dpi=300,
                         flipBigwigSign=False, log=False, log1p=True, perChromosome=False,
                         region=region, region2=None, scaleFactorBigwig=1.0,
                         scoreName=None, title=title, vMax=None, vMaxBigwig=None,
                         vMin=1.0, vMinBigwig=None, matrix=matrixinputfile)

    # following code is largely duplicated from hicPlotMatrix
    # not exactly beautiful, but works for now
    chrom, region_start, region_end, idx1, start_pos1, chrom2, region_start2, region_end2, idx2, start_pos2 = hicPlot.getRegion(plotArgs, upperHiCMatrix)

    mixedMatrix = None
    if comparematrix:
        mixedMatrix = np.asarray((lowerMatrix + upperMatrix).todense().astype(float))
    else:
        mixedMatrix = np.asarray(upperHiCMatrix.matrix.todense().astype(float))

    # colormap for plotting
    cmap = cm.get_cmap(plotArgs.colorMap)  # pylint: disable=no-member
    cmap.set_bad('black')
    bigwig_info = None
    if plotArgs.bigwig:  # pylint: disable=no-member
        bigwig_info = {'args': plotArgs, 'axis': None, 'axis_colorbar': None,
                       'nan_bins': upperHiCMatrix.nan_bins}
    norm = None

    if plotArgs.log or plotArgs.log1p:  # pylint: disable=no-member
        mask = mixedMatrix == 0
        try:
            mixedMatrix[mask] = np.nanmin(mixedMatrix[~mask])
        except ValueError:
            log.info('Matrix contains only 0. Set all values to {}'.format(np.finfo(float).tiny))
            mixedMatrix[mask] = np.finfo(float).tiny
        if np.isnan(mixedMatrix).any() or np.isinf(mixedMatrix).any():
            log.debug("any nan {}".format(np.isnan(mixedMatrix).any()))
            log.debug("any inf {}".format(np.isinf(mixedMatrix).any()))
            mask_nan = np.isnan(mixedMatrix)
            mask_inf = np.isinf(mixedMatrix)
            mixedMatrix[mask_nan] = np.nanmin(mixedMatrix[~mask_nan])
            mixedMatrix[mask_inf] = np.nanmin(mixedMatrix[~mask_inf])
            log.debug("any nan after remove of nan: {}".format(np.isnan(mixedMatrix).any()))
            log.debug("any inf after remove of inf: {}".format(np.isinf(mixedMatrix).any()))
    if plotArgs.log1p:  # pylint: disable=no-member
        mixedMatrix += 1
        norm = LogNorm()
    elif plotArgs.log:  # pylint: disable=no-member
        norm = LogNorm()

    if plotArgs.bigwig:  # pylint: disable=no-member
        # increase figure height to accommodate bigwig track
        fig_height = 8.5
    else:
        fig_height = 7
    height = 4.8 / fig_height

    fig_width = 8
    width = 5.0 / fig_width
    left_margin = (1.0 - width) * 0.5

    fig = plt.figure(figsize=(fig_width, fig_height), dpi=plotArgs.dpi)  # pylint: disable=no-member

    if plotArgs.bigwig:  # pylint: disable=no-member
        gs = gridspec.GridSpec(2, 2, height_ratios=[0.90, 0.1], width_ratios=[0.97, 0.03])
        gs.update(hspace=0.05, wspace=0.05)
        ax1 = plt.subplot(gs[0, 0])
        ax2 = plt.subplot(gs[1, 0])
        ax3 = plt.subplot(gs[0, 1])
        bigwig_info['axis'] = ax2
        bigwig_info['axis_colorbar'] = ax3
    else:
        ax1 = None
    bottom = 1.3 / fig_height
    position = [left_margin, bottom, width, height]
    hicPlot.plotHeatmap(mixedMatrix, ma.get_chromosome_sizes(), fig, position,
                        plotArgs, cmap, xlabel=chrom, ylabel=chrom2,
                        start_pos=start_pos1, start_pos2=start_pos2, pNorm=norm,
                        pAxis=ax1, pBigwig=bigwig_info)
    plt.savefig(imageoutputfile, dpi=plotArgs.dpi)  # pylint: disable=no-member
    plt.close(fig)
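# The upper/lower mixing that plotMatrix performs, in isolation: matrix A
# fills the strict upper triangle and matrix B the lower triangle
# (including the diagonal), mirroring the k=1/k=0 choices above.
import numpy as np
from scipy.sparse import csr_matrix, tril, triu

A = csr_matrix(np.arange(16).reshape(4, 4))
B = csr_matrix(np.ones((4, 4)))
mixed = triu(A, k=1, format="csr") + tril(B, k=0, format="csr")
print(mixed.toarray())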
def _symmetric_matrix(mat: dok_matrix) -> dok_matrix:
    """Split every strictly upper-triangular entry of ``mat`` evenly between
    its position and the mirrored lower-triangular position (symmetrizing
    ``mat`` when its strictly lower triangle is empty)."""
    upper = triu(mat, 1, format="dok") / 2
    # `todok` is necessary because the subtraction yields another sparse format
    return (mat + upper.transpose() - upper).todok()
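# Worked example for _symmetric_matrix: a single directed weight of 4.0 is
# split evenly between the two mirrored positions.
from scipy.sparse import dok_matrix

mat = dok_matrix((2, 2))
mat[0, 1] = 4.0
print(_symmetric_matrix(mat).toarray())  # [[0., 2.], [2., 0.]]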
def upper_triangle(s):
    """Return the upper triangle of rmat(s, s) as a dense np.matrix."""
    from scipy.sparse import triu
    return np.matrix(triu(rmat(s, s)).toarray())
def vis_aggregate_groups(V, E2V, AggOp, mesh_type, fname='output.vtu'):
    """Coarse grid visualization of aggregate groups.

    Create .vtu files for use in Paraview or display with Matplotlib.

    Parameters
    ----------
    V : {array}
        coordinate array (N x D)
    E2V : {array}
        element index array (Nel x Nelnodes)
    AggOp : {csr_matrix}
        sparse matrix for the aggregate-vertex relationship (N x Nagg)
    mesh_type : {string}
        type of elements: vertex, tri, quad, tet, hex (all 3d)
    fname : {string, file object}
        file to be written, e.g. 'output.vtu'

    Returns
    -------
    - Writes data to .vtu file for use in paraview (xml 0.1 format) or
      displays to screen using matplotlib

    Notes
    -----
    - Works for both 2d and 3d elements.  Fully-aggregated elements are
      colored with data equal to 3.0, stringy edges within an aggregate
      with 2.0, and solitary vertices with 1.0

    Examples
    --------
    >>> from pyamg.aggregation import standard_aggregation
    >>> from pyamg.vis.vis_coarse import vis_aggregate_groups
    >>> from pyamg.gallery import load_example
    >>> data = load_example('unit_square')
    >>> A = data['A'].tocsr()
    >>> V = data['vertices']
    >>> E2V = data['elements']
    >>> AggOp = standard_aggregation(A)[0]
    >>> vis_aggregate_groups(V=V, E2V=E2V, AggOp=AggOp,
    ...                      mesh_type='tri', fname='output.vtu')

    >>> from pyamg.aggregation import standard_aggregation
    >>> from pyamg.vis.vis_coarse import vis_aggregate_groups
    >>> from pyamg.gallery import load_example
    >>> data = load_example('unit_cube')
    >>> A = data['A'].tocsr()
    >>> V = data['vertices']
    >>> E2V = data['elements']
    >>> AggOp = standard_aggregation(A)[0]
    >>> vis_aggregate_groups(V=V, E2V=E2V, AggOp=AggOp,
    ...                      mesh_type='tet', fname='output.vtu')

    """
    check_input(V=V, E2V=E2V, AggOp=AggOp, mesh_type=mesh_type)
    map_type_to_key = {'tri': 5, 'quad': 9, 'tet': 10, 'hex': 12}
    if mesh_type not in map_type_to_key:
        raise ValueError(f'Unknown mesh_type={mesh_type}')
    key = map_type_to_key[mesh_type]

    AggOp = csr_matrix(AggOp)

    # remove elements with dirichlet BCs
    if E2V.max() >= AggOp.shape[0]:
        E2V = E2V[E2V.max(axis=1) < AggOp.shape[0]]

    # 1 #
    # Find elements with all vertices in same aggregate
    # account for 0 rows.  Mark them as solitary aggregates
    if len(AggOp.indices) != AggOp.shape[0]:
        # rows of AggOp with no entry correspond to unaggregated (solitary) nodes
        new_aggs = np.array(AggOp.sum(axis=1), dtype=int).ravel()
        new_aggs[new_aggs == 1] = AggOp.indices  # keep existing aggregate IDs
        new_aggs[new_aggs == 0] = AggOp.shape[1]  # fill in singletons with maxID+1
        ElementAggs = new_aggs[E2V]
    else:
        ElementAggs = AggOp.indices[E2V]

    # 2 #
    # find all aggregates encompassing full elements
    # mask[i] == True if all vertices in element i belong to the same aggregate
    mask = np.where(abs(np.diff(ElementAggs)).max(axis=1) == 0)[0]
    # mask = (ElementAggs[:,:] == ElementAggs[:,0]).all(axis=1)
    E2V_a = E2V[mask, :]  # elements where element is full
    Nel_a = E2V_a.shape[0]

    # 3 #
    # find edges of elements in the same aggregate (brute force)
    # construct vertex to vertex graph
    col = E2V.ravel()
    row = np.kron(np.arange(0, E2V.shape[0]),
                  np.ones((E2V.shape[1], ), dtype=int))
    data = np.ones((len(col), ))
    if len(row) != len(col):
        raise ValueError('Problem constructing vertex-to-vertex map')
    V2V = coo_matrix((data, (row, col)), shape=(E2V.shape[0], E2V.max() + 1))
    V2V = V2V.T * V2V
    V2V = triu(V2V, 1).tocoo()
    # get all the edges
    edges = np.vstack((V2V.row, V2V.col)).T
    # all the edges in the same aggregate
    E2V_b = edges[AggOp.indices[V2V.row] == AggOp.indices[V2V.col]]
    Nel_b = E2V_b.shape[0]

    # 3.5 #
    # single node aggregates
    sums = np.array(AggOp.sum(axis=0)).ravel()
    E2V_c = np.where(sums == 1)[0]
    Nel_c = len(E2V_c)

    # 4 #
    # now write out the elements and edges
    colors_a = 3 * np.ones((Nel_a, ))  # color triangles with threes
    colors_b = 2 * np.ones((Nel_b, ))  # color edges with twos
    colors_c = 1 * np.ones((Nel_c, ))  # color the vertices with ones

    cells = {1: E2V_c, 3: E2V_b, key: E2V_a}
    cdata = {1: colors_c, 3: colors_b, key: colors_a}
    write_vtu(V=V, cells=cells, fname=fname, cdata=cdata)
def mask_test_edges( adj: sp.coo_matrix, seed: int = 0, validation_frac: float = 0.05, test_frac: float = 0.1, validation_edges_in_adj: bool = False, ): """ Split edges for graph autoencoder into train/validation/test splits. Based on https://github.com/tkipf/gae/blob/master/gae/preprocessing.py Args: adj: scipy.sparse.coo_matrix adjacency matrix. """ rng = np.random.default_rng(seed) def sparse_to_tuple(sparse_mx): if not sp.isspmatrix_coo(sparse_mx): sparse_mx = sparse_mx.tocoo() coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose() values = sparse_mx.data shape = sparse_mx.shape return coords, values, shape # Remove diagonal elements adj = adj - sp.dia_matrix( (adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) adj.eliminate_zeros() # Check that diag is zero: assert np.diag(adj.todense()).sum() == 0 adj_triu = sp.triu(adj) adj_tuple = sparse_to_tuple(adj_triu) edges = adj_tuple[0] edges_all = sparse_to_tuple(adj)[0] num_test = int(np.floor(edges.shape[0] * test_frac)) num_val = int(np.floor(edges.shape[0] * validation_frac)) all_edge_idx = list(range(edges.shape[0])) rng.shuffle(all_edge_idx) val_edge_idx = all_edge_idx[:num_val] test_edge_idx = all_edge_idx[num_val:(num_val + num_test)] test_edges = edges[test_edge_idx] val_edges = edges[val_edge_idx] train_edges = np.delete(edges, np.hstack([test_edge_idx, val_edge_idx]), axis=0) # TODO: use sets? def ismember(a, b, tol=5): rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1) return np.any(rows_close) test_edges_false = [] while len(test_edges_false) < len(test_edges): idx_i = rng.integers(0, adj.shape[0]) idx_j = rng.integers(0, adj.shape[0]) if idx_i == idx_j: continue if ismember([idx_i, idx_j], edges_all): continue if test_edges_false: if ismember([idx_j, idx_i], np.array(test_edges_false)): continue if ismember([idx_i, idx_j], np.array(test_edges_false)): continue test_edges_false.append([idx_i, idx_j]) val_edges_false = [] while len(val_edges_false) < len(val_edges): idx_i = rng.integers(0, adj.shape[0]) idx_j = rng.integers(0, adj.shape[0]) if idx_i == idx_j: continue if ismember([idx_i, idx_j], train_edges): continue if ismember([idx_j, idx_i], train_edges): continue if ismember([idx_i, idx_j], val_edges): continue if ismember([idx_j, idx_i], val_edges): continue if val_edges_false: if ismember([idx_j, idx_i], np.array(val_edges_false)): continue if ismember([idx_i, idx_j], np.array(val_edges_false)): continue val_edges_false.append([idx_i, idx_j]) assert ~ismember(test_edges_false, edges_all) assert ~ismember(val_edges_false, edges_all) assert ~ismember(val_edges, train_edges) assert ~ismember(test_edges, train_edges) assert ~ismember(val_edges, test_edges) if validation_edges_in_adj: adj_edges = np.concatenate((train_edges, val_edges), axis=0) else: adj_edges = train_edges data = np.ones(adj_edges.shape[0]) # Re-build adj matrix adj_train = sp.coo_matrix((data, adj_edges.T), shape=adj.shape) adj_train = adj_train + adj_train.T # NOTE: these edge lists only contain single direction of edge! return ( adj_train, val_edges, val_edges_false, test_edges, test_edges_false, )
def lambda_test_edges(dataset, adj, l): # Function to build training test with proportion l # Remove diagonal elements adj = adj - sp.dia_matrix( (adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) adj.eliminate_zeros() # Check that diag is zero: assert np.diag(adj.todense()).sum() == 0 test_ratio = 1 - 0.05 - l adj_triu = sp.triu(adj) adj_tuple = sparse_to_tuple(adj_triu) edges = adj_tuple[0] edges_all = sparse_to_tuple(adj)[0] num_test = int(np.floor(edges.shape[0] * test_ratio)) num_val = int(np.floor(edges.shape[0] * 0.05)) all_edge_idx = list(range(edges.shape[0])) np.random.shuffle(all_edge_idx) val_edge_idx = all_edge_idx[:num_val] test_edge_idx = all_edge_idx[num_val:(num_val + num_test)] test_edges = edges[test_edge_idx] val_edges = edges[val_edge_idx] train_edges = np.delete(edges, np.hstack([test_edge_idx, val_edge_idx]), axis=0) def ismember(a, b, tol=5): rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1) return np.any(rows_close) test_edges_false = [] while len(test_edges_false) < len(test_edges): idx_i = np.random.randint(0, adj.shape[0]) idx_j = np.random.randint(0, adj.shape[0]) if idx_i == idx_j: continue if ismember([idx_i, idx_j], edges_all): continue if test_edges_false: if ismember([idx_j, idx_i], np.array(test_edges_false)): continue if ismember([idx_i, idx_j], np.array(test_edges_false)): continue test_edges_false.append([idx_i, idx_j]) val_edges_false = [] while len(val_edges_false) < len(val_edges): idx_i = np.random.randint(0, adj.shape[0]) idx_j = np.random.randint(0, adj.shape[0]) if idx_i == idx_j: continue if ismember([idx_i, idx_j], train_edges): continue if ismember([idx_j, idx_i], train_edges): continue if ismember([idx_i, idx_j], val_edges): continue if ismember([idx_j, idx_i], val_edges): continue if val_edges_false: if ismember([idx_j, idx_i], np.array(val_edges_false)): continue if ismember([idx_i, idx_j], np.array(val_edges_false)): continue val_edges_false.append([idx_i, idx_j]) assert ~ismember(test_edges_false, edges_all) assert ~ismember(val_edges_false, edges_all) assert ~ismember(val_edges, train_edges) assert ~ismember(test_edges, train_edges) assert ~ismember(val_edges, test_edges) data = np.ones(train_edges.shape[0]) # Re-build adj matrix adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape) adj_train = adj_train + adj_train.T return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false
def density(self, grid, spinor=None, tol=1e-7, eta=False): r""" Expand the density matrix to the charge density on a grid This routine calculates the real-space density components on a specified grid. This is an *in-place* operation that *adds* to the current values in the grid. Note: To calculate :math:`\rho(\mathbf r)` in a unit-cell different from the originating geometry, simply pass a grid with a unit-cell different than the originating supercell. The real-space density is calculated as: .. math:: \rho(\mathbf r) = \sum_{\nu\mu}\phi_\nu(\mathbf r)\phi_\mu(\mathbf r) D_{\nu\mu} While for non-collinear/spin-orbit calculations the density is determined from the spinor component (`spinor`) by .. math:: \rho_{\boldsymbol\sigma}(\mathbf r) = \sum_{\nu\mu}\phi_\nu(\mathbf r)\phi_\mu(\mathbf r) \sum_\alpha [\boldsymbol\sigma \mathbf \rho_{\nu\mu}]_{\alpha\alpha} Here :math:`\boldsymbol\sigma` corresponds to a spinor operator to extract relevant quantities. By passing the identity matrix the total charge is added. By using the Pauli matrix :math:`\boldsymbol\sigma_x` only the :math:`x` component of the density is added to the grid (see `Spin.X`). Parameters ---------- grid : Grid the grid on which to add the density (the density is in ``e/Ang^3``) spinor : (2,) or (2, 2), optional the spinor matrix to obtain the diagonal components of the density. For un-polarized density matrices this keyword has no influence. For spin-polarized it *has* to be either 1 integer or a vector of length 2 (defaults to total density). For non-collinear/spin-orbit density matrices it has to be a 2x2 matrix (defaults to total density). tol : float, optional DM tolerance for accepted values. For all density matrix elements with absolute values below the tolerance, they will be treated as strictly zeros. eta : bool, optional show a progressbar on stdout """ try: # Once unique has the axis keyword, we know we can safely # use it in this routine # Otherwise we raise an ImportError unique([[0, 1], [2, 3]], axis=0) except: raise NotImplementedError( self.__class__.__name__ + '.density requires numpy >= 1.13, either update ' 'numpy or do not use this function!') geometry = self.geometry # Check that the atomic coordinates, really are all within the intrinsic supercell. # If not, it may mean that the DM does not conform to the primary unit-cell paradigm # of matrix elements. It complicates things. 
fxyz = geometry.fxyz f_min = fxyz.min() f_max = fxyz.max() del fxyz, f_min, f_max # Extract sub variables used throughout the loop shape = _a.asarrayi(grid.shape) dcell = grid.dcell # Sparse matrix data csr = self._csr # In the following we don't care about division # So 1) save error state, 2) turn off divide by 0, 3) calculate, 4) turn on old error state old_err = np.seterr(divide='ignore', invalid='ignore') # Placeholder for the resulting coefficients DM = None if self.spin.kind > Spin.POLARIZED: if spinor is None: # Default to the total density spinor = np.identity(2, dtype=np.complex128) else: spinor = _a.arrayz(spinor) if spinor.size != 4 or spinor.ndim != 2: raise ValueError( self.__class__.__name__ + '.density with NC/SO spin, requires a 2x2 matrix.') DM = _a.emptyz([self.nnz, 2, 2]) idx = array_arange(csr.ptr[:-1], n=csr.ncol) if self.spin.kind == Spin.NONCOLINEAR: # non-collinear DM[:, 0, 0] = csr._D[idx, 0] DM[:, 0, 1] = csr._D[idx, 2] + 1j * csr._D[idx, 3] DM[:, 1, 0] = np.conj(DM[:, 0, 1]) DM[:, 1, 1] = csr._D[idx, 1] else: # spin-orbit DM[:, 0, 0] = csr._D[idx, 0] + 1j * csr._D[idx, 4] DM[:, 0, 1] = csr._D[idx, 2] + 1j * csr._D[idx, 3] DM[:, 1, 0] = csr._D[idx, 6] + 1j * csr._D[idx, 7] DM[:, 1, 1] = csr._D[idx, 1] + 1j * csr._D[idx, 5] # Perform dot-product with spinor, and take out the diagonal real part DM = dot(DM, spinor.T)[:, [0, 1], [0, 1]].sum(1).real elif self.spin.kind == Spin.POLARIZED: if spinor is None: spinor = _a.onesd(2) elif isinstance(spinor, Integral): # extract the provided spin-polarization s = _a.zerosd(2) s[spinor] = 1. spinor = s else: spinor = _a.arrayd(spinor) if spinor.size != 2 or spinor.ndim != 1: raise ValueError( self.__class__.__name__ + '.density with polarized spin, requires spinor ' 'argument as an integer, or a vector of length 2') idx = array_arange(csr.ptr[:-1], n=csr.ncol) DM = csr._D[idx, 0] * spinor[0] + csr._D[idx, 1] * spinor[1] else: idx = array_arange(csr.ptr[:-1], n=csr.ncol) DM = csr._D[idx, 0] # Create the DM csr matrix. csrDM = csr_matrix( (DM, csr.col[idx], np.insert(np.cumsum(csr.ncol), 0, 0)), shape=(self.shape[:2]), dtype=DM.dtype) # Clean-up del idx, DM # To heavily speed up the construction of the density we can recreate # the sparse csrDM matrix by summing the lower and upper triangular part. # This means we only traverse the sparse UPPER part of the DM matrix # I.e.: # psi_i * DM_{ij} * psi_j + psi_j * DM_{ji} * psi_i # is equal to: # psi_i * (DM_{ij} + DM_{ji}) * psi_j # Secondly, to ease the loops we extract the main diagonal (on-site terms) # and store this for separate usage csr_sum = [None] * geometry.n_s no = geometry.no primary_i_s = geometry.sc_index([0, 0, 0]) for i_s in range(geometry.n_s): # Extract the csr matrix o_start, o_end = i_s * no, (i_s + 1) * no csr = csrDM[:, o_start:o_end] if i_s == primary_i_s: csr_sum[i_s] = triu(csr) + tril(csr, -1).transpose() else: csr_sum[i_s] = csr # Recreate the column-stacked csr matrix csrDM = ss_hstack(csr_sum, format='csr') del csr, csr_sum # Remove all zero elements (note we use the tolerance here!) csrDM.data = np.where(np.fabs(csrDM.data) > tol, csrDM.data, 0.) # Eliminate zeros and sort indices etc. csrDM.eliminate_zeros() csrDM.sort_indices() csrDM.prune() # 1. 
Ensure the grid has a geometry associated with it sc = grid.sc.copy() # Find the periodic directions pbc = [ bc == grid.PERIODIC or geometry.nsc[i] > 1 for i, bc in enumerate(grid.bc[:, 0]) ] if grid.geometry is None: # Create the actual geometry that encompass the grid ia, xyz, _ = geometry.within_inf(sc, periodic=pbc) if len(ia) > 0: grid.set_geometry(Geometry(xyz, geometry.atoms[ia], sc=sc)) # Instead of looping all atoms in the supercell we find the exact atoms # and their supercell indices. add_R = _a.fulld(3, geometry.maxR()) # Calculate the required additional vectors required to increase the fictitious # supercell by add_R in each direction. # For extremely skewed lattices this will be way too much, hence we make # them square. o = sc.toCuboid(True) sc = SuperCell(o._v + np.diag(2 * add_R), origo=o.origo - add_R) # Retrieve all atoms within the grid supercell # (and the neighbours that connect into the cell) IA, XYZ, ISC = geometry.within_inf(sc, periodic=pbc) XYZ -= grid.sc.origo.reshape(1, 3) # Retrieve progressbar eta = tqdm_eta(len(IA), self.__class__.__name__ + '.density', 'atom', eta) cell = geometry.cell atom = geometry.atom axyz = geometry.axyz a2o = geometry.a2o def xyz2spherical(xyz, offset): """ Calculate the spherical coordinates from indices """ rx = xyz[:, 0] - offset[0] ry = xyz[:, 1] - offset[1] rz = xyz[:, 2] - offset[2] # Calculate radius ** 2 xyz_to_spherical_cos_phi(rx, ry, rz) return rx, ry, rz def xyz2sphericalR(xyz, offset, R): """ Calculate the spherical coordinates from indices """ rx = xyz[:, 0] - offset[0] idx = indices_fabs_le(rx, R) ry = xyz[idx, 1] - offset[1] ix = indices_fabs_le(ry, R) ry = ry[ix] idx = idx[ix] rz = xyz[idx, 2] - offset[2] ix = indices_fabs_le(rz, R) ry = ry[ix] rz = rz[ix] idx = idx[ix] if len(idx) == 0: return [], [], [], [] rx = rx[idx] # Calculate radius ** 2 ix = indices_le(rx**2 + ry**2 + rz**2, R**2) idx = idx[ix] if len(idx) == 0: return [], [], [], [] rx = rx[ix] ry = ry[ix] rz = rz[ix] xyz_to_spherical_cos_phi(rx, ry, rz) return idx, rx, ry, rz # Looping atoms in the sparse pattern is better since we can pre-calculate # the radial parts and then add them. # First create a SparseOrbital matrix, then convert to SparseAtom spO = SparseOrbital(geometry, dtype=np.int16) spO._csr = SparseCSR(csrDM) spA = spO.toSparseAtom(dtype=np.int16) del spO na = geometry.na # Remove the diagonal part of the sparse atom matrix off = na * primary_i_s for ia in range(na): del spA[ia, off + ia] # Get pointers and delete the atomic sparse pattern # The below complexity is because we are not finalizing spA csr = spA._csr a_ptr = np.insert(_a.cumsumi(csr.ncol), 0, 0) a_col = csr.col[array_arange(csr.ptr, n=csr.ncol)] del spA, csr # Get offset in supercell in orbitals off = geometry.no * primary_i_s origo = grid.origo # TODO sum the non-origo atoms to the csrDM matrix # this would further decrease the loops required. # Loop over all atoms in the grid-cell for ia, ia_xyz, isc in zip(IA, XYZ, ISC): # Get current atom ia_atom = atom[ia] IO = a2o(ia) IO_range = range(ia_atom.no) cell_offset = (cell * isc.reshape(3, 1)).sum(0) - origo # Extract maximum R R = ia_atom.maxR() if R <= 0.: warn("Atom '{}' does not have a wave-function, skipping atom.". 
format(ia_atom)) eta.update() continue # Retrieve indices of the grid for the atomic shape idx = grid.index(ia_atom.toSphere(ia_xyz)) # Now we have the indices for the largest orbital on the atom # Subsequently we have to loop the orbitals and the # connecting orbitals # Then we find the indices that overlap with these indices # First reduce indices to inside the grid-cell idx[idx[:, 0] < 0, 0] = 0 idx[shape[0] <= idx[:, 0], 0] = shape[0] - 1 idx[idx[:, 1] < 0, 1] = 0 idx[shape[1] <= idx[:, 1], 1] = shape[1] - 1 idx[idx[:, 2] < 0, 2] = 0 idx[shape[2] <= idx[:, 2], 2] = shape[2] - 1 # Remove duplicates, requires numpy >= 1.13 idx = unique(idx, axis=0) if len(idx) == 0: eta.update() continue # Get real-space coordinates for the current atom # as well as the radial parts grid_xyz = dot(idx, dcell) # Perform loop on connection atoms # Allocate the DM_pj arrays # This will have a size equal to number of elements times number of # orbitals on this atom # In this way we do not have to calculate the psi_j multiple times DM_io = csrDM[IO:IO + ia_atom.no, :].tolil() DM_pj = _a.zerosd([ia_atom.no, grid_xyz.shape[0]]) # Now we perform the loop on the connections for this atom # Remark that we have removed the diagonal atom (it-self) # As that will be calculated in the end for ja in a_col[a_ptr[ia]:a_ptr[ia + 1]]: # Retrieve atom (which contains the orbitals) ja_atom = atom[ja % na] JO = a2o(ja) jR = ja_atom.maxR() # Get actual coordinate of the atom ja_xyz = axyz(ja) + cell_offset # Reduce the ia'th grid points to those that connects to the ja'th atom ja_idx, ja_r, ja_theta, ja_cos_phi = xyz2sphericalR( grid_xyz, ja_xyz, jR) if len(ja_idx) == 0: # Quick step continue # Loop on orbitals on this atom for jo in range(ja_atom.no): o = ja_atom.orbital[jo] oR = o.R # Downsize to the correct indices if jR - oR < 1e-6: ja_idx1 = ja_idx ja_r1 = ja_r ja_theta1 = ja_theta ja_cos_phi1 = ja_cos_phi else: ja_idx1 = indices_le(ja_r, oR) if len(ja_idx1) == 0: # Quick step continue # Reduce arrays ja_r1 = ja_r[ja_idx1] ja_theta1 = ja_theta[ja_idx1] ja_cos_phi1 = ja_cos_phi[ja_idx1] ja_idx1 = ja_idx[ja_idx1] # Calculate the psi_j component psi = o.psi_spher(ja_r1, ja_theta1, ja_cos_phi1, cos_phi=True) # Now add this orbital to all components for io in IO_range: DM_pj[io, ja_idx1] += DM_io[io, JO + jo] * psi # Temporary clean up del ja_idx, ja_r, ja_theta, ja_cos_phi del ja_idx1, ja_r1, ja_theta1, ja_cos_phi1, psi # Now we have all components for all orbitals connection to all orbitals on atom # ia. We simply need to add the diagonal components # Loop on the orbitals on this atom ia_r, ia_theta, ia_cos_phi = xyz2spherical(grid_xyz, ia_xyz) del grid_xyz for io in IO_range: # Only loop halve the range. # This is because: triu + tril(-1).transpose() # removes the lower half of the on-site matrix. for jo in range(io + 1, ia_atom.no): DM = DM_io[io, off + IO + jo] oj = ia_atom.orbital[jo] ojR = oj.R # Downsize to the correct indices if R - ojR < 1e-6: ja_idx1 = slice(None) ja_r1 = ia_r ja_theta1 = ia_theta ja_cos_phi1 = ia_cos_phi else: ja_idx1 = indices_le(ia_r, ojR) if len(ja_idx1) == 0: # Quick step continue # Reduce arrays ja_r1 = ia_r[ja_idx1] ja_theta1 = ia_theta[ja_idx1] ja_cos_phi1 = ia_cos_phi[ja_idx1] # Calculate the psi_j component DM_pj[io, ja_idx1] += DM * oj.psi_spher( ja_r1, ja_theta1, ja_cos_phi1, cos_phi=True) # Calculate the psi_i component # Note that this one *also* zeroes points outside the shell # I.e. this step is important because it "nullifies" all but points where # orbital io is defined. 
psi = ia_atom.orbital[io].psi_spher(ia_r, ia_theta, ia_cos_phi, cos_phi=True) DM_pj[io, :] += DM_io[io, off + IO + io] * psi DM_pj[io, :] *= psi # Temporary clean up ja_idx1 = ja_r1 = ja_theta1 = ja_cos_phi1 = None del ia_r, ia_theta, ia_cos_phi, psi, DM_io # Now add the density grid.grid[idx[:, 0], idx[:, 1], idx[:, 2]] += DM_pj.sum(0) # Clean-up del DM_pj, idx eta.update() eta.close() # Reset the error code for division np.seterr(**old_err)
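# The folding trick used above on the primary supercell block, in isolation:
# triu(M) + tril(M, -1).T accumulates each (i, j)/(j, i) pair once in the
# upper triangle, so DM_ij + DM_ji multiplies psi_i * psi_j a single time.
import numpy as np
from scipy.sparse import csr_matrix, tril, triu

M = csr_matrix(np.array([[1., 2.], [5., 3.]]))
folded = triu(M) + tril(M, -1).transpose()
print(folded.toarray())  # [[1., 7.], [0., 3.]]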
def _triu(a, sparse): if sparse: return sp.triu(a, k=1) return np.triu(a, k=1)
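# Quick check that the two branches of _triu agree on the strict upper triangle:
import numpy as np
import scipy.sparse as sp

a = np.arange(9, dtype=float).reshape(3, 3)
assert (_triu(sp.csr_matrix(a), sparse=True).toarray() == _triu(a, sparse=False)).all()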
def assemble_adjacency_matrix(transition_counts, num_edges, inplace=True, seed=None): """ Computes an adjacency matrix for a graph based on the given transition counts and the desired number of edges. The resulting adjacency matrix will represent a graph with no singleton nodes (however, possibly with multiple connected components). Note ---- The strategy is described in *NetGAN: Generating Graphs via Random Walks* (Bojchevski, Shchur, Zügner, Günnemann, 2018). Parameters ---------- transition_counts: scipy.sparse.csr_matrix [N, N] The transition counts (e.g. obtained from random walks) for all pairs of nodes. Must be symmetric. num_edges: int The number of edges the output adjacency matrix should contain. inplace: bool, default: True Whether the transition_counts matrix may be modified. Otherwise, a copy is performed. seed: int, default: None The seed to use for generating random values. Returns ------- scipy.sparse.csr_matrix A binary adjacency matrix containing the desired number of edges. The function tries to assemble a matrix with `2 * num_edges` entries. However, if `num_edges < transitions_count.shape[0]`, then this cannot be guaranteed. The diagonal of the adjacency matrix is always zero. """ # 1) Setup # pylint: disable=no-member randomizer = np.random.RandomState(seed) # 1.1) Copy if needed if not inplace: transition_counts = transition_counts.copy() # 1.2) Set diagonal to zero transition_counts = transition_counts.tolil() transition_counts.setdiag(0) # 2) Check if the transition matrix can be converted easily if len(transition_counts.nonzero()[0]) // 2 <= num_edges: transition_counts[transition_counts.nonzero()] = 1 transition_counts += transition_counts.T transition_counts[transition_counts > 1] = 1 return transition_counts # 3) Assemble the adjacency matrix according to paper N = transition_counts.shape[0] result = sp.dok_matrix((N, N)) # transition probabilities div = transition_counts.sum(axis=0) div[div <= 0] = 1 P = (transition_counts / div).T # 3.1) Iterate over nodes in random order to sample one neighbor for node in randomizer.permutation(N)[:min(num_edges, N)]: # 3.1.1) Skip if no neighbor for the node is present if P[node].sum() == 0: continue # 3.1.2) Sample neighbor according to probabilities neighbor = randomizer.choice(N, p=P[node].A1) result[node, neighbor] = result[neighbor, node] = 1 # 3.2) Sample remaining edges # 3.2.1) Compute probabilities for drawing num_remaining_edges = int(num_edges - result.sum() / 2) if num_remaining_edges > 0: # equals size of the upper triangular matrix num_choices = (N * N + N) // 2 transition_counts[result.nonzero()] = 0 P_triu = sp.triu(transition_counts).tocsr() P_triu_indices = np.triu_indices_from(transition_counts) probabilities = (P_triu / P_triu.sum())[P_triu_indices] # 3.2.2) Choose edges edges = randomizer.choice(num_choices, replace=False, p=probabilities.A1, size=num_remaining_edges) # 3.2.3) Add edge choices to result rows = P_triu_indices[0][edges] cols = P_triu_indices[1][edges] result[rows, cols] = result[cols, rows] = 1 return result.tocsr()
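# Toy invocation of assemble_adjacency_matrix with a hand-made symmetric
# count matrix (the values below are made up for illustration):
import numpy as np
import scipy.sparse as sp

counts = sp.csr_matrix(np.array([[0, 5, 1, 0],
                                 [5, 0, 2, 1],
                                 [1, 2, 0, 4],
                                 [0, 1, 4, 0]], dtype=float))
adj = assemble_adjacency_matrix(counts, num_edges=3, inplace=False, seed=0)
print(adj.toarray())  # binary, symmetric, zero diagonal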
def mask_test_edges2(adj): # Function to build test set with 10% positive links # NOTE: Splits are randomized and results might slightly deviate from reported numbers in the paper. # TODO: Clean up. # Remove diagonal elements adj = adj - sp.dia_matrix( (adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) adj.eliminate_zeros() # Efficiently check that diag is zero: DmitriyFradkin assert np.sum(adj.diagonal()) == 0 adj_triu = sp.triu(adj) adj_tuple = sparse_to_tuple(adj_triu) edges = adj_tuple[0] edges_all = sparse_to_tuple(adj)[0] num_test = int(np.floor(edges.shape[0] / 10.)) num_val = int(np.floor(edges.shape[0] / 20.)) all_edge_idx = list(range(edges.shape[0])) np.random.shuffle(all_edge_idx) val_edge_idx = all_edge_idx[:num_val] test_edge_idx = all_edge_idx[num_val:(num_val + num_test)] test_edges = edges[test_edge_idx] val_edges = edges[val_edge_idx] train_edges = np.delete(edges, np.hstack([test_edge_idx, val_edge_idx]), axis=0) data = np.ones(train_edges.shape[0]) # Re-build adj matrix adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape) adj_train = adj_train + adj_train.T #def ismember(a, b, tol=5): # rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1) # return np.any(rows_close) print('Generating test_edges_false {}'.format(datetime.now())) ### all edges - symmetric edges_all_set = set([(x[0], x[1]) for x in edges_all]) # generate initial set randomly: test_edges_false = generate_random_pairs(adj.shape[0], len(test_edges)) # make sure it doesn't have real edges: test_edges_false = test_edges_false - edges_all_set # add as many edges as needed: while len(test_edges_false) < len(test_edges): idx_i = np.random.randint(0, adj.shape[0]) idx_j = np.random.randint(0, adj.shape[0]) if idx_i == idx_j or (idx_i, idx_j) in edges_all_set: continue if (idx_j, idx_i) in test_edges_false or (idx_i, idx_j) in test_edges_false: continue test_edges_false.add((idx_i, idx_j)) print('Generating val_edges_false {}'.format(datetime.now())) val_edges_false = generate_random_pairs(adj.shape[0], len(val_edges)) # remove edges already existing or in test_false: val_edges_false = val_edges_false - edges_all_set val_edges_false = val_edges_false - test_edges_false while len(val_edges_false) < len(val_edges): idx_i = np.random.randint(0, adj.shape[0]) idx_j = np.random.randint(0, adj.shape[0]) if idx_i == idx_j or (idx_i, idx_j) in edges_all_set: continue if (idx_i, idx_j) in test_edges_false or (idx_j, idx_i) in test_edges_false: continue if (idx_i, idx_j) in val_edges_false or (idx_j, idx_i) in val_edges_false: continue val_edges_false.add((idx_i, idx_j)) # assert ~ismember(test_edges_false, edges_all) # assert ~ismember(val_edges_false, edges_all) # assert ~ismember(val_edges, train_edges) # assert ~ismember(test_edges, train_edges) # assert ~ismember(val_edges, test_edges) # convert sets to numpy arrays: test_edges_false = np.array([np.array(x) for x in test_edges_false]) val_edges_false = np.array([np.array(x) for x in val_edges_false]) # NOTE: these edge lists only contain single direction of edge! return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false
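# generate_random_pairs is not defined above; a plausible minimal stand-in
# (hypothetical, not the original helper) that returns a set of random
# (i, j) pairs with i != j:
import numpy as np

def generate_random_pairs(n_nodes, n_pairs):
    idx_i = np.random.randint(0, n_nodes, size=n_pairs)
    idx_j = np.random.randint(0, n_nodes, size=n_pairs)
    mask = idx_i != idx_j  # drop self-pairs; the caller tops the set up anyway
    return set(zip(idx_i[mask].tolist(), idx_j[mask].tolist()))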
def write_hamiltonian(self, ham, hermitian=True, **kwargs): """ Writes the Hamiltonian model to the file Writes a Hamiltonian model to the intrinsic Hamiltonian file format. The file can be constructed by the implict force of Hermiticity, or without. Utilizing the Hermiticity we reduce the file-size by approximately 50%. Parameters ---------- ham : `Hamiltonian` model hermitian : boolean=True whether the stored data is halved using the Hermitian property """ # We use the upper-triangular form of the Hamiltonian # and the overlap matrix for hermitian problems geom = ham.geometry # First write the geometry self.write_geometry(geom, **kwargs) # We default to the advanced layuot if we have more than one # orbital on any one atom advanced = kwargs.get( 'advanced', np.any(np.array([a.no for a in geom.atom.atom], np.int32) > 1)) fmt = kwargs.get('fmt', 'g') if advanced: fmt1_str = ' {{0:d}}[{{1:d}}] {{2:d}}[{{3:d}}] {{4:{0}}}\n'.format( fmt) fmt2_str = ' {{0:d}}[{{1:d}}] {{2:d}}[{{3:d}}] {{4:{0}}} {{5:{0}}}\n'.format( fmt) else: fmt1_str = ' {{0:d}} {{1:d}} {{2:{0}}}\n'.format(fmt) fmt2_str = ' {{0:d}} {{1:d}} {{2:{0}}} {{3:{0}}}\n'.format(fmt) # We currently force the model to be finalized # before we can write it # This should be easily circumvented H = ham.tocsr(0) if not ham.orthogonal: S = ham.tocsr(ham.S_idx) # If the model is Hermitian we can # do with writing out half the entries if hermitian: herm_acc = kwargs.get('herm_acc', 1e-6) # We check whether it is Hermitian (not S) for i, isc in enumerate(geom.sc.sc_off): oi = i * geom.no oj = geom.sc_index(-isc) * geom.no # get the difference between the ^\dagger elements diff = H[:, oi:oi + geom.no] - \ H[:, oj:oj + geom.no].transpose() diff.eliminate_zeros() if np.any(np.abs(diff.data) > herm_acc): amax = np.amax(np.abs(diff.data)) warn( SileWarning( 'The model could not be asserted to be Hermitian ' 'within the accuracy required ({0}).'.format( amax))) hermitian = False del diff if hermitian: # Remove all double stuff for i, isc in enumerate(geom.sc.sc_off): if np.any(isc < 0): # We have ^\dagger element, remove it o = i * geom.no # Ensure that we remove all nullified quantities # (setting elements to zero will add them internally # :(, hence this actually constructs the full matrix # Therefore we do it on a row basis, to limit memory # requirements for j in range(geom.no): H[j, o:o + geom.no] = 0. H.eliminate_zeros() if not ham.orthogonal: S[j, o:o + geom.no] = 0. S.eliminate_zeros() o = geom.sc_index(np.zeros([3], np.int32)) # Get upper-triangular matrix of the unit-cell H and S ut = triu(H[:, o:o + geom.no], k=0).tocsr() for j in range(geom.no): H[j, o:o + geom.no] = 0. H[j, o:o + geom.no] = ut[j, :] H.eliminate_zeros() if not ham.orthogonal: ut = triu(S[:, o:o + geom.no], k=0).tocsr() for j in range(geom.no): S[j, o:o + geom.no] = 0. 
S[j, o:o + geom.no] = ut[j, :] S.eliminate_zeros() # Ensure that S and H have the same sparsity pattern for jo, io in ispmatrix(S): H[jo, io] = H[jo, io] del ut # Start writing of the model # We loop on all super-cells for i, isc in enumerate(geom.sc.sc_off): # Check that we have any contributions in this # sub-section Hsub = H[:, i * geom.no:(i + 1) * geom.no] if not ham.orthogonal: Ssub = S[:, i * geom.no:(i + 1) * geom.no] if Hsub.getnnz() == 0: continue # We have a contribution, write out the information self._write('\nbegin matrix {0:d} {1:d} {2:d}\n'.format(*isc)) if advanced: for jo, io, h in ispmatrixd(Hsub): o = np.array([jo, io], np.int32) a = geom.o2a(o) o = o - geom.a2o(a) if not ham.orthogonal: s = Ssub[jo, io] elif jo == io: s = 1. else: s = 0. if s == 0.: self._write(fmt1_str.format(a[0], o[0], a[1], o[1], h)) else: self._write( fmt2_str.format(a[0], o[0], a[1], o[1], h, s)) else: for jo, io, h in ispmatrixd(Hsub): if not ham.orthogonal: s = Ssub[jo, io] elif jo == io: s = 1. else: s = 0. if s == 0.: self._write(fmt1_str.format(jo, io, h)) else: self._write(fmt2_str.format(jo, io, h, s)) self._write('end matrix {0:d} {1:d} {2:d}\n'.format(*isc))
def _compute_global_cell_graph_features(
        centroids,
        neighbor_distances,
        neighbor_counts,
):
    """Internal support for compute_global_cell_graph_features that
    returns its result in a nested namedtuple structure instead of a
    pandas DataFrame.
    """
    vor = Voronoi(centroids)
    centroids = vor.points
    vertices = vor.vertices

    regions = [r for r in vor.regions if r and -1 not in r]
    # np.stack needs a sequence, not a generator, in recent numpy
    areas = np.stack([_poly_area(vertices[r]) for r in regions])
    peris = np.stack([_poly_peri(vertices[r]) for r in regions])
    max_dists = np.stack([pdist(vertices[r]).max() for r in regions])
    poly_props = PolyProps._make(map(_pop_stats, (areas, peris, max_dists)))

    de = Delaunay(centroids)
    # From the docs: "Coplanar points are input points which were not
    # included in the triangulation due to numerical precision issues."
    # I don't know how this would affect the results if present, and it
    # doesn't appear to happen, so it's excluded here.
    assert not de.coplanar.size
    indptr, indices = de.vertex_neighbor_vertices
    bin_connectivity = sparse.csr_matrix(
        (np.ones(len(indices), dtype=bool), indices, indptr),
        (len(centroids), ) * 2)
    ridge_points = sparse.triu(bin_connectivity, format='coo')
    ridge_points = np.stack((ridge_points.row, ridge_points.col), axis=-1)
    # This isn't exactly the collection of sides, since if they should
    # be counted per-triangle then we weight border ridges wrong
    # relative to ridges that are part of two triangles.
    ridge_lengths = _dist(*np.swapaxes(centroids[ridge_points], 0, 1))
    sides = ridge_lengths
    areas = np.stack([_poly_area(centroids[t]) for t in de.simplices])
    tri_props = TriProps._make(map(_pop_stats, (sides, areas)))

    graph = sparse.coo_matrix((ridge_lengths, ridge_points.T),
                              (len(centroids), len(centroids)))
    mst = minimum_spanning_tree(graph)
    # Without looking into exactly how minimum_spanning_tree constructs
    # its output, eliminate any explicit zeros to be on the safe side.
    mst_branches = _pop_stats(mst.data[mst.data != 0])

    tree = KDTree(centroids)
    neighbors_in_distance = {
        # Yes, we just throw away the actual points
        r: _pop_stats(np.stack([len(nbrs) for nbrs in tree.query_ball_tree(tree, r)]) - 1)
        for r in neighbor_distances
    }
    distance_for_neighbors = dict(zip(
        neighbor_counts,
        map(_pop_stats,
            tree.query(centroids, [c + 1 for c in neighbor_counts])[0].T),
    ))
    density_props = DensityProps(neighbors_in_distance, distance_for_neighbors)
    return Props(poly_props, tri_props, mst_branches, density_props)
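# The unique-edge extraction above in a standalone form: build a boolean CSR
# adjacency from Delaunay's vertex_neighbor_vertices, then keep each edge
# once via the upper triangle.
import numpy as np
from scipy import sparse
from scipy.spatial import Delaunay

pts = np.random.default_rng(0).random((20, 2))
de = Delaunay(pts)
indptr, indices = de.vertex_neighbor_vertices
conn = sparse.csr_matrix((np.ones(len(indices), dtype=bool), indices, indptr),
                         shape=(len(pts), len(pts)))
edges = sparse.triu(conn, format='coo')
print(np.stack((edges.row, edges.col), axis=-1))  # each edge listed once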
def test_triul(shape, k): s = sparse.random(shape, density=0.5) x = s.todense() assert_eq(np.triu(x, k), sparse.triu(s, k)) assert_eq(np.tril(x, k), sparse.tril(s, k))
for EI in G6.Edges():
    print(EI.GetSrcNId(), EI.GetDstNId())

import random

import networkx as nx
import numpy as np
from scipy import sparse

random.seed(10)
np.random.seed(123)

p = 5
d = 1
# G = nx.scale_free_graph(p)
S = nx.barabasi_albert_graph(p, d)
S = nx.adjacency_matrix(S)
S = sparse.triu(S)
row_ix, col_ix = sparse.find(S)[0:2]
n_nonzero = len(sparse.find(S)[2])
S = S.todense().astype(float)
S0 = S.copy()
# draw each edge weight uniformly from [-1, -0.5) or [0.5, 1)
for i in range(n_nonzero):
    r = np.random.uniform(0, 1.)
    S[row_ix[i], col_ix[i]] = r - 1. if r < 0.5 else r
vec_div = 1.5 * np.sum(np.absolute(S), axis=1)
for i in range(p):
    if vec_div[i]:  # standardize only when the row's absolute sum is non-zero
        S[i, :] = S[i, :] / vec_div[i]
A = (S + S.T) / 2 + np.matrix(np.eye(p))  # check if A is PD
row = [] col = [] for contig in contigs: for spacer in d[contig]: row.append(contigs_id[contig]) col.append(spacers_id[spacer]) data = np.ones(len(row)) from scipy.sparse import csr_matrix, find contig_spacer_mat = csr_matrix((data, (row, col)), shape=(len(contigs), len(spacers))) spacer_cooccur_mat = contig_spacer_mat.T * contig_spacer_mat i, j, v = find(spacer_cooccur_mat) diag = spacer_cooccur_mat.diagonal() w = np.where( np.logical_and(2 * v / (diag[i] + diag[j]) >= args.min_dice_coefficient, v >= args.min_co_occurance), v, 0) spacer_cooccur_mat_ = csr_matrix((w, (i, j)), shape=spacer_cooccur_mat.shape) spacer_cooccur_mat_.setdiag(0) spacer_cooccur_mat_.eliminate_zeros() from scipy.sparse import triu for i, j, v in zip(*find(triu(spacer_cooccur_mat_, k=1))): print(spacers[i], spacers[j], v)
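# Tiny worked example of the Dice filter above: two spacers co-occur on 2
# contigs while appearing on 3 and 2 contigs respectively, so
# dice = 2*2 / (3 + 2) = 0.8.
import numpy as np
from scipy.sparse import csr_matrix, find

contig_spacer = csr_matrix(np.array([[1, 1],
                                     [1, 1],
                                     [1, 0]]))
cooc = contig_spacer.T * contig_spacer
i, j, v = find(cooc)
diag = cooc.diagonal()
dice = 2 * v / (diag[i] + diag[j])
for a, b, d in zip(i, j, dice):
    if a < b:
        print(a, b, d)  # 0 1 0.8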
def mask_bipartite_perturbation_test_edges(adj): print('args.dataset: ', args.dataset) with open('data/bipartite/id2name/'+ str(args.dataset) +'u2id.pkl', 'rb') as f: u2id = pickle.load(f) with open('data/bipartite/id2name/'+ str(args.dataset) +'v2id.pkl', 'rb') as f: v2id = pickle.load(f) # Function to build test set with 10% positive links # NOTE: Splits are randomized and results might slightly deviate from reported numbers in the paper. # TODO: Clean up. # Remove diagonal elements adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape) adj.eliminate_zeros() # Check that diag is zero: assert np.diag(adj.todense()).sum() == 0 adj_triu = sp.triu(adj) adj_tuple = sparse_to_tuple(adj_triu) edges = adj_tuple[0] edges_all = sparse_to_tuple(adj)[0] ''' original training/test''' num_test = int(np.floor(edges.shape[0] / args.num_test)) num_val = int(np.floor(edges.shape[0] / 20.)) all_edge_idx = list(range(edges.shape[0])) np.random.seed(args.edge_idx_seed) np.random.shuffle(all_edge_idx) args.edge_idx_seed += 1 val_edge_idx = all_edge_idx[:num_val] test_edge_idx = all_edge_idx[num_val:(num_val + num_test)] test_edges = edges[test_edge_idx] val_edges = edges[val_edge_idx] train_edges = np.delete(edges, np.hstack([test_edge_idx, val_edge_idx]), axis=0) # Re-build adj matrix data = np.ones(train_edges.shape[0]) adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape) # adj_train = adj_train + adj_train.T def ismember(a, b, tol=5): rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1) return np.any(rows_close) def isSetValidMember(a,b): setA = set() setB = set() for (x,y) in a: setA.add((x,y)) for (x,y) in b: setA.add((x,y)) return len(setA.intersection(setB)) > 0 def isSetMember(a,b): setA = set() setB = set() for (x,y) in a: setA.add((x,y)) for index in range(b.shape[0]): setB.add((b[index,0],b[index,1])) return len(setA.intersection(setB)) > 0 if args.use_saved_edge_false: with open(str(args.dataset) +'_test_edges_false.pkl', 'rb') as f: test_edges_false = pickle.load(f) with open(str(args.dataset) +'_val_edges_false.pkl', 'rb') as f: val_edges_false = pickle.load(f) print('len(train_edges): ',len(train_edges)) print('len(test_edges): ',len(test_edges)) print('len(edges): ', len(edges)) assert ~isSetMember(test_edges_false, edges) print('~isSetMember(test_edges_false, edges) is True') assert ~isSetMember(val_edges_false, edges) print('~isSetMember(val_edges_false, edges) is True') assert ~isSetMember(val_edges, train_edges) print('~isSetMember(val_edges, train_edges) is True') assert ~isSetMember(test_edges, train_edges) print('~isSetMember(test_edges, train_edges) is True') assert ~isSetMember(val_edges, test_edges) print('~isSetMember(val_edges, test_edges) is True') return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false, edges_all, None test_edges_false = [] val_edges_false = [] ''' only for large datasets ''' # if args.dataset == 'movie1m' or args.dataset == 'movie100k' or args.dataset == 'pubmed' or args.dataset == 'nanet': top_right_adj = adj[:len(u2id),len(u2id):].toarray() indexes = np.where(top_right_adj==0.0) np.random.seed(args.edge_idx_seed) np.random.shuffle(indexes[0]) np.random.seed(args.edge_idx_seed) np.random.shuffle(indexes[1]) val_index_i = indexes[0][:num_val] val_index_j = np.array(indexes[1][:num_val]) + len(u2id) test_index_i = indexes[0][num_val:num_test+num_val] test_index_j = np.array(indexes[1][num_val:num_test+num_val]) + len(u2id) false_edges = [] for i in 
range(len(indexes[0])): idx_i = indexes[0][i] idx_j = indexes[1][i] false_edges.append([idx_i, idx_j]) for i in range(len(val_edges)): idx_i = val_index_i[i] idx_j = val_index_j[i] val_edges_false.append([idx_i, idx_j]) for i in range(len(test_edges)): idx_i = test_index_i[i] idx_j = test_index_j[i] test_edges_false.append([idx_i, idx_j]) # print(test_edges_false) # print(val_edges_false) # print(np.hstack([val_edges_false, test_edges_false])) train_false_edges = np.delete(false_edges, val_edges_false + test_edges_false, axis=0) train_false_edges = train_false_edges[:len(train_edges)] assert ~isSetMember(test_edges_false, edges) print('~isSetMember(test_edges_false, edges) is True') assert ~isSetMember(val_edges_false, edges) print('~isSetMember(val_edges_false, edges) is True') assert ~isSetMember(val_edges, train_edges) print('~isSetMember(val_edges, train_edges) is True') assert ~isSetMember(test_edges, train_edges) print('~isSetMember(test_edges, train_edges) is True') assert ~isSetMember(val_edges, test_edges) print('~isSetMember(val_edges, test_edges) is True') assert ~isSetValidMember(val_edges_false, test_edges_false) print('~isSetMember(val_edges_false, test_edges_false) is True') print('len(train_edges): ',len(train_edges)) print('len(val_edges): ',len(val_edges)) print('len(test_edges): ',len(test_edges)) print('len(edges): ', len(edges)) print('len(val_edges_false):', len(val_edges_false)) print('len(test_edges_false):', len(test_edges_false)) print('len(false_edges):', len(false_edges)) print('len(edges_all):', len(edges_all)) # print('train false edges!') return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false, edges_all, false_edges
def main(args=None): args = parse_arguments().parse_args(args) log.debug(args) # parse from hicpro, homer, h5 and hic to cool if args.inputFormat != 'hic' and args.outputFormat != 'mcool': if len(args.matrices) != len(args.outFileName): log.error( 'Number of input matrices does not match number output matrices!' ) exit(1) if args.inputFormat == 'hic' and args.outputFormat == 'cool': log.info('Converting with hic2cool.') for i, matrix in enumerate(args.matrices): if args.resolutions is None: hic2cool_convert(matrix, args.outFileName[i], 0) else: for resolution in args.resolutions: out_name = args.outFileName[i].split('.') out_name[-2] = out_name[-2] + '_' + str(resolution) out_name = '.'.join(out_name) hic2cool_convert(matrix, out_name, resolution) return elif args.inputFormat in ['hicpro', 'homer', 'h5', 'cool']: format_was_h5 = False if args.inputFormat == 'h5': format_was_h5 = True applyCorrection = True if args.store_applied_correction: applyCorrection = False if args.inputFormat == 'hicpro': if len(args.matrices) != len(args.bedFileHicpro): log.error( 'Number of matrices and associated bed files need to be the same.' ) log.error('Matrices: {}; Bed files: {}'.format( len(args.matrices), len(args.bedFileHicpro))) sys.exit(1) for i, matrix in enumerate(args.matrices): if args.inputFormat == 'hicpro': matrixFileHandlerInput = MatrixFileHandler( pFileType=args.inputFormat, pMatrixFile=matrix, pBedFileHicPro=args.bedFileHicpro[i]) else: correction_operator = None if args.correction_division: correction_operator = '/' chromosomes_to_load = None if args.chromosome: chromosomes_to_load = [args.chromosome] applyCorrectionCoolerLoad = True if args.load_raw_values: applyCorrectionCoolerLoad = False matrixFileHandlerInput = MatrixFileHandler( pFileType=args.inputFormat, pMatrixFile=matrix, pCorrectionFactorTable=args.correction_name, pCorrectionOperator=correction_operator, pChrnameList=chromosomes_to_load, pEnforceInteger=args.enforce_integer, pApplyCorrectionCoolerLoad=applyCorrectionCoolerLoad) _matrix, cut_intervals, nan_bins, \ distance_counts, correction_factors = matrixFileHandlerInput.load() log.debug('Setting done') if args.outputFormat in ['cool', 'h5', 'homer', 'ginteractions']: if args.outputFormat in ['homer', 'ginteractions']: # make it a upper triangular matrix in case it is not already _matrix = triu(_matrix) # make it a full symmetrical matrix _matrix = _matrix.maximum(_matrix.T) matrixFileHandlerOutput = MatrixFileHandler( pFileType=args.outputFormat, pEnforceInteger=args.enforce_integer, pFileWasH5=format_was_h5) matrixFileHandlerOutput.set_matrix_variables( _matrix, cut_intervals, nan_bins, correction_factors, distance_counts) matrixFileHandlerOutput.save(args.outFileName[i], pSymmetric=True, pApplyCorrection=applyCorrection) elif args.outputFormat in ['mcool']: log.debug('outformat is mcool') if args.resolutions and len(args.matrices) > 1: log.error( 'Please define one matrix and many resolutions which should be created or multiple matrices.' ) if args.resolutions: log.info( 'Correction factors are removed. They are not valid for any new created resolution.' 
) hic_matrix = HiCMatrix.hiCMatrix() hic_matrix.setMatrix(_matrix, cut_intervals) bin_size = hic_matrix.getBinSize() for j, resolution in enumerate(args.resolutions): hic_matrix_res = deepcopy(hic_matrix) _mergeFactor = int(resolution) // bin_size log.debug('bin size {}'.format(bin_size)) log.debug('_mergeFactor {}'.format(_mergeFactor)) if int(resolution) != bin_size: merged_matrix = hicMergeMatrixBins.merge_bins( hic_matrix_res, _mergeFactor) else: merged_matrix = hic_matrix_res append = False if j > 0: append = True matrixFileHandlerOutput = MatrixFileHandler( pFileType='cool', pEnforceInteger=args.enforce_integer, pAppend=append, pFileWasH5=format_was_h5) matrixFileHandlerOutput.set_matrix_variables( merged_matrix.matrix, merged_matrix.cut_intervals, merged_matrix.nan_bins, merged_matrix.correction_factors, merged_matrix.distance_counts) matrixFileHandlerOutput.save( args.outFileName[0] + '::/resolutions/' + str(resolution), pSymmetric=True, pApplyCorrection=applyCorrection) else: append = False if i > 0: append = True hic_matrix = HiCMatrix.hiCMatrix() hic_matrix.setMatrix(_matrix, cut_intervals) bin_size = hic_matrix.getBinSize() matrixFileHandlerOutput = MatrixFileHandler( pFileType='cool', pAppend=append, pFileWasH5=format_was_h5) matrixFileHandlerOutput.set_matrix_variables( _matrix, cut_intervals, nan_bins, correction_factors, distance_counts) matrixFileHandlerOutput.save( args.outFileName[0] + '::/resolutions/' + str(bin_size), pSymmetric=True, pApplyCorrection=applyCorrection)