def parse_input(graph_file):
    """Read a colouring instance and build an undirected graph-tool graph.

    The first line of *graph_file* is parsed as the integer colour target.
    Every following non-blank line must start with two whitespace-separated
    integer vertex ids describing one edge.

    Parallel edges are collapsed and vertices without any edge are removed
    before the graph is returned.

    Returns:
        tuple: ``(color_target, graph)``.
    """
    graph = gt.Graph()
    graph.set_directed(is_directed=False)

    with open(graph_file, 'r') as in_file:
        lines = in_file.readlines()

    color_target = int(lines[0])
    for line in lines[1:]:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing empty line).
            continue
        source = int(fields[0])
        target = int(fields[1])

        # Always store the edge as (smaller id, larger id).
        graph.add_edge(source=min(source, target),
                       target=max(source, target),
                       add_missing=True)

        if source == target:
            # The loop is reported but still added, as before.
            print('error: tried to add loop')
            print("self edge")

    gt.remove_parallel_edges(graph)

    # Drop vertices that ended up without edges (ids skipped in the input
    # are created implicitly by add_missing=True).
    isolated = [vertex for vertex in graph.vertices()
                if vertex.out_degree() == 0]
    graph.remove_vertex(isolated)
    return color_target, graph
def load_vna(in_file):
    """Load a VNA file into an undirected graph-tool graph.

    The parser looks for a ``*node properties`` / ``*node data`` section
    (case-insensitive) whose header must contain an ``id`` column, reads one
    vertex per line until the next line starting with ``*``, then skips to
    the ``*tie data`` section, whose header must start with ``from to``.
    The first column of every vertex line is used as the VNA id.

    Returns:
        The graph, with parallel edges removed.

    Raises:
        StopIteration: if an expected section is missing.
        AssertionError: if a section header lacks the required columns.
        KeyError: if an edge references an unknown vertex id.
    """
    with open(in_file) as f:
        all_lines = f.read().splitlines()

    it = iter(all_lines)

    # Ignore preamble
    line = next(it)
    while not line.lower().startswith(('*node properties', '*node data')):
        line = next(it)

    node_properties = [word.lower() for word in next(it).split(' ')]
    assert ('id' in node_properties)

    vertices = dict()
    line = next(it)
    gt_idx = 0  # Index for gt
    while not line.startswith('*'):
        entries = line.split(' ')
        vna_id = entries[0]
        vertex = dict()
        for i, prop in enumerate(node_properties):
            vertex[prop] = entries[i]
        vertex['id'] = gt_idx  # Replace VNA ID by numerical gt index
        vertices[vna_id] = vertex  # Retain VNA ID as key of the vertices dict
        gt_idx += 1
        line = next(it)

    # Skip node properties, if any
    while not line.lower().startswith('*tie data'):
        line = next(it)

    edge_properties = next(it).split(' ')
    assert (edge_properties[0] == 'from' and edge_properties[1] == 'to')

    # Everything left in the iterator is edge data.
    edges = []
    for line in it:
        if not line.strip():
            # Blank lines carry no edge; skip them rather than fail.
            continue
        entries = line.split(' ')
        v_i = vertices[entries[0]]['id']
        v_j = vertices[entries[1]]['id']
        edges.append((v_i, v_j))

    g = gt.Graph(directed=False)
    g.add_vertex(len(vertices))
    for v_i, v_j in edges:
        g.add_edge(v_i, v_j)
    gt.remove_parallel_edges(g)

    return g
def load_tulip_layout(in_file):
    """Load a graph together with the 2-D layout stored in its 'graphics' map.

    The graph is made undirected and parallel edges are removed. Row ``i``
    of the returned array holds the ``'x'`` and ``'y'`` entries of the
    ``graphics`` vertex property of vertex ``i``.

    Returns:
        tuple: ``(graph, Y)`` where ``Y`` has shape ``(num_vertices, 2)``.
    """
    graph = gt.load_graph(in_file)
    graph.set_directed(False)
    gt.remove_parallel_edges(graph)

    gfx = graph.vertex_properties['graphics']
    n_rows = graph.num_vertices()
    Y = np.zeros((n_rows, 2))

    row = 0
    while row < n_rows:
        Y[row, :] = [gfx[row]['x'], gfx[row]['y']]
        row += 1

    return graph, Y
def create_graph(adjacency):
    """Build an undirected graph-tool graph from an adjacency mapping.

    One vertex is created per entry of *adjacency*. For every item ``v``
    obtained by iterating *adjacency*, an edge ``(v, u)`` is added for each
    ``u`` in ``adjacency[v]`` unless that edge already exists.
    NOTE(review): this presumably expects a dict keyed by vertex index --
    confirm against the callers.

    Progress is printed every 1000 processed entries.
    """
    G = gt.Graph(directed=False)
    G.add_vertex(len(adjacency))

    for idx, v in enumerate(adjacency):
        if idx % 1000 == 0 and idx > 0:
            # print() with one argument behaves the same on Python 2 and 3.
            print(idx)

        for u in adjacency[v]:
            if not G.edge(v, u):
                G.add_edge(v, u)

    # NOTE: Parallel edges will be removed, if any.
    gt.remove_parallel_edges(G)
    return G
def load_graph(path, algorithms, format='graphml', component=False):
    """Load a network from *path* and report the loading time on stdout.

    Args:
        path: file handed to ``gt.load_graph``.
        algorithms: container of algorithm names; when it contains
            ``'kores'`` parallel edges and self loops are removed.
        format: file format passed on as ``fmt``.
        component: when true, keep only the largest component (computed
            with ``directed=False``) and purge all other vertices.

    Returns:
        The loaded (and possibly reduced) graph.
    """
    out = sys.stdout
    out.write('Loading network ...')
    out.flush()

    began = time.time()
    network = gt.load_graph(path, fmt=format)

    if 'kores' in algorithms:
        gt.remove_parallel_edges(network)
        gt.remove_self_loops(network)

    if component:
        keep = gt.label_largest_component(network, directed=False)
        network.set_vertex_filter(keep)
        network.purge_vertices()

    elapsed = time.time() - began
    out.write('Ok! ({0} s.)\n'.format(elapsed))
    return network
def save_largest_component():
    """Reduce the global ``Graph`` to its largest component and save it.

    Vertices labelled 0 by ``gt.label_largest_component`` are removed,
    parallel edges are dropped, and the result is written to
    ``base_path + graph_tool_file`` (both module-level names).
    """
    global Graph
    labels = gt.label_largest_component(Graph)
    # print() with one argument behaves the same on Python 2 and 3.
    print(labels.a)

    # Indices of all vertices outside the largest component.
    remove = [idx for idx, flag in enumerate(labels.a) if flag == 0]
    Graph.remove_vertex(remove)
    #u = gt.GraphView(Graph, vfilt=l)
    gt.remove_parallel_edges(Graph)
    Graph.save(base_path + graph_tool_file)
def load_graph(file):
    """Load a graph, choosing the reader from the file extension.

    ``.mtx``, ``.csv``, ``.graph`` and ``.vna`` files go to the dedicated
    loaders; anything else is handed to ``gt.load_graph``. The result is
    made undirected and stripped of parallel edges before it is returned.
    """
    suffix = os.path.splitext(file)[1]

    if suffix == '.mtx':
        g = load_mm(file)
    elif suffix == '.csv':
        g = load_csv(file)
    elif suffix == '.graph':
        g = load_chaco(file)
    elif suffix == '.vna':
        g = load_vna(file)
    else:
        # Give the file to graph_tool and hope for the best.
        g = gt.load_graph(file)

    g.set_directed(False)
    gt.remove_parallel_edges(g)

    return g
def sndlib(f):
    """Read an SNDlib network XML file into an undirected graph.

    Nodes become vertices (in document order), links become edges, and the
    node coordinates -- with the y axis negated -- are passed through
    ``scale`` and stored in a ``vector<float>`` vertex property.

    Returns:
        tuple: ``(graph, pos)``; parallel edges and self loops are removed.
    """
    from xml.etree import ElementTree as ET

    tree = ET.parse(f)
    ns = {'s': 'http://sndlib.zib.de/network'}
    nodes = tree.findall('*/*/s:node', ns)

    g = gt.Graph(directed=False)
    g.add_vertex(len(nodes))

    # Map the XML node id to the vertex index.
    index = {}
    for i, node in enumerate(nodes):
        index[node.get('id')] = i

    for link in tree.findall('*/*/s:link', ns):
        src = index[link.find('s:source', ns).text]
        dst = index[link.find('s:target', ns).text]
        g.add_edge(src, dst)

    coords = []
    for node in nodes:
        x = float(node.find('*/s:x', ns).text)
        y = -float(node.find('*/s:y', ns).text)
        coords.append((x, y))
    pos = g.new_vertex_property('vector<float>', scale(np.array(coords)))

    gt.remove_parallel_edges(g)
    gt.remove_self_loops(g)
    return g, pos
def remove_equiv_nodes(qgraph, S_Q):
    r'''
    Remove equivalent nodes from qgraph.
    u is equivalent to v iff v\in S_Q[u] and u\in S_Q[v]

    For every pair of unmarked equivalent nodes (u, v), the in- and
    out-edges of v are replicated on u (unless already present) and v is
    marked. Marked nodes are then filtered out and purged.

    Returns a GraphView of qgraph without the marked nodes.
    '''
    # 0 = keep, 1 = merged into an equivalent node.
    mark = [0] * qgraph.num_vertices()
    for u in qgraph.vertices():
        for v in qgraph.vertices():
            if (mark[int(u)] == 0 and mark[int(v)] == 0
                    and int(u) != int(v)
                    and u in S_Q[v] and v in S_Q[u]):
                # Redirect v's incoming edges to u.
                for v_p in v.in_neighbors():
                    if not qgraph.edge(v_p, u):
                        qgraph.add_edge(v_p, u)
                # Redirect v's outgoing edges to start at u.
                for v_s in v.out_neighbors():
                    if not qgraph.edge(u, v_s):
                        qgraph.add_edge(u, v_s)
                mark[int(v)] = 1

    qgraph_view = gt.GraphView(qgraph, vfilt=lambda v: mark[int(v)] == 0)
    gt.remove_parallel_edges(qgraph_view)
    qgraph_view.purge_vertices()
    return qgraph_view
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
#INPUT
element = int(sys.argv[1])
#---------------------------------------------------------------------------
#---------------------------------------------------------------------------
print(element)

#-------------------------------------
#Variables
SCREENING_NETWORK = ""
#------------------------------------


def _load_pickle(path):
    """Unpickle *path*, closing the file handle afterwards.

    NOTE: pickle must only be used on trusted files.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Loading of a second graph used in previous studies. It is larger and shows
# more than 90% of the 4HAP graph. It potentially includes more feedstock and
# pharmaceutical regions.
g = gt.load_graph(SCREENING_NETWORK)
gt.remove_parallel_edges(g)

# A timer is started for the CPU comparison of the two methods.
# NOTE(review): time.clock() was removed in Python 3.8; switch both this call
# and the matching stop call to time.process_time() together.
start = time.clock()

in_network = _load_pickle("pharma_in_network.p")
in_network_xrn = _load_pickle("pharma_in_network_xrn.p")
HUBS_ID = _load_pickle("hubs_Int.p")
HUBS_XRN = _load_pickle("hubs_xrn.p")

# Double-check that the loaded files are correct.
print("testing:There are:", len(HUBS_ID), len(HUBS_XRN), " hubs")

# Hardcoded biofeed input! Change as soon as more data is available.
Biofeed = [9896190, 345687, 2239190]
def useGraphTool(pd, space):
    """Print statistics about planner data and draw it to ``graph.png``.

    The planner data is exported as GraphML to ``graph.xml`` and loaded with
    graph-tool. Degree, edge-weight and component statistics are printed.
    Vertices are coloured by type (start, goal, other) and, when both a
    start and a goal vertex exist, the edges along the A* path between them
    are drawn in red.

    Args:
        pd: planner data object providing ``printGraphML``,
            ``isStartVertex`` and ``isGoalVertex``.
        space: not used by this function.
    """
    # Extract the graphml representation of the planner data
    graphml = pd.printGraphML()
    with open("graph.xml", 'w') as f:
        f.write(graphml)

    # Load the graphml data using graph-tool
    graph = gt.load_graph("graph.xml")
    edgeweights = graph.edge_properties["weight"]

    # Write some interesting statistics
    avgdeg, stddevdeg = gt.vertex_average(graph, "total")
    avgwt, stddevwt = gt.edge_average(graph, edgeweights)

    # Single-argument print() calls behave the same on Python 2 and 3.
    print("---- PLANNER DATA STATISTICS ----")
    print(str(graph.num_vertices()) + " vertices and " +
          str(graph.num_edges()) + " edges")
    print("Average vertex degree (in+out) = " + str(avgdeg) +
          "  St. Dev = " + str(stddevdeg))
    print("Average edge weight = " + str(avgwt) +
          "  St. Dev = " + str(stddevwt))

    comps, hist = gt.label_components(graph)
    print("Strongly connected components: " + str(len(hist)))

    # Make the graph undirected (for weak components, and a simpler drawing)
    graph.set_directed(False)
    comps, hist = gt.label_components(graph)
    print("Weakly connected components: " + str(len(hist)))

    # Plotting the graph
    gt.remove_parallel_edges(graph)  # Removing any superfluous edges

    edgeweights = graph.edge_properties["weight"]
    colorprops = graph.new_vertex_property("string")
    vertexsize = graph.new_vertex_property("double")

    start = -1
    goal = -1

    for v in range(graph.num_vertices()):
        # Color and size vertices by type: start, goal, other
        if pd.isStartVertex(v):
            start = v
            colorprops[graph.vertex(v)] = "cyan"
            vertexsize[graph.vertex(v)] = 10
        elif pd.isGoalVertex(v):
            goal = v
            colorprops[graph.vertex(v)] = "green"
            vertexsize[graph.vertex(v)] = 10
        else:
            colorprops[graph.vertex(v)] = "yellow"
            vertexsize[graph.vertex(v)] = 5

    # default edge color is black with size 0.5:
    edgecolor = graph.new_edge_property("string")
    edgesize = graph.new_edge_property("double")
    for e in graph.edges():
        edgecolor[e] = "black"
        edgesize[e] = 0.5

    # using A* to find shortest path in planner data
    if start != -1 and goal != -1:
        dist, pred = gt.astar_search(graph, graph.vertex(start), edgeweights)

        # Color edges along shortest path red with size 2.0
        start_vertex = graph.vertex(start)
        v = graph.vertex(goal)
        while v != start_vertex:
            p = graph.vertex(pred[v])
            if p == v:
                # A vertex that is its own predecessor has no path back to
                # the start; stop instead of looping forever.
                # NOTE(review): assumes unreached vertices keep pred[v] == v.
                break
            for e in p.out_edges():
                if e.target() == v:
                    edgecolor[e] = "red"
                    edgesize[e] = 2.0
            v = p

    # Writing graph to file (no positions are given, so graph-tool chooses
    # the layout).
    gt.graph_draw(graph, vertex_size=vertexsize,
                  vertex_fill_color=colorprops,
                  edge_pen_width=edgesize, edge_color=edgecolor,
                  output="graph.png")
    print("")
    print('Graph written to graph.png')
import graph_tool.all as gt
from sys import argv
from re import findall

# For every file given on the command line: load it as an undirected view
# without properties, drop parallel edges and self loops, then write the
# cleaned graph and a drawing of it into the 'output' directory.
if __name__ == '__main__':
    for path in argv[1:]:
        loaded = gt.load_graph(path)
        view = gt.GraphView(loaded, directed=False, skip_properties=True)
        gt.remove_parallel_edges(view)
        gt.remove_self_loops(view)

        # Second-to-last run of characters that are neither '/' nor '.',
        # cut at the first '--'.
        pieces = findall('[^/.]+', path)
        stem = pieces[-2].split('--')[0]

        xml_target = 'output/{}.xml'.format(stem)
        png_target = 'output/{}.png'.format(stem)
        view.save(xml_target)
        gt.graph_draw(view, output=png_target)
def trimGraph(grafo, ske, ske2):
    """Simplify a skeleton graph in place.

    Three passes are applied to the graph ``g`` contained in *grafo*:

    1. Endpoints of zero-weight edges that have a single neighbour are
       removed.
    2. Vertices with exactly two neighbours that are not yet connected are
       replaced by a direct edge whose weight is the sum of the two removed
       edges; the class label of the dropped edge is rewritten in *ske2*.
    3. Vertex pairs closer than 3 (according to ``find_dists``) that are
       joined by a zero-weight edge are merged into the first vertex.

    Args:
        grafo: sequence ``(g, pos, weight, clase, nodetype, age)``.
        ske: skeleton image, returned untouched.
        ske2: labelled skeleton image, relabelled in place during pass 2.

    Returns:
        tuple: ``([g, pos, weight, clase, nodetype, age], ske, ske2)``.
    """
    g, pos, weight, clase, nodetype, age = grafo

    gt.remove_parallel_edges(g)

    # Pass 1: drop dangling endpoints of zero-weight edges.
    to_delete = []
    for edge in g.get_edges():
        if weight[g.edge(edge[0], edge[1])] != 0:
            continue
        if len(g.get_all_neighbors(edge[0])) == 1:
            to_delete.append(edge[0])
        if len(g.get_all_neighbors(edge[1])) == 1:
            to_delete.append(edge[1])

    # Remove from the highest index down so pending indices stay valid.
    for i in sorted(set(to_delete), reverse=True):
        g.clear_vertex(i)
        g.remove_vertex(i)

    # Pass 2: contract vertices of degree two.
    # NOTE(review): the nesting under `if edge is None` was reconstructed
    # from a flattened source -- confirm against the original file.
    to_delete = []
    for v in g.get_vertices():
        vecinos = g.get_out_neighbors(v)
        if len(vecinos) != 2:
            continue
        edge = g.edge(vecinos[0], vecinos[1])
        if edge is None:
            edge = g.add_edge(vecinos[0], vecinos[1])

            ed1 = g.edge(v, vecinos[0])
            w1 = weight[ed1]
            ed2 = g.edge(v, vecinos[1])
            w2 = weight[ed2]

            weight[edge] = w1 + w2

            # The new edge inherits the class of ed2, except when only ed2
            # has zero weight; the other class is rewritten in ske2.
            if w1 != 0 and w2 == 0:
                kept, dropped = ed1, ed2
            else:
                kept, dropped = ed2, ed1
            clase[edge] = clase[kept]
            # The original compared with `==` here, which had no effect;
            # the relabelling needs an assignment.
            ske2[np.where(ske2 == clase[dropped][0])] = clase[kept][0]

            g.remove_edge(ed1)
            g.remove_edge(ed2)

            to_delete.append(v)

    for i in sorted(set(to_delete), reverse=True):
        g.clear_vertex(i)
        g.remove_vertex(i)

    # Pass 3: merge vertices that have exactly one other vertex nearby.
    vertices = g.get_vertices()
    pos_vertex = np.array([pos[i] for i in vertices])

    pares = []
    for i in vertices:
        d = find_dists(i, pos, pos_vertex)
        mask = np.ones(pos_vertex.shape[0], bool)
        mask[i] = False  # ignore the distance of the vertex to itself

        pair = np.zeros(pos_vertex.shape[0], bool)
        pair[mask] = d[mask] < 3

        if np.count_nonzero(pair) == 1:
            k = np.where(pair == True)[0][0]
            if [k, i] not in pares:
                pares.append([i, k])

    to_delete = []
    for v1, v2 in pares:
        if g.edge(v1, v2):
            if weight[g.edge(v1, v2)] == 0:
                # Re-attach every other neighbour of v2 to v1.
                for k in g.get_all_neighbors(v2):
                    if k != v1:
                        edge = g.edge(v2, k)
                        w_e = weight[edge]
                        c_e = clase[edge]
                        n_edge = g.add_edge(v1, k)
                        weight[n_edge] = w_e
                        clase[n_edge] = c_e
                g.clear_vertex(v2)
                to_delete.append(v2)

    for i in sorted(set(to_delete), reverse=True):
        g.clear_vertex(i)
        g.remove_vertex(i)

    return [g, pos, weight, clase, nodetype, age], ske, ske2


# Scratch driver kept for reference (not executed):
# import graph_tool.all as gt
# import cv2
# from rsmlFunc import createTree
# from imageFunc import getCleanSke
# from graphFunc import createGraph
# from trackFunc import matchGraphs
# conf = {}
# exec(open('/home/ncaggion/Escritorio/pRAnalyzer/confs/config.conf').read(),conf)
# for image in range(1214,1250):
#     g = gt.load_graph('/home/ncaggion/Escritorio/aux/graph_%s.xml.gz' %image)
#     pos = g.vertex_properties["pos"]
#     nodetype = g.vertex_properties["nodetype"]
#     age = g.vertex_properties["age"]
#     weight = g.edge_properties["weight"]
#     clase = g.edge_properties["clase"]
#     grafo1 = [g, pos, weight, clase, nodetype, age ]
#     seg = cv2.imread("/home/ncaggion/Escritorio/Test/Results 4/Imagenes/out_%s_2.png" %image, 0)
#     ske, bnodes, enodes, _ = getCleanSke(seg)
#     grafo2, seed, ske2 = createGraph(ske.copy(), pos[0], enodes, bnodes)
#     grafo2, ske, ske2 = trimGraph(grafo2, ske, ske2)
#     grafo2 = matchGraphs(grafo1, grafo2)
#     rsmlTree, numberLR = createTree(conf, 0, ["/home/ncaggion/Escritorio/Paper/Figura1/2020-03-31_12-15-18_2.png"], grafo2, ske, ske2)
#     rsmlTree.write(open('/home/ncaggion/Escritorio/aux/rsml_%s.rsml' %image, 'w'), encoding='unicode')
#     gt.graph_draw(g, pos = pos)
import networkx as nx

# Load the simulated and the real networks with graph-tool.
filenames = sorted(glob.glob("./Simulatednetworks/SimulatedGraphWeek*.graphml"))
FakeGraphs = [gt.load_graph(File) for File in filenames]
filenames = sorted(glob.glob("./Realnetworks/tags*_2015.gml"))
Graphs = [gt.load_graph(File) for File in filenames]
# Single-argument print() calls behave the same on Python 2 and 3.
print(len(Graphs))
#"""
# Mirror every graph-tool graph as a networkx graph, edge by edge.
xGraphs = [nx.Graph() for i in range(len(Graphs))]
xFakeGraphs = [nx.Graph() for i in range(len(FakeGraphs))]
for i in range(len(Graphs)):
    gt.remove_self_loops(Graphs[i])
    for e in Graphs[i].edges():
        xGraphs[i].add_edge(*e)
for i in range(len(FakeGraphs)):
    gt.remove_parallel_edges(FakeGraphs[i])
    for e in FakeGraphs[i].edges():
        xFakeGraphs[i].add_edge(*e)
#print nx.rich_club_coefficient(xGraphs[0], normalized = False)

# Rich-club coefficients of the real networks, one array per graph.
RealClubs = []
print(len(xGraphs))
for Graph in xGraphs:
    Coefficients = nx.rich_club_coefficient(Graph, normalized = False)
    print(len(Coefficients))
    Dummy = np.zeros((len(Coefficients),))
    for i in range(len(Dummy)):
        Dummy[i] = Coefficients[i]
    RealClubs.append(Dummy)
FakeClubs = []
for Graph in xFakeGraphs:
    Coefficients = nx.rich_club_coefficient(Graph, normalized = False)
def trimGraph(grafo, ske, ske2):
    """Simplify a skeleton graph in place.

    Three passes are applied to the graph ``g`` contained in *grafo*:

    1. Endpoints of zero-weight edges that have a single neighbour are
       removed.
    2. Vertices with exactly two neighbours that are not yet connected are
       replaced by a direct edge whose weight is the sum of the two removed
       edges; the class label of the dropped edge is rewritten in *ske2*.
    3. Vertex pairs closer than 3 (according to ``find_dists``) that are
       joined by a zero-weight edge are merged into the first vertex.

    Args:
        grafo: sequence ``(g, pos, weight, clase, nodetype, age)``.
        ske: skeleton image, returned untouched.
        ske2: labelled skeleton image, relabelled in place during pass 2.

    Returns:
        tuple: ``([g, pos, weight, clase, nodetype, age], ske, ske2)``.
    """
    g, pos, weight, clase, nodetype, age = grafo

    gt.remove_parallel_edges(g)

    # Pass 1: drop dangling endpoints of zero-weight edges.
    to_delete = []
    for edge in g.get_edges():
        if weight[g.edge(edge[0], edge[1])] != 0:
            continue
        if len(g.get_all_neighbors(edge[0])) == 1:
            to_delete.append(edge[0])
        if len(g.get_all_neighbors(edge[1])) == 1:
            to_delete.append(edge[1])

    # Remove from the highest index down so pending indices stay valid.
    for i in sorted(set(to_delete), reverse=True):
        g.clear_vertex(i)
        g.remove_vertex(i)

    # Pass 2: contract vertices of degree two.
    # NOTE(review): the nesting under `if edge is None` was reconstructed
    # from a flattened source -- confirm against the original file.
    to_delete = []
    for v in g.get_vertices():
        vecinos = g.get_out_neighbors(v)
        if len(vecinos) != 2:
            continue
        edge = g.edge(vecinos[0], vecinos[1])
        if edge is None:
            edge = g.add_edge(vecinos[0], vecinos[1])

            ed1 = g.edge(v, vecinos[0])
            w1 = weight[ed1]
            ed2 = g.edge(v, vecinos[1])
            w2 = weight[ed2]

            weight[edge] = w1 + w2

            # The new edge inherits the class of ed2, except when only ed2
            # has zero weight; the other class is rewritten in ske2.
            if w1 != 0 and w2 == 0:
                kept, dropped = ed1, ed2
            else:
                kept, dropped = ed2, ed1
            clase[edge] = clase[kept]
            # The original compared with `==` here, which had no effect;
            # the relabelling needs an assignment.
            ske2[np.where(ske2 == clase[dropped][0])] = clase[kept][0]

            g.remove_edge(ed1)
            g.remove_edge(ed2)

            to_delete.append(v)

    for i in sorted(set(to_delete), reverse=True):
        g.clear_vertex(i)
        g.remove_vertex(i)

    # Pass 3: merge vertices that have exactly one other vertex nearby.
    vertices = g.get_vertices()
    pos_vertex = np.array([pos[i] for i in vertices])

    pares = []
    for i in vertices:
        d = find_dists(i, pos, pos_vertex)
        mask = np.ones(pos_vertex.shape[0], bool)
        mask[i] = False  # ignore the distance of the vertex to itself

        pair = np.zeros(pos_vertex.shape[0], bool)
        pair[mask] = d[mask] < 3

        if np.count_nonzero(pair) == 1:
            k = np.where(pair == True)[0][0]
            if [k, i] not in pares:
                pares.append([i, k])

    to_delete = []
    for v1, v2 in pares:
        if g.edge(v1, v2):
            if weight[g.edge(v1, v2)] == 0:
                # Re-attach every other neighbour of v2 to v1.
                for k in g.get_all_neighbors(v2):
                    if k != v1:
                        edge = g.edge(v2, k)
                        w_e = weight[edge]
                        c_e = clase[edge]
                        n_edge = g.add_edge(v1, k)
                        weight[n_edge] = w_e
                        clase[n_edge] = c_e
                g.clear_vertex(v2)
                to_delete.append(v2)

    for i in sorted(set(to_delete), reverse=True):
        g.clear_vertex(i)
        g.remove_vertex(i)

    return [g, pos, weight, clase, nodetype, age], ske, ske2
str(graph.num_edges()) + " edges") print("Average vertex degree (in+out) = " + str(avgdeg) + " St. Dev = " + str(stddevdeg)) print("Average edge weight = " + str(avgwt) + " St. Dev = " + str(stddevwt)) _, hist = gt.label_components(graph) print("Strongly connected components: " + str(len(hist))) # Make the graph undirected (for weak components, and a simpler drawing) graph.set_directed(False) _, hist = gt.label_components(graph) print("Weakly connected components: " + str(len(hist))) # Plotting the graph gt.remove_parallel_edges(graph) # Removing any superfluous edges edgeweights = graph.edge_properties["weight"] colorprops = graph.new_vertex_property("string") colorprops2 = graph.new_vertex_property("vector<float>") shapeprops = graph.new_vertex_property("string") vertexsize = graph.new_vertex_property("double") start = -1 goal = -1 for v in range(graph.num_vertices()): # Color and size vertices by type: start, goal, other if pd.isStartVertex(v): start = v colorprops[graph.vertex(v)] = "cyan"
def _read_ply_properties(it, line):
    """Parse consecutive PLY ``property`` header lines starting at *line*.

    Returns ``(props, line)`` where *props* maps each property name to a
    dict with its ``type`` (plus ``count_type`` / ``entry_type`` for list
    properties) and *line* is the first line that is not a property.
    """
    props = OrderedDict()
    while line.startswith('property'):
        words = line.split(' ')
        the_type = words[1]
        if the_type == 'list':
            # "property list <count_type> <entry_type> <name>"
            name = words[4]
            props[name] = dict()
            props[name]['count_type'] = words[2]
            props[name]['entry_type'] = words[3]
        else:
            # "property <type> <name>"
            name = words[2]
            props[name] = dict()
        props[name]['type'] = the_type
        line = next(it)
    return props, line


def load_ply_layout(file):
    """Load an ASCII PLY mesh as an undirected graph plus vertex layout.

    The header must declare a ``vertex`` element followed by a ``face``
    element. Only ``float`` vertex properties are supported and ``x``,
    ``y`` and ``z`` must be present. Every face contributes the edges of
    its boundary cycle. Parallel edges are removed and only the largest
    connected component is kept.

    Returns:
        tuple: ``(g, Y)`` where ``Y`` holds the coordinates, one row per
        vertex index.

    Raises:
        AssertionError: if the header does not have the expected layout.
        NotImplementedError: for unsupported property or count types.
        StopIteration: if the file ends prematurely.
    """
    g = gt.Graph(directed=False)

    with open(file) as f:
        all_lines = f.read().splitlines()
    it = iter(all_lines)

    line = next(it)
    assert (line == 'ply')

    line = next(it)
    assert (line.startswith('format ascii'))

    # Scan to the vertex element.
    line = next(it)
    while not line.startswith('element'):
        line = next(it)

    words = line.split(' ')
    assert (words[0] == 'element')
    assert (words[1] == 'vertex')
    assert (words[2].isdigit())
    n_vertices = int(words[2])
    g.add_vertex(n_vertices)
    assert (g.num_vertices() == n_vertices)

    line = next(it)
    v_props, line = _read_ply_properties(it, line)
    print(v_props)

    vps = dict()
    for name in v_props:
        the_type = v_props[name]['type']
        if the_type == 'float':
            vps[name] = g.new_vp(the_type)
        else:
            raise NotImplementedError()
    print(vps)

    assert ('x' in vps.keys())
    assert ('y' in vps.keys())
    assert ('z' in vps.keys())

    # Scan to next element
    while not line.startswith('element'):
        line = next(it)

    words = line.split(' ')
    assert (words[0] == 'element')
    assert (words[1] == 'face')
    assert (words[2].isdigit())
    n_faces = int(words[2])
    print(n_faces)

    line = next(it)
    f_props, line = _read_ply_properties(it, line)
    print(f_props)

    while not line.startswith('end_header'):
        line = next(it)

    # Vertex records: one value per declared property, in header order.
    # Every vertex property is 'float' here (anything else raised above).
    v_names = list(v_props.keys())
    for i in range(n_vertices):
        line = next(it)
        words = [word for word in line.split(' ') if word != '']
        assert (len(words) == len(v_names))
        for name, word in zip(v_names, words):
            vps[name][i] = float(word)

    # Face records: each list property is "<count> <item> ... <item>".
    for _ in range(n_faces):
        line = next(it)
        words = [word for word in line.split(' ') if word != '']
        i = 0
        for name in f_props.keys():
            the_type = f_props[name]['type']
            if the_type == 'list':
                if f_props[name]['count_type'] == 'uchar':
                    n_items = int(words[i])
                else:
                    raise NotImplementedError
                the_list = [
                    int(word) for word in words[i + 1:i + 1 + n_items]
                ]
                i += 1 + n_items

                if name == 'vertex_indices':
                    # Connect consecutive corners, closing the cycle.
                    for j, idx1 in enumerate(the_list):
                        idx2 = the_list[(j + 1) % len(the_list)]
                        g.add_edge(idx1, idx2)
        assert (i == len(words))

    gt.remove_parallel_edges(g)

    largest_connected_component = gt.label_largest_component(g)
    unreferenced = sum(
        1 for flag in largest_connected_component.a if flag == 0)
    if unreferenced > 0:
        g.set_vertex_filter(largest_connected_component)
        g.purge_vertices()
        print('Filtered {0} unreferenced vertices.'.format(unreferenced))

    # NOTE(review): Y keeps n_vertices rows even when vertices were purged
    # above, and the coordinate maps are read after the purge -- confirm
    # that both are intended.
    if 'x' in vps.keys() and 'y' in vps.keys():
        if 'z' in vps.keys():
            Y = np.zeros((n_vertices, 3))
            for v in g.vertices():
                row = int(v)
                Y[row, 0] = vps['x'][v]
                Y[row, 1] = vps['y'][v]
                Y[row, 2] = vps['z'][v]
        else:
            Y = np.zeros((n_vertices, 2))
            for v in g.vertices():
                row = int(v)
                Y[row, 0] = vps['x'][v]
                Y[row, 1] = vps['y'][v]

    return g, Y
def load_vna_layout(in_file):
    """Load a VNA file together with its 2-D vertex layout.

    The ``*Node properties`` header must contain the columns ``ID``, ``x``
    and ``y``. Vertex lines are read until the ``*Tie data`` section, whose
    header must start with ``from to``. The first column of every vertex
    line is used as the VNA id.

    Returns:
        tuple: ``(g, Y)`` -- the undirected graph without parallel edges
        and an array of shape ``(num_vertices, 2)`` with the coordinates.

    Raises:
        StopIteration: if an expected section is missing.
        AssertionError: if a section header lacks the required columns.
        KeyError: if an edge references an unknown vertex id.
    """
    with open(in_file) as f:
        all_lines = f.read().splitlines()

    it = iter(all_lines)

    # Ignore preamble
    line = next(it)
    while not line.startswith('*Node properties'):
        line = next(it)

    node_properties = next(it).split(' ')
    assert ('ID' in node_properties and 'x' in node_properties
            and 'y' in node_properties)

    vertices = dict()
    line = next(it)
    gt_idx = 0  # Index for gt
    while not line.startswith('*Tie data'):
        entries = line.split(' ')
        vna_id = entries[0]
        vertex = dict()
        for i, prop in enumerate(node_properties):
            vertex[prop] = entries[i]
        vertex['ID'] = gt_idx  # Replace VNA ID by numerical gt index
        vertices[vna_id] = vertex  # Retain VNA ID as key of the vertices dict
        gt_idx += 1
        line = next(it)

    edge_properties = next(it).split(' ')
    assert (edge_properties[0] == 'from' and edge_properties[1] == 'to')

    # Everything left in the iterator is edge data.
    edges = []
    for line in it:
        if not line.strip():
            # Blank lines carry no edge; skip them rather than fail.
            continue
        entries = line.split(' ')
        v_i = vertices[entries[0]]['ID']
        v_j = vertices[entries[1]]['ID']
        edges.append((v_i, v_j))

    g = gt.Graph(directed=False)
    g.add_vertex(len(vertices))
    for v_i, v_j in edges:
        g.add_edge(v_i, v_j)
    gt.remove_parallel_edges(g)

    Y = np.zeros((g.num_vertices(), 2))
    for vertex in vertices.values():
        Y[vertex['ID'], 0] = float(vertex['x'])
        Y[vertex['ID'], 1] = float(vertex['y'])

    return g, Y
def nested_model_multi(
    adatas: List[AnnData],
    deg_corr: bool = True,
    tolerance: float = 1e-6,
    n_sweep: int = 10,
    beta: float = np.inf,
    samples: int = 100,
    collect_marginals: bool = True,
    n_jobs: int = -1,
    *,
    random_seed: Optional[int] = None,
    key_added: str = 'multi_nsbm',
    adjacency: Optional[List[sparse.spmatrix]] = None,
    neighbors_key: Optional[List[str]] = ['neighbors'],
    directed: bool = False,
    use_weights: bool = False,
    save_model: Union[str, None] = None,
    copy: bool = False,
    # minimize_args: Optional[Dict] = {},
    dispatch_backend: Optional[str] = 'processes',
    # equilibrate_args: Optional[Dict] = {},
) -> Optional[List[AnnData]]:
    """\
    Cluster cells into subgroups using multiple modalities.

    Cluster cells using the nested Stochastic Block Model [Peixoto14]_,
    performing Bayesian inference on node groups. This function takes
    multiple experiments, possibly across different modalities, and performs
    joint clustering.

    This requires having run :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first. It also requires cells having
    the same names if coming from paired experiments.

    Parameters
    ----------
    adatas
        A list of processed AnnData. Neighbors must have been already
        calculated.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem
        the case for KNN graphs used in scanpy.
    tolerance
        Tolerance for fast model convergence.
    n_sweep
        Number of iterations to be performed in the fast model MCMC greedy
        approach
    beta
        Inverse temperature for MCMC greedy approach
    samples
        Number of initial minimizations to be performed. Values below 1 are
        treated as 1.
    collect_marginals
        Whether to store the marginal group probabilities of each cell in
        `adata.obsm`.
    n_jobs
        Number of parallel computations used during model initialization
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6
        or `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`. If all AnnData share the same
        key, only one has to be specified, otherwise the full list of all
        keys must be provided
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        Recorded in the stored parameters; edge weights are currently not
        used by this function.
    save_model
        If provided, this will be the filename for the PartitionModeState
        to be saved
    copy
        Whether to copy `adata` or modify it inplace.
    random_seed
        Random number to be used as seed for graph-tool
    dispatch_backend
        Passed to joblib `Parallel` as `prefer`.

    Returns
    -------
    `adata.obs[f'{key_added}_level_{n}']`
        Categorical group id (`'0'`, `'1'`, ...) of each cell at level `n`.
    `adata.obsm[f'CM_{key_added}_level_{n}']`
        A `np.ndarray` with cell probability of belonging to a specific
        group (only when `collect_marginals` is set).
    `adata.uns['schist'][key_added]['stats']`
        A dict with the `level_entropy` and `modularity` of each level.
    `adata.uns['schist'][key_added]['blocks']`
        A dict mapping each level to its block assignment.
    `adata.uns['schist'][key_added]['params']`
        A dict with the parameters used in this analysis.

    The list of modified copies is returned if `copy` is set, else `None`.
    """

    # Clamp before drawing the seeds so that one seed exists per state.
    if samples < 1:
        samples = 1

    # `is not None` so that a seed of 0 is honoured as well.
    if random_seed is not None:
        np.random.seed(random_seed)

    seeds = np.random.choice(range(samples**2), size=samples, replace=False)

    if collect_marginals and samples < 100:
        logg.warning(
            'Collecting marginals requires sufficient number of samples\n'
            f'It is now set to {samples} and should be at least 100')

    start = logg.info('minimizing the nested Stochastic Block Model')

    if copy:
        adatas = [x.copy() for x in adatas]

    n_keys = len(neighbors_key)
    n_data = len(adatas)

    # A single shared key is broadcast to every dataset. This is also needed
    # when a user-provided adjacency is given, because the key of each
    # dataset is recorded in the stored parameters below.
    if n_keys == 1:
        neighbors_key = [neighbors_key[0] for x in range(n_data)]

    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        adjacency = []
        if n_keys > 1 and n_keys < n_data:
            raise ValueError(
                'The number of neighbors keys does not match '
                'the number of data matrices. Either fix this '
                'or pass a neighbor key that is shared across all modalities')
        for x in range(n_data):
            logg.info(f'getting adjacency for data {x}', time=start)
            if neighbors_key[x] not in adatas[x].uns:
                raise ValueError('You need to run `pp.neighbors` first '
                                 'to compute a neighborhood graph for '
                                 f'data entry {x}')
            elif 'connectivities_key' in adatas[x].uns[neighbors_key[x]]:
                # scanpy>1.4.6 has matrix in another slot
                conn_key = adatas[x].uns[
                    neighbors_key[x]]['connectivities_key']
                adjacency.append(adatas[x].obsp[conn_key])
            else:
                # scanpy<=1.4.6 has sparse matrix here
                adjacency.append(
                    adatas[x].uns[neighbors_key[x]]['connectivities'])

    # convert it to igraph and graph-tool
    graph_list = []
    for x in range(n_data):
        g = get_igraph_from_adjacency(adjacency[x], directed=directed)
        g = g.to_graph_tool()
        gt.remove_parallel_edges(g)
        # add cell names to graph, this will be used to create
        # layered graph
        g_names = g.new_vertex_property('string')
        d_names = adatas[x].obs_names
        for xn in range(len(d_names)):
            g_names[xn] = d_names[xn]
        g.vp['cell'] = g_names
        graph_list.append(g)

    # skip weights for now
    # recs=[]
    # rec_types=[]
    # if use_weights:
    #     # this is not ideal to me, possibly we may need to transform
    #     # weights. More tests needed.
    #     recs=[g.ep.weight]
    #     rec_types=['real-normal']

    # get a non-redundant list of all cell names across all modalities
    all_names = set(adatas[0].obs_names)
    for x in range(1, n_data):
        all_names.update(adatas[x].obs_names)
    all_names = list(all_names)

    # create the shared graph
    union_g = gt.Graph(directed=False)
    union_g.add_vertex(len(all_names))
    u_names = union_g.new_vertex_property('string')
    for xn in range(len(all_names)):
        u_names[xn] = all_names[xn]
    union_g.vp['cell'] = u_names

    # map every cell name to its vertex index in the unified graph
    u_cell_index = {
        union_g.vp['cell'][x]: x
        for x in range(union_g.num_vertices())
    }

    # now create layers: every edge of modality `ng` is copied into the
    # union graph and labelled with layer ng + 1
    layer = union_g.new_edge_property('int')
    for ng in range(n_data):
        for e in graph_list[ng].edges():
            S, T = e.source(), e.target()
            Sn = graph_list[ng].vp['cell'][S]
            Tn = graph_list[ng].vp['cell'][T]
            Sidx = u_cell_index[Sn]
            Tidx = u_cell_index[Tn]
            ne = union_g.add_edge(Sidx, Tidx)
            layer[ne] = ng + 1  # this is the layer label

    union_g.ep['layer'] = layer

    # DONE! now proceed with standard minimization, ish
    states = [
        gt.NestedBlockState(g=union_g,
                            base_type=gt.LayeredBlockState,
                            state_args=dict(deg_corr=deg_corr,
                                            ec=union_g.ep.layer,
                                            layers=True))
        for n in range(samples)
    ]

    def fast_min(state, beta, n_sweep, fast_tol, seed=None):
        # Sweep until the entropy change drops below the tolerance.
        if seed is not None:
            gt.seed_rng(seed)
        dS = 1
        while np.abs(dS) > fast_tol:
            dS, _, _ = state.multiflip_mcmc_sweep(beta=beta,
                                                  niter=n_sweep,
                                                  c=0.5)
        return state

    states = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
        delayed(fast_min)(states[x], beta, n_sweep, tolerance, seeds[x])
        for x in range(samples))
    logg.info('    minimization step done', time=start)

    pmode = gt.PartitionModeState([x.get_bs() for x in states],
                                  converge=True,
                                  nested=True)
    bs = pmode.get_max_nested()
    logg.info('    consensus step done', time=start)

    if save_model:
        import pickle
        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    # prune redundant levels at the top
    bs = [x for x in bs if len(np.unique(x)) > 1]
    bs.append(np.array([0],
                       dtype=np.int32))  # in case of type changes, check this
    state = gt.NestedBlockState(union_g,
                                bs=bs,
                                base_type=gt.LayeredBlockState,
                                state_args=dict(deg_corr=deg_corr,
                                                ec=union_g.ep.layer,
                                                layers=True))

    logg.info('    done', time=start)

    u_groups = np.unique(bs[0])
    last_group = np.max(u_groups) + 1

    if collect_marginals:
        # note that the size of this will be equal to the number of the
        # groups in Mode but some entries won't sum to 1 as in the
        # collection there may be differently sized partitions
        pv_array = pmode.get_marginal(union_g).get_2d_array(
            range(last_group)).T[:, u_groups] / samples

    groups = np.zeros((union_g.num_vertices(), len(bs)), dtype=int)

    for x in range(len(bs)):
        # for each level, project labels to the vertex level
        # so that every cell has a name. Note that at this level
        # the labels are not necessarily consecutive
        groups[:, x] = state.project_partition(x, 0).get_array()

    groups = pd.DataFrame(groups).astype('category')

    # rename categories from 0 to n
    # (`rename_categories(..., inplace=True)` is gone in pandas 2.0, so the
    # renamed column is assigned back instead)
    for c in groups.columns:
        ncat = len(groups[c].cat.categories)
        new_cat = [u'%s' % x for x in range(ncat)]
        groups[c] = groups[c].cat.rename_categories(new_cat)

    # recode block names to have consistency with group names
    i_groups = groups.astype(int)
    bs = [i_groups.iloc[:, 0].values]
    for x in range(1, groups.shape[1]):
        bs.append(
            np.where(
                pd.crosstab(i_groups.iloc[:, x - 1], i_groups.iloc[:, x]) >
                0)[1])
    state = gt.NestedBlockState(union_g, bs)
    del (i_groups)

    groups.index = all_names

    # add column names
    groups.columns = [
        f"{key_added}_level_{level}" for level in range(len(bs))
    ]

    for xn in range(n_data):
        # remove any column with the same key
        drop_columns = groups.columns.intersection(adatas[xn].obs.columns)
        # `axis` must be given by keyword since pandas 2.0
        adatas[xn].obs.drop(drop_columns, axis='columns', inplace=True)
        adatas[xn].obs = pd.concat(
            [adatas[xn].obs, groups.loc[adatas[xn].obs_names]], axis=1)

        # now add marginal probabilities.
        if collect_marginals:
            # add marginals for level 0, then sum up according to the
            # hierarchy
            _groups = groups.loc[adatas[xn].obs_names]
            _pv_array = pd.DataFrame(
                pv_array, index=all_names).loc[adatas[xn].obs_names].values
            adatas[xn].obsm[f"CM_{key_added}_level_0"] = _pv_array
            for group in groups.columns[1:]:
                ct = pd.crosstab(_groups[_groups.columns[0]],
                                 _groups[group],
                                 normalize='index',
                                 dropna=False)
                adatas[xn].obsm[f'CM_{group}'] = _pv_array @ ct.values

        # add some unstructured info
        if 'schist' not in adatas[xn].uns:
            adatas[xn].uns['schist'] = {}

        adatas[xn].uns['schist'][f'{key_added}'] = {}
        adatas[xn].uns['schist'][f'{key_added}']['stats'] = dict(
            level_entropy=np.array(
                [state.level_entropy(x) for x in range(len(state.levels))]),
            modularity=np.array([
                gt.modularity(union_g, state.project_partition(x, 0))
                for x in range(len((state.levels)))
            ]))

        bl_d = {}
        levels = state.get_levels()
        for nl in range(len(levels)):
            bl_d[str(nl)] = np.array(levels[nl].get_blocks().a)
        adatas[xn].uns['schist'][f'{key_added}']['blocks'] = bl_d

        # last step is recording some parameters used in this analysis
        adatas[xn].uns['schist'][f'{key_added}']['params'] = dict(
            model='multiome_nested',
            use_weights=use_weights,
            neighbors_key=neighbors_key[xn],
            key_added=key_added,
            samples=samples,
            collect_marginals=collect_marginals,
            random_seed=random_seed,
            deg_corr=deg_corr,
            # recs=recs,
            # rec_types=rec_types
        )

    logg.info(
        '    finished',
        time=start,
        deep=(
            f'and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adatas if copy else None
def state_from_blocks(
    adata: AnnData,
    state_key: Optional[str] = 'nsbm',
    neighbors_key: Optional[str] = 'neighbors',
    adjacency: Optional[spmatrix] = None,
    directed: bool = False,
    use_weights: bool = False,
    deg_corr: bool = True,
):
    """
    Returns a gt state object given an AnnData

    The block assignments and parameters are read from
    `adata.uns['schist'][state_key]` and applied to a graph rebuilt from
    the adjacency matrix.

    Parameters
    ----------
    adata
        The annotated data matrix.
    state_key
        The key under which the state has been saved
    neighbors_key
        The key passed to `sc.pp.neighbors`
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). Note that this
        increases computation times. Values of `recs`/`rec_types` found in
        the saved parameters take precedence over this flag.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem
        the case for KNN graphs used in scanpy. A `deg_corr` entry in the
        saved parameters takes precedence over this argument.

    Returns
    -------
    The rebuilt state: a `gt.BlockState` when the saved model is `'flat'`,
    a `gt.PPBlockState` when it is `'ppbm'` or `'planted'`, and a
    `gt.NestedBlockState` for any other model.

    Raises
    ------
    ValueError
        If `adjacency` is not given and `neighbors_key` is not in `adata.uns`.
    """
    saved = adata.uns['schist'][f'{state_key}']
    bl_d = saved['blocks']
    params = saved['params']
    model = params['model']

    if model in ('nested', 'multiome_nested'):
        # one array of block labels per hierarchy level, keyed '0', '1', ...
        blocks = [bl_d[str(nl)] for nl in range(len(bl_d))]
    else:
        blocks = bl_d['0']

    if 'deg_corr' in params:
        deg_corr = params['deg_corr']

    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError(
                'You need to run `pp.neighbors` first '
                'to compute a neighborhood graph.'
            )
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']

    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g = g.to_graph_tool()
    gt.remove_parallel_edges(g)

    # The edge covariates need the graph, so they can only be collected
    # once `g` exists (reading g.ep.weight earlier raised a NameError).
    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    if 'recs' in params:
        recs = params['recs']
    if 'rec_types' in params:
        rec_types = params['rec_types']

    # NOTE(review): `state_args` is forwarded exactly as before; confirm that
    # the graph-tool state constructors accept this keyword.
    state_args = dict(deg_corr=deg_corr, recs=recs, rec_types=rec_types)

    if model == 'flat':
        state = gt.BlockState(g, b=blocks, state_args=state_args)
    elif model in ('ppbm', 'planted'):
        # `planted_model` records its results with model='planted'
        state = gt.PPBlockState(g, b=blocks, state_args=state_args)
    else:
        state = gt.NestedBlockState(g, bs=blocks, state_args=state_args)
    return state
def leiden(
    adata: AnnData,
    resolution: float = 1,
    samples: int = 100,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_state: _utils.AnyRandom = 0,
    key_added: str = 'leiden',
    adjacency: Optional[sparse.spmatrix] = None,
    directed: bool = True,
    use_weights: bool = True,
    n_iterations: int = -1,
    partition_type: Optional[Type[MutableVertexPartition]] = None,
    neighbors_key: Optional[str] = None,
    obsp: Optional[str] = None,
    collect_marginals: bool = True,
    n_jobs: int = -1,
    copy: bool = False,
    save_model: Union[str, None] = None,
    dispatch_backend: Optional[str] = 'processes',
    **partition_kwargs,
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Traag18]_.

    Cluster cells using the Leiden algorithm [Traag18]_,
    an improved version of the Louvain algorithm [Blondel08]_.
    It has been proposed for single-cell analysis by [Levine15]_.
    The algorithm is run `samples` times with different seeds and the
    resulting partitions are merged into a consensus with
    `gt.PartitionModeState`.

    This requires having ran :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    resolution
        A parameter value controlling the coarseness of the clustering.
        Higher values lead to more clusters.
        Set to `None` if overriding `partition_type`
        to one that doesn’t accept a `resolution_parameter`.
    samples
        The number of random samples to take for consensus
    random_state
        Change the initialization of the optimization.
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain `(obs_key, list_of_categories)`.
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors connectivities.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges).
    n_iterations
        How many iterations of the Leiden clustering algorithm to perform.
        Positive values above 2 define the total number of iterations to perform,
        -1 has the algorithm run until it reaches its optimal clustering.
    partition_type
        Type of partition to use.
        Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`.
        For the available options, consult the documentation for
        :func:`~leidenalg.find_partition`.
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, leiden looks .obsp['connectivities'] for connectivities
        (default storage place for pp.neighbors).
        If specified, leiden looks
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.
    obsp
        Use .obsp[obsp] as adjacency. You can't specify both
        `obsp` and `neighbors_key` at the same time.
    collect_marginals
        Whether to retrieve the marginal probability to belong to a group
    n_jobs
        Number of parallel jobs to calculate partitions
    copy
        Whether to copy `adata` or modify it inplace.
    save_model
        If provided, this will be the filename for the PartitionModeState to
        be saved
    dispatch_backend
        Passed as the `prefer` argument of `Parallel`.
    **partition_kwargs
        Any further arguments to pass to `~leidenalg.find_partition`
        (which in turn passes arguments to the `partition_type`).

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.obsm['CM_' + key_added]`
        Marginal probabilities, only when `collect_marginals` is set.
    `adata.uns['leiden']['params']`
        A dict with the values for the parameters `resolution`, `random_state`,
        `n_iterations`, `samples` and `collect_marginals`.
    """
    try:
        import leidenalg
    except ImportError as err:
        # chain the original failure so the real cause stays visible
        raise ImportError(
            'Please install the leiden algorithm: `conda install -c conda-forge leidenalg` or `pip3 install leidenalg`.'
        ) from err
    partition_kwargs = dict(partition_kwargs)

    start = logg.info('running Leiden clustering')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        adjacency = _choose_graph(adata, obsp, neighbors_key)
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph (used by leidenalg) and graph-tool (used for the
    # consensus partition)
    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g_gt = g.to_graph_tool()
    gt.remove_parallel_edges(g_gt)
    # flip to the default partition type if not overriden by the user
    if partition_type is None:
        partition_type = leidenalg.RBConfigurationVertexPartition
    # Prepare find_partition arguments as a dictionary,
    # appending to whatever the user provided. It needs to be this way
    # as this allows for the accounting of a None resolution
    # (in the case of a partition variant that doesn't take it on input)
    if use_weights:
        partition_kwargs['weights'] = np.array(g.es['weight']).astype(np.float64)
    partition_kwargs['n_iterations'] = n_iterations

    # one distinct seed per run, derived from `random_state`
    np.random.seed(random_state)
    seeds = np.random.choice(range(0, samples**2), size=samples, replace=False)

    if resolution is not None:
        partition_kwargs['resolution_parameter'] = resolution

    # clustering proper
    def membership(g, partition_type, seed, **partition_kwargs):
        return leidenalg.find_partition(g, partition_type,
                                        seed=seed,
                                        **partition_kwargs).membership

    parts = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
        delayed(membership)(g, partition_type, seed, **partition_kwargs)
        for seed in seeds)

    pmode = gt.PartitionModeState(parts, converge=True)

    if save_model:
        import pickle
        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    groups = np.array(pmode.get_max(g_gt).get_array())
    u_groups = np.unique(groups)
    last_group = np.max(u_groups) + 1
    if collect_marginals:
        # keep only the columns of groups that actually occur and turn the
        # counts over all runs into frequencies
        pv_array = pmode.get_marginal(g_gt).get_2d_array(
            range(last_group)).T[:, u_groups] / samples
    # rename groups to ensure they are a continuous range
    rosetta = dict(zip(u_groups, range(len(u_groups))))
    groups = np.array([rosetta[x] for x in groups])

    # store output into adata.obs
    if restrict_to is not None:
        if key_added == 'leiden':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(map(str, np.unique(groups))),
    )
    if collect_marginals:
        # NOTE(review): with `restrict_to`, pv_array is computed on the
        # restricted graph only -- confirm its row count matches adata.
        adata.obsm[f"CM_{key_added}"] = pv_array
    # store information on the clustering parameters
    adata.uns['leiden'] = {}
    adata.uns['leiden']['params'] = dict(
        resolution=resolution,
        random_state=random_state,
        n_iterations=n_iterations,
        samples=samples,
        collect_marginals=collect_marginals
    )
    logg.info(
        ' finished',
        time=start,
        deep=(
            f'found {len(np.unique(groups))} clusters and added\n'
            f' {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adata if copy else None
def planted_model(
    adata: AnnData,
    n_sweep: int = 10,
    beta: float = np.inf,
    tolerance=1e-6,
    collect_marginals: bool = True,
    deg_corr: bool = True,
    samples: int = 100,
    n_jobs: int = -1,
    *,
    restrict_to: Optional[Tuple[str, Sequence[str]]] = None,
    random_seed: Optional[int] = None,
    key_added: str = 'ppbm',
    adjacency: Optional[sparse.spmatrix] = None,
    neighbors_key: Optional[str] = 'neighbors',
    directed: bool = False,
    use_weights: bool = False,
    copy: bool = False,
    save_model: Union[str, None] = None,
    # minimize_args: Optional[Dict] = {},
    dispatch_backend: Optional[str] = 'processes',
) -> Optional[AnnData]:
    """\
    Cluster cells into subgroups [Peixoto14]_.

    Cluster cells using the Planted Partition Block Model [Peixoto14]_,
    performing Bayesian inference on node groups. This function, in
    particular, uses the Planted Block Model, which is particularly
    suitable in case of assortative graphs and it returns the optimal
    number of communities.

    This requires having ran :func:`~scanpy.pp.neighbors` or
    :func:`~scanpy.external.pp.bbknn` first.

    Parameters
    ----------
    adata
        The annotated data matrix.
    n_sweep
        Number of MCMC sweeps to get the initial guess
    beta
        Inverse temperature for the initial MCMC sweep
    tolerance
        Difference in description length to stop MCMC sweep iterations
    collect_marginals
        Whether or not collect node probability of belonging
        to a specific partition.
    deg_corr
        Whether to use degree correction in the minimization step. In many
        real world networks this is the case, although this doesn't seem
        the case for KNN graphs used in scanpy. It is only recorded in the
        stored parameters here; the states are built without it.
    samples
        Number of initial minimizations to be performed. This influences
        also the precision for marginals. Values below 1 are raised to 1.
    n_jobs
        Number of parallel computations used during model initialization
    restrict_to
        Restrict the clustering to the categories within the key for sample
        annotation, tuple needs to contain `(obs_key, list_of_categories)`.
    random_seed
        Random number to be used as seed for graph-tool
    key_added
        `adata.obs` key under which to add the cluster labels.
    adjacency
        Sparse adjacency matrix of the graph, defaults to
        `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or
        `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6
    neighbors_key
        The key passed to `sc.pp.neighbors`
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, the edge weights are collected as edge covariates.
        They are only recorded in the stored parameters here; the states
        are built without them.
    copy
        Whether to copy `adata` or modify it inplace.
    save_model
        If provided, this will be the filename for the PartitionModeState to
        be saved
    dispatch_backend
        Passed as the `prefer` argument of `Parallel`.

    Returns
    -------
    `adata.obs[key_added]`
        Array of dim (number of samples) that stores the subgroup id
        (`'0'`, `'1'`, ...) for each cell.
    `adata.obsm['CM_' + key_added]`
        A `np.ndarray` with cell probability of belonging to a specific
        group, only when `collect_marginals` is set.
    `adata.uns['schist'][key_added]['stats']`
        A dict with the `entropy` and `modularity` of the final state.
    `adata.uns['schist'][key_added]['blocks']`
        A dict with a single key `'0'` holding the block assignments.
    `adata.uns['schist'][key_added]['params']`
        A dict with the parameters used in this analysis.
    """
    # `is not None` so that a seed of 0 is honoured as well
    if random_seed is not None:
        np.random.seed(random_seed)

    if collect_marginals and samples < 100:
        logg.warning(
            'Collecting marginals requires sufficient number of samples\n'
            f'It is now set to {samples} and should be at least 100')

    # clamp before drawing the seeds, so there is one seed per state
    if samples < 1:
        samples = 1
    seeds = np.random.choice(range(samples**2), size=samples, replace=False)

    start = logg.info('minimizing the Planted Partition Block Model')
    adata = adata.copy() if copy else adata
    # are we clustering a user-provided graph or the default AnnData one?
    if adjacency is None:
        if neighbors_key not in adata.uns:
            raise ValueError('You need to run `pp.neighbors` first '
                             'to compute a neighborhood graph.')
        elif 'connectivities_key' in adata.uns[neighbors_key]:
            # scanpy>1.4.6 has matrix in another slot
            conn_key = adata.uns[neighbors_key]['connectivities_key']
            adjacency = adata.obsp[conn_key]
        else:
            # scanpy<=1.4.6 has sparse matrix here
            adjacency = adata.uns[neighbors_key]['connectivities']
    if restrict_to is not None:
        restrict_key, restrict_categories = restrict_to
        adjacency, restrict_indices = restrict_adjacency(
            adata,
            restrict_key,
            restrict_categories,
            adjacency,
        )
    # convert it to igraph and graph-tool
    g = get_igraph_from_adjacency(adjacency, directed=directed)
    g = g.to_graph_tool()
    gt.remove_parallel_edges(g)

    recs = []
    rec_types = []
    if use_weights:
        # this is not ideal to me, possibly we may need to transform
        # weights. More tests needed.
        recs = [g.ep.weight]
        rec_types = ['real-normal']

    # initialize the block states
    def fast_min(state, beta, n_sweep, fast_tol, seed=None):
        # a seed of 0 is valid, hence the explicit None test
        if seed is not None:
            gt.seed_rng(seed)
        dS = 1
        # sweep until the entropy change falls within the tolerance
        while np.abs(dS) > fast_tol:
            dS, _, _ = state.multiflip_mcmc_sweep(beta=beta, niter=n_sweep)
        return state

    states = [gt.PPBlockState(g) for _ in range(samples)]

    # perform a mcmc sweep on each
    # no list comprehension as I need to collect stats
    states = Parallel(n_jobs=n_jobs, prefer=dispatch_backend)(
        delayed(fast_min)(states[x], beta, n_sweep, tolerance, seeds[x])
        for x in range(samples))
    logg.info(' minimization step done', time=start)

    pmode = gt.PartitionModeState([x.get_blocks().a for x in states],
                                  converge=True)
    bs = pmode.get_max(g)
    logg.info(' consensus step done', time=start)

    if save_model:
        import pickle
        fname = save_model
        if not fname.endswith('pkl'):
            fname = f'{fname}.pkl'
        logg.info(f'Saving model into {fname}')
        with open(fname, 'wb') as fout:
            pickle.dump(pmode, fout, 2)

    state = gt.PPBlockState(g, b=bs)
    logg.info(' done', time=start)

    groups = np.array(bs.get_array())
    u_groups = np.unique(groups)
    last_group = np.max(u_groups) + 1
    if collect_marginals:
        # keep only the columns of groups that actually occur and turn the
        # counts over all states into frequencies
        pv_array = pmode.get_marginal(g).get_2d_array(
            range(last_group)).T[:, u_groups] / samples

    # rename groups to ensure they are a continuous range
    rosetta = dict(zip(u_groups, range(len(u_groups))))
    groups = np.array([rosetta[x] for x in groups])

    if restrict_to is not None:
        if key_added == 'ppbm':
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )

    # add column names
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(map(str, np.unique(groups))),
    )

    # now add marginal probabilities.
    if collect_marginals:
        # cell marginals will be a list of arrays with probabilities
        # of belonging to a specific group
        adata.obsm[f"CM_{key_added}"] = pv_array

    # add some unstructured info
    if 'schist' not in adata.uns:
        adata.uns['schist'] = {}

    adata.uns['schist'][f'{key_added}'] = {}
    adata.uns['schist'][f'{key_added}']['stats'] = dict(
        entropy=state.entropy(),
        modularity=gt.modularity(g, state.get_blocks()))

    # record state as list of blocks
    # for compatibility with nested model, use a dictionary with a single key here
    # although a np.array would be ok
    adata.uns['schist'][f'{key_added}']['blocks'] = {
        '0': np.array(state.get_blocks().a)
    }

    # last step is recording some parameters used in this analysis
    adata.uns['schist'][f'{key_added}']['params'] = dict(
        model='planted',
        use_weights=use_weights,
        neighbors_key=neighbors_key,
        key_added=key_added,
        samples=samples,
        collect_marginals=collect_marginals,
        random_seed=random_seed,
        deg_corr=deg_corr,
        recs=recs,
        rec_types=rec_types)

    logg.info(
        ' finished',
        time=start,
        deep=(
            f'found {state.get_B()} clusters and added\n'
            f' {key_added!r}, the cluster labels (adata.obs, categorical)'),
    )
    return adata if copy else None
import graph_tool.all as gt
import matplotlib.pyplot as plt
import numpy as np
import glob
import scipy.stats as sp
import scipy.optimize as optimize
import NodeProbs as NoP

# Load the observed networks and the simulated ones, each sorted by filename.
filenames = sorted(glob.glob("./Realnetworks/tags*_2015.gml"))
Graphs = [gt.load_graph(File) for File in filenames]
filenames = sorted(glob.glob("./Simulatednetworks/SimulatedGraphWeek*.graphml"))
FakeGraphs = [gt.load_graph(File) for File in filenames]

#"""
#Degree-Distributions
for Graph in FakeGraphs:
    gt.remove_parallel_edges(Graph)

# 30 logarithmically spaced values between 0.01 and 1
bins = np.geomspace(0.01, 1, num=30)
# parenthesised so the statement is valid in Python 3 (same output in Python 2)
print(bins)

# Pool the out-degrees of all simulated graphs into one flat list.
FakeDegrees = []
for i, fake_graph in enumerate(FakeGraphs):
    # the first 25+i vertices of the i-th graph are left out
    Dummy = fake_graph.get_out_degrees(fake_graph.get_vertices()[25+i:]).astype("float")
    #Dummy /= Dummy.max()
    FakeDegrees.extend(Dummy) #splicing to deal with the dead nodes
    #FakeDegrees[i]/=FakeDegrees[i].max()

# Pool the out-degrees of all real graphs into one flat list.
RealDegrees = []
for i, real_graph in enumerate(Graphs):
    dummy = real_graph.get_out_degrees(real_graph.get_vertices()).astype("float")
    #dummy /= dummy.max()
    RealDegrees.extend(dummy)
    #RealDegrees[i]/=RealDegrees[i].max()
def useGraphTool(pd):
    """Print statistics about planner data and draw it with graph-tool.

    The planner data is exported to ``graph.graphml``, loaded back with
    graph-tool, summarised on stdout, drawn to ``graph.pdf`` and finally
    saved, together with the drawing properties, to ``mgraph.graphml``.

    Parameters
    ----------
    pd
        Planner data object. It must provide ``printGraphML()``,
        ``isStartVertex(v)``, ``isGoalVertex(v)`` and ``getVertex(v)``;
        the state of a vertex is indexed at 0 and 1 for its position.
    """
    # Extract the graphml representation of the planner data
    graphml = pd.printGraphML()
    # the context manager closes the file even when the write fails
    with open("graph.graphml", 'w') as f:
        f.write(graphml)

    # Load the graphml data using graph-tool
    graph = gt.load_graph("graph.graphml", fmt="xml")
    edgeweights = graph.edge_properties["weight"]

    # Write some interesting statistics
    avgdeg, stddevdeg = gt.vertex_average(graph, "total")
    avgwt, stddevwt = gt.edge_average(graph, edgeweights)

    print("---- PLANNER DATA STATISTICS ----")
    print(
        str(graph.num_vertices()) + " vertices and " +
        str(graph.num_edges()) + " edges")
    print("Average vertex degree (in+out) = " + str(avgdeg) +
          " St. Dev = " + str(stddevdeg))
    print("Average edge weight = " + str(avgwt) +
          " St. Dev = " + str(stddevwt))

    _, hist = gt.label_components(graph)
    print("Strongly connected components: " + str(len(hist)))

    # Make the graph undirected (for weak components, and a simpler drawing)
    graph.set_directed(False)
    _, hist = gt.label_components(graph)
    print("Weakly connected components: " + str(len(hist)))

    # Plotting the graph
    gt.remove_parallel_edges(graph)  # Removing any superfluous edges

    edgeweights = graph.edge_properties["weight"]
    colorprops = graph.new_vertex_property("string")
    vertexsize = graph.new_vertex_property("double")

    start = -1
    goal = -1

    for v in range(graph.num_vertices()):
        # Color and size vertices by type: start, goal, other
        if pd.isStartVertex(v):
            start = v
            colorprops[graph.vertex(v)] = "cyan"
            vertexsize[graph.vertex(v)] = 10
        elif pd.isGoalVertex(v):
            goal = v
            colorprops[graph.vertex(v)] = "green"
            vertexsize[graph.vertex(v)] = 10
        else:
            colorprops[graph.vertex(v)] = "yellow"
            vertexsize[graph.vertex(v)] = 5

    # default edge color is black with size 0.5:
    edgecolor = graph.new_edge_property("string")
    edgesize = graph.new_edge_property("double")
    for e in graph.edges():
        edgecolor[e] = "black"
        edgesize[e] = 0.5

    # using A* to find shortest path in planner data
    if start != -1 and goal != -1:
        _, pred = gt.astar_search(graph, graph.vertex(start), edgeweights)

        # Color edges along shortest path red with size 2.0
        start_vertex = graph.vertex(start)
        v = graph.vertex(goal)
        while v != start_vertex:
            p = graph.vertex(pred[v])
            if p == v:
                # A vertex that is its own predecessor gives no way back
                # towards the start (presumably the goal was not reached);
                # stop here instead of looping forever.
                break
            for e in p.out_edges():
                if e.target() == v:
                    edgecolor[e] = "red"
                    edgesize[e] = 2.0
            v = p

    pos = graph.new_vertex_property("vector<double>")
    for v in range(graph.num_vertices()):
        vtx = pd.getVertex(v)
        st = vtx.getState()
        pos[graph.vertex(v)] = [st[0], st[1]]

    # Writing graph to file:
    # pos indicates the desired vertex positions, and pin=True says that we
    # really REALLY want the vertices at those positions
    # gt.graph_draw(graph, pos=pos, vertex_size=vertexsize, vertex_fill_color=colorprops,
    #               edge_pen_width=edgesize, edge_color=edgecolor,
    #               output="graph.pdf")
    gt.graph_draw(graph, pos=pos, output="graph.pdf")
    print('\nGraph written to graph.pdf')

    # Store the drawing attributes with the graph so they survive the save.
    graph.vertex_properties["pos"] = pos
    graph.vertex_properties["vsize"] = vertexsize
    graph.vertex_properties["vcolor"] = colorprops
    graph.edge_properties["esize"] = edgesize
    graph.edge_properties["ecolor"] = edgecolor

    graph.save("mgraph.graphml")
    print('\nGraph saved to mgraph.graphml')
def calculate_affinity(adata: AnnData,
                       level: int = 1,
                       block_key: Optional[str] = 'nsbm',
                       group_by: Optional[str] = None,
                       state: Optional = None,
                       neighbors_key: Optional[str] = 'neighbors',
                       adjacency: Optional[sparse.spmatrix] = None,
                       directed: bool = False,
                       use_weights: bool = False,
                       obsp: Optional[str] = None,
                       back_prob: bool = False,
                       copy: bool = False) -> Optional[AnnData]:
    """\
    Calculate cell affinity given a partition scheme. It can be used for
    partitions calculated using schist or for any partition scheme, given
    for example by cell annotations.

    Parameters
    ----------
    adata:
        The AnnData object. Should have been already processed with schist
    level:
        The level to calculate affinity. This parameter is effective
        only for Nested partitions
    block_key:
        The prefix for partitions. This parameter is ignored if the state
        is not gt.NestedBlockState
    group_by:
        The key for group names used for calculations. Setting this will
        override level and block_key. It must be a categorical column of
        `adata.obs`.
    state:
        Optionally calculate affinities on this state.
    neighbors_key
        Use neighbors connectivities as adjacency.
        If not specified, .obsp['connectivities'] is used
        (default storage place for pp.neighbors).
        If specified,
        .obsp[.uns[neighbors_key]['connectivities_key']] is used.
        A `neighbors_key` saved with the partition takes precedence.
    adjacency
        Sparse adjacency matrix of the graph, defaults to neighbors
        connectivities.
    directed
        Whether to treat the graph as directed or undirected.
    use_weights
        If `True`, edge weights from the graph are used in the computation
        (placing more emphasis on stronger edges). A `use_weights` value
        saved with the partition takes precedence.
    obsp
        Use .obsp[obsp] as adjacency when no saved partition is found.
    back_prob
        If `True`, affinities are computed with `get_cell_back_p` instead
        of `get_cell_loglikelihood(..., as_prob=True)`.
    copy:
        Return a new object or do everything in place

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with affinity values
    in adata.obsm[f'CA_{block_key}_level_{level}'], in
    adata.obsm[f'CA_{group_by}'] when `group_by` is given, or in
    adata.obsm['CA_ppbm'] for a gt.PPBlockState.

    Raises
    ------
    ValueError
        If no graph can be obtained, if `group_by` is not a categorical
        column of `adata.obs`, if the requested level does not exist in
        `adata.obs`, or if the state type is not supported without
        `group_by`.
    """
    # work on a copy when requested, so the input object is left untouched
    adata = adata.copy() if copy else adata

    matrix_key = f'CA_{block_key}_level_{level}'  # the default name of the matrix
    if group_by:
        logg.info(f'Calculating cell affinity to {group_by}')
    else:
        logg.info(f'Calculating cell affinity to level {level}')

    if state is None:
        # if no state is provided, use the default to retrieve graph
        schist_uns = adata.uns['schist'] if 'schist' in adata.uns else {}
        entry_key = f'{block_key}'
        # a missing block_key falls back to the adjacency instead of
        # failing with a KeyError
        saved = schist_uns[entry_key] if entry_key in schist_uns else {}
        if 'blocks' in saved:
            params = saved['params']
            if 'neighbors_key' in params:
                neighbors_key = params['neighbors_key']
            if 'use_weights' in params:
                use_weights = params['use_weights']
            # same default as state_from_blocks when nothing was saved
            deg_corr = params['deg_corr'] if 'deg_corr' in params else True
            state = state_from_blocks(adata,
                                      state_key=block_key,
                                      neighbors_key=neighbors_key,
                                      adjacency=adjacency,
                                      directed=directed,
                                      use_weights=use_weights,
                                      deg_corr=deg_corr)
            g = state.g
        elif adjacency is None and not neighbors_key and obsp is None:
            # no state and no adjacency provided, raise an error
            raise ValueError("A state or an adjacency matrix should be given. "
                             "Otherwise a graph cannot be computed")
        else:
            # get the graph from the adjacency, honouring one given by the
            # caller
            if adjacency is None:
                adjacency = _choose_graph(adata, obsp, neighbors_key)
            g = get_igraph_from_adjacency(adjacency, directed=directed)
            g = g.to_graph_tool()
            gt.remove_parallel_edges(g)
            state = gt.BlockState(g)
    else:
        g = state.g

    if group_by:
        matrix_key = f'CA_{group_by}'
        # if groups are given, we generate a new BlockState and work on that
        is_categorical = (group_by in adata.obs.columns
                          and adata.obs[group_by].dtype.name == 'category')
        if not is_categorical:
            raise ValueError(
                f"{group_by} should be a categorical entry in adata.obs")
        partitions = adata.obs[group_by].cat.codes.values
        state = gt.BlockState(g, b=partitions)
        if back_prob:
            ca_matrix = get_cell_back_p(state)
        else:
            ca_matrix = get_cell_loglikelihood(state, as_prob=True)
    elif isinstance(state, gt.NestedBlockState):
        # use precomputed blocks and states
        group_col = f'{block_key}_level_{level}'
        if group_col not in adata.obs.columns:
            raise ValueError(
                "The provided groups or level/blocks do not exist")
        if back_prob:
            p0 = get_cell_back_p(state, level=0)
        else:
            p0 = get_cell_loglikelihood(state, level=0, as_prob=True)
        # map the level 0 affinities onto the groups of the requested level
        g0 = pd.Categorical(state.project_partition(0, 0).a)
        cross_tab = pd.crosstab(g0, adata.obs[group_col], normalize='index')
        ca_matrix = (p0 @ cross_tab).values
    elif isinstance(state, gt.PPBlockState):
        if back_prob:
            ca_matrix = get_cell_back_p(state)
        else:
            ca_matrix = get_cell_loglikelihood(state, as_prob=True)
        matrix_key = 'CA_ppbm'
    else:
        # previously this path failed on an unbound `ca_matrix`
        raise ValueError(
            f"Cannot calculate affinity for a state of type "
            f"{type(state).__name__} unless `group_by` is given")

    adata.obsm[matrix_key] = ca_matrix

    return adata if copy else None