def graph(args):
    """
    %prog graph best.edges

    Convert Celera Assembler's "best.edges" to a GEXF which can be used to
    feed into Gephi to check the topology of the best overlapping graph. Mutual
    best edges are represented as thicker edges.

    Reference:
    https://github.com/PacificBiosciences/Bioinformatics-Training/blob/master/scripts/CeleraToGephi.py
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so maintainer documentation lives in comments rather than in it.
    #
    # args: list of command-line tokens. After option parsing exactly one
    #       positional argument (the best.edges file) must remain; otherwise
    #       the help text is printed and the process exits.
    # Side effects: writes `best[.<query>].gexf` relative to the working
    #       directory, logs progress, and (when a query is active) prints a
    #       per-contig node tally to stderr.
    p = OptionParser(graph.__doc__)
    p.add_option(
        "--query",
        default=-1,
        type="int",
        help="Search from node, -1 to select random node, 0 to disable",
    )
    p.add_option("--contig", help="Search from contigs, use comma to separate")
    p.add_option("--largest", default=0, type="int", help="Only show largest components")
    p.add_option("--maxsize", default=500, type="int", help="Max graph size")
    p.add_option(
        "--nomutualbest",
        default=False,
        action="store_true",
        help="Do not plot mutual best edges as heavy",
    )
    # `opts.maxerr` and `opts.frgctg` are read below but not registered here,
    # so they presumably come from this helper -- confirm against its source.
    add_graph_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bestedges,) = args
    query = opts.query
    contig = opts.contig
    largest = opts.largest
    frgctg = opts.frgctg
    edgeweight = not opts.nomutualbest
    G = read_graph(bestedges, maxerr=opts.maxerr)

    if largest:
        # NetworkX >= 1.9 yields components in no guaranteed order, so sort by
        # node count to make sure the slice below really keeps the largest.
        components = sorted(
            nx.connected_component_subgraphs(G), key=len, reverse=True
        )
        retained = min(len(components), largest)
        logging.debug(
            "{0} components found, {1} retained".format(len(components), retained)
        )
        G = nx.Graph()
        for component in components[:retained]:
            # Carry edge attributes along (unless --nomutualbest), otherwise
            # the `data=edgeweight` request further down has nothing to copy.
            G.add_edges_from(component.edges(data=edgeweight))

    if query:
        if query == -1:
            query = choice(G.nodes())
        reads_to_ctgs = parse_ctgs(bestedges, frgctg)
        if contig:
            contigs = set(contig.split(","))
            core = [k for k, v in reads_to_ctgs.items() if v in contigs]
        else:
            ctg = reads_to_ctgs.get(query)
            core = [k for k, v in reads_to_ctgs.items() if v == ctg]
            logging.debug(
                "Reads ({0}) extended from the same contig {1}".format(
                    len(core), ctg
                )
            )

        # Extract a local neighborhood
        SG = nx.Graph()
        H = graph_local_neighborhood(G, query=core, maxsize=opts.maxsize)
        SG.add_edges_from(H.edges(data=edgeweight))
        G = SG

        # Label every node with its contig ("na" when the read is not in the
        # mapping) and tally how many nodes each contig contributes.
        tally = Counter()
        for node, attrib in G.nodes_iter(data=True):
            label = reads_to_ctgs.get(node, "na")
            attrib["label"] = label
            tally[label] += 1
        cc = ["{0}({1})".format(k, v) for k, v in tally.most_common()]
        print("Contigs: {0}".format(" ".join(cc)), file=sys.stderr)

    gexf = "best"
    if query >= 0:
        gexf += ".{0}".format(query)
    gexf += ".gexf"
    nx.write_gexf(G, gexf)
    logging.debug(
        "Graph written to `{0}` (|V|={1}, |E|={2})".format(gexf, len(G), G.size())
    )
def graph(args):
    """
    %prog graph best.edges

    Convert Celera Assembler's "best.edges" to a GEXF which can be used to
    feed into Gephi to check the topology of the best overlapping graph. Mutual
    best edges are represented as thicker edges.

    Reference:
    https://github.com/PacificBiosciences/Bioinformatics-Training/blob/master/scripts/CeleraToGephi.py
    """
    # NOTE: the docstring is passed to OptionParser as the usage string; keep
    # maintainer notes in comments so the help output stays unchanged.
    #
    # args: command-line tokens; one positional argument (best.edges) is
    #       required, otherwise help is printed and the process exits.
    # Output: a GEXF file named `best[.<query>].gexf`, plus a contig summary
    #       on stderr whenever a query neighborhood is extracted.
    p = OptionParser(graph.__doc__)
    p.add_option("--query", default=-1, type="int",
                 help="Search from node, -1 to select random node, 0 to disable")
    p.add_option("--contig", help="Search from contigs, use comma to separate")
    p.add_option("--largest", default=0, type="int",
                 help="Only show largest components")
    p.add_option("--maxsize", default=500, type="int", help="Max graph size")
    p.add_option("--nomutualbest", default=False, action="store_true",
                 help="Do not plot mutual best edges as heavy")
    # Presumably registers --maxerr / --frgctg, which are read below but not
    # declared in this function -- verify against the helper.
    add_graph_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    query = opts.query
    contig = opts.contig
    largest = opts.largest
    frgctg = opts.frgctg
    edgeweight = not opts.nomutualbest
    G = read_graph(bestedges, maxerr=opts.maxerr)

    if largest:
        # Component order is not guaranteed on NetworkX >= 1.9; sort by node
        # count so that "largest" actually selects the biggest components.
        subgraphs = sorted(nx.connected_component_subgraphs(G),
                           key=len, reverse=True)
        keep = min(len(subgraphs), largest)
        logging.debug("{0} components found, {1} retained".format(len(subgraphs), keep))
        G = nx.Graph()
        for sub in subgraphs[:keep]:
            # Preserve edge attributes unless --nomutualbest was requested;
            # bare edge tuples would drop them before the export.
            G.add_edges_from(sub.edges(data=edgeweight))

    if query:
        if query == -1:
            query = choice(G.nodes())
        reads_to_ctgs = parse_ctgs(bestedges, frgctg)
        if contig:
            contigs = set(contig.split(","))
            core = [k for k, v in reads_to_ctgs.items() if v in contigs]
        else:
            ctg = reads_to_ctgs.get(query)
            core = [k for k, v in reads_to_ctgs.items() if v == ctg]
            logging.debug("Reads ({0}) extended from the same contig {1}".format(len(core), ctg))

        # Extract a local neighborhood
        SG = nx.Graph()
        H = graph_local_neighborhood(G, query=core, maxsize=opts.maxsize)
        SG.add_edges_from(H.edges(data=edgeweight))
        G = SG

        # Annotate nodes with their contig ("na" if unmapped) and count them.
        seen = []
        for n, attrib in G.nodes_iter(data=True):
            node_contig = reads_to_ctgs.get(n, "na")
            attrib["label"] = node_contig
            seen.append(node_contig)
        counts = Counter(seen)
        cc = ["{0}({1})".format(k, v) for k, v in counts.most_common()]
        # The former `print >> sys.stderr, ...` statement form raises TypeError
        # once print is a function; use the function form explicitly.
        print("Contigs: {0}".format(" ".join(cc)), file=sys.stderr)

    gexf = "best"
    if query >= 0:
        gexf += ".{0}".format(query)
    gexf += ".gexf"
    nx.write_gexf(G, gexf)
    logging.debug("Graph written to `{0}` (|V|={1}, |E|={2})".format(gexf, len(G), G.size()))
def graph(args):
    """
    %prog graph best.edges

    Convert Celera Assembler's "best.edges" to a GEXF which can be used to
    feed into Gephi to check the topology of the best overlapping graph.

    Reference:
    https://github.com/PacificBiosciences/Bioinformatics-Training/blob/master/scripts/CeleraToGephi.py
    """
    # NOTE: the docstring is used verbatim as the OptionParser usage text, so
    # implementation notes are kept in comments.
    #
    # args: command-line tokens; exactly one positional argument (best.edges)
    #       must remain after option parsing, else help is shown and we exit.
    # Side effects: caches the parsed graph as a pickle next to the derived
    #       `<prefix>.err<maxerr>.graph` name and writes `best[.<query>].gexf`.
    import networkx as nx
    from jcvi.algorithms.graph import graph_stats, graph_local_neighborhood

    p = OptionParser(graph.__doc__)
    p.add_option("--maxerr", default=100, type="int", help="Maximum error rate")
    p.add_option("--query", default=-1, type="int", help="Search from node")
    p.add_option("--largest", default=1, type="int",
                 help="Only show largest components")
    p.add_option("--maxsize", default=100, type="int", help="Max graph size")
    p.add_option("--contigs", help="Annotate graph with contig membership, "
                 " typically from `asm.posmap.frgctg`")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    maxerr = opts.maxerr
    query = opts.query
    largest = opts.largest
    logging.debug("Max error = {0}%".format(maxerr))
    # NOTE(review): the prefix is everything before the first "." in the
    # given path, so paths such as "./best.edges" yield an empty prefix.
    bestgraph = bestedges.split(".")[0] + ".err{0}.graph".format(maxerr)
    if need_update(bestedges, bestgraph):
        G = nx.Graph()
        # Context manager so the input handle is closed even on a parse error.
        with open(bestedges) as fp:
            for row in fp:
                if row[0] == '#':
                    continue
                id1, lib_id, best5, o1, best3, o3, j1, j2 = row.split()
                id1, best5, best3 = int(id1), int(best5), int(best3)
                j1, j2 = float(j1), float(j2)
                if j1 < maxerr or j2 < maxerr:
                    G.add_node(id1)
                # The ids are ints at this point; comparing against the string
                # '0' was always unequal, so the 0 sentinel was never skipped
                # and edges to a bogus node 0 were added.
                if best5 != 0 and j1 < maxerr:
                    G.add_edge(best5, id1)
                if best3 != 0 and j2 < maxerr:
                    G.add_edge(id1, best3)
        nx.write_gpickle(G, bestgraph)
        logging.debug("Graph pickled to `{0}`".format(bestgraph))

    logging.debug("Read graph from `{0}`".format(bestgraph))
    # NOTE(review): this unpickles whatever file sits at `bestgraph`; only
    # safe as long as that cache path is trusted.
    G = nx.read_gpickle(bestgraph)
    graph_stats(G)

    # Big graphs are reduced to a neighborhood around `query` before export.
    if len(G) > 10000:
        SG = nx.Graph()
        H = graph_local_neighborhood(G, query=query, maxsize=opts.maxsize)
        SG.add_edges_from(H.edges())
        G = SG

    if largest > 1:  # only works for un-directed graph
        # Materialize and sort: newer NetworkX 1.x releases return an unsorted
        # generator here, which supports neither len() nor slicing.
        H = sorted(nx.connected_component_subgraphs(G), key=len, reverse=True)
        c = min(len(H), largest)
        logging.debug("{0} components found, {1} retained".format(len(H), c))
        G = nx.Graph()
        for x in H[:c]:
            G.add_edges_from(x.edges())

    if opts.contigs:
        reads_to_ctgs = parse_ctgs(bestedges, opts.contigs)
        annotate_contigs(G, reads_to_ctgs)

    gexf = "best"
    if query >= 0:
        gexf += ".{0}".format(query)
    gexf += ".gexf"
    nx.write_gexf(G, gexf)
    logging.debug("Graph written to `{0}` (|V|={1}, |E|={2})".format(
        gexf, len(G), G.size()))