def load_mcl_cluster(_run_id, cluster_file):
    """Load cluster file from mcl into homology database.

    Rows already stored in the homology table for `_run_id` are deleted
    first, and the delete plus all inserts run inside one transaction.

    Each line of `cluster_file` is treated as one cluster: the line is split
    on whitespace and only tokens whose first character is a digit are kept
    as sequence ids.  Every cluster is counted in the size histogram, but
    only clusters with at least 4 members are written to the database.

    After loading, the histogram is reported through `utils.info` and both
    the number of stored sequences and the histogram are passed to
    `diagnostics.log`.

    Args:
        _run_id: value stored in the `run_id` column of every inserted row.
        cluster_file: path of the mcl cluster file to read.

    Raises:
        Any exception raised while reading the file or executing SQL; the
        transaction is rolled back before the exception propagates.
    """
    nseqs = 0
    hist = {}

    database.execute("BEGIN")
    try:
        database.execute("DELETE FROM homology WHERE run_id=?;", (_run_id,))
        with open(cluster_file, "r") as f:
            cluster_id = 0
            for line in f:
                # Build a real list: the cluster is measured with len() and
                # then iterated, which a lazy filter object cannot support.
                cluster = [s for s in line.rstrip().split() if s[0].isdigit()]
                n = len(cluster)
                hist[n] = hist.get(n, 0) + 1
                if n < 4:
                    # Small clusters only contribute to the histogram.
                    continue
                nseqs += n
                for seq_id in cluster:
                    database.execute(
                        """
                        INSERT INTO homology (run_id, component_id, sequence_id)
                        VALUES (?,?,?);""",
                        (_run_id, cluster_id, seq_id),
                    )
                # Component ids number the stored clusters consecutively;
                # skipped clusters do not consume an id.
                cluster_id += 1
    except Exception:
        # Undo the DELETE and any partial inserts so the table is not left
        # half-loaded and the connection is not stuck inside a transaction.
        database.execute("ROLLBACK")
        raise
    else:
        database.execute("COMMIT")

    utils.info(
        "histogram of gene cluster sizes:\n",
        "\n ".join("%d\t:\t%d" % (k, hist[k]) for k in sorted(hist)),
    )

    diagnostics.log("nseqs", nseqs)
    diagnostics.log("histogram", hist)
def load_orthofinder_cluster(_run_id, ogfile):
    """Load cluster file from mcl (via orthofinder) into homology database.

    Rows already stored in the homology table for `_run_id` are deleted
    first, and the delete plus all inserts run inside one transaction.

    Each line of `ogfile` is treated as one cluster: the line is split on
    every "@" and whitespace character, and only tokens made up entirely of
    digits are kept as sequence ids (presumably entries look like
    `<label>@<sequence_id>` -- confirm against the orthofinder output).
    Every cluster is counted in the size histogram, but only clusters with
    at least 4 members are written to the database.

    After loading, the histogram is reported through `utils.info` and both
    the number of stored sequences and the histogram are passed to
    `diagnostics.log`.

    Args:
        _run_id: value stored in the `run_id` column of every inserted row.
        ogfile: path of the orthogroup/cluster file to read.

    Raises:
        Any exception raised while reading the file or executing SQL; the
        transaction is rolled back before the exception propagates.
    """
    nseqs = 0
    hist = {}

    database.execute("BEGIN")
    try:
        database.execute("DELETE FROM homology WHERE run_id=?;", (_run_id,))
        with open(ogfile, "r") as f:
            cluster_id = 0
            for line in f:
                # Build a real list: the cluster is measured with len() and
                # then iterated, which a lazy filter object cannot support.
                # Empty tokens produced by the split fail isdigit() and are
                # dropped here as well.
                cluster = [s for s in re.split(r'[@\s]', line) if s.isdigit()]
                n = len(cluster)
                hist[n] = hist.get(n, 0) + 1
                if n < 4:
                    # Small clusters only contribute to the histogram.
                    continue
                nseqs += n
                for seq_id in cluster:
                    database.execute(
                        """
                        INSERT INTO homology (run_id, component_id, sequence_id)
                        VALUES (?,?,?);""",
                        (_run_id, cluster_id, seq_id),
                    )
                # Component ids number the stored clusters consecutively;
                # skipped clusters do not consume an id.
                cluster_id += 1
    except Exception:
        # Undo the DELETE and any partial inserts so the table is not left
        # half-loaded and the connection is not stuck inside a transaction.
        database.execute("ROLLBACK")
        raise
    else:
        database.execute("COMMIT")

    utils.info(
        "histogram of gene cluster sizes:\n",
        '\n '.join("%d\t:\t%d" % (k, hist[k]) for k in sorted(hist)),
    )

    diagnostics.log('nseqs', nseqs)
    diagnostics.log('histogram', hist)
def parse_edges(_run_id, seq_type, blast_hits, min_overlap, min_bitscore, min_nodes):
    """Parse BLAST hits into edges weighted by bitscore.

    Every file matching the glob pattern `blast_hits` is read line by line.
    Each non-blank line must hold five whitespace-separated fields:

        qseqid sseqid bitscore qlen length

    A hit becomes (or updates) an undirected graph edge when it

    * is not a self hit (`qseqid != sseqid`),
    * has an overlap, `length / qlen`, greater than `min_overlap`, and
    * has a score ratio value (SRV) greater than `min_bitscore`, where the
      SRV is the hit's bitscore divided by the bitscore of the first hit
      seen for that query (Blom et al. BMC Bioinformatics 2009, 10:154
      doi:10.1186/1471-2105-10-154).

    Edges of every connected component with at least `min_nodes` nodes are
    then written, tab separated, to "allvall_edges_<_run_id>.abc".

    Args:
        _run_id: run identifier, used to name the edge file.
        seq_type: currently unused; the overlap is already a proportion of
            the query length, so no nucleotide/amino-acid length correction
            is applied.
        blast_hits: glob pattern selecting the tabular BLAST hit files.
        min_overlap: exclusive lower bound on `length / qlen`.
        min_bitscore: exclusive lower bound on the SRV.
        min_nodes: minimum component size for its edges to be written.

    Raises:
        ValueError: if a non-blank line does not have exactly five fields or
            a numeric field cannot be parsed.
    """
    graph = nx.Graph()
    # NOTE: `ingest` at the end is handed this variable's *name*, not its
    # value, so it presumably looks the name up in this scope -- keep this
    # local called `edge_file`.
    edge_file = "allvall_edges_%s.abc" % _run_id
    nseqs = 0
    nedges = {
        "all": 0,
        "non-self": 0,
        "passed-overlap": 0,
        "passed-bitscore": 0,
        "passed-bitscore-unique": 0,
    }
    max_bitscore = None
    last_query = ""  # For identification of a new query in the table

    for path in glob(blast_hits):
        # Use a context manager so each hit file is closed as soon as it has
        # been read instead of lingering until garbage collection.
        with open(path) as hits:
            for line in hits:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                id_from, id_to, bitscore, qlen, length = fields

                # Sometimes blast outputs a bad query id if the query is
                # longer than 10Kb
                if id_from.startswith("Query_"):
                    utils.info("discarding bad hit with query id '%s'" % id_from)
                    continue

                bitscore = float(bitscore)
                # Proportion of the query covered by the alignment.
                overlap = float(length) / float(qlen)

                nedges["all"] += 1

                if id_from != last_query:
                    # The self hit is not always at the top, because the
                    # file is sorted by e-value and ties happen; however the
                    # self hit always has the same bitscore as the first hit,
                    # so the first hit's score serves as the reference.
                    max_bitscore = bitscore
                    last_query = id_from

                # Filter out self hits, short hits, and low scoring hits
                if id_from == id_to:
                    continue
                nedges["non-self"] += 1

                if overlap > min_overlap:
                    nedges["passed-overlap"] += 1
                    srv = bitscore / max_bitscore
                    if srv > min_bitscore:
                        nedges["passed-bitscore"] += 1
                        # If an edge already exists between the nodes,
                        # update its score to be the max
                        if graph.has_edge(id_from, id_to):
                            e = graph[id_from][id_to]
                            e["score"] = max(e["score"], bitscore)
                        else:
                            nedges["passed-bitscore-unique"] += 1
                            # add_edge creates both end nodes as needed.
                            graph.add_edge(id_from, id_to, score=bitscore)

    diagnostics.prefix.append("nedges")
    diagnostics.log_dict(nedges)
    diagnostics.prefix.pop()

    with open(edge_file, "w") as out:
        for component in nx.connected_components(graph):
            subgraph = graph.subgraph(component)
            nnodes = subgraph.number_of_nodes()
            if nnodes >= min_nodes:
                nseqs += nnodes
                for id_from, id_to, data in subgraph.edges(data=True):
                    out.write("%s\t%s\t%f\n" % (id_from, id_to, data["score"]))

    if not nseqs:
        utils.die("no sequences were written to the edge file")

    diagnostics.log("nseqs", nseqs)

    ingest("edge_file")