示例#1
0
def load_mcl_cluster(_run_id, cluster_file):
    """Load cluster file from mcl into homology database.

    Every line of `cluster_file` is treated as one cluster of
    whitespace-separated tokens. Only tokens whose first character is a digit
    are kept as sequence ids. Any rows already stored in the `homology` table
    for `_run_id` are deleted first, and the delete plus all inserts are
    wrapped in a single BEGIN/COMMIT pair.

    Clusters with fewer than 4 retained members are counted in the size
    histogram but are not written to the database. Component ids are assigned
    consecutively (starting at 0) to the clusters that are written.

    Args:
        _run_id: run identifier stored in the `run_id` column.
        cluster_file: path of the cluster file to read.
    """

    nseqs = 0
    hist = {}

    database.execute("BEGIN")
    database.execute("DELETE FROM homology WHERE run_id=?;", (_run_id,))
    with open(cluster_file, "r") as f:
        cluster_id = 0
        for line in f:
            # Build a real list: the size is needed right away, and calling
            # len() on the result of filter() only works where filter()
            # returns a list (it is a lazy iterator on Python 3).
            cluster = [s for s in line.split() if s[0].isdigit()]
            n = len(cluster)
            hist[n] = hist.get(n, 0) + 1
            if n >= 4:
                nseqs += n
                for seq_id in cluster:
                    database.execute(
                        """
						INSERT INTO homology (run_id, component_id, sequence_id)
						VALUES (?,?,?);""",
                        (_run_id, cluster_id, seq_id),
                    )
                # Only stored clusters consume a component id.
                cluster_id += 1
    database.execute("COMMIT")

    utils.info("histogram of gene cluster sizes:\n", "\n ".join("%d\t:\t%d" % (k, hist[k]) for k in sorted(hist)))

    diagnostics.log("nseqs", nseqs)
    diagnostics.log("histogram", hist)
示例#2
0
def load_orthofinder_cluster(_run_id, ogfile):
    """Load cluster file from mcl (via orthofinder) into homology database.

    Every line of `ogfile` is treated as one cluster. The line is split on
    '@' and whitespace, and only the purely numeric tokens are kept as
    sequence ids (presumably entries look like `<label>@<sequence id>`;
    verify against the file that orthofinder actually produces). Any rows
    already stored in the `homology` table for `_run_id` are deleted first,
    and the delete plus all inserts are wrapped in a single BEGIN/COMMIT pair.

    Clusters with fewer than 4 retained members are counted in the size
    histogram but are not written to the database. Component ids are assigned
    consecutively (starting at 0) to the clusters that are written.

    Args:
        _run_id: run identifier stored in the `run_id` column.
        ogfile: path of the cluster file to read.
    """

    nseqs = 0
    hist = {}

    database.execute("BEGIN")
    database.execute("DELETE FROM homology WHERE run_id=?;", (_run_id,))
    with open(ogfile, 'r') as f:
        cluster_id = 0
        for line in f:
            # Build a real list: the size is needed right away, and calling
            # len() on the result of filter() only works where filter()
            # returns a list (it is a lazy iterator on Python 3).
            # str.isdigit() is False for '', so the empty strings produced by
            # re.split() around adjacent separators are dropped as well.
            cluster = [s for s in re.split(r'[@\s]', line) if s.isdigit()]
            n = len(cluster)
            hist[n] = hist.get(n, 0) + 1
            if n >= 4:
                nseqs += n
                for seq_id in cluster:
                    database.execute("""
                        INSERT INTO homology (run_id, component_id, sequence_id)
                        VALUES (?,?,?);""",
                        (_run_id, cluster_id, seq_id))
                # Only stored clusters consume a component id.
                cluster_id += 1
    database.execute("COMMIT")

    utils.info(
        "histogram of gene cluster sizes:\n",
        '\n '.join("%d\t:\t%d" % (k, hist[k]) for k in sorted(hist)))

    diagnostics.log('nseqs', nseqs)
    diagnostics.log('histogram', hist)
示例#3
0
def parse_edges(_run_id, seq_type, blast_hits, min_overlap, min_bitscore, min_nodes):
    """Parse BLAST hits into edges weighted by bitscore.

    Reads every file matching the glob pattern `blast_hits`. Each non-blank
    line must hold five whitespace-separated fields:

        qseqid sseqid bitscore qlen length

    A hit becomes an edge of an undirected graph when it is not a self hit,
    its overlap proportion (length / qlen) is greater than `min_overlap`, and
    its BLAST score ratio value (bitscore divided by the bitscore of the
    first hit listed for the same query) is greater than `min_bitscore`.
    When two sequences are connected by several hits, the edge keeps the
    highest bitscore.

    The edges of every connected component with at least `min_nodes` nodes
    are written to "allvall_edges_<_run_id>.abc" as tab-separated
    "from, to, score" lines.

    Args:
        _run_id: run identifier, used to build the edge file name.
        seq_type: not used by this function; kept for the existing signature.
        blast_hits: glob pattern selecting the tabular BLAST hit files.
        min_overlap: exclusive lower bound on length / qlen.
        min_bitscore: exclusive lower bound on the score ratio value.
        min_nodes: smallest connected component that is written out.
    """

    graph = nx.Graph()
    # NOTE: this local must keep the name `edge_file`; ingest() below is given
    # the name of the variable, not its value.
    edge_file = "allvall_edges_%s.abc" % _run_id

    nseqs = 0
    nedges = {"all": 0, "non-self": 0, "passed-overlap": 0, "passed-bitscore": 0, "passed-bitscore-unique": 0}
    max_bitscore = None
    last_query = ""  # For identification of a new query in the table

    for hits_path in glob(blast_hits):
        # Use a context manager so each hit file is closed once it is read.
        with open(hits_path) as hits:
            for line in hits:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines instead of failing on the unpack.
                    continue
                # fields:
                # qseqid sseqid bitscore qlen length
                id_from, id_to, bitscore, qlen, length = fields
                # Sometimes blast outputs a bad query id if the query is
                # longer than 10Kb
                if id_from.startswith("Query_"):
                    utils.info("discarding bad hit with query id '%s'" % id_from)
                    continue
                bitscore = float(bitscore)
                # From here on `length` is the overlap proportion of the
                # query, so no nucleotide vs. amino acid correction is applied.
                length = float(length) / float(qlen)
                nedges["all"] += 1
                if id_from != last_query:
                    # The self-hit is not always at the top, as the file is
                    # sorted by e-value and ties happen; however the self-hit
                    # always has the same bitscore as the first hit.
                    max_bitscore = bitscore
                    last_query = id_from
                # Filter out self hits, short hits, and low scoring hits
                if id_from == id_to:
                    continue
                nedges["non-self"] += 1
                if not length > min_overlap:
                    continue
                nedges["passed-overlap"] += 1
                # SRV: BLAST Score Ratio Values. Blom et al. BMC
                # Bioinformatics 2009, 10:154 doi:10.1186/1471-2105-10-154
                srv = bitscore / max_bitscore
                if not srv > min_bitscore:
                    continue
                nedges["passed-bitscore"] += 1
                # If an edge already exists between the nodes, update its
                # score to be the max. Subscript access returns the edge
                # attribute dict on both NetworkX 1.x and 2.x.
                if graph.has_edge(id_from, id_to):
                    attrs = graph[id_from][id_to]
                    attrs["score"] = max(attrs["score"], bitscore)
                else:
                    nedges["passed-bitscore-unique"] += 1
                    # add_edge() also creates any missing end point.
                    graph.add_edge(id_from, id_to, score=bitscore)

    diagnostics.prefix.append("nedges")
    diagnostics.log_dict(nedges)
    diagnostics.prefix.pop()

    with open(edge_file, "w") as out:
        # connected_components() + subgraph() is available on NetworkX 1.x
        # and 2.x, unlike connected_component_subgraphs().
        for component in nx.connected_components(graph):
            nnodes = len(component)
            if nnodes < min_nodes:
                continue
            nseqs += nnodes
            for id_from, id_to, attrs in graph.subgraph(component).edges(data=True):
                out.write("%s\t%s\t%f\n" % (id_from, id_to, attrs["score"]))

    if not nseqs:
        # NOTE(review): the message mentions a FASTA file although this
        # function writes the edge file; text left unchanged.
        utils.die("no sequences were written to the FASTA file")

    diagnostics.log("nseqs", nseqs)

    ingest("edge_file")