Example #1
def extract_homologs(tmp_dir):
    # create outdir
    seqs_dir = f"{tmp_dir}/markers"
    if not os.path.isdir(seqs_dir):
        os.makedirs(seqs_dir)
    # fetch best hits from hmmsearch
    gene_to_aln = utilities.fetch_hmm_best_hits(f"{tmp_dir}/phyeco.hmmsearch")
    # open output files
    outfiles = {}
    marker_ids = set(aln["qacc"] for aln in gene_to_aln.values())
    for marker_id in marker_ids:
        outfiles[marker_id] = {}
        outfiles[marker_id]["ffn"] = open(f"{seqs_dir}/{marker_id}.ffn", "w")
        outfiles[marker_id]["faa"] = open(f"{seqs_dir}/{marker_id}.faa", "w")
    # write seqs
    for ext in ["ffn", "faa"]:
        in_path = f"{tmp_dir}/genes.{ext}"
        for id, seq in utilities.parse_fasta(in_path):
            if id in gene_to_aln:
                marker_id = gene_to_aln[id]["qacc"]
                seq = seq.rstrip("*")
                outfiles[marker_id][ext].write(">" + id + "\n" + seq + "\n")
    # close files
    for marker_id in outfiles:
        outfiles[marker_id]["ffn"].close()
        outfiles[marker_id]["faa"].close()
Example #2
def find_contig_targets(args, genomes, alignments):
    contigs = {
        id: {"hits": 0, "len": len(seq), "genomes": []}
        for id, seq in utilities.parse_fasta(args["fna"])
    }
    for genome, alns in zip(genomes, alignments):
        hits = id_blast_hits(alns, args["contig_aln"], args["contig_pid"])
        for contig in hits:
            contigs[contig]["hits"] += 1
            contigs[contig]["genomes"].append(genome[0])
    for id in contigs:
        hit_rate = contigs[id]["hits"] / float(len(alignments))
        contigs[id]["hit_rate"] = hit_rate
    return contigs
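id_blast_hits is not shown; a plausible sketch under the assumption that it keeps query contigs with at least one alignment passing both the alignment-length and percent-identity cutoffs (the field names aln, pid, and qname are assumptions about the alignment records):

def id_blast_hits(alns, min_aln, min_pid):
    # hypothetical: return the set of query contigs with >= 1 alignment
    # spanning at least min_aln bp at >= min_pid percent identity
    hits = set()
    for aln in alns:
        if int(aln["aln"]) >= min_aln and float(aln["pid"]) >= min_pid:
            hits.add(aln["qname"])
    return hits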
Example #3
def main(args):
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["coverm"])
    print("\u001b[1m" + "• Computing contig coverage" + "\u001b[0m")
    utilities.run_coverm(args["bams"], args["tmp_dir"], args["threads"])
    coverage_df = pd.read_csv(f"{args['tmp_dir']}/coverage.tsv",
                              sep="\t",
                              index_col=0)
    contig_id_list = []
    contig_length_list = []
    for id, seq in utilities.parse_fasta(args["fna"]):
        contig_id_list.append(id)
        contig_length_list.append(len(seq))
    contig_coverage_df = coverage_df.loc[contig_id_list]
    largest_mean_coverage_sample = contig_coverage_df.mean(axis=0).idxmax()
    if contig_coverage_df.shape[1] > 1:
        print(
            "\u001b[1m" +
            f"\n• Sample being used for outlier detection: {largest_mean_coverage_sample.split()[0]}"
            + "\u001b[0m")
    contig_coverage_df = contig_coverage_df.loc[:,
                                                largest_mean_coverage_sample]
    if contig_coverage_df.mean() < 1:
        sys.exit(
            "\nError: The average coverage is less than 1 in all the supplied BAM files"
        )
    if args["weighted_mean"]:
        print(
            "\u001b[1m" +
            "\n• Computing per-contig deviation from the weighted mean coverage"
            + "\u001b[0m")
        reference = np.average(contig_coverage_df.values,
                               weights=contig_length_list)
    else:
        print("\u001b[1m" +
              "\n• Computing per-contig deviation from the mean coverage" +
              "\u001b[0m")
        reference = contig_coverage_df.mean()
    outliers = ((contig_coverage_df / reference) >= args["max_deviation"]) | (
        (contig_coverage_df / reference) <= 1 / args["max_deviation"])
    print("\u001b[1m" + "\n• Identifying outlier contigs" + "\u001b[0m")
    flagged = contig_coverage_df.loc[outliers].index.tolist()
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
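A toy trace of the symmetric ratio test used above: a contig is flagged when its coverage differs from the reference by a factor of max_deviation in either direction.

import pandas as pd

cov = pd.Series({"c1": 10.0, "c2": 9.5, "c3": 55.0, "c4": 1.2})
reference = cov.mean()  # 18.925
max_deviation = 5.0
outliers = ((cov / reference) >= max_deviation) | \
           ((cov / reference) <= 1 / max_deviation)
print(cov.loc[outliers].index.tolist())  # ['c4']: 1.2 / 18.925 < 1 / 5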
Example #4
def main(args):
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    if args["weighted_mean"]:
        print("\u001b[1m" + "• Computing weighted mean contig GC content" +
              "\u001b[0m")
    else:
        print("\u001b[1m" + "• Computing mean contig GC content" + "\u001b[0m")
    contigs = {}
    contig_length_list = []
    for id, seq in utilities.parse_fasta(args["fna"]):
        contig = Contig()
        contig.id = id
        contig.seq = str(seq)
        contig.gc = round(SeqUtils.GC(seq), 2)
        contigs[id] = contig
        contig_length_list.append(len(seq))
    if args["weighted_mean"]:
        print("\u001b[1m" +
              "\n• Computing per-contig deviation from weighted mean" +
              "\u001b[0m")
        reference = np.average([c.gc for c in contigs.values()],
                               weights=contig_length_list)
    else:
        print("\u001b[1m" + "\n• Computing per-contig deviation from mean" +
              "\u001b[0m")
        reference = np.average([c.gc for c in contigs.values()])
    for contig in contigs.values():
        contig.values = {}
        contig.values["delta"] = abs(contig.gc - reference)
    print("\u001b[1m" + "\n• Identifying outlier contigs" + "\u001b[0m")
    flagged = [
        contig.id for contig in contigs.values()
        if contig.values["delta"] > args["cutoff"]
    ]

    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
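A quick check of the weighted-mean logic: with numpy, length weights pull the reference GC toward the longer contigs, which is the point of the weighted_mean option.

import numpy as np

gc = [40.0, 60.0]        # per-contig GC (%)
lengths = [9000, 1000]   # per-contig lengths (bp)
print(np.average(gc))                   # 50.0, plain mean
print(np.average(gc, weights=lengths))  # 42.0, dominated by the 9 kb contig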
Example #5
def main(args):
    utilities.check_input(args)
    print("\u001b[1m" + "• Reading genome bin" + "\u001b[0m")
    bin = {id: seq for id, seq in utilities.parse_fasta(args["fna"])}
    bin_length = round(sum(len(_) for _ in bin.values()) / 1000, 2)
    print(f"  genome length: {len(bin)} contigs, {bin_length} Kbp")
    print("\u001b[1m" + "\n• Reading flagged contigs" + "\u001b[0m")
    flagged_contigs = []
    programs = [
        "phylo-markers",
        "clade-markers",
        "conspecific",
        "tetra-freq",
        "gc-content",
        "coverage",
        "known-contam",
    ]
    for program in programs:
        path = f"{args['out']}/{program}/flagged_contigs"
        if not os.path.exists(path):
            print(f"  {program}: no output file found")
        else:
            with open(path) as f:
                contigs = [line.rstrip() for line in f]
            bases = round(sum(len(bin[id]) for id in contigs) / 1000, 2)
            flagged_contigs += contigs
            print(f"  {program}: {len(contigs)} contigs, {bases} Kbp")
    flagged_contigs = list(set(flagged_contigs))
    flagged_length = round(
        sum(len(bin[id]) for id in flagged_contigs) / 1000, 2)
    print("\u001b[1m" + "\n• Removing flagged contigs" + "\u001b[0m")
    clean = bin.copy()
    for id in flagged_contigs:
        del clean[id]
    clean_length = round(sum(len(_) for _ in clean.values()) / 1000, 2)
    print(f"  removed: {len(flagged_contigs)} contigs, {flagged_length} Kbp")
    print(f"  remains: {len(clean)} contigs, {clean_length} Kbp")
    with open(args['out_fna'], "w") as f:
        for id, seq in clean.items():
            f.write(">" + id + "\n" + textwrap.fill(seq, 70) + "\n")
    print(f"  cleaned bin: {args['out_fna']}")
Example #6
def main(args):
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_database(args)
    print("\u001b[1m" + "• Reading database info" + "\u001b[0m")
    ref_taxonomy = read_ref_taxonomy(args["db"])
    taxon_to_taxonomy = {}
    for taxonomy in set(ref_taxonomy.values()):
        for taxon in taxonomy.split("|"):
            taxon_to_taxonomy[taxon] = taxonomy
    min_pid = {"k": 57, "p": 77, "c": 82, "o": 86, "f": 87, "g": 91, "s": 96}
    if args["min_genes"] is not None:
        args["min_genes"] = dict([(r, args["min_genes"]) for r in ranks])
    else:
        args["min_genes"] = {
            "k": 237,
            "p": 44,
            "c": 30,
            "o": 24,
            "f": 22,
            "g": 20,
            "s": 19,
        }
    print("\u001b[1m" + "\n• Calling genes with Prodigal" + "\u001b[0m")
    utilities.run_prodigal(args["fna"], args["tmp_dir"])
    print(f"  all genes: {args['tmp_dir']}/genes.[ffn|faa]")
    print(
        "\u001b[1m"
        + "\n• Performing pairwise alignment of genes against MetaPhlan2 database of clade-specific genes"
        + "\u001b[0m"
    )
    utilities.run_lastal(args["db"], args["tmp_dir"], args["threads"])
    print(f"  alignments: {args['tmp_dir']}/genes.m8")

    print("\u001b[1m" + "\n• Finding top hits to database" + "\u001b[0m")
    genes = {}
    for aln in utilities.parse_last(args["tmp_dir"] + "/genes.m8"):
        # clade exclusion
        ref_taxa = ref_taxonomy[aln["tid"]].split("|")
        if args["exclude_clades"] and any(
            taxon in ref_taxa for taxon in args["exclude_clades"]
        ):
            continue
        # initialize gene
        if aln["qid"] not in genes:
            genes[aln["qid"]] = Gene()
            genes[aln["qid"]].id = aln["qid"]
            genes[aln["qid"]].contig_id = aln["qid"].rsplit("_", 1)[0]

        # keep the highest-scoring alignment for each gene
        if genes[aln["qid"]].aln is None:
            genes[aln["qid"]].aln = aln
            genes[aln["qid"]].ref_taxa = ref_taxa
        elif float(aln["score"]) > float(genes[aln["qid"]].aln["score"]):
            genes[aln["qid"]].aln = aln
            genes[aln["qid"]].ref_taxa = ref_taxa
    print("  %s genes with a database hit" % len(genes))
    print("\u001b[1m" + "\n• Classifying genes at each taxonomic rank" + "\u001b[0m")
    counts = {}
    for gene in genes.values():
        for ref_taxon in gene.ref_taxa:
            rank = ref_taxon.split("__")[0]
            if rank not in counts:
                counts[rank] = 0
            if rank == "t":
                continue
            elif float(gene.aln["pid"]) < min_pid[rank]:
                continue
            elif gene.aln["qcov"] < 0.4:
                continue
            elif gene.aln["tcov"] < 0.4:
                continue
            gene.taxa[rank] = ref_taxon
            counts[rank] += 1
    for rank in ranks:
        print(f"  {rank_names[rank]}: {counts.get(rank, 0)} classified genes")
    print("\u001b[1m" + "\n• Taxonomically classifying contigs" + "\u001b[0m")
    contigs = {}
    for id, seq in utilities.parse_fasta(args["fna"]):
        contigs[id] = Contig()
        contigs[id].id = id
        contigs[id].length = len(seq)
    # aggregate hits by contig
    for gene in genes.values():
        contigs[gene.contig_id].genes.append(gene)
    # classify contigs at each level
    for contig in contigs.values():
        contig.classify()
    # summarize
    counts = {}
    for contig in contigs.values():
        for rank, taxon in contig.cons_taxa.items():
            if rank not in counts:
                counts[rank] = 0
            if taxon is not None:
                counts[rank] += 1
    print("  total contigs: %s" % len(contigs))
    for rank in ranks:
        print(f"  {rank_names[rank]}: {counts.get(rank, 0)} classified contigs")

    print("\u001b[1m" + "\n• Taxonomically classifying genome" + "\u001b[0m")
    bin = Bin()
    bin.classify(
        contigs,
        args["min_bin_fract"],
        args["min_contig_fract"],
        args["min_gene_fract"],
        args["min_genes"],
        args["lowest_rank"],
    )
    print(f"  consensus taxon: {bin.cons_taxon}")
    print("\u001b[1m" + "\n• Identifying taxonomically discordant contigs" + "\u001b[0m")
    if bin.cons_taxon is not None:
        bin.rank_index = (
            taxon_to_taxonomy[bin.cons_taxon].split("|").index(bin.cons_taxon)
        )
        bin.taxonomy = taxon_to_taxonomy[bin.cons_taxon].split("|")[
            0 : bin.rank_index + 1
        ]
        flag_contigs(contigs, bin)
    flagged = [contig.id for contig in contigs.values() if contig.flagged]
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
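The contig_id derivation above depends on Prodigal's naming scheme, which appends _<gene number> to the contig ID, so rsplit with maxsplit=1 recovers the contig even when its own ID contains underscores:

gene_id = "NODE_12_length_34056_cov_8.3_7"  # gene 7 on this contig
print(gene_id.rsplit("_", 1)[0])            # NODE_12_length_34056_cov_8.3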
Example #7
def flag_contigs(db_dir, tmp_dir, args):
    # step 0. read in reference data files
    # cutoffs
    cutoffs = {}
    cutoffs_path = f"{db_dir}/phylo-markers/max_fscores.tsv"
    with open(cutoffs_path) as f:
        for r in csv.DictReader(f, delimiter="\t"):
            key = (r["marker_id"], r["seq_type"], r["score_type"], r["taxlevel"])
            cutoffs[key] = {
                "sensitive": r["cutoff_lower"],
                "strict": r["cutoff_upper"],
                "none": 0.0,
            }
    # taxonomy
    taxonomy_path = f"{db_dir}/phylo-markers/genome_taxonomy.tsv"
    with open(taxonomy_path) as f:
        taxonomy = {r["genome_id"]: r["taxonomy"]
                    for r in csv.DictReader(f, delimiter="\t")}
    # clustered seqs
    clusters = {}
    for type in ["ffn", "faa"]:
        clusters[type] = {}
        for file in os.listdir(f"{db_dir}/phylo-markers/{type}"):
            if file.split(".")[-1] == "uc":
                with open(f"{db_dir}/phylo-markers/{type}/{file}") as f:
                    for l in f:
                        v = l.rstrip().split()
                        rep_id = v[-1]
                        seq_id = v[-2]
                        if v[0] == "S":
                            clusters[type][seq_id] = [seq_id]
                        elif v[0] == "H":
                            clusters[type][rep_id].append(seq_id)
    # step 1. determine if bin is archaea or bacteria; initialize domain-level markers
    # to do: normalize counts by size of marker gene sets
    marker_ids = set()
    counts = {"bacteria": 0, "archaea": 0}
    for aln_file in os.listdir(f"{tmp_dir}/alns"):
        marker_id, seq_type, ext = aln_file.split(".")
        marker_ids.add(marker_id)
    for marker_id in marker_ids:
        if "B" in marker_id:
            counts["bacteria"] += 1
        elif "A" in marker_id:
            counts["archaea"] += 1
    domain = "bacteria" if counts["bacteria"] >= counts[
        "archaea"] else "archaea"
    markers = {}
    domain_letter = "B" if domain == "bacteria" else "A"
    for marker_id in marker_ids:
        if domain_letter in marker_id:
            markers[marker_id] = Marker()
            markers[marker_id].id = marker_id
            markers[marker_id].genes = []
    # step 2. initialize marker genes found in bin
    bin = Bin()
    bin.genes = {}
    hmm_path = f"{tmp_dir}/phyeco.hmmsearch"
    for gene_id, aln in list(utilities.fetch_hmm_best_hits(hmm_path).items()):
        if aln["qacc"] not in markers:
            continue
        gene = Gene()
        gene.id = gene_id
        gene.contig = gene_id.rsplit("_", 1)[0]
        gene.marker = aln["qacc"]
        bin.genes[gene_id] = gene
        markers[aln["qacc"]].genes.append(gene)
    # annotate genes
    #    fetch all non-redundant taxonomic annotations for each gene
    if args["seq_type"] in ["both", "either"]:
        seq_types = ["ffn", "faa"]
    elif args["seq_type"] == "protein":
        seq_types = ["faa"]
    else:
        seq_types = ["ffn"]
    for seq_type in seq_types:
        for marker_id in markers:
            aln_path = f"{tmp_dir}/alns/{marker_id}.{seq_type}.m8"
            for aln in utilities.parse_blast(aln_path):
                # fetch all unique taxonomies for target sequence
                # a sequence can have multiple taxonomies if it was clustered with another sequence
                genome_taxa = []
                for target_id in clusters[seq_type][aln["tname"]]:
                    genome_id = target_id.split("_")[0]
                    if (genome_id not in taxonomy
                            or taxonomy[genome_id] in genome_taxa):
                        continue
                    else:
                        genome_taxa.append(taxonomy[genome_id])
                # loop over ranks; stop when gene has been annotated
                for rank_index, rank in enumerate(
                    ["s", "g", "f", "o", "c", "p"]):
                    if seq_type == "ffn" and rank != "s":
                        if args["seq_type"] == "either":
                            continue
                    elif seq_type == "faa" and rank == "s":
                        if args["seq_type"] == "either":
                            continue
                    # get minimum % identity cutoff for transferring taxonomy
                    #   a cutoff_type of "none" applies no cutoff (0.0)
                    min_pid = cutoffs[marker_id, seq_type, "pid",
                                      rank][args["cutoff_type"]]
                    if float(aln["pid"]) < float(min_pid):
                        continue
                    # add taxonomy
                    for genome_taxon in genome_taxa:
                        annotation = Annotation()
                        annotation.add_taxon(genome_taxon, rank_index)
                        annotation.score = float(aln["bitscore"])
                        bin.genes[aln["qname"]].annotations.append(annotation)
                    # stop when gene has been annotated at lowest rank
                    break
    # optionally remove annotations matching <exclude_clades>
    if args["exclude_clades"] is not None:
        bin.exclude_clades(args["exclude_clades"])
    # optionally take top hit only
    if args["hit_type"] == "top_hit":
        bin.only_keep_top_hits()
    # create None annotations for unannotated genes
    for gene in bin.genes.values():
        if len(gene.annotations) == 0:
            gene.annotations.append(Annotation())
    # classify bin
    bin.classify_taxonomy(args["allow_noclass"], args["bin_fract"])
    # flag contigs with discrepant taxonomy
    bin.contigs = {}
    for id, seq in utilities.parse_fasta(args["fna"]):
        bin.contigs[id] = Contig()
        bin.contigs[id].id = id
        bin.contigs[id].length = len(seq)
    for gene in bin.genes.values():
        bin.contigs[gene.contig].genes.append(gene)
    if bin.cons_taxon is not None:
        for contig in bin.contigs.values():
            contig.compare_taxonomy(bin)
            contig.flag(args["contig_fract"])
    return [contig.id for contig in bin.contigs.values() if contig.flagged]
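A toy trace of the .uc parsing in step 0 (UCLUST cluster format): S records seed a cluster keyed by the seed's own ID, and H records append the query to its representative's cluster.

lines = [
    "S\t0\t300\t*\t*\t*\t*\t*\tgenomeA_1\t*",
    "H\t0\t298\t99.3\t+\t0\t0\t*\tgenomeB_1\tgenomeA_1",
]
clusters = {}
for l in lines:
    v = l.rstrip().split()
    rep_id, seq_id = v[-1], v[-2]
    if v[0] == "S":
        clusters[seq_id] = [seq_id]
    elif v[0] == "H":
        clusters[rep_id].append(seq_id)
print(clusters)  # {'genomeA_1': ['genomeA_1', 'genomeB_1']}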
Example #8
def main(args):
    utilities.add_tmp_dir(args)
    utilities.check_input(args)
    utilities.check_dependencies(["blastn"])

    print("\u001b[1m" + "• Counting tetranucleotides" + "\u001b[0m")
    # init data
    contigs = {}
    contig_length_list = []
    for id, seq in utilities.parse_fasta(args["fna"]):
        contig = Contig()
        contig.id = id
        contig.seq = str(seq)
        contig.kmers = init_kmers()
        contigs[id] = contig
        contig_length_list.append(len(seq))

    # count kmers on whichever strand matches the canonical key;
    # 4-mers containing ambiguous bases (e.g. N) are skipped
    for contig in contigs.values():
        for i in range(len(contig.seq) - 3):
            kmer_fwd = contig.seq[i : i + 4]
            if kmer_fwd in contig.kmers:
                contig.kmers[kmer_fwd] += 1
            else:
                kmer_rev = utilities.reverse_complement(kmer_fwd)
                if kmer_rev in contig.kmers:
                    contig.kmers[kmer_rev] += 1

    print("\u001b[1m" + "\n• Normalizing counts" + "\u001b[0m")
    for contig in contigs.values():
        total = float(sum(contig.kmers.values()))
        for kmer, count in contig.kmers.items():
            contig.kmers[kmer] = 100 * count / total if total > 0 else 0.0
    print("\u001b[1m" + "\n• Performing PCA" + "\u001b[0m")
    df = pd.DataFrame({c.id: c.kmers for c in contigs.values()})
    pca = PCA(n_components=1)
    pca.fit(df)
    pc1 = pca.components_[0]
    if args["weighted_mean"]:
        print(
            "\u001b[1m"
            + "\n• Computing per-contig deviation from the weighted mean along the first principal component"
            + "\u001b[0m"
        )
        reference_pc = np.average(pc1, weights=contig_length_list)
    else:
        print(
            "\u001b[1m"
            + "\n• Computing per-contig deviation from the mean along the first principal component"
            + "\u001b[0m"
        )
        reference_pc = np.average(pc1)
    for contig_id, contig_pc in zip(list(df.columns), pc1):
        contigs[contig_id].pc = contig_pc
        contigs[contig_id].values = {}
        contigs[contig_id].values["delta"] = abs(contig_pc - reference_pc)

    print("\u001b[1m" + "\n• Identifying outlier contigs" + "\u001b[0m")
    flagged = [
        contig.id
        for contig in contigs.values()
        if contig.values["delta"] > args["cutoff"]
    ]

    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f"  {len(flagged)} flagged contigs: {out}")
    with open(out, "w") as f:
        for contig in flagged:
            f.write(contig + "\n")
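init_kmers is not shown; one way to build the keys the counting loop expects is to keep a single canonical form per tetranucleotide, the lexicographic minimum of a 4-mer and its reverse complement (136 keys in total). A sketch, with a local reverse_complement standing in for the utilities version:

from itertools import product

COMP = str.maketrans("ACGT", "TGCA")

def reverse_complement(seq):
    return seq.translate(COMP)[::-1]

def init_kmers():
    # one counter per canonical 4-mer: a k-mer and its reverse
    # complement share a key, giving (256 - 16) / 2 + 16 = 136 keys
    kmers = {}
    for tup in product("ACGT", repeat=4):
        kmer = "".join(tup)
        kmers[min(kmer, reverse_complement(kmer))] = 0
    return kmers

print(len(init_kmers()))  # 136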