Пример #1
0
    def parse(self,
              lca_fpath: str,
              orfs_fpath: str = None) -> Dict[str, Dict[str, Dict[int, int]]]:
        """Count LCA taxid assignments per contig and rank from `lca_fpath`.

        Every ORF row of the LCA table is translated to its contig. If the
        row's taxid is not 1 and its rank is not one of
        `NCBI.CANONICAL_RANKS`, the taxid is replaced by its parent (via
        `self.parent` / `self.rank`) until a canonical rank is reached. The
        resulting (contig, rank, taxid) triple is then tallied.

        Parameters
        ----------
        lca_fpath : str
            </path/to/lcas.tsv>
            tab-delimited ordered columns: qseqid, name, rank, lca_taxid
            The first line is treated as a header and skipped.

        orfs_fpath : str, optional (required if using prodigal version <2.6)
            </path/to/prodigal/called/orfs.fasta>
            Note: These ORFs should correspond to the ORFs provided in the BLAST table.
            When omitted, the contig name is derived from each qseqid by
            dropping everything after its last underscore.

        Returns
        -------
        dict
            {contig:{rank:{taxid:counts, ...}, rank:{...}, ...}, ...}

        Raises
        ------
        FileNotFoundError
            `lca_fpath` does not exist.

        FileNotFoundError
            `orfs_fpath` does not exist.

        ValueError
            If prodigal version is under 2.6, `orfs_fpath` is a required input.

        """
        logger.debug(f"Parsing LCA table: {lca_fpath}")
        if not os.path.exists(lca_fpath):
            raise FileNotFoundError(lca_fpath)
        if orfs_fpath and not os.path.exists(orfs_fpath):
            raise FileNotFoundError(orfs_fpath)

        # Only "major.minor" is compared, as a decimal number
        # (e.g. "2.6.3" -> 2.6), so any patch component is ignored.
        version = get_versions("prodigal")
        if version.count(".") >= 2:
            version = float(".".join(version.split(".")[:2]))
        else:
            version = float(version)
        if version < 2.6 and not orfs_fpath:
            raise ValueError(
                "Prodigal version under 2.6 requires orfs_fpath input!")
        # Create a contig translation dictionary with or without ORFs
        if orfs_fpath:
            contigs_from_orfs = prodigal.contigs_from_headers(orfs_fpath)
        else:
            df = pd.read_csv(lca_fpath, sep="\t", usecols=["qseqid"])
            df["contig"] = df.qseqid.map(lambda orf: orf.rsplit("_", 1)[0])
            contigs_from_orfs = df.set_index("qseqid")["contig"].to_dict()

        fname = os.path.basename(lca_fpath)
        n_lines = file_length(lca_fpath) if self.verbose else None
        # Built once: membership is tested for every row (and every parent
        # walked), so rebuilding the set inside the loop is wasted work.
        canonical_ranks = set(NCBI.CANONICAL_RANKS)
        lca_hits = {}
        with open(lca_fpath) as fh:
            fh.readline()  # discard the header
            for line in tqdm(
                    fh,
                    total=n_lines,
                    disable=self.disable,
                    desc=f"Parsing {fname}",
                    leave=False,
            ):
                record = line.strip()
                if not record:
                    # Tolerate blank (e.g. trailing) lines instead of
                    # failing on the 4-column unpack below.
                    continue
                # orf, name, rank, taxid
                orf_id, __, rank, taxid = record.split("\t")
                taxid = int(taxid)
                # NOTE(review): ORFs missing from the translation table are
                # tallied under the key None — confirm this is intended.
                contig = contigs_from_orfs.get(orf_id)
                if taxid != 1:
                    # NOTE(review): termination assumes every lineage reaches
                    # a rank in NCBI.CANONICAL_RANKS — confirm against
                    # self.parent / self.rank.
                    while rank not in canonical_ranks:
                        taxid = self.parent(taxid)
                        rank = self.rank(taxid)
                taxid_counts = lca_hits.setdefault(contig, {}).setdefault(rank, {})
                taxid_counts[taxid] = taxid_counts.get(taxid, 0) + 1
        return lca_hits
Пример #2
0
    def search_prot_accessions(
        self,
        accessions: set,
        sseqids_to_taxids: Dict[str, int] = None,
        db: str = "live",
    ) -> Dict[str, int]:
        """Search an NCBI prot accession2taxid table for the given accessions.

        The selected database file is streamed line by line (the first line
        is skipped as a header). Whenever a row's accession.version — or, for
        the live and dead databases, its base accession — is in `accessions`,
        the row's taxid is recorded in `sseqids_to_taxids`.

        Parameters
        ----------
        accessions : set
            Set of subject sequence ids retrieved from diamond blastp search (sseqids)

        sseqids_to_taxids : Dict[str, int], optional
            Dictionary containing sseqids converted to taxids. When provided
            it is updated in place and returned; otherwise a new dictionary
            is created.

        db : str, optional
            selection of one of the prot accession to taxid databases from NCBI. Choices are live, dead, full

            * live: prot.accession2taxid.gz
            * full: prot.accession2taxid.FULL.gz
            * dead: dead_prot.accession2taxid.gz

            If `full` is requested but its file is missing or empty, the
            `live` database is searched instead (a warning is logged).

        Returns
        -------
        Dict[str, int]
            Dictionary containing sseqids converted to taxids

        Raises
        ------
        ValueError
            `db` is not a string or is not one of live, full or dead.

        FileNotFoundError
            The selected database file does not exist or is empty.
        """
        # Compare against None (not truthiness) so a caller-supplied empty
        # dict is filled in place rather than silently replaced.
        if sseqids_to_taxids is None:
            sseqids_to_taxids = {}
        if not isinstance(db, str):
            raise ValueError(f"db must be a string! Type Given: {type(db)}")
        db = db.lower()
        choices = {
            "live": self.accession2taxid_fpath,
            "dead": self.dead_accession2taxid_fpath,
            "full": self.accession2taxidfull_fpath,
        }
        if db not in choices:
            raise ValueError(f"db must be one of live, full or dead. Given: {db}")
        fpath = choices.get(db)

        # Revert to accession2taxid if FULL is not present
        if db == "full" and (not os.path.exists(fpath) or not os.path.getsize(fpath)):
            logger.warning(
                "prot.accession2taxid.FULL.gz was not found. Reverting to prot.accession2taxid.gz"
            )
            logger.warning(
                "To achieve greater resolution of your metagenome taxonomy, considering downloading the prot.accession2taxid.FULL.gz database file"
            )
            fpath = choices.get("live")
            db = "live"

        if not os.path.exists(fpath) or not os.path.getsize(fpath):
            raise FileNotFoundError(fpath)

        filename = os.path.basename(fpath)
        logger.debug(
            f"Searching for {len(accessions):,} accessions in {filename}. This may take a while..."
        )
        n_lines = file_length(fpath, approximate=True) if self.verbose else None
        desc = f"Parsing {filename}"
        # The row layout depends only on the database, so decide it once.
        is_full = db == "full"
        converted_sseqid_count = 0
        # "rt" opens the (possibly gzipped) database in text mode so it can be
        # handled like a plain text file. The context manager guarantees the
        # handle is closed even if a malformed row raises mid-scan.
        opener = gzip.open if fpath.endswith(".gz") else open
        with opener(fpath, "rt") as fh:
            # skip the header line
            fh.readline()
            for line in tqdm(
                fh, disable=self.disable, desc=desc, total=n_lines, leave=False
            ):
                if is_full:
                    # FULL format is accession.version\ttaxid\n
                    acc_num = None
                    acc_ver, taxid = line.strip().split("\t")
                else:
                    # dead and live formats are accession\taccession.version\ttaxid\tgi\n
                    acc_num, acc_ver, taxid, _ = line.strip().split("\t")

                taxid = int(taxid)
                if acc_ver in accessions:
                    sseqids_to_taxids[acc_ver] = taxid
                    converted_sseqid_count += 1

                # Base accessions only exist in the live and dead databases;
                # skipping the lookup for FULL avoids a pointless set search.
                if not is_full and acc_num in accessions:
                    sseqids_to_taxids[acc_num] = taxid
                    converted_sseqid_count += 1

        logger.debug(f"sseqids converted from {filename}: {converted_sseqid_count:,}")
        return sseqids_to_taxids
Пример #3
0
def parse(results: str,
          bitscore_filter: float = 0.9,
          verbose: bool = False) -> Dict[str, Set[str]]:
    """
    Retrieve diamond results from output table

    The first row seen for a qseqid defines that query's top bitscore and its
    sseqid is always kept. Later rows for the same qseqid are kept only when
    their bitscore is >= `bitscore_filter` * that top bitscore. Blank lines
    are ignored.

    Parameters
    ----------
    results : str
        Path to BLASTP output file in outfmt6
        (tab-delimited; column 1 is read as qseqid, column 2 as sseqid and
        column 12 as bitscore)

    bitscore_filter : 0 < float <= 1, optional
        Bitscore filter applied to each sseqid, by default 0.9
        Used to determine whether the bitscore is above a threshold value.
        For example, if it is 0.9 then only bitscores >= 0.9 * the top bitscore are accepted

    verbose : bool, optional
        log progress to terminal, by default False

    Returns
    -------
    dict
        {qseqid: {sseqid, sseqid, ...}, ...}

    Raises
    ------
    FileNotFoundError
        diamond results table does not exist
    ValueError
        bitscore_filter value is not a float or not in range of 0 to 1
    """
    disable = not verbose
    if verbose:
        logger.debug(f"Parsing accessions from {os.path.basename(results)}")
    if not os.path.exists(results):
        raise FileNotFoundError(results)
    try:
        # Keep the converted value: numeric strings then behave like floats
        # instead of failing later on the range comparison.
        threshold = float(bitscore_filter)
    except (TypeError, ValueError) as err:
        # float(None) raises TypeError; report it as the documented ValueError.
        raise ValueError(
            f"bitscore_filter must be a float! Input: {bitscore_filter} Type: {type(bitscore_filter)}"
        ) from err
    if not 0.0 < threshold <= 1.0:
        raise ValueError(
            f"bitscore_filter not in range(0,1)! Input: {bitscore_filter}")
    hits = {}
    # Top bitscore per query, so the threshold stays correct even when rows
    # of different queries are interleaved in the table.
    top_bitscores = {}
    n_lines = file_length(results) if verbose else None
    with open(results) as fh:
        for line in tqdm(fh,
                         disable=disable,
                         total=n_lines,
                         desc="Parsing Accessions",
                         leave=False):
            if not line.strip():
                # Skip blank lines rather than failing on the column lookup.
                continue
            llist = line.rstrip().split("\t")
            qseqid = llist[0]
            sseqid = llist[1]
            bitscore = float(llist[11])
            # The first hit of a new qseqid sets that query's top bitscore.
            if qseqid not in hits:
                hits[qseqid] = {sseqid}
                top_bitscores[qseqid] = bitscore
                continue
            if bitscore >= threshold * top_bitscores[qseqid]:
                hits[qseqid].add(sseqid)
    return hits