Пример #1
0
def parse_file(fpath, skip: Union[tuple, str] = "#"):
    """
    Lazily yield the whitespace-stripped lines of a text file.

    Lines whose stripped form starts with ``skip`` (one prefix, or a tuple
    of prefixes) are left out; all remaining lines are yielded in file order.
    """
    with open(cleanpath(fpath)) as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped.startswith(skip):
                yield stripped
Пример #2
0
def _lazy_provider_selection(name, provider=None):
    """
    Return the first online provider that can serve genome ``name``.

    A provider qualifies when ``name`` is among its genomes, when it is the
    "URL" provider and ``try_except_pass(ValueError, check_url, name)`` is
    truthy, or when it is the "Local" provider and ``name`` is an existing
    path. Providers are tried in the order ``online_providers`` yields them.

    Raises GenomeDownloadError, naming every provider tried, if none qualifies.
    """
    tried = []
    for candidate in online_providers(provider):
        tried.append(candidate.name)
        # conditions are evaluated left to right, cheapest lookup first
        if (
            name in candidate.genomes
            or (candidate.name == "URL"
                and try_except_pass(ValueError, check_url, name))
            or (candidate.name == "Local"
                and os.path.exists(cleanpath(name)))
        ):
            return candidate

    raise GenomeDownloadError(f"{name} not found on {', '.join(tried)}.")
Пример #3
0
    def __init__(self, name: str, genomes_dir: str = None, quiet: bool = False):
        # resolve the genome name and the directory that holds its files
        resolved_name, resolved_dir = _get_name_and_dir(name, genomes_dir)
        self.name = resolved_name
        "genome name"
        self.genome_dir = resolved_dir
        "path to the genome directory"

        # when NAME itself points at a bed/gtf/fasta file, that file is used
        # directly for the matching attribute instead of a genome_dir lookup
        given_path = cleanpath(name)
        given_suffixes = Path(given_path).suffixes[-2:]
        verbose = not quiet

        # annotation files
        is_bed = ".bed" in given_suffixes or ".BED" in given_suffixes
        self.annotation_bed_file = (
            given_path if is_bed else
            _get_file(self.genome_dir, f"{self.name}.annotation.bed", verbose)
        )
        "path to the gene annotation BED file"
        is_gtf = ".gtf" in given_suffixes or ".GTF" in given_suffixes
        self.annotation_gtf_file = (
            given_path if is_gtf else
            _get_file(self.genome_dir, f"{self.name}.annotation.gtf", verbose)
        )
        "path to the gene annotation GTF file"

        # genome files
        self.genome_file = (
            given_path if ".fa" in given_suffixes else
            _get_file(self.genome_dir, f"{self.name}.fa", False)
        )
        "path to the genome fasta"
        self.readme_file = _get_file(self.genome_dir, "README.txt", False)
        "path to the README file"
        self.index_file = _get_file(
            self.genome_dir, f"{self.name}.fa.fai", False)
        "path to the genome index"
        self.sizes_file = _get_file(
            self.genome_dir, f"{self.name}.fa.sizes", False)
        "path to the chromosome sizes file"

        # genome attributes: "na" in the README means the taxonomy id is unknown
        readme_tax_id = read_readme(str(self.readme_file))[0]["tax_id"]
        if readme_tax_id == "na":
            self.tax_id = None
        else:
            self.tax_id = int(readme_tax_id)
        "genome taxonomy identifier"
Пример #4
0
    def get_annotation_download_links(self, name, **kwargs):
        """
        Return the annotation files that sit next to a local genome file.

        The directory containing ``name`` is listed, and every entry whose
        filename contains the genome name and ends in a gtf, gff or gff3
        extension (optionally followed by .gz, case-insensitive) is reported.

        Parameters
        ----------
        name : str
            path to the local genome file
        **kwargs : dict, optional
            not used by this method

        Returns
        -------
        list of str
            paths to the matching files, grouped by extension in the order
            gtf, gff, gff3 (empty if nothing matches)
        """
        name = cleanpath(name)
        genome_dir = os.path.dirname(name)
        filenames = os.listdir(genome_dir)
        # the genome name is literal text, not a pattern: escape "." and co.
        search_name = re.escape(get_genomename(name))

        hits = []
        for ext in ["gtf", "gff", "gff3"]:
            # .* = filler between name and extension. (\.gz)? = optional .gz
            # The trailing $ keeps "gff" from matching inside "x.gff3" and
            # rejects files that merely contain the extension ("x.gtf.bak").
            pattern = re.compile(
                fr"{search_name}.*\.{ext}(\.gz)?$", flags=re.IGNORECASE
            )
            # Report the real filename rather than the matched text: the
            # match may be only a fragment of the name, which would yield
            # a path that does not exist on disk.
            hits.extend(
                os.path.join(genome_dir, filename)
                for filename in filenames
                if pattern.search(filename)
            )

        return hits
Пример #5
0
    def _parse_filename(self, name: str) -> str:
        """
        Resolve ``name`` to the path of a genome fasta file.

        ``name`` may be the path to a fasta file, the path to a folder
        containing one, or a genome name (e.g. hg38). An existing file is
        returned as-is. Otherwise the files found by ``glob_ext_files`` in
        ``name`` and then in ``<genomes_dir>/<self.name>`` are scanned, and
        the first one whose basename contains ``<self.name>.fa`` is returned.

        Raises FileNotFoundError if no such file is found.
        """
        given_path = cleanpath(name)
        if os.path.isfile(given_path):
            return given_path

        fallback_dir = os.path.join(self.genomes_dir, self.name)
        candidates = glob_ext_files(given_path) + glob_ext_files(fallback_dir)
        wanted = self.name + ".fa"
        for candidate in candidates:
            if wanted in os.path.basename(candidate):
                return candidate

        raise FileNotFoundError(
            f"could not find {self.name}.fa(.gz) in genome_dir {fallback_dir}"
        )
Пример #6
0
    def get_annotation_download_link(self, name: str, **kwargs) -> str:
        """
        Return the filepath of a gene annotation for a genome.

        Parameters
        ----------
        name : str
            genome name
        **kwargs: dict, optional:
            path_to_annotation : direct path to the gene annotation.
            When given (and truthy) it is validated and returned; otherwise
            the first result of ``get_annotation_download_links`` is used.

        Returns
        -------
        str
            path

        Raises
        ------
        FileNotFoundError
            if path_to_annotation does not exist
        TypeError
            if path_to_annotation is not a gtf, gff, gff3 or bed file
        GenomeDownloadError
            if no path was given and no annotation was found
        """
        annotation = kwargs.get("path_to_annotation")
        if not annotation:
            # nothing specified: fall back to the first annotation found
            candidates = self.get_annotation_download_links(name)
            if not candidates:
                raise GenomeDownloadError(
                    f"No gene annotations found for {get_genomename(name)}.\n")
            return candidates[0]

        annotation = cleanpath(annotation)
        if not os.path.exists(annotation):
            raise FileNotFoundError(
                f"Local path to annotation does not exist: {annotation}")

        extension = get_file_info(annotation)[0]
        if extension in (".gtf", ".gff", ".gff3", ".bed"):
            return annotation
        raise TypeError(
            "Only (gzipped) gtf, gff and bed files are supported.\n")
Пример #7
0
def _get_name_and_dir(name, genomes_dir=None):
    """
    Returns the name and directory of the genome.

    ``name`` is resolved in this order:

    * path to an existing (gzipped) bed, gtf or fasta file: the directory is
      the file's folder, the name is the filename without its
      ``(.annotation).<ext>(.gz)`` suffix.
    * path to an existing directory: the directory itself, named after its
      last path component.
    * an entry of the genomes directory: ``<genomes_dir>/<name>``, with the
      name unchanged.

    Raises
    ------
    NotImplementedError
        if ``name`` is a file whose name does not end in a supported extension
    FileNotFoundError
        if ``name`` matches none of the cases above
    """
    fname = cleanpath(name)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    if os.path.isfile(fname):
        exts = ["gtf", "GTF", "bed", "BED", "fa"]
        any_ext = "|".join(exts)
        suffix = re.compile(fr"(\.annotation)?\.({any_ext})(\.gz)?$")
        basename = os.path.basename(fname)
        # Test the extension at the end of the filename only. A substring
        # test on the whole path would accept any file located below e.g. a
        # "fasta" or "bed_files" directory, or named "fabric.txt".
        if not suffix.search(basename):
            raise NotImplementedError(
                "Only (gzipped) bed, gtf or fasta files are supported!")
        genome_dir = os.path.dirname(fname)
        # remove suffixes
        name = suffix.sub("", safe(basename))
    elif os.path.isdir(fname):
        genome_dir = fname
        name = safe(os.path.basename(fname))
    elif name in os.listdir(genomes_dir):
        genome_dir = os.path.join(genomes_dir, name)
    else:
        raise FileNotFoundError(f"Could not find {name}")
    return name, genome_dir
Пример #8
0
 def get_genome_download_link(self, path, mask=None, **kwargs):
     """
     Return the cleaned local path to a genome.

     ``mask`` and ``**kwargs`` are not used by this method.
     Raises FileNotFoundError if the cleaned path does not exist.
     """
     local_path = cleanpath(path)
     if os.path.exists(local_path):
         return local_path
     raise FileNotFoundError(
         f"Local path to genome does not exist: {local_path}")