Пример #1
0
    def search(self, term):
        """
        Find genomes whose name, description or taxonomy ID matches a term.

        Matching is case-insensitive.

        Parameters
        ----------
        term : str
            Query, case-insensitive. Can be an assembly name (e.g. hg38),
            (part of a) scientific name (Danio rerio) or taxonomy id (722).
            A query starting with "GCA_" is looked up as an accession,
            unless this provider is NCBI.

        Yields
        ------
        tuples with name and metadata
        """
        catalog = self.genomes
        query = safe(str(term))

        # accession query: delegate to the accession search (skipped for NCBI)
        if query.startswith("GCA_") and self.name != "NCBI":
            yield from self._search_accessions(query)
            return

        # numeric query: compare against the taxonomy ids
        if is_number(query):
            for name in catalog:
                if self._search_taxids(catalog[name], query):
                    yield self._genome_info_tuple(name)
            return

        # free text: try the genome name first, then the descriptions
        query = query.lower()
        for name in catalog:
            name_hit = query in safe(name).lower()
            if name_hit or self._search_descriptions(catalog[name], query):
                yield self._genome_info_tuple(name)
Пример #2
0
def get_genomes(rest_url):
    """
    Collect the Ensembl assembly summaries of all non-bacterial divisions.

    Parameters
    ----------
    rest_url : str
        Forwarded (through retry) to request_json for every query.

    Returns
    -------
    dict
        Trimmed summaries keyed by the safe() assembly name,
        after being passed through add_grch37.
    """
    logger.info("Downloading assembly summaries from Ensembl")

    # filter summaries to these keys (to reduce the size of the cached data)
    kept_fields = (
        "assembly_name",
        "assembly_accession",
        "taxonomy_id",
        "name",
        "scientific_name",
        "url_name",
        "display_name",
        "genebuild",
        "division",
        "base_count",
    )

    genomes = {}
    for division in retry(request_json, 3, rest_url, "info/divisions?"):
        if division == "EnsemblBacteria":
            continue
        endpoint = f"info/genomes/division/{division}?"
        for summary in retry(request_json, 3, rest_url, endpoint):
            key = safe(summary["assembly_name"])
            genomes[key] = {field: summary[field] for field in kept_fields}

    return add_grch37(genomes)
Пример #3
0
    def get_genome_download_link(self, name, mask="soft", **kwargs):
        """
        Build and test the NCBI link to the top-level genome sequence.

        Parameters
        ----------
        name : str
            Genome name. Current implementation will fail if the exact
            (safe) name is not found in self.genomes.

        mask : str , optional
            Masking level. Options: soft, hard or none. Default is soft.
            Not used to build the link.

        Returns
        -------
        str with the download link ("ftp://" is rewritten to "https://").

        Raises
        ------
        GenomeDownloadError
            If the link does not pass check_url.
        """
        # only soft masked genomes available. can be (un)masked in _post_process_download
        base = self.genomes[safe(name)]["ftp_path"].replace("ftp://", "https://")
        # the fasta is named after the last element of its directory
        link = f"{base}/{base.split('/')[-1]}_genomic.fna.gz"

        if not check_url(link, 2):
            raise GenomeDownloadError(
                f"Could not download genome {name} from {self.name}.\n"
                "URL is broken. Select another genome or provider.\n"
                f"Broken URL: {link}")
        return link
Пример #4
0
def search(term, provider=None):
    """
    Search for a genome.

    Searches the given provider only, or every provider returned by
    _providers when none is given. The term is sanitized with safe()
    before it is handed to each provider's own search.

    Parameters
    ----------
    term : str or int
        Search term, case-insensitive.

    provider : str , optional
        Provider name

    Yields
    ------
    list
        utf-8 encoded genome information: the first element of the
        provider's result, the provider name, then the remaining elements.
    """
    query = safe(str(term))
    for prov in _providers(provider):
        for hit in prov.search(query):
            # splice the provider name in right after the genome name
            fields = [*hit[:1], prov.name, *hit[1:]]
            yield [field.encode("utf-8") for field in fields]
Пример #5
0
    def __init__(self, name, genomes_dir=None, *args, **kwargs):
        """
        Set up the genome's file paths and read its README metadata.

        Parameters
        ----------
        name : str
            Genome name or fasta path. Leading directories and a trailing
            ".fa" or ".fa.gz" are dropped to obtain the genome name.

        genomes_dir : str , optional
            Handed to get_genomes_dir to resolve the genomes directory.

        *args, **kwargs
            Forwarded to the parent class initializer, after the fasta filename.
        """
        # genome name: basename of `name` without the fasta extension, made safe
        self.name = safe(os.path.basename(re.sub(r"\.fa(\.gz)?$", "", name)))
        "genome name"
        self.genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
        "path to the genomepy genomes directory"
        # the filename must be resolved before the parent class is initialized with it
        self.filename = self._parse_filename(name)
        super(Genome, self).__init__(self.filename, *args, **kwargs)

        # file paths
        self.genome_file = self.filename
        "path to the genome fasta"
        self.genome_dir = os.path.dirname(self.filename)
        "path to the genome directory"
        self.index_file = self.genome_file + ".fai"
        "path to the genome index"
        self.sizes_file = self._check_support_file("sizes")
        "path to the chromosome sizes file"
        self.gaps_file = self._check_support_file("gaps")
        "path to the chromosome gaps file"
        self.annotation_gtf_file = self._check_annotation_file("gtf")
        "path to the gene annotation GTF file"
        self.annotation_bed_file = self._check_annotation_file("bed")
        "path to the gene annotation BED file"
        self.readme_file = os.path.join(self.genome_dir, "README.txt")
        "path to the README file"

        # genome attributes
        # NOTE(review): indexing assumes read_readme always provides both keys — confirm
        metadata, _ = read_readme(self.readme_file)
        self.tax_id = metadata["tax_id"]
        "genome taxonomy identifier"
        self.assembly_accession = metadata["assembly_accession"]
        "genome assembly accession"
Пример #6
0
def search(term, provider: str = None, size=False):
    """
    Search for a genome.

    Searches the given provider only, or every provider returned by
    online_providers when none is given. The term is sanitized with
    safe() before it is handed to each provider's own search.

    Parameters
    ----------
    term : str
        Search term, case-insensitive.
    provider : str , optional
        Only search the specified provider (faster).
    size : bool, optional
        Show absolute genome size. Passed on to the provider's search.

    Yields
    ------
    list
        genome name, provider and metadata
    """
    query = safe(str(term))
    for prov in online_providers(provider):
        for hit in prov.search(query, size):
            # splice the provider name in right after the genome name
            yield [*hit[:1], prov.name, *hit[1:]]
Пример #7
0
    def get_annotation_download_links(self, name, **kwargs):
        """
        Build the Ensembl GTF link for a genome and return it if it works.

        Parameters
        ----------
        name : str
            genome name
        **kwargs: dict, optional:
            version : Ensembl version to use. By default the latest version is used

        Returns
        -------
        list
            the http link if it passes check_url, else an empty list
        """
        genome = self.genomes[safe(name)]
        division, is_vertebrate = get_division(genome)

        # vertebrates have their own server, without a division subdirectory
        if is_vertebrate:
            host = "http://ftp.ensembl.org"
        else:
            host = "http://ftp.ensemblgenomes.org"
        version = self.get_version(is_vertebrate, kwargs.get("version"))
        div_path = "" if is_vertebrate else f"/{division}"
        gtf_root = f"{host}/pub/release-{version}{div_path}/gtf"

        # base directory of the genome
        species = genome["url_name"].lower()
        # some entries don't use url_name in their url... -,-
        # examples:
        #   - EnsemblVertebrates: mus_musculus_nzohlltj
        #   - EnsemblMetazoa: caenorhabditis_elegans
        if not check_url(f"{gtf_root}/{species}", 2):
            species = genome["name"]
        directory = f"{gtf_root}/{species}"

        # specific gtf file (assembly name without the patch suffix)
        assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))
        gtf_file = f"{species.capitalize()}.{assembly}.{version}.gtf.gz"

        # GRCh37 carries its own annotation link template
        if name == "GRCh37":
            link = genome["annotation"].format(version)
        else:
            link = f"{directory}/{gtf_file}"

        if check_url(link, max_tries=2):
            return [link]
        return []
Пример #8
0
    def annotation_links(self, name: str, **kwargs) -> List[str]:
        """
        Return available gene annotation links (http/ftp) for a genome.

        The links are looked up once with get_annotation_download_links and
        then stored under the "annotations" key of the genome's entry.

        Parameters
        ----------
        name: str
            genome name

        Returns
        -------
        list
            Gene annotation links
        """
        key = safe(name)
        if "annotations" not in self.genomes[key]:
            found = self.get_annotation_download_links(name, **kwargs)
            self.genomes[key]["annotations"] = found
        return self.genomes[key]["annotations"]
Пример #9
0
    def _check_name(self, name):
        """
        Return the safe() genome name if this provider lists it.

        Raises
        ------
        GenomeDownloadError
            If the name is not in self.genomes.
        """
        name = safe(name)
        if name not in self.genomes:
            raise GenomeDownloadError(
                f"Could not download genome {name} from {self.name}.\n\n"
                "Check for typos or try\n"
                f"  genomepy search {name} -p {self.name}")
        return name
Пример #10
0
    def _ftp_or_html_link(self, name, file_suffix, skip_check=False):
        """
        NCBI's files are accessible over FTP and HTTPS.
        Try HTTPS first and return the first functioning link.

        With skip_check the HTTPS link is returned untested.
        Returns None when neither link passes check_url.
        """
        ftp_base = self.genomes[safe(name)]["ftp_path"]
        https_base = ftp_base.replace("ftp://", "https://")
        for base in (https_base, ftp_base):
            # the file is named after the last element of its directory
            candidate = base + "/" + base.rsplit("/", 1)[-1] + file_suffix
            if skip_check or check_url(candidate, max_tries=2, timeout=10):
                return candidate
        return None
Пример #11
0
    def _get_genomes(self, rest_url):
        """
        Collect the Ensembl assembly summaries of all non-bacterial divisions.

        Returns a dict with the complete summaries, keyed by the safe()
        assembly name.
        """
        sys.stderr.write("Downloading assembly summaries from Ensembl\n")

        genomes = {}
        for division in retry(self._request_json, 3, rest_url, "info/divisions?"):
            if division != "EnsemblBacteria":
                endpoint = f"info/genomes/division/{division}?"
                summaries = retry(self._request_json, 3, rest_url, endpoint)
                genomes.update(
                    (safe(entry["assembly_name"]), entry) for entry in summaries)
        return genomes
Пример #12
0
def get_genomes(assembly_url):
    """
    Parse genomes from the NCBI assembly summary txt files.

    Parameters
    ----------
    assembly_url : str
        Url of the directory that holds the assembly summary files.

    Returns
    -------
    dict
        One entry per assembly name (after safe()), holding the selected
        summary columns by header name. Assemblies without an ftp_path
        are skipped. Names found in a later file overwrite earlier ones.
    """
    logger.info(
        "Downloading assembly summaries from NCBI, this will take a while...")

    def load_summary(url):
        """
        Lazily yield the raw (bytes) lines of the url, so we can parse
        while downloading. The response is closed when the generator
        is exhausted or discarded.
        """
        with urlopen(url) as response:
            yield from response

    genomes = {}
    # order is important as asm_name can repeat (overwriting the older name)
    names = [
        "assembly_summary_genbank_historical.txt",
        "assembly_summary_refseq_historical.txt",
        "assembly_summary_genbank.txt",
        "assembly_summary_refseq.txt",
    ]
    # filter summaries to these keys (to reduce the size of the cached data)
    summary_keys_to_keep = [
        0,  # 'assembly_accession',
        5,  # 'taxid',
        6,  # 'species_taxid',
        7,  # 'organism_name',
        16,  # 'submitter',
        17,  # 'gbrs_paired_asm',
        18,  # 'paired_asm_comp',
        19,  # 'ftp_path',
    ]
    for fname in names:
        lines = load_summary(f"{assembly_url}/{fname}")
        # line 0 = comment
        next(lines)
        # line 1 = header
        header = next(lines).decode("utf-8").strip("# ").strip("\n").split(
            "\t")
        header = [header[n] for n in summary_keys_to_keep]
        # the progress bar is labelled with the file name, stripped of
        # the "assembly_summary_" prefix and the ".txt" suffix
        for line in tqdm(lines,
                         desc=fname[17:-4],
                         unit_scale=1,
                         unit=" genomes"):
            fields = line.decode("utf-8").strip("\n").split("\t")
            if fields[19] == "na":  # ftp_path must exist
                continue
            name = safe(fields[15])  # overwrites older asm_names
            kept = [fields[n] for n in summary_keys_to_keep]
            genomes[name] = dict(zip(header, kept))
    return genomes
Пример #13
0
def _get_name_and_dir(name, genomes_dir=None):
    """
    Returns the name and directory of the genome.

    `name` may be a bed/gtf/fasta file, a directory, or the name of an
    entry in the genomes directory.

    Raises
    ------
    NotImplementedError
        If `name` is a file without a supported extension.
    FileNotFoundError
        If `name` is neither a file, a directory nor an entry in the
        genomes directory.
    """
    fname = cleanpath(name)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)

    if os.path.isfile(fname):
        exts = ["gtf", "GTF", "bed", "BED", "fa"]
        # NOTE(review): this is a substring test on the whole path, not a
        # test of the file extension — confirm that this is intended
        if all(ext not in fname for ext in exts):
            raise NotImplementedError(
                "Only (gzipped) bed, gtf or fasta files are supported!")
        genome_dir = os.path.dirname(fname)
        basename = safe(os.path.basename(fname))
        # remove suffixes
        any_ext = "|".join(f"({ext})" for ext in exts)
        pattern = fr"(\.annotation)?\.({any_ext})(\.gz)?$"
        return re.sub(pattern, "", basename), genome_dir

    if os.path.isdir(fname):
        return safe(os.path.basename(fname)), fname

    if name in os.listdir(genomes_dir):
        return name, os.path.join(genomes_dir, name)

    raise FileNotFoundError(f"Could not find {name}")
Пример #14
0
        def get_url(level="toplevel"):
            """
            Compose the url of the genome fasta for an assembly level.

            `mask`, `url` and `genome` are taken from the enclosing scope.
            A mask other than soft, hard or none raises a KeyError.
            """
            patterns = {"soft": "dna_sm.{}", "hard": "dna_rm.{}", "none": "dna.{}"}
            seq_type = patterns[mask].format(level)

            species = genome["url_name"].capitalize()
            # assembly name without the patch suffix
            assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))
            return f"{url}/{species}.{assembly}.{seq_type}.fa.gz"
Пример #15
0
    def get_annotation_download_link(self, name, **kwargs):
        """
        Parse and test the link to the NCBI annotation file.

        Parameters
        ----------
        name : str
            Genome name

        Returns
        -------
        str with the link ("ftp://" is rewritten to "https://") if it
        passes check_url, else None.
        """
        base = self.genomes[safe(name)]["ftp_path"].replace("ftp://", "https://")
        # the file is named after the last element of its directory
        link = f"{base}/{base.split('/')[-1]}_genomic.gff.gz"
        return link if check_url(link, 2) else None
Пример #16
0
    def _update_metadata(self, metadata):
        """
        Check if there is missing info that can be updated.

        Fills in the provider first (when absent or "na"), then the tax_id
        and the assembly_accession when those keys are missing. A provider
        object and its genome entry are only loaded for Ensembl, UCSC and
        NCBI genomes with a known original name.
        """
        print("Updating metadata in README.txt", file=sys.stderr)
        if metadata.get("provider", "na") == "na":
            self._update_provider(metadata)

        provider_name = metadata["provider"]
        original_name = safe(metadata.get("original name", ""))
        wanted = ("tax_id", "assembly_accession")
        incomplete = not all(key in metadata for key in wanted)

        provider = entry = None
        if (provider_name in ("Ensembl", "UCSC", "NCBI")
                and original_name and incomplete):
            provider = ProviderBase.create(provider_name)
            entry = provider.genomes.get(original_name)

        if "tax_id" not in metadata:
            self._update_tax_id(metadata, provider, entry)
        if "assembly_accession" not in metadata:
            self._update_assembly_accession(metadata, provider, entry)
Пример #17
0
    def get_annotation_download_link(self, name, **kwargs):
        """
        Parse and test the link to the Ensembl annotation file.

        Parameters
        ----------
        name : str
            Genome name
        kwargs: dict , optional:
            Provider specific options.

            version : int , optional
                Ensembl version. By default the latest version is used.

        Returns
        -------
        str with the link to the GTF file if it passes check_url, else None.
        """
        # sanitize once: the same form is used for the lookup and the file
        # name, so that a name with e.g. spaces cannot end up in the url
        name = safe(name)
        genome = self.genomes[name]
        division = genome["division"].lower().replace("ensembl", "")
        is_vertebrate = division == "vertebrates"

        # vertebrates have their own server, without a division subdirectory
        if is_vertebrate:
            ftp_site = "ftp://ftp.ensembl.org/pub"
        else:
            ftp_site = "ftp://ftp.ensemblgenomes.org/pub"

        # Ensembl release version
        version = kwargs.get("version")
        if version is None:
            version = self.get_version(self.rest_url, is_vertebrate)

        if not is_vertebrate:
            ftp_site += f"/{division}"

        # Get the GTF URL (assembly name without the patch suffix)
        asm_name = re.sub(r"\.p\d+$", "", name)
        url_name = genome["url_name"]
        link = (
            f"{ftp_site}/release-{version}/gtf/{url_name.lower()}/"
            f"{url_name.capitalize()}.{asm_name}.{version}.gtf.gz"
        )

        if check_url(link, 2):
            return link
        return None
Пример #18
0
    def _get_genomes(assembly_url):
        """
        Parse genomes from assembly summary txt files.

        Parameters
        ----------
        assembly_url : str
            Url of the directory that holds the assembly summary files.

        Returns
        -------
        dict
            The summary columns (by header name) of each assembly, keyed
            by the safe() assembly name. Names found in a later file
            overwrite earlier ones.
        """
        sys.stderr.write(
            "Downloading assembly summaries from NCBI, this will take a while...\n"
        )

        genomes = {}
        # order is important as asm_name can repeat (overwriting the older name)
        names = [
            "assembly_summary_refseq_historical.txt",
            "assembly_summary_genbank.txt",
            "assembly_summary_refseq.txt",
        ]
        # urls always use forward slashes. os.path.join would insert the
        # separator of the platform instead (a backslash on Windows)
        base_url = assembly_url.rstrip("/")
        for fname in names:
            urlcleanup()
            with urlopen(f"{base_url}/{fname}") as response:
                lines = response.read().decode("utf-8").splitlines()
            # line 0 is a comment, line 1 holds the column names
            header = lines[1].strip("# ").split("\t")
            for line in lines[2:]:
                vals = line.strip("# ").split("\t")
                # overwrites older asm_names
                genomes[safe(vals[15])] = dict(zip(header, vals))
        return genomes
Пример #19
0
    def download_genome(
        self,
        name,
        genomes_dir=None,
        localname=None,
        mask="soft",
        keep_alt=False,
        regex=None,
        invert_match=False,
        bgzip=None,
        **kwargs,
    ):
        """
        Download a (gzipped) genome file to a specific directory

        The genome is downloaded and processed in a temporary directory
        inside the output directory, moved into place on success, and
        described in a README.txt. The temporary directory is removed
        afterwards, also when a step fails.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        keep_alt : bool , optional
            Set to true to keep these alternative regions.

        regex : str , optional
            Regular expression to select specific chromosome / scaffold names.

        invert_match : bool , optional
            Set to True to select all chromosomes that don't match the regex.

        bgzip : bool , optional
            If set to True the genome FASTA file will be compressed using bgzip.
            If set to False it will not be compressed.
            If not specified, the setting from the configuration file will be used.
        """
        import shlex  # only needed to quote paths for the shell pipeline

        name = safe(name)
        self.check_name(name)

        link = self.get_genome_download_link(name, mask=mask, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
        out_dir = os.path.join(genomes_dir, localname)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        sys.stderr.write(
            f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

        def regex_filter(_fname, _regex, _v):
            """Filter the fasta and return the names of the sequences that were dropped."""
            infa = _fname + "_to_regex"
            os.rename(_fname, infa)
            # filter the fasta and store the output's keys
            keys_out = filter_fasta(infa,
                                    outfa=_fname,
                                    regex=_regex,
                                    v=_v,
                                    force=True).keys()
            keys_in = Fasta(infa).keys()
            return [k for k in keys_in if k not in keys_out]

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        tmp_dir = mkdtemp(dir=out_dir)
        try:
            fname = os.path.join(tmp_dir, f"{localname}.fa")

            urlcleanup()
            download_file(link, fname)
            sys.stderr.write(
                "Genome download successful, starting post processing...\n")

            # unzip genome
            if link.endswith(".tar.gz"):
                tar_to_bigfile(fname, fname)
            elif link.endswith(".gz"):
                os.rename(fname, fname + ".gz")
                # check_call raises on a non-zero exit status (assuming sp
                # is subprocess), so no return code check is needed
                sp.check_call(["gunzip", "-f", fname])

            not_included = []
            # remove alternative regions
            if not keep_alt:
                not_included.extend(regex_filter(fname, "alt", True))

            # keep/remove user defined regions
            if regex:
                not_included.extend(regex_filter(fname, regex, invert_match))

            # process genome (e.g. masking)
            if hasattr(self, "_post_process_download"):
                self._post_process_download(name=name,
                                            localname=localname,
                                            out_dir=tmp_dir,
                                            mask=mask)

            # bgzip genome if requested. The configuration is only consulted
            # when bgzip was not specified, so that bgzip=False is respected
            if bgzip or (bgzip is None and config.get("bgzip")):
                # bgzip to stdout, track progress, and output to file
                fsize = int(os.path.getsize(fname) * 10**-6)
                # paths are quoted, as they may contain spaces
                cmd = (
                    f"bgzip -fc {shlex.quote(fname)} | "
                    f"tqdm --bytes --desc Bgzipping {fsize}MB fasta --log ERROR | "
                    f"cat > {shlex.quote(fname + '.gz')}")
                sp.check_call(cmd, shell=True)
                fname += ".gz"

            # transfer the genome from the tmpdir to the genome_dir
            dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
            shutil.move(fname, dst)
        finally:
            # do not leave a partial download behind if a step failed
            rm_rf(tmp_dir)

        sys.stderr.write("\n")
        sys.stderr.write(f"name: {name}\n")
        sys.stderr.write(f"local name: {localname}\n")
        sys.stderr.write(f"fasta: {dst}\n")

        # Create readme with information
        readme = os.path.join(genomes_dir, localname, "README.txt")
        record = self.genomes.get(name)
        metadata = {
            "name": localname,
            "provider": self.name,
            "original name": name,
            "original filename": os.path.split(link)[-1],
            "assembly_accession": self.assembly_accession(record),
            "tax_id": self.genome_taxid(record),
            "mask": mask,
            "genome url": link,
            "annotation url": "na",
            "date": time.strftime("%Y-%m-%d %H:%M:%S"),
        }

        # describe the applied filters and list the excluded sequences
        lines = []
        if not keep_alt or regex:
            filters = []
            if not keep_alt:
                filters.append("'alt' (inverted match)")
            if regex:
                suffix = " (inverted match)" if invert_match else ""
                filters.append(f"'{regex}'{suffix}")
            regex_line = "regex: " + " and ".join(filters)

            lines += ["", regex_line, "sequences that were excluded:"]
            lines.extend(f"\t{seq}" for seq in not_included)
        write_readme(readme, metadata, lines)
Пример #20
0
def install_genome(
    name: str,
    provider: Optional[str] = None,
    genomes_dir: Optional[str] = None,
    localname: Optional[str] = None,
    mask: Optional[str] = "soft",
    keep_alt: Optional[bool] = False,
    regex: Optional[str] = None,
    invert_match: Optional[bool] = False,
    bgzip: Optional[bool] = None,  # None -> check config. False -> dont check.
    annotation: Optional[bool] = False,
    only_annotation: Optional[bool] = False,
    skip_matching: Optional[bool] = False,
    skip_filter: Optional[bool] = False,
    threads: Optional[int] = 1,
    force: Optional[bool] = False,
    **kwargs: Optional[dict],
) -> Genome:
    """
    Install a genome (& gene annotation).

    Files that are already present are only downloaded again with force.
    The active plugins are run whenever a genome is present, also if it
    was downloaded earlier.

    Parameters
    ----------
    name : str
        Genome name

    provider : str , optional
        Provider name. will try Ensembl, UCSC and NCBI (in that order) if not specified.

    genomes_dir : str , optional
        Where to create the output folder.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Genome masking of repetitive sequences. Options: hard/soft/none, default is soft.

    keep_alt : bool , optional
        Some genomes contain alternative regions. These regions cause issues with
        sequence alignment, as they are inherently duplications of the consensus regions.
        Set to true to keep these alternative regions.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that *don't* match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip,
        and gene annotation will be compressed with gzip.
        If not specified, the setting from the configuration is used.

    threads : int , optional
        Number of threads handed to the active plugins, e.g. to build a
        genome index using multithreading (if supported).

    force : bool , optional
        Set to True to overwrite existing files.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    only_annotation : bool , optional
        If set to True, only download the gene annotation files.

    skip_matching : bool , optional
        If set to True, contigs in the annotation not matching
        those in the genome will not be corrected.

    skip_filter : bool , optional
        If set to True, the gene annotations will not be filtered to match the genome contigs.

    kwargs : dict , optional
        Provider specific options.

        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.

        version : int , optional
            Ensembl only: Specify release version. Default is latest.

        to_annotation : text , optional
            URL only: direct link to annotation file.
            Required if this is not the same directory as the fasta.

    Returns
    -------
    Genome
        Genome class with the installed genome, or None if no genome is present
    """
    name = safe(name)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)
    genome_file = os.path.join(out_dir, f"{localname}.fa")
    provider = _provider_selection(name, localname, genomes_dir, provider)

    def _annotation_present():
        """True if both a GTF and a BED annotation are found in out_dir."""
        return bool(glob_ext_files(out_dir, "annotation.gtf")) and bool(
            glob_ext_files(out_dir, "annotation.bed")
        )

    # check which files need to be downloaded
    genome_found = _is_genome_dir(out_dir)
    need_genome = only_annotation is False and (
        genome_found is False or force is True
    )
    annotation_found = _annotation_present()
    # any of these options implies that a gene annotation is wanted
    annotation_requested = any(
        (
            annotation,
            only_annotation,
            skip_matching,
            skip_filter,
            kwargs.get("to_annotation"),
            kwargs.get("path_to_annotation"),
            kwargs.get("ucsc_annotation_type"),
        )
    )
    need_annotation = (
        annotation_found is False or force is True
    ) and annotation_requested

    genome = None
    genome_downloaded = False
    if need_genome:
        if force:
            _delete_extensions(out_dir, ["fa", "fai"])
        provider.download_genome(
            name,
            genomes_dir,
            mask=mask,
            localname=localname,
            **kwargs,
        )
        genome_found = genome_downloaded = True

        # Filter genome
        _filter_genome(genome_file, regex, invert_match, keep_alt)

        # Generates a Fasta object and the genome index, gaps and sizes files
        genome = Genome(localname, genomes_dir=genomes_dir)

        # Download the NCBI assembly report
        asm_report = os.path.join(out_dir, "assembly_report.txt")
        asm_acc = genome.assembly_accession
        report_missing = not os.path.exists(asm_report)
        if report_missing and asm_acc != "na":
            download_assembly_report(asm_acc, asm_report)

        # Export installed genome(s)
        generate_env(genomes_dir=genomes_dir)

    annotation_downloaded = False
    if need_annotation:
        if force:
            _delete_extensions(out_dir, ["annotation.gtf", "annotation.bed"])
        provider.download_annotation(name, genomes_dir, localname=localname, **kwargs)
        annotation_downloaded = _annotation_present()

    gene_annotation = None
    if annotation_downloaded:
        gene_annotation = Annotation(localname, genomes_dir=genomes_dir)
        # sanitizing needs a genome, and at least one of the two steps enabled
        if genome_found and (not skip_matching or not skip_filter):
            gene_annotation.sanitize(not skip_matching, not skip_filter, True)

    # Run active plugins (also if the genome was downloaded earlier)
    if genome_found:
        if not genome:
            genome = Genome(localname, genomes_dir=genomes_dir)
        for plugin in get_active_plugins():
            plugin.after_genome_download(genome, threads, force)

    # zip files downloaded now
    zip_requested = bgzip is True or (bgzip is None and config.get("bgzip"))
    if zip_requested:
        if genome_downloaded:
            bgzip_and_name(genome.filename)
        if annotation_downloaded:
            gzip_and_name(gene_annotation.annotation_gtf_file)
            gzip_and_name(gene_annotation.annotation_bed_file)

    return genome
Пример #21
0
def install_genome(
    name,
    provider=None,
    genomes_dir=None,
    localname=None,
    mask="soft",
    keep_alt=False,
    regex=None,
    invert_match=False,
    bgzip=None,
    annotation=False,
    only_annotation=False,
    skip_sanitizing=False,
    threads=1,
    force=False,
    **kwargs,
):
    """
    Install a genome and, if requested, its gene annotation.

    The genome is downloaded when it is not present yet (or when `force` is
    set), unless `only_annotation` is given. The annotation is downloaded when
    any annotation-related option is given and no GTF is present yet (or when
    `force` is set). Active plugins are run whenever a genome is available.

    Parameters
    ----------
    name : str
        Genome name

    provider : str , optional
        Provider name. will try Ensembl, UCSC and NCBI (in that order) if not specified.

    genomes_dir : str , optional
        Where to store the fasta files

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', choices 'hard'/'soft'/'none' for respective masking level.

    keep_alt : bool , optional
        Some genomes contain alternative regions. These regions cause issues with
        sequence alignment, as they are inherently duplications of the consensus regions.
        Set to true to keep these alternative regions.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.

    threads : int , optional
        Number of threads handed to the active plugins. Default is 1.

    force : bool , optional
        Set to True to overwrite existing files.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    only_annotation : bool , optional
        If set to True, only download the annotation files.

    skip_sanitizing : bool , optional
        If set to True, downloaded annotation files whose sequence names do not match
        with the (first header fields of) the genome.fa will not be corrected.
        Note that setting this flag also requests the annotation.

    kwargs : dict , optional
        Provider specific options.
        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.

        version : int , optional
            Ensembl only: Specify release version. Default is latest.

        to_annotation : text , optional
            URL only: direct link to annotation file.
            Required if this is not the same directory as the fasta.
    """
    name = safe(name)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)

    # --- genome -----------------------------------------------------------
    # download when the genome is missing or when overwriting is forced
    genome_found = _is_genome_dir(out_dir)
    genome_wanted = force or not genome_found
    if genome_wanted and not only_annotation:
        genome_provider = _provider_selection(name, localname, genomes_dir, provider)
        genome_provider.download_genome(
            name,
            genomes_dir,
            mask=mask,
            keep_alt=keep_alt,
            regex=regex,
            invert_match=invert_match,
            localname=localname,
            bgzip=bgzip,
            **kwargs,
        )
        genome_found = True

        # export the installed genome(s)
        generate_env(genomes_dir=genomes_dir)

    # creating the Genome object produces the Fasta object, index, gaps and sizes files
    genome = Genome(localname, genomes_dir=genomes_dir) if genome_found else None
    if genome_found and force:
        # replace support files left over from a previous install
        generate_fa_sizes(genome.genome_file, genome.sizes_file)
        generate_gap_bed(genome.genome_file, genome.gaps_file)

    # --- annotation -------------------------------------------------------
    # any annotation-related option implies that the annotation is wanted
    annotation_flags = (
        annotation,
        only_annotation,
        skip_sanitizing,
        kwargs.get("to_annotation"),
        kwargs.get("ucsc_annotation_type"),
    )
    annotation_wanted = any(annotation_flags)
    annotation_found = bool(glob_ext_files(out_dir, "gtf"))
    if annotation_wanted and (force or not annotation_found):
        annotation_provider = _provider_selection(name, localname, genomes_dir, provider)
        annotation_provider.download_annotation(
            name, genomes_dir, localname=localname, **kwargs
        )

        # sanitizing needs both the genome and a freshly downloaded GTF
        annotation_found = bool(glob_ext_files(out_dir, "gtf"))
        if genome_found and annotation_found and not skip_sanitizing:
            sanitize_annotation(genome)

    # --- plugins ----------------------------------------------------------
    # plugins operate on the genome, so there is nothing to do without one
    if not genome_found:
        return
    for plugin in get_active_plugins():
        plugin.after_genome_download(genome, threads, force)
Пример #22
0
    def get_genome_download_link(self, name, mask="soft", **kwargs):
        """
        Build the http link to the genome sequence and verify it is reachable.

        Parameters
        ----------
        name : str
            Genome name. Current implementation will fail if exact
            name is not found.

        mask : str , optional
            Masking level. Options: soft, hard or none. Default is soft.

        kwargs : dict , optional
            version : release version, forwarded to self.get_version.
            toplevel : if truthy, request the toplevel assembly instead of
            the primary assembly.

        Returns
        ------
        str with the http download link.

        Raises
        ------
        GenomeDownloadError
            If no candidate link could be reached.
        """
        genome = self.genomes[safe(name)]
        division, is_vertebrate = get_division(genome)

        # vertebrates are hosted on their own server, without a division directory
        if is_vertebrate:
            host = "http://ftp.ensembl.org"
        else:
            host = "http://ftp.ensemblgenomes.org"
        version = self.get_version(is_vertebrate, kwargs.get("version"))
        division_dir = f"/{division}" if not is_vertebrate else ""
        fasta_root = f"{host}/pub/release-{version}{division_dir}/fasta"

        # base directory of the genome
        species = genome["url_name"].lower()
        directory = f"{fasta_root}/{species}/dna"
        # some entries don't use url_name in their url... -,-
        # examples:
        #   - EnsemblVertebrates: mus_musculus_nzohlltj
        #   - EnsemblMetazoa: caenorhabditis_elegans
        if not check_url(directory, 2):
            species = genome["name"]
            directory = f"{fasta_root}/{species}/dna"

        # this assembly has its own directory
        if name == "GRCh37":
            directory = genome["genome"].format(version)

        # components of the fasta file name
        capitalized = species.capitalize()
        # patch suffixes (e.g. ".p13") are not part of the file name
        assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))
        mask_tags = {"soft": "_sm", "hard": "_rm", "none": ""}
        mask_tag = mask_tags[mask]
        level = "primary_assembly"
        if kwargs.get("toplevel"):
            level = "toplevel"
        release_tag = f".{version}" if not int(version) > 30 else ""
        filename = f"{capitalized}.{assembly}{release_tag}.dna{mask_tag}.{level}.fa.gz"

        link = f"{directory}/{filename}"
        reachable = check_url(link, 2)

        # primary assemblies do not always exist: fall back to the toplevel
        if not reachable and level == "primary_assembly":
            link = link.replace("primary_assembly", "toplevel")
            reachable = check_url(link, 2)

        if reachable:
            return link

        raise GenomeDownloadError(
            f"Could not download genome {name} from {self.name}.\n"
            "URL is broken. Select another genome or provider.\n"
            f"Broken URL: {link}")
Пример #23
0
    def _post_process_download(self, name, localname, out_dir, mask="soft"):
        """
        Replace accessions with sequence names in fasta file.

        Applies masking.

        The assembly report belonging to the genome is downloaded and used to
        map the first word of each FASTA header to a sequence name. Headers
        are rewritten as ">{sequence name} {original header}"; identifiers
        without a mapping are kept. The unprocessed FASTA is left in `out_dir`
        as "old_{localname}.fa".

        Parameters
        ----------
        name : str
            NCBI genome name

        localname : str
            Custom name for your genome

        out_dir : str
            Output directory

        mask : str , optional
            masking level: soft/hard/none, default=soft.
            Any value other than soft or hard unmasks (uppercases) the sequence.
        """
        # Create mapping of accessions to names
        genome = self.genomes[safe(name)]
        url = genome["ftp_path"]
        # the report is named after the last component of the ftp path
        url += f"/{url.split('/')[-1]}_assembly_report.txt"
        url = url.replace("ftp://", "https://")

        tr = {}
        urlcleanup()
        with urlopen(url) as response:
            for line in response.read().decode("utf-8").splitlines():
                if line.startswith("#"):
                    continue
                vals = line.strip().split("\t")
                # blank or truncated rows have no 7th column to map from:
                # skip them instead of failing with an IndexError
                if len(vals) < 7:
                    continue
                tr[vals[6]] = vals[0]

        # mask sequence if required
        if mask == "soft":

            def mask_cmd(txt):
                return txt

        elif mask == "hard":
            sys.stderr.write(
                "\nNCBI genomes are softmasked by default. Hard masking...\n")

            def mask_cmd(txt):
                return re.sub("[actg]", "N", txt)

        else:
            sys.stderr.write(
                "\nNCBI genomes are softmasked by default. Unmasking...\n")

            def mask_cmd(txt):
                return txt.upper()

        # apply mapping and masking
        fa = os.path.join(out_dir, f"{localname}.fa")
        old_fa = os.path.join(out_dir, f"old_{localname}.fa")
        os.rename(fa, old_fa)
        with open(old_fa) as old, open(fa, "w") as new:
            for line in old:
                if line.startswith(">"):
                    desc = line.strip()[1:]
                    # separate local name: do not clobber the `name` parameter
                    accession = desc.split(" ")[0]
                    new.write(f">{tr.get(accession, accession)} {desc}\n")
                else:
                    new.write(mask_cmd(line))
Пример #24
0
    def get_genome_download_link(self, name, mask="soft", **kwargs):
        """
        Build the Ensembl ftp link to the genome sequence and verify it.

        The (much smaller) primary assembly is preferred; the toplevel
        assembly is used when requested or when no primary assembly is found.

        Parameters
        ----------
        name : str
            Genome name. Current implementation will fail if exact
            name is not found.

        mask : str , optional
            Masking level. Options: soft, hard or none. Default is soft.

        kwargs : dict , optional
            version : release version. Looked up with self.get_version if absent.
            toplevel : if truthy, always use the toplevel assembly.

        Returns
        ------
        str with the http/ftp download link.

        Raises
        ------
        NotImplementedError
            If the genome belongs to the bacteria division.
        GenomeDownloadError
            If the final link could not be reached.
        """
        genome = self.genomes[safe(name)]

        # division name without the "ensembl" prefix
        division = genome["division"].lower().replace("ensembl", "")
        if division == "bacteria":
            raise NotImplementedError(
                "bacteria from ensembl not yet supported")
        is_vertebrate = division == "vertebrates"

        # Ensembl release version
        version = kwargs.get("version")
        if version is None:
            version = self.get_version(self.rest_url, is_vertebrate)

        # vertebrates use a different server and have no division directory
        species_dir = genome["url_name"].lower()
        if is_vertebrate:
            url = f"ftp://ftp.ensembl.org/pub/release-{version}/fasta/{species_dir}/dna"
        else:
            url = (
                f"ftp://ftp.ensemblgenomes.org/pub/{division}"
                f"/release-{version}/fasta/{species_dir}/dna"
            )

        # file name components shared by both assembly levels
        mask_tag = {"soft": "dna_sm", "hard": "dna_rm", "none": "dna"}[mask]
        species_cap = genome["url_name"].capitalize()
        # patch suffixes (e.g. ".p13") are not part of the file name
        assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))
        prefix = f"{url}/{species_cap}.{assembly}.{mask_tag}"

        # try to get the (much smaller) primary assembly,
        # unless specified otherwise
        link = f"{prefix}.primary_assembly.fa.gz"
        if kwargs.get("toplevel") or not check_url(link, 2):
            link = f"{prefix}.toplevel.fa.gz"

        if not check_url(link, 2):
            raise GenomeDownloadError(
                f"Could not download genome {name} from {self.name}.\n"
                "URL is broken. Select another genome or provider.\n"
                f"Broken URL: {link}")
        return link
Пример #25
0
    def download_genome(
        self,
        name,
        genomes_dir=None,
        localname=None,
        mask="soft",
        keep_alt=False,
        regex=None,
        invert_match=False,
        bgzip=None,
        **kwargs,
    ):
        """
        Download a (gzipped) genome file to a specific directory

        The genome is downloaded and processed in a temporary directory inside
        the genome's output directory, then moved into place. A README.txt
        with metadata is written next to it.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        keep_alt : bool , optional
            Set to true to keep these alternative regions.

        regex : str , optional
            Regular expression to select specific chromosome / scaffold names.

        invert_match : bool , optional
            Set to True to select all chromosomes that don't match the regex.

        bgzip : bool , optional
            If set to True the genome FASTA file will be compressed using bgzip.
            If set to False the genome is never compressed.
            If not specified, the setting from the configuration file will be used.

        kwargs : dict , optional
            Provider specific options, forwarded to get_genome_download_link.

        Raises
        ------
        subprocess.CalledProcessError
            If gunzip or bgzip exits with a non-zero status.
        """
        name = safe(name)
        self.check_name(name)

        link = self.get_genome_download_link(name, mask=mask, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
        out_dir = os.path.join(genomes_dir, localname)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        sys.stderr.write(
            f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

        def regex_filter(_fname, _regex, _v):
            """Filter the fasta in place; return the names of removed sequences."""
            infa = _fname + "_to_regex"
            os.rename(_fname, infa)
            filter_fasta(infa, _fname, regex=_regex, v=_v, force=True)

            # read the surviving names once instead of once per input sequence
            kept = set(Fasta(_fname).keys())
            return [k for k in Fasta(infa).keys() if k not in kept]

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        with TemporaryDirectory(dir=out_dir) as tmp_dir:
            fname = os.path.join(tmp_dir, f"{localname}.fa")

            # actual download
            urlcleanup()
            with urlopen(link) as response:
                # check available memory vs file size.
                available_memory = int(virtual_memory().available)
                cutoff = int(available_memory * 0.75)
                # the size header may be absent: then assume a large file
                content_length = response.info()["Content-Length"]
                fits_in_memory = (
                    content_length is not None and int(content_length) < cutoff
                )
                # download file in chunks if >75% of memory would be used
                chunk_size = None if fits_in_memory else cutoff
                with open(fname, "wb") as f_out:
                    shutil.copyfileobj(response, f_out, chunk_size)
            sys.stderr.write(
                "Genome download successful, starting post processing...\n")

            # unzip genome
            # (check_call raises CalledProcessError on a non-zero exit status)
            if link.endswith(".tar.gz"):
                tar_to_bigfile(fname, fname)
            elif link.endswith(".gz"):
                os.rename(fname, fname + ".gz")
                sp.check_call(["gunzip", "-f", fname])

            not_included = []
            # remove alternative regions
            if not keep_alt:
                not_included.extend(regex_filter(fname, "alt", True))

            # keep/remove user defined regions
            if regex:
                not_included.extend(regex_filter(fname, regex, invert_match))

            # process genome (e.g. masking)
            if hasattr(self, "_post_process_download"):
                self._post_process_download(name=name,
                                            localname=localname,
                                            out_dir=tmp_dir,
                                            mask=mask)

            # bgzip genome if requested: an explicit argument wins,
            # the configuration is only consulted when bgzip is not specified
            if bgzip or (bgzip is None and config.get("bgzip")):
                sp.check_call(["bgzip", "-f", fname])
                fname += ".gz"

            # transfer the genome from the tmpdir to the genome_dir
            src = fname
            dst = os.path.join(out_dir, os.path.basename(fname))
            shutil.move(src, dst)

        sys.stderr.write("\n")
        sys.stderr.write(f"name: {name}\n")
        sys.stderr.write(f"local name: {localname}\n")
        sys.stderr.write(f"fasta: {dst}\n")

        # Create readme with information
        readme = os.path.join(out_dir, "README.txt")
        metadata = {
            "name": localname,
            "provider": self.name,
            "original name": name,
            "original filename": os.path.split(link)[-1],
            "assembly_accession":
            self.assembly_accession(self.genomes.get(name)),
            "tax_id": self.genome_taxid(self.genomes.get(name)),
            "mask": mask,
            "genome url": link,
            "annotation url": "na",
            "date": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        lines = []
        # excluded sequences are only listed when a user regex was applied
        if regex:
            regex_line = f"regex: {regex}"
            if invert_match:
                regex_line += " (inverted match)"
            lines += ["", regex_line, "sequences that were excluded:"]
            lines.extend(f"\t{seq}" for seq in not_included)
        write_readme(readme, metadata, lines)
Пример #26
0
 def _search_descriptions(self, genome, term):
     """check if search term corresponds to the provider's description field(s)"""
     for field in self.description_fields:
         if term in safe(genome[field].lower()):
             return True