예제 #1
0
    def download_annotation(self,
                            name,
                            genomes_dir=None,
                            localname=None,
                            **kwargs):
        """
        Download the annotation of a genome into the genomes directory.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install annotation

        localname : str , optional
            Custom name for your genome

        kwargs : dict , optional
            Forwarded unchanged to ``get_annotation_download_link``.
        """
        # the name is checked before any link is looked up
        self.check_name(name)
        annotation_link = self.get_annotation_download_link(name, **kwargs)

        # resolve where, and under which name, the annotation is stored
        target_name = get_localname(name, localname)
        target_dir = get_genomes_dir(genomes_dir, check_exist=False)

        self.attempt_and_report(name, target_name, annotation_link, target_dir)
예제 #2
0
    def _post_process_download(self, name, localname, out_dir, mask="soft"):
        """
        Replace accessions with sequence names in fasta file.

        The assembly report of the genome is downloaded and used to translate
        the first word of every fasta header. The masking of the sequences is
        adjusted on the fly when ``mask`` is not "soft".

        Parameters
        ----------
        name : str
            NCBI genome name

        localname : str
            Custom name for the genome (resolved with ``get_localname``)

        out_dir : str
            Output directory

        mask : str , optional
            "soft" keeps sequences as they are, "hard" replaces lowercase
            a/c/t/g with N, any other value uppercases the sequences.

        Raises
        ------
        ValueError
            If ``name`` matches none of the assemblies in ``self.genomes``.
        Exception
            If the genome fasta is not found in ``out_dir``.
        """
        # Get the FTP url for this specific genome and download
        # the assembly report
        url = None
        for genome in self.genomes:
            asm_name = genome["asm_name"]
            if name in [asm_name, asm_name.replace(" ", "_")]:
                url = genome["ftp_path"]
                url += "/" + url.split("/")[-1] + "_assembly_report.txt"
                url = url.replace("ftp://", "https://")
                break
        if url is None:
            # without this guard the code below fails with an UnboundLocalError
            raise ValueError(
                "Genome {} not found in the NCBI assembly list".format(name)
            )

        # Create mapping of accessions to names
        # (7th report column -> 1st report column)
        tr = {}
        urlcleanup()
        with urlopen(url) as response:
            for line in response.read().decode("utf-8").splitlines():
                if line.startswith("#"):
                    continue
                vals = line.strip().split("\t")
                if len(vals) < 7:
                    # blank or truncated line: nothing to map
                    continue
                tr[vals[6]] = vals[0]

        localname = get_localname(name, localname)
        # Check of the original genome fasta exists
        fa = os.path.join(out_dir, "{}.fa".format(localname))
        if not os.path.exists(fa):
            raise Exception("Genome fasta file not found, {}".format(fa))

        # Use a tmp file and replace the names
        new_fa = os.path.join(out_dir, ".process.{}.fa".format(localname))
        if mask != "soft":
            sys.stderr.write(
                "NCBI genomes are softmasked by default. Changing mask...\n"
            )

        # decide once how sequence lines are rewritten
        if mask == "hard":

            def convert(seq):
                return re.sub("[actg]", "N", seq)

        elif mask == "soft":
            convert = None
        else:
            convert = str.upper

        with open(fa) as old, open(new_fa, "w") as new:
            for line in old:
                if line.startswith(">"):
                    desc = line.strip()[1:]
                    # separate variable: do not clobber the `name` argument
                    seq_id = desc.split(" ")[0]
                    new.write(">{} {}\n".format(tr.get(seq_id, seq_id), desc))
                elif convert is None:
                    new.write(line)
                else:
                    new.write(convert(line))

        # Rename tmp file to real genome file
        shutil.move(new_fa, fa)
예제 #3
0
    def download_annotation(self,
                            url,
                            genomes_dir=None,
                            localname=None,
                            **kwargs):
        """
        Try to download a gtf or gff3 file that belongs to the genome at the given url.

        Candidate links are tried in order until one of them installs
        without a GenomeDownloadError.

        Parameters
        ----------
        url : str
            url of where to download genome from

        genomes_dir : str
            Directory to install annotation

        localname : str , optional
            Custom name for your genome

        kwargs: dict , optional:
            Provider specific options.

            to_annotation : str , optional
                url to annotation file (only required if this not located in the same directory as the fasta)

        Returns
        -------
        GenomeDownloadError or None
            The error of the last candidate link if that one failed as well
            (it is returned, not raised); None otherwise.
        """
        genome_name = get_localname(url)
        target_name = get_localname(genome_name, localname)
        target_dir = get_genomes_dir(genomes_dir, check_exist=False)

        if kwargs.get("to_annotation"):
            # an explicit annotation url was supplied: a single candidate
            candidates = [self.get_annotation_download_link(None, **kwargs)]
        else:
            # can return multiple possible hits
            candidates = self.search_url_for_annotations(url, genome_name)

        for link in candidates:
            try:
                self.attempt_and_report(genome_name, target_name, link, target_dir)
            except GenomeDownloadError as e:
                if link == candidates[-1]:
                    # nothing left to try: hand the error back to the caller
                    return e
                sys.stdout.write(
                    "\nOne of the potential annotations was incompatible with genomepy."
                    + "\nAttempting another...\n\n")
            else:
                # installed successfully, ignore the remaining candidates
                break
예제 #4
0
def search_url_for_annotations(url: str) -> list:
    """
    Look for gtf or gff3 files in the remote directory that holds the genome.

    Returns a list of candidate links (possibly empty). A warning is logged
    when no candidate was found.
    """
    genome_name = get_localname(url)

    # everything in front of the last "/" is treated as the remote directory
    last_slash = url.rfind("/")
    remote_dir = url[:last_slash]
    logger.info(
        "You have requested the gene annotation to be downloaded. "
        "Genomepy will check the remote directory: "
        f"{remote_dir} for annotation files..."
    )

    # try to find a GTF or GFF3 file in the (unparsed) directory listing
    listing = read_url(remote_dir).split("\n")
    links = []
    for fname in fuzzy_annotation_search(genome_name, listing):
        links.append(remote_dir + "/" + fname)

    if not links:
        logger.warning(
            "Could not parse the remote directory. "
            "Please supply a URL using --URL-to-annotation.\n"
        )
    return links
예제 #5
0
    def download_annotation(self,
                            name,
                            genomes_dir=None,
                            localname=None,
                            **kwargs):
        """
        Download annotation file to a specific directory

        The assembly report is downloaded first, next to the annotation, and
        the annotation url is recorded in the README afterwards.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install annotation

        localname : str , optional
            Custom name for your genome

        kwargs : dict , optional
            Forwarded unchanged to ``get_annotation_download_link``.

        Raises
        ------
        GenomeDownloadError
            If downloading the assembly report or the annotation fails.
            The original exception is attached as ``__cause__``.
        """
        name = self._check_name(name)
        link = self.get_annotation_download_link(name, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)

        logger.info(
            f"Downloading annotation from {self.name}. Target URL: {link}...")
        try:
            # download exact assembly report to rename the scaffolds
            acc = self.assembly_accession(name)
            fname = os.path.join(genomes_dir, localname, "assembly_report.txt")
            download_assembly_report(acc, fname)

            # module-level function, not this method
            download_annotation(genomes_dir, link, localname)
            logger.info("Annotation download successful")
        except Exception as e:
            # exceptions raised without arguments have an empty `args` tuple;
            # indexing it blindly would hide the real error behind an IndexError
            reason = e.args[0] if e.args else e
            raise GenomeDownloadError(
                f"An error occured while installing the gene annotation for {name} from {self.name}.\n"
                "If you think the annotation should be there, please file a bug report at: "
                "https://github.com/vanheeringen-lab/genomepy/issues\n\n"
                f"Error: {reason}") from e

        # Add annotation URL to readme
        readme = os.path.join(genomes_dir, localname, "README.txt")
        update_readme(readme, updated_metadata={"annotation url": link})
예제 #6
0
    def search_url_for_annotation(url):
        """
        Attempts to find a gtf or gff3 file in the same location as the genome url

        The remote directory listing is scanned for the first line that
        mentions a gtf file, or a gff file named after the genome. File names
        are compared case-insensitively.

        Parameters
        ----------
        url : str
            url of the genome file

        Returns
        -------
        str or None
            Link to the annotation file, or None if ``check_url`` rejects it.

        Raises
        ------
        FileNotFoundError
            If no annotation file name could be parsed from the listing.
        """
        urldir = os.path.dirname(url)
        sys.stderr.write(
            "You have requested gene annotation to be downloaded.\n"
            "Genomepy will check the remote directory:\n"
            f"{urldir} for annotation files...\n")

        # lowercase the name: it is compared against lowercased text below
        name = get_localname(url).lower()
        hints = (".gtf", name + ".gff")
        suffixes = (
            ".gtf",
            ".gtf.gz",
            name + ".gff",
            name + ".gff.gz",
            name + ".gff3",
            name + ".gff3.gz",
        )

        # try to find a GTF or GFF3 file
        # (stays empty if the listing has no lines at all)
        urlstr = ""
        with urlopen(urldir) as f:
            for urlline in f.readlines():
                if isinstance(urlline, bytes):
                    urlstr = urlline.decode("utf-8", errors="replace")
                else:
                    urlstr = str(urlline)
                if any(hint in urlstr.lower() for hint in hints):
                    break

        # retrieve the filename from the HTML line
        fname = ""
        for split in re.split('>|<|><|/|"', urlstr):
            token = split.strip()
            if token.lower().endswith(suffixes):
                fname = token
                break
        else:
            raise FileNotFoundError(
                "Could not parse the remote directory. "
                "Please supply a URL using --url-to-annotation.\n")

        # set variables for downloading
        link = urldir + "/" + fname

        if check_url(link):
            return link
        return None
예제 #7
0
    def _post_process_download(self, name, localname, out_dir, mask="soft"):
        """
        Unmask a softmasked genome if required

        Nothing is done when ``mask`` is "hard" or "soft". For any other
        value all sequence lines of the fasta are converted to uppercase.

        Parameters
        ----------
        name : str
            UCSC genome name

        localname : str
            Custom name for the genome (resolved with ``get_localname``)

        out_dir : str
            Output directory

        mask : str , optional
            Requested masking

        Raises
        ------
        Exception
            If the genome fasta is not found in ``out_dir``.
        """
        if mask in ["hard", "soft"]:
            return

        localname = get_localname(name, localname)

        # Check of the original genome fasta exists
        fa = os.path.join(out_dir, "{}.fa".format(localname))
        if not os.path.exists(fa):
            raise Exception("Genome fasta file not found, {}".format(fa))

        sys.stderr.write("UCSC genomes are softmasked by default. Unmasking...\n")

        # Use a tmp file next to the fasta. A `localname` subdirectory of
        # out_dir is not guaranteed to exist, so it must not be used here.
        new_fa = os.path.join(out_dir, ".process.{}.fa".format(localname))
        with open(fa) as old, open(new_fa, "w") as new:
            for line in old:
                # headers are copied as they are, sequences are uppercased
                new.write(line if line.startswith(">") else line.upper())

        # Rename tmp file to real genome file
        shutil.move(new_fa, fa)
예제 #8
0
    def download_annotation(self,
                            name,
                            genomes_dir=None,
                            localname=None,
                            **kwargs):
        """
        Download the UCSC genePred via their MySQL database, and convert to annotations.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install annotation

        localname : str , optional
            Custom name for your genome

        kwargs : dict , optional
            Forwarded unchanged to ``get_annotation_download_link``.

        Raises
        ------
        GenomeDownloadError
            If the download fails. The original exception is attached as
            ``__cause__``.
        """
        name = self._check_name(name)
        annot = self.get_annotation_download_link(name, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)

        logger.info(
            f"Downloading the {annot} annotation from the UCSC MySQL database."
        )
        try:
            # module-level function, not this method
            download_annotation(name, annot, genomes_dir, localname)
            logger.info("Annotation download successful")
        except Exception as e:
            # exceptions raised without arguments have an empty `args` tuple;
            # indexing it blindly would hide the real error behind an IndexError
            reason = e.args[0] if e.args else e
            raise GenomeDownloadError(
                f"An error occured while installing the gene annotation for {name} from {self.name}.\n"
                "If you think the annotation should be there, please file a bug report at: "
                "https://github.com/vanheeringen-lab/genomepy/issues\n\n"
                f"Error: {reason}") from e

        # Add annotation URL to readme
        readme = os.path.join(genomes_dir, localname, "README.txt")
        update_readme(
            readme,
            updated_metadata={
                "annotation url":
                f"UCSC MySQL database: {name}, table: {annot}"
            },
        )
예제 #9
0
    def download_annotation(self, name, genome_dir, localname=None, **kwargs):
        """
        Download Ensembl annotation file to a specific directory

        The GTF is downloaded to a temporary directory, converted to BED
        with gtfToGenePred / genePredToBed, and both files are then moved
        into ``genome_dir/localname``. The url is appended to the README.

        Parameters
        ----------
        name : str
            Ensembl genome name.
        genome_dir : str
            Genome directory.
        localname : str , optional
            Custom name for your genome
        kwargs: dict , optional:
            Provider specific options.

            version : int , optional
                Ensembl version. By default the latest version is used.

        Raises
        ------
        NotImplementedError
            For genomes of the bacteria division.
        Exception
            Any error during download or conversion is re-raised after a
            message has been written to stderr.
        """
        import shlex

        sys.stderr.write("Downloading gene annotation...\n")

        localname = get_localname(name, localname)
        genome_info = self._get_genome_info(name)

        # parse the division
        division = genome_info["division"].lower().replace("ensembl", "")
        if division == "bacteria":
            raise NotImplementedError("bacteria from ensembl not yet supported")

        # Get the base link depending on division
        ftp_site = "ftp://ftp.ensemblgenomes.org/pub"
        if division == "vertebrates":
            ftp_site = "https://ftp.ensembl.org/pub"

        # precedence: explicit keyword > provider setting > lookup on the server
        version = self.version
        if kwargs.get("version", None):
            version = kwargs.get("version")
        elif not version:
            version = self.get_version(ftp_site)

        if division != "vertebrates":
            ftp_site += "/{}".format(division)

        # Get the GTF URL
        base_url = ftp_site + "/release-{}/gtf/{}/{}.{}.{}.gtf.gz"
        safe_name = name.replace(" ", "_")
        # drop a trailing patch suffix such as ".p13"
        safe_name = re.sub(r"\.p\d+$", "", safe_name)

        ftp_link = base_url.format(
            version,
            genome_info["url_name"].lower(),
            genome_info["url_name"].capitalize(),
            safe_name,
            version,
        )

        out_dir = os.path.join(genome_dir, localname)
        # also creates genome_dir itself when it does not exist yet
        os.makedirs(out_dir, exist_ok=True)

        # download to tmp dir. Move genome on completion.
        with TemporaryDirectory(dir=out_dir) as tmpdir:
            try:
                # actual download, streamed to disk
                sys.stderr.write("Using {}\n".format(ftp_link))
                gtf_file = os.path.join(tmpdir, localname + ".annotation.gtf.gz")
                with urlopen(ftp_link) as response, open(gtf_file, "wb") as f:
                    shutil.copyfileobj(response, f)

                # only swap the extension: str.replace would also rewrite
                # a "gtf.gz" occurring earlier in the path
                bed_file = gtf_file[: -len("gtf.gz")] + "bed"
                cmd = (
                    "gtfToGenePred {0} /dev/stdout | "
                    "genePredToBed /dev/stdin {1} && gzip -f {1}"
                )
                # quote the paths: they may contain spaces or shell characters
                sp.check_call(
                    cmd.format(shlex.quote(gtf_file), shlex.quote(bed_file)),
                    shell=True,
                )

                # transfer the genome from the tmpdir to the genome_dir
                for src in [gtf_file, bed_file + ".gz"]:
                    dst = os.path.join(out_dir, os.path.basename(src))
                    shutil.move(src, dst)

                readme = os.path.join(out_dir, "README.txt")
                with open(readme, "a") as f:
                    f.write("annotation url: {}\n".format(ftp_link))

            except Exception:
                sys.stderr.write("\nCould not download {}\n".format(ftp_link))
                raise
예제 #10
0
    def download_genome(
        self,
        name,
        genome_dir,
        localname=None,
        mask="soft",
        regex=None,
        invert_match=False,
        bgzip=None,
        **kwargs
    ):
        """
        Download a (gzipped) genome file to a specific directory

        The genome is downloaded and processed in a temporary directory
        inside ``genome_dir/localname`` and moved into place at the end.
        A README.txt describing the download is written next to it.

        Parameters
        ----------
        name : str
            Genome / species name

        genome_dir : str
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        regex : str , optional
            Regular expression to select specific chromosome / scaffold names.

        invert_match : bool , optional
            Set to True to select all chromosomes that don't match the regex.

        bgzip : bool , optional
            If set to True the genome FASTA file will be compressed using bgzip.
            If not specified, the setting from the configuration file will be used.

        kwargs : dict , optional
            Forwarded unchanged to ``get_genome_download_link``.

        Raises
        ------
        subprocess.CalledProcessError
            If gunzip or bgzip exits with a non-zero status.
        """
        genome_dir = os.path.expanduser(genome_dir)
        os.makedirs(genome_dir, exist_ok=True)

        dbname, link = self.get_genome_download_link(name, mask=mask, **kwargs)
        myname = get_localname(dbname, localname)
        out_dir = os.path.join(genome_dir, myname)
        os.makedirs(out_dir, exist_ok=True)

        sys.stderr.write("Downloading genome from {}...\n".format(link))

        not_included = []
        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        with TemporaryDirectory(dir=out_dir) as tmpdir:
            fname = os.path.join(tmpdir, myname + ".fa")

            # actual download
            urlcleanup()
            with urlopen(link) as response:
                # check available memory vs file size.
                available_memory = int(virtual_memory().available)
                # the server may not report a size; treat it as unknown then
                content_length = response.info().get("Content-Length")
                file_size = None if content_length is None else int(content_length)
                # download file in chunks if >75% of memory would be used
                # (or if the size is unknown)
                cutoff = int(available_memory * 0.75)
                if file_size is not None and file_size < cutoff:
                    chunk_size = None
                else:
                    chunk_size = cutoff
                with open(fname, "wb") as f_out:
                    shutil.copyfileobj(response, f_out, chunk_size)

            # unzip genome
            if link.endswith("tar.gz"):
                self.tar_to_bigfile(fname, fname)
            elif link.endswith(".gz"):
                # gunzip will only work with files ending with ".gz"
                os.rename(fname, fname + ".gz")
                # check_call raises CalledProcessError on a non-zero exit
                # status, so no separate return code check is needed
                sp.check_call(["gunzip", "-f", fname])

            # process genome (e.g. masking)
            if hasattr(self, "_post_process_download"):
                self._post_process_download(name, localname, tmpdir, mask)

            if regex:
                infa = fname + "_to_regex"
                outfa = fname
                os.rename(fname, infa)
                filter_fasta(infa, outfa, regex=regex, v=invert_match, force=True)

                # open the filtered fasta once instead of once per sequence
                kept = set(Fasta(outfa).keys())
                not_included = [k for k in Fasta(infa).keys() if k not in kept]

            # bgzip genome if requested
            if bgzip is None:
                bgzip = config.get("bgzip", False)

            if bgzip:
                # raises CalledProcessError on failure, and FileNotFoundError
                # if the bgzip executable is missing
                sp.check_call(["bgzip", "-f", fname])
                fname += ".gz"

            # transfer the genome from the tmpdir to the genome_dir
            dst = os.path.join(out_dir, os.path.basename(fname))
            shutil.move(fname, dst)

        sys.stderr.write("name: {}\n".format(dbname))
        sys.stderr.write("local name: {}\n".format(myname))
        sys.stderr.write("fasta: {}\n".format(dst))

        # Create readme with information
        readme = os.path.join(out_dir, "README.txt")
        with open(readme, "w") as f:
            f.write("name: {}\n".format(myname))
            f.write("original name: {}\n".format(dbname))
            f.write("original filename: {}\n".format(os.path.split(link)[-1]))
            f.write("url: {}\n".format(link))
            f.write("mask: {}\n".format(mask))
            f.write("date: {}\n".format(time.strftime("%Y-%m-%d %H:%M:%S")))
            if regex:
                if invert_match:
                    f.write("regex: {} (inverted match)\n".format(regex))
                else:
                    f.write("regex: {}\n".format(regex))
                f.write("sequences that were excluded:\n")
                for seq in not_included:
                    f.write("\t{}\n".format(seq))
예제 #11
0
    def download_genome(
        self,
        name: str,
        genomes_dir: str = None,
        localname: str = None,
        mask: str = "soft",
        **kwargs,
    ):
        """
        Fetch a (possibly compressed) genome and install it as <localname>.fa

        The file is retrieved into a temporary directory inside the genome's
        own directory, unpacked, post-processed if the provider defines
        ``_post_process_download``, and finally moved into place. Metadata
        about the download is stored in the README.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        kwargs : dict , optional
            Forwarded unchanged to ``get_genome_download_link``.
        """
        name = self._check_name(name)
        link = self.get_genome_download_link(name, mask=mask, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
        out_dir = os.path.join(genomes_dir, localname)
        mkdir_p(out_dir)

        logger.info(
            f"Downloading genome from {self.name}. Target URL: {link}...")

        # a link that exists on disk is copied, anything else is downloaded
        if os.path.exists(link):
            get_file = shutil.copyfile
        else:
            get_file = download_file

        # work in a tmp dir inside out_dir, so the final move
        # does not have to cross disks
        with TemporaryDirectory(dir=out_dir) as tmp_dir:
            downloaded = os.path.join(tmp_dir, link.split("/")[-1])
            fname = os.path.join(tmp_dir, f"{localname}.fa")

            get_file(link, downloaded)
            logger.info(
                "Genome download successful, starting post processing...")

            # unpack archives, plain files only need their final name
            _, is_compressed = get_file_info(link)
            if not is_compressed:
                shutil.move(downloaded, fname)
            else:
                extract_archive(downloaded, outfile=fname, concat=True)

            # provider specific processing (e.g. masking)
            if hasattr(self, "_post_process_download"):
                self._post_process_download(
                    name=name, fname=fname, out_dir=out_dir, mask=mask
                )

            # move the finished genome out of the tmp dir
            dst = os.path.join(out_dir, f"{localname}.fa")
            shutil.move(fname, dst)

        for label, value in (
            ("name", name),
            ("local name", localname),
            ("fasta", dst),
        ):
            logger.info("{}: {}".format(label, value))

        # Create readme with information
        readme = os.path.join(genomes_dir, localname, "README.txt")
        asm_acc = self.assembly_accession(name)
        tax_id = self.genome_taxid(name)
        metadata = {
            "name": localname,
            "provider": self.name,
            "original name": name,
            "original filename": os.path.split(link)[-1],
            "assembly_accession": asm_acc or "na",
            "tax_id": tax_id or "na",
            "mask": mask,
            "genome url": link,
            "genomepy version": __version__,
            "date": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        update_readme(readme, metadata)
예제 #12
0
    def download_annotation(self,
                            name,
                            genome_dir,
                            localname=None,
                            version=None):
        """
        Download annotation file to a specific directory

        The NCBI GFF is downloaded and, if it contains genes, converted to
        gzipped BED and GTF files. Failures are reported on stderr and the
        annotation is skipped.

        Parameters
        ----------
        name : str
            Genome / species name

        genome_dir : str
            Directory to install annotation

        localname : str , optional
            Custom name for your genome

        version : optional
            Not used by this method.

        Returns
        -------
        str or None
            The directory with the annotation, or None if the genome is
            unknown or the download failed.
        """
        import shlex

        sys.stderr.write("Downloading annotation...\n")
        if not self.genomes:
            self.genomes = self._get_genomes()

        url = None
        for genome in self.genomes:
            if genome["asm_name"] == name:
                url = genome["ftp_path"]
                url += "/" + url.split("/")[-1] + "_genomic.gff.gz"

        if url is None:
            # unknown genome: skip, like any other annotation failure here
            sys.stderr.write(
                "WARNING: Could not find genome {} at NCBI, ".format(name) +
                "skipping annotation.\n")
            return

        localname = get_localname(name, localname)
        out_dir = os.path.join(genome_dir, localname)
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        # Download the file
        gff_file = os.path.join(out_dir, localname + ".annotation.gff.gz")
        try:
            response = urlopen(url)
            try:
                with open(gff_file, "wb") as f:
                    f.write(response.read())
            finally:
                # always release the connection
                response.close()
        except Exception:
            sys.stderr.write(
                "WARNING: Could not download annotation from NCBI, " +
                "skipping.\n")
            sys.stderr.write("URL: {}\n".format(url))

            sys.stderr.write("If you think the annotation should be there, " +
                             "please file a bug report at:\n")
            sys.stderr.write("https://github.com/simonvh/genomepy/issues\n")
            return

        # quote the path: it may contain spaces or shell characters
        gff_arg = shlex.quote(gff_file)

        # count the genePred lines to see if the GFF contains any gene
        cmd = "gff3ToGenePred {0} /dev/stdout | wc -l"
        out = sp.check_output(cmd.format(gff_arg), shell=True)
        if out.strip() == b"0":
            sys.stderr.write(
                "WARNING: annotation from NCBI contains no genes, " +
                "skipping.\n")
        else:
            # Convert to BED file
            bed_file = gff_file.replace("gff.gz", "bed")
            cmd = ("gff3ToGenePred -rnaNameAttr=gene {0} /dev/stdout | "
                   "genePredToBed /dev/stdin {1} && gzip {1}")
            sp.check_call(cmd.format(gff_arg, shlex.quote(bed_file)), shell=True)

            # Convert to GTF file
            gtf_file = gff_file.replace("gff.gz", "gtf")
            cmd = ("gff3ToGenePred -geneNameAttr=gene {0} /dev/stdout | " +
                   "genePredToGtf file /dev/stdin {1} && gzip {1}")
            sp.check_call(cmd.format(gff_arg, shlex.quote(gtf_file)), shell=True)

        # the README lives in the directory named after the local name
        readme = os.path.join(out_dir, "README.txt")
        with open(readme, "a") as f:
            f.write("annotation url: {}\n".format(url))

        return out_dir
예제 #13
0
    def download_annotation(self, name, genome_dir, localname=None, **kwargs):
        """
        Download NCBI annotation file to a specific directory

        The GFF is downloaded to a temporary directory and, if it contains
        genes, converted to gzipped GTF and BED files that are moved into
        ``genome_dir/localname``. Any failure is reported on stderr and the
        annotation is skipped; nothing is raised for it.

        Parameters
        ----------
        name : str
            Genome / species name

        genome_dir : str
            Directory to install annotation

        localname : str , optional
            Custom name for your genome

        kwargs : dict , optional
            Accepted, but not used by this method.
        """
        import shlex

        sys.stderr.write("Downloading gene annotation...\n")

        localname = get_localname(name, localname)
        if not self.genomes:
            self.genomes = self._get_genomes()

        url = None
        for genome in self.genomes:
            if name in [genome["asm_name"], genome["asm_name"].replace(" ", "_")]:
                url = genome["ftp_path"]
                url = url.replace("ftp://", "https://")
                url += "/" + url.split("/")[-1] + "_genomic.gff.gz"

        if url is None:
            # unknown genome: skip, like any other annotation failure here
            sys.stderr.write(
                "WARNING: Could not find genome {} at NCBI, ".format(name)
                + "skipping annotation.\n"
            )
            return

        out_dir = os.path.join(genome_dir, localname)
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        # download to tmp dir. Move genome on completion.
        with TemporaryDirectory(dir=out_dir) as tmpdir:
            try:
                # actual download
                sys.stderr.write("Using {}\n".format(url))
                gff_file = os.path.join(tmpdir, localname + ".annotation.gff.gz")
                with urlopen(url) as response:
                    with open(gff_file, "wb") as f:
                        f.write(response.read())

                # quote the path: it may contain spaces or shell characters
                gff_arg = shlex.quote(gff_file)

                # check gff for genes
                cmd = "gff3ToGenePred {0} /dev/stdout | wc -l"
                out = sp.check_output(cmd.format(gff_arg), shell=True)
                if out.strip() == b"0":
                    sys.stderr.write(
                        "WARNING: annotation from NCBI contains no genes, "
                        + "skipping.\n"
                    )
                    return

                # Convert to BED file
                bed_file = gff_file.replace("gff.gz", "bed")
                cmd = (
                    "gff3ToGenePred -rnaNameAttr=gene {0} /dev/stdout | "
                    "genePredToBed /dev/stdin {1} && gzip -f {1}"
                )
                sp.check_call(
                    cmd.format(gff_arg, shlex.quote(bed_file)), shell=True
                )

                # Convert to GTF file
                gtf_file = gff_file.replace("gff.gz", "gtf")
                cmd = (
                    "gff3ToGenePred -geneNameAttr=gene {0} /dev/stdout | "
                    + "genePredToGtf file /dev/stdin {1} && gzip -f {1}"
                )
                sp.check_call(
                    cmd.format(gff_arg, shlex.quote(gtf_file)), shell=True
                )

                # transfer the genome from the tmpdir to the genome_dir
                for src in [gtf_file + ".gz", bed_file + ".gz"]:
                    dst = os.path.join(out_dir, os.path.basename(src))
                    shutil.move(src, dst)

                readme = os.path.join(genome_dir, localname, "README.txt")
                with open(readme, "a") as f:
                    f.write("annotation url: {}\n".format(url))

            except Exception:
                # best effort: a missing annotation must not abort the caller
                sys.stderr.write(
                    "WARNING: Could not download annotation from NCBI, " + "skipping.\n"
                )
                sys.stderr.write("URL: {}\n".format(url))

                sys.stderr.write(
                    "If you think the annotation should be there, "
                    + "please file a bug report at:\n"
                )
                sys.stderr.write("https://github.com/simonvh/genomepy/issues\n")
예제 #14
0
    def download_genome(
        self,
        name,
        genomes_dir=None,
        localname=None,
        mask="soft",
        keep_alt=False,
        regex=None,
        invert_match=False,
        bgzip=None,
        **kwargs,
    ):
        """
        Download a (gzipped) genome file to a specific directory

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        keep_alt : bool , optional
            Set to true to keep these alternative regions.

        regex : str , optional
            Regular expression to select specific chromosome / scaffold names.

        invert_match : bool , optional
            Set to True to select all chromosomes that don't match the regex.

        bgzip : bool , optional
            If set to True the genome FASTA file will be compressed using bgzip.
            If not specified, the setting from the configuration file will be used.
        """
        name = safe(name)
        self.check_name(name)

        link = self.get_genome_download_link(name, mask=mask, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
        out_dir = os.path.join(genomes_dir, localname)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        sys.stderr.write(
            f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        tmp_dir = mkdtemp(dir=out_dir)
        fname = os.path.join(tmp_dir, f"{localname}.fa")

        urlcleanup()
        download_file(link, fname)
        sys.stderr.write(
            "Genome download successful, starting post processing...\n")

        # unzip genome
        if link.endswith(".tar.gz"):
            tar_to_bigfile(fname, fname)
        elif link.endswith(".gz"):
            os.rename(fname, fname + ".gz")
            ret = sp.check_call(["gunzip", "-f", fname])
            if ret != 0:
                raise Exception(f"Error gunzipping genome {fname}")

        def regex_filer(_fname, _regex, _v):
            infa = _fname + "_to_regex"
            os.rename(_fname, infa)
            # filter the fasta and store the output's keys
            keys_out = filter_fasta(infa,
                                    outfa=_fname,
                                    regex=_regex,
                                    v=_v,
                                    force=True).keys()
            keys_in = Fasta(infa).keys()
            return [k for k in keys_in if k not in keys_out]

        not_included = []
        # remove alternative regions
        if not keep_alt:
            not_included.extend(regex_filer(fname, "alt", True))

        # keep/remove user defined regions
        if regex:
            not_included.extend(regex_filer(fname, regex, invert_match))

        # process genome (e.g. masking)
        if hasattr(self, "_post_process_download"):
            self._post_process_download(name=name,
                                        localname=localname,
                                        out_dir=tmp_dir,
                                        mask=mask)

        # bgzip genome if requested
        if bgzip or config.get("bgzip"):
            # bgzip to stdout, track progress, and output to file
            fsize = int(os.path.getsize(fname) * 10**-6)
            cmd = (
                f"bgzip -fc {fname} | "
                f"tqdm --bytes --desc Bgzipping {fsize}MB fasta --log ERROR | "
                f"cat > {fname}.gz")
            ret = sp.check_call(cmd, shell=True)
            if ret != 0:
                raise Exception(f"Error bgzipping {name}. Is tabix installed?")
            fname += ".gz"

        # transfer the genome from the tmpdir to the genome_dir
        src = fname
        dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
        shutil.move(src, dst)
        rm_rf(tmp_dir)

        sys.stderr.write("\n")
        sys.stderr.write("name: {}\n".format(name))
        sys.stderr.write("local name: {}\n".format(localname))
        sys.stderr.write("fasta: {}\n".format(dst))

        # Create readme with information
        readme = os.path.join(genomes_dir, localname, "README.txt")
        metadata = {
            "name": localname,
            "provider": self.name,
            "original name": name,
            "original filename": os.path.split(link)[-1],
            "assembly_accession":
            self.assembly_accession(self.genomes.get(name)),
            "tax_id": self.genome_taxid(self.genomes.get(name)),
            "mask": mask,
            "genome url": link,
            "annotation url": "na",
            "date": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        lines = []
        if not keep_alt or regex:
            regex_line = "regex: "
            if not keep_alt:
                regex_line += "'alt' (inverted match)"

            if not keep_alt and regex:
                regex_line += " and "

            if regex:
                regex_line += f"'{regex}'"
                if invert_match:
                    regex_line += " (inverted match)"

            lines += ["", regex_line, "sequences that were excluded:"]
            for seq in not_included:
                lines.append(f"\t{seq}")
        write_readme(readme, metadata, lines)
예제 #15
0
    def download_annotation(self,
                            name,
                            genome_dir,
                            localname=None,
                            version=None):
        """
        Download gene annotation from Ensembl based on genome name.

        The GTF file is stored as ``<localname>.annotation.gtf.gz`` in
        ``<genome_dir>/<localname>``. It is then converted to a gzipped BED
        file with gtfToGenePred / genePredToBed, and the annotation URL is
        appended to the README.txt in the same directory.

        Parameters
        ----------
        name : str
            Ensembl genome name.
        genome_dir : str
            Genome directory.
        localname : str , optional
            Custom name for your genome.
        version : str , optional
            Ensembl version. By default the latest version is used.

        Returns
        -------
        str
            Directory the annotation files were written to.

        Raises
        ------
        NotImplementedError
            If the genome belongs to the Ensembl bacteria division.
        """
        localname = get_localname(name, localname)
        genome_info = self._get_genome_info(name)

        # parse the division
        division = genome_info["division"].lower().replace("ensembl", "")
        if division == "bacteria":
            raise NotImplementedError(
                "bacteria from ensembl not yet supported")

        # Get the base link depending on division
        ftp_site = "ftp://ftp.ensemblgenomes.org/pub"
        if division == 'vertebrates':
            ftp_site = "https://ftp.ensembl.org/pub"

        # the version is looked up before the division is appended to the URL
        if not version:
            version = self.get_version(ftp_site)

        if division != "vertebrates":
            ftp_site += "/{}".format(division)

        # Get the GTF URL
        base_url = ftp_site + "/release-{}/gtf/{}/{}.{}.{}.gtf.gz"
        safe_name = name.replace(" ", "_")
        # strip a trailing patch suffix (e.g. ".p13") from the assembly name
        safe_name = re.sub(r"\.p\d+$", "", safe_name)

        ftp_link = base_url.format(version, genome_info["url_name"].lower(),
                                   genome_info["url_name"].capitalize(),
                                   safe_name, version)

        out_dir = os.path.join(genome_dir, localname)
        # also creates missing parent directories, no error if it exists
        os.makedirs(out_dir, exist_ok=True)

        gtf_file = os.path.join(out_dir, localname + ".annotation.gtf.gz")
        # built from its parts so directory names are never rewritten
        bed_file = os.path.join(out_dir, localname + ".annotation.bed")

        # Download the file
        try:
            # the response is closed as soon as the GTF is on disk
            with urlopen(ftp_link) as response, open(gtf_file, "wb") as f:
                f.write(response.read())

            cmd = ("gtfToGenePred {0} /dev/stdout | "
                   "genePredToBed /dev/stdin {1} && gzip {1}")
            sp.check_call(cmd.format(gtf_file, bed_file), shell=True)

            readme = os.path.join(out_dir, "README.txt")
            with open(readme, "a") as f:
                f.write("annotation url: {}\n".format(ftp_link))
        except Exception:
            # report which link failed, then let the caller see the error
            sys.stderr.write("\nCould not download {}\n".format(ftp_link))
            raise

        return out_dir
예제 #16
0
    def download_genome(
        self,
        name,
        genomes_dir=None,
        localname=None,
        mask="soft",
        keep_alt=False,
        regex=None,
        invert_match=False,
        bgzip=None,
        **kwargs,
    ):
        """
        Download a (gzipped) genome file to a specific directory

        The genome is downloaded and processed in a temporary directory
        inside the output directory. The final FASTA is then moved to
        ``<genomes_dir>/<localname>`` and a README.txt is written next to it.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        keep_alt : bool , optional
            Set to true to keep these alternative regions.

        regex : str , optional
            Regular expression to select specific chromosome / scaffold names.

        invert_match : bool , optional
            Set to True to select all chromosomes that don't match the regex.

        bgzip : bool , optional
            If set to True the genome FASTA file will be compressed using bgzip.
            If not specified, the setting from the configuration file will be used.

        kwargs : dict , optional
            Passed on to get_genome_download_link.

        Raises
        ------
        subprocess.CalledProcessError
            If gunzip or bgzip exits with a non-zero status.
        """
        name = safe(name)
        self.check_name(name)

        link = self.get_genome_download_link(name, mask=mask, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
        out_dir = os.path.join(genomes_dir, localname)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        sys.stderr.write(
            f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        with TemporaryDirectory(dir=out_dir) as tmp_dir:
            fname = os.path.join(tmp_dir, f"{localname}.fa")

            # actual download
            urlcleanup()
            with urlopen(link) as response:
                # check available memory vs file size.
                cutoff = int(int(virtual_memory().available) * 0.75)
                # the header may be absent, in which case the size is unknown
                content_length = response.info().get("Content-Length")
                fits_in_memory = (content_length is not None
                                  and int(content_length) < cutoff)
                # cap the buffer at 75% of the available memory when the file
                # is too large or its size is unknown
                chunk_size = None if fits_in_memory else cutoff
                with open(fname, "wb") as f_out:
                    shutil.copyfileobj(response, f_out, chunk_size)
            sys.stderr.write(
                "Genome download successful, starting post processing...\n")

            # unzip genome
            if link.endswith(".tar.gz"):
                tar_to_bigfile(fname, fname)
            elif link.endswith(".gz"):
                os.rename(fname, fname + ".gz")
                # check_call raises on a non-zero exit status, so the hint
                # is reported here and the original error is re-raised
                try:
                    sp.check_call(["gunzip", "-f", fname])
                except sp.CalledProcessError:
                    sys.stderr.write(f"Error gunzipping genome {fname}\n")
                    raise

            def regex_filer(_fname, _regex, _v):
                """Filter _fname in place, return the removed sequence names."""
                infa = _fname + "_to_regex"
                os.rename(_fname, infa)
                filter_fasta(infa, _fname, regex=_regex, v=_v, force=True)

                # read the filtered names once instead of once per input key
                kept = set(Fasta(_fname).keys())
                return [k for k in Fasta(infa).keys() if k not in kept]

            not_included = []
            # remove alternative regions
            if not keep_alt:
                not_included.extend(regex_filer(fname, "alt", True))

            # keep/remove user defined regions
            if regex:
                not_included.extend(regex_filer(fname, regex, invert_match))

            # process genome (e.g. masking)
            if hasattr(self, "_post_process_download"):
                self._post_process_download(name=name,
                                            localname=localname,
                                            out_dir=tmp_dir,
                                            mask=mask)

            # bgzip genome if requested
            if bgzip or config.get("bgzip"):
                try:
                    sp.check_call(["bgzip", "-f", fname])
                except sp.CalledProcessError:
                    sys.stderr.write(
                        f"Error bgzipping {name}. Is tabix installed?\n")
                    raise
                fname += ".gz"

            # transfer the genome from the tmpdir to the genome_dir
            src = fname
            dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
            shutil.move(src, dst)

        sys.stderr.write("\n")
        sys.stderr.write("name: {}\n".format(name))
        sys.stderr.write("local name: {}\n".format(localname))
        sys.stderr.write("fasta: {}\n".format(dst))

        # Create readme with information
        readme = os.path.join(genomes_dir, localname, "README.txt")
        metadata = {
            "name": localname,
            "provider": self.name,
            "original name": name,
            "original filename": os.path.split(link)[-1],
            "assembly_accession":
            self.assembly_accession(self.genomes.get(name)),
            "tax_id": self.genome_taxid(self.genomes.get(name)),
            "mask": mask,
            "genome url": link,
            "annotation url": "na",
            "date": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        lines = []
        # document every filter that was applied, including the default
        # removal of alternative regions
        if not keep_alt or regex:
            filters = []
            if not keep_alt:
                filters.append("'alt' (inverted match)")
            if regex:
                user_filter = f"'{regex}'"
                if invert_match:
                    user_filter += " (inverted match)"
                filters.append(user_filter)

            regex_line = "regex: " + " and ".join(filters)
            lines += ["", regex_line, "sequences that were excluded:"]
            lines.extend(f"\t{seq}" for seq in not_included)
        write_readme(readme, metadata, lines)
예제 #17
0
def install_genome(
    name,
    provider,
    version=None,
    genome_dir=None,
    localname=None,
    mask="soft",
    regex=None,
    invert_match=False,
    annotation=False,
    bgzip=None,
):
    """
    Install a genome, optionally with its gene annotation.

    The genome is downloaded from the given provider, after which all
    active plugins are run on it and the environment file is regenerated.

    Parameters
    ----------
    name : str
        Genome name

    provider : str
        Provider name

    version : str
        Version (only for Ensembl)

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the configured
        genome_dir.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', specify 'hard' for hard masking.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.
    """
    # use the configured directory when none is given
    genome_dir = genome_dir or config.get("genome_dir", None)
    if not genome_dir:
        raise norns.exceptions.ConfigError("Please provide or configure a genome_dir")

    genome_dir = os.path.expanduser(genome_dir)
    localname = get_localname(name, localname)

    # Fetch the genome from the requested provider
    source = ProviderBase.create(provider)
    download_options = {
        "version": version,
        "mask": mask,
        "localname": localname,
        "regex": regex,
        "invert_match": invert_match,
        "bgzip": bgzip,
    }
    source.download_genome(name, genome_dir, **download_options)

    # Fetch the gene annotation from the same provider
    if annotation:
        source.download_annotation(
            name, genome_dir, localname=localname, version=version
        )

    installed = Genome(localname, genome_dir=genome_dir)

    for active_plugin in get_active_plugins():
        active_plugin.after_genome_download(installed)

    generate_env()
예제 #18
0
    def download_annotation(self, name, genome_dir, localname=None, **kwargs):
        """
        Download UCSC annotation file to a specific directory.

        Will check UCSC, Ensembl and RefSeq annotation, in that order, and
        use the first one listed in the genome's UCSC database directory.
        The table is converted to gzipped BED and GTF files in
        ``<genome_dir>/<localname>`` and the annotation URL is appended to
        the README.txt in that directory.

        Downloading and converting is best-effort: any failure is reported
        on stderr as "No annotation found!" instead of being raised.

        Parameters
        ----------
        name : str
            UCSC genome name.
        genome_dir : str
            Genome directory.
        localname : str , optional
            Custom name for your genome.
        kwargs : dict , optional
            Accepted for interface compatibility, not used here.
        """
        sys.stderr.write("Downloading gene annotation...\n")

        localname = get_localname(name, localname)

        UCSC_GENE_URL = "http://hgdownload.cse.ucsc.edu/goldenPath/{}/database/"
        ANNOS = ["knownGene.txt.gz", "ensGene.txt.gz", "refGene.txt.gz"]
        pred = "genePredToBed"

        # collect the gene tables listed in the database directory
        anno = []
        p = re.compile(r"\w+.Gene.txt.gz")
        with urlopen(UCSC_GENE_URL.format(name)) as listing:
            for line in listing:
                m = p.search(line.decode())
                if m:
                    anno.append(m.group(0))

        # pick the first available annotation, in order of preference
        url = ""
        for a in ANNOS:
            if a in anno:
                url = UCSC_GENE_URL.format(name) + a
                break

        out_dir = os.path.join(genome_dir, localname)
        # also creates missing parent directories, no error if it exists
        os.makedirs(out_dir, exist_ok=True)

        if url == "":
            sys.stderr.write("No annotation found!")
            return

        # download to tmp dir. Move annotation on completion.
        with TemporaryDirectory(dir=out_dir) as tmpdir:
            try:
                sys.stderr.write("Using {}\n".format(url))
                # the raw table lives in the tmp dir, so it is removed with it
                raw_file = os.path.join(tmpdir, os.path.basename(url))
                urlretrieve(url, raw_file)

                with gzip.open(raw_file) as table:
                    cols = table.readline().decode(errors="ignore").split("\t")

                # locate the columns to keep, relative to the strand column
                start_col = 1
                for i, col in enumerate(cols):
                    if col == "+" or col == "-":
                        start_col = i - 1
                        break
                end_col = start_col + 10

                # Convert to BED file
                bed_file = os.path.join(tmpdir, localname + ".annotation.bed")
                cmd = "zcat {} | cut -f{}-{} | {} /dev/stdin {} && gzip -f {}"
                # check_call, so a failed conversion stops here instead of
                # producing an empty GTF in the next step
                sp.check_call(
                    cmd.format(raw_file, start_col, end_col, pred, bed_file, bed_file),
                    shell=True,
                )

                # Convert to GTF file
                gtf_file = bed_file.replace(".bed", ".gtf")
                cmd = (
                    "bedToGenePred {0}.gz /dev/stdout | "
                    "genePredToGtf file /dev/stdin /dev/stdout -utr -honorCdsStat | "
                    "sed 's/.dev.stdin/UCSC/' > {1} && gzip -f {1}"
                )
                sp.check_call(cmd.format(bed_file, gtf_file), shell=True)

                # transfer the annotation from the tmpdir to the genome_dir
                for src in [gtf_file + ".gz", bed_file + ".gz"]:
                    dst = os.path.join(out_dir, os.path.basename(src))
                    shutil.move(src, dst)

                readme = os.path.join(out_dir, "README.txt")
                with open(readme, "a") as f:
                    f.write("annotation url: {}\n".format(url))

            except Exception:
                # best-effort: a missing annotation must not abort the install
                sys.stderr.write("No annotation found!")
예제 #19
0
def install_genome(
    name,
    provider=None,
    genomes_dir=None,
    localname=None,
    mask="soft",
    keep_alt=False,
    regex=None,
    invert_match=False,
    bgzip=None,
    annotation=False,
    only_annotation=False,
    skip_sanitizing=False,
    threads=1,
    force=False,
    **kwargs,
):
    """
    Install a genome and/or its gene annotation.

    The genome is downloaded when it is not present yet (or when forced),
    unless only the annotation is requested. The annotation is downloaded
    when any annotation option is given and no GTF is present yet (or when
    forced). Active plugins are run whenever a genome is available.

    Parameters
    ----------
    name : str
        Genome name

    provider : str , optional
        Provider name. will try Ensembl, UCSC and NCBI (in that order) if not specified.

    genomes_dir : str , optional
        Where to store the fasta files

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', choices 'hard'/'soft/'none' for respective masking level.

    keep_alt : bool , optional
        Some genomes contain alternative regions. These regions cause issues with
        sequence alignment, as they are inherently duplications of the consensus regions.
        Set to true to keep these alternative regions.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.

    threads : int , optional
        Build genome index using multithreading (if supported). Default: lowest of 8/all threads

    force : bool , optional
        Set to True to overwrite existing files.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    only_annotation : bool , optional
        If set to True, only download the annotation files.

    skip_sanitizing : bool , optional
        If set to True, downloaded annotation files whose sequence names do not match
        with the (first header fields of) the genome.fa will not be corrected.

    kwargs : dict , optional
        Provider specific options.
        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.

        version : int , optional
            Ensembl only: Specify release version. Default is latest.

        to_annotation : text , optional
            URL only: direct link to annotation file.
            Required if this is not the same directory as the fasta.
    """
    name = safe(name)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)

    # Fetch the genome unless it is already there (and not forced),
    # or only the annotation was asked for
    genome_found = _is_genome_dir(out_dir)
    if not only_annotation and (force or not genome_found):
        genome_source = _provider_selection(name, localname, genomes_dir, provider)
        genome_source.download_genome(
            name,
            genomes_dir,
            localname=localname,
            mask=mask,
            keep_alt=keep_alt,
            regex=regex,
            invert_match=invert_match,
            bgzip=bgzip,
            **kwargs,
        )
        genome_found = True

        # Export installed genome(s)
        generate_env(genomes_dir=genomes_dir)

    # Creating the Genome generates the index, gaps and sizes files
    installed = None
    if genome_found:
        installed = Genome(localname, genomes_dir=genomes_dir)
        if force:
            # replace files left over from a previous install
            generate_fa_sizes(installed.genome_file, installed.sizes_file)
            generate_gap_bed(installed.genome_file, installed.gaps_file)

    # Any annotation-related option implies that the annotation is wanted
    annotation_flags = [
        annotation,
        only_annotation,
        skip_sanitizing,
        kwargs.get("to_annotation"),
        kwargs.get("ucsc_annotation_type"),
    ]
    wants_annotation = any(annotation_flags)

    gtf_present = bool(glob_ext_files(out_dir, "gtf"))
    if wants_annotation and (force or not gtf_present):
        annotation_source = _provider_selection(
            name, localname, genomes_dir, provider
        )
        annotation_source.download_annotation(
            name, genomes_dir, localname=localname, **kwargs
        )

        # Sanitizing needs both the genome and a downloaded annotation
        gtf_present = bool(glob_ext_files(out_dir, "gtf"))
        if genome_found and gtf_present and not skip_sanitizing:
            sanitize_annotation(installed)

    # Plugins need a genome to work on
    if genome_found:
        for active_plugin in get_active_plugins():
            active_plugin.after_genome_download(installed, threads, force)
예제 #20
0
def install_genome(
    name: str,
    provider: Optional[str] = None,
    genomes_dir: Optional[str] = None,
    localname: Optional[str] = None,
    mask: Optional[str] = "soft",
    keep_alt: Optional[bool] = False,
    regex: Optional[str] = None,
    invert_match: Optional[bool] = False,
    bgzip: Optional[bool] = None,  # None -> check config. False -> dont check.
    annotation: Optional[bool] = False,
    only_annotation: Optional[bool] = False,
    skip_matching: Optional[bool] = False,
    skip_filter: Optional[bool] = False,
    threads: Optional[int] = 1,
    force: Optional[bool] = False,
    **kwargs: Optional[dict],
) -> Genome:
    """
    Install a genome (& gene annotation).

    What needs downloading is decided up front from the files present in
    the output directory. The genome is then downloaded and filtered, the
    annotation downloaded and sanitized, the active plugins run, and the
    freshly downloaded files compressed if requested.

    Parameters
    ----------
    name : str
        Genome name

    provider : str , optional
        Provider name. will try Ensembl, UCSC and NCBI (in that order) if not specified.

    genomes_dir : str , optional
        Where to create the output folder.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Genome masking of repetitive sequences. Options: hard/soft/none, default is soft.

    keep_alt : bool , optional
        Some genomes contain alternative regions. These regions cause issues with
        sequence alignment, as they are inherently duplications of the consensus regions.
        Set to true to keep these alternative regions.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that *don't* match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip,
        and gene annotation will be compressed with gzip.

    threads : int , optional
        Build genome index using multithreading (if supported). Default: lowest of 8/all threads.

    force : bool , optional
        Set to True to overwrite existing files.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    only_annotation : bool , optional
        If set to True, only download the gene annotation files.

    skip_matching : bool , optional
        If set to True, contigs in the annotation not matching
        those in the genome will not be corrected.

    skip_filter : bool , optional
        If set to True, the gene annotations will not be filtered to match the genome contigs.

    kwargs : dict , optional
        Provider specific options.

        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.

        version : int , optional
            Ensembl only: Specify release version. Default is latest.

        to_annotation : text , optional
            URL only: direct link to annotation file.
            Required if this is not the same directory as the fasta.

    Returns
    -------
    Genome
        Genome class with the installed genome
        (None when no genome is present in the output directory).
    """
    name = safe(name)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)
    genome_file = os.path.join(out_dir, f"{localname}.fa")
    provider = _provider_selection(name, localname, genomes_dir, provider)

    def _annotation_files_present():
        # both the GTF and the BED file must be there
        return bool(glob_ext_files(out_dir, "annotation.gtf")) and bool(
            glob_ext_files(out_dir, "annotation.bed")
        )

    # decide up front which files need to be downloaded
    genome_found = _is_genome_dir(out_dir)
    need_genome = only_annotation is False and (
        force is True or genome_found is False
    )

    annotation_found = _annotation_files_present()
    annotation_requested = any(
        [
            annotation,
            only_annotation,
            skip_matching,
            skip_filter,
            kwargs.get("to_annotation"),
            kwargs.get("path_to_annotation"),
            kwargs.get("ucsc_annotation_type"),
        ]
    )
    need_annotation = annotation_requested and (
        force is True or annotation_found is False
    )

    genome = None
    fetched_genome = False
    if need_genome:
        if force:
            # remove the previous fasta and its index first
            _delete_extensions(out_dir, ["fa", "fai"])
        provider.download_genome(
            name,
            genomes_dir,
            localname=localname,
            mask=mask,
            **kwargs,
        )
        fetched_genome = True
        genome_found = True

        # Apply the alt / regex filters to the downloaded fasta
        _filter_genome(genome_file, regex, invert_match, keep_alt)

        # Generates a Fasta object and the genome index, gaps and sizes files
        genome = Genome(localname, genomes_dir=genomes_dir)

        # Fetch the NCBI assembly report when an accession is known
        report_file = os.path.join(out_dir, "assembly_report.txt")
        accession = genome.assembly_accession
        if accession != "na" and not os.path.exists(report_file):
            download_assembly_report(accession, report_file)

        # Export installed genome(s)
        generate_env(genomes_dir=genomes_dir)

    fetched_annotation = False
    if need_annotation:
        if force:
            _delete_extensions(out_dir, ["annotation.gtf", "annotation.bed"])
        provider.download_annotation(name, genomes_dir, localname=localname, **kwargs)
        # only counts as downloaded when both files actually arrived
        fetched_annotation = _annotation_files_present()

    gene_annotation = None
    if fetched_annotation:
        gene_annotation = Annotation(localname, genomes_dir=genomes_dir)
        match_contigs = not skip_matching
        filter_contigs = not skip_filter
        # sanitizing needs a genome and at least one step to perform
        if genome_found and (match_contigs or filter_contigs):
            gene_annotation.sanitize(match_contigs, filter_contigs, True)

    # Run active plugins (also if the genome was downloaded earlier)
    if genome_found:
        if not genome:
            genome = Genome(localname, genomes_dir=genomes_dir)
        for active_plugin in get_active_plugins():
            active_plugin.after_genome_download(genome, threads, force)

    # compress only the files that were downloaded in this call
    compress = bgzip is True or (bgzip is None and config.get("bgzip"))
    if compress:
        if fetched_genome:
            bgzip_and_name(genome.filename)
        if fetched_annotation:
            gzip_and_name(gene_annotation.annotation_gtf_file)
            gzip_and_name(gene_annotation.annotation_bed_file)

    return genome
예제 #21
0
def install_genome(name,
                   provider,
                   genome_dir=None,
                   localname=None,
                   mask="soft",
                   regex=None,
                   invert_match=False,
                   bgzip=None,
                   annotation=False,
                   force=False,
                   **kwargs):
    """
    Download a genome, and optionally its annotation, into the genome directory.

    After downloading, a Genome object is created for the installed genome,
    all active plugins are run on it, a gap BED file is generated when it is
    missing (or when ``force`` is set) and generate_env() is called.

    Parameters
    ----------
    name : str
        Name of the genome to install.

    provider : str
        Name of the provider to download from.

    genome_dir : str , optional
        Directory in which the fasta files are stored. When omitted, the
        "genome_dir" entry of the configuration is used.

    localname : str , optional
        Custom local name for this genome.

    mask : str , optional
        Masking level: 'hard', 'soft' or 'none'. Default is 'soft'.

    regex : str , optional
        Regular expression selecting specific chromosome / scaffold names.

    invert_match : bool , optional
        When True, select all chromosomes that do not match the regex.

    bgzip : bool , optional
        When True, the genome FASTA file is compressed using bgzip.
        When not specified, the setting from the configuration file is used.

    annotation : bool , optional
        When True, download the gene annotation in BED and GTF format.

    force : bool , optional
        When True, existing files are overwritten.

    kwargs : dict, optional
        Provider specific options.
        Ensembl:

        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.

        version : int, optional
            Ensembl only: Specify release version. Default is latest.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome_dir is passed and none is configured.
    """
    # Fall back on the configured directory when none was passed in
    genome_dir = genome_dir or config.get("genome_dir", None)
    if not genome_dir:
        raise norns.exceptions.ConfigError(
            "Please provide or configure a genome_dir")

    genome_dir = os.path.expanduser(genome_dir)
    localname = get_localname(name, localname)
    out_dir = os.path.join(genome_dir, localname)

    # The genome is fetched when no FASTA is present yet, or when forced
    genome_present = any(map(os.path.exists, glob_ext_files(out_dir, "fa")))
    if not genome_present or force:
        genome_provider = ProviderBase.create(provider)
        genome_provider.download_genome(
            name,
            genome_dir,
            mask=mask,
            regex=regex,
            invert_match=invert_match,
            localname=localname,
            bgzip=bgzip,
            **kwargs)

    # The annotation is only fetched on request, and then only when no GTF
    # is present yet, or when forced
    annotation_present = any(
        map(os.path.exists, glob_ext_files(out_dir, "gtf")))
    if annotation and (not annotation_present or force):
        annotation_provider = ProviderBase.create(provider)
        annotation_provider.download_annotation(
            name, genome_dir, localname=localname, **kwargs)

    # Creating the Genome object also generates the index file
    installed_genome = Genome(localname, genome_dir=genome_dir)

    # Give every active plugin a chance to process the installed genome
    for active_plugin in get_active_plugins():
        active_plugin.after_genome_download(installed_genome, force)

    # (Re)generate the gap file when it is missing, or when forced
    gap_file = os.path.join(out_dir, localname + ".gaps.bed")
    gap_file_missing = not os.path.exists(gap_file)
    if gap_file_missing or force:
        fasta_files = glob_ext_files(out_dir, "fa")
        generate_gap_bed(fasta_files[0], gap_file)

    generate_env()
예제 #22
0
파일: genome.py 프로젝트: masastat/genomepy
 def _parse_name(name):
     """
     Extract a safe name from a file path, url or regular name.

     The name is passed through get_localname, a trailing ".fa" or ".fa.gz"
     extension is removed, and only the final path component is returned.
     """
     # The dots are escaped so that only a literal ".fa" / ".fa.gz" suffix is
     # stripped; an unescaped "." matches any character and would also
     # truncate names that merely end in "fa".
     return os.path.basename(re.sub(r"\.fa(\.gz)?$", "", get_localname(name)))
예제 #23
0
    def download_annotation(self,
                            name,
                            genome_dir,
                            localname=None,
                            version=None):
        """
        Download gene annotation from UCSC based on the genome build.

        The UCSC database directory of the build is scanned for the
        knownGene (UCSC), ensGene (Ensembl) and refGene (RefSeq) tables, in
        that order of preference. The first table that is available is
        converted to a gzipped BED file and a gzipped GTF file in
        ``<genome_dir>/<localname>/``, and the url that was used is appended
        to the README.txt in that directory. When none of the tables is
        listed, a message is written to stderr and nothing is downloaded.

        The conversion is done in a shell and calls zcat, cut, sed, gzip and
        the UCSC tools genePredToBed, bedToGenePred and genePredToGtf.

        Parameters
        ----------
        name : str
            UCSC genome name.
        genome_dir : str
            Genome directory.
        localname : str , optional
            Custom name for this genome.
        version : optional
            Not used by this method.
        """
        from shlex import quote

        localname = get_localname(name, localname)

        UCSC_GENE_URL = "http://hgdownload.cse.ucsc.edu/goldenPath/{}/database/"
        ANNOS = ["knownGene.txt.gz", "ensGene.txt.gz", "refGene.txt.gz"]
        pred = "genePredToBed"

        # Collect the gene tables mentioned in the database directory listing.
        # The response is closed as soon as the listing has been read.
        database_url = UCSC_GENE_URL.format(name)
        pattern = re.compile(r'\w+.Gene.txt.gz')
        anno = []
        with urlopen(database_url) as response:
            for line in response.readlines():
                m = pattern.search(line.decode())
                if m:
                    anno.append(m.group(0))

        sys.stderr.write("Retrieving gene annotation for {}\n".format(name))
        url = ""
        for a in ANNOS:
            if a in anno:
                url = database_url + a
                break
        if not url:
            sys.stderr.write("No annotation found!")
            return

        sys.stderr.write("Using {}\n".format(url))

        localname = localname.replace(" ", "_")
        path = os.path.join(genome_dir, localname)
        bed_file = os.path.join(path, localname + ".annotation.bed")
        # Built from its components: replacing ".bed" in the full BED path
        # would also rewrite any ".bed" in the directory or genome name.
        gtf_file = os.path.join(path, localname + ".annotation.gtf")

        # Only the path of the temporary file is needed (urlretrieve and zcat
        # open it themselves), so the handle is closed right away and the
        # file is removed once the BED file has been written.
        tmp = NamedTemporaryFile(delete=False, suffix=".gz")
        tmp.close()
        try:
            urlretrieve(url, tmp.name)

            with gzip.open(tmp.name) as f:
                cols = f.readline().decode(errors='ignore').split("\t")

            # Tables may start with extra columns; locate the strand column
            # in the first row and keep the fields from the column two
            # positions before it (cut counts fields from 1).
            start_col = 1
            for i, col in enumerate(cols):
                if col in ("+", "-"):
                    start_col = i - 1
                    break
            end_col = start_col + 10

            # Also creates genome_dir itself when it does not exist yet
            os.makedirs(path, exist_ok=True)

            # Paths are quoted so whitespace or shell metacharacters in them
            # cannot break (or alter) the pipeline.
            cmd = "zcat {} | cut -f{}-{} | {} /dev/stdin {} && gzip {}"
            sp.call(cmd.format(quote(tmp.name), start_col, end_col, pred,
                               quote(bed_file), quote(bed_file)),
                    shell=True)
        finally:
            if os.path.exists(tmp.name):
                os.unlink(tmp.name)

        cmd = (
            "bedToGenePred {0} /dev/stdout | "
            "genePredToGtf file /dev/stdin /dev/stdout -utr -honorCdsStat | "
            "sed 's/.dev.stdin/UCSC/' > {1} && gzip {1}")
        sp.check_call(cmd.format(quote(bed_file + ".gz"), quote(gtf_file)),
                      shell=True)

        readme = os.path.join(path, "README.txt")
        with open(readme, "a") as f:
            f.write("annotation url: {}\n".format(url))