def __init__(self, name, genomes_dir=None, *args, **kwargs):
    """
    Initialize the genome: derive its name, locate the fasta and record file paths.

    Parameters
    ----------
    name : str
        Genome name or fasta path. A trailing ".fa" or ".fa.gz" is removed
        and the basename is passed through ``safe`` to form ``self.name``.
    genomes_dir : str, optional
        Genomes directory, resolved with ``get_genomes_dir`` (existence is not checked).
    *args, **kwargs
        Forwarded unchanged to the parent class initializer, after the fasta path.
    """
    stripped = re.sub(r"\.fa(\.gz)?$", "", name)
    self.name = safe(os.path.basename(stripped))
    "genome name"
    self.genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    "path to the genomepy genomes directory"
    self.filename = self._parse_filename(name)
    super(Genome, self).__init__(self.filename, *args, **kwargs)

    # file paths (all derived from the fasta location)
    fasta = self.filename
    self.genome_file = fasta
    "path to the genome fasta"
    self.genome_dir = os.path.dirname(fasta)
    "path to the genome directory"
    self.index_file = fasta + ".fai"
    "path to the genome index"
    self.sizes_file = self._check_support_file("sizes")
    "path to the chromosome sizes file"
    self.gaps_file = self._check_support_file("gaps")
    "path to the chromosome gaps file"
    self.annotation_gtf_file = self._check_annotation_file("gtf")
    "path to the gene annotation GTF file"
    self.annotation_bed_file = self._check_annotation_file("bed")
    "path to the gene annotation BED file"
    self.readme_file = os.path.join(self.genome_dir, "README.txt")
    "path to the README file"

    # genome attributes, taken from the README metadata
    metadata, _ = read_readme(self.readme_file)
    self.tax_id = metadata["tax_id"]
    "genome taxonomy identifier"
    self.assembly_accession = metadata["assembly_accession"]
    "genome assembly accession"
def download_annotation(self, name, genomes_dir=None, localname=None, **kwargs):
    """
    Download an annotation file to a specific directory.

    Parameters
    ----------
    name : str
        Genome / species name
    genomes_dir : str , optional
        Directory to install annotation
    localname : str , optional
        Custom name for your genome
    kwargs : dict , optional
        Passed on to ``get_annotation_download_link``.
    """
    self.check_name(name)
    annotation_link = self.get_annotation_download_link(name, **kwargs)

    # resolve where (and under which name) the annotation will be stored
    target_name = get_localname(name, localname)
    target_dir = get_genomes_dir(genomes_dir, check_exist=False)

    self.attempt_and_report(name, target_name, annotation_link, target_dir)
def download_annotation(self, url, genomes_dir=None, localname=None, **kwargs):
    """
    Try to download a gtf or gff3 file found next to the genome url.

    Parameters
    ----------
    url : str
        url the genome is downloaded from
    genomes_dir : str , optional
        Directory to install annotation
    localname : str , optional
        Custom name for your genome
    kwargs : dict , optional
        Provider specific options.

        to_annotation : str , optional
            url to the annotation file (only required if it is not located
            in the same directory as the fasta)
    """
    name = get_localname(url)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)

    # an explicit annotation url takes precedence over searching near the genome url
    link = (
        self.get_annotation_download_link(None, **kwargs)
        if kwargs.get("to_annotation")
        else self.search_url_for_annotation(url)
    )

    self.attempt_and_report(name, localname, link, genomes_dir)
def download_annotation(self, url, genomes_dir=None, localname=None, **kwargs):
    """
    Attempt to download a gtf or gff3 file from the same location as the genome url.

    Candidate links are tried in order until one installs successfully.

    Parameters
    ----------
    url : str
        url of where to download genome from
    genomes_dir : str , optional
        Directory to install annotation
    localname : str , optional
        Custom name for your genome
    kwargs : dict , optional
        Provider specific options.

        to_annotation : str , optional
            url to annotation file (only required if this not located in the
            same directory as the fasta)

    Returns
    -------
    GenomeDownloadError or None
        None when a candidate was installed (or when there were no
        candidates); otherwise the error raised for the final candidate.
        The error is returned, not raised.
    """
    name = get_localname(url)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)

    if kwargs.get("to_annotation"):
        links = [self.get_annotation_download_link(None, **kwargs)]
    else:
        # can return multiple possible hits
        links = self.search_url_for_annotations(url, name)

    # Identify the final candidate by position, not by value: if the same
    # link occurs more than once, a value comparison would stop retrying early.
    last_index = len(links) - 1
    for index, link in enumerate(links):
        try:
            self.attempt_and_report(name, localname, link, genomes_dir)
            break
        except GenomeDownloadError as e:
            if index != last_index:
                sys.stdout.write(
                    "\nOne of the potential annotations was incompatible with genomepy."
                    + "\nAttempting another...\n\n")
                continue
            return e
def download_annotation(self, name, genomes_dir=None, localname=None, **kwargs):
    """
    Download an annotation file to a specific directory.

    The assembly report is downloaded first, then the annotation; on success
    the annotation url is recorded in the README.

    Parameters
    ----------
    name : str
        Genome / species name
    genomes_dir : str , optional
        Directory to install annotation
    localname : str , optional
        Custom name for your genome
    kwargs : dict , optional
        Passed on to ``get_annotation_download_link``.

    Raises
    ------
    GenomeDownloadError
        If downloading the assembly report or the annotation fails. The
        original exception is chained as the cause.
    """
    name = self._check_name(name)
    link = self.get_annotation_download_link(name, **kwargs)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)

    logger.info(
        f"Downloading annotation from {self.name}. Target URL: {link}...")
    try:
        # download exact assembly report to rename the scaffolds
        acc = self.assembly_accession(name)
        fname = os.path.join(genomes_dir, localname, "assembly_report.txt")
        download_assembly_report(acc, fname)

        download_annotation(genomes_dir, link, localname)
        logger.info("Annotation download successful")
    except Exception as e:
        # exceptions raised without arguments have empty args; fall back to
        # the exception itself so the real failure is never masked by an IndexError
        reason = e.args[0] if e.args else e
        raise GenomeDownloadError(
            f"An error occured while installing the gene annotation for {name} from {self.name}.\n"
            "If you think the annotation should be there, please file a bug report at: "
            "https://github.com/vanheeringen-lab/genomepy/issues\n\n"
            f"Error: {reason}") from e

    # Add annotation URL to readme
    readme = os.path.join(genomes_dir, localname, "README.txt")
    update_readme(readme, updated_metadata={"annotation url": link})
def head_annotation(self, name, genomes_dir=None, n: int = 5, **kwargs):
    """
    Download the first n genes of each UCSC annotation type.

    For every annotation type the downloaded GTF is opened and its first
    n lines are printed for review (of the gene_name field, for instance).

    Parameters
    ----------
    name : str
        genome name
    genomes_dir : str, optional
        genomes directory to install the annotation in.
    n : int, optional
        download the annotation for n genes.
    kwargs : dict , optional
        annotations : list
            specify which UCSC annotation types to download.
            Downloads all available if left blank.

    Raises
    ------
    ValueError
        If a requested annotation type is not in ANNOTATIONS.
    """
    name = self._check_name(name)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)

    requested = kwargs.get("annotations")
    if requested is None:
        requested = self.annotation_links(name)

    for annot in requested:
        if annot not in ANNOTATIONS:
            raise ValueError(f"{annot} not in {ANNOTATIONS}")

        localname = f"{name}_head_{annot}"
        gtf_path = os.path.join(genomes_dir, localname, f"{localname}.annotation.gtf")
        download_annotation(name, annot, genomes_dir, localname, n=n)

        logger.info(f"{self.name} {annot}")
        with open(gtf_path) as gtf:
            for line_number, line in enumerate(gtf, start=1):
                if line:
                    print(line.strip())
                if line_number == n:
                    break
def list_installed_genomes(genomes_dir=None):
    """
    List all available genomes.

    Parameters
    ----------
    genomes_dir : str , optional
        Directory with installed genomes.

    Returns
    -------
    list
        Names of the subdirectories recognised as genome directories;
        empty if the genomes directory does not exist.
    """
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    if not os.path.exists(genomes_dir):
        return []

    installed = []
    for entry in os.listdir(genomes_dir):
        if _is_genome_dir(os.path.join(genomes_dir, entry)):
            installed.append(entry)
    return installed
def __init__(self, name, genomes_dir=None):
    """
    Initialize the genome: resolve its name and fasta, then record file paths and metadata.

    Parameters
    ----------
    name : str
        Genome name or path, interpreted by ``_parse_name`` and ``_parse_filename``.
    genomes_dir : str, optional
        Genomes directory, resolved with ``get_genomes_dir`` (existence is not checked).
    """
    self.genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    self.name = self._parse_name(name)
    self.filename = self._parse_filename(name)
    super(Genome, self).__init__(self.filename)

    # file paths (all derived from the fasta location)
    fasta = self.filename
    genome_dir = os.path.dirname(fasta)
    self.genome_file = fasta
    self.genome_dir = genome_dir
    self.index_file = fasta + ".fai"
    self.sizes_file = fasta + ".sizes"
    self.gaps_file = os.path.join(genome_dir, self.name + ".gaps.bed")
    self.readme_file = os.path.join(genome_dir, "README.txt")

    # genome attributes; sizes and gaps start out empty
    self.sizes = {}
    self.gaps = {}
    readme_metadata = self._read_metadata()
    self.tax_id = readme_metadata.get("tax_id")
    self.assembly_accession = readme_metadata.get("assembly_accession")
def head_annotation(self, name: str, genomes_dir=None, n: int = 5, **kwargs):
    """
    Download the head of the annotation and print its first n records.

    Empty lines and lines starting with "#" are skipped when printing.

    Parameters
    ----------
    name : str
        genome name
    genomes_dir : str, optional
        genomes directory to install the annotation in.
    n : int, optional
        download the annotation for n genes.
    kwargs : dict , optional
        Passed on to ``get_annotation_download_link``.
    """
    name = self._check_name(name)
    link = self.get_annotation_download_link(name, **kwargs)

    localname = f"{name}_head"
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    gtf_path = os.path.join(genomes_dir, localname, f"{localname}.annotation.gtf")

    download_annotation(genomes_dir, link, localname, n=n)

    logger.info(self.name)
    printed = 0
    with open(gtf_path) as gtf:
        for raw_line in gtf:
            record = raw_line.strip()
            if record and not record.startswith("#"):
                print(record)
                printed += 1
            if printed == n:
                break
def list_installed_genomes(genomes_dir: str = None) -> list:
    """
    List all locally available genomes.

    Parameters
    ----------
    genomes_dir : str, optional
        Directory with genomes installed by genomepy.

    Returns
    -------
    list
        genome names; empty if the genomes directory does not exist
    """
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    if not os.path.exists(genomes_dir):
        return []

    def is_installed(entry):
        # an entry counts when its full path is recognised as a genome directory
        return _is_genome_dir(os.path.join(genomes_dir, entry))

    return [entry for entry in os.listdir(genomes_dir) if is_installed(entry)]
def _get_name_and_dir(name, genomes_dir=None):
    """
    Return the name and directory of the genome.

    ``name`` is resolved, in order, as: an existing file (bed, gtf or fasta,
    optionally gzipped), an existing directory, or an entry in the genomes
    directory.

    Parameters
    ----------
    name : str
        Path to a supported file, path to a genome directory, or the name of
        an entry in the genomes directory.
    genomes_dir : str, optional
        Genomes directory, resolved with ``get_genomes_dir``.

    Returns
    -------
    tuple
        (name, genome_dir)

    Raises
    ------
    NotImplementedError
        If ``name`` is a file whose filename contains none of the supported extensions.
    FileNotFoundError
        If ``name`` cannot be resolved in any of the three ways.
    """
    fname = cleanpath(name)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    if os.path.isfile(fname):
        exts = ["gtf", "GTF", "bed", "BED", "fa"]
        basename = os.path.basename(fname)
        # Only inspect the filename: matching against the full path would let
        # a parent directory such as ".../fasta/" approve an unsupported file.
        if not any(ext in basename for ext in exts):
            raise NotImplementedError(
                "Only (gzipped) bed, gtf or fasta files are supported!")
        genome_dir = os.path.dirname(fname)
        name = safe(basename)
        # remove suffixes
        any_ext = "(" + ")|(".join(exts) + ")"
        name = re.sub(fr"(\.annotation)?\.({any_ext})(\.gz)?$", "", name)
    elif os.path.isdir(fname):
        genome_dir = fname
        name = safe(os.path.basename(fname))
    # guard the listing: genomes_dir was resolved without an existence check,
    # and a missing directory should give the "Could not find" error below
    elif os.path.isdir(genomes_dir) and name in os.listdir(genomes_dir):
        genome_dir = os.path.join(genomes_dir, name)
    else:
        raise FileNotFoundError(f"Could not find {name}")
    return name, genome_dir
def download_annotation(self, name, genomes_dir=None, localname=None, **kwargs):
    """
    Download the UCSC genePred via their MySQL database, and convert to annotations.

    On success the database and table used are recorded in the README.

    Parameters
    ----------
    name : str
        Genome name
    genomes_dir : str , optional
        Directory to install annotation
    localname : str , optional
        Custom name for your genome
    kwargs : dict , optional
        Passed on to ``get_annotation_download_link``.

    Raises
    ------
    GenomeDownloadError
        If the download fails. The original exception is chained as the cause.
    """
    name = self._check_name(name)
    annot = self.get_annotation_download_link(name, **kwargs)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)

    logger.info(
        f"Downloading the {annot} annotation from the UCSC MySQL database."
    )
    try:
        download_annotation(name, annot, genomes_dir, localname)
        logger.info("Annotation download successful")
    except Exception as e:
        # exceptions raised without arguments have empty args; fall back to
        # the exception itself so the real failure is never masked by an IndexError
        reason = e.args[0] if e.args else e
        raise GenomeDownloadError(
            f"An error occured while installing the gene annotation for {name} from {self.name}.\n"
            "If you think the annotation should be there, please file a bug report at: "
            "https://github.com/vanheeringen-lab/genomepy/issues\n\n"
            f"Error: {reason}") from e

    # Add annotation URL to readme
    readme = os.path.join(genomes_dir, localname, "README.txt")
    update_readme(
        readme,
        updated_metadata={
            "annotation url": f"UCSC MySQL database: {name}, table: {annot}"
        },
    )
def download_genome(
    self,
    name: str,
    genomes_dir: str = None,
    localname: str = None,
    mask: str = "soft",
    **kwargs,
):
    """
    Download a (gzipped) genome file to a specific directory.

    The genome is fetched into a temporary directory inside the output
    directory, decompressed if needed, optionally post-processed, and then
    moved into place. A README with the download metadata is written last.

    Parameters
    ----------
    name : str
        Genome / species name
    genomes_dir : str , optional
        Directory to install genome
    localname : str , optional
        Custom name for your genome
    mask : str , optional
        Masking, soft, hard or none (all other strings)
    kwargs : dict , optional
        Passed on to ``get_genome_download_link``.
    """
    name = self._check_name(name)
    link = self.get_genome_download_link(name, mask=mask, **kwargs)

    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)
    mkdir_p(out_dir)

    logger.info(
        f"Downloading genome from {self.name}. Target URL: {link}...")

    # a link that exists on disk is copied, anything else is downloaded
    if os.path.exists(link):
        get_file = shutil.copyfile
    else:
        get_file = download_file

    # download to tmp dir. Move genome on completion.
    # tmp dir is in genome_dir to prevent moving the genome between disks
    final_fasta = os.path.join(out_dir, f"{localname}.fa")
    with TemporaryDirectory(dir=out_dir) as tmp_dir:
        downloaded = os.path.join(tmp_dir, link.split("/")[-1])
        tmp_fasta = os.path.join(tmp_dir, f"{localname}.fa")

        get_file(link, downloaded)
        logger.info(
            "Genome download successful, starting post processing...")

        # unzip genome
        _, is_compressed = get_file_info(link)
        if not is_compressed:
            shutil.move(downloaded, tmp_fasta)
        else:
            extract_archive(downloaded, outfile=tmp_fasta, concat=True)

        # process genome (e.g. masking)
        if hasattr(self, "_post_process_download"):
            self._post_process_download(name=name,
                                        fname=tmp_fasta,
                                        out_dir=out_dir,
                                        mask=mask)

        # transfer the genome from the tmpdir to the genome_dir
        shutil.move(tmp_fasta, final_fasta)

    logger.info("name: {}".format(name))
    logger.info("local name: {}".format(localname))
    logger.info("fasta: {}".format(final_fasta))

    # Create readme with information
    readme = os.path.join(genomes_dir, localname, "README.txt")
    asm_acc = self.assembly_accession(name)
    tax_id = self.genome_taxid(name)
    metadata = {
        "name": localname,
        "provider": self.name,
        "original name": name,
        "original filename": os.path.split(link)[-1],
        "assembly_accession": asm_acc or "na",
        "tax_id": tax_id or "na",
        "mask": mask,
        "genome url": link,
        "genomepy version": __version__,
        "date": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    update_readme(readme, metadata)
def install_genome(
    name,
    provider=None,
    genomes_dir=None,
    localname=None,
    mask="soft",
    keep_alt=False,
    regex=None,
    invert_match=False,
    bgzip=None,
    annotation=False,
    only_annotation=False,
    skip_sanitizing=False,
    threads=1,
    force=False,
    **kwargs,
):
    """
    Install a genome, and optionally its gene annotation.

    Parameters
    ----------
    name : str
        Genome name
    provider : str , optional
        Provider name. will try Ensembl, UCSC and NCBI (in that order) if not specified.
    genomes_dir : str , optional
        Where to store the fasta files
    localname : str , optional
        Custom name for this genome.
    mask : str , optional
        Masking level: 'hard', 'soft' or 'none'. Default is 'soft'.
    keep_alt : bool , optional
        Some genomes contain alternative regions. These regions cause issues with
        sequence alignment, as they are inherently duplications of the consensus regions.
        Set to true to keep these alternative regions.
    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.
    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.
    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.
    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.
    only_annotation : bool , optional
        If set to True, only download the annotation files.
    skip_sanitizing : bool , optional
        If set to True, downloaded annotation files whose sequence names do not match
        with the (first header fields of) the genome.fa will not be corrected.
    threads : int , optional
        Build genome index using multithreading (if supported). Default: lowest of 8/all threads
    force : bool , optional
        Set to True to overwrite existing files.
    kwargs : dict , optional
        Provider specific options.

        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.
        version : int , optional
            Ensembl only: Specify release version. Default is latest.
        to_annotation : text , optional
            URL only: direct link to annotation file.
            Required if this is not the same directory as the fasta.
    """
    name = safe(name)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)

    # Download the genome unless it is already present (or downloading is forced)
    genome_found = _is_genome_dir(out_dir)
    needs_genome = not genome_found or force
    if needs_genome and not only_annotation:
        genome_provider = _provider_selection(name, localname, genomes_dir, provider)
        genome_provider.download_genome(
            name,
            genomes_dir,
            mask=mask,
            keep_alt=keep_alt,
            regex=regex,
            invert_match=invert_match,
            localname=localname,
            bgzip=bgzip,
            **kwargs,
        )
        genome_found = True

        # Export installed genome(s)
        generate_env(genomes_dir=genomes_dir)

    # Generates a Fasta object, index, gaps and sizes file
    g = None
    if genome_found:
        g = Genome(localname, genomes_dir=genomes_dir)
        if force:
            # overwrite previous versions
            generate_fa_sizes(g.genome_file, g.sizes_file)
            generate_gap_bed(g.genome_file, g.gaps_file)

    # Any annotation-related flag implies that the annotation is wanted
    annotation_flags = [
        annotation,
        only_annotation,
        skip_sanitizing,
        kwargs.get("to_annotation"),
        kwargs.get("ucsc_annotation_type"),
    ]
    if any(annotation_flags):
        annotation = True

    # Download the annotation unless it is already present (or downloading is forced)
    annotation_found = bool(glob_ext_files(out_dir, "gtf"))
    needs_annotation = not annotation_found or force
    if needs_annotation and annotation:
        annotation_provider = _provider_selection(name, localname, genomes_dir, provider)
        annotation_provider.download_annotation(
            name, genomes_dir, localname=localname, **kwargs
        )

        # Sanitize annotation if needed (requires genome)
        annotation_found = bool(glob_ext_files(out_dir, "gtf"))
        if genome_found and annotation_found and not skip_sanitizing:
            sanitize_annotation(g)

    if not genome_found:
        return

    # Run all active plugins (requires genome)
    for plugin in get_active_plugins():
        plugin.after_genome_download(g, threads, force)
def install_genome(
    name: str,
    provider: Optional[str] = None,
    genomes_dir: Optional[str] = None,
    localname: Optional[str] = None,
    mask: Optional[str] = "soft",
    keep_alt: Optional[bool] = False,
    regex: Optional[str] = None,
    invert_match: Optional[bool] = False,
    bgzip: Optional[bool] = None,  # None -> check config. False -> dont check.
    annotation: Optional[bool] = False,
    only_annotation: Optional[bool] = False,
    skip_matching: Optional[bool] = False,
    skip_filter: Optional[bool] = False,
    threads: Optional[int] = 1,
    force: Optional[bool] = False,
    **kwargs: Optional[dict],
) -> Genome:
    """
    Install a genome (& gene annotation).

    Parameters
    ----------
    name : str
        Genome name
    provider : str , optional
        Provider name. will try Ensembl, UCSC and NCBI (in that order) if not specified.
    genomes_dir : str , optional
        Where to create the output folder.
    localname : str , optional
        Custom name for this genome.
    mask : str , optional
        Genome masking of repetitive sequences. Options: hard/soft/none, default is soft.
    keep_alt : bool , optional
        Some genomes contain alternative regions. These regions cause issues with
        sequence alignment, as they are inherently duplications of the consensus regions.
        Set to true to keep these alternative regions.
    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.
    invert_match : bool , optional
        Set to True to select all chromosomes that *don't* match the regex.
    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip,
        and gene annotation will be compressed with gzip.
        If left as None, the "bgzip" setting from the config decides.
    threads : int , optional
        Build genome index using multithreading (if supported). Default: lowest of 8/all threads.
    force : bool , optional
        Set to True to overwrite existing files.
    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.
    only_annotation : bool , optional
        If set to True, only download the gene annotation files.
    skip_matching : bool , optional
        If set to True, contigs in the annotation not matching those in the genome will not be corrected.
    skip_filter : bool , optional
        If set to True, the gene annotations will not be filtered to match the genome contigs.
    kwargs : dict , optional
        Provider specific options.

        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.
        version : int , optional
            Ensembl only: Specify release version. Default is latest.
        to_annotation : text , optional
            URL only: direct link to annotation file.
            Required if this is not the same directory as the fasta.

    Returns
    -------
    Genome
        Genome class with the installed genome
        (None when no genome is present after installation).
    """
    name = safe(name)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)
    genome_file = os.path.join(out_dir, f"{localname}.fa")
    provider = _provider_selection(name, localname, genomes_dir, provider)

    # check which files need to be downloaded
    genome_found = _is_genome_dir(out_dir)
    need_genome = (
        genome_found is False or force is True
    ) and only_annotation is False

    annotation_found = bool(glob_ext_files(out_dir, "annotation.gtf")) and bool(
        glob_ext_files(out_dir, "annotation.bed")
    )
    need_annotation = (annotation_found is False or force is True) and any(
        [
            annotation,
            only_annotation,
            skip_matching,
            skip_filter,
            kwargs.get("to_annotation"),
            kwargs.get("path_to_annotation"),
            kwargs.get("ucsc_annotation_type"),
        ]
    )

    genome = None
    genome_downloaded = False
    if need_genome:
        if force:
            _delete_extensions(out_dir, ["fa", "fai"])
        provider.download_genome(
            name,
            genomes_dir,
            mask=mask,
            localname=localname,
            **kwargs,
        )
        genome_found = genome_downloaded = True

        # Filter genome
        _filter_genome(genome_file, regex, invert_match, keep_alt)

        # Generates a Fasta object and the genome index, gaps and sizes files
        genome = Genome(localname, genomes_dir=genomes_dir)

        # Download the NCBI assembly report
        report_path = os.path.join(out_dir, "assembly_report.txt")
        accession = genome.assembly_accession
        if not os.path.exists(report_path) and accession != "na":
            download_assembly_report(accession, report_path)

        # Export installed genome(s)
        generate_env(genomes_dir=genomes_dir)

    annotation_downloaded = False
    if need_annotation:
        if force:
            _delete_extensions(out_dir, ["annotation.gtf", "annotation.bed"])
        provider.download_annotation(name, genomes_dir, localname=localname, **kwargs)
        annotation_downloaded = bool(
            glob_ext_files(out_dir, "annotation.gtf")
        ) and bool(glob_ext_files(out_dir, "annotation.bed"))

    if annotation_downloaded:
        # from here on, `annotation` holds the Annotation object instead of the flag
        annotation = Annotation(localname, genomes_dir=genomes_dir)
        sanitizing_wanted = not (skip_matching and skip_filter)
        if genome_found and sanitizing_wanted:
            annotation.sanitize(not skip_matching, not skip_filter, True)

    # Run active plugins (also if the genome was downloaded earlier)
    if genome_found:
        if not genome:
            genome = Genome(localname, genomes_dir=genomes_dir)
        for plugin in get_active_plugins():
            plugin.after_genome_download(genome, threads, force)

    # zip files downloaded now
    compress = bgzip is True or (bgzip is None and config.get("bgzip"))
    if compress:
        if genome_downloaded:
            bgzip_and_name(genome.filename)
        if annotation_downloaded:
            gzip_and_name(annotation.annotation_gtf_file)
            gzip_and_name(annotation.annotation_bed_file)

    return genome
def download_genome(
    self,
    name,
    genomes_dir=None,
    localname=None,
    mask="soft",
    keep_alt=False,
    regex=None,
    invert_match=False,
    bgzip=None,
    **kwargs,
):
    """
    Download a (gzipped) genome file to a specific directory.

    The genome is downloaded into a temporary directory inside the output
    directory, decompressed, filtered, optionally post-processed and
    bgzipped, then moved into place. The temporary directory is removed
    whether or not these steps succeed. A README describing the download
    (and any excluded sequences) is written last.

    Parameters
    ----------
    name : str
        Genome / species name
    genomes_dir : str , optional
        Directory to install genome
    localname : str , optional
        Custom name for your genome
    mask : str , optional
        Masking, soft, hard or none (all other strings)
    keep_alt : bool , optional
        Set to true to keep these alternative regions.
    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.
    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.
    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.
    kwargs : dict , optional
        Passed on to ``get_genome_download_link``.

    Raises
    ------
    subprocess.CalledProcessError
        If gunzip or the bgzip pipeline exits with a non-zero status.
    """
    import shlex  # local import: only needed to quote paths for the shell pipeline

    name = safe(name)
    self.check_name(name)

    link = self.get_genome_download_link(name, mask=mask, **kwargs)

    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)
    if not os.path.exists(out_dir):
        mkdir_p(out_dir)

    sys.stderr.write(
        f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

    def regex_filer(_fname, _regex, _v):
        """Filter the fasta in place and return the names of the removed sequences."""
        infa = _fname + "_to_regex"
        os.rename(_fname, infa)
        # filter the fasta and store the output's keys
        keys_out = filter_fasta(infa,
                                outfa=_fname,
                                regex=_regex,
                                v=_v,
                                force=True).keys()
        keys_in = Fasta(infa).keys()
        return [k for k in keys_in if k not in keys_out]

    # download to tmp dir. Move genome on completion.
    # tmp dir is in genome_dir to prevent moving the genome between disks
    tmp_dir = mkdtemp(dir=out_dir)
    try:
        fname = os.path.join(tmp_dir, f"{localname}.fa")

        urlcleanup()
        download_file(link, fname)
        sys.stderr.write(
            "Genome download successful, starting post processing...\n")

        # unzip genome
        if link.endswith(".tar.gz"):
            tar_to_bigfile(fname, fname)
        elif link.endswith(".gz"):
            os.rename(fname, fname + ".gz")
            # check_call raises CalledProcessError on a non-zero exit status,
            # so no separate return-code check is needed
            sp.check_call(["gunzip", "-f", fname])

        not_included = []
        # remove alternative regions
        if not keep_alt:
            not_included.extend(regex_filer(fname, "alt", True))

        # keep/remove user defined regions
        if regex:
            not_included.extend(regex_filer(fname, regex, invert_match))

        # process genome (e.g. masking)
        if hasattr(self, "_post_process_download"):
            self._post_process_download(name=name,
                                        localname=localname,
                                        out_dir=tmp_dir,
                                        mask=mask)

        # bgzip genome if requested
        if bgzip or config.get("bgzip"):
            # bgzip to stdout, track progress, and output to file
            fsize = int(os.path.getsize(fname) * 10**-6)
            # quote the paths: they derive from user-supplied directory and
            # genome names and are interpolated into a shell command line
            cmd = (
                f"bgzip -fc {shlex.quote(fname)} | "
                f"tqdm --bytes --desc Bgzipping {fsize}MB fasta --log ERROR | "
                f"cat > {shlex.quote(fname + '.gz')}")
            sp.check_call(cmd, shell=True)
            fname += ".gz"

        # transfer the genome from the tmpdir to the genome_dir
        src = fname
        dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
        shutil.move(src, dst)
    finally:
        # always clean up, also when a step above failed
        rm_rf(tmp_dir)

    sys.stderr.write("\n")
    sys.stderr.write("name: {}\n".format(name))
    sys.stderr.write("local name: {}\n".format(localname))
    sys.stderr.write("fasta: {}\n".format(dst))

    # Create readme with information
    readme = os.path.join(genomes_dir, localname, "README.txt")
    metadata = {
        "name": localname,
        "provider": self.name,
        "original name": name,
        "original filename": os.path.split(link)[-1],
        "assembly_accession":
        self.assembly_accession(self.genomes.get(name)),
        "tax_id": self.genome_taxid(self.genomes.get(name)),
        "mask": mask,
        "genome url": link,
        "annotation url": "na",
        "date": time.strftime("%Y-%m-%d %H:%M:%S"),
    }

    # describe the applied filters and list the sequences they removed
    lines = []
    if not keep_alt or regex:
        regex_line = "regex: "
        if not keep_alt:
            regex_line += "'alt' (inverted match)"

        if not keep_alt and regex:
            regex_line += " and "

        if regex:
            regex_line += f"'{regex}'"
            if invert_match:
                regex_line += " (inverted match)"

        lines += ["", regex_line, "sequences that were excluded:"]
        lines.extend(f"\t{seq}" for seq in not_included)
    write_readme(readme, metadata, lines)
def download_genome(
    self,
    name,
    genomes_dir=None,
    localname=None,
    mask="soft",
    keep_alt=False,
    regex=None,
    invert_match=False,
    bgzip=None,
    **kwargs,
):
    """
    Download a (gzipped) genome file to a specific directory.

    The genome is downloaded into a temporary directory inside the output
    directory, decompressed, filtered, optionally post-processed and
    bgzipped, then moved into place. A README describing the download is
    written last.

    Parameters
    ----------
    name : str
        Genome / species name
    genomes_dir : str , optional
        Directory to install genome
    localname : str , optional
        Custom name for your genome
    mask : str , optional
        Masking, soft, hard or none (all other strings)
    keep_alt : bool , optional
        Set to true to keep these alternative regions.
    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.
    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.
    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.
    kwargs : dict , optional
        Passed on to ``get_genome_download_link``.

    Raises
    ------
    subprocess.CalledProcessError
        If gunzip or bgzip exits with a non-zero status.
    """
    name = safe(name)
    self.check_name(name)

    link = self.get_genome_download_link(name, mask=mask, **kwargs)

    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)
    if not os.path.exists(out_dir):
        mkdir_p(out_dir)

    sys.stderr.write(
        f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

    def regex_filer(_fname, _regex, _v):
        """Filter the fasta in place and return the names of the removed sequences."""
        infa = _fname + "_to_regex"
        outfa = _fname
        os.rename(_fname, infa)
        filter_fasta(infa, outfa, regex=_regex, v=_v, force=True)
        # read the kept names once instead of re-opening the fasta per sequence
        kept = set(Fasta(outfa).keys())
        return [k for k in Fasta(infa).keys() if k not in kept]

    # download to tmp dir. Move genome on completion.
    # tmp dir is in genome_dir to prevent moving the genome between disks
    with TemporaryDirectory(dir=out_dir) as tmp_dir:
        fname = os.path.join(tmp_dir, f"{localname}.fa")

        # actual download
        urlcleanup()
        with urlopen(link) as response:
            # check available memory vs file size.
            available_memory = int(virtual_memory().available)
            # the size header is optional; treat an absent one as unknown
            content_length = response.info().get("Content-Length")
            file_size = None if content_length is None else int(content_length)
            # download file in chunks if >75% of memory would be used
            # (or if the size is unknown)
            cutoff = int(available_memory * 0.75)
            if file_size is not None and file_size < cutoff:
                chunk_size = None
            else:
                chunk_size = cutoff
            with open(fname, "wb") as f_out:
                shutil.copyfileobj(response, f_out, chunk_size)
        sys.stderr.write(
            "Genome download successful, starting post processing...\n")

        # unzip genome
        if link.endswith(".tar.gz"):
            tar_to_bigfile(fname, fname)
        elif link.endswith(".gz"):
            os.rename(fname, fname + ".gz")
            # check_call raises CalledProcessError on a non-zero exit status,
            # so no separate return-code check is needed
            sp.check_call(["gunzip", "-f", fname])

        not_included = []
        # remove alternative regions
        if not keep_alt:
            not_included.extend(regex_filer(fname, "alt", True))

        # keep/remove user defined regions
        if regex:
            not_included.extend(regex_filer(fname, regex, invert_match))

        # process genome (e.g. masking)
        if hasattr(self, "_post_process_download"):
            self._post_process_download(name=name,
                                        localname=localname,
                                        out_dir=tmp_dir,
                                        mask=mask)

        # bgzip genome if requested
        if bgzip or config.get("bgzip"):
            sp.check_call(["bgzip", "-f", fname])
            fname += ".gz"

        # transfer the genome from the tmpdir to the genome_dir
        src = fname
        dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
        shutil.move(src, dst)

    sys.stderr.write("\n")
    sys.stderr.write("name: {}\n".format(name))
    sys.stderr.write("local name: {}\n".format(localname))
    sys.stderr.write("fasta: {}\n".format(dst))

    # Create readme with information
    readme = os.path.join(genomes_dir, localname, "README.txt")
    metadata = {
        "name": localname,
        "provider": self.name,
        "original name": name,
        "original filename": os.path.split(link)[-1],
        "assembly_accession":
        self.assembly_accession(self.genomes.get(name)),
        "tax_id": self.genome_taxid(self.genomes.get(name)),
        "mask": mask,
        "genome url": link,
        "annotation url": "na",
        "date": time.strftime("%Y-%m-%d %H:%M:%S"),
    }

    # describe the user regex and list the excluded sequences
    lines = []
    if regex:
        regex_line = f"regex: {regex}"
        if invert_match:
            regex_line += " (inverted match)"
        lines += ["", regex_line, "sequences that were excluded:"]
        lines.extend(f"\t{seq}" for seq in not_included)
    write_readme(readme, metadata, lines)
def map_locations(
    frm: str, to: str, genomes_dir: Optional[str] = None
) -> Optional[pd.DataFrame]:
    """
    Load chromosome mapping from one assembly to another using the
    NCBI assembly reports.

    Parameters
    ----------
    frm: str
        A local genomepy genome name
    to: str
        target provider (UCSC, Ensembl or NCBI)
    genomes_dir: str, optional
        The genomes directory to look for the genomes.
        Will search the default genomes_dir if left blank.

    Returns
    -------
    pandas.DataFrame
        Chromosome mapping, indexed by the source provider's names.
        None (after a logged warning) if source and target provider are the
        same, the source provider is not supported, no assembly report is
        available, or UCSC names are required but missing.

    Raises
    ------
    ValueError
        If ``to`` is not one of UCSC, NCBI or Ensembl.
    FileNotFoundError
        If the README of ``frm`` cannot be found in the genomes directory.
    """
    supported = ["ucsc", "ncbi", "ensembl"]
    to_provider = to.lower()
    if to_provider not in supported:
        raise ValueError(f"Genomepy can only map to NCBI, UCSC or Ensembl, not '{to}'.")

    genomes_dir = get_genomes_dir(genomes_dir)
    frm_readme = os.path.join(genomes_dir, frm, "README.txt")
    frm_asm_report = os.path.join(genomes_dir, frm, "assembly_report.txt")
    if not os.path.exists(frm_readme):
        raise FileNotFoundError(f"Cannot find {frm} in {genomes_dir}.")

    metadata, _ = read_readme(frm_readme)
    # a README without a provider entry is handled as an unsupported provider
    frm_provider = (metadata.get("provider") or "").lower()
    if frm_provider == to_provider:
        logger.warning(f"You are attempting to map {frm} from {to} to {to}.")
        return None
    if frm_provider not in supported:
        # only these providers get a "<provider>_name" column below; anything
        # else would fail with a KeyError when selecting the mapping columns
        logger.warning(
            f"Genomepy can only map from NCBI, UCSC or Ensembl, not '{frm_provider}'."
        )
        return None

    asm_acc = metadata.get("assembly_accession")
    if not os.path.exists(frm_asm_report):
        download_assembly_report(asm_acc, frm_asm_report)
    if not os.path.exists(frm_asm_report):
        logger.warning("Cannot map without an assembly report.")
        return None

    asm_report = pd.read_csv(frm_asm_report, sep="\t", comment="#", dtype=str)

    asm_report["ensembl_name"] = asm_report["Sequence-Name"]
    asm_report["ncbi_name"] = asm_report["Sequence-Name"]
    asm_report["ucsc_name"] = asm_report["UCSC-style-name"]

    # for Ensembl, use GenBank names for the scaffolds
    is_scaffold = asm_report["Sequence-Role"] != "assembled-molecule"
    asm_report.loc[is_scaffold, "ensembl_name"] = asm_report.loc[
        is_scaffold, "GenBank-Accn"
    ]

    if "ucsc" in [frm_provider, to_provider] and list(
        asm_report["ucsc_name"].unique()
    ) == ["na"]:
        logger.warning("UCSC style names not available for this assembly.")
        return None

    mapping = asm_report[[f"{frm_provider}_name", f"{to_provider}_name"]]
    mapping = mapping.dropna().drop_duplicates().set_index(f"{frm_provider}_name")
    return mapping