def get_genome_download_link(self, name, mask="soft", **kwargs):
    """
    Return NCBI ftp link to top-level genome sequence

    Parameters
    ----------
    name : str
        Genome name. Current implementation will fail if exact
        name is not found.

    mask : str , optional
        Masking level. Options: soft, hard or none. Default is soft.
        Not used to build the link: only soft masked genomes are available,
        they can be (un)masked in _post_process_download.

    Returns
    ------
    str with the http/ftp download link.

    Raises
    ------
    GenomeDownloadError
        if no link to the genome fasta was found
    """
    link = self._ftp_or_html_link(name, file_suffix="_genomic.fna.gz")
    if not link:
        raise GenomeDownloadError(
            f"Could not download genome {name} from {self.name}.\n"
            "URL is broken. Select another genome or provider.\n"
            f"Broken URL: {link}"
        )
    return link
def get_annotation_download_link(self, name: str, **kwargs) -> str:
    """
    Return a functional annotation download link.

    A user supplied link (to_annotation) takes precedence;
    without one, the first link from get_annotation_download_links is used.

    Parameters
    ----------
    name : str
        genome name
    **kwargs: dict, optional:
        to_annotation : direct URL to the gene annotation

    Returns
    -------
    str
        http/ftp link

    Raises
    ------
    GenomeDownloadError
        if no functional link was found
    TypeError
        if to_annotation does not point to a gtf, gff, gff3 or bed file
    """
    direct_link = kwargs.get("to_annotation")
    if not direct_link:
        candidates = self.get_annotation_download_links(name)
        if candidates:
            return candidates[0]
        raise GenomeDownloadError(
            f"No gene annotations found for {get_localname(name)}.\n"
        )

    # the user supplied link is only validated by its file extension
    extension = get_file_info(direct_link)[0]
    if extension not in [".gtf", ".gff", ".gff3", ".bed"]:
        raise TypeError("Only (gzipped) gtf, gff and bed files are supported.\n")
    return direct_link
def attempt_and_report(self, name, localname, link, genomes_dir):
    """
    Download the annotation from link, report progress on stderr
    and store the annotation URL in the genome's README.txt.

    Parameters
    ----------
    name : str
        genome name, used in the messages

    localname : str
        name of the genome directory inside genomes_dir

    link : str
        annotation URL. If empty, a message is written and nothing is downloaded.

    genomes_dir : str
        directory containing the genome directory

    Raises
    ------
    GenomeDownloadError
        if download_and_generate_annotation raises any Exception
    """
    report = sys.stderr.write

    if not link:
        report(f"Could not download genome annotation for {name} from {self.name}.\n")
        return

    report(f"Downloading annotation from {self.name}.\nTarget URL: {link}...\n")
    try:
        self.download_and_generate_annotation(genomes_dir, link, localname)
    except Exception:
        raise GenomeDownloadError(
            f"\nCould not download annotation for {name} from {self.name}\n"
            "If you think the annotation should be there, please file a bug report at:\n"
            "https://github.com/vanheeringen-lab/genomepy/issues\n"
        )

    # TODO sanity check for genes
    report("Annotation download successful\n")

    # store the annotation URL in the README
    readme_path = os.path.join(genomes_dir, localname, "README.txt")
    readme_metadata, readme_lines = read_readme(readme_path)
    readme_metadata["annotation url"] = link
    write_readme(readme_path, readme_metadata, readme_lines)
def get_annotation_download_link(self, name: str, **kwargs) -> str:
    """
    Return a functional annotation download link.

    Parameters
    ----------
    name : str
        genome name
    **kwargs : dict , optional
        passed on to annotation_links

    Returns
    -------
    str
        http/ftp link (the first of the links found)

    Raises
    ------
    GenomeDownloadError
        if no functional link was found
    """
    candidates = self.annotation_links(name, **kwargs)
    if not candidates:
        raise GenomeDownloadError(
            f"No gene annotations found for {name} on {self.name}.\n"
            "Check for typos or try\n"
            f"  genomepy search {name} -p {self.name}"
        )
    return candidates[0]
def get_genome_download_link(self, name, mask="soft", **kwargs):
    """
    Return NCBI ftp link to top-level genome sequence

    Parameters
    ----------
    name : str
        Genome name. Current implementation will fail if exact
        name is not found.

    mask : str , optional
        Masking level. Options: soft, hard or none. Default is soft.
        Not used to build the link: only soft masked genomes are available,
        they can be (un)masked in _post_process_download.

    Returns
    ------
    str with the http/ftp download link.

    Raises
    ------
    GenomeDownloadError
        if the constructed link does not pass check_url
    """
    entry = self.genomes[safe(name)]

    # the fasta file is named after the last part of the assembly's ftp path
    base_url = entry["ftp_path"].replace("ftp://", "https://")
    assembly_dir = base_url.split("/")[-1]
    link = f"{base_url}/{assembly_dir}_genomic.fna.gz"

    if not check_url(link, 2):
        raise GenomeDownloadError(
            f"Could not download genome {name} from {self.name}.\n"
            "URL is broken. Select another genome or provider.\n"
            f"Broken URL: {link}"
        )
    return link
def connect_ftp_link(link, timeout=None) -> Tuple[FTP, str]:
    """
    Anonymous login to ftp.

    Accepts link in the form of ftp://ftp.name.domain/... and ftp.name.domain/...
    A link without a path (ftp.name.domain) is accepted as well
    and results in an empty target.

    Parameters
    ----------
    link : str
        FTP link

    timeout : int, optional
        number of idle seconds before the connection closes

    Returns
    -------
    tuple
        ftp: FTP
            object with connection established
        target: str
            target file (the part of the link after the host, without leading slash)

    Raises
    ------
    GenomeDownloadError
        if the link contains no host, or the host cannot be resolved
    """
    address = link.replace("ftp://", "")
    # partition (unlike unpacking a split) does not fail on a link without a path
    host, _, target = address.partition("/")
    if not host:
        raise GenomeDownloadError(f"FTP host not found: {link}")

    try:
        ftp = FTP(host, timeout=timeout)
    except socket.gaierror as e:
        raise GenomeDownloadError(f"FTP host not found: {host}") from e

    try:
        ftp.login()
    except Exception:
        # do not leave the socket open if the anonymous login is refused
        ftp.close()
        raise
    return ftp, target
def get_genome_download_link(self, name, mask="soft", **kwargs):
    """
    Return UCSC http link to genome sequence

    Parameters
    ----------
    name : str
        Genome name. Current implementation will fail if exact
        name is not found.

    mask : str , optional
        Masking level. Options: soft, hard or none. Default is soft.
        "hard" selects the masked URL templates; any other value selects
        the soft masked genome, which can be unmasked in _post_process_download.

    Returns
    ------
    str with the http/ftp download link.

    Raises
    ------
    GenomeDownloadError
        if none of the candidate links passes check_url
    """
    templates = [self.ucsc_url, self.alt_ucsc_url]
    if mask == "hard":
        templates = [self.ucsc_url_masked, self.alt_ucsc_url_masked]

    # return the first working link, remember the broken ones for the error message
    broken = []
    for template in templates:
        link = template.format(name)
        if check_url(link, 2):
            return link
        broken.append(link)

    raise GenomeDownloadError(
        f"Could not download genome {name} from {self.name}.\n"
        "URLs are broken. Select another genome or provider.\n"
        f"Broken URLs: {', '.join(broken)}"
    )
def _lazy_provider_selection(name, provider=None):
    """
    Return the first provider which has genome NAME.

    Providers are obtained from _providers(provider) and tried in order.

    Raises
    ------
    GenomeDownloadError
        if none of the providers has the genome
    """
    providers = _providers(provider)
    for candidate in providers:
        if name in candidate.genomes:
            return candidate

    searched = ", ".join([p.name for p in providers])
    raise GenomeDownloadError(f"{name} not found on {searched}.")
def _lazy_provider_selection(name, provider=None):
    """
    Return the first provider which has genome NAME.

    Providers are obtained from _providers(provider) and tried in order.
    The provider named "URL" is also selected if check_url(name),
    called through try_except_pass with ValueError, gives a truthy result.

    Raises
    ------
    GenomeDownloadError
        if none of the providers matches
    """
    providers = _providers(provider)
    for candidate in providers:
        if name in candidate.genomes:
            return candidate
        # only reached (and only checked online) if the name is not a known genome
        if candidate.name == "URL" and try_except_pass(ValueError, check_url, name):
            return candidate

    searched = ", ".join([p.name for p in providers])
    raise GenomeDownloadError(f"{name} not found on {searched}.")
def _check_name(self, name):
    """
    Check if the genome name can be found for this provider.

    Returns
    -------
    str
        the name after conversion with safe()

    Raises
    ------
    GenomeDownloadError
        if the converted name is not in self.genomes
    """
    safe_name = safe(name)
    if safe_name not in self.genomes:
        raise GenomeDownloadError(
            f"Could not download genome {safe_name} from {self.name}.\n\n"
            "Check for typos or try\n"
            f"  genomepy search {safe_name} -p {self.name}"
        )
    return safe_name
def _lazy_provider_selection(name, provider=None):
    """
    Return the first provider which has genome NAME.

    Providers come from online_providers(provider) and are tried in order.
    A provider is selected if NAME is in its genomes, if it is the "URL"
    provider and check_url(name) (called through try_except_pass with
    ValueError) is truthy, or if it is the "Local" provider and
    cleanpath(name) exists.

    Raises
    ------
    GenomeDownloadError
        if none of the providers matches. The message lists the providers tried.
    """
    tried = []
    for candidate in online_providers(provider):
        tried.append(candidate.name)
        if (
            name in candidate.genomes
            or (
                candidate.name == "URL"
                and try_except_pass(ValueError, check_url, name)
            )
            or (candidate.name == "Local" and os.path.exists(cleanpath(name)))
        ):
            return candidate

    raise GenomeDownloadError(f"{name} not found on {', '.join(tried)}.")
def connect_ftp_link(link, timeout=None):
    """
    Anonymous login to ftp.

    Accepts link in the form of ftp://ftp.name.domain/... and ftp.name.domain/...

    Parameters
    ----------
    link : str
        FTP link

    timeout : int, optional
        passed to FTP() as its timeout

    Returns
    -------
    tuple
        ftp: FTP object with connection established
        target: str, everything in the link after the host
        (including the leading slash, empty if the link has no path)

    Raises
    ------
    GenomeDownloadError
        if the link contains no host, or the host cannot be resolved
    """
    address = link.replace("ftp://", "")
    host = address.split("/")[0]
    if not host:
        raise GenomeDownloadError(f"FTP host not found: {link}")
    # slice the host off instead of splitting on it: splitting breaks the
    # target in pieces when the host name occurs again in the path
    target = address[len(host):]

    try:
        ftp = FTP(host, timeout=timeout)
    except socket.gaierror as e:
        raise GenomeDownloadError(f"FTP host not found: {host}") from e

    try:
        ftp.login()
    except Exception:
        # do not leave the socket open if the anonymous login is refused
        ftp.close()
        raise
    return ftp, target
def download_annotation(self, name, genomes_dir=None, localname=None, **kwargs):
    """
    Download annotation file to a specific directory

    Parameters
    ----------
    name : str
        Genome / species name

    genomes_dir : str , optional
        Directory to install annotation

    localname : str , optional
        Custom name for your genome

    **kwargs : dict , optional
        passed on to get_annotation_download_link

    Raises
    ------
    GenomeDownloadError
        if downloading the assembly report or the annotation fails.
        The original exception is attached as the cause.
    """
    name = self._check_name(name)
    link = self.get_annotation_download_link(name, **kwargs)

    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    logger.info(f"Downloading annotation from {self.name}. Target URL: {link}...")
    try:
        # download exact assembly report to rename the scaffolds
        acc = self.assembly_accession(name)
        fname = os.path.join(genomes_dir, localname, "assembly_report.txt")
        download_assembly_report(acc, fname)

        download_annotation(genomes_dir, link, localname)
        logger.info("Annotation download successful")
    except Exception as e:
        # e.args may be empty (e.g. a bare `raise ValueError()`): indexing it
        # blindly would replace this message with an IndexError
        reason = e.args[0] if e.args else repr(e)
        raise GenomeDownloadError(
            f"An error occured while installing the gene annotation for {name} from {self.name}.\n"
            "If you think the annotation should be there, please file a bug report at: "
            "https://github.com/vanheeringen-lab/genomepy/issues\n\n"
            f"Error: {reason}"
        ) from e

    # Add annotation URL to readme
    readme = os.path.join(genomes_dir, localname, "README.txt")
    update_readme(readme, updated_metadata={"annotation url": link})
def get_annotation_download_link(self, name: str, **kwargs) -> str:
    """
    Return an available annotation type.

    Without ucsc_annotation_type, the first available type is returned.
    With it, the available type matching it (case insensitive) is returned.

    Parameters
    ----------
    name : str
        genome name
    **kwargs: dict, optional:
        ucsc_annotation_type : specific annotation type to download.

    Returns
    -------
    str
        annotation type, spelled as in annotation_links

    Raises
    ------
    GenomeDownloadError
        if no annotation was found for the genome
    FileNotFoundError
        if the specified annotation type is unavailable
    """
    available = self.annotation_links(name)
    if not available:
        raise GenomeDownloadError(
            f"No gene annotations found for {name} on {self.name}.\n"
            "Check for typos or try\n"
            f"  genomepy search {name} -p {self.name}"
        )

    requested = kwargs.get("ucsc_annotation_type")
    if not requested:
        return available[0]

    # not all types are available for each genome
    wanted = requested.lower()
    matches = [annot for annot in available if annot.lower() == wanted]
    if not matches:
        raise FileNotFoundError(
            f"{requested} is not available for {name}. "
            f"Options: {', '.join(available)}.\n"
        )
    return matches[0]
def get_annotation_download_link(self, name: str, **kwargs) -> str:
    """
    Return a filepath to a matching annotation.

    A user supplied path (path_to_annotation) takes precedence;
    without one, the first path from get_annotation_download_links is used.

    Parameters
    ----------
    name : str
        genome name
    **kwargs: dict, optional:
        path_to_annotation : direct path to the gene annotation

    Returns
    -------
    str
        path

    Raises
    ------
    GenomeDownloadError
        if no functional path was found
    FileNotFoundError
        if path_to_annotation does not exist
    TypeError
        if path_to_annotation is not a gtf, gff, gff3 or bed file
    """
    given_path = kwargs.get("path_to_annotation")
    if not given_path:
        found = self.get_annotation_download_links(name)
        if found:
            return found[0]
        raise GenomeDownloadError(
            f"No gene annotations found for {get_genomename(name)}.\n"
        )

    # the user supplied path must exist and have a supported extension
    annotation_path = cleanpath(given_path)
    if not os.path.exists(annotation_path):
        raise FileNotFoundError(
            f"Local path to annotation does not exist: {annotation_path}"
        )

    extension = get_file_info(annotation_path)[0]
    if extension not in [".gtf", ".gff", ".gff3", ".bed"]:
        raise TypeError("Only (gzipped) gtf, gff and bed files are supported.\n")
    return annotation_path
def download_annotation(self, name, genomes_dir=None, localname=None, **kwargs):
    """
    Download the UCSC genePred via their MySQL database, and convert to annotations.

    Parameters
    ----------
    name : str
        Genome name

    genomes_dir : str , optional
        Directory to install annotation

    localname : str , optional
        Custom name for your genome

    **kwargs : dict , optional
        passed on to get_annotation_download_link

    Raises
    ------
    GenomeDownloadError
        if downloading the annotation fails.
        The original exception is attached as the cause.
    """
    name = self._check_name(name)
    annot = self.get_annotation_download_link(name, **kwargs)

    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    logger.info(f"Downloading the {annot} annotation from the UCSC MySQL database.")
    try:
        download_annotation(name, annot, genomes_dir, localname)
        logger.info("Annotation download successful")
    except Exception as e:
        # e.args may be empty (e.g. a bare `raise ValueError()`): indexing it
        # blindly would replace this message with an IndexError
        reason = e.args[0] if e.args else repr(e)
        raise GenomeDownloadError(
            f"An error occured while installing the gene annotation for {name} from {self.name}.\n"
            "If you think the annotation should be there, please file a bug report at: "
            "https://github.com/vanheeringen-lab/genomepy/issues\n\n"
            f"Error: {reason}"
        ) from e

    # Add annotation URL to readme
    readme = os.path.join(genomes_dir, localname, "README.txt")
    update_readme(
        readme,
        updated_metadata={
            "annotation url": f"UCSC MySQL database: {name}, table: {annot}"
        },
    )
def get_genome_download_link(self, name, mask="soft", **kwargs):
    """
    Return Ensembl http or ftp link to the genome sequence

    The (much smaller) primary assembly is preferred. The toplevel assembly
    is used if requested, or if the primary assembly link does not work.

    Parameters
    ----------
    name : str
        Genome name. Current implementation will fail if exact
        name is not found.

    mask : str , optional
        Masking level. Options: soft, hard or none. Default is soft.

    **kwargs : dict , optional
        version : Ensembl release version (default: result of get_version)
        toplevel : if truthy, skip the primary assembly

    Returns
    ------
    str with the http/ftp download link.

    Raises
    ------
    NotImplementedError
        if the genome belongs to the bacteria division
    GenomeDownloadError
        if the final link does not pass check_url
    """
    genome = self.genomes[safe(name)]

    # parse the division
    division = genome["division"].lower().replace("ensembl", "")
    if division == "bacteria":
        raise NotImplementedError("bacteria from ensembl not yet supported")
    is_vertebrate = division == "vertebrates"

    ftp_site = "ftp://ftp.ensemblgenomes.org/pub"
    if is_vertebrate:
        ftp_site = "ftp://ftp.ensembl.org/pub"

    # Ensembl release version
    version = kwargs.get("version")
    if version is None:
        version = self.get_version(self.rest_url, is_vertebrate)

    # division dependent url format: only non-vertebrates have a division directory
    species_dir = genome["url_name"].lower()
    ftp_dir = f"release-{version}/fasta/{species_dir}/dna"
    if not is_vertebrate:
        ftp_dir = f"{division}/{ftp_dir}"
    url = f"{ftp_site}/{ftp_dir}"

    def build_link(level="toplevel"):
        """link to the fasta with the requested masking and assembly level"""
        masks = {"soft": "dna_sm.{}", "hard": "dna_rm.{}", "none": "dna.{}"}
        pattern = masks[mask].format(level)
        species = genome["url_name"].capitalize()
        # the patch version is not part of the file name
        assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))
        return f"{url}/{species}.{assembly}.{pattern}.fa.gz"

    # try to get the (much smaller) primary assembly,
    # unless specified otherwise
    link = build_link("primary_assembly")
    if kwargs.get("toplevel") or not check_url(link, 2):
        link = build_link()

    if not check_url(link, 2):
        raise GenomeDownloadError(
            f"Could not download genome {name} from {self.name}.\n"
            "URL is broken. Select another genome or provider.\n"
            f"Broken URL: {link}"
        )
    return link
def get_genome_download_link(self, name, mask="soft", **kwargs):
    """
    Return http link to the genome sequence

    The primary assembly is tried first, with the toplevel assembly as
    fallback, unless the toplevel assembly was requested directly.

    Parameters
    ----------
    name : str
        Genome name. Current implementation will fail if exact
        name is not found.

    mask : str , optional
        Masking level. Options: soft, hard or none. Default is soft.

    **kwargs : dict , optional
        version : passed on to get_version
        toplevel : if truthy, only the toplevel assembly is tried

    Returns
    ------
    str with the http download link.

    Raises
    ------
    GenomeDownloadError
        if none of the links tried passes check_url
    """
    genome = self.genomes[safe(name)]
    division, is_vertebrate = get_division(genome)

    # base directory of the genome
    host = "http://ftp.ensembl.org" if is_vertebrate else "http://ftp.ensemblgenomes.org"
    version = self.get_version(is_vertebrate, kwargs.get("version"))
    division_dir = "" if is_vertebrate else f"/{division}"

    def fasta_directory(species):
        """directory with the dna fasta files of a species"""
        return f"{host}/pub/release-{version}{division_dir}/fasta/{species}/dna"

    species_name = genome["url_name"].lower()
    directory = fasta_directory(species_name)
    # some entries don't use url_name in their url... -,-
    # examples:
    #   - EnsemblVertebrates: mus_musculus_nzohlltj
    #   - EnsemblMetazoa: caenorhabditis_elegans
    if not check_url(directory, 2):
        species_name = genome["name"]
        directory = fasta_directory(species_name)
    # this assembly has its own directory
    if name == "GRCh37":
        directory = genome["genome"].format(version)

    # specific fasta file
    file_species = species_name.capitalize()
    # the patch version is not part of the file name
    assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))
    mask_tag = {"soft": "_sm", "hard": "_rm", "none": ""}[mask]
    level = "toplevel" if kwargs.get("toplevel") else "primary_assembly"
    # only releases up to 30 carry the version in the file name
    release_tag = f".{version}" if int(version) <= 30 else ""
    fasta = f"{file_species}.{assembly}{release_tag}.dna{mask_tag}.{level}.fa.gz"

    # combine
    link = f"{directory}/{fasta}"
    if check_url(link, 2):
        return link

    # primary assemblies do not always exist
    if level == "primary_assembly":
        link = link.replace("primary_assembly", "toplevel")
        if check_url(link, 2):
            return link

    raise GenomeDownloadError(
        f"Could not download genome {name} from {self.name}.\n"
        "URL is broken. Select another genome or provider.\n"
        f"Broken URL: {link}"
    )