def get_genome_download_link(self, name, mask="soft", **kwargs):
    """
    Build and test the NCBI link to the top-level genome sequence.

    The "ftp_path" entry of the genome is switched from ftp:// to https://
    and extended with "<last path component>_genomic.fna.gz".

    Parameters
    ----------
    name : str
        Genome name. It is passed through safe() and looked up in
        self.genomes, so the lookup fails if the exact name is not found.

    mask : str , optional
        Masking level. Options: soft, hard or none. Default is soft.
        The value is not used when building the link.

    Returns
    ------
    str with the https download link.

    Raises
    ------
    GenomeDownloadError
        If check_url() does not accept the constructed link.
    """
    # only soft masked genomes are available here;
    # they can be (un)masked while post-processing the download
    remote_dir = self.genomes[safe(name)]["ftp_path"].replace("ftp://", "https://")
    assembly_dir = remote_dir.split("/")[-1]
    link = f"{remote_dir}/{assembly_dir}_genomic.fna.gz"

    if not check_url(link, 2):
        raise GenomeDownloadError(
            f"Could not download genome {name} from {self.name}.\n"
            "URL is broken. Select another genome or provider.\n"
            f"Broken URL: {link}"
        )
    return link
def get_genome_download_link(self, name, mask="soft", **kwargs):
    """
    Return the first working UCSC link to the genome sequence.

    Two URL templates (primary and alternative) are filled in with the
    genome name and tested in order.

    Parameters
    ----------
    name : str
        Genome name, inserted into the URL templates as-is.

    mask : str , optional
        Masking level. Options: soft, hard or none. Default is soft.
        Only "hard" selects the masked templates; every other value uses
        the regular (soft masked) templates.

    Returns
    ------
    str with the http/ftp download link.

    Raises
    ------
    GenomeDownloadError
        If none of the candidate links passes check_url().
    """
    # soft masked genomes by default; unmasking happens after the download
    templates = [self.ucsc_url, self.alt_ucsc_url]
    if mask == "hard":
        templates = [self.ucsc_url_masked, self.alt_ucsc_url_masked]

    for template in templates:
        candidate = template.format(name)
        if check_url(candidate, 2):
            return candidate

    # nothing worked: report every link that was tried
    tried = ", ".join(template.format(name) for template in templates)
    raise GenomeDownloadError(
        f"Could not download genome {name} from {self.name}.\n"
        "URLs are broken. Select another genome or provider.\n"
        f"Broken URLs: {tried}"
    )
def get_annotation_download_link(self, name, **kwargs):
    """
    Parse and test the link to the UCSC annotation file.

    Without a requested type, the UCSC, Ensembl, NCBI RefSeq and UCSC RefSeq
    annotations are tried in that order and the first working link wins.
    More info on the annotation files on:
    https://genome.ucsc.edu/FAQ/FAQgenes.html#whatdo

    Parameters
    ----------
    name : str
        Genome name

    kwargs : dict , optional
        ucsc_annotation_type : str , optional
            One of "ucsc", "ensembl", "ncbi_refseq" or "ucsc_refseq"
            (case insensitive). Any other value raises a KeyError.

    Returns
    ------
    str with the download link, or None if no working link was found.
    """
    gtf_url = f"http://hgdownload.soe.ucsc.edu/goldenPath/{name}/bigZips/genes/"
    txt_url = f"http://hgdownload.cse.ucsc.edu/goldenPath/{name}/database/"
    annot_files = {
        "ucsc": "knownGene",
        "ensembl": "ensGene",
        "ncbi_refseq": "ncbiRefSeq",
        "ucsc_refseq": "refGene",
    }

    # prefer the gtf format; fall back to the txt format if there is no gtf dir
    if check_url(gtf_url, 2):
        prefix, suffix = gtf_url + name + ".", ".gtf.gz"
    else:
        prefix, suffix = txt_url, ".txt.gz"

    requested = kwargs.get("ucsc_annotation_type")
    if not requested:
        # no preference: take the first annotation type that is available
        for table in annot_files.values():
            candidate = prefix + table + suffix
            if check_url(candidate, 2):
                return candidate
        return None

    # a specific annotation type was requested
    candidate = prefix + annot_files[requested.lower()] + suffix
    if check_url(candidate, 2):
        return candidate

    sys.stderr.write(
        f"Specified annotation type ({requested}) not found for {name}.\n"
    )
    return None
def get_annotation_download_link(self, name, **kwargs):
    """
    Validate the user-supplied annotation link.

    Parameters
    ----------
    name : str
        Genome name (not used here).

    kwargs : dict , optional
        to_annotation : str , optional
            Link to the annotation file.

    Returns
    ------
    The link if it has a supported file type (gtf/gff/gff3/bed) and passes
    check_url(). None if no link was given or the link does not work.

    Raises
    ------
    TypeError
        If the extension reported by get_file_info() is not supported.
    """
    link = kwargs.get("to_annotation")
    if not link:
        return None

    supported = (".gtf", ".gff", ".gff3", ".bed")
    ext = get_file_info(link)[0]
    if ext not in supported:
        raise TypeError("Only (gzipped) gtf, gff and bed files are supported.\n")

    if check_url(link):
        return link
    return None
def _ftp_or_html_link(self, name, file_suffix, skip_check=False):
    """
    Build the link to an NCBI file, preferring HTTPS over FTP.

    NCBI's files are accessible over both FTP and HTTPS. The HTTPS variant
    is tried first and the first functioning link is returned.

    Parameters
    ----------
    name : str
        Genome name, looked up in self.genomes after passing through safe().

    file_suffix : str
        Text appended to the last component of the genome's "ftp_path"
        to form the file name.

    skip_check : bool , optional
        If True, return the HTTPS link without testing it. Default is False.

    Returns
    ------
    str with the first accepted link, or None if neither link works.
    """
    ftp_base = self.genomes[safe(name)]["ftp_path"]
    https_base = ftp_base.replace("ftp://", "https://")

    for base in (https_base, ftp_base):
        # the file is named after the directory that contains it
        candidate = base + "/" + base.rsplit("/", 1)[-1] + file_suffix
        if skip_check:
            return candidate
        if check_url(candidate, max_tries=2, timeout=10):
            return candidate
    return None
def get_annotation_download_link(self, name, **kwargs):
    """
    Parse and test the link to the NCBI annotation file.

    The genome's "ftp_path" is switched from ftp:// to https:// and extended
    with "<last path component>_genomic.gff.gz".

    Parameters
    ----------
    name : str
        Genome name, looked up in self.genomes after passing through safe().

    Returns
    ------
    str with the https link, or None if check_url() does not accept it.
    """
    remote_dir = self.genomes[safe(name)]["ftp_path"].replace("ftp://", "https://")
    assembly_dir = remote_dir.split("/")[-1]
    link = f"{remote_dir}/{assembly_dir}_genomic.gff.gz"

    return link if check_url(link, 2) else None
def get_annotation_download_link(self, name, **kwargs):
    """
    Parse and test the link to the Ensembl annotation (GTF) file.

    Parameters
    ----------
    name : str
        Genome name. A trailing patch suffix (".p<number>") is removed
        before the name is used in the file name.

    kwargs: dict , optional:
        Provider specific options.

        version : int , optional
            Ensembl version. If not given, self.get_version() is asked
            for the version to use.

    Returns
    ------
    str with the ftp link, or None if check_url() does not accept it.
    """
    genome = self.genomes[safe(name)]
    division = genome["division"].lower().replace("ensembl", "")
    is_vertebrate = division == "vertebrates"

    # Ensembl release version
    version = kwargs.get("version")
    if version is None:
        version = self.get_version(self.rest_url, is_vertebrate)

    # vertebrates sit directly under pub/ on their own server,
    # every other division has its own subdirectory
    if is_vertebrate:
        root = "ftp://ftp.ensembl.org/pub"
    else:
        root = f"ftp://ftp.ensemblgenomes.org/pub/{division}"

    # assemble the GTF URL
    species = genome["url_name"]
    assembly = re.sub(r"\.p\d+$", "", name)
    link = (
        f"{root}/release-{version}/gtf/{species.lower()}/"
        f"{species.capitalize()}.{assembly}.{version}.gtf.gz"
    )

    if check_url(link, 2):
        return link
    return None
def search_url_for_annotation(url):
    """
    Attempt to find a gtf or gff3 file in the same location as the genome url.

    The directory part of the url is opened and scanned line by line for
    the first line mentioning a ".gtf" file or a "<name>.gff" file, where
    <name> is get_localname(url). The file name is then cut out of that line.
    All comparisons are case insensitive.

    Parameters
    ----------
    url : str
        Link to the genome file.

    Returns
    ------
    str with the link to the annotation file,
    or None if check_url() does not accept the constructed link.

    Raises
    ------
    FileNotFoundError
        If no annotation file name could be found in the remote directory
        (including when the directory listing is empty).
    """
    urldir = os.path.dirname(url)
    sys.stderr.write(
        "You have requested gene annotation to be downloaded.\n"
        "Genomepy will check the remote directory:\n"
        f"{urldir} for annotation files...\n"
    )

    # Candidates are lowercased before comparing, so the genome name must be
    # lowercased as well; otherwise names with capitals never match a gff file.
    name = get_localname(url)
    gff_stem = name.lower() + ".gff"
    line_markers = (".gtf", gff_stem)
    file_endings = (
        ".gtf",
        ".gtf.gz",
        gff_stem,
        gff_stem + ".gz",
        gff_stem + "3",
        gff_stem + "3.gz",
    )

    # try to find a line mentioning a GTF or GFF3 file
    urlstr = ""  # stays empty if the listing has no lines at all
    with urlopen(urldir) as f:
        for urlline in f.readlines():
            # str() of the raw line is sufficient for substring matching
            urlstr = str(urlline)
            lowered = urlstr.lower()
            if any(marker in lowered for marker in line_markers):
                break

    # retrieve the filename from the HTML line
    fname = ""
    for split in re.split('>|<|><|/|"', urlstr):
        if split.lower().endswith(file_endings):
            fname = split
            break
    else:
        # the loop ended without a break: no piece looked like an annotation file
        raise FileNotFoundError(
            "Could not parse the remote directory. "
            "Please supply a URL using --url-to-annotation.\n"
        )

    # construct and test the download link
    link = urldir + "/" + fname
    if check_url(link):
        return link
    return None
def get_genome_download_link(self, name, mask="soft", **kwargs):
    """
    Return the Ensembl ftp link to the genome sequence.

    The (much smaller) primary assembly is preferred. The toplevel assembly
    is used if the primary assembly link does not work, or if toplevel is
    requested explicitly. Each candidate link is tested only once.

    Parameters
    ----------
    name : str
        Genome name. It is passed through safe() and looked up in
        self.genomes, so the lookup fails if the exact name is not found.

    mask : str , optional
        Masking level. Options: soft, hard or none. Default is soft.
        Any other value raises a KeyError.

    kwargs : dict , optional
        version : int , optional
            Ensembl version. If not given, self.get_version() is asked
            for the version to use.

        toplevel : bool , optional
            If truthy, skip the primary assembly and use the toplevel one.

    Returns
    ------
    str with the http/ftp download link.

    Raises
    ------
    NotImplementedError
        If the genome belongs to the bacteria division.

    GenomeDownloadError
        If no candidate link passes check_url().
    """
    genome = self.genomes[safe(name)]

    # parse the division
    division = genome["division"].lower().replace("ensembl", "")
    if division == "bacteria":
        raise NotImplementedError("bacteria from ensembl not yet supported")
    is_vertebrate = division == "vertebrates"

    # Ensembl release version
    version = kwargs.get("version")
    if version is None:
        version = self.get_version(self.rest_url, is_vertebrate)

    # division dependent url format: vertebrates have their own server
    # without a division subdirectory
    species = genome["url_name"]
    if is_vertebrate:
        ftp_root = "ftp://ftp.ensembl.org/pub"
    else:
        ftp_root = f"ftp://ftp.ensemblgenomes.org/pub/{division}"
    url = f"{ftp_root}/release-{version}/fasta/{species.lower()}/dna"

    # masking level and assembly name (without a trailing patch suffix)
    masks = {"soft": "dna_sm", "hard": "dna_rm", "none": "dna"}
    assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))
    file_stem = f"{url}/{species.capitalize()}.{assembly}.{masks[mask]}"

    # try to get the (much smaller) primary assembly, unless specified otherwise
    if not kwargs.get("toplevel"):
        link = f"{file_stem}.primary_assembly.fa.gz"
        if check_url(link, 2):
            return link

    link = f"{file_stem}.toplevel.fa.gz"
    if check_url(link, 2):
        return link

    raise GenomeDownloadError(
        f"Could not download genome {name} from {self.name}.\n"
        "URL is broken. Select another genome or provider.\n"
        f"Broken URL: {link}"
    )
def provider_status(self, url, max_tries=1):
    """
    Check if the provider is online.

    NOTE(review): the previous docstring stated that results are stored for
    10 minutes. No caching happens in this body, so it is presumably added
    by a decorator outside this view; verify.

    Parameters
    ----------
    url : str
        Link used to test the provider.

    max_tries : int , optional
        Passed on to check_url(). Default is 1.

    Raises
    ------
    ConnectionError
        If check_url() does not accept the url.
    """
    online = check_url(url, max_tries)
    if online:
        return
    raise ConnectionError(f"{self.name} appears to be offline.\n")