def list_available_genomes(provider=None):
    """
    Iterate over the genomes offered by one or all providers.

    Parameters
    ----------
    provider : str, optional
        Name of a single provider to query. When omitted, every provider
        reported by ``ProviderBase.list_providers()`` is queried.

    Yields
    ------
    list
        The provider name followed by the fields of one row returned by
        that provider's ``list_available_genomes()``.
    """
    # Resolve the provider names first, then instantiate all of them
    # before the first row is produced.
    provider_names = [provider] if provider else ProviderBase.list_providers()
    sources = [ProviderBase.create(entry) for entry in provider_names]

    for source in sources:
        for record in source.list_available_genomes():
            yield [source.name] + list(record)
def search(term, provider=None):
    """
    Search for a genome.

    If a provider is given only that provider is searched; otherwise all
    providers except the direct "url" provider are searched. The term is
    handed to each provider's own ``search`` method.

    Parameters
    ----------
    term : str
        Search term, case-insensitive.

    provider : str , optional
        Provider name

    Yields
    ------
    list
        genome information (provider name, name/identifier and description),
        each field encoded as latin-1
    """
    if provider:
        sources = [ProviderBase.create(provider)]
    else:
        # no provider requested: use every provider except the direct url one
        sources = [
            ProviderBase.create(candidate)
            for candidate in ProviderBase.list_providers()
            if candidate != "url"
        ]

    for source in sources:
        for hit in source.search(term):
            fields = [source.name] + list(hit)
            yield [field.encode("latin-1") for field in fields]
def search(term, provider=None):
    """
    Search for a genome.

    If a provider is given only that provider is searched; otherwise all
    providers are searched. The term is handed to each provider's own
    ``search`` method.

    Parameters
    ----------
    term : str
        Search term, case-insensitive.

    provider : str , optional
        Provider name

    Yields
    ------
    list
        genome information (provider name, name/identifier and description),
        each field encoded as latin-1
    """
    if provider:
        sources = [ProviderBase.create(provider)]
    else:
        # no provider requested: instantiate every registered provider
        sources = [
            ProviderBase.create(candidate)
            for candidate in ProviderBase.list_providers()
        ]

    for source in sources:
        for hit in source.search(term):
            fields = [source.name] + list(hit)
            yield [field.encode("latin-1") for field in fields]
def ensembl_genome_info(genome_name: str) -> Tuple[str, str, str]:
    """Return Ensembl genome information for a local genome managed by genomepy.

    Parameters
    ----------
    genome_name : str
        Name of local genome.

    Returns
    -------
    (str, str, str) or None
        Ensembl name, accession, taxonomy_id. ``None`` is returned when the
        genome is not installed locally, when the Ensembl search yields no
        result, or when the accession of the first Ensembl hit does not match
        the accession of the local genome.
    """
    # Fast lookup for some common queries
    common_names = {
        "danRer11": "GRCz11",
        "hg38": "GRCh38",
        "mm10": "GRCm38",
        "dm6": "BDGP6.28",
    }
    if genome_name in common_names:
        search_term = common_names[genome_name]
    else:
        try:
            genome = Genome(genome_name)
            search_term = genome.tax_id
        except FileNotFoundError:
            logger.info(f"Genome {genome_name} not installed locally")
            # Tell the user when Ensembl offers a genome with exactly this name.
            p = ProviderBase.create("Ensembl")
            for name, *_rest in p.search(genome_name):
                if name == genome_name:
                    logger.info(
                        f"It can be downloaded from Ensembl: genomepy install {name} Ensembl --annotation"
                    )
                    return None
            return None

    # search Ensembl by taxonomy_id or by specific Ensembl name (if we know it)
    p = ProviderBase.create("Ensembl")
    # Only the first hit is used. An empty search result must not crash with
    # an IndexError, so fall back to None and report "not found" instead.
    first_hit = next(iter(p.search(search_term)), None)
    if first_hit is None:
        print("Could not find a matching genome in Ensembl")
        return None
    name, accession, _species, tax_id, *_rest = first_hit

    # Check if the assembly_id of the current Ensembl genome is the same as the
    # local genome. If it is identical, we can correctly assume that the genomes
    # sequences are identical.
    # For the genomes in the lookup table, we already know they match.
    if genome_name in common_names or accession == genome.assembly_accession:
        return name, accession, tax_id

    print("Could not find a matching genome in Ensembl")
    return None
def install_genome(name, provider, version=None, genome_dir=None, localname=None, mask="soft", regex=None, invert_match=False, annotation=False):
    """
    Install a genome.

    Downloads the genome from the given provider, optionally downloads the
    annotation, and then runs ``after_genome_download`` of every active plugin.

    Parameters
    ----------
    name : str
        Genome name

    provider : str
        Provider name

    version : str
        Version (only for Ensembl)

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the ``genome_dir``
        configuration value when not given.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', specify 'hard' for hard masking.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome directory is passed and none is configured.
    """
    # An explicit argument wins; otherwise use the configured directory.
    genome_dir = genome_dir or config.get("genome_dir", None)
    if not genome_dir:
        raise norns.exceptions.ConfigError("Please provide or configure a genome_dir")
    genome_dir = os.path.expanduser(genome_dir)

    # Download genome from provider; the provider returns the name to use
    # for everything that follows.
    source = ProviderBase.create(provider)
    download_options = dict(
        version=version,
        mask=mask,
        localname=localname,
        regex=regex,
        invert_match=invert_match,
    )
    name = source.download_genome(name, genome_dir, **download_options)

    if annotation:
        # Download annotation from provider
        source.download_annotation(name, genome_dir, version=version)

    installed = Genome(name, genome_dir=genome_dir)
    for active_plugin in get_active_plugins():
        active_plugin.after_genome_download(installed)
def _online_providers():
    """
    Return a list of provider objects, one per provider that could be created.

    A provider whose creation raises ``ConnectionError`` is left out; the
    error text is written to stderr instead.
    """
    reachable = []
    for provider_name in ProviderBase.list_providers():
        try:
            instance = ProviderBase.create(provider_name)
        except ConnectionError as err:
            sys.stderr.write(str(err))
            continue
        reachable.append(instance)
    return reachable
def test__update_assembly_accession(genome="tests/data/small_genome.fa.gz"):
    """Check ``Genome._update_assembly_accession`` with and without provider data."""
    local_genome = genomepy.Genome(genome)

    # genome not found: without provider information the value becomes "na"
    unmatched = {}
    local_genome._update_assembly_accession(unmatched)
    assert unmatched["assembly_accession"] == "na"

    # genome found: the accession is taken from the NCBI record
    matched = {}
    ncbi = ProviderBase.create("NCBI")
    ncbi_record = ncbi.genomes.get("ASM14646v1")
    local_genome._update_assembly_accession(matched, ncbi, ncbi_record)
    assert matched["assembly_accession"] == "GCA_000146465.1"
def test__update_tax_id(genome="tests/data/small_genome.fa.gz"):
    """Check ``Genome._update_tax_id`` with and without provider data."""
    local_genome = genomepy.Genome(genome)

    # genome not found: without provider information the value becomes "na"
    unmatched = {}
    local_genome._update_tax_id(unmatched)
    assert unmatched["tax_id"] == "na"

    # genome found: the taxonomy id is taken from the NCBI record
    matched = {}
    ncbi = ProviderBase.create("NCBI")
    ncbi_record = ncbi.genomes.get("ASM14646v1")
    local_genome._update_tax_id(matched, ncbi, ncbi_record)
    assert matched["tax_id"] == "58839"
def list_available_genomes(provider=None):
    """
    Iterate over the genomes offered by one or all providers.

    Parameters
    ----------
    provider : str, optional
        Name of a single provider to query. When omitted, every provider
        reported by ``ProviderBase.list_providers()`` is queried.

    Yields
    ------
    list
        The provider name followed by the fields of one row returned by
        that provider's ``list_available_genomes()``.
    """
    if provider:
        sources = [ProviderBase.create(provider)]
    else:
        # no provider requested: instantiate every registered provider
        sources = [
            ProviderBase.create(candidate)
            for candidate in ProviderBase.list_providers()
        ]

    for source in sources:
        for record in source.list_available_genomes():
            entry = [source.name]
            entry.extend(record)
            yield entry
def ncbi_assembly_report(asm_acc: str) -> pd.DataFrame:
    """Retrieve the NCBI assembly report as a DataFrame.

    The accession is looked up through the NCBI provider to obtain the
    assembly name, which is needed to build the FTP location of the report.

    Parameters
    ----------
    asm_acc : str
        Assembly accession (GCA or GCF)

    Returns
    -------
    pandas.DataFrame
        NCBI assembly report.

    Raises
    ------
    IndexError
        If the NCBI search returns no genome for the accession.
    Exception
        If the NCBI search returns more than one genome for the accession.
    """
    p = ProviderBase.create("NCBI")
    ncbi_search = list(p.search(asm_acc))

    # Exactly one hit is required. An empty result used to surface as a bare
    # "list index out of range"; keep the exception type but explain the cause.
    if not ncbi_search:
        raise IndexError(f"No NCBI genome found for accession {asm_acc}")
    if len(ncbi_search) > 1:
        raise Exception("More than one genome for accession")

    # The directory and file names use underscores instead of spaces.
    ncbi_name = ncbi_search[0][0].replace(" ", "_")

    # NCBI FTP location of assembly report
    logger.info(f"Found NCBI assembly {asm_acc} with name {ncbi_name}")
    assembly_report = (
        f"ftp://ftp.ncbi.nlm.nih.gov/genomes/all/{asm_acc[0:3]}/"
        + f"{asm_acc[4:7]}/{asm_acc[7:10]}/{asm_acc[10:13]}/"
        + f"{asm_acc}_{ncbi_name}/{asm_acc}_{ncbi_name}_assembly_report.txt"
    )

    logger.info(f"Downloading {assembly_report}")
    header = [
        "Sequence-Name",
        "Sequence-Role",
        "Assigned-Molecule",
        "Assigned-Molecule-Location/Type",
        "GenBank-Accn",
        "Relationship",
        "RefSeq-Accn",
        "Assembly-Unit",
        "Sequence-Length",
        "UCSC-style-name",
    ]
    # Lines starting with "#" are skipped; column names are supplied here.
    asm_report = pd.read_csv(assembly_report, sep="\t", comment="#", names=header)
    return asm_report
def _update_metadata(self, metadata):
    """
    Check if there is missing info in ``metadata`` that can be updated.

    The provider is filled in first when it is absent or "na". The
    ``tax_id`` and ``assembly_accession`` entries are then each delegated to
    their own update method when missing. A provider object and genome record
    are only looked up when the provider is Ensembl, UCSC or NCBI, an
    original name is known and at least one of the two entries is missing;
    otherwise ``None`` is passed for both.
    """
    print("Updating metadata in README.txt", file=sys.stderr)
    if metadata.get("provider", "na") == "na":
        self._update_provider(metadata)

    known_provider = metadata["provider"] in ("Ensembl", "UCSC", "NCBI")
    original_name = safe(metadata.get("original name", ""))
    wanted_fields = ("tax_id", "assembly_accession")
    incomplete = not all(field in metadata for field in wanted_fields)

    source, record = None, None
    if known_provider and original_name and incomplete:
        source = ProviderBase.create(metadata["provider"])
        record = source.genomes.get(original_name)

    if "tax_id" not in metadata:
        self._update_tax_id(metadata, source, record)
    if "assembly_accession" not in metadata:
        self._update_assembly_accession(metadata, source, record)
def install_genome(name, provider, genome_dir=None, localname=None, mask="soft", regex=None, invert_match=False, bgzip=None, annotation=False, force=False, **kwargs):
    """
    Install a genome.

    Existing genome, annotation and gap files are reused unless ``force`` is
    set. After the downloads, all active plugins are run, the gap file is
    generated when needed and ``generate_env()`` is called.

    Parameters
    ----------
    name : str
        Genome name

    provider : str
        Provider name

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the ``genome_dir``
        configuration value when not given.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', choices 'hard'/'soft'/'none' for respective masking level.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    force : bool , optional
        Set to True to overwrite existing files.

    kwargs : dict, optional
        Provider specific options.

        Ensembl:

        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.

        version : int, optional
            Ensembl only: Specify release version. Default is latest.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome directory is passed and none is configured.
    """
    # An explicit argument wins; otherwise use the configured directory.
    genome_dir = genome_dir or config.get("genome_dir", None)
    if not genome_dir:
        raise norns.exceptions.ConfigError("Please provide or configure a genome_dir")

    genome_dir = os.path.expanduser(genome_dir)
    localname = get_localname(name, localname)
    out_dir = os.path.join(genome_dir, localname)

    # Check if genome already exists, or if downloading is forced
    genome_missing = all(
        not os.path.exists(candidate) for candidate in glob_ext_files(out_dir, "fa")
    )
    if genome_missing or force:
        # Download genome from provider
        genome_source = ProviderBase.create(provider)
        genome_source.download_genome(
            name,
            genome_dir,
            mask=mask,
            regex=regex,
            invert_match=invert_match,
            localname=localname,
            bgzip=bgzip,
            **kwargs
        )

    # If annotation is requested, check if annotation already exists,
    # or if downloading is forced
    annotation_missing = all(
        not os.path.exists(candidate) for candidate in glob_ext_files(out_dir, "gtf")
    )
    if annotation and (annotation_missing or force):
        # Download annotation from provider
        annotation_source = ProviderBase.create(provider)
        annotation_source.download_annotation(
            name, genome_dir, localname=localname, **kwargs
        )

    # generates a Fasta object and the index file
    installed = Genome(localname, genome_dir=genome_dir)

    # Run all active plugins
    for active_plugin in get_active_plugins():
        active_plugin.after_genome_download(installed, force)

    # Generate gap file if not found or if generation is forced
    gap_file = os.path.join(out_dir, f"{localname}.gaps.bed")
    gap_file_missing = not os.path.exists(gap_file)
    if gap_file_missing or force:
        fasta_file = glob_ext_files(out_dir, "fa")[0]
        generate_gap_bed(fasta_file, gap_file)

    generate_env()
def _providers(provider=None):
    """
    Return a list of provider objects.

    With a provider name the list holds just that provider; without one it
    is whatever ``_online_providers()`` returns.
    """
    if provider:
        return [ProviderBase.create(provider)]
    return _online_providers()
def install_genome(name, provider, version=None, genome_dir=None, localname=None, mask="soft", regex=None, invert_match=False, annotation=False):
    """
    Install a genome.

    Downloads the genome from the given provider, optionally downloads the
    annotation, then generates the chromosome sizes and the gap BED file.

    Parameters
    ----------
    name : str
        Genome name

    provider : str
        Provider name

    version : str
        Version (only for Ensembl)

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the ``genome_dir``
        configuration value when not given.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', specify 'hard' for hard masking.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome directory is passed and none is configured.
    """
    # An explicit argument wins; otherwise use the configured directory.
    genome_dir = genome_dir or config.get("genome_dir", None)
    if not genome_dir:
        raise norns.exceptions.ConfigError("Please provide or configure a genome_dir")
    genome_dir = os.path.expanduser(genome_dir)

    # Download genome from provider; the provider returns the name to use
    # for everything that follows.
    source = ProviderBase.create(provider)
    name = source.download_genome(
        name,
        genome_dir,
        version=version,
        mask=mask,
        localname=localname,
        regex=regex,
        invert_match=invert_match,
    )

    if annotation:
        # Download annotation from provider
        source.download_annotation(name, genome_dir, version=version)

    # Create chromosome sizes
    generate_sizes(name, genome_dir)

    # FASTA input and gap BED output both live in <genome_dir>/<name>/
    genome_path = os.path.join(genome_dir, name)
    fasta_file = os.path.join(genome_path, "{}.fa".format(name))
    gap_file = os.path.join(genome_path, "{}.gaps.bed".format(name))
    generate_gap_bed(fasta_file, gap_file)
def install_genome(
    name,
    provider,
    version=None,
    genome_dir=None,
    localname=None,
    mask="soft",
    regex=None,
    invert_match=False,
    annotation=False,
    bgzip=None,
):
    """
    Install a genome.

    Downloads the genome from the given provider, optionally downloads the
    annotation, runs ``after_genome_download`` of every active plugin and
    finally calls ``generate_env()``.

    Parameters
    ----------
    name : str
        Genome name

    provider : str
        Provider name

    version : str
        Version (only for Ensembl)

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the ``genome_dir``
        configuration value when not given.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', specify 'hard' for hard masking.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome directory is passed and none is configured.
    """
    # An explicit argument wins; otherwise use the configured directory.
    genome_dir = genome_dir or config.get("genome_dir", None)
    if not genome_dir:
        raise norns.exceptions.ConfigError("Please provide or configure a genome_dir")

    genome_dir = os.path.expanduser(genome_dir)
    localname = get_localname(name, localname)

    # Download genome from provider
    source = ProviderBase.create(provider)
    download_options = dict(
        version=version,
        mask=mask,
        localname=localname,
        regex=regex,
        invert_match=invert_match,
        bgzip=bgzip,
    )
    source.download_genome(name, genome_dir, **download_options)

    if annotation:
        # Download annotation from provider
        source.download_annotation(
            name, genome_dir, localname=localname, version=version
        )

    installed = Genome(localname, genome_dir=genome_dir)
    for active_plugin in get_active_plugins():
        active_plugin.after_genome_download(installed)

    generate_env()