def install_genome(
    name,
    provider,
    version=None,
    genome_dir=None,
    localname=None,
    mask="soft",
    regex=None,
    invert_match=False,
    annotation=False,
):
    """
    Download a genome from a provider and run the active plugins on it.

    Parameters
    ----------
    name : str
        Genome name. It is replaced by the value returned from the
        provider's ``download_genome`` before any further use.

    provider : str
        Provider name, handed to ``ProviderBase.create``.

    version : str , optional
        Version (only for Ensembl).

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the ``genome_dir``
        configuration entry; ``~`` is expanded.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', specify 'hard' for hard masking.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome_dir is given and none is configured.
    """
    # Fall back to the configured directory only when none was passed in.
    genome_dir = genome_dir or config.get("genome_dir", None)
    if not genome_dir:
        raise norns.exceptions.ConfigError("Please provide or configure a genome_dir")
    genome_dir = os.path.expanduser(genome_dir)

    # Fetch the genome; the provider reports back the name it was stored under.
    source = ProviderBase.create(provider)
    download_options = dict(
        version=version,
        mask=mask,
        localname=localname,
        regex=regex,
        invert_match=invert_match,
    )
    name = source.download_genome(name, genome_dir, **download_options)

    # The annotation is optional and comes from the same provider object.
    if annotation:
        source.download_annotation(name, genome_dir, version=version)

    # Let every active plugin post-process the freshly installed genome.
    genome = Genome(name, genome_dir=genome_dir)
    for active_plugin in get_active_plugins():
        active_plugin.after_genome_download(genome)
def __init__(self, name, genome_dir=None):
    """
    Open a genome, either directly by path or by name inside a genome_dir.

    ``name`` is first handed to the parent class constructor unchanged. If
    that raises, the FASTA file is looked up instead:

    * if ``name`` is a directory holding exactly one FASTA file (according
      to ``glob_fa_files``) and ``genome_dir`` was given, that file is used;
    * otherwise the FASTA files of ``<genome_dir>/<name>`` are considered
      (a ".gz" in ``name`` is dropped for this lookup). With several
      candidates, ``<name>.fa`` or ``<name>.fa.gz`` must be among them.

    Parameters
    ----------
    name : str
        Path to a FASTA file, path to a genome directory, or genome name.

    genome_dir : str , optional
        Directory containing the installed genomes. Falls back to the
        ``genome_dir`` configuration entry; ``~`` is expanded.

    Raises
    ------
    norns.exceptions.ConfigError
        If a genome_dir is needed but neither given nor configured.
    FileNotFoundError
        If genome_dir does not exist or holds no FASTA file for ``name``.
    Exception
        If several FASTA files are found and none is named after ``name``.
    """
    try:
        super(Genome, self).__init__(name)
        self.name = os.path.basename(name)
    except Exception:
        # Scan the directory once and reuse the result: globbing a second
        # time is wasted work and could return a different (even empty)
        # list if the directory changes in between.
        fa_in_dir = glob_fa_files(name) if os.path.isdir(name) else []
        if len(fa_in_dir) == 1 and genome_dir is not None:
            fname = fa_in_dir[0]
            name = os.path.basename(fname)
        else:
            if not genome_dir:
                genome_dir = config.get("genome_dir", None)
            if not genome_dir:
                raise norns.exceptions.ConfigError(
                    "Please provide or configure a genome_dir"
                )

            genome_dir = os.path.expanduser(genome_dir)
            if not os.path.exists(genome_dir):
                raise FileNotFoundError(
                    "genome_dir {} does not exist".format(genome_dir)
                )

            fnames = glob_fa_files(
                os.path.join(genome_dir, name.replace(".gz", ""))
            )
            if not fnames:
                raise FileNotFoundError(
                    "no *.fa files found in genome_dir {}".format(
                        os.path.join(genome_dir, name)
                    )
                )
            elif len(fnames) > 1:
                # Several candidates: insist on <name>.fa, then <name>.fa.gz.
                fname = os.path.join(genome_dir, name, "{}.fa".format(name))
                if fname not in fnames:
                    fname += ".gz"
                if fname not in fnames:
                    raise Exception(
                        "More than one FASTA file found, no {}.fa!".format(name)
                    )
            else:
                fname = fnames[0]

        super(Genome, self).__init__(fname)
        self.name = name

    self._gap_sizes = None

    # Collect the properties every active plugin reports for this genome.
    self.props = {}
    for plugin in get_active_plugins():
        self.props[plugin.name()] = plugin.get_properties(self)
def __init__(self, name, genome_dir=None):
    """
    Open a genome, either directly by path or by name inside a genome_dir.

    ``name`` is first handed to the parent class constructor unchanged. If
    that raises, ``<genome_dir>/<name>/*.fa`` is searched instead. With
    several matches, ``<name>.fa`` must be among them.

    Parameters
    ----------
    name : str
        Path to a FASTA file, or the name of a genome in genome_dir.

    genome_dir : str , optional
        Directory containing the installed genomes. Falls back to the
        ``genome_dir`` configuration entry; ``~`` is expanded.

    Raises
    ------
    norns.exceptions.ConfigError
        If a genome_dir is needed but neither given nor configured.
    FileNotFoundError
        If genome_dir does not exist or holds no ``*.fa`` file for ``name``.
    Exception
        If several FASTA files are found and none is called ``<name>.fa``.
    """
    try:
        super(Genome, self).__init__(name)
        self.name = os.path.basename(name)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and
        # SystemExit must propagate instead of triggering the lookup below.
        if not genome_dir:
            genome_dir = config.get("genome_dir", None)
        if not genome_dir:
            raise norns.exceptions.ConfigError(
                "Please provide or configure a genome_dir"
            )

        genome_dir = os.path.expanduser(genome_dir)
        if not os.path.exists(genome_dir):
            raise FileNotFoundError(
                "genome_dir {} does not exist".format(genome_dir)
            )

        pattern = os.path.join(genome_dir, name, "*.fa")
        fnames = glob.glob(pattern)
        if not fnames:
            raise FileNotFoundError(
                "no *.fa files found in genome_dir {}".format(
                    os.path.join(genome_dir, name)
                )
            )
        elif len(fnames) > 1:
            # Several candidates: only <name>.fa is an acceptable choice.
            fname = os.path.join(genome_dir, name, "{}.fa".format(name))
            if fname not in fnames:
                raise Exception(
                    "More than one FASTA file found, no {}.fa!".format(name)
                )
        else:
            fname = fnames[0]

        super(Genome, self).__init__(fname)
        self.name = name

    self._gap_sizes = None

    # Collect the properties every active plugin reports for this genome.
    self.props = {}
    for plugin in get_active_plugins():
        self.props[plugin.name()] = plugin.get_properties(self)
def plugin(self):
    """
    Return the properties of every active plugin for this genome.

    Returns
    -------
    dict
        Maps each active plugin's ``name()`` to the value of its
        ``get_properties(self)``.
    """
    properties_by_name = {}
    for active in get_active_plugins():
        # Properties are fetched before the name, as in a plain
        # ``d[key()] = value()`` assignment.
        properties = active.get_properties(self)
        properties_by_name[active.name()] = properties
    return properties_by_name
def install_genome(
    name,
    provider,
    genome_dir=None,
    localname=None,
    mask="soft",
    regex=None,
    invert_match=False,
    bgzip=None,
    annotation=False,
    force=False,
    **kwargs
):
    """
    Install a genome, skipping downloads for files that already exist.

    Parameters
    ----------
    name : str
        Genome name

    provider : str
        Provider name

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the ``genome_dir``
        configuration entry; ``~`` is expanded.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', choices 'hard'/'soft'/'none' for respective
        masking level.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be
        used.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    force : bool , optional
        Set to True to overwrite existing files.

    kwargs : dict , optional
        Provider specific options, passed on to the provider's download
        methods.

        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores
            potential primary assembly.

        version : int , optional
            Ensembl only: Specify release version. Default is latest.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome_dir is given and none is configured.
    """
    # Fall back to the configured directory only when none was passed in.
    genome_dir = genome_dir or config.get("genome_dir", None)
    if not genome_dir:
        raise norns.exceptions.ConfigError("Please provide or configure a genome_dir")
    genome_dir = os.path.expanduser(genome_dir)

    localname = get_localname(name, localname)
    out_dir = os.path.join(genome_dir, localname)

    def _none_on_disk(extension):
        # True when no file with this extension exists in the output folder.
        candidates = glob_ext_files(out_dir, extension)
        return not any(os.path.exists(path) for path in candidates)

    # Genome: download when missing, or unconditionally when forced.
    genome_missing = _none_on_disk("fa")
    if genome_missing or force:
        genome_source = ProviderBase.create(provider)
        genome_source.download_genome(
            name,
            genome_dir,
            mask=mask,
            regex=regex,
            invert_match=invert_match,
            localname=localname,
            bgzip=bgzip,
            **kwargs
        )

    # Annotation: only on request, and then only when missing or forced.
    annotation_missing = _none_on_disk("gtf")
    if annotation and (annotation_missing or force):
        annotation_source = ProviderBase.create(provider)
        annotation_source.download_annotation(
            name, genome_dir, localname=localname, **kwargs
        )

    # Generates a Fasta object and the index file
    genome = Genome(localname, genome_dir=genome_dir)

    # Hand the genome to every active plugin.
    for active_plugin in get_active_plugins():
        active_plugin.after_genome_download(genome, force)

    # The gap file is (re)built when absent or when forced.
    gap_file = os.path.join(out_dir, localname + ".gaps.bed")
    gap_file_exists = os.path.exists(gap_file)
    if not gap_file_exists or force:
        fasta_path = glob_ext_files(out_dir, "fa")[0]
        generate_gap_bed(fasta_path, gap_file)

    generate_env()
def install_genome(
    name,
    provider=None,
    genomes_dir=None,
    localname=None,
    mask="soft",
    keep_alt=False,
    regex=None,
    invert_match=False,
    bgzip=None,
    annotation=False,
    only_annotation=False,
    skip_sanitizing=False,
    threads=1,
    force=False,
    **kwargs,
):
    """
    Install a genome and, optionally, its gene annotation.

    Files that are already present are not downloaded again unless
    ``force`` is set. When a genome is available, every active plugin is
    run on it at the end.

    Parameters
    ----------
    name : str
        Genome name

    provider : str , optional
        Provider name. will try Ensembl, UCSC and NCBI (in that order) if not specified.

    genomes_dir : str , optional
        Where to store the fasta files

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', choices 'hard'/'soft'/'none' for respective masking level.

    keep_alt : bool , optional
        Some genomes contain alternative regions. These regions cause issues with
        sequence alignment, as they are inherently duplications of the consensus regions.
        Set to true to keep these alternative regions.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.

    threads : int , optional
        Build genome index using multithreading (if supported). The value is
        passed to each active plugin's ``after_genome_download``. Default: 1

    force : bool , optional
        Set to True to overwrite existing files.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.
        Also treated as True when any of ``only_annotation``,
        ``skip_sanitizing``, ``to_annotation`` or ``ucsc_annotation_type``
        is given.

    only_annotation : bool , optional
        If set to True, only download the annotation files.

    skip_sanitizing : bool , optional
        If set to True, downloaded annotation files whose sequence names do not match
        with the (first header fields of) the genome.fa will not be corrected.

    kwargs : dict , optional
        Provider specific options, passed on to the provider's download
        methods.

        toplevel : bool , optional
            Ensembl only: Always download the toplevel genome. Ignores potential primary assembly.

        version : int , optional
            Ensembl only: Specify release version. Default is latest.

        to_annotation : text , optional
            URL only: direct link to annotation file.
            Required if this is not the same directory as the fasta.

    Returns
    -------
    None
    """
    name = safe(name)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)

    # Check if genome already exists, or if downloading is forced
    genome_found = _is_genome_dir(out_dir)
    if (not genome_found or force) and not only_annotation:
        # Download genome from provider
        p = _provider_selection(name, localname, genomes_dir, provider)
        p.download_genome(
            name,
            genomes_dir,
            mask=mask,
            keep_alt=keep_alt,
            regex=regex,
            invert_match=invert_match,
            localname=localname,
            bgzip=bgzip,
            **kwargs,
        )
        # From here on the genome counts as present, so the genome-dependent
        # steps below also run after a fresh download.
        genome_found = True

        # Export installed genome(s)
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; confirm this call belongs inside the download branch.
        generate_env(genomes_dir=genomes_dir)

    # Generates a Fasta object, index, gaps and sizes file
    # g stays None when no genome is available (e.g. only_annotation without
    # an installed genome); every later use of g is guarded by genome_found.
    g = None
    if genome_found:
        g = Genome(localname, genomes_dir=genomes_dir)
        if force:
            # overwrite previous versions
            generate_fa_sizes(g.genome_file, g.sizes_file)
            generate_gap_bed(g.genome_file, g.gaps_file)

    # Check if any annotation flags are given, if annotation already exists, or if downloading is forced
    # Any annotation-related flag or provider option implies annotation=True.
    if any(
        [
            annotation,
            only_annotation,
            skip_sanitizing,
            kwargs.get("to_annotation"),
            kwargs.get("ucsc_annotation_type"),
        ]
    ):
        annotation = True
    annotation_found = bool(glob_ext_files(out_dir, "gtf"))
    if (not annotation_found or force) and annotation:
        # Download annotation from provider
        p = _provider_selection(name, localname, genomes_dir, provider)
        p.download_annotation(name, genomes_dir, localname=localname, **kwargs)

        # Sanitize annotation if needed (requires genome)
        # Re-check the folder: the download above may or may not have
        # produced a gtf file.
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; confirm sanitizing only happens after a download.
        annotation_found = bool(glob_ext_files(out_dir, "gtf"))
        if genome_found and annotation_found and not skip_sanitizing:
            sanitize_annotation(g)

    if genome_found:
        # Run all active plugins (requires genome)
        for plugin in get_active_plugins():
            plugin.after_genome_download(g, threads, force)
def install_genome(
    name,
    provider,
    version=None,
    genome_dir=None,
    localname=None,
    mask="soft",
    regex=None,
    invert_match=False,
    annotation=False,
):
    """
    Download a genome, run the active plugins on it and regenerate the env.

    Parameters
    ----------
    name : str
        Genome name. It is replaced by the value returned from the
        provider's ``download_genome`` before any further use.

    provider : str
        Provider name, handed to ``ProviderBase.create``.

    version : str , optional
        Version (only for Ensembl).

    genome_dir : str , optional
        Where to store the fasta files. Falls back to the ``genome_dir``
        configuration entry; ``~`` is expanded.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Default is 'soft', specify 'hard' for hard masking.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    Raises
    ------
    norns.exceptions.ConfigError
        If no genome_dir is given and none is configured.
    """
    # Use the configured directory when the caller did not supply one.
    genome_dir = genome_dir or config.get("genome_dir", None)
    if not genome_dir:
        raise norns.exceptions.ConfigError("Please provide or configure a genome_dir")
    genome_dir = os.path.expanduser(genome_dir)

    # Fetch the genome; the provider reports back the name it was stored under.
    source = ProviderBase.create(provider)
    genome_options = {
        "version": version,
        "mask": mask,
        "localname": localname,
        "regex": regex,
        "invert_match": invert_match,
    }
    name = source.download_genome(name, genome_dir, **genome_options)

    # The annotation is optional and comes from the same provider object.
    if annotation:
        source.download_annotation(name, genome_dir, version=version)

    # Let every active plugin post-process the freshly installed genome.
    installed = Genome(name, genome_dir=genome_dir)
    for active_plugin in get_active_plugins():
        active_plugin.after_genome_download(installed)

    generate_env()