def _as_seqdict_genome_regions(regions, minsize=None):
    """
    Accepts list of regions where the genome is encoded in the region,
    using the genome@chrom:start-end format.

    The regions are grouped per genome, extracted with ``track2fasta()`` into
    a single temporary FASTA file (sequence names are prefixed with
    ``genome@`` again), and read back with ``as_seqdict()``. The temporary
    file is removed before returning.

    Parameters
    ----------
    regions : list of str
        Regions formatted as ``genome@chrom:start-end``.
    minsize : int, optional
        Forwarded, together with the result, to ``_check_minsize()``.

    Returns
    -------
    The value returned by ``_check_minsize()`` for a dict that maps every
    input region to its sequence, in the same order as ``regions``.

    Raises
    ------
    ValueError
        If a region does not contain exactly one "@" (from the unpacking of
        ``str.split``).
    """
    import os

    genomic_regions = {}
    for region in regions:
        genome, coords = region.split("@")
        genomic_regions.setdefault(genome, []).append(coords)

    # Instantiate every genome before any file is created: a genome that is
    # not installed fails here, and the objects are reused for the extraction
    # instead of being constructed a second time.
    genomes = {genome: Genome(genome) for genome in genomic_regions}

    tmpfa = NamedTemporaryFile(mode="w", delete=False)
    try:
        # Leaving the `with` block flushes and closes the handle; the file
        # itself stays on disk because of delete=False.
        with tmpfa:
            for genome, g_regions in genomic_regions.items():
                for seq in genomes[genome].track2fasta(g_regions):
                    seq.name = f"{genome}@{seq.name}"
                    print(repr(seq), file=tmpfa)

        # Open tempfile and restore original sequence order
        fa = as_seqdict(tmpfa.name)
        fa = {region: fa[region] for region in regions}
    finally:
        # NOTE(review): assumes the sequences collected above no longer
        # depend on the temporary file - confirm against as_seqdict().
        os.remove(tmpfa.name)

    return _check_minsize(fa, minsize)
def _genomepy_convert(to_convert, genome, minsize=None):
    """
    Convert a variety of inputs using track2fasta().

    Parameters
    ----------
    to_convert
        Input handed unchanged to ``Genome.track2fasta()``.
    genome : str
        Genome name used to instantiate ``Genome``. Required.
    minsize : int, optional
        Forwarded, together with the result, to ``_check_minsize()``.

    Returns
    -------
    The value returned by ``_check_minsize()`` for the sequences that
    ``as_seqdict()`` reads from the temporary FASTA file.

    Raises
    ------
    ValueError
        If ``genome`` is None.
    """
    if genome is None:
        raise ValueError("input file is not a FASTA file, need a genome!")

    g = Genome(genome)
    # The context manager closes (and thereby deletes) the temporary file
    # deterministically, also when track2fasta() or as_seqdict() raises,
    # instead of leaving the cleanup to garbage collection.
    with NamedTemporaryFile() as tmpfile:
        g.track2fasta(to_convert, tmpfile.name)
        fa = as_seqdict(tmpfile.name)
        # Checked inside the block so the file exists for as long as it did
        # in the previous implementation (until the function returns).
        return _check_minsize(fa, minsize)
def generate_exports(genomes_dir=None):
    """
    Build shell ``export`` statements for the installed genomes.

    Parameters
    ----------
    genomes_dir : str, optional
        Passed to ``list_installed_genomes()``.

    Returns
    -------
    list of str
        One ``export NAME=path`` string per genome, where NAME is the genome
        name with every run of non-word characters replaced by "_" and
        converted to upper case, and path is the ``filename`` attribute of
        the ``Genome`` object. Genomes that raise FastaIndexingError or
        FileNotFoundError while being loaded are left out.
    """
    exports = []
    for genome_name in list_installed_genomes(genomes_dir):
        try:
            fasta_path = Genome(genome_name).filename
        except (FastaIndexingError, FileNotFoundError):
            # best effort: a genome that cannot be loaded gets no export line
            continue

        variable = re.sub(r"[^\w]+", "_", genome_name).upper()
        exports.append(f"export {variable}={fasta_path}")
    return exports
def install_genome(
    name,
    provider=None,
    genomes_dir=None,
    localname=None,
    mask="soft",
    keep_alt=False,
    regex=None,
    invert_match=False,
    bgzip=None,
    annotation=False,
    only_annotation=False,
    skip_sanitizing=False,
    threads=1,
    force=False,
    **kwargs,
):
    """
    Download a genome, and optionally its gene annotation, from a provider.

    Parameters
    ----------
    name : str
        Genome name.

    provider : str , optional
        Provider name. Will try Ensembl, UCSC and NCBI (in that order)
        if not specified.

    genomes_dir : str , optional
        Directory in which the genome folder is stored.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Masking level: 'hard', 'soft' or 'none'. Default is 'soft'.

    keep_alt : bool , optional
        Some genomes contain alternative regions. These regions cause issues
        with sequence alignment, as they are inherently duplications of the
        consensus regions. Set to True to keep these alternative regions.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file is used.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    only_annotation : bool , optional
        If set to True, only download the annotation files.

    skip_sanitizing : bool , optional
        If set to True, downloaded annotation files whose sequence names do
        not match with the (first header fields of) the genome.fa will not be
        corrected. Setting this flag also requests the annotation download.

    threads : int , optional
        Handed to every active plugin (e.g. for multithreaded index building).

    force : bool , optional
        Set to True to overwrite existing files.

    kwargs : dict , optional
        Provider specific options.

        toplevel : bool , optional
            Ensembl only: always download the toplevel genome.
            Ignores potential primary assembly.

        version : int , optional
            Ensembl only: specify release version. Default is latest.

        to_annotation : text , optional
            URL only: direct link to annotation file.
            Required if this is not the same directory as the fasta.
    """
    name = safe(name)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)

    # Genome: download when it is absent (or forced), unless only the
    # annotation was asked for.
    genome_found = _is_genome_dir(out_dir)
    genome_needed = not genome_found or force
    if genome_needed and not only_annotation:
        downloader = _provider_selection(name, localname, genomes_dir, provider)
        downloader.download_genome(
            name,
            genomes_dir,
            mask=mask,
            keep_alt=keep_alt,
            regex=regex,
            invert_match=invert_match,
            localname=localname,
            bgzip=bgzip,
            **kwargs,
        )
        genome_found = True

        # Export installed genome(s)
        generate_env(genomes_dir=genomes_dir)

    # Load the genome; with force the sizes and gaps files are regenerated.
    genome = None
    if genome_found:
        genome = Genome(localname, genomes_dir=genomes_dir)
        if force:
            generate_fa_sizes(genome.genome_file, genome.sizes_file)
            generate_gap_bed(genome.genome_file, genome.gaps_file)

    # Annotation: any annotation-related flag or option counts as a request.
    annotation_requested = any(
        (
            annotation,
            only_annotation,
            skip_sanitizing,
            kwargs.get("to_annotation"),
            kwargs.get("ucsc_annotation_type"),
        )
    )
    has_gtf = bool(glob_ext_files(out_dir, "gtf"))
    if annotation_requested and (force or not has_gtf):
        downloader = _provider_selection(name, localname, genomes_dir, provider)
        downloader.download_annotation(
            name, genomes_dir, localname=localname, **kwargs
        )

    # Sanitizing needs both the genome and a GTF file, so look again after
    # the (possible) download.
    has_gtf = bool(glob_ext_files(out_dir, "gtf"))
    if genome_found and has_gtf and not skip_sanitizing:
        sanitize_annotation(genome)

    # Plugins operate on the genome: nothing left to do without one.
    if not genome_found:
        return

    for plugin in get_active_plugins():
        plugin.after_genome_download(genome, threads, force)
def install_genome(
    name: str,
    provider: Optional[str] = None,
    genomes_dir: Optional[str] = None,
    localname: Optional[str] = None,
    mask: Optional[str] = "soft",
    keep_alt: Optional[bool] = False,
    regex: Optional[str] = None,
    invert_match: Optional[bool] = False,
    bgzip: Optional[bool] = None,  # None -> check config. False -> dont check.
    annotation: Optional[bool] = False,
    only_annotation: Optional[bool] = False,
    skip_matching: Optional[bool] = False,
    skip_filter: Optional[bool] = False,
    threads: Optional[int] = 1,
    force: Optional[bool] = False,
    **kwargs: Optional[dict],
) -> Genome:
    """
    Download and set up a genome, optionally with its gene annotation.

    Parameters
    ----------
    name : str
        Genome name.

    provider : str , optional
        Provider name. Will try Ensembl, UCSC and NCBI (in that order)
        if not specified.

    genomes_dir : str , optional
        Where to create the output folder.

    localname : str , optional
        Custom name for this genome.

    mask : str , optional
        Genome masking of repetitive sequences.
        Options: hard/soft/none, default is soft.

    keep_alt : bool , optional
        Some genomes contain alternative regions. These regions cause issues
        with sequence alignment, as they are inherently duplications of the
        consensus regions. Set to True to keep these alternative regions.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that *don't* match the regex.

    bgzip : bool , optional
        If True, the files downloaded by this call are compressed: the genome
        FASTA with bgzip and the gene annotation with gzip. If None, the
        "bgzip" entry of the configuration decides.

    annotation : bool , optional
        If set to True, download gene annotation in BED and GTF format.

    only_annotation : bool , optional
        If set to True, only download the gene annotation files.

    skip_matching : bool , optional
        If set to True, contigs in the annotation not matching those in the
        genome will not be corrected. Also requests the annotation download.

    skip_filter : bool , optional
        If set to True, the gene annotations will not be filtered to match
        the genome contigs. Also requests the annotation download.

    threads : int , optional
        Handed to every active plugin (e.g. for multithreaded index building).

    force : bool , optional
        Set to True to overwrite existing files.

    kwargs : dict , optional
        Provider specific options.

        toplevel : bool , optional
            Ensembl only: always download the toplevel genome.
            Ignores potential primary assembly.

        version : int , optional
            Ensembl only: specify release version. Default is latest.

        to_annotation : text , optional
            URL only: direct link to annotation file.
            Required if this is not the same directory as the fasta.

    Returns
    -------
    Genome
        Genome object of the installed genome. None when no genome directory
        exists and no genome was downloaded in this call.
    """

    def _annotation_files_present():
        # Both the GTF and the BED file have to be there.
        return bool(glob_ext_files(out_dir, "annotation.gtf")) and bool(
            glob_ext_files(out_dir, "annotation.bed")
        )

    name = safe(name)
    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)
    genome_file = os.path.join(out_dir, f"{localname}.fa")
    provider = _provider_selection(name, localname, genomes_dir, provider)

    # Decide up front what has to be downloaded.
    genome_found = _is_genome_dir(out_dir)
    download_genome = only_annotation is False and (
        genome_found is False or force is True
    )

    annotation_found = _annotation_files_present()
    annotation_requested = any(
        [
            annotation,
            only_annotation,
            skip_matching,
            skip_filter,
            kwargs.get("to_annotation"),
            kwargs.get("path_to_annotation"),
            kwargs.get("ucsc_annotation_type"),
        ]
    )
    download_annotation = annotation_requested and (
        annotation_found is False or force is True
    )

    # Genome
    genome = None
    genome_downloaded = False
    if download_genome:
        if force:
            _delete_extensions(out_dir, ["fa", "fai"])
        provider.download_genome(
            name,
            genomes_dir,
            mask=mask,
            localname=localname,
            **kwargs,
        )
        genome_found = genome_downloaded = True

        # Filter genome
        _filter_genome(genome_file, regex, invert_match, keep_alt)

        # Generates a Fasta object and the genome index, gaps and sizes files
        genome = Genome(localname, genomes_dir=genomes_dir)

        # Fetch the NCBI assembly report when an accession is known and the
        # report is not on disk yet.
        report_path = os.path.join(out_dir, "assembly_report.txt")
        accession = genome.assembly_accession
        if accession != "na" and not os.path.exists(report_path):
            download_assembly_report(accession, report_path)

        # Export installed genome(s)
        generate_env(genomes_dir=genomes_dir)

    # Gene annotation
    annotation_downloaded = False
    if download_annotation:
        if force:
            _delete_extensions(out_dir, ["annotation.gtf", "annotation.bed"])
        provider.download_annotation(name, genomes_dir, localname=localname, **kwargs)
        annotation_downloaded = _annotation_files_present()

    gene_annotation = None
    if annotation_downloaded:
        gene_annotation = Annotation(localname, genomes_dir=genomes_dir)
        # Sanitizing needs a genome and at least one of the two steps enabled.
        if genome_found and (not skip_matching or not skip_filter):
            gene_annotation.sanitize(not skip_matching, not skip_filter, True)

    # Run active plugins (also if the genome was downloaded earlier)
    if genome_found:
        if not genome:
            genome = Genome(localname, genomes_dir=genomes_dir)
        for plugin in get_active_plugins():
            plugin.after_genome_download(genome, threads, force)

    # Compress only what was downloaded in this call.
    compress = bgzip is True or (bgzip is None and config.get("bgzip"))
    if compress:
        if genome_downloaded:
            bgzip_and_name(genome.filename)
        if annotation_downloaded:
            gzip_and_name(gene_annotation.annotation_gtf_file)
            gzip_and_name(gene_annotation.annotation_bed_file)

    return genome