def after_genome_download(self, genome):
    """Build a GMAP index for a downloaded genome.

    Returns immediately when ``cmd_ok("gmap_build")`` is falsy. A genome
    whose filename ends in ``.gz`` is gunzipped for the duration of the
    build and compressed again with ``bgzip`` afterwards, also when the
    build itself raises.

    Parameters
    ----------
    genome : object
        Must provide ``props["gmap"]["index_dir"]``, ``name`` and
        ``filename``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``gunzip`` or ``bgzip`` exits with a non-zero status.
    """
    if not cmd_ok("gmap_build"):
        return

    # Create index dir
    index_dir = genome.props["gmap"]["index_dir"]
    mkdir_p(index_dir)

    # If the genome is bgzipped it needs to be unzipped first
    fname = genome.filename
    bgzip = fname.endswith(".gz")
    if bgzip:
        # check_call raises on a non-zero exit, so no return-code test is needed
        sp.check_call(["gunzip", fname])
        fname = fname[: -len(".gz")]

    try:
        # Index the file that exists on disk right now: after gunzip the
        # ".gz" path stored in genome.filename is gone.
        cmd = "gmap_build -D {} -d {} {}".format(index_dir, genome.name, fname)
        run_index_cmd("gmap", cmd)
    finally:
        # Restore the compressed genome even when indexing failed
        if bgzip:
            sp.check_call(["bgzip", fname])
def after_genome_download(self, genome, force=False):
    """Build a HISAT2 index for a downloaded genome.

    Returns immediately when ``cmd_ok("hisat2-build")`` is falsy, or when
    the index directory already contains a ``*.ht2`` file. A genome whose
    filename ends in ``.gz`` is gunzipped for the build and compressed
    again with ``bgzip`` afterwards, also when the build raises.

    Parameters
    ----------
    genome : object
        Must provide ``props["hisat2"]`` (``index_dir``, ``index_name``)
        and ``filename``.
    force : bool, optional
        Remove the index directory first so the index is rebuilt.

    Raises
    ------
    subprocess.CalledProcessError
        If ``gunzip`` or ``bgzip`` exits with a non-zero status.
    """
    if not cmd_ok("hisat2-build"):
        return

    # Create index dir
    index_dir = genome.props["hisat2"]["index_dir"]
    index_name = genome.props["hisat2"]["index_name"]
    if force:
        # Start from scratch
        rmtree(index_dir, ignore_errors=True)
    mkdir_p(index_dir)

    # Any *.ht2 file counts as an existing index
    if any(entry.endswith(".ht2") for entry in os.listdir(index_dir)):
        return

    # If the genome is bgzipped it needs to be unzipped first
    fname = genome.filename
    bgzip = fname.endswith(".gz")
    if bgzip:
        # check_call raises on a non-zero exit, so no return-code test is needed
        sp.check_call(["gunzip", fname])
        fname = fname[: -len(".gz")]

    try:
        # Create index
        cmd = "hisat2-build {} {}".format(fname, index_name)
        run_index_cmd("hisat2", cmd)
    finally:
        # Restore the compressed genome even when indexing failed
        if bgzip:
            sp.check_call(["bgzip", fname])
def genome(request, tempdir):
    """Yield a small test genome, plain or bgzipped as ``request.param`` asks.

    The source FASTA under ``tests/data`` is first brought into the
    requested compression state, copied into
    ``<tempdir>/<param>/dm3`` and wrapped in a ``Genome``. After the
    consumer is done, a plain source file is bgzipped again.
    """
    name = "dm3"  # Use fake name for blacklist test
    plain_fa = "tests/data/small_genome.fa"
    zipped_fa = plain_fa + ".gz"
    bgzipped = request.param == "bgzipped"

    # Input needs to be bgzipped, depending on param
    zipped_present = os.path.exists(zipped_fa)
    if zipped_present and not bgzipped:
        check_call(["gunzip", zipped_fa])
    elif bgzipped and not zipped_present:
        check_call(["bgzip", plain_fa])

    param_dir = os.path.join(tempdir, request.param)
    genome_dir = os.path.join(param_dir, name)
    mkdir_p(genome_dir)

    fafile = zipped_fa if bgzipped else plain_fa
    copyfile(fafile, os.path.join(genome_dir, os.path.basename(fafile)))

    for plugin in init_plugins():
        activate(plugin)

    # provide the fixture value
    yield Genome(name, genome_dir=param_dir)

    # teardown: put a plain source file back into its bgzipped state
    if not bgzipped and os.path.exists(fafile):
        check_call(["bgzip", fafile])
def after_genome_download(self, genome, force=False):
    """Build a STAR index for a downloaded genome.

    Returns immediately when ``cmd_ok("STAR")`` is falsy, or when the path
    in ``index_name`` already exists. A genome whose filename ends in
    ``.gz`` is gunzipped for the build and compressed again with ``bgzip``
    afterwards, also when the build raises.

    Parameters
    ----------
    genome : object
        Must provide ``props["star"]`` (``index_dir``, ``index_name``) and
        ``filename``.
    force : bool, optional
        Remove the index directory first so the index is rebuilt.

    Raises
    ------
    subprocess.CalledProcessError
        If ``gunzip`` or ``bgzip`` exits with a non-zero status.
    """
    if not cmd_ok("STAR"):
        return

    # Create index dir
    index_dir = genome.props["star"]["index_dir"]
    index_name = genome.props["star"]["index_name"]
    if force:
        # Start from scratch
        rmtree(index_dir, ignore_errors=True)
    mkdir_p(index_dir)

    if os.path.exists(index_name):
        return

    # If the genome is bgzipped it needs to be unzipped first
    fname = genome.filename
    bgzip = fname.endswith(".gz")
    if bgzip:
        # check_call raises on a non-zero exit, so no return-code test is needed
        sp.check_call(["gunzip", fname])
        fname = fname[: -len(".gz")]

    try:
        # Create index
        cmd = "STAR --runMode genomeGenerate --genomeFastaFiles {} --genomeDir {} --outFileNamePrefix {}".format(
            fname, index_dir, index_dir
        )
        run_index_cmd("star", cmd)
    finally:
        # Rezip genome if it was bgzipped, even when indexing failed
        if bgzip:
            sp.check_call(["bgzip", fname])
def after_genome_download(self, genome, threads=1, force=False):
    """Create a STAR index, splice-aware when a GTF annotation is present.

    Nothing happens when STAR is unavailable (per ``cmd_ok``) or when the
    index path already exists and ``force`` is false. Otherwise the index
    directory is wiped and rebuilt from the extracted genome FASTA.

    Parameters
    ----------
    genome : object
        Must provide ``plugin["star"]``, ``filename`` and
        ``annotation_gtf_file``.
    threads : int, optional
        Value passed to ``--runThreadN``.
    force : bool, optional
        Rebuild even if the index path exists.
    """
    index_name = genome.plugin["star"]["index_name"]
    if not cmd_ok("STAR"):
        return
    if os.path.exists(index_name) and not force:
        return

    index_dir = genome.plugin["star"]["index_dir"]
    rm_rf(index_dir)
    mkdir_p(index_dir)

    # extracted_file yields the name of the genome file to index
    with extracted_file(genome.filename) as fasta:
        star_cmd = " ".join(
            [
                "STAR --runMode genomeGenerate",
                f"--runThreadN {threads}",
                f"--genomeFastaFiles {fasta}",
                f"--genomeDir {index_dir}",
                f"--outFileNamePrefix {index_dir}",
            ]
        )

        annotation = genome.annotation_gtf_file
        if not annotation:
            logger.info("Creating STAR index without annotation file.")
            run_index_cmd("star", star_cmd)
        else:
            # the annotation must stay extracted while STAR runs
            with extracted_file(annotation) as plain_gtf:
                star_cmd += f" --sjdbGTFfile {plain_gtf}"
                run_index_cmd("star", star_cmd)
def download_annotation(name, annot, genomes_dir, localname, n=None):
    """
    Download the extended genePred file from the UCSC MySQL database.

    Next convert this to a BED and GTF file.

    Parameters
    ----------
    name : str
        Passed to ``query_ucsc`` as the ``database`` argument.
    annot : str
        Table to query; interpolated into the SQL statements.
    genomes_dir : str
        Parent directory; output goes to ``<genomes_dir>/<localname>``.
    localname : str
        Directory name and file prefix of the output files.
    n : int, optional
        If truthy, only the first ``n`` rows are downloaded (SQL ``LIMIT``).

    Raises
    ------
    subprocess.CalledProcessError
        If ``genePredToGtf`` or ``genePredToBed`` exits non-zero.
    """
    out_dir = os.path.join(genomes_dir, localname)
    mkdir_p(out_dir)
    gtf_file = f"{os.path.join(out_dir, localname)}.annotation.gtf"
    bed_file = f"{os.path.join(out_dir, localname)}.annotation.bed"

    # drop columns the UCSC tools cannot handle
    # see https://genome.ucsc.edu/FAQ/FAQformat.html#format9
    accepted_cols = [
        "geneName",
        "name",
        "chrom",
        "strand",
        "txStart",
        "txEnd",
        "cdsStart",
        "cdsEnd",
        "exonCount",
        "exonStarts",
        "exonEnds",
        "score",
        "name2",
        "cdsStartStat",
        "cdsEndStat",
        "exonFrames",
    ]

    tmp_dir = mkdtemp(dir=out_dir)
    try:
        pred_file = f"{os.path.join(tmp_dir, localname)}.annotation.extended.gp"

        # MySQL query 1: get column names for this genePred
        command = f"SHOW COLUMNS FROM {annot};"
        cols = list(query_ucsc(command, database=name))
        cols = ",".join(c[0] for c in cols if c[0] in accepted_cols)

        # MySQL query 2: download genePred
        command = f"SELECT {cols} FROM {annot};"
        if n:
            command = f"SELECT {cols} FROM {annot} LIMIT {n};"
        ret = query_ucsc(command, database=name)

        # clean up genePred: these positional columns are decoded from
        # bytes to text when they are present in the result
        df = pd.DataFrame.from_records(ret)
        for c in [8, 9, 14]:
            if c in df:
                df[c] = df[c].str.decode("utf-8")
        df.to_csv(pred_file, index=False, header=False, sep="\t")

        # convert genePred to GTF and BED; argument lists avoid the shell,
        # so paths with spaces or shell metacharacters are passed verbatim
        sp.check_call(
            ["genePredToGtf", "-source=genomepy", "file", pred_file, gtf_file]
        )
        sp.check_call(["genePredToBed", pred_file, bed_file])
    finally:
        # never leave the temporary directory behind, also on failure
        rm_rf(tmp_dir)
def after_genome_download(self, genome):
    """Index the genome with minimap2.

    Does nothing when ``cmd_ok("minimap2")`` is falsy. Otherwise the index
    directory is created and ``minimap2 -d`` is run via ``run_index_cmd``.
    """
    if cmd_ok("minimap2"):
        settings = genome.props["minimap2"]
        target_dir = settings["index_dir"]
        target_index = settings["index_name"]

        # the output location must exist before indexing
        mkdir_p(target_dir)

        run_index_cmd(
            "minimap2",
            "minimap2 -d {} {}".format(target_index, genome.filename),
        )
def after_genome_download(self, genome, threads=1, force=False):
    """Build a HISAT2 index, splice-aware when a GTF annotation is present.

    Returns without doing anything when ``cmd_ok("hisat2-build")`` is falsy,
    or when ``<index_name>.1.ht2`` exists and ``force`` is false. Otherwise
    the index directory is removed, recreated and filled by ``hisat2-build``.

    Parameters
    ----------
    genome : object
        Must provide ``plugin["hisat2"]`` (``index_name``, ``index_dir``),
        ``filename``, ``annotation_gtf_file`` and ``genome_dir``.
    threads : int, optional
        Value passed to ``hisat2-build -p``.
    force : bool, optional
        Rebuild even when the index file exists.
    """
    index_name = genome.plugin["hisat2"]["index_name"]
    if not cmd_ok("hisat2-build") or (
        os.path.exists(f"{index_name}.1.ht2") and not force
    ):
        return

    index_dir = genome.plugin["hisat2"]["index_dir"]
    rm_rf(index_dir)
    mkdir_p(index_dir)

    # gunzip genome if bgzipped and return up-to-date genome name
    fname, bgzip = gunzip_and_name(genome.filename)

    # index command
    cmd = f"hisat2-build -p {threads} {fname} {index_name}"

    # if an annotation is present, generate a splice-aware index
    gtf_file = genome.annotation_gtf_file
    if gtf_file:
        # gunzip if gzipped
        gtf_file, gzip_file = gunzip_and_name(gtf_file)

        # generate splice and exon site files to enhance indexing
        # the helper scripts are looked up next to the hisat2 executable:
        # "<path of hisat2>_extract_splice_sites.py" / "..._extract_exons.py"
        hisat_path = (
            sp.Popen("which hisat2", stdout=sp.PIPE, shell=True)
            .stdout.read()
            .decode("utf8")
            .strip()
        )
        splice_script = hisat_path + "_extract_splice_sites.py"
        splice_file = os.path.join(genome.genome_dir, "splice_sites.txt")
        sp.check_call(
            f"python3 {splice_script} {gtf_file} > {splice_file}", shell=True
        )

        exon_script = hisat_path + "_extract_exons.py"
        exon_file = os.path.join(genome.genome_dir, "exon_sites.txt")
        sp.check_call(f"python3 {exon_script} {gtf_file} > {exon_file}", shell=True)

        # re-gzip annotation if gunzipped
        gzip_and_name(gtf_file, gzip_file)

        # update index command with annotation
        cmd += f" --ss {splice_file} --exon {exon_file}"
    else:
        print("\nCreating Hisat2 index without annotation file.")

    # Create index
    run_index_cmd("hisat2", cmd)

    # re-bgzip genome if gunzipped
    # NOTE(review): not reached when a step above raises, which would leave
    # the genome unzipped — confirm whether run_index_cmd can raise.
    bgzip_and_name(fname, bgzip)
def after_genome_download(self, genome):
    """Index the genome with bowtie2-build.

    Does nothing when ``cmd_ok("bowtie2-build")`` is falsy. Otherwise the
    index directory is created and ``bowtie2-build`` is run via
    ``run_index_cmd``.
    """
    if cmd_ok("bowtie2-build"):
        settings = genome.props["bowtie2"]
        target_dir = settings["index_dir"]
        index_prefix = settings["index_name"]

        # the output location must exist before indexing
        mkdir_p(target_dir)

        run_index_cmd(
            "bowtie2",
            "bowtie2-build {} {}".format(genome.filename, index_prefix),
        )
def after_genome_download(self, genome):
    """Run ``minimap2 -d`` on the genome unless minimap2 is unavailable."""
    tool = "minimap2"
    if not cmd_ok(tool):
        return

    # Look up where the index lives and make sure the directory exists
    out_dir = genome.props[tool]["index_dir"]
    out_index = genome.props[tool]["index_name"]
    mkdir_p(out_dir)

    # Build the index
    build_cmd = "minimap2 -d {} {}".format(out_index, genome.filename)
    run_index_cmd(tool, build_cmd)
def after_genome_download(self, genome):
    """Run ``bowtie2-build`` on the genome unless the tool is unavailable."""
    if not cmd_ok("bowtie2-build"):
        return

    # Look up where the index lives and make sure the directory exists
    plugin_key = "bowtie2"
    out_dir = genome.props[plugin_key]["index_dir"]
    out_prefix = genome.props[plugin_key]["index_name"]
    mkdir_p(out_dir)

    # Build the index
    build_cmd = "bowtie2-build {} {}".format(genome.filename, out_prefix)
    run_index_cmd(plugin_key, build_cmd)
def after_genome_download(self, genome):
    """Index the genome with gmap_build.

    Does nothing when ``cmd_ok("gmap_build")`` is falsy. Otherwise the index
    directory is created and ``gmap_build`` is run via ``run_index_cmd``,
    using the genome's name as the database name (``-d``).
    """
    if cmd_ok("gmap_build"):
        settings = genome.props["gmap"]
        target_dir = settings["index_dir"]
        # read as before, although the value is not part of the command
        _index_name = settings["index_name"]

        # the output location must exist before indexing
        mkdir_p(target_dir)

        run_index_cmd(
            "gmap",
            "gmap_build -D {} -d {} {}".format(
                target_dir, genome.name, genome.filename
            ),
        )
def after_genome_download(self, genome):
    """Run ``gmap_build`` on the genome unless the tool is unavailable."""
    if not cmd_ok("gmap_build"):
        return

    # Look up where the index lives and make sure the directory exists
    plugin_key = "gmap"
    out_dir = genome.props[plugin_key]["index_dir"]
    # read as before, although the value is not part of the command
    _index_name = genome.props[plugin_key]["index_name"]
    mkdir_p(out_dir)

    # Build the index; the genome name serves as the database name (-d)
    build_args = (out_dir, genome.name, genome.filename)
    run_index_cmd(plugin_key, "gmap_build -D {} -d {} {}".format(*build_args))
def after_genome_download(self, genome):
    """Index the genome with ``bwa index``.

    Does nothing when ``cmd_ok("bwa")`` is falsy. Otherwise the index
    directory is created, the genome file is symlinked to the index path
    if nothing exists there yet, and ``bwa index`` is run on that path.
    """
    if cmd_ok("bwa"):
        settings = genome.props["bwa"]
        target_dir = settings["index_dir"]
        fasta_link = settings["index_name"]
        mkdir_p(target_dir)

        # bwa indexes the link inside the index dir, not the genome itself
        link_missing = not os.path.exists(fasta_link)
        if link_missing:
            os.symlink(genome.filename, fasta_link)

        run_index_cmd("bwa", "bwa index {}".format(fasta_link))
def after_genome_download(self, genome):
    """Run ``bwa index`` on a symlink to the genome unless bwa is unavailable."""
    plugin_key = "bwa"
    if not cmd_ok(plugin_key):
        return

    # Look up where the index lives and make sure the directory exists
    out_dir = genome.props[plugin_key]["index_dir"]
    linked_fa = genome.props[plugin_key]["index_name"]
    mkdir_p(out_dir)

    # Link the genome into the index dir unless something is already there
    if os.path.exists(linked_fa):
        pass
    else:
        os.symlink(genome.filename, linked_fa)

    # Build the index
    build_cmd = "bwa index {}".format(linked_fa)
    run_index_cmd(plugin_key, build_cmd)
def after_genome_download(self, genome, force=False):
    """Build a bowtie2 index unless one is already present.

    Does nothing when ``cmd_ok("bowtie2-build")`` is falsy. With ``force``
    the index directory is removed first. The build is skipped when the
    index directory contains any ``*.bt2`` file.
    """
    if not cmd_ok("bowtie2-build"):
        return

    settings = genome.props["bowtie2"]
    target_dir = settings["index_dir"]
    index_prefix = settings["index_name"]

    if force:
        # Start from scratch
        rmtree(target_dir, ignore_errors=True)
    mkdir_p(target_dir)

    # An existing *.bt2 file means there is nothing left to do
    for entry in os.listdir(target_dir):
        if entry.endswith(".bt2"):
            return

    run_index_cmd(
        "bowtie2", "bowtie2-build {} {}".format(genome.filename, index_prefix)
    )
def after_genome_download(self, genome, threads=1, force=False):
    """Build a minimap2 index unless one is already present.

    Does nothing when ``cmd_ok("minimap2")`` is falsy. With ``force`` the
    index directory is removed first. The build is skipped when the index
    directory contains any ``*.mmi`` file.

    Parameters
    ----------
    genome : object
        Must provide ``plugin["minimap2"]`` and ``filename``.
    threads : int, optional
        Value passed to ``minimap2 -t``.
    force : bool, optional
        Rebuild from scratch.
    """
    if not cmd_ok("minimap2"):
        return

    settings = genome.plugin["minimap2"]
    target_dir = settings["index_dir"]
    target_index = settings["index_name"]

    if force:
        # Start from scratch
        rm_rf(target_dir)
    mkdir_p(target_dir)

    already_indexed = any(
        entry.endswith(".mmi") for entry in os.listdir(target_dir)
    )
    if already_indexed:
        return

    run_index_cmd(
        "minimap2", f"minimap2 -t {threads} -d {target_index} {genome.filename}"
    )
def after_genome_download(self, genome, threads=1, force=False):
    """Build a bwa index unless one is already present.

    Does nothing when ``cmd_ok("bwa")`` is falsy. With ``force`` the index
    directory is removed first. The build is skipped when the index
    directory contains any ``*.bwt`` file. ``threads`` is accepted but not
    used in the command.
    """
    if not cmd_ok("bwa"):
        return

    settings = genome.plugin["bwa"]
    target_dir = settings["index_dir"]
    linked_fa = settings["index_name"]

    if force:
        # Start from scratch
        rm_rf(target_dir)
    mkdir_p(target_dir)

    existing = [entry for entry in os.listdir(target_dir) if entry.endswith(".bwt")]
    if existing:
        return

    # bwa indexes a symlink to the genome placed inside the index dir
    if not os.path.exists(linked_fa):
        os.symlink(genome.filename, linked_fa)

    run_index_cmd("bwa", f"bwa index {linked_fa}")
def after_genome_download(self, genome, force=False):
    """Build a bwa index unless one is already present.

    Does nothing when ``cmd_ok("bwa")`` is falsy. With ``force`` the index
    directory is removed first. The build is skipped when the index
    directory contains any ``*.bwt`` file.
    """
    plugin_key = "bwa"
    if not cmd_ok(plugin_key):
        return

    out_dir = genome.props[plugin_key]["index_dir"]
    linked_fa = genome.props[plugin_key]["index_name"]

    if force:
        # Start from scratch
        rmtree(out_dir, ignore_errors=True)
    mkdir_p(out_dir)

    # An existing *.bwt file means there is nothing left to do
    for entry in os.listdir(out_dir):
        if entry.endswith(".bwt"):
            return

    # bwa indexes a symlink to the genome placed inside the index dir
    if not os.path.exists(linked_fa):
        os.symlink(genome.filename, linked_fa)

    build_cmd = "bwa index {}".format(linked_fa)
    run_index_cmd(plugin_key, build_cmd)
def download_assembly_report(acc: str, fname: str = None, quiet=False):
    """
    Retrieve the NCBI assembly report.

    Returns the assembly_report as a pandas DataFrame if fname is not specified.

    Parameters
    ----------
    acc : str
        Assembly accession (GCA or GCF)
    fname : str, optional
        Save assembly_report to this filename. A bare filename (without a
        directory part) is written relative to the working directory.
    quiet : bool, optional
        Silence warnings.

    Returns
    -------
    pandas.DataFrame or None
        NCBI assembly report when ``fname`` is not given. ``None`` when the
        report was written to ``fname``, when ``acc`` is not a string
        starting with "GCA"/"GCF", or when no report URL was found.
    """
    msg = "Could not download the assembly report from NCBI. "
    if not isinstance(acc, str) or not acc.startswith(("GCA", "GCF")):
        if not quiet:
            logger.warning(msg)
        return None

    assembly_report = _assembly_report_url(acc)
    if assembly_report is None:
        if not quiet:
            logger.warning(msg + f"Assembly accession '{acc}' not found.")
        return None

    asm_report = pd.read_csv(
        assembly_report, sep="\t", comment="#", names=ASM_FORMAT, dtype=str
    )

    if not fname:
        return asm_report

    # A bare filename has an empty dirname; there is no directory to create
    # in that case, so mkdir_p is only called for a real directory part.
    out_dir = os.path.dirname(fname)
    if out_dir:
        mkdir_p(out_dir)
    asm_report.to_csv(fname, sep="\t", index=False)
    return None
def after_genome_download(self, genome, threads=1, force=False):
    """Build a STAR index, splice-aware when a GTF annotation is present.

    Returns without doing anything when ``cmd_ok("STAR")`` is falsy, or when
    the index path exists and ``force`` is false. The genome (and the
    annotation, if any) is unzipped for the build via ``gunzip_and_name``
    and compressed again afterwards, also when the build raises.

    Parameters
    ----------
    genome : object
        Must provide ``plugin["star"]`` (``index_name``, ``index_dir``),
        ``filename`` and ``annotation_gtf_file``.
    threads : int, optional
        Value passed to ``--runThreadN``.
    force : bool, optional
        Rebuild even when the index path exists.
    """
    index_name = genome.plugin["star"]["index_name"]
    if not cmd_ok("STAR") or (os.path.exists(index_name) and not force):
        return

    index_dir = genome.plugin["star"]["index_dir"]
    rmtree(index_dir, ignore_errors=True)
    mkdir_p(index_dir)

    # gunzip genome if bgzipped and return up-to-date genome name
    fname, bgzip = gunzip_and_name(genome.filename)

    # defined before the try so the cleanup below can always refer to them
    gtf_file = None
    gzip_file = False
    try:
        # index command
        cmd = (
            f"STAR --runMode genomeGenerate --runThreadN {threads} "
            + f"--genomeFastaFiles {fname} --genomeDir {index_dir} "
            + f"--outFileNamePrefix {index_dir}"
        )

        # if an annotation is present, generate a splice-aware index
        gtf_file = genome.annotation_gtf_file
        if gtf_file:
            # gunzip if gzipped
            gtf_file, gzip_file = gunzip_and_name(gtf_file)

            # update index command with annotation
            cmd += f" --sjdbGTFfile {gtf_file}"
        else:
            print("\nCreating STAR index without annotation file.")

        # Create index
        run_index_cmd("star", cmd)
    finally:
        # restore the compressed inputs even when indexing failed
        # re-bgzip genome if gunzipped
        bgzip_and_name(fname, bgzip)

        # re-gzip annotation if gunzipped
        if gtf_file:
            gzip_and_name(gtf_file, gzip_file)
def generate_env(fname: str = "exports.txt", genomes_dir: str = None):
    """
    Write the export lines for installed genomes to a file.

    A name that is an absolute path after expanding environment variables
    and ``~`` is used as is. Any other name is placed inside the genomepy
    user config directory.

    Parameters
    ----------
    fname: str, optional
        Absolute path or name of the output file.
    genomes_dir: str, optional
        Directory with installed genomes to export.
    """
    expanded = os.path.expanduser(os.path.expandvars(fname))
    in_config_dir = os.path.join(user_config_dir("genomepy"), fname)
    if os.path.isabs(expanded):
        target = expanded
    else:
        target = in_config_dir

    mkdir_p(os.path.dirname(target))
    with open(target, "w") as fout:
        fout.writelines(f"{env}\n" for env in _generate_exports(genomes_dir))
def manage_config(cmd):
    """Show, locate or (re)generate the genomepy config file.

    ``"file"`` prints the config path, ``"show"`` prints its contents and
    ``"generate"`` writes a fresh copy of the default config into the user
    config directory. Any other command raises ``ValueError``.
    """
    if cmd == "file":
        print(config.config_file)
        return

    if cmd == "show":
        with open(config.config_file) as handle:
            print(handle.read())
        return

    if cmd != "generate":
        raise ValueError(f"Invalid config command: {cmd}")

    target_dir = user_config_dir("genomepy")
    if not os.path.exists(target_dir):
        mkdir_p(target_dir)
    target = os.path.join(target_dir, "genomepy.yaml")

    # existing config must be removed before norns picks up the default again
    if os.path.exists(target):
        os.unlink(target)
    template = norns.config("genomepy", default="cfg/default.yaml").config_file

    with open(target, "w") as fout, open(template) as fin:
        fout.write(fin.read())

    config.config_file = target
    print(f"Created config file {target}")
def download_annotation(genomes_dir, annot_url, localname, n=None):
    """
    Download an annotation file and convert it to GTF and BED.

    The file is fetched into a temporary directory inside
    ``<genomes_dir>/<localname>``, converted to an intermediate genePred
    file with the UCSC command line tools, and from there to whichever of
    GTF/BED the input is not already. The resulting GTF and BED files are
    moved to the output directory and the temporary directory is removed.

    Parameters
    ----------
    genomes_dir : str
        Parent directory; output goes to ``<genomes_dir>/<localname>``.
    annot_url : str
        URL of the annotation, or a path to an existing local file (which
        is copied instead of downloaded).
    localname : str
        Directory name and file prefix of the output files.
    n : int, optional
        If given, ``download_head`` is used instead of a full download and
        the result is treated as uncompressed.

    Raises
    ------
    TypeError
        If the file extension is not one of bed/gff/gtf/txt.
    subprocess.CalledProcessError
        If one of the conversion commands exits non-zero.
    """
    # create output directory if missing
    out_dir = os.path.join(genomes_dir, localname)
    mkdir_p(out_dir)

    # download to tmp dir. Move the output files on completion.
    # tmp dir is in genome_dir to prevent moving the files between disks
    # NOTE(review): tmp_dir is only removed on the last line, so it stays
    # behind when any step in between raises.
    tmp_dir = mkdtemp(dir=out_dir)
    ext, is_compressed = get_file_info(annot_url)

    annot_file = os.path.join(tmp_dir, localname + ".annotation" + ext)
    tmp_annot_file = os.path.join(tmp_dir, annot_url.split("/")[-1])
    # a local path is copied, anything else is downloaded
    get_file = shutil.copyfile if os.path.exists(annot_url) else download_file
    if n is None:
        get_file(annot_url, tmp_annot_file)
    else:
        download_head(annot_url, tmp_annot_file, n)
        is_compressed = False

    # unzip input file (if needed)
    if is_compressed:
        annot_file = extract_archive(tmp_annot_file, outfile=annot_file)
    else:
        shutil.move(tmp_annot_file, annot_file)

    # generate intermediate file (GenePred)
    # NOTE(review): str.replace substitutes every occurrence of ext in the
    # path, not only the trailing extension.
    pred_file = annot_file.replace(ext, ".gp")
    if "bed" in ext:
        cmd = "bedToGenePred {0} {1}"
    elif "gff" in ext:
        # example annotation: GRCh38.p12 from NCBI
        cmd = "gff3ToGenePred -useName -warnAndContinue {0} {1}"
    elif "gtf" in ext:
        cmd = "gtfToGenePred -genePredExt -allErrors -ignoreGroupsWithoutExons {0} {1}"
    elif "txt" in ext:
        # UCSC annotations only
        with open(annot_file) as f:
            cols = f.readline().split("\t")

        # extract the genePred format columns
        # the first "+"/"-" field of the first line is taken as the strand
        # column; the cut range is positioned relative to it
        start_col = 1
        for i, col in enumerate(cols):
            if col in ["+", "-"]:
                start_col = i - 1
                break
        end_col = start_col + 10
        # doubled braces survive the later str.format call as literal braces;
        # {0} and {1} are filled with the input and output file names
        cmd = (
            f"""cat {{0}} | cut -f {start_col}-{end_col} | """
            # knownGene.txt.gz has spotty fields, this replaces non-integer fields with zeroes
            + """awk 'BEGIN {{FS=OFS="\t"}} !($11 ~ /^[0-9]+$/) {{$11="0"}}1' > {1}"""
        )
    else:
        raise TypeError(f"file type extension {ext} not recognized!")

    # contigs are only renamed for complete gencode downloads
    if n is None and "gencode" in annot_url:
        rename_contigs(annot_file)

    sp.check_call(cmd.format(annot_file, pred_file), shell=True)

    # generate gtf file (if required)
    gtf_file = annot_file.replace(ext, ".gtf")
    if "gtf" not in ext:
        cmd = "genePredToGtf -source=genomepy file {0} {1}"
        sp.check_call(cmd.format(pred_file, gtf_file), shell=True)

    # generate bed file (if required)
    bed_file = annot_file.replace(ext, ".bed")
    if "bed" not in ext:
        cmd = "genePredToBed {0} {1}"
        sp.check_call(cmd.format(pred_file, bed_file), shell=True)

    # transfer the files from the tmpdir to the genome_dir
    for f in [gtf_file, bed_file]:
        src = f
        dst = os.path.join(out_dir, os.path.basename(f))
        shutil.move(src, dst)

    rm_rf(tmp_dir)
def download_genome(
    self,
    name: str,
    genomes_dir: str = None,
    localname: str = None,
    mask: str = "soft",
    **kwargs,
):
    """
    Download a (gzipped) genome file to a specific directory

    The genome is fetched into a temporary directory inside the output
    directory, extracted if compressed, optionally post-processed by the
    provider, and moved to ``<genomes_dir>/<localname>/<localname>.fa``.
    A README with download metadata is updated next to it.

    Parameters
    ----------
    name : str
        Genome / species name
    genomes_dir : str , optional
        Directory to install genome
    localname : str , optional
        Custom name for your genome
    mask: str , optional
        Masking, soft, hard or none (all other strings)
    **kwargs
        Passed on to ``self.get_genome_download_link``.
    """
    name = self._check_name(name)

    link = self.get_genome_download_link(name, mask=mask, **kwargs)

    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)
    mkdir_p(out_dir)

    logger.info(
        f"Downloading genome from {self.name}. Target URL: {link}..."
    )

    # download to tmp dir. Move genome on completion.
    # tmp dir is in genome_dir to prevent moving the genome between disks
    # a link that exists as a local path is copied instead of downloaded
    get_file = shutil.copyfile if os.path.exists(link) else download_file
    with TemporaryDirectory(dir=out_dir) as tmp_dir:
        tmp_fname = os.path.join(tmp_dir, link.split("/")[-1])
        fname = os.path.join(tmp_dir, f"{localname}.fa")

        get_file(link, tmp_fname)
        logger.info(
            "Genome download successful, starting post processing..."
        )

        # unzip genome
        _, is_compressed = get_file_info(link)
        if is_compressed:
            extract_archive(tmp_fname, outfile=fname, concat=True)
        else:
            shutil.move(tmp_fname, fname)

        # process genome (e.g. masking)
        # only providers that define this hook get a post-processing step
        if hasattr(self, "_post_process_download"):
            self._post_process_download(name=name, fname=fname, out_dir=out_dir, mask=mask)

        # transfer the genome from the tmpdir to the genome_dir
        src = fname
        dst = os.path.join(out_dir, f"{localname}.fa")
        shutil.move(src, dst)

    logger.info("name: {}".format(name))
    logger.info("local name: {}".format(localname))
    logger.info("fasta: {}".format(dst))

    # Create readme with information
    readme = os.path.join(genomes_dir, localname, "README.txt")
    asm_acc = self.assembly_accession(name)
    tax_id = self.genome_taxid(name)
    # falsy accession / taxonomy values are recorded as "na"
    metadata = {
        "name": localname,
        "provider": self.name,
        "original name": name,
        "original filename": os.path.split(link)[-1],
        "assembly_accession": asm_acc if asm_acc else "na",
        "tax_id": tax_id if tax_id else "na",
        "mask": mask,
        "genome url": link,
        "genomepy version": __version__,
        "date": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    update_readme(readme, metadata)
def clean():
    """Empty the provider cache directory of this genomepy version.

    The version-specific directory inside the genomepy user cache directory
    is deleted and then recreated empty.
    """
    cache_root = user_cache_dir("genomepy")
    versioned_cache = os.path.join(cache_root, __version__)

    # delete, then recreate, so an empty directory is left behind
    rm_rf(versioned_cache)
    mkdir_p(versioned_cache)

    print("All clean!")
def download_genome(
    self,
    name,
    genomes_dir=None,
    localname=None,
    mask="soft",
    keep_alt=False,
    regex=None,
    invert_match=False,
    bgzip=None,
    **kwargs,
):
    """
    Download a (gzipped) genome file to a specific directory

    The genome is downloaded into a temporary directory inside the output
    directory, unpacked, filtered, optionally post-processed and bgzipped,
    and finally moved to ``<genomes_dir>/<localname>``. A README with the
    download metadata (and the excluded sequences, if a regex was given)
    is written next to it.

    Parameters
    ----------
    name : str
        Genome / species name
    genomes_dir : str , optional
        Directory to install genome
    localname : str , optional
        Custom name for your genome
    mask: str , optional
        Masking, soft, hard or none (all other strings)
    keep_alt : bool , optional
        Set to true to keep these alternative regions.
    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.
    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.
    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.
    **kwargs
        Passed on to ``self.get_genome_download_link``.
    """
    name = safe(name)
    self.check_name(name)

    link = self.get_genome_download_link(name, mask=mask, **kwargs)

    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)
    if not os.path.exists(out_dir):
        mkdir_p(out_dir)

    sys.stderr.write(
        f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

    # download to tmp dir. Move genome on completion.
    # tmp dir is in genome_dir to prevent moving the genome between disks
    with TemporaryDirectory(dir=out_dir) as tmp_dir:
        fname = os.path.join(tmp_dir, f"{localname}.fa")

        # actual download
        urlcleanup()
        with urlopen(link) as response:
            # check available memory vs file size.
            available_memory = int(virtual_memory().available)
            # NOTE(review): int() fails if the response carries no
            # Content-Length header — confirm all providers send one.
            file_size = int(response.info()["Content-Length"])
            # download file in chunks if >75% of memory would be used
            cutoff = int(available_memory * 0.75)
            chunk_size = None if file_size < cutoff else cutoff
            with open(fname, "wb") as f_out:
                shutil.copyfileobj(response, f_out, chunk_size)
        sys.stderr.write(
            "Genome download successful, starting post processing...\n")

        # unzip genome
        if link.endswith(".tar.gz"):
            tar_to_bigfile(fname, fname)
        elif link.endswith(".gz"):
            # the download was saved without the ".gz" suffix; add it back
            # before handing the (suffix-less) name to gunzip
            os.rename(fname, fname + ".gz")
            # check_call raises CalledProcessError on a non-zero exit, so
            # the return-code test below cannot trigger
            ret = sp.check_call(["gunzip", "-f", fname])
            if ret != 0:
                raise Exception(f"Error gunzipping genome {fname}")

        def regex_filer(_fname, _regex, _v):
            # Filter _fname in place and return the sequence names that
            # were present before filtering but are missing afterwards.
            os.rename(_fname, _fname + "_to_regex")
            infa = _fname + "_to_regex"
            outfa = _fname
            filter_fasta(infa, outfa, regex=_regex, v=_v, force=True)

            # NOTE(review): Fasta(outfa) is constructed once per input key
            return [
                k for k in Fasta(infa).keys() if k not in Fasta(outfa).keys()
            ]

        not_included = []
        # remove alternative regions
        if not keep_alt:
            not_included.extend(regex_filer(fname, "alt", True))

        # keep/remove user defined regions
        if regex:
            not_included.extend(regex_filer(fname, regex, invert_match))

        # process genome (e.g. masking)
        if hasattr(self, "_post_process_download"):
            self._post_process_download(name=name, localname=localname, out_dir=tmp_dir, mask=mask)

        # bgzip genome if requested
        if bgzip or config.get("bgzip"):
            # as above: check_call raises instead of returning non-zero
            ret = sp.check_call(["bgzip", "-f", fname])
            if ret != 0:
                raise Exception(
                    f"Error bgzipping {name}. Is tabix installed?")
            fname += ".gz"

        # transfer the genome from the tmpdir to the genome_dir
        src = fname
        dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
        shutil.move(src, dst)

    sys.stderr.write("\n")
    sys.stderr.write("name: {}\n".format(name))
    sys.stderr.write("local name: {}\n".format(localname))
    sys.stderr.write("fasta: {}\n".format(dst))

    # Create readme with information
    readme = os.path.join(genomes_dir, localname, "README.txt")
    metadata = {
        "name": localname,
        "provider": self.name,
        "original name": name,
        "original filename": os.path.split(link)[-1],
        "assembly_accession": self.assembly_accession(self.genomes.get(name)),
        "tax_id": self.genome_taxid(self.genomes.get(name)),
        "mask": mask,
        "genome url": link,
        "annotation url": "na",
        "date": time.strftime("%Y-%m-%d %H:%M:%S"),
    }

    # extra README lines: the regex used and the sequences it excluded
    lines = []
    if regex:
        regex_line = f"regex: {regex}"
        if invert_match:
            regex_line += " (inverted match)"

        lines += ["", regex_line, "sequences that were excluded:"]
        for seq in not_included:
            lines.append(f"\t{seq}")

    write_readme(readme, metadata, lines)
def download_genome(
    self,
    name,
    genomes_dir=None,
    localname=None,
    mask="soft",
    keep_alt=False,
    regex=None,
    invert_match=False,
    bgzip=None,
    **kwargs,
):
    """
    Download a (gzipped) genome file to a specific directory

    The genome is downloaded into a temporary directory inside the output
    directory, unpacked, filtered, optionally post-processed and bgzipped,
    and finally moved to ``<genomes_dir>/<localname>``. A README with the
    download metadata and the excluded sequences is written next to it.

    Parameters
    ----------
    name : str
        Genome / species name
    genomes_dir : str , optional
        Directory to install genome
    localname : str , optional
        Custom name for your genome
    mask: str , optional
        Masking, soft, hard or none (all other strings)
    keep_alt : bool , optional
        Set to true to keep these alternative regions.
    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.
    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.
    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.
    **kwargs
        Passed on to ``self.get_genome_download_link``.
    """
    name = safe(name)
    self.check_name(name)

    link = self.get_genome_download_link(name, mask=mask, **kwargs)

    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)
    if not os.path.exists(out_dir):
        mkdir_p(out_dir)

    sys.stderr.write(
        f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

    # download to tmp dir. Move genome on completion.
    # tmp dir is in genome_dir to prevent moving the genome between disks
    # NOTE(review): tmp_dir is only removed after the final move, so it
    # stays behind when any step in between raises.
    tmp_dir = mkdtemp(dir=out_dir)
    fname = os.path.join(tmp_dir, f"{localname}.fa")

    urlcleanup()
    download_file(link, fname)
    sys.stderr.write(
        "Genome download successful, starting post processing...\n")

    # unzip genome
    if link.endswith(".tar.gz"):
        tar_to_bigfile(fname, fname)
    elif link.endswith(".gz"):
        # the download was saved without the ".gz" suffix; add it back
        # before handing the (suffix-less) name to gunzip
        os.rename(fname, fname + ".gz")
        # check_call raises CalledProcessError on a non-zero exit, so the
        # return-code test below cannot trigger
        ret = sp.check_call(["gunzip", "-f", fname])
        if ret != 0:
            raise Exception(f"Error gunzipping genome {fname}")

    def regex_filer(_fname, _regex, _v):
        # Filter _fname in place and return the sequence names that were
        # present before filtering but are missing afterwards.
        infa = _fname + "_to_regex"
        os.rename(_fname, infa)
        # filter the fasta and store the output's keys
        keys_out = filter_fasta(infa, outfa=_fname, regex=_regex, v=_v, force=True).keys()
        keys_in = Fasta(infa).keys()
        return [k for k in keys_in if k not in keys_out]

    not_included = []
    # remove alternative regions
    if not keep_alt:
        not_included.extend(regex_filer(fname, "alt", True))

    # keep/remove user defined regions
    if regex:
        not_included.extend(regex_filer(fname, regex, invert_match))

    # process genome (e.g. masking)
    if hasattr(self, "_post_process_download"):
        self._post_process_download(name=name, localname=localname, out_dir=tmp_dir, mask=mask)

    # bgzip genome if requested
    if bgzip or config.get("bgzip"):
        # bgzip to stdout, track progress, and output to file
        fsize = int(os.path.getsize(fname) * 10**-6)
        # NOTE(review): the --desc value contains spaces and is not quoted,
        # so the shell splits it into several arguments — verify the
        # progress bar shows the intended description.
        cmd = (
            f"bgzip -fc {fname} | "
            f"tqdm --bytes --desc Bgzipping {fsize}MB fasta --log ERROR | "
            f"cat > {fname}.gz")
        # as above: check_call raises instead of returning non-zero
        ret = sp.check_call(cmd, shell=True)
        if ret != 0:
            raise Exception(f"Error bgzipping {name}. Is tabix installed?")
        fname += ".gz"

    # transfer the genome from the tmpdir to the genome_dir
    src = fname
    dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
    shutil.move(src, dst)
    rm_rf(tmp_dir)

    sys.stderr.write("\n")
    sys.stderr.write("name: {}\n".format(name))
    sys.stderr.write("local name: {}\n".format(localname))
    sys.stderr.write("fasta: {}\n".format(dst))

    # Create readme with information
    readme = os.path.join(genomes_dir, localname, "README.txt")
    metadata = {
        "name": localname,
        "provider": self.name,
        "original name": name,
        "original filename": os.path.split(link)[-1],
        "assembly_accession": self.assembly_accession(self.genomes.get(name)),
        "tax_id": self.genome_taxid(self.genomes.get(name)),
        "mask": mask,
        "genome url": link,
        "annotation url": "na",
        "date": time.strftime("%Y-%m-%d %H:%M:%S"),
    }

    # extra README lines: which filters ran and the sequences they excluded
    lines = []
    if not keep_alt or regex:
        regex_line = "regex: "
        if not keep_alt:
            regex_line += "'alt' (inverted match)"

        if not keep_alt and regex:
            regex_line += " and "

        if regex:
            regex_line += f"'{regex}'"
            if invert_match:
                regex_line += " (inverted match)"

        lines += ["", regex_line, "sequences that were excluded:"]
        for seq in not_included:
            lines.append(f"\t{seq}")

    write_readme(readme, metadata, lines)
def download_and_generate_annotation(genomes_dir, annot_url, localname):
    """
    Download an annotation file and produce gzipped GTF and BED files.

    The file is downloaded into a temporary directory inside
    ``<genomes_dir>/<localname>``, converted to an intermediate genePred
    file with the UCSC command line tools, and from there to whichever of
    GTF/BED the input is not already. Both results are gzipped and moved
    to the output directory.

    Parameters
    ----------
    genomes_dir : str
        Parent directory; output goes to ``<genomes_dir>/<localname>``.
    annot_url : str
        URL of the annotation file.
    localname : str
        Directory name and file prefix of the output files.

    Raises
    ------
    TypeError
        If the file extension is not one of bed/gff/gtf/txt.
    subprocess.CalledProcessError
        If one of the shell commands exits non-zero.
    """
    # create output directory if missing
    out_dir = os.path.join(genomes_dir, localname)
    if not os.path.exists(out_dir):
        mkdir_p(out_dir)

    # download to tmp dir. Move files on completion.
    with TemporaryDirectory(dir=out_dir) as tmpdir:
        ext, gz = get_file_info(annot_url)
        annot_file = os.path.join(tmpdir, localname + ".annotation" + ext)
        urlretrieve(annot_url, annot_file)

        # unzip input file (if needed)
        if gz:
            # the download was saved without ".gz": rename, then gunzip
            # back to the original name
            cmd = "mv {0} {1} && gunzip -f {1}"
            sp.check_call(cmd.format(annot_file, annot_file + ".gz"), shell=True)

        # generate intermediate file (GenePred)
        # NOTE(review): str.replace substitutes every occurrence of ext in
        # the path, not only the trailing extension.
        pred_file = annot_file.replace(ext, ".gp")
        if "bed" in ext:
            cmd = "bedToGenePred {0} {1}"
        elif "gff" in ext:
            cmd = "gff3ToGenePred -geneNameAttr=gene {0} {1}"
        elif "gtf" in ext:
            cmd = "gtfToGenePred {0} {1}"
        elif "txt" in ext:
            # UCSC annotations only
            with open(annot_file) as f:
                cols = f.readline().split("\t")

            # extract the genePred format columns
            # the first "+"/"-" field of the first line is taken as the
            # strand column; the cut range is positioned relative to it
            start_col = 1
            for i, col in enumerate(cols):
                if col in ["+", "-"]:
                    start_col = i - 1
                    break
            end_col = start_col + 10
            # doubled braces survive the later str.format call as literal
            # braces; {0} and {1} are filled with input and output names
            cmd = (
                f"""cat {{0}} | cut -f {start_col}-{end_col} | """
                # knownGene.txt.gz has spotty fields, this replaces non-integer fields with zeroes
                + """awk 'BEGIN {{FS=OFS="\t"}} !($11 ~ /^[0-9]+$/) {{$11="0"}}1' > {1}"""
            )
        else:
            raise TypeError(f"file type extension {ext} not recognized!")

        sp.check_call(cmd.format(annot_file, pred_file), shell=True)

        # generate gzipped gtf file (if required)
        gtf_file = annot_file.replace(ext, ".gtf")
        if "gtf" not in ext:
            cmd = "genePredToGtf -source=genomepy file {0} {1} && gzip -f {1}"
            sp.check_call(cmd.format(pred_file, gtf_file), shell=True)

        # generate gzipped bed file (if required)
        bed_file = annot_file.replace(ext, ".bed")
        if "bed" not in ext:
            cmd = "genePredToBed {0} {1} && gzip -f {1}"
            sp.check_call(cmd.format(pred_file, bed_file), shell=True)

        # if input file was gtf/bed, gzip it
        # (it was not produced, and therefore not gzipped, by a step above)
        if ext in [".gtf", ".bed"]:
            cmd = "gzip -f {}"
            sp.check_call(cmd.format(annot_file), shell=True)

        # transfer the files from the tmpdir to the genome_dir
        for f in [gtf_file + ".gz", bed_file + ".gz"]:
            src = f
            dst = os.path.join(out_dir, os.path.basename(f))
            shutil.move(src, dst)