def _index_sam(env, ref_file):
    """Create a samtools ``.fai`` index for ``ref_file`` and run Picard indexing.

    The ``samtools faidx`` step runs from inside the directory that holds the
    FASTA file and is skipped when ``<fasta>.fai`` is already present there.
    ``galaxy.index_picard`` is then called with the unmodified ``ref_file``.

    ``env`` is accepted for signature parity with the other index functions
    but is not used here.

    Returns ``ref_file`` unchanged.
    """
    parent_dir, fasta_name = os.path.split(ref_file)
    with shared.chdir(parent_dir):
        fai_present = os.path.exists("%s.fai" % fasta_name)
        if not fai_present:
            subprocess.check_call("samtools faidx %s" % fasta_name, shell=True)
    galaxy.index_picard(ref_file)
    return ref_file
def _data_uniref(env):
    """Fetch the UniRef50/90/100 FASTA releases and build BLAST protein indexes.

    http://www.ebi.ac.uk/uniref/

    Each database is stored under ``<env.data_files>/uniref/<db>``. The
    compressed FASTA is downloaded and gunzipped only when the uncompressed
    file is not already present; the matching ``.release_note`` file is
    fetched at the same time. ``_index_blast_db`` is then called for every
    database, whether or not a download happened.

    Open questions carried over from the original notes:
    - Only FASTA-search indexes are built; are other indexes wanted?
    - Should this be organized per program, like the genome data?
    - The release note is not yet inspected to replace outdated downloads.
    """
    site = "ftp://ftp.uniprot.org"
    url_template = site + "/pub/databases/uniprot/current_release/uniref/%s/%s"
    for db_name in ("uniref50", "uniref90", "uniref100"):
        db_dir = os.path.join(env.data_files, "uniref", db_name)
        if not os.path.exists(db_dir):
            subprocess.check_call("mkdir -p %s" % db_dir, shell=True)
        remote_prefix = url_template % (db_name, db_name)
        fasta_url = remote_prefix + ".fasta.gz"
        # splitext drops only the trailing ".gz", leaving "<db>.fasta"
        fasta_name = os.path.splitext(os.path.basename(fasta_url))[0]
        with shared.chdir(db_dir):
            if not os.path.exists(fasta_name):
                fetched = shared._remote_fetch(env, fasta_url)
                subprocess.check_call("gunzip %s" % fetched, shell=True)
                shared._remote_fetch(env, remote_prefix + ".release_note")
        _index_blast_db(db_dir, fasta_name, "prot")
def download(self, seq_dir):
    """Download the genome archive and assemble a single ``<name>.fa`` file.

    When ``self._exists`` reports that the genome FASTA is already available
    for ``seq_dir``, nothing is downloaded. Otherwise the archive is fetched
    into a ``seq_prep`` working directory, unpacked according to its
    extension, split into per-sequence files if only one FASTA came out,
    ordered with ``self._karyotype_sort`` and concatenated into the final
    genome file.

    Returns a tuple ``(genome_file, [zipped_file])``. Both paths are prefixed
    with ``seq_prep`` when a download happened; otherwise ``genome_file`` is
    the bare file name and ``zipped_file`` is ``None``.

    Raises ``ValueError`` for an archive with an unsupported extension or
    when unpacking produced no ``*.fa`` file at all.
    """
    zipped_file = None
    genome_file = "%s.fa" % self._name
    if not self._exists(genome_file, seq_dir):
        prep_dir = "seq_prep"
        subprocess.check_call("mkdir -p %s" % prep_dir, shell=True)
        with shared.chdir(prep_dir):
            zipped_file = self._download_zip(seq_dir)
            if zipped_file.endswith(".tar.gz"):
                subprocess.check_call("tar -xzpf %s" % zipped_file, shell=True)
            elif zipped_file.endswith(".zip"):
                subprocess.check_call("unzip %s" % zipped_file, shell=True)
            elif zipped_file.endswith(".gz"):
                if not os.path.exists("out.fa"):
                    subprocess.check_call("gunzip -c %s > out.fa" % zipped_file, shell=True)
            else:
                raise ValueError("Do not know how to handle: %s" % zipped_file)
            tmp_file = genome_file.replace(".fa", ".txt")
            found = subprocess.check_output("find `pwd` -name '*.fa'", shell=True).decode()
            # find terminates its output with a newline; drop the resulting
            # empty entries so the single-file check below can actually match.
            result = [x.strip() for x in found.split("\n") if x.strip()]
            if not result:
                # Without inputs the concatenation below would run a bare
                # `cat`, which reads from stdin instead of failing.
                raise ValueError("Did not find any FASTA files after unpacking: %s" % zipped_file)
            if len(result) == 1:
                orig_result = result[0]
                result = self._split_multifasta(orig_result)
                subprocess.check_call("rm %s" % orig_result, shell=True)
            result = self._karyotype_sort(result)
            subprocess.check_call("rm -f inputs.txt", shell=True)
            for fname in result:
                subprocess.check_output("echo '%s' >> inputs.txt" % fname, shell=True).decode()
            subprocess.check_call("cat `cat inputs.txt` > %s" % (tmp_file), shell=True)
            for fname in result:
                subprocess.check_output("rm -f %s" % fname, shell=True).decode()
            subprocess.check_call("mv %s %s" % (tmp_file, genome_file), shell=True)
        zipped_file = os.path.join(prep_dir, zipped_file)
        genome_file = os.path.join(prep_dir, genome_file)
    return genome_file, [zipped_file]
def _data_liftover(env, lift_over_genomes):
    """Fetch UCSC liftOver chain files for every ordered pair of genomes.

    Chain files are stored in ``<env.data_files>/liftOver``. For each pair a
    download is attempted only when the uncompressed chain file is missing;
    a successful download is gunzipped and registered in ``liftOver.loc``
    through ``galaxy.update_loc_file``.

    The liftOver binaries themselves are not installed here.
    """
    chain_dir = os.path.join(env.data_files, "liftOver")
    if not os.path.exists(chain_dir):
        subprocess.check_call("mkdir %s" % chain_dir, shell=True)
    url_template = "ftp://hgdownload.cse.ucsc.edu/goldenPath/%s/liftOver/%s"
    name_template = "%sTo%s.over.chain.gz"
    for source in lift_over_genomes:
        targets = [g for g in lift_over_genomes if g != source]
        for target in targets:
            # Chain file names capitalize the first letter of the target build
            target_cap = target[0].upper() + target[1:]
            chain_gz = name_template % (source, target_cap)
            chain_name = os.path.splitext(chain_gz)[0]
            fetched = None
            with shared.chdir(chain_dir):
                if not os.path.exists(chain_name):
                    # Not every genome pair has a chain file upstream, so a
                    # failed fetch is tolerated and simply skipped.
                    fetched = shared._remote_fetch(
                        env, "%s" % (url_template % (source, chain_gz)), allow_fail=True)
                    if fetched:
                        subprocess.check_call("gunzip %s" % fetched, shell=True)
            if fetched:
                loc_entry = [source, target, os.path.join(chain_dir, chain_name)]
                galaxy.update_loc_file(env, "liftOver.loc", loc_entry)
def _index_w_command(env, dir_name, command, ref_file, pre=None, post=None, ext=None):
    """Run an indexing command inside a freshly created index directory.

    ``command`` is a ``str.format`` template receiving ``ref_file`` (the
    reference path as seen from inside ``dir_name``) and ``index_name`` (the
    reference base name without extension, plus ``ext`` when given). The
    value of ``_get_path_export(env)`` is prepended to the command.

    ``pre``, if given, is called with the reference path before indexing and
    its return value replaces that path; ``post``, if given, is called with
    the (possibly replaced) path afterwards.

    Indexing only happens when ``dir_name`` does not exist yet.

    Returns ``dir_name`` joined with the index name.
    """
    path_export = _get_path_export(env)
    stem = os.path.splitext(os.path.basename(ref_file))[0]
    index_name = stem if ext is None else stem + ext
    index_path = os.path.join(dir_name, index_name)
    # The command runs from within dir_name, so step back up one level
    full_ref_path = os.path.join(os.pardir, ref_file)
    if os.path.exists(dir_name):
        return index_path
    subprocess.check_call("mkdir %s" % dir_name, shell=True)
    with shared.chdir(dir_name):
        if pre:
            full_ref_path = pre(full_ref_path)
        index_cmd = command.format(ref_file=full_ref_path, index_name=index_name)
        subprocess.check_call(path_export + index_cmd, shell=True)
        if post:
            post(full_ref_path)
    return index_path
def _tar_directory(dir, tar_name): """Create a tarball of the directory. """ base_dir, tar_dir = os.path.split(dir) tarball = os.path.join(base_dir, "%s.tar.xz" % tar_name) if not os.path.exists(tarball): with shared.chdir(base_dir): subprocess.check_call("tar -cvpf - %s | xz -zc - > %s" % (tar_dir, os.path.basename(tarball)), shell=True) return tarball
def _index_to_galaxy(env, work_dir, ref_file, gid, genome_indexes, config):
    """Build the requested indexes and hand their locations to Galaxy.

    Every entry of ``genome_indexes`` is resolved to an index function via
    ``get_index_fn`` and run from inside ``work_dir``. Index functions that
    return a falsy value are skipped; the others are recorded with their
    path joined onto ``work_dir``. The collected mapping is passed to
    ``galaxy.prep_locs``.
    """
    built = {}
    with shared.chdir(work_dir):
        for index_type in genome_indexes:
            build_index = get_index_fn(index_type)
            produced = build_index(env, ref_file)
            if not produced:
                continue
            built[index_type] = os.path.join(work_dir, produced)
    galaxy.prep_locs(env, gid, built, config)
def _index_bismark(env, ref_file):
    """Prepare a Bismark bisulfite genome in the ``bismark`` subdirectory.

    The reference FASTA is symlinked into ``bismark`` and
    ``bismark_genome_preparation`` is run on that directory.

    ``env`` is accepted for signature parity with the other index functions
    but is not used here.

    Returns the relative path ``bismark/Bisulfite_Genome``.
    """
    dir_name = "bismark"
    subprocess.check_call("mkdir -p %s" % dir_name, shell=True)
    # The link lives one level down, so a relative ref_file must be reached
    # through the parent directory (as _index_bwa does). os.path.join leaves
    # an absolute ref_file untouched.
    link_target = os.path.join(os.pardir, ref_file)
    with shared.chdir(dir_name):
        local = os.path.basename(ref_file)
        subprocess.check_call("ln -sf {0} {1}".format(link_target, local), shell=True)
        subprocess.check_call("bismark_genome_preparation .", shell=True)
    return os.path.join(dir_name, "Bisulfite_Genome")
def _index_blast_db(work_dir, base_file, db_type):
    """Index a database using blast+ for similarity searching.

    ``db_type`` is ``"prot"`` or ``"nucl"``; any other value raises
    ``KeyError``. ``makeblastdb`` is run inside ``work_dir`` unless a file
    with one of the known index extensions for that type already exists.
    The database name is ``base_file`` without its extension.
    """
    type_to_ext = dict(prot=("phr", "pal"), nucl=("nhr", "nal"))
    db_name = os.path.splitext(base_file)[0]
    with shared.chdir(work_dir):
        already_indexed = any(
            os.path.exists("%s.%s" % (db_name, ext)) for ext in type_to_ext[db_type])
        if not already_indexed:
            subprocess.check_call("makeblastdb -in %s -dbtype %s -out %s" %
                                  (base_file, db_type, db_name), shell=True)
def _index_bwa(env, ref_file):
    """Build a BWA index for ``ref_file`` inside the ``bwa`` subdirectory.

    Nothing is done when ``bwa/<fasta>.bwt`` already exists. Otherwise the
    reference is symlinked into ``bwa``, indexed, and the symlink removed
    again.

    ``env`` is accepted for signature parity with the other index functions
    but is not used here.

    Returns ``bwa/<fasta>``.
    """
    bwa_dir = "bwa"
    fasta_name = os.path.basename(ref_file)
    index_prefix = os.path.join(bwa_dir, fasta_name)
    if os.path.exists(os.path.join(bwa_dir, "%s.bwt" % fasta_name)):
        return index_prefix
    subprocess.check_call("mkdir -p %s" % bwa_dir, shell=True)
    with shared.chdir(bwa_dir):
        parent_ref = os.path.join(os.pardir, ref_file)
        subprocess.check_call("ln -sf %s" % parent_ref, shell=True)
        try:
            subprocess.check_call("bwa index -a bwtsw %s" % fasta_name, shell=True)
        except subprocess.CalledProcessError:
            # bwtsw indexing fails on small files; retry with the default algorithm
            subprocess.check_call("bwa index %s" % fasta_name, shell=True)
        subprocess.check_call("rm -f %s" % fasta_name, shell=True)
    return index_prefix
def finalize(genomes, data_filedir):
    """Symlink reference FASTA files into the bowtie index directories.

    For every genome and each of the ``bowtie`` and ``bowtie2`` directories
    that exist under ``<data_filedir>/genomes/<orgname>/<gid>``, links to
    ``../seq/<gid>.fa`` and ``../seq/<gid>.fa.fai`` are created when the
    source exists and no file of that name is present yet. This lets tophat
    reuse the reference instead of regenerating FASTA genomes.
    """
    genomes_root = os.path.join(data_filedir, "genomes")
    for orgname, gid, manager in genomes:
        build_dir = os.path.join(genomes_root, orgname, gid)
        for aligner in ("bowtie", "bowtie2"):
            aligner_dir = os.path.join(build_dir, aligner)
            if not os.path.exists(aligner_dir):
                continue
            with shared.chdir(aligner_dir):
                for suffix in ("", ".fai"):
                    seq_name = "%s.fa%s" % (gid, suffix)
                    source = os.path.join(os.pardir, "seq", seq_name)
                    if not os.path.exists(source):
                        continue
                    if os.path.exists(os.path.basename(source)):
                        continue
                    subprocess.check_call("ln -sf %s" % source, shell=True)
def _clean_directory(dir, gid):
    """Clean duplicate files from directories before tar and upload.

    Removes the ``<gid>.fa`` softlinks from the ``bowtie`` and ``maq``
    subdirectories of ``dir``, then deletes every ``*.gz`` and ``*.zip``
    file found below ``<dir>/seq``.
    """
    # get rid of softlinks; lexists is used so that links whose target is
    # already gone are removed as well (os.path.exists follows the link)
    for aligner in ("bowtie", "maq"):
        to_remove = os.path.join(dir, aligner, "%s.fa" % gid)
        if os.path.lexists(to_remove):
            subprocess.check_call("rm -f %s" % to_remove, shell=True)
    # remove any downloaded original sequence files
    remove_exts = ["*.gz", "*.zip"]
    with shared.chdir(os.path.join(dir, "seq")):
        for rext in remove_exts:
            fnames = subprocess.check_output("find . -name '%s'" % rext, shell=True).decode()
            for fname in (f.strip() for f in fnames.split("\n") if f.strip()):
                subprocess.check_call("rm -f %s" % fname, shell=True)
def _data_ngs_genomes(env, genomes, genome_indexes):
    """Download each genome and build its index files.

    ``genomes`` yields ``(organism, genome, manager)`` triples. Each genome
    is placed in ``<genome_dir>/<organism>/<genome>``, where ``genome_dir``
    comes from ``_make_genome_dir(env.data_files)``. When
    ``env.remove_old_genomes`` is set and truthy, ``_clean_genome_directory``
    is called first. The indexes built are those listed under ``"indexes"``
    in the manager's config, falling back to ``genome_indexes``.
    """
    genome_dir = _make_genome_dir(env.data_files)
    for organism, genome, manager in genomes:
        target_dir = os.path.join(genome_dir, organism, genome)
        print("Processing genome {0} and putting it to {1}".format(organism, target_dir))
        if not os.path.exists(target_dir):
            subprocess.check_call('mkdir -p %s' % target_dir, shell=True)
        with shared.chdir(target_dir):
            if getattr(env, "remove_old_genomes", False):
                _clean_genome_directory()
            seq_dir = 'seq'
            downloaded, base_zips = manager.download(seq_dir)
            ref_file = _move_seq_files(downloaded, base_zips, seq_dir)
        wanted_indexes = manager.config.get("indexes", genome_indexes)
        _index_to_galaxy(env, target_dir, ref_file, genome, wanted_indexes, manager.config)
def _index_w_command(env, dir_name, command, ref_file, pre=None, post=None, ext=None):
    """Low-level helper: build an index by running ``command`` in ``dir_name``.

    ``command`` is formatted with ``ref_file`` (the reference path relative
    to ``dir_name``) and ``index_name`` (the reference base name without
    extension, plus ``ext`` when given), and is prefixed with the result of
    ``_get_path_export(env)``.

    ``pre`` may rewrite the reference path before the command runs; ``post``
    is called with the final reference path afterwards.

    The directory is created and the command run only when ``dir_name`` does
    not exist yet.

    Returns the index path ``dir_name/index_name``.
    """
    exports = _get_path_export(env)
    base_name = os.path.basename(ref_file)
    index_name = os.path.splitext(base_name)[0]
    if ext is not None:
        index_name = index_name + ext
    # Path to the reference as seen from inside dir_name
    ref_from_index_dir = os.path.join(os.pardir, ref_file)
    needs_build = not os.path.exists(dir_name)
    if needs_build:
        subprocess.check_call("mkdir %s" % dir_name, shell=True)
        with shared.chdir(dir_name):
            if pre:
                ref_from_index_dir = pre(ref_from_index_dir)
            formatted = command.format(ref_file=ref_from_index_dir, index_name=index_name)
            subprocess.check_call(exports + formatted, shell=True)
            if post:
                post(ref_from_index_dir)
    return os.path.join(dir_name, index_name)
def finalize(genomes, data_filedir):
    """Link reference sequences back into the bowtie/bowtie2 directories.

    Tophat looks for the FASTA next to the index; providing symlinks avoids
    it regenerating FASTA genomes. For each genome, any existing ``bowtie``
    or ``bowtie2`` directory under ``<data_filedir>/genomes/<orgname>/<gid>``
    receives links to ``../seq/<gid>.fa`` and ``../seq/<gid>.fa.fai``,
    provided the source exists and the link name is still free.
    """
    base = os.path.join(data_filedir, "genomes")
    link_suffixes = ["", ".fai"]
    for (orgname, gid, manager) in genomes:
        organism_dir = os.path.join(base, orgname)
        candidate_dirs = [os.path.join(organism_dir, gid, name)
                          for name in ["bowtie", "bowtie2"]]
        for index_dir in candidate_dirs:
            if os.path.exists(index_dir):
                with shared.chdir(index_dir):
                    for suffix in link_suffixes:
                        target = os.path.join(os.pardir, "seq", "%s.fa%s" % (gid, suffix))
                        link_name = os.path.basename(target)
                        if os.path.exists(target) and not os.path.exists(link_name):
                            subprocess.check_call("ln -sf %s" % target, shell=True)
def _download_genomes(env, genomes, genome_indexes):
    """Fetch prepared genome indexes and register them with Galaxy.

    ``genomes`` yields ``(orgname, gid, manager)`` triples. For every entry
    of ``genome_indexes`` that is not already present in the genome's
    directory, ``_download_s3_index`` is called from inside that directory.
    The reference FASTA is then looked up as ``seq/<gid>.fa``, falling back
    to ``seq/<manager._name>.fa``; an ``AssertionError`` carrying the path
    is raised when neither exists. The indexes passed on to
    ``_index_to_galaxy`` are the manager config's ``"indexes"`` entry,
    falling back to ``genome_indexes``.
    """
    genome_dir = _make_genome_dir(env.data_files)
    for orgname, gid, manager in genomes:
        build_dir = os.path.join(genome_dir, orgname, gid)
        if not os.path.exists(build_dir):
            subprocess.check_call('mkdir -p %s' % build_dir, shell=True)
        for index_name in genome_indexes:
            with shared.chdir(build_dir):
                if os.path.exists(index_name):
                    continue
                _download_s3_index(env, manager, gid, index_name)
        seq_dir = os.path.join(build_dir, "seq")
        ref_file = os.path.join(seq_dir, "%s.fa" % gid)
        if not os.path.exists(ref_file):
            ref_file = os.path.join(seq_dir, "%s.fa" % manager._name)
        assert os.path.exists(ref_file), ref_file
        wanted_indexes = manager.config.get("indexes", genome_indexes)
        _index_to_galaxy(env, build_dir, ref_file, gid, wanted_indexes, manager.config)
def _prep_genomes(env, genomes, genome_indexes, retrieve_fns, data_filedir):
    """Prepare genomes and their indexes, trying several retrieval methods.

    ``retrieve_fns`` is a sequence of ``(method, retrieve_fn)`` pairs tried
    in order for each index until one succeeds. GGD recipes (the config's
    ``"annotations"`` and ``"validation"`` entries, plus any ``"indexes"``
    entries also in ``genome_indexes``) are always retrieved; plain indexes
    only when no path of that name exists in the genome directory.

    A failure of the ``"ggd"`` method on a GGD recipe is re-raised
    immediately, as is ``KeyboardInterrupt``. Other failures are printed and
    the next method is tried. ``IOError`` is raised when every method
    failed. An ``AssertionError`` carrying the path is raised when no
    reference FASTA is found under ``seq`` afterwards.
    """
    genome_dir = _make_genome_dir(data_filedir)
    for orgname, gid, manager in genomes:
        org_dir = os.path.join(genome_dir, orgname, gid)
        if not os.path.exists(org_dir):
            subprocess.check_call('mkdir -p %s' % org_dir, shell=True)
        cfg = manager.config
        ggd_recipes = cfg.get("annotations", []) + cfg.get("validation", [])
        ggd_recipes += [x for x in cfg.get("indexes", []) if x in genome_indexes]
        for idx in genome_indexes + ggd_recipes:
            with shared.chdir(org_dir):
                is_recipe = idx in ggd_recipes
                if not is_recipe and os.path.exists(idx):
                    continue
                last_exc = None
                for method, retrieve_fn in retrieve_fns:
                    try:
                        retrieve_fn(env, manager, gid, idx)
                    except KeyboardInterrupt:
                        raise
                    except BaseException as e:
                        # Fail on incorrect GGD recipes
                        if is_recipe and method == "ggd":
                            raise
                        last_exc = traceback.format_exc()
                        print("Moving on to next genome prep method after trying {0}\n{1}"
                              .format(method, str(e)))
                    else:
                        break
                else:
                    # Reached only when no retrieval method succeeded
                    raise IOError("Could not prepare index {0} for {1} by any method\n{2}"
                                  .format(idx, gid, last_exc))
        seq_dir = os.path.join(org_dir, "seq")
        ref_file = os.path.join(seq_dir, "%s.fa" % gid)
        if not os.path.exists(ref_file):
            ref_file = os.path.join(seq_dir, "%s.fa" % manager._name)
        assert os.path.exists(ref_file), ref_file
        _index_to_galaxy(env, org_dir, ref_file, gid, genome_indexes, manager.config)
def update_loc_file(env, ref_file, line_parts):
    """Append a tab-separated entry to a Galaxy ``.loc`` file if it is missing.

    Does nothing when ``env`` has no ``galaxy_home`` or it is ``None``.
    Otherwise the ``tool-data`` directory under ``env.galaxy_home`` is
    created when absent (copying ``env.tool_data_table_conf_file`` to
    ``tool_data_table_conf.xml`` if that file is missing too), ``ref_file``
    is created inside it when needed, and ``line_parts`` joined by tabs is
    appended unless an identical line (ignoring surrounding whitespace) is
    already there.
    """
    if getattr(env, "galaxy_home", None) is None:
        return
    tools_dir = os.path.join(env.galaxy_home, "tool-data")
    if not os.path.exists(tools_dir):
        subprocess.check_call("mkdir -p %s" % tools_dir, shell=True)
        dt_file = os.path.join(env.galaxy_home, "tool_data_table_conf.xml")
        if not os.path.exists(dt_file):
            shutil.copy(env.tool_data_table_conf_file, dt_file)
    new_entry = "\t".join(line_parts)
    with shared.chdir(tools_dir):
        if not os.path.exists(ref_file):
            subprocess.check_call("touch %s" % ref_file, shell=True)
        wanted = new_entry.strip()
        with open(ref_file) as in_handle:
            already_listed = any(line.strip() == wanted for line in in_handle)
        if not already_listed:
            with open(ref_file, "a") as out_handle:
                out_handle.write(new_entry + "\n")
def _prep_genomes(env, genomes, genome_indexes, retrieve_fns, data_filedir):
    """Prepare genomes with the given indexes using the first method that works.

    For each ``(orgname, gid, manager)`` in ``genomes`` the genome directory
    is created if needed, and every index plus every GGD recipe is
    retrieved by walking ``retrieve_fns`` (``(method, retrieve_fn)`` pairs)
    until one call succeeds. GGD recipes are the manager config's
    ``"annotations"`` and ``"validation"`` entries together with those
    ``"indexes"`` entries that also appear in ``genome_indexes``; they are
    always retrieved, whereas plain indexes are skipped when a path of that
    name already exists.

    ``KeyboardInterrupt`` and a failing ``"ggd"`` method on a GGD recipe
    propagate unchanged. Any other failure is printed and the next method is
    tried; ``IOError`` is raised when none succeeded. An ``AssertionError``
    carrying the path is raised when the reference FASTA cannot be found.
    """
    genome_dir = _make_genome_dir(data_filedir)
    for (orgname, gid, manager) in genomes:
        org_dir = os.path.join(genome_dir, orgname, gid)
        if not os.path.exists(org_dir):
            subprocess.check_call('mkdir -p %s' % org_dir, shell=True)
        annotations = manager.config.get("annotations", [])
        validation = manager.config.get("validation", [])
        ggd_recipes = annotations + validation
        ggd_recipes += [x for x in manager.config.get("indexes", [])
                        if x in genome_indexes]
        for idx in genome_indexes + ggd_recipes:
            with shared.chdir(org_dir):
                if idx in ggd_recipes or not os.path.exists(idx):
                    last_exc = None
                    for method, retrieve_fn in retrieve_fns:
                        try:
                            retrieve_fn(env, manager, gid, idx)
                        except KeyboardInterrupt:
                            raise
                        except BaseException as e:
                            # Fail on incorrect GGD recipes
                            if idx in ggd_recipes and method == "ggd":
                                raise
                            last_exc = traceback.format_exc()
                            message = "Moving on to next genome prep method after trying {0}\n{1}"
                            print(message.format(method, str(e)))
                        else:
                            break
                    else:
                        # No method broke out of the loop, so nothing succeeded
                        failure = "Could not prepare index {0} for {1} by any method\n{2}"
                        raise IOError(failure.format(idx, gid, last_exc))
        ref_file = os.path.join(org_dir, "seq", "%s.fa" % gid)
        if not os.path.exists(ref_file):
            ref_file = os.path.join(org_dir, "seq", "%s.fa" % manager._name)
        assert os.path.exists(ref_file), ref_file
        _index_to_galaxy(env, org_dir, ref_file, gid, genome_indexes, manager.config)