def _propagate(index_dir, threads):
    """Run k-mer propagation through the Makefile in the index directory.

    ``make`` is first invoked as a silent dry run (``-n -s``) so that missing
    inputs are reported early; afterwards the real build is started.

    Args:
        index_dir (str): Index directory.
        threads (int): Number of threads for Makefile.
    """
    pro.message('Running k-mer propagation')

    prop_dir = os.path.join(index_dir, 'propagation')
    makefile_fn = os.path.join(prop_dir, 'Makefile')
    pro.test_files(makefile_fn, test_nonzero=True)

    # Dry run: test if input files for propagation exist.
    dry_run_cmd = ['make', '-C', prop_dir, '-n', '-s', '>', '/dev/null']
    pro.run_safe(
        dry_run_cmd,
        err_msg="Some FASTA files needed for k-mer propagation are probably missing, see the messages above.",
        thr_exc=False,
        silent=True,
    )

    # Real run of the propagation.
    build_cmd = ['make', '-j', threads, '-C', prop_dir, 'V=1']
    pro.run_safe(
        build_cmd,
        err_msg="K-mer propagation has not been finished because of an error. See messages above.",
        thr_exc=False,
    )
def prophyle_decompress(archive, output_dir, klcp):
    """Decompress an archived index and reconstruct the remaining index files.

    The archive is checked for every entry of ``FILES_TO_ARCHIVE`` (looked up
    under the first member name of the archive), extracted with ``tar`` into
    ``output_dir``, and ``prophyle index`` is then run on the extracted
    directory.

    Args:
        archive (str): Archive with the index core files.
        output_dir (str): Directory into which the archive is extracted.
        klcp (bool): If true, run ``prophyle index -k <k from index config>``;
            otherwise run ``prophyle index -K``.
    """
    pro.test_files(archive)

    _compile_prophyle_bin(parallel=True)

    with tarfile.open(archive) as tar:
        names = tar.getnames()

    # Report problems through pro.error rather than `assert`, which is
    # stripped when Python runs with -O.
    if not names:
        pro.error("Archive '{}' is empty".format(archive))
    index_name = names[0]
    members = set(names)
    for x in FILES_TO_ARCHIVE:
        if os.path.join(index_name, x) not in members:
            pro.error("File '{}' is missing in the archive".format(x))

    index_dir = os.path.join(output_dir, index_name)

    pro.message("Decompressing index core files")
    pro.run_safe(["tar", "xvf", archive, "-C", output_dir])
    pro.message("Core files have been decompressed, reconstructing the index")

    # Empty placeholders are created before `prophyle index` is invoked.
    pro.touch(os.path.join(index_dir, "index.fa"))
    pro.touch(os.path.join(index_dir, "index.fa.pac"))

    tree_fn = os.path.join(index_dir, "tree.nw")
    if klcp:
        config = pro.load_index_config(index_dir)
        cmd = [PROPHYLE, "index", "-k", config['k'], tree_fn, index_dir]
    else:
        cmd = [PROPHYLE, "index", "-K", tree_fn, index_dir]

    pro.run_safe(cmd)
    pro.message("Index reconstruction finished")
def _pseudo_fai(d):
    """Generate a pseudofai file for a directory (directory => directory.pseudofai).

    Pseudofai format = TSV with two columns: filename, sequence header
    (text after > in FASTA). It is produced by running ``grep -H ">"`` on the
    ``*.fa``, ``*.ffn`` and ``*.fna`` files found below the directory and
    replacing the ``:>`` separator of the grep output with a tab.

    Args:
        d (str): Directory (must not end with a slash).
    """
    # NOTE(review): dirname() yields the parent directory, not the library
    # name; kept as in the original, confirm whether basename() was intended.
    library_label = os.path.dirname(d)
    pseudofai_fn = d + ".pseudofai"
    pro.makedirs(d)

    if _is_complete(d, 2) and os.path.isfile(pseudofai_fn):
        pro.message("Skipping generating pseudofai for library '{}' (already exists)".format(library_label))
        return

    pro.message("Generating pseudofai for library '{}'".format(library_label))
    assert d[-1] != "/"

    # The -name tests are grouped with \( ... \): without the grouping, -exec
    # binds only to the last -name test and *.fa / *.ffn files are skipped.
    # The sed script keeps a literal backslash before ':' and a real tab.
    cmd = [
        'find', d,
        '\\(', '-name', "'*.fa'", '-o', '-name', "'*.ffn'", '-o', '-name', "'*.fna'", '\\)',
        '-exec', 'grep', '-H', '">"', '{}', '\\;',
        '|', 'sed', '"s/\\:>/\t/"',
    ]
    pro.run_safe(cmd, output_fn=pseudofai_fn)
    _mark_complete(d, 2)
def _propagation_preprocessing(in_trees, out_tree, no_prefixes, sampling_rate, autocomplete):
    """Merge input trees into a single tree.

    Args:
        in_trees (list of str): Input NHX trees (possibly with a root specifier).
        out_tree (str): Output NHX tree.
        no_prefixes (bool): Don't prepend prefixes to node names during tree merging
            (adds ``-P`` to the command).
        sampling_rate (float): Sampling rate for subsampling the tree or None for no subsampling.
        autocomplete (bool): Adds ``-A`` to the preprocessing command.
    """
    pro.message('Generating index tree')

    # Existence of the input trees is already checked; no test_files() here.
    sampling_opts = [] if sampling_rate is None else ['-s', sampling_rate]

    trailing_flags = []
    if no_prefixes:
        trailing_flags.append('-P')
    if autocomplete:
        trailing_flags.append('-A')

    command = [PROPAGATION_PREPROCESSING] + sampling_opts + in_trees + [out_tree] + trailing_flags
    pro.run_safe(
        command,
        err_msg="The main tree could not be generated.",
        thr_exc=False,
    )
    _log_file_md5(out_tree)
def _propagation_postprocessing(index_dir, in_tree_fn, out_tree_fn):
    """Merge reduced FASTA files after k-mer propagation and create index.fa.

    First all ``*.tsv`` files of the propagation directory are concatenated
    into ``index.fa.kmers.tsv``; then the post-processing program is run.

    Args:
        index_dir (str): Index directory.
        in_tree_fn (str): Input tree in Newick/NHX.
        out_tree_fn (str): Output tree in Newick/NHX.
    """
    pro.message('Propagation post-processing')

    prop_dir = os.path.join(index_dir, 'propagation')
    index_fa = os.path.join(index_dir, "index.fa")
    tsv_fn = os.path.join(index_dir, "index.fa.kmers.tsv")

    # Step 1: k-mer statistics.
    merge_cmd = ["cat", os.path.join(prop_dir, "*.tsv"), '>', tsv_fn]
    pro.run_safe(
        merge_cmd,
        err_msg="K-mer statistics could not be created.",
        thr_exc=True,
    )

    # Step 2: main FASTA file and output tree.
    post_cmd = [PROPAGATION_POSTPROCESSING, prop_dir, index_fa, in_tree_fn, tsv_fn, out_tree_fn]
    pro.run_safe(
        post_cmd,
        err_msg="Main ProPhyle FASTA file could not be generated",
        thr_exc=True,
    )

    pro.touch(index_fa + ".complete")
    for fn in (index_fa, in_tree_fn, out_tree_fn):
        _log_file_md5(fn)
def prophyle_compress(index_dir, archive):
    """Pack the core files of an index into a gzipped tar archive.

    All entries of ``FILES_TO_ARCHIVE`` are copied into a temporary directory
    named after the index directory; ``index.fa.bwt`` is not copied but
    written there by ``debwtupdate``. The temporary directory is then added to
    the archive under the index directory name and removed afterwards.

    Args:
        index_dir (str): Index directory.
        archive (str): Output archive (written as ``tar.gz``).
    """
    import shutil  # local import: needed only for the temporary-directory cleanup

    _compile_prophyle_bin(parallel=True)

    tmp_dir = tempfile.mkdtemp()
    try:
        arcdir = index_dir.rstrip("/").split("/")[-1]
        tmp_arc_dir = os.path.join(tmp_dir, arcdir)

        # todo: should create a correct directory
        pro.message("Creating a temporary directory for files to compress")
        pro.makedirs(tmp_arc_dir)

        for x in FILES_TO_ARCHIVE:
            if x == "index.fa.bwt":
                # produced below by debwtupdate instead of a plain copy
                continue
            pro.cp_to_dir(os.path.join(index_dir, x), tmp_arc_dir)

        bwt_fn_1 = os.path.join(index_dir, "index.fa.bwt")
        bwt_fn_2 = os.path.join(tmp_arc_dir, "index.fa.bwt")
        pro.run_safe([IND, "debwtupdate", bwt_fn_1, bwt_fn_2])

        pro.message("Creating '{}'".format(archive))
        with tarfile.open(archive, "w:gz") as tar:
            tar.add(tmp_arc_dir, arcname=arcdir)
    finally:
        # mkdtemp() never cleans up after itself; remove the staging area
        # on success as well as on failure.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    pro.message("File '{}' has been created".format(archive))
def prophyle_analyze(index_dir, out_prefix, input_fns, stats, in_format):
    """Run the analysis program on the given input files.

    Args:
        index_dir (str): Index directory.
        out_prefix (str): Prefix passed to the analysis program.
        input_fns (list of str): Input files; ``"-"`` entries are not tested for existence.
        stats (str): Value passed with ``-s``.
        in_format (str): Value passed with ``-f``, or None to omit the option.
    """
    cmd_analyze = [ANALYZE, '-s', stats, index_dir, out_prefix] + input_fns
    if in_format is not None:
        cmd_analyze.extend(['-f', in_format])

    # Only real files are checked; "-" is skipped.
    files_to_test = [fn for fn in input_fns if fn != "-"]
    pro.test_files(*files_to_test, test_nonzero=True)

    pro.run_safe(cmd_analyze)
def _remove_tmp_propagation_files(index_dir):
    """Remove temporary propagation files by running ``make clean``.

    Args:
        index_dir (str): Index directory.
    """
    pro.message('Removing temporary files')
    prop_dir = os.path.join(index_dir, 'propagation')
    clean_cmd = ['make', '-C', prop_dir, 'clean', '>', '/dev/null']
    pro.run_safe(clean_cmd)
def prophyle_decompress(archive, output_dir, klcp):
    """Decompress an archived index and reconstruct the remaining index files.

    Nothing is extracted when all marker files ``.complete.1`` to
    ``.complete.6`` already exist in the target index directory.

    Args:
        archive (str): Archive with the index core files.
        output_dir (str): Existing directory into which the archive is extracted.
        klcp (bool): If true, run ``prophyle index -k <k from index config>``;
            otherwise run ``prophyle index -K``.
    """
    pro.test_files(archive)

    if not os.path.isdir(output_dir):
        pro.error("Directory '{}' does not exist.".format(output_dir))

    # Compiled once here; the original repeated this call before extraction.
    _compile_prophyle_bin(parallel=True)

    with tarfile.open(archive) as tar:
        names = tar.getnames()

    if not names:
        pro.error("Archive '{}' is empty".format(archive))
    index_name = names[0]
    members = set(names)
    for x in FILES_TO_ARCHIVE:
        if os.path.join(index_name, x) not in members:
            pro.error("File '{}' is missing in the archive".format(x))

    index_dir = os.path.join(output_dir, index_name)

    index_exists = all(
        os.path.isfile(os.path.join(index_dir, ".complete.{}".format(i))) for i in range(1, 7)
    )
    if index_exists:
        pro.message("Index already exists")
        return

    pro.message("Decompressing core index files")
    pro.run_safe(["tar", "xvf", archive, "-C", output_dir])

    # NOTE(review): the step-4 marker is dropped after extraction, presumably
    # so that `prophyle index` redoes that step -- confirm.
    pro.rm(os.path.join(index_dir, ".complete.4"))

    pro.message("Reconstructing the index")
    pro.touch(os.path.join(index_dir, "index.fa"))
    pro.touch(os.path.join(index_dir, "index.fa.pac"))

    tree_fn = os.path.join(index_dir, "tree.nw")
    if klcp:
        config = pro.load_index_config(index_dir)
        cmd = [PROPHYLE, "index", "-k", config['k'], tree_fn, index_dir]
    else:
        cmd = [PROPHYLE, "index", "-K", tree_fn, index_dir]

    pro.run_safe(cmd)
    pro.message("Index reconstruction finished")
def parse_rpt(library, library_dir):
    """Run the RPT parser on a library and write ``<library>_taxamap.tsv``.

    Args:
        library (str): Library name from ``LIBRARIES``, or "all" to process each of them.
        library_dir (str): Directory containing the library subdirectories.
    """
    if library == "all":
        for lib in LIBRARIES:
            parse_rpt(lib, library_dir)
        return

    assert library in LIBRARIES
    taxamap_fn = library + '_taxamap.tsv'
    source_dir = os.path.join(library_dir, library)
    pro.run_safe([RPT_PARSER, source_dir, '>', taxamap_fn])
def fasta_idx(library, library_dir):
    """Index all ``*.fna`` files of a library with ``samtools faidx``.

    The files are located with ``find`` and piped to ``parallel``.

    Args:
        library (str): Library name from ``LIBRARIES``, or "all" to process each of them.
        library_dir (str): Directory containing the library subdirectories.
    """
    if library == "all":
        for lib in LIBRARIES:
            fasta_idx(lib, library_dir)
        return

    assert library in LIBRARIES
    # The original was missing a comma after '|', producing the single token
    # '|parallel'. The glob is quoted so the shell hands it to find unexpanded.
    cmd = [
        'find', os.path.join(library_dir, library), '-name', "'*.fna'",
        '|', 'parallel', '--no-notice', '--verbose', 'samtools', 'faidx', '{}',
    ]
    pro.run_safe(cmd)
def build_tree(library, library_dir):
    """Run the tree builder for a library.

    The output tree is ``<library>.nw``, the taxa map is read from
    ``<library>_taxamap.tsv`` and the log goes to ``<library>.log``.

    Args:
        library (str): Library name from ``LIBRARIES``, or "all" to process each of them.
        library_dir (str): Directory containing the library subdirectories.
    """
    if library == "all":
        for lib in LIBRARIES:
            build_tree(lib, library_dir)
        return

    assert library in LIBRARIES

    # Plasmids are rooted at "Bacteria"; other libraries at their title-cased name.
    if library == 'plasmids':
        root = "Bacteria"
    else:
        root = library.title()

    tree_cmd = [TREE_BUILDER, library, library_dir, library + '.nw', library + '_taxamap.tsv']
    tree_cmd += ['-l', library + '.log', '-u', root]
    pro.run_safe(tree_cmd)
def _pac2bwt(fa_fn):
    """Run `bwa pac2bwtgen` (2bit => BWT).

    Args:
        fa_fn (str): FASTA file.
    """
    pac_fn = fa_fn + ".pac"
    bwt_fn = fa_fn + ".bwt"

    pro.test_files(BWA, pac_fn)
    pro.run_safe(
        [BWA, 'pac2bwtgen', pac_fn, bwt_fn],
        err_msg="Burrows-Wheeler Transform could not be computed.",
        thr_exc=True,
    )
    _log_file_md5(bwt_fn, remark="without OCC")
def _kmer_stats(index_dir):
    """Create a file with k-mer statistics.

    Concatenates ``propagation/*.count.tsv``, drops lines starting with ``#``,
    sorts and deduplicates the rest, and writes ``index.fa.kmers.tsv``.

    Args:
        index_dir (str): Index directory.
    """
    prop_dir = os.path.join(index_dir, 'propagation')
    stats_fn = os.path.join(index_dir, "index.fa.kmers.tsv")

    pipeline = ["cat", prop_dir + "/*.count.tsv"]
    pipeline += ["|", "grep", "-v", "^#"]
    pipeline += ["|", "sort"]
    pipeline += ["|", "uniq"]
    pipeline += [">", stats_fn]

    pro.run_safe(
        pipeline,
        err_msg="A file with k-mer statistics could not be created.",
        thr_exc=False,
    )
def _bwt2bwtocc(fa_fn):
    """Run `bwa bwtupdate` (BWT => BWT+OCC).

    Args:
        fa_fn (str): FASTA file.
    """
    bwt_fn = fa_fn + ".bwt"

    pro.test_files(BWA, bwt_fn)
    pro.run_safe(
        [BWA, 'bwtupdate', bwt_fn],
        err_msg="OCC array could not be computed.",
        thr_exc=True,
    )
    _log_file_md5(bwt_fn, remark="with OCC")
def _bwtocc2sa(fa_fn):
    """Run `bwa bwt2sa` (BWT+OCC => SSA).

    Args:
        fa_fn (str): FASTA file.
    """
    bwt_fn = fa_fn + ".bwt"
    sa_fn = fa_fn + ".sa"

    pro.test_files(BWA, bwt_fn)
    pro.run_safe(
        [BWA, 'bwt2sa', bwt_fn, sa_fn],
        err_msg="Sampled Suffix Array computation failed.",
        thr_exc=True,
    )
    _log_file_md5(sa_fn)
def _fa2pac(fa_fn):
    """Run `bwa fa2pac` (FA => 2bit).

    Args:
        fa_fn (str): FASTA file; also passed as the second argument of ``fa2pac``.
    """
    pro.test_files(BWA, fa_fn)
    pack_cmd = [BWA, 'fa2pac', fa_fn, fa_fn]
    pro.run_safe(
        pack_cmd,
        err_msg="Packaged file could not be created.",
        thr_exc=True,
    )
    _log_file_md5(fa_fn + ".pac")
def _bwtocc2klcp(fa_fn, k):
    """Create the k-LCP array with the index program's ``build -k`` (BWT => k-LCP).

    Args:
        fa_fn (str): FASTA file.
        k (int): K-mer size.
    """
    pro.test_files(IND, fa_fn + ".bwt")
    build_cmd = [IND, 'build', '-k', k, fa_fn]
    pro.run_safe(
        build_cmd,
        err_msg="k-Longest Common Prefix array construction failed.",
        thr_exc=True,
    )
    klcp_fn = "{}.{}.klcp".format(fa_fn, k)
    _log_file_md5(klcp_fn)
def _bwtocc2sa_klcp(fa_fn, k):
    """Create the k-LCP array and the sampled SA with ``build -s -k`` (BWT => k-LCP, SA).

    Args:
        fa_fn (str): FASTA file.
        k (int): K-mer size.
    """
    pro.message('Generating k-LCP array and SA in parallel')
    pro.test_files(IND, fa_fn + ".bwt")

    build_cmd = [IND, 'build', '-s', '-k', k, fa_fn]
    pro.run_safe(
        build_cmd,
        err_msg="Parallel construction of k-Longest Common Prefix array and Sampled Suffix Array failed.",
        thr_exc=True,
    )

    sa_fn = fa_fn + ".sa"
    klcp_fn = "{}.{}.klcp".format(fa_fn, k)
    _log_file_md5(sa_fn)
    _log_file_md5(klcp_fn)
def _propagate(index_dir, threads, nonprop=0):
    """Run k-mer propagation through the Makefile in the index directory.

    A silent dry run of ``make`` is performed first, followed by the real build.

    Args:
        index_dir (str): Index directory.
        threads (int): Number of threads for Makefile.
        nonprop (bool): Switch propagation off (passes ``NONPROP=1`` to make).
    """
    pro.message('Running k-mer propagation')

    prop_dir = os.path.join(index_dir, 'propagation')
    pro.test_files(os.path.join(prop_dir, 'Makefile'), test_nonzero=True)

    # Stays an empty string when propagation is enabled; the empty item is
    # still placed in both command lists.
    nonprop_arg = "NONPROP=1" if nonprop else ""

    # Dry run: test if input files for propagation exist.
    dry_run_cmd = ['make', '-j', '-C', prop_dir, '-n', '-s', nonprop_arg, '>', '/dev/null']
    pro.run_safe(
        dry_run_cmd,
        err_msg="Some FASTA files needed for k-mer propagation are probably missing, see the messages above.",
        thr_exc=False,
        silent=True,
    )

    # Real run of the propagation.
    # TODO: progress report is switched off; come up with a better way than
    # counting files
    build_cmd = ['make', '-j', threads, '-C', prop_dir, nonprop_arg, 'V=1', 'PRINT_PROGRESS=']
    pro.run_safe(
        build_cmd,
        err_msg="K-mer propagation has not been finished because of an error. See messages above.",
        thr_exc=False,
    )
def _compile_prophyle_bin(clean=False, parallel=False, silent=True, force=False):
    """Compile ProPhyle binaries if they don't exist yet. Recompile if not up-to-date.

    If ``make`` fails with a RuntimeError and at least one of the ``IND`` /
    ``ASM`` executables is missing, an error is reported; if both exist, only
    a warning is printed and the old executables are used.

    Args:
        clean (bool): Run make clean instead of make.
        parallel (bool): Run make in parallel.
        silent (bool): Run make silently.
        force (bool): Force recompile (make -B).
    """
    command = ["make"]
    if parallel:
        command.append('-j')
    if silent:
        command.append('-s')
    if force:
        command.append('-B')
    command += ["-C", C_D]
    if clean:
        command.append('clean')

    # Only the make invocation itself is guarded.
    try:
        pro.run_safe(command, output_fo=sys.stderr)
    except RuntimeError:
        if not os.path.isfile(IND) or not os.path.isfile(ASM):
            pro.error(
                "Error: ProPhyle executables could not be compiled. Please, run the command '{}' manually."
                .format(" ".join(command))
            )
        else:
            print(
                "Warning: ProPhyle executables could not be recompiled. Going to use the old ones.",
                file=sys.stderr,
            )
def _create_makefile(index_dir, k, library_dir, mask_repeats=False):
    """Create a Makefile for k-mer propagation.

    Besides the Makefile, the index configuration (ProPhyle version info and
    k) is saved and ``propagation/params.mk`` is written.

    Args:
        index_dir (str): Index directory.
        k (int): K-mer size.
        library_dir (str): Library directory.
        mask_repeats (bool): Mask repeats using DustMasker.

    TODO:
        * Add checking of params.mk
    """
    pro.message('Creating Makefile for k-mer propagation')

    prop_dir = os.path.join(index_dir, 'propagation')
    pro.makedirs(prop_dir)

    makefile = os.path.join(prop_dir, 'Makefile')
    tree_fn = os.path.join(index_dir, 'tree.preliminary.nw')
    _test_tree(tree_fn)

    # Index configuration (insertion order is kept).
    config = collections.OrderedDict([
        ('prophyle-version', version.VERSION),
        ('prophyle-revision', version.REVCOUNT),
        ('prophyle-commit', version.SHORTHASH),
        ('k', k),
    ])
    pro.save_index_config(index_dir, config)

    # Parameters read by the Makefile.
    params = ['PRG_ASM="{}"\n'.format(ASM), "K={}\n".format(k)]
    if mask_repeats:
        params.append("MASKREP=1\n")
    with open(os.path.join(prop_dir, "params.mk"), "w+") as f:
        f.writelines(params)

    makefile_cmd = [NEWICK2MAKEFILE, '-k', k, tree_fn, os.path.abspath(library_dir), './', makefile]
    pro.run_safe(makefile_cmd)
    _log_file_md5(makefile)
def _merge_kmer_stats(index_dir):
    """Create a file with k-mer statistics.

    All ``*.tsv`` files found below the propagation directory are concatenated
    in sorted filename order, lines starting with ``#`` are dropped, and the
    rest is sorted, deduplicated and written to ``index.fa.kmers.tsv``.

    Args:
        index_dir (str): Index directory.
    """
    prop_dir = os.path.join(index_dir, 'propagation')
    tsv_fn = os.path.join(index_dir, "index.fa.kmers.tsv")

    stages = (
        ["find", prop_dir, "-name", "'*.tsv'"],
        ["sort"],
        ["xargs", "cat"],
        ["grep", "-v", "^#"],
        ["sort"],
        ["uniq"],
    )
    # Join the stages with shell pipes and redirect the result to tsv_fn.
    command = []
    for stage in stages:
        if command:
            command.append("|")
        command.extend(stage)
    command += ['>', tsv_fn]

    pro.run_safe(
        command,
        err_msg="A file with k-mer statistics could not be created.",
        thr_exc=False,
    )
def download_rpt(library, library_dir):
    """Download the RPT tarball of a library and unpack it into the library directory.

    The library directory is not created here: if it does not exist, there
    are no fna files to add to the tree anyway.

    Args:
        library (str): "bacteria", "viruses", "plasmids", or "all" for every entry of ``LIBRARIES``.
        library_dir (str): Directory containing the library subdirectories.

    Raises:
        ValueError: If the library has no known download location.
    """
    if library == "all":
        for lib in LIBRARIES:
            download_rpt(lib, library_dir)
        return

    assert library in LIBRARIES
    d = os.path.join(library_dir, library)

    # Per-library tarball location and extra tar options.
    if library == 'bacteria':
        url = FTP_NCBI + '/genomes/archive/old_refseq/Bacteria/all.rpt.tar.gz'
        tar_extra = []
    elif library == 'viruses':
        url = FTP_NCBI + '/genomes/Viruses/all.rpt.tar.gz'
        tar_extra = []
    elif library == 'plasmids':
        url = FTP_NCBI + '/genomes/archive/old_refseq/Plasmids/plasmids.all.rpt.tar.gz'
        tar_extra = ['--strip', '5']
    else:
        raise ValueError('Unknown library "{}"'.format(library))

    cmd = ['cd', d, '&&', 'curl', url, '|', 'tar', 'xz'] + tar_extra
    pro.run_safe(cmd)
def create_bwa_index(fa):
    """Build the BWA index files for a FASTA file in four steps.

    Runs ``fa2pac``, ``pac2bwtgen`` (output discarded), ``bwtupdate`` and
    ``bwt2sa`` one after another.

    Args:
        fa (str): FASTA file.
    """
    pac_fn = fa + ".pac"
    bwt_fn = fa + ".bwt"
    sa_fn = fa + ".sa"

    steps = (
        [bwa, 'fa2pac', fa, fa],
        [bwa, 'pac2bwtgen', pac_fn, bwt_fn, ">", "/dev/null"],
        [bwa, 'bwtupdate', bwt_fn],
        [bwa, 'bwt2sa', bwt_fn, sa_fn],
    )
    for step in steps:
        pro.run_safe(step)
def create_klcp(fa, k):
    """Run the index program's ``build -k`` on a FASTA file, discarding its output.

    Args:
        fa (str): FASTA file.
        k (int): K-mer size.
    """
    klcp_cmd = [prophyle_index, 'build', '-k', k, fa, ">", "/dev/null"]
    pro.run_safe(klcp_cmd)
def prophyle_classify(
    index_dir, fq_fn, fq_pe_fn, k, out_format, mimic_kraken, measure, annotate, tie_lca, kmer_lca, print_seq, cimpl,
    force_restarted_search, prophyle_conf_string
):
    """Run ProPhyle classification.

    The final command is a shell pipeline: an optional read-merging step for
    paired-end input, the index query, and the assignment program.

    Args:
        index_dir (str): Index directory.
        fq_fn (str): Input reads (single-end or first of paired-end).
        fq_pe_fn (str): Input reads (second paired-end, None if single-end)
        k (int): K-mer size (None => detect automatically).
        out_format (str): Output format: sam / kraken.
        mimic_kraken (bool): Mimic Kraken algorithm (compute LCA for each k-mer).
        measure (str): Measure used for classification (h1 / h2 / c1 / c2).
        annotate (bool): Annotate assignments (insert annotations from Newick to SAM).
        tie_lca (bool): If multiple equally good assignments found, compute their LCA.
        kmer_lca (bool): Replace k-mer matches by their LCA.
        print_seq (bool): Print sequencing in SAM.
        cimpl (bool): Use the C++ implementation.
        force_restarted_search (bool): Force restarted search.
        prophyle_conf_string (str): ProPhyle configuration string.
    """
    _compile_prophyle_bin(parallel=True)

    index_fa = os.path.join(index_dir, 'index.fa')
    index_tree = os.path.join(index_dir, 'tree.nw')
    bwt_fn = index_fa + '.bwt'
    sa_fn = index_fa + '.sa'

    if k is None:
        k = pro.detect_k_from_index(index_dir)
        pro.message("Automatic detection of k-mer length: k={}".format(k))

    _test_tree(index_tree)

    # Read files: both for paired-end; a single file unless it is '-'.
    if fq_pe_fn:
        pro.test_files(fq_fn, fq_pe_fn, allow_pipes=False)
    elif fq_fn != '-':
        pro.test_files(fq_fn, allow_pipes=False)

    pro.test_files(IND)
    # The .pac and .amb files are not tested.
    pro.test_files(bwt_fn, sa_fn, index_fa + '.ann')

    bwt_s, sa_s = pro.file_sizes(bwt_fn, sa_fn)
    assert abs(bwt_s - 2 * sa_s) < 1000, 'Inconsistent index (SA vs. BWT)'

    # Rolling window is used only when not forced off and the k-LCP file exists.
    klcp_fn = "{}.{}.klcp".format(index_fa, k)
    use_rolling_window = False
    if force_restarted_search:
        pro.message("Restarted search forced")
    elif os.path.isfile(klcp_fn):
        use_rolling_window = True
        pro.message("k-LCP file found, going to use rolling window")
        pro.test_files(klcp_fn)
        [klcp_s] = pro.file_sizes(klcp_fn)
        assert abs(bwt_s - 4 * klcp_s) < 1000, 'Inconsistent index (KLCP vs. BWT)'
    else:
        pro.message("k-LCP file not found, going to use restarted search")

    assign_prog = C_ASSIGN if cimpl else PY_ASSIGN

    # Kraken mimicking overrides the measure, both LCA switches and the format.
    if mimic_kraken:
        measure, out_format = "h1", "kraken"
        tie_lca = kmer_lca = True

    # --- assignment stage ---
    cmd_assign = [assign_prog]
    if prophyle_conf_string and not cimpl:
        cmd_assign.extend(['-c', prophyle_conf_string])
    cmd_assign.extend(['-m', measure, '-f', out_format])
    for enabled, flag in ((annotate, '-A'), (tie_lca, '-L'), (kmer_lca, '-X')):
        if enabled:
            cmd_assign.append(flag)
    cmd_assign.extend([index_tree, k, '-'])

    # --- read stage (paired-end only); otherwise fq_fn is passed directly ---
    if fq_pe_fn:
        cmd_read = [READ, fq_fn, fq_pe_fn, '|']
        in_read = '-'
    else:
        # fq_fn can be '-' as well
        cmd_read = []
        in_read = fq_fn

    # --- query stage; disabled switches stay in the list as empty strings ---
    rolling_flag = '-u' if use_rolling_window else ''
    seq_flag = '-b' if print_seq else ''
    cmd_query = [IND, 'query', '-k', k, rolling_flag, seq_flag, index_fa, in_read, '|']

    pro.run_safe(cmd_read + cmd_query + cmd_assign)
def prophyle_download(library, library_dir, force=False):
    """Download a genomic library together with the corresponding tree.

    Args:
        library (str): Library to download (bacteria / viruses / plasmids / hmp),
            or "all" for every entry of ``LIBRARIES``.
        library_dir (str): Directory where the files will be downloaded;
            None selects ``~/prophyle``.
        force (bool): Download even if the library is not reported as missing.

    Raises:
        ValueError: If the library has no known download recipe.

    TODO:
        * Add support for alternative URLs (http / ftp, backup refseq sites, etc.).
            * http://downloads.hmpdacc.org/data/HMREFG/all_seqs.fa.bz2
            * ftp://public-ftp.hmpdacc.org/HMREFG/all_seqs.fa.bz2
    """
    if library == "all":
        for lib in LIBRARIES:
            prophyle_download(lib, library_dir, force)
        return

    assert library in LIBRARIES

    if library_dir is None:
        base_dir = os.path.expanduser("~/prophyle")
    else:
        base_dir = library_dir
    d = os.path.join(base_dir, library)

    pro.makedirs(d)
    lib_missing = _missing_library(d)

    if library in ('bacteria', 'viruses', 'plasmids'):
        if lib_missing or force:
            # Tree and tarball are fetched from the parent of the library directory.
            tree_url = ZENODO_URL + '/files/' + library + '.nw'
            tarball_url = ZENODO_URL + '/files/' + library + '.tar.gz'
            cmd = ['cd', d + "/..", '&&', 'curl', '-O', tree_url]
            cmd += ['&&', 'curl', tarball_url, '|', 'tar', 'xz']
            pro.run_safe(cmd)
            _mark_complete(d, 1)

    elif library == 'hmp':
        if lib_missing or force:
            # fix when error appears
            cmd = ['cd', d, '&&', 'curl', 'http://downloads.hmpdacc.org/data/HMREFG/all_seqs.fa.bz2']
            cmd += ['|', 'bzip2', '-d', '|', SPLIT_FA, os.path.abspath(d)]
            pro.run_safe(cmd)
            _mark_complete(d, 1)

    else:
        raise ValueError('Unknown library "{}"'.format(library))
def query(fa, fq, k, u=False, v=False, t=1):
    """Run the index program's ``query`` subcommand.

    Args:
        fa (str): FASTA file of the index.
        fq (str): File with reads.
        k (int): K-mer size (passed with ``-k``).
        u (bool): Add the ``-u`` switch.
        v (bool): Add the ``-v`` switch.
        t (int): Value passed with ``-t``.
    """
    # Disabled switches are kept as empty strings in the command list; the
    # unused local `params` from the original was dropped.
    cmd = [prophyle_index, 'query', "-v" if v else "", "-u" if u else "", '-k', k, '-t', t, fa, fq]
    pro.run_safe(cmd)