def run_parallel(_fn, fn_args, n_jobs=None, filter_results=False):
    if qconfig.memory_efficient:
        results_tuples = [_fn(*args) for args in fn_args]
    else:
        n_jobs = n_jobs or qconfig.max_threads
        if is_python2():
            from joblib2 import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        results_tuples = Parallel(n_jobs=n_jobs)(delayed(_fn)(*args) for args in fn_args)
    results = []
    if results_tuples:
        if isinstance(results_tuples[0], (list, tuple)):
            results_cnt = len(results_tuples[0])
            if filter_results:
                results = [[result_list[i] for result_list in results_tuples if result_list[i]]
                           for i in range(results_cnt)]
            else:
                results = [[result_list[i] for result_list in results_tuples]
                           for i in range(results_cnt)]
        else:
            results = [result for result in results_tuples if result or not filter_results]
    return results
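# Hedged usage sketch (not part of the original module): run_parallel() takes a
# picklable worker plus an iterable of argument tuples; when the worker returns a
# tuple, per-position values come back transposed into separate lists. The names
# _demo_length_and_gc and _demo_run_parallel below are hypothetical illustrations.
def _demo_length_and_gc(name, seq):
    # toy worker: returns a (length, GC-count) tuple per sequence
    return len(seq), sum(1 for c in seq if c in 'GC')


def _demo_run_parallel():
    lengths, gc_counts = run_parallel(_demo_length_and_gc,
                                      [('ctg1', 'ACGT'), ('ctg2', 'GGCC')],
                                      n_jobs=2)
    return lengths, gc_counts  # e.g. [4, 4] and [2, 4]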
def compress_dataset_files(dataset_data, ext_python_modules_home, max_threads, log):
    log.info("\n== Compressing corrected reads (with gzip)")
    to_compress = []
    for reads_library in dataset_data:
        for key, value in reads_library.items():
            if key.endswith('reads'):
                compressed_reads_filenames = []
                for reads_file in value:
                    if not os.path.isfile(reads_file):
                        support.error('something went wrong and file with corrected reads ('
                                      + reads_file + ') is missing!', log)
                    to_compress.append(reads_file)
                    compressed_reads_filenames.append(reads_file + ".gz")
                reads_library[key] = compressed_reads_filenames
    if len(to_compress):
        pigz_path = support.which('pigz')
        if pigz_path:
            for reads_file in to_compress:
                support.sys_call([pigz_path, '-f', '-7', '-p', str(max_threads), reads_file], log)
        else:
            addsitedir(ext_python_modules_home)
            if sys.version.startswith('2.'):
                from joblib2 import Parallel, delayed
            elif sys.version.startswith('3.'):
                from joblib3 import Parallel, delayed
            n_jobs = min(len(to_compress), max_threads)
            outputs = Parallel(n_jobs=n_jobs)(delayed(support.sys_call)(['gzip', '-f', '-7', reads_file])
                                              for reads_file in to_compress)
            for output in outputs:
                if output:
                    log.info(output)
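# Hedged sketch (assumption: Python 3 standard library only, no SPAdes helpers such
# as support.which or support.sys_call). compress_dataset_files() above prefers pigz
# (parallel gzip) and falls back to plain gzip; the same fallback can be expressed as:
def _demo_compress_files(filenames, max_threads):
    import shutil
    import subprocess
    pigz = shutil.which('pigz')
    for fname in filenames:
        if pigz:
            # pigz compresses a single file using several threads
            subprocess.check_call([pigz, '-f', '-7', '-p', str(max_threads), fname])
        else:
            # plain gzip is single-threaded per file
            subprocess.check_call(['gzip', '-f', '-7', fname])
    return [fname + '.gz' for fname in filenames]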
def partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template, labels):
    # array of assemblies for each reference
    assemblies_by_ref = dict([(qutils.name_from_fpath(ref_fpath), []) for ref_fpath in ref_fpaths])
    n_jobs = min(qconfig.max_threads, len(assemblies))
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    assemblies = Parallel(n_jobs=n_jobs)(
        delayed(parallel_partition_contigs)(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template)
        for asm in assemblies)
    assemblies_dicts = [assembly[0] for assembly in assemblies]
    assemblies_by_ref = []
    for ref_fpath in ref_fpaths:
        ref_name = qutils.name_from_fpath(ref_fpath)
        not_sorted_assemblies = set([val for sublist in (assemblies_dicts[i][ref_name]
                                                         for i in range(len(assemblies_dicts)))
                                     for val in sublist])
        sorted_assemblies = []
        for label in labels:  # sort by label
            for assembly in not_sorted_assemblies:
                if assembly.label == label:
                    sorted_assemblies.append(assembly)
                    break
        assemblies_by_ref.append((ref_fpath, sorted_assemblies))
    not_aligned_assemblies = [assembly[1] for assembly in assemblies]
    return assemblies_by_ref, not_aligned_assemblies
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(
            delayed(predict_genes)(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths))
    else:
        results = [predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
                   for index, contigs_fpath in enumerate(contigs_fpaths)]

    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label], unique, full_genes, partial_genes = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if full_genes is not None:
            genes = ['%s + %s part' % (full_cnt, partial_cnt)
                     for full_cnt, partial_cnt in zip(full_genes, partial_genes)]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique is None and full_genes is None:
            logger.error('Failed running Glimmer for %s. ' % label +
                         ('Run with the --debug option to see the command line.'
                          if not qconfig.debug else ''))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
def correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting):
    ## Removing special characters from contigs' names because:
    ## 1) some embedded tools can fail on strings containing "...", "+", "-", etc.
    ## 2) Nucmer fails on names like "contig 1_bla_bla", "contig 2_bla_bla"
    ##    (it takes only the first word of the header as the contig name and gets ambiguous contig names)
    if qconfig.max_threads is None:
        qconfig.max_threads = 1
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    logger.main_info('  Pre-processing...')
    if not qconfig.memory_efficient:
        corrected_info = Parallel(n_jobs=n_jobs)(
            delayed(parallel_correct_contigs)(i, contigs_fpath, corrected_dirpath, labels)
            for i, contigs_fpath in enumerate(contigs_fpaths))
    else:
        corrected_info = [parallel_correct_contigs(i, contigs_fpath, corrected_dirpath, labels)
                          for i, contigs_fpath in enumerate(contigs_fpaths)]

    corrected_contigs_fpaths = []
    old_contigs_fpaths = []
    if any(is_fatal_error for (old_fpaths, corr_fpaths, broken_scaffold_fpaths, logs, is_fatal_error)
           in corrected_info):
        exit(4)
    for contig_idx, (old_fpaths, corr_fpaths, broken_scaffold_fpaths, logs, is_fatal_error) \
            in enumerate(corrected_info):
        label = labels[contig_idx]
        logger.main_info('\n'.join(logs))
        for old_fpath in old_fpaths:
            old_contigs_fpaths.append(old_fpath)
            qconfig.assembly_labels_by_fpath[old_fpath] = label
        for corr_fpath, lengths in corr_fpaths:
            corrected_contigs_fpaths.append(corr_fpath)
            qconfig.assembly_labels_by_fpath[corr_fpath] = label
            add_lengths_to_report(lengths, reporting, corr_fpath)
        for broken_fpath, lengths in broken_scaffold_fpaths:
            old_contigs_fpaths.append(broken_fpath)
            corrected_contigs_fpaths.append(broken_fpath)
            qconfig.assembly_labels_by_fpath[broken_fpath] = label + '_broken'
            add_lengths_to_report(lengths, reporting, broken_fpath)

    if qconfig.draw_plots or qconfig.html_report:
        if not plotter_data.dict_color_and_ls:
            plotter_data.save_colors_and_ls(corrected_contigs_fpaths)

    return corrected_contigs_fpaths, old_contigs_fpaths
def move_dataset_files(dataset_data, dst, ext_python_modules_home, max_threads, log, gzip=False):
    to_compress = []
    for reads_library in dataset_data:
        for key, value in reads_library.items():
            if key.endswith('reads'):
                moved_reads_files = []
                for reads_file in value:
                    dst_filename = os.path.join(dst, os.path.basename(reads_file))
                    # TODO: fix problem with files with the same basenames in Hammer binary!
                    if not os.path.isfile(reads_file):
                        if (not gzip and os.path.isfile(dst_filename)) or \
                                (gzip and os.path.isfile(dst_filename + '.gz')):
                            support.warning('file with corrected reads (' + reads_file +
                                            ') is the same in several libraries', log)
                            if gzip:
                                dst_filename += '.gz'
                        else:
                            support.error('something went wrong and file with corrected reads ('
                                          + reads_file + ') is missing!', log)
                    else:
                        shutil.move(reads_file, dst_filename)
                        if gzip:
                            to_compress.append(dst_filename)
                            dst_filename += '.gz'
                    moved_reads_files.append(dst_filename)
                reads_library[key] = moved_reads_files
    if len(to_compress):
        pigz_path = support.which('pigz')
        if pigz_path:
            for reads_file in to_compress:
                support.sys_call([pigz_path, '-f', '-7', '-p', str(max_threads), reads_file], log)
        else:
            addsitedir(ext_python_modules_home)
            if sys.version.startswith('2.'):
                from joblib2 import Parallel, delayed
            elif sys.version.startswith('3.'):
                from joblib3 import Parallel, delayed
            n_jobs = min(len(to_compress), max_threads)
            outputs = Parallel(n_jobs=n_jobs)(delayed(support.sys_call)(['gzip', '-f', '-7', reads_file])
                                              for reads_file in to_compress)
            for output in outputs:
                if output:
                    log.info(output)
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(' Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
    elif not install_genemark():
        logger.warning(' Can\'t copy the license key to ~/.gm_key, skipping gene prediction.')
    else:
        if not os.path.isdir(out_dirpath):
            os.mkdir(out_dirpath)
        tmp_dirpath = os.path.join(out_dirpath, 'tmp')
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)

        n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
        num_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib2 import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        if not qconfig.memory_efficient:
            results = Parallel(n_jobs=n_jobs)(
                delayed(predict_genes)(index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath,
                                       tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
                for index, fasta_fpath in enumerate(fasta_fpaths))
        else:
            results = [predict_genes(index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath,
                                     tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
                       for index, fasta_fpath in enumerate(fasta_fpaths)]

        if not is_license_valid(out_dirpath, fasta_fpaths):
            return

        genes_by_labels = dict()
        # saving results
        for i, fasta_path in enumerate(fasta_fpaths):
            report = reporting.get(fasta_path)
            label = qutils.label_from_fpath(fasta_path)
            genes_by_labels[label], unique_count, full_genes, partial_genes = results[i]
            if unique_count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
            if full_genes is not None:
                genes = ['%s + %s part' % (full_cnt, partial_cnt)
                         for full_cnt, partial_cnt in zip(full_genes, partial_genes)]
                report.add_field(reporting.Fields.PREDICTED_GENES, genes)
            if unique_count is None and full_genes is None:
                logger.error(' ' + qutils.index_to_str(i) + 'Failed predicting genes in ' + label + '. ' +
                             ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                              if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000 else ''))

        if not qconfig.debug:
            for dirpath in glob.iglob(tmp_dirpath + '*'):
                if os.path.isdir(dirpath):
                    shutil.rmtree(dirpath)

        logger.main_info('Done.')
        return genes_by_labels
def compress_dataset_files(input_file, ext_python_modules_home, max_threads, log,
                           not_used_yaml_file, output_dir, gzip_output):
    addsitedir(ext_python_modules_home)
    if sys.version.startswith("2."):
        import pyyaml2 as pyyaml
        from joblib2 import Parallel, delayed
    elif sys.version.startswith("3."):
        import pyyaml3 as pyyaml
        from joblib3 import Parallel, delayed

    dataset_data = pyyaml.load(open(input_file))
    remove_not_corrected_reads(output_dir)
    is_changed = False
    if gzip_output:
        is_changed = True
        pigz_path = support.which("pigz")
        if pigz_path:
            compressor = "pigz"
        else:
            compressor = "gzip"
        log.info("\n== Compressing corrected reads (with %s)" % compressor)
        to_compress = []
        for reads_library in dataset_data:
            for key, value in reads_library.items():
                if key.endswith("reads"):
                    compressed_reads_filenames = []
                    for reads_file in value:
                        compressed_reads_filenames.append(reads_file + ".gz")
                        to_compress.append(reads_file)
                    reads_library[key] = compressed_reads_filenames

        if len(to_compress):
            for reads_file in to_compress:
                if not isfile(reads_file):
                    support.error("something went wrong and file with corrected reads (%s) is missing!"
                                  % reads_file, log)

            if pigz_path:
                for reads_file in to_compress:
                    support.sys_call([pigz_path, "-f", "-7", "-p", str(max_threads), reads_file], log)
            else:
                n_jobs = min(len(to_compress), max_threads)
                outputs = Parallel(n_jobs=n_jobs)(delayed(support.sys_call)(["gzip", "-f", "-7", reads_file])
                                                  for reads_file in to_compress)
                for output in outputs:
                    if output:
                        log.info(output)

    if not_used_yaml_file != "":
        is_changed = True
        not_used_dataset_data = pyyaml.load(open(not_used_yaml_file))
        dataset_data += not_used_dataset_data
    if is_changed:
        with open(input_file, 'w') as f:
            pyyaml.dump(dataset_data, f,
                        default_flow_style=False, default_style='"', width=float("inf"))
def nx_seq_junction(infilename1, infilename2, dst, log, silent=True):
    starttime = time.time()

    basename1 = os.path.basename(infilename1)
    if os.path.splitext(basename1)[1] == '.gz':
        basename1 = os.path.splitext(basename1)[0]
    basename2 = os.path.basename(infilename2)
    if os.path.splitext(basename2)[1] == '.gz':
        basename2 = os.path.splitext(basename2)[0]

    # open three outfiles
    splitfilenameleft = os.path.join(dst, 'R1_IJS7_' + basename1)
    splitfile1 = open(splitfilenameleft, 'w')

    splitfilenameright = os.path.join(dst, 'R2_IJS7_' + basename2)
    splitfile2 = open(splitfilenameright, 'w')

    unsplitfilename = os.path.join(dst, 'unsplit_IJS7_' + basename1.replace('_R1_', '_R1R2_'))
    unsplitfile = open(unsplitfilename, 'w')

    # jctstr = '(GGTTCATCGTCAGGCCTGACGATGAACC){e<=4}'  # JS7 24/28 required results in ~92% detected in ion torrent
    # from NextClip: --adaptor_sequence GTTCATCGTCAGG -e --strict_match 22,11 --relaxed_match 20,10
    # eg strict 22/26 = 4 errors, relaxed 20/26 = 6 errors
    jctstr = '(GTTCATCGTCAGGCCTGACGATGAAC){e<=4}'  # try 22/26 to match NextClip strict (e<=6 for relaxed)

    # PARSE both files in tuples of 4 lines
    parserR1 = ParseFastQ(infilename1)
    parserR2 = ParseFastQ(infilename2)

    all_stats = JunctionStats()
    n_jobs = options_storage.threads
    while True:
        # prepare input
        reads1 = list(itertools.islice(parserR1, READS_PER_BATCH))
        reads2 = list(itertools.islice(parserR2, READS_PER_BATCH))
        if len(reads1) != len(reads2):
            support.error("lucigen_nxmate.py, nx_seq_junction: "
                          "number of left reads (%d) is not equal to number of right reads (%d)!"
                          % (len(reads1), len(reads2)), log)
        if not reads1:
            break
        chunks = split_into_chunks(list(zip(reads1, reads2)), n_jobs)
        # processing
        outputs = Parallel(n_jobs=n_jobs)(delayed(nx_seq_junction_process_batch)(reads, jctstr)
                                          for reads in chunks)
        results, stats = [x[0] for x in outputs], [x[1] for x in outputs]
        # writing results
        for result, stat in zip(results, stats):
            write_to_files([splitfile1, splitfile2, unsplitfile], result)
            all_stats += stat
        if not silent:
            log.info("==== nx_seq_junction progress: reads processed: %d, time elapsed: %s"
                     % (all_stats.readcounter,
                        time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime))))
    parserR1.close()
    parserR2.close()

    splitfile1.close()
    splitfile2.close()
    unsplitfile.close()

    if all_stats.readcounter == 0:
        support.error("lucigen_nxmate.py, nx_seq_junction: error in input data! "
                      "Number of processed reads is 0!", log)
    if all_stats.splitcounter == 0:
        support.error("lucigen_nxmate.py, nx_seq_junction: error in input data! "
                      "Number of split pairs is 0!", log)
    if not silent:
        # print some stats
        percentsplit = 100 * all_stats.splitcounter / all_stats.readcounter
        percentR1R2 = 100 * all_stats.R1R2jctcounter / all_stats.splitcounter
        percentR1 = 100 * all_stats.R1jctcounter / all_stats.splitcounter
        percentR2 = 100 * all_stats.R2jctcounter / all_stats.splitcounter
        log.info("==== nx_seq_junction info: processing finished!")
        log.info("==== nx_seq_junction info: %d reads processed" % (all_stats.readcounter))
        log.info("==== nx_seq_junction info: %d total split pairs (%.2f %% of processed reads))"
                 % (all_stats.splitcounter, percentsplit))
        log.info("==== nx_seq_junction info: %d junctions in both R1 and R2 (%.2f %% of split junctions))"
                 % (all_stats.R1R2jctcounter, percentR1R2))
        log.info("==== nx_seq_junction info: %d split junctions are in Read1 (%.2f %% of split junctions))"
                 % (all_stats.R1jctcounter, percentR1))
        log.info("==== nx_seq_junction info: %d split junctions are in Read2 (%.2f %% of split junctions))"
                 % (all_stats.R2jctcounter, percentR2))
        elapsedtime = time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime))
        log.info("==== nx_seq_junction info: time elapsed: %s" % (elapsedtime))
    parserR1.close()
    parserR2.close()
    return splitfilenameleft, splitfilenameright, unsplitfilename
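# Hedged sketch (assumption about a helper defined elsewhere in lucigen_nxmate.py):
# split_into_chunks() is called above as split_into_chunks(list(zip(reads1, reads2)), n_jobs),
# and each returned chunk is handed to one parallel worker. A minimal implementation that
# partitions a list into at most n roughly equal slices could look like this:
def _demo_split_into_chunks(items, n_chunks):
    if not items:
        return []
    n_chunks = max(1, min(n_chunks, len(items)))
    chunk_size = (len(items) + n_chunks - 1) // n_chunks  # ceiling division
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]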
def chimera_clean(infilename1, infilename2, dst, log, silent=True):
    starttime = time.time()

    basename1 = os.path.basename(infilename1)
    if os.path.splitext(basename1)[1] == '.gz':
        basename1 = os.path.splitext(basename1)[0]
    basename2 = os.path.basename(infilename2)
    if os.path.splitext(basename2)[1] == '.gz':
        basename2 = os.path.splitext(basename2)[0]

    # open four outfiles
    outfilename1 = os.path.join(dst, 'mates_ICC4_' + basename1)
    outfile1 = open(outfilename1, 'w')

    slagfilename1 = os.path.join(dst, 'non-mates_ICC4_' + basename1)
    slagfile1 = open(slagfilename1, 'w')

    outfilename2 = os.path.join(dst, 'mates_ICC4_' + basename2)
    outfile2 = open(outfilename2, 'w')

    slagfilename2 = os.path.join(dst, 'non-mates_ICC4_' + basename2)
    slagfile2 = open(slagfilename2, 'w')

    # set up regular expression patterns for chimera codes - for Illumina use the reverse complements of right codes
    csslist1 = ['(TGGACTCCACTGTG){e<=1}', '(ACTTCGCCACTGTG){e<=1}', '(TGAGTCCCACTGTG){e<=1}', '(TGACTGCCACTGTG){e<=1}',
                '(TCAGGTCCACTGTG){e<=1}', '(ATGTCACCACTGTG){e<=1}', '(GTATGACCACTGTG){e<=1}', '(GTCTACCCACTGTG){e<=1}',
                '(GTTGGACCACTGTG){e<=1}', '(CGATTCCCACTGTG){e<=1}', '(GGTTACCCACTGTG){e<=1}', '(TCACCTCCACTGTG){e<=1}']
    csslist2 = ['(TCCAGACCAATGTG){e<=1}', '(ACATCACCAATGTG){e<=1}', '(TCACGACCAATGTG){e<=1}', '(TAGCACCCAATGTG){e<=1}',
                '(AACCTCCCAATGTG){e<=1}', '(ACAACTCCAATGTG){e<=1}', '(GTCTAACCAATGTG){e<=1}', '(TACACGCCAATGTG){e<=1}',
                '(GAGAACCCAATGTG){e<=1}', '(GAGATTCCAATGTG){e<=1}', '(GACCTACCAATGTG){e<=1}', '(AGACTCCCAATGTG){e<=1}']

    # PARSE both files in tuples of 4 lines
    parserR1 = ParseFastQ(infilename1)
    parserR2 = ParseFastQ(infilename2)

    all_stats = CleanStats()
    n_jobs = options_storage.threads
    while True:
        # prepare input
        reads1 = list(itertools.islice(parserR1, READS_PER_BATCH))
        reads2 = list(itertools.islice(parserR2, READS_PER_BATCH))
        if len(reads1) != len(reads2):
            support.error("lucigen_nxmate.py, chimera_clean: "
                          "number of left reads (%d) is not equal to number of right reads (%d)!"
                          % (len(reads1), len(reads2)), log)
        if not reads1:
            break
        chunks = split_into_chunks(list(zip(reads1, reads2)), n_jobs)
        # processing
        outputs = Parallel(n_jobs=n_jobs)(delayed(chimera_clean_process_batch)(reads, csslist1, csslist2)
                                          for reads in chunks)
        results, stats = [x[0] for x in outputs], [x[1] for x in outputs]
        # writing results
        for result, stat in zip(results, stats):
            write_to_files([outfile1, outfile2, slagfile1, slagfile2], result)
            all_stats += stat
        if not silent:
            log.info("==== chimera_clean progress: reads processed: %d, time elapsed: %s"
                     % (all_stats.readcounter,
                        time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime))))
    parserR1.close()
    parserR2.close()

    outfile1.close()
    slagfile1.close()
    outfile2.close()
    slagfile2.close()

    if all_stats.TOTALmatecounter + all_stats.slagcounter != all_stats.readcounter:
        support.error("lucigen_nxmate.py, chimera_clean: error in the script somewhere! Unequal read counts!", log)
    if all_stats.readcounter == 0:
        support.error("lucigen_nxmate.py, chimera_clean: error in input data! Number of processed reads is 0!", log)
    if not silent:
        # print some stats
        percentmates = 100. * all_stats.matecounter / all_stats.readcounter
        percentslag = 100. * all_stats.slagcounter / all_stats.readcounter
        log.info("==== chimera_clean info: processing finished!")
        log.info("==== chimera_clean info: %d reads processed, %d true mate reads (%.2f %%) "
                 "and %d non-mates/chimeras (%.2f %%)."
                 % (all_stats.readcounter, all_stats.matecounter, percentmates,
                    all_stats.slagcounter, percentslag))
        shortmates = all_stats.TOTALmatecounter - all_stats.matecounter
        log.info("==== chimera_clean info: %d mates too short to keep after trimming" % shortmates)
        elapsedtime = time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime))
        log.info("==== chimera_clean info: time elapsed: %s" % (elapsedtime))
        log.info("==== chimera_clean info: " + str(all_stats.csscounter))
    return outfilename1, outfilename2
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    if not download_blast_binaries(filenames=blast_filenames):
        return None, None, None

    if qconfig.custom_blast_db_fpath:
        global db_fpath
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [f for f in os.listdir(db_fpath) if f.endswith('.nsq')]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath, db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error('You should specify path to BLAST database obtained by running makeblastdb command: '
                         'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                         ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                         exit_with_code=2)
    elif not download_blastdb():
        return None, None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib2 import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(assembly.fpath, assembly.label, corrected_dirpath,
                                                        err_fpath, blast_res_fpath, blast_check_fpath, blast_threads)
                                for i, assembly in enumerate(blast_assemblies))

    logger.main_info()
    species_scores = []
    species_by_assembly = dict()
    max_entries = 4
    replacement_dict = defaultdict(list)
    for label in labels:
        assembly_scores = []
        assembly_species = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                query_id_col, subj_id_col, idy_col, len_col, score_col = None, None, None, None, None
                for line in res_file:
                    fs = line.split()
                    if line.startswith('#'):
                        refs_for_query = 0
                        # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens,
                        #         q. start, q. end, s. start, s. end, evalue, bit score
                        if 'Fields' in line:
                            fs = line.strip().split('Fields: ')[-1].split(', ')
                            query_id_col = fs.index('query id') if 'query id' in fs else 0
                            subj_id_col = fs.index('subject id') if 'subject id' in fs else 1
                            idy_col = fs.index('% identity') if '% identity' in fs else 2
                            len_col = fs.index('alignment length') if 'alignment length' in fs else 3
                            score_col = fs.index('bit score') if 'bit score' in fs else 11
                    elif refs_for_query < max_entries and len(fs) > score_col:
                        query_id = fs[query_id_col]
                        organism_id = fs[subj_id_col]
                        idy = float(fs[idy_col])
                        length = int(fs[len_col])
                        score = float(fs[score_col])
                        if idy >= qconfig.identity_threshold and length >= qconfig.min_length and score >= qconfig.min_bitscore:
                            # and (not scores or min(scores) - score < max_identity_difference):
                            seqname, taxons = parse_organism_id(organism_id)
                            if not seqname:
                                continue
                            species_name = get_species_name(seqname)
                            if species_name and 'uncultured' not in seqname:
                                if refs_for_query == 0:
                                    if species_name not in assembly_species:
                                        assembly_scores.append((seqname, query_id, score))
                                        if taxons:
                                            taxons_for_krona[correct_name(seqname)] = taxons
                                        assembly_species.append(species_name)
                                        refs_for_query += 1
                                    else:
                                        seq_scores = [(query_name, seq_query_id, seq_score)
                                                      for query_name, seq_query_id, seq_score in assembly_scores
                                                      if get_species_name(query_name) == species_name]
                                        if seq_scores and score > seq_scores[0][2]:
                                            assembly_scores.remove(seq_scores[0])
                                            assembly_scores.append((seqname, query_id, score))
                                            if taxons:
                                                taxons_for_krona[correct_name(seqname)] = taxons
                                            refs_for_query += 1
                                else:
                                    if seqname not in replacement_dict[query_id]:
                                        replacement_dict[query_id].append(seqname)
                                        refs_for_query += 1
        assembly_scores = sorted(assembly_scores, reverse=True)
        assembly_scores = assembly_scores[:qconfig.max_references]
        for seqname, query_id, score in assembly_scores:
            if not species_by_assembly or not any(seqname in species_list
                                                  for species_list in species_by_assembly.values()):
                species_scores.append((seqname, query_id, score))
        species_by_assembly[label] = [seqname for seqname, query_id, score in assembly_scores]
    if not species_scores:
        return None, None, None
    return species_scores, species_by_assembly, replacement_dict
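# Hedged sketch (not part of the original module): the parsing loop above derives column
# indices from the "# Fields:" comment that BLAST tabular output with comment lines
# (-outfmt 7) emits before each block of hits. In isolation, that header-to-index
# mapping looks roughly like this (_demo_parse_blast_fields is a hypothetical name):
def _demo_parse_blast_fields(header_line):
    # e.g. "# Fields: query id, subject id, % identity, alignment length, ..."
    fields = header_line.strip().split('Fields: ')[-1].split(', ')

    def col(name, default):
        # fall back to the conventional column position if the name is absent
        return fields.index(name) if name in fields else default

    return {'query id': col('query id', 0),
            'subject id': col('subject id', 1),
            '% identity': col('% identity', 2),
            'alignment length': col('alignment length', 3),
            'bit score': col('bit score', 11)}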
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, features_dict, operons_fpaths,
       detailed_contigs_reports_dirpath, genome_stats_dirpath):

    coords_dirpath = os.path.join(detailed_contigs_reports_dirpath, qconfig.minimap_output_dirname)
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        coords_dirpath = os.path.join(coords_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    genome_size, reference_chromosomes, ns_by_chromosomes = fastaparser.get_genome_stats(ref_fpath)

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = os.path.join(genome_stats_dirpath, 'genome_info.txt')
    res_file = open(result_fpath, 'w')

    containers = []
    for feature, feature_fpath in features_dict.items():
        containers.append(FeatureContainer([feature_fpath], feature))
    if not features_dict:
        logger.notice('No file with genomic features was provided. '
                      'Use the --features option if you want to specify it.\n', indent=' ')
    if operons_fpaths:
        containers.append(FeatureContainer(operons_fpaths, 'operon'))
    else:
        logger.notice('No file with operons was provided. '
                      'Use the -O option if you want to specify it.', indent=' ')
    for container in containers:
        if not container.fpaths:
            continue

        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

        if len(container.region_list) == 0:
            logger.warning('No genomic features of type "' + container.kind + '" were loaded.', indent=' ')
            res_file.write('Genomic features of type "' + container.kind + '" loaded: ' + 'None' + '\n')
        else:
            logger.info(' Loaded ' + str(len(container.region_list)) +
                        ' genomic features of type "' + container.kind + '"')
            res_file.write('Genomic features of type "' + container.kind + '" loaded: ' +
                           str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list,
                                                              list(reference_chromosomes.keys()))

    ref_genes_num, ref_operons_num = None, None
    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        genomic_features = 0
        for container in containers:
            if container.kind == 'operon':
                ref_operons_num = len(container.region_list)
                report.add_field(reporting.Fields.REF_OPERONS, len(container.region_list))
            else:
                genomic_features += len(container.region_list)
        if genomic_features:
            ref_genes_num = genomic_features
            report.add_field(reporting.Fields.REF_GENES, genomic_features)

    # for cumulative plots:
    files_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_unsorted_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}
    files_unsorted_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.memory_efficient:
        process_results = Parallel(n_jobs=n_jobs)(
            delayed(process_single_file)(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                                         reference_chromosomes, ns_by_chromosomes, containers)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
    else:
        process_results = [process_single_file(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                                               reference_chromosomes, ns_by_chromosomes, containers)
                           for index, contigs_fpath in enumerate(aligned_contigs_fpaths)]
    num_nf_errors += len([res for res in process_results if res is None])
    logger._num_nf_errors = num_nf_errors
    process_results = [res for res in process_results if res]
    if not process_results:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    ref_lengths = [process_results[i][0] for i in range(len(process_results))]
    results_genes_operons_tuples = [process_results[i][1] for i in range(len(process_results))]
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [ref_lengths[i][ref] for i in range(len(ref_lengths))]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.items():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, ' +
                       'total length without N\'s: ' + str(chr_len - len(ns_by_chromosomes[chr_name])) +
                       ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('=' * 120 + '\n')

    for contigs_fpath, (results, unsorted_features_in_contigs, features_in_contigs,
                        unsorted_operons_in_contigs, operons_in_contigs) \
            in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_features_in_contigs[contigs_fpath] = features_in_contigs
        files_unsorted_features_in_contigs[contigs_fpath] = unsorted_features_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        files_unsorted_operons_in_contigs[contigs_fpath] = unsorted_operons_in_contigs
        full_found_genes.append(sum(features_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
                       % (assembly_name[:24], report.get_field(reporting.Fields.MAPPEDGENOME),
                          report.get_field(reporting.Fields.DUPLICATION_RATIO), gaps_count))
        genome_mapped.append(float(report.get_field(reporting.Fields.MAPPEDGENOME)))

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                    (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if ref_genes_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'features',
                                                files_features_in_contigs, ref_genes_num)
        if ref_operons_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        from quast_libs.ca_utils.misc import contigs_aligned_lengths
        if ref_genes_num:
            plotter.genes_operons_plot(ref_genes_num, aligned_contigs_fpaths, files_features_in_contigs,
                                       genome_stats_dirpath + '/features_cumulative_plot', 'genomic features')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths,
                             files_unsorted_features_in_contigs,
                             genome_stats_dirpath + '/features_frcurve_plot', 'genomic features')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_features_histogram',
                              '# complete genomic features')
        if ref_operons_num:
            plotter.genes_operons_plot(ref_operons_num, aligned_contigs_fpaths, files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths,
                             files_unsorted_operons_in_contigs,
                             genome_stats_dirpath + '/operons_frcurve_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram', '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram', 'Genome fraction, %',
                          top_value=100)

    logger.main_info('Done.')
    return containers