def partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template, labels):
    """Partition every assembly by reference and regroup the parts per reference.

    Each assembly is handed to ``parallel_partition_contigs`` (run through
    joblib).  Every worker result is indexed as a pair: item 0 is a mapping
    from reference name to a list of assemblies, item 1 is collected as the
    "not aligned" entry for that assembly.

    Returns a tuple ``(assemblies_by_ref, not_aligned_assemblies)`` where
    ``assemblies_by_ref`` is a list of ``(ref_fpath, assemblies)`` pairs in the
    order of ``ref_fpaths``; the assemblies of each pair follow the order of
    ``labels`` and hold at most one assembly per label.
    """
    # One empty bucket per reference name; this template is passed to every worker.
    assemblies_by_ref = {qutils.name_from_fpath(ref_fpath): [] for ref_fpath in ref_fpaths}

    # joblib refuses n_jobs == 0, which is what an empty assembly list used to produce.
    n_jobs = max(1, min(qconfig.max_threads, len(assemblies)))
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    assemblies = Parallel(n_jobs=n_jobs)(
        delayed(parallel_partition_contigs)(
            asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template)
        for asm in assemblies)

    assemblies_dicts = [assembly[0] for assembly in assemblies]
    assemblies_by_ref = []
    for ref_fpath in ref_fpaths:
        ref_name = qutils.name_from_fpath(ref_fpath)
        # Union of everything any worker attributed to this reference.
        not_sorted_assemblies = {
            val
            for assemblies_dict in assemblies_dicts
            for val in assemblies_dict[ref_name]
        }
        sorted_assemblies = []
        for label in labels:  # sort by label
            match = next(
                (assembly for assembly in not_sorted_assemblies if assembly.label == label),
                None)
            if match is not None:
                sorted_assemblies.append(match)
        assemblies_by_ref.append((ref_fpath, sorted_assemblies))

    not_aligned_assemblies = [assembly[1] for assembly in assemblies]
    return assemblies_by_ref, not_aligned_assemblies
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Run GlimmerHMM gene prediction for every assembly and fill the reports.

    ``predict_genes`` is called once per contigs file and is expected to
    return a 4-tuple ``(genes, unique, full_genes, partial_genes)``.  The
    ``unique`` count and the per-threshold ``"<full> + <partial> part"``
    strings are stored in the assembly report when they are not ``None``.

    Returns a dict mapping assembly label to the predicted genes, or ``None``
    when ``compile_glimmer`` does not yield an executable.
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    # joblib rejects n_jobs == 0, so keep one worker even without input files.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(
            delayed(predict_genes)(
                index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths))
    else:
        results = [
            predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths)
        ]

    genes_by_labels = dict()
    # saving results
    for contigs_fpath, result in zip(contigs_fpaths, results):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label], unique, full_genes, partial_genes = result
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if full_genes is not None:
            genes = [
                '%s + %s part' % (full_cnt, partial_cnt)
                for full_cnt, partial_cnt in zip(full_genes, partial_genes)
            ]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique is None and full_genes is None:
            # Format the label into the message first: '%' binds tighter than
            # '+', so the old expression applied it to the hint and raised
            # TypeError instead of logging the failure.
            hint = '' if qconfig.debug else 'Run with the --debug option to see the command line.'
            logger.error('Glimmer failed running Glimmer for %s. ' % label + hint)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Predict genes with GlimmerHMM for each contigs file and record the counts.

    Every call to ``predict_genes`` must produce the 4-tuple
    ``(genes, unique, full_genes, partial_genes)``.  Non-``None`` values are
    written to the report of the corresponding assembly; when both ``unique``
    and ``full_genes`` are ``None`` an error is logged for that assembly.

    Returns ``{label: genes}`` for all assemblies, or ``None`` if
    ``compile_glimmer`` returns a falsy value.
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    for dirpath in (out_dirpath, tmp_dirpath):
        if not os.path.isdir(dirpath):
            os.makedirs(dirpath)

    # Never ask joblib for zero workers (it raises on n_jobs == 0).
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(
            delayed(predict_genes)(
                index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths))
    else:
        results = [
            predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths)
        ]

    genes_by_labels = dict()
    # saving results
    for contigs_fpath, result in zip(contigs_fpaths, results):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label], unique, full_genes, partial_genes = result
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if full_genes is not None:
            genes = [
                '%s + %s part' % (full_cnt, partial_cnt)
                for full_cnt, partial_cnt in zip(full_genes, partial_genes)
            ]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique is None and full_genes is None:
            # The label has to be interpolated into the first literal; the
            # previous "a + b % label" form formatted the hint (which has no
            # placeholder) and crashed with TypeError.
            message = 'Glimmer failed running Glimmer for %s. ' % label
            if not qconfig.debug:
                message += 'Run with the --debug option to see the command line.'
            logger.error(message)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
def partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template, labels):
    """Split assemblies by reference in parallel and collect them per reference.

    ``parallel_partition_contigs`` is run for each assembly via joblib.  Its
    result is used as a pair: element 0 maps reference name to a list of
    assemblies, element 1 is gathered into the returned "not aligned" list.

    Returns ``(assemblies_by_ref, not_aligned_assemblies)``.  The first item is
    a list of ``(ref_fpath, assemblies)`` tuples, one per entry of
    ``ref_fpaths``; within a tuple the assemblies are ordered like ``labels``
    and at most one assembly is kept for each label.
    """
    # array of assemblies for each reference
    assemblies_by_ref = {qutils.name_from_fpath(ref_fpath): [] for ref_fpath in ref_fpaths}

    # An empty assembly list used to give n_jobs == 0, which joblib rejects.
    n_jobs = max(1, min(qconfig.max_threads, len(assemblies)))
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    assemblies = Parallel(n_jobs=n_jobs)(
        delayed(parallel_partition_contigs)(
            asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template)
        for asm in assemblies)

    assemblies_dicts = [assembly[0] for assembly in assemblies]
    assemblies_by_ref = []
    for ref_fpath in ref_fpaths:
        ref_name = qutils.name_from_fpath(ref_fpath)
        # De-duplicated pool of all assemblies reported for this reference.
        not_sorted_assemblies = {
            val
            for assemblies_dict in assemblies_dicts
            for val in assemblies_dict[ref_name]
        }
        sorted_assemblies = []
        for label in labels:  # sort by label
            for assembly in not_sorted_assemblies:
                if assembly.label == label:
                    sorted_assemblies.append(assembly)
                    break
        assemblies_by_ref.append((ref_fpath, sorted_assemblies))

    not_aligned_assemblies = [assembly[1] for assembly in assemblies]
    return assemblies_by_ref, not_aligned_assemblies
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Run the contig analyzer: align each assembly and save the outcome.

    Builds one argument tuple per assembly, dispatches ``align_and_analyze``
    through ``run_parallel`` and stores the results in the reports.  When at
    least one assembly has status ``AlignerStatus.OK`` the misassembly and
    unaligned reports (and, depending on ``qconfig``, the plots) are written.

    Returns ``(aligner_statuses, aligned_lengths_per_fpath)``, both keyed by
    contigs file path; the second item is ``None`` if ``compile_aligner``
    fails.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')

    if not compile_aligner(logger):
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict.fromkeys(contigs_fpaths, AlignerStatus.FAILED), None

    errors_before = logger._num_nf_errors
    create_minimap_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads_per_job = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    genome_size, reference_chromosomes, ns_by_chromosomes = get_genome_stats(reference, skip_ns=True)
    if qconfig.memory_efficient:
        threads = qconfig.max_threads
    else:
        threads = threads_per_job

    args = []
    for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
        args.append((is_cyclic, i, contigs_fpath, output_dir, reference, reference_chromosomes,
                     ns_by_chromosomes, old_contigs_fpath, bed_fpath, threads))
    (statuses, results, aligned_lengths,
     misassemblies_in_contigs, aligned_lengths_by_contigs) = run_parallel(align_and_analyze, args, n_jobs)

    reports = []
    aligner_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    any_ok = AlignerStatus.OK in aligner_statuses.values()
    if any_ok and qconfig.is_combined_ref:
        save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        status = statuses[index]
        if status == AlignerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference, genome_size))
        elif status == AlignerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if any_ok:
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(
                reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            # Re-key the per-assembly misassembly data by contigs file path.
            misassemblies_in_contigs = dict(
                (fpath, misassemblies_in_contigs[i]) for i, fpath in enumerate(contigs_fpaths))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths,
                             misc.contigs_aligned_lengths, misassemblies_in_contigs,
                             join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies')

    status_values = list(aligner_statuses.values())
    oks = status_values.count(AlignerStatus.OK)
    not_aligned = status_values.count(AlignerStatus.NOT_ALIGNED)
    failed = status_values.count(AlignerStatus.FAILED)
    errors = status_values.count(AlignerStatus.ERROR)
    problems = not_aligned + failed + errors
    total = len(aligner_statuses)
    logger._num_nf_errors = errors_before + errors

    # The three checks are deliberately independent (not an elif chain).
    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info(
            'Done for ' + str(total - problems) + ' out of ' + str(total) +
            '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return aligner_statuses, aligned_lengths_per_fpath
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Run the Nucmer-based contig analyzer for all assemblies.

    ``align_and_analyze`` is executed once per assembly, either through a
    joblib pool or one by one with ``parallel_by_chr=True`` (the choice depends
    on ``qconfig.splitted_ref`` and ``qconfig.memory_efficient``).  Each call
    yields five values that are regrouped column-wise, saved to the reports
    and, if any assembly is ``NucmerStatus.OK``, used for the misassembly
    reports and plots.

    Returns ``(nucmer_statuses, aligned_lengths_per_fpath)`` keyed by contigs
    file path; the second item is ``None`` if ``compile_aligner`` fails.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')

    if not compile_aligner(logger):
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict.fromkeys(contigs_fpaths, NucmerStatus.FAILED), None

    if qconfig.draw_plots:
        compile_gnuplot(logger, only_clean=False)

    errors_before = logger._num_nf_errors
    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    # Same decision (and evaluation order) as the original nested if/else:
    # both "parallel" branches issued an identical Parallel call.
    split_ref = qconfig.splitted_ref
    if not split_ref and not qconfig.memory_efficient:
        use_pool = True
    else:
        use_pool = len(contigs_fpaths) >= len(split_ref) and not qconfig.memory_efficient

    if use_pool:
        outcomes = Parallel(n_jobs=n_jobs)(
            delayed(align_and_analyze)(
                is_cyclic, i, contigs_fpath, output_dir, reference,
                old_contigs_fpath, bed_fpath, threads=threads)
            for i, (contigs_fpath, old_contigs_fpath)
            in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        outcomes = []
        for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
            outcomes.append(align_and_analyze(
                is_cyclic, i, contigs_fpath, output_dir, reference,
                old_contigs_fpath, bed_fpath, parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping: turn the list of 5-item results into five parallel lists
    columns = []
    for col in range(5):
        columns.append([outcome[col] for outcome in outcomes])
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = columns

    reports = []
    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    any_ok = NucmerStatus.OK in nucmer_statuses.values()
    if any_ok and qconfig.is_combined_ref:
        save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        status = statuses[index]
        if status == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference))
        elif status == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if any_ok:
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(
                reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict(
                (fpath, misassemblies_in_contigs[i]) for i, fpath in enumerate(contigs_fpaths))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths,
                             misc.contigs_aligned_lengths, misassemblies_in_contigs,
                             join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies')

    status_values = list(nucmer_statuses.values())
    oks = status_values.count(NucmerStatus.OK)
    not_aligned = status_values.count(NucmerStatus.NOT_ALIGNED)
    failed = status_values.count(NucmerStatus.FAILED)
    errors = status_values.count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    total = len(nucmer_statuses)
    logger._num_nf_errors = errors_before + errors

    # Independent checks on purpose; they are not mutually exclusive.
    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info(
            'Done for ' + str(total - problems) + ' out of ' + str(total) +
            '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return nucmer_statuses, aligned_lengths_per_fpath
__author__ = 'letovesnoi'

import subprocess
import sys
import os
import shutil

# Make the bundled 'quast_libs' directory next to this file importable.
this_location = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(os.path.join(this_location, 'quast_libs'))
from quast_libs import qutils
from quast_libs import fastaparser

# Pick the bundled joblib copy that matches the running interpreter.
if qutils.is_python2():
    from quast_libs.site_packages.joblib2 import Parallel, delayed
else:
    from quast_libs.site_packages.joblib3 import Parallel, delayed

from general import log
from general import rqconfig

logger = log.get_logger('parallel_blat_run')


def parallel_blat_run(transcripts_dict, reference_pathes, threads, tmp_dir, label, logger, log_dir):
    """Log the start of the BLAT step for the assembly named ``label``.

    Only the part visible here is documented: it builds the path of the BLAT
    output log inside ``log_dir`` and prints a timestamped progress message
    through the ``logger`` argument (which shadows the module-level logger).
    NOTE(review): the body looks truncated -- ``transcripts_dict``,
    ``reference_pathes``, ``threads`` and ``tmp_dir`` are not used in the
    visible code; confirm against the full source.
    """
    # Path of the per-assembly BLAT stdout log; not used further in the visible code.
    log_out_1 = os.path.join(log_dir, label + '.blat.out.log')

    logger.print_timestamp()
    logger.info('Getting psl files by BLAT for {}...'.format(label))
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    """Run BlastN for the given assemblies and pick reference species from the hits.

    Steps visible in this function:
      1. make sure the BLAST binaries and a database are available (either the
         custom ``qconfig.custom_blast_db_fpath`` or the downloaded default);
      2. run ``parallel_blast`` through joblib for every entry of
         ``blast_assemblies``;
      3. for every label, parse the tabular BLAST output file and collect
         ``(seqname, query_id, score)`` candidates.

    Returns ``(species_scores, species_by_assembly, replacement_dict)`` or
    ``(None, None, None)`` when the binaries/database cannot be obtained or no
    species passed the filters.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed source; verify the if/else pairing in the hit-parsing
    section against the original file.
    """
    if not download_blast_binaries(filenames=blast_filenames):
        return None, None, None

    if qconfig.custom_blast_db_fpath:
        # The resolved database prefix is stored in the module-level db_fpath.
        global db_fpath
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            # A directory was given: use the first '<dbname>.nsq' found in it.
            db_aux_files = [
                f for f in os.listdir(db_fpath) if f.endswith('.nsq')
            ]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath,
                                db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            # The .nsq file itself was given: strip the extension to get the prefix.
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error(
                'You should specify path to BLAST database obtained by running makeblastdb command: '
                'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                exit_with_code=2)
    elif not download_blastdb():
        return None, None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        # Split the thread budget between the concurrently running jobs.
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib2 import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(
            assembly.fpath, assembly.label, corrected_dirpath, err_fpath, blast_res_fpath,
            blast_check_fpath, blast_threads)
            for i, assembly in enumerate(blast_assemblies))

    logger.main_info()
    species_scores = []
    species_by_assembly = dict()
    # Upper bound on the number of hits accepted per query.
    max_entries = 4
    # query id -> sequence names seen as further (non-first) accepted hits
    replacement_dict = defaultdict(list)
    for label in labels:
        assembly_scores = []
        assembly_species = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                # Column indices are unknown until a '# Fields:' header line is read.
                query_id_col, subj_id_col, idy_col, len_col, score_col = None, None, None, None, None
                for line in res_file:
                    fs = line.split()
                    if line.startswith('#'):
                        # Any comment line resets the per-query hit counter.
                        refs_for_query = 0
                        # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                        if 'Fields' in line:
                            fs = line.strip().split('Fields: ')[-1].split(', ')
                            # Fall back to fixed positions when a column name is missing.
                            query_id_col = fs.index(
                                'query id') if 'query id' in fs else 0
                            subj_id_col = fs.index(
                                'subject id') if 'subject id' in fs else 1
                            idy_col = fs.index(
                                '% identity') if '% identity' in fs else 2
                            len_col = fs.index(
                                'alignment length'
                            ) if 'alignment length' in fs else 3
                            score_col = fs.index(
                                'bit score') if 'bit score' in fs else 11
                    elif refs_for_query < max_entries and len(fs) > score_col:
                        query_id = fs[query_id_col]
                        organism_id = fs[subj_id_col]
                        idy = float(fs[idy_col])
                        length = int(fs[len_col])
                        score = float(fs[score_col])
                        if idy >= qconfig.identity_threshold and length >= qconfig.min_length and score >= qconfig.min_bitscore:  # and (not scores or min(scores) - score < max_identity_difference):
                            seqname, taxons = parse_organism_id(organism_id)
                            if not seqname:
                                continue
                            species_name = get_species_name(seqname)
                            if species_name and 'uncultured' not in seqname:
                                if refs_for_query == 0:
                                    # First accepted hit for this query.
                                    if species_name not in assembly_species:
                                        assembly_scores.append(
                                            (seqname, query_id, score))
                                        if taxons:
                                            taxons_for_krona[correct_name(
                                                seqname)] = taxons
                                        assembly_species.append(species_name)
                                        refs_for_query += 1
                                    else:
                                        # Species already recorded: replace its entry
                                        # when this hit has a higher score.
                                        seq_scores = [
                                            (query_name, seq_query_id, seq_score)
                                            for query_name, seq_query_id, seq_score in assembly_scores
                                            if get_species_name(
                                                query_name) == species_name
                                        ]
                                        if seq_scores and score > seq_scores[
                                                0][2]:
                                            assembly_scores.remove(
                                                seq_scores[0])
                                            assembly_scores.append(
                                                (seqname, query_id, score))
                                            if taxons:
                                                taxons_for_krona[correct_name(
                                                    seqname)] = taxons
                                            refs_for_query += 1
                                else:
                                    # Later hits of the same query are remembered
                                    # in replacement_dict.
                                    if seqname not in replacement_dict[
                                            query_id]:
                                        replacement_dict[query_id].append(
                                            seqname)
                                        refs_for_query += 1
        # Tuples sort by their first item (seqname) first, in descending order.
        assembly_scores = sorted(assembly_scores, reverse=True)
        assembly_scores = assembly_scores[:qconfig.max_references]
        for seqname, query_id, score in assembly_scores:
            # Add a sequence to the global list only if no earlier assembly has it.
            if not species_by_assembly or not any(
                    seqname in species_list
                    for species_list in species_by_assembly.values()):
                species_scores.append((seqname, query_id, score))
        species_by_assembly[label] = [
            seqname for seqname, query_id, score in assembly_scores
        ]
    if not species_scores:
        return None, None, None
    return species_scores, species_by_assembly, replacement_dict
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Run the Nucmer/E-MEM contig analyzer for all assemblies.

    After compiling the aligner (and, in ``qconfig.test`` mode with the E-MEM
    aligner, checking that it works) ``align_and_analyze`` is run per
    assembly, in a joblib pool or sequentially with ``parallel_by_chr=True``.
    The five values of every result are regrouped column-wise, written to the
    reports and used for the misassembly reports and plots when at least one
    assembly is ``NucmerStatus.OK``.

    Returns ``(nucmer_statuses, aligned_lengths_per_fpath)`` keyed by contigs
    file path; the second item is ``None`` when the aligner is unavailable.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')

    aligner_ready = compile_aligner(logger)
    if qconfig.test and is_emem_aligner():
        aligner_ready = check_emem_functionality(logger)
    if not aligner_ready:
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict.fromkeys(contigs_fpaths, NucmerStatus.FAILED), None

    if qconfig.draw_plots:
        compile_gnuplot(logger, only_clean=False)

    errors_before = logger._num_nf_errors
    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = 1 if qconfig.memory_efficient else max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    # Both "parallel" branches of the original issued the same Parallel call;
    # the condition below keeps its evaluation order.
    split_ref = qconfig.splitted_ref
    if not split_ref and not qconfig.memory_efficient:
        use_pool = True
    else:
        use_pool = len(contigs_fpaths) >= len(split_ref) and not qconfig.memory_efficient

    if use_pool:
        outcomes = Parallel(n_jobs=n_jobs)(
            delayed(align_and_analyze)(
                is_cyclic, i, contigs_fpath, output_dir, reference,
                old_contigs_fpath, bed_fpath, threads=threads)
            for i, (contigs_fpath, old_contigs_fpath)
            in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        outcomes = []
        for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
            outcomes.append(align_and_analyze(
                is_cyclic, i, contigs_fpath, output_dir, reference,
                old_contigs_fpath, bed_fpath, parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping: five parallel lists, one per position of the result tuples
    columns = []
    for col in range(5):
        columns.append([outcome[col] for outcome in outcomes])
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = columns

    reports = []
    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    any_ok = NucmerStatus.OK in nucmer_statuses.values()
    if any_ok and qconfig.is_combined_ref:
        save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        status = statuses[index]
        if status == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference))
        elif status == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if any_ok:
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(
                reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict(
                (fpath, misassemblies_in_contigs[i]) for i, fpath in enumerate(contigs_fpaths))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths,
                             misc.contigs_aligned_lengths, misassemblies_in_contigs,
                             join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies')

    status_values = list(nucmer_statuses.values())
    oks = status_values.count(NucmerStatus.OK)
    not_aligned = status_values.count(NucmerStatus.NOT_ALIGNED)
    failed = status_values.count(NucmerStatus.FAILED)
    errors = status_values.count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    total = len(nucmer_statuses)
    logger._num_nf_errors = errors_before + errors

    # Independent checks on purpose; they are not mutually exclusive.
    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info(
            'Done for ' + str(total - problems) + ' out of ' + str(total) +
            '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info(
            'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        if not qconfig.test and is_emem_aligner():
            logger.warning(
                'Please rerun QUAST using --test option to ensure that E-MEM aligner works properly.')

    return nucmer_statuses, aligned_lengths_per_fpath
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Run the genome analyzer: genome fraction, duplication ratio, genes and operons.

    Reads the reference FASTA, loads the gene/operon annotations, runs
    ``process_single_file`` for every aligned assembly through joblib and
    writes a summary table to ``<genome_stats_dirpath>/genome_info.txt``.
    Per-assembly values are stored in the reports, and JSON/HTML data and
    plots are produced according to ``json_output_dirpath`` and ``qconfig``.

    ``process_single_file`` results are used as pairs: item 0 is indexed by
    chromosome name, item 1 unpacks to ``(results, genes_in_contigs,
    operons_in_contigs)``; a ``None`` result counts as a non-fatal error.

    Returns ``[genes_container, operons_container]``, or ``None`` when the
    analysis failed for every assembly.
    """
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    # chromosome name (first word of the FASTA header) -> sequence length
    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # for cumulative plots:
    files_genes_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # RESULTS file; the context manager closes it on every exit path,
    # including exceptions raised while the table is being written.
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    with open(result_fpath, 'w') as res_file:
        genes_container = FeatureContainer(genes_fpaths, 'gene')
        operons_container = FeatureContainer(operons_fpaths, 'operon')
        for container in [genes_container, operons_container]:
            if not container.fpaths:
                logger.notice('No file with ' + container.kind + 's provided. '
                              'Use the -' + container.kind[0].capitalize() + ' option '
                              'if you want to specify it.', indent=' ')
                continue

            for fpath in container.fpaths:
                container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

            if len(container.region_list) == 0:
                logger.warning('No ' + container.kind + 's were loaded.', indent=' ')
                res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
            else:
                logger.info(' Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
                res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
                container.chr_names_dict = chromosomes_names_dict(
                    container.kind, container.region_list, list(reference_chromosomes.keys()))

        for contigs_fpath in aligned_contigs_fpaths:
            report = reporting.get(contigs_fpath)
            if genes_container.fpaths:
                report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
            if operons_container.fpaths:
                report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

        # process all contig files
        num_nf_errors = logger._num_nf_errors
        # joblib rejects n_jobs == 0, so keep one worker for an empty input list.
        n_jobs = max(1, min(len(aligned_contigs_fpaths), qconfig.max_threads))
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        process_results = Parallel(n_jobs=n_jobs)(
            delayed(process_single_file)(
                contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                reference_chromosomes, genes_container, operons_container)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
        num_nf_errors += len([res for res in process_results if res is None])
        logger._num_nf_errors = num_nf_errors

        # Pair each result with its assembly BEFORE dropping the failed ones.
        # Filtering first and zipping with the full path list afterwards
        # shifted every result after a failure onto the wrong assembly.
        successful = [(contigs_fpath, res)
                      for contigs_fpath, res in zip(aligned_contigs_fpaths, process_results)
                      if res]
        if not successful:
            logger.main_info('Genome analyzer failed for all the assemblies.')
            return

        ref_lengths = [res[0] for contigs_fpath, res in successful]
        # ref_lengths_by_contigs is defined outside this function and is updated in place.
        for ref in reference_chromosomes:
            ref_lengths_by_contigs[ref] = [lengths[ref] for lengths in ref_lengths]
        res_file.write('reference chromosomes:\n')
        for chr_name, chr_len in reference_chromosomes.items():
            aligned_len = max(ref_lengths_by_contigs[chr_name])
            res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
        res_file.write('\n')
        res_file.write('total genome size: ' + str(genome_size) + '\n\n')
        res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
        res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

        # header
        res_file.write('\n\n')
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
        res_file.write('================================================================================================================\n')

        for contigs_fpath, res in successful:
            results, genes_in_contigs, operons_in_contigs = res[1]
            assembly_name = qutils.name_from_fpath(contigs_fpath)

            files_genes_in_contigs[contigs_fpath] = genes_in_contigs
            files_operons_in_contigs[contigs_fpath] = operons_in_contigs
            full_found_genes.append(sum(genes_in_contigs))
            full_found_operons.append(sum(operons_in_contigs))

            covered_bp = results["covered_bp"]
            gaps_count = results["gaps_count"]
            genes_full = results[reporting.Fields.GENES + "_full"]
            genes_part = results[reporting.Fields.GENES + "_partial"]
            operons_full = results[reporting.Fields.OPERONS + "_full"]
            operons_part = results[reporting.Fields.OPERONS + "_partial"]

            report = reporting.get(contigs_fpath)
            genome_fraction = float(covered_bp) * 100 / float(genome_size)
            # NOTE(review): the divisor is zero when covered_bp is 0 -- confirm
            # whether process_single_file can report that for a kept result.
            duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                                 report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                                 report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                                 report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                                ((genome_fraction / 100.0) * float(genome_size))

            res_file.write('%-25s| %-10s| %-12s| %-10s|'
                           % (assembly_name[:24], '%3.5f%%' % genome_fraction, '%1.5f' % duplication_ratio, gaps_count))

            report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
            report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
            genome_mapped.append(genome_fraction)

            for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                        (reporting.Fields.OPERONS, operons_full, operons_part)]:
                if full is None and part is None:
                    res_file.write(' %-10s| %-10s|' % ('-', '-'))
                else:
                    res_file.write(' %-10s| %-10s|' % (full, part))
                    report.add_field(field, '%s + %s part' % (full, part))
            res_file.write('\n')

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths, files_genes_in_contigs,
                                       genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes, genome_stats_dirpath + '/complete_genes_histogram',
                              '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths, files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons, genome_stats_dirpath + '/complete_operons_histogram',
                              '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped, genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
    return [genes_container, operons_container]
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Align every assembly to the reference and save the alignment reports.

    Args:
        reference: reference passed through to ``align_and_analyze``.
        contigs_fpaths: assembly file paths; they become the keys of both
            returned dictionaries.
        is_cyclic: forwarded unchanged to ``align_and_analyze``.
        output_dir: directory for the results; created when it is missing.
        old_contigs_fpaths: paths iterated in lockstep with ``contigs_fpaths``
            and forwarded to ``align_and_analyze``.
        bed_fpath: optional path forwarded to ``align_and_analyze``.

    Returns:
        A ``(nucmer_statuses, aligned_lengths_per_fpath)`` tuple of dicts keyed
        by contigs path. When the aligner cannot be prepared, every assembly is
        mapped to ``NucmerStatus.FAILED`` and the second item is ``None``.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    num_nf_errors = logger._num_nf_errors
    success_compilation = compile_aligner(logger)
    if qconfig.test and is_emem_aligner():
        success_compilation = check_emem_functionality(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if qconfig.memory_efficient:
        threads = 1
    else:
        threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    fpath_pairs = list(zip(contigs_fpaths, old_contigs_fpaths))
    # Assemblies are processed in parallel unless the reference is split into
    # more parts than there are assemblies (or memory-efficient mode is on);
    # in that case each assembly is aligned in turn and the parallelism is
    # moved down to the per-chromosome level.
    parallel_by_assembly = not qconfig.splitted_ref or (
        len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient)
    if parallel_by_assembly:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
            is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(fpath_pairs))
    else:
        statuses_results_lengths_tuples = [
            align_and_analyze(is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath,
                              parallel_by_chr=True, threads=qconfig.max_threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(fpath_pairs)]

    # unzipping
    statuses = [x[0] for x in statuses_results_lengths_tuples]
    results = [x[1] for x in statuses_results_lengths_tuples]
    aligned_lengths = [x[2] for x in statuses_results_lengths_tuples]

    reports = []
    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    status_values = list(nucmer_statuses.values())

    if NucmerStatus.OK in status_values:
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        if qconfig.draw_plots:
            from . import plotter
            plotter.draw_misassembl_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    oks = status_values.count(NucmerStatus.OK)
    not_aligned = status_values.count(NucmerStatus.NOT_ALIGNED)
    failed = status_values.count(NucmerStatus.FAILED)
    errors = status_values.count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    total = len(nucmer_statuses)  # named `total` so the builtin all() is not shadowed

    logger._num_nf_errors = num_nf_errors + errors

    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info('Done for ' + str(total - problems) + ' out of ' + str(total) + '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        if not qconfig.test and is_emem_aligner():
            logger.warning('Please rerun QUAST using --test option to ensure that E-MEM aligner works properly.')

    return nucmer_statuses, aligned_lengths_per_fpath
def align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                  parallel_by_chr, threads, log_out_fpath, log_err_fpath):
    """Align one assembly to the reference and produce the coords/SNP files.

    Existing results are reused when the "successful check" file and the
    coords file (plus the SNP file, if SNPs are requested) are present and
    ``check_nucmer_successful_check`` accepts them.

    Args:
        nucmer_fpath: path prefix from which all output file names are derived.
        ref_fpath: reference file path.
        contigs_fpath: assembly file path.
        old_contigs_fpath: path handed to the successful-check helpers.
        index: assembly index, used only for log message prefixes.
        parallel_by_chr: when true (and not in memory-efficient mode), the
            parts of a split reference are aligned in parallel.
        threads: number of threads available to this call.
        log_out_fpath: stdout log path (truncated here).
        log_err_fpath: stderr log path (truncated here).

    Returns:
        ``NucmerStatus.OK``, ``NucmerStatus.ERROR`` (a tool exited with a
        non-zero code), ``NucmerStatus.FAILED`` (no coords file) or
        ``NucmerStatus.NOT_ALIGNED`` (last coords line has fewer than 13
        fields).
    """
    # Both logs stay open for the whole call and are closed on every exit path.
    with open(log_out_fpath, 'w') as log_out_f, open(log_err_fpath, 'w') as log_err_f:
        nucmer_successful_check_fpath = nucmer_fpath + '.sf'
        delta_fpath = nucmer_fpath + '.delta'
        filtered_delta_fpath = nucmer_fpath + '.fdelta'
        coords_fpath, _, _, show_snps_fpath, _ = \
            get_nucmer_aux_out_fpaths(nucmer_fpath)

        log_out_f.write('Aligning contigs to reference...\n')

        # Checking if there are existing previous nucmer alignments.
        # If they exist, using them to save time.
        using_existing_alignments = False
        if isfile(nucmer_successful_check_fpath) and isfile(coords_fpath) and \
                (isfile(show_snps_fpath) or isfile(show_snps_fpath + '.gz') or not qconfig.show_snps):
            if check_nucmer_successful_check(nucmer_successful_check_fpath, old_contigs_fpath, ref_fpath):
                log_out_f.write('\tUsing existing alignments...\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Using existing alignments... ')
                using_existing_alignments = True
        if using_existing_alignments:
            return NucmerStatus.OK

        log_out_f.write('\tAligning contigs to the reference\n')
        logger.info(' ' + qutils.index_to_str(index) + 'Aligning contigs to the reference')

        if not qconfig.splitted_ref:
            nucmer_exit_code = run_nucmer(nucmer_fpath, ref_fpath, contigs_fpath,
                                          log_out_fpath, log_err_fpath, index, threads)
            if nucmer_exit_code != 0:
                return NucmerStatus.ERROR
        else:
            prefixes_and_chr_files = [(nucmer_fpath + "_" + basename(chr_fname), chr_fname)
                                      for chr_fname in qconfig.splitted_ref]

            # Daemonic processes are not allowed to have children,
            # so if we are already one of parallel processes
            # (i.e. daemonic) we can't start new daemonic processes
            if parallel_by_chr and not qconfig.memory_efficient:
                n_jobs = min(qconfig.max_threads, len(prefixes_and_chr_files))
                threads = max(1, threads // n_jobs)
            else:
                n_jobs = 1
                threads = 1
            if n_jobs > 1:
                logger.info(' ' + 'Aligning to different chromosomes in parallel'
                            ' (' + str(n_jobs) + ' threads)')

            # processing each chromosome separately (if we can)
            if is_python2():
                from joblib import Parallel, delayed
            else:
                from joblib3 import Parallel, delayed
            if not qconfig.memory_efficient:
                nucmer_exit_codes = Parallel(n_jobs=n_jobs)(delayed(run_nucmer)(
                    prefix, chr_file, contigs_fpath, log_out_fpath, log_err_fpath + "_part%d" % (i + 1), index, threads)
                    for i, (prefix, chr_file) in enumerate(prefixes_and_chr_files))
            else:
                nucmer_exit_codes = [run_nucmer(prefix, chr_file, contigs_fpath, log_out_fpath,
                                                log_err_fpath + "_part%d" % (i + 1), index, threads)
                                     for i, (prefix, chr_file) in enumerate(prefixes_and_chr_files)]

            log_err_f.write("Stderr outputs for reference parts are in:\n")
            for i in range(len(prefixes_and_chr_files)):
                log_err_f.write(log_err_fpath + "_part%d" % (i + 1) + '\n')
            log_err_f.write("\n")

            # Give up only when every reference part failed.
            if 0 not in nucmer_exit_codes:
                return NucmerStatus.ERROR

            # filling common delta file
            with open(delta_fpath, 'w') as delta_file:
                delta_file.write(ref_fpath + " " + contigs_fpath + "\n")
                delta_file.write("NUCMER\n")
                for i, (prefix, chr_fname) in enumerate(prefixes_and_chr_files):
                    if nucmer_exit_codes[i] != 0:
                        logger.warning(' ' + qutils.index_to_str(index) +
                                       'Failed aligning contigs %s to reference part %s! Skipping this part. ' %
                                       (qutils.label_from_fpath(contigs_fpath), chr_fname) +
                                       ('Run with the --debug flag to see additional information.'
                                        if not qconfig.debug else ''))
                        continue

                    chr_delta_fpath = prefix + '.delta'
                    if isfile(chr_delta_fpath):
                        with open(chr_delta_fpath) as chr_delta_file:
                            # The first two lines of each part are dropped: the
                            # common file already got its own two header lines.
                            chr_delta_file.readline()
                            chr_delta_file.readline()
                            for line in chr_delta_file:
                                delta_file.write(line)

        # By default: filtering by IDY% = 95 (as GAGE did)
        with open(filtered_delta_fpath, 'w') as filtered_delta_f:
            return_code = qutils.call_subprocess(
                [bin_fpath('delta-filter'), '-i', str(qconfig.min_IDY), '-l', str(qconfig.min_alignment), delta_fpath],
                stdout=filtered_delta_f,
                stderr=log_err_f,
                indent=' ' + qutils.index_to_str(index))
        if return_code != 0:
            log_err_f.write(qutils.index_to_str(index) + ' Delta filter failed for ' + contigs_fpath + '\n')
            return NucmerStatus.ERROR

        shutil.move(filtered_delta_fpath, delta_fpath)

        if qconfig.draw_plots:
            draw_mummer_plot(logger, nucmer_fpath, delta_fpath, index, log_out_f, log_err_f)

        tmp_coords_fpath = coords_fpath + '_tmp'
        with open(tmp_coords_fpath, 'w') as tmp_coords_out:
            return_code = qutils.call_subprocess(
                [bin_fpath('show-coords'), delta_fpath],
                stdout=tmp_coords_out,
                stderr=log_err_f,
                indent=' ' + qutils.index_to_str(index))
        if return_code != 0:
            log_err_f.write(qutils.index_to_str(index) + ' Show-coords failed for ' + contigs_fpath + '\n')
            return NucmerStatus.ERROR

        # removing waste lines from coords file: only the two lines ending with
        # the '=====' separator are kept from the header
        with open(coords_fpath, 'w') as coords_file, open(tmp_coords_fpath) as tmp_coords_file:
            header = []
            for line in tmp_coords_file:
                header.append(line)
                if line.startswith('====='):
                    break
            coords_file.write(header[-2])
            coords_file.write(header[-1])
            for line in tmp_coords_file:
                coords_file.write(line)

        if not isfile(coords_fpath):
            return NucmerStatus.FAILED
        with open(coords_fpath) as coords_file:
            last_coords_line = coords_file.readlines()[-1]
        # NOTE(review): presumably a real alignment record has at least 13
        # columns, so a shorter last line means nothing was aligned - confirm.
        if len(last_coords_line.split()) < 13:
            return NucmerStatus.NOT_ALIGNED

        if qconfig.show_snps:
            headless_coords_fpath = coords_fpath + '.headless'
            with open(coords_fpath) as coords_file, open(headless_coords_fpath, 'w') as headless_coords_f:
                coords_file.readline()
                coords_file.readline()
                headless_coords_f.write(coords_file.read())

            with open(headless_coords_fpath) as headless_coords_f, open(show_snps_fpath, 'w') as show_snps_f:
                return_code = qutils.call_subprocess(
                    [bin_fpath('show-snps'), '-S', '-T', '-H', delta_fpath],
                    stdin=headless_coords_f,
                    stdout=show_snps_f,
                    stderr=log_err_f,
                    indent=' ' + qutils.index_to_str(index))
            if return_code != 0:
                log_err_f.write(qutils.index_to_str(index) + ' Show-snps failed for ' + contigs_fpath + '\n')
                return NucmerStatus.ERROR

        create_nucmer_successful_check(nucmer_successful_check_fpath, old_contigs_fpath, ref_fpath)
        return NucmerStatus.OK
def align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                  parallel_by_chr, threads, log_out_fpath, log_err_fpath):
    """Align one assembly to the reference and produce the coords/SNP files.

    Existing results are reused when the "successful check" file and the
    coords file (plus the SNP file, if SNPs are requested) are present and
    ``check_nucmer_successful_check`` accepts them.

    Args:
        nucmer_fpath: path prefix from which all output file names are derived.
        ref_fpath: reference file path.
        contigs_fpath: assembly file path.
        old_contigs_fpath: path handed to the successful-check helpers.
        index: assembly index, used only for log message prefixes.
        parallel_by_chr: when true (and not in memory-efficient mode), the
            parts of a split reference are aligned with several jobs.
        threads: number of threads available to this call.
        log_out_fpath: stdout log path (truncated here).
        log_err_fpath: stderr log path (truncated here).

    Returns:
        ``NucmerStatus.OK``, ``NucmerStatus.ERROR`` (a tool exited with a
        non-zero code), ``NucmerStatus.FAILED`` (no coords file) or
        ``NucmerStatus.NOT_ALIGNED`` (last coords line has fewer than 13
        fields).
    """
    # Both logs stay open for the whole call and are closed on every exit path.
    with open(log_out_fpath, 'w') as log_out_f, open(log_err_fpath, 'w') as log_err_f:
        nucmer_successful_check_fpath = nucmer_fpath + '.sf'
        delta_fpath = nucmer_fpath + '.delta'
        filtered_delta_fpath = nucmer_fpath + '.fdelta'
        coords_fpath, _, _, show_snps_fpath, _ = \
            get_nucmer_aux_out_fpaths(nucmer_fpath)

        log_out_f.write('Aligning contigs to reference...\n')

        # Checking if there are existing previous nucmer alignments.
        # If they exist, using them to save time.
        using_existing_alignments = False
        if isfile(nucmer_successful_check_fpath) and isfile(coords_fpath) and \
                (isfile(show_snps_fpath) or isfile(show_snps_fpath + '.gz') or not qconfig.show_snps):
            if check_nucmer_successful_check(nucmer_successful_check_fpath, old_contigs_fpath, ref_fpath):
                log_out_f.write('\tUsing existing alignments...\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Using existing alignments... ')
                using_existing_alignments = True
        if using_existing_alignments:
            return NucmerStatus.OK

        log_out_f.write('\tAligning contigs to the reference\n')
        logger.info(' ' + qutils.index_to_str(index) + 'Aligning contigs to the reference')

        if not qconfig.splitted_ref:
            nucmer_exit_code = run_nucmer(nucmer_fpath, ref_fpath, contigs_fpath,
                                          log_out_fpath, log_err_fpath, index, threads)
            if nucmer_exit_code != 0:
                return NucmerStatus.ERROR
        else:
            prefixes_and_chr_files = [(nucmer_fpath + "_" + basename(chr_fname), chr_fname)
                                      for chr_fname in qconfig.splitted_ref]

            # Daemonic processes are not allowed to have children,
            # so if we are already one of parallel processes
            # (i.e. daemonic) we can't start new daemonic processes
            if parallel_by_chr and not qconfig.memory_efficient:
                n_jobs = min(qconfig.max_threads, len(prefixes_and_chr_files))
                threads = max(1, threads // n_jobs)
            else:
                n_jobs = 1
                threads = 1
            if n_jobs > 1:
                logger.info(' ' + 'Aligning to different chromosomes in parallel'
                            ' (' + str(n_jobs) + ' threads)')

            # processing each chromosome separately (if we can)
            if is_python2():
                from joblib import Parallel, delayed
            else:
                from joblib3 import Parallel, delayed
            nucmer_exit_codes = Parallel(n_jobs=n_jobs)(delayed(run_nucmer)(
                prefix, chr_file, contigs_fpath, log_out_fpath, log_err_fpath + "_part%d" % (i + 1), index, threads)
                for i, (prefix, chr_file) in enumerate(prefixes_and_chr_files))

            log_err_f.write("Stderr outputs for reference parts are in:\n")
            for i in range(len(prefixes_and_chr_files)):
                log_err_f.write(log_err_fpath + "_part%d" % (i + 1) + '\n')
            log_err_f.write("\n")

            # Give up only when every reference part failed.
            if 0 not in nucmer_exit_codes:
                return NucmerStatus.ERROR

            # filling common delta file
            with open(delta_fpath, 'w') as delta_file:
                delta_file.write(ref_fpath + " " + contigs_fpath + "\n")
                delta_file.write("NUCMER\n")
                for i, (prefix, chr_fname) in enumerate(prefixes_and_chr_files):
                    if nucmer_exit_codes[i] != 0:
                        logger.warning(' ' + qutils.index_to_str(index) +
                                       'Failed aligning contigs %s to reference part %s! Skipping this part. ' %
                                       (qutils.label_from_fpath(contigs_fpath), chr_fname) +
                                       ('Run with the --debug flag to see additional information.'
                                        if not qconfig.debug else ''))
                        continue

                    chr_delta_fpath = prefix + '.delta'
                    if isfile(chr_delta_fpath):
                        with open(chr_delta_fpath) as chr_delta_file:
                            # The first two lines of each part are dropped: the
                            # common file already got its own two header lines.
                            chr_delta_file.readline()
                            chr_delta_file.readline()
                            for line in chr_delta_file:
                                delta_file.write(line)

        # By default: filtering by IDY% = 95 (as GAGE did)
        with open(filtered_delta_fpath, 'w') as filtered_delta_f:
            return_code = qutils.call_subprocess(
                [bin_fpath('delta-filter'), '-i', str(qconfig.min_IDY), '-l', str(qconfig.min_alignment), delta_fpath],
                stdout=filtered_delta_f,
                stderr=log_err_f,
                indent=' ' + qutils.index_to_str(index))
        if return_code != 0:
            log_err_f.write(qutils.index_to_str(index) + ' Delta filter failed for ' + contigs_fpath + '\n')
            return NucmerStatus.ERROR

        shutil.move(filtered_delta_fpath, delta_fpath)

        tmp_coords_fpath = coords_fpath + '_tmp'
        with open(tmp_coords_fpath, 'w') as tmp_coords_out:
            return_code = qutils.call_subprocess(
                [bin_fpath('show-coords'), delta_fpath],
                stdout=tmp_coords_out,
                stderr=log_err_f,
                indent=' ' + qutils.index_to_str(index))
        if return_code != 0:
            log_err_f.write(qutils.index_to_str(index) + ' Show-coords failed for ' + contigs_fpath + '\n')
            return NucmerStatus.ERROR

        # removing waste lines from coords file: only the two lines ending with
        # the '=====' separator are kept from the header
        with open(coords_fpath, 'w') as coords_file, open(tmp_coords_fpath) as tmp_coords_file:
            header = []
            for line in tmp_coords_file:
                header.append(line)
                if line.startswith('====='):
                    break
            coords_file.write(header[-2])
            coords_file.write(header[-1])
            for line in tmp_coords_file:
                coords_file.write(line)

        if not isfile(coords_fpath):
            return NucmerStatus.FAILED
        with open(coords_fpath) as coords_file:
            last_coords_line = coords_file.readlines()[-1]
        # NOTE(review): presumably a real alignment record has at least 13
        # columns, so a shorter last line means nothing was aligned - confirm.
        if len(last_coords_line.split()) < 13:
            return NucmerStatus.NOT_ALIGNED

        if qconfig.show_snps:
            headless_coords_fpath = coords_fpath + '.headless'
            with open(coords_fpath) as coords_file, open(headless_coords_fpath, 'w') as headless_coords_f:
                coords_file.readline()
                coords_file.readline()
                headless_coords_f.write(coords_file.read())

            with open(headless_coords_fpath) as headless_coords_f, open(show_snps_fpath, 'w') as show_snps_f:
                return_code = qutils.call_subprocess(
                    [bin_fpath('show-snps'), '-S', '-T', '-H', delta_fpath],
                    stdin=headless_coords_f,
                    stdout=show_snps_f,
                    stderr=log_err_f,
                    indent=' ' + qutils.index_to_str(index))
            if return_code != 0:
                log_err_f.write(qutils.index_to_str(index) + ' Show-snps failed for ' + contigs_fpath + '\n')
                return NucmerStatus.ERROR

        create_nucmer_successful_check(nucmer_successful_check_fpath, old_contigs_fpath, ref_fpath)
        return NucmerStatus.OK
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    """Run BlastN for the assemblies and collect the best-scoring references.

    Args:
        blast_assemblies: assemblies to search; each must expose ``fpath`` and
            ``label``. BlastN is skipped when the sequence is empty.
        downloaded_dirpath: directory holding the ``blast.res`` result files.
        corrected_dirpath, err_fpath, blast_check_fpath: forwarded unchanged
            to ``parallel_blast``.
        labels: assembly labels whose BLAST result files are parsed.

    Returns:
        ``(scores_organisms, organisms_assemblies)``: a list of
        ``(score, seqname)`` tuples for references not already listed for a
        previously processed label, and a dict mapping each label to its
        selected sequence names. ``(None, None)`` when the BLAST binaries or
        database are unavailable or no reference passed the thresholds.
    """
    global db_fpath

    if not os.path.isdir(blastdb_dirpath):
        os.makedirs(blastdb_dirpath)

    if not download_all_blast_binaries():
        return None, None

    if qconfig.custom_blast_db_fpath:
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [f for f in os.listdir(db_fpath) if f.endswith('.nsq')]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath, db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error('You should specify path to BLAST database obtained by running makeblastdb command: '
                         'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                         ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                         exit_with_code=2)
    elif not os.path.isfile(db_fpath + '.nsq') or os.path.getsize(db_fpath + '.nsq') < db_nsq_fsize:
        # The default database is missing or smaller than expected.
        if not download_blastdb():
            return None, None
        logger.info()

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(
            assembly.fpath, assembly.label, corrected_dirpath, err_fpath, blast_res_fpath, blast_check_fpath,
            blast_threads) for assembly in blast_assemblies)

    logger.main_info('')
    scores_organisms = []
    organisms_assemblies = {}
    for label in labels:
        all_scores = []
        organisms = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                for line in res_file:
                    if line.startswith('#'):
                        # a comment line starts the hits of the next query
                        refs_for_query = 0
                        continue
                    if refs_for_query != 0:
                        continue  # only the first accepted hit per query is used
                    # TODO: find and parse "Fields" line to detect each column indexes:
                    # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                    # We need: identity, length, score, query and subject id.
                    fields = line.split()
                    if len(fields) < 12:
                        continue  # the bit score is read from column 11
                    organism_id = fields[1]
                    idy = float(fields[2])
                    length = int(fields[3])
                    score = float(fields[11])
                    if not (idy >= qconfig.identity_threshold and length >= qconfig.min_length
                            and score >= qconfig.min_bitscore):
                        continue
                    seqname, taxons = parse_organism_id(organism_id)
                    if not seqname:
                        continue
                    name_parts = seqname.split('_')
                    if len(name_parts) < 2 or 'uncultured' in seqname:
                        continue
                    species_name = name_parts[0] + '_' + name_parts[1]
                    if species_name not in organisms:
                        all_scores.append((score, seqname))
                        if taxons:
                            taxons_for_krona[correct_name(seqname)] = taxons
                        organisms.append(species_name)
                        refs_for_query += 1
                    else:
                        # keep only the best-scoring sequence of a known species
                        same_species = [x for x in all_scores if species_name in x[1]]
                        if same_species and score > same_species[0][0]:
                            all_scores.remove(same_species[0])
                            all_scores.append((score, seqname))
                            if taxons:
                                taxons_for_krona[correct_name(seqname)] = taxons
                            refs_for_query += 1

        all_scores = sorted(all_scores, reverse=True)
        all_scores = all_scores[:qconfig.max_references]
        for entry in all_scores:
            # report a reference only once, for the first label that found it
            if not any(entry[1] in seqnames for seqnames in organisms_assemblies.values()):
                scores_organisms.append(entry)
        organisms_assemblies[label] = [entry[1] for entry in all_scores]

    if not scores_organisms:
        return None, None
    return scores_organisms, organisms_assemblies
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    """Run BlastN for the assemblies and collect the best-scoring references.

    Args:
        blast_assemblies: assemblies to search; each must expose ``fpath`` and
            ``label``. BlastN is skipped when the sequence is empty.
        downloaded_dirpath: directory holding the ``blast.res`` result files.
        corrected_dirpath, err_fpath, blast_check_fpath: forwarded unchanged
            to ``parallel_blast``.
        labels: assembly labels whose BLAST result files are parsed.

    Returns:
        ``(scores_organisms, organisms_assemblies)``: a list of
        ``(score, seqname)`` tuples for references not already listed for a
        previously processed label, and a dict mapping each label to its
        selected sequence names. ``(None, None)`` when the BLAST binaries or
        database are unavailable or no reference passed the thresholds.
    """
    global db_fpath

    if not download_all_blast_binaries():
        return None, None

    if qconfig.custom_blast_db_fpath:
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [f for f in os.listdir(db_fpath) if f.endswith('.nsq')]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath, db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error('You should specify path to BLAST database obtained by running makeblastdb command: '
                         'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                         ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                         exit_with_code=2)
    elif not download_blastdb():
        return None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(
            assembly.fpath, assembly.label, corrected_dirpath, err_fpath, blast_res_fpath, blast_check_fpath,
            blast_threads) for assembly in blast_assemblies)

    logger.main_info('')
    scores_organisms = []
    organisms_assemblies = {}
    for label in labels:
        all_scores = []
        organisms = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                for line in res_file:
                    if line.startswith('#'):
                        # a comment line starts the hits of the next query
                        refs_for_query = 0
                        continue
                    if refs_for_query != 0:
                        continue  # only the first accepted hit per query is used
                    # TODO: find and parse "Fields" line to detect each column indexes:
                    # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                    # We need: identity, length, score, query and subject id.
                    fields = line.split()
                    if len(fields) < 12:
                        continue  # the bit score is read from column 11
                    organism_id = fields[1]
                    idy = float(fields[2])
                    length = int(fields[3])
                    score = float(fields[11])
                    if not (idy >= qconfig.identity_threshold and length >= qconfig.min_length
                            and score >= qconfig.min_bitscore):
                        continue
                    seqname, taxons = parse_organism_id(organism_id)
                    if not seqname:
                        continue
                    name_parts = seqname.split('_')
                    if len(name_parts) < 2 or 'uncultured' in seqname:
                        continue
                    species_name = name_parts[0] + '_' + name_parts[1]
                    if species_name not in organisms:
                        all_scores.append((score, seqname))
                        if taxons:
                            taxons_for_krona[correct_name(seqname)] = taxons
                        organisms.append(species_name)
                        refs_for_query += 1
                    else:
                        # keep only the best-scoring sequence of a known species
                        same_species = [x for x in all_scores if species_name in x[1]]
                        if same_species and score > same_species[0][0]:
                            all_scores.remove(same_species[0])
                            all_scores.append((score, seqname))
                            if taxons:
                                taxons_for_krona[correct_name(seqname)] = taxons
                            refs_for_query += 1

        all_scores = sorted(all_scores, reverse=True)
        all_scores = all_scores[:qconfig.max_references]
        for entry in all_scores:
            # report a reference only once, for the first label that found it
            if not any(entry[1] in seqnames for seqnames in organisms_assemblies.values()):
                scores_organisms.append(entry)
        organisms_assemblies[label] = [entry[1] for entry in all_scores]

    if not scores_organisms:
        return None, None
    return scores_organisms, organisms_assemblies
def process_blast(blast_assemblies, downloaded_dirpath, corrected_dirpath, labels, blast_check_fpath, err_fpath):
    """Run BlastN for the assemblies and collect the best-scoring references.

    Args:
        blast_assemblies: assemblies to search; each must expose ``fpath`` and
            ``label``. BlastN is skipped when the sequence is empty.
        downloaded_dirpath: directory holding the ``blast.res`` result files.
        corrected_dirpath, err_fpath, blast_check_fpath: forwarded unchanged
            to ``parallel_blast``.
        labels: assembly labels whose BLAST result files are parsed.

    Returns:
        ``(species_scores, species_by_assembly, replacement_dict)``:
        a list of ``(seqname, query_id, score)`` tuples for references not
        already listed for a previously processed label, a dict mapping each
        label to its selected sequence names, and a dict mapping a query id to
        the additional sequence names seen for it. Every failure path returns
        ``(None, None, None)`` so callers can always unpack three values.
    """
    global db_fpath

    if not download_blast_binaries(filenames=blast_filenames):
        return None, None, None

    if qconfig.custom_blast_db_fpath:
        db_fpath = qconfig.custom_blast_db_fpath
        if isdir(db_fpath):
            db_aux_files = [f for f in os.listdir(db_fpath) if f.endswith('.nsq')]
            if db_aux_files:
                db_fpath = join(qconfig.custom_blast_db_fpath, db_aux_files[0].replace('.nsq', ''))
        elif isfile(db_fpath) and db_fpath.endswith('.nsq'):
            db_fpath = db_fpath[:-len('.nsq')]
        if not os.path.isfile(db_fpath + '.nsq'):
            logger.error('You should specify path to BLAST database obtained by running makeblastdb command: '
                         'either path to directory containing <dbname>.nsq file or path to <dbname>.nsq file itself.'
                         ' Also you can rerun MetaQUAST without --blast-db option. MetaQUAST uses SILVA 16S RNA database by default.',
                         exit_with_code=2)
    elif not download_blastdb():
        return None, None, None

    blast_res_fpath = os.path.join(downloaded_dirpath, 'blast.res')

    if len(blast_assemblies) > 0:
        logger.main_info('Running BlastN..')
        n_jobs = min(qconfig.max_threads, len(blast_assemblies))
        blast_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        Parallel(n_jobs=n_jobs)(delayed(parallel_blast)(
            assembly.fpath, assembly.label, corrected_dirpath, err_fpath, blast_res_fpath, blast_check_fpath,
            blast_threads) for assembly in blast_assemblies)

    logger.main_info()
    species_scores = []
    species_by_assembly = dict()
    max_entries = 4  # at most this many hits are consumed per query
    replacement_dict = defaultdict(list)
    for label in labels:
        assembly_scores = []
        assembly_species = []
        res_fpath = get_blast_output_fpath(blast_res_fpath, label)
        if os.path.exists(res_fpath):
            refs_for_query = 0
            with open(res_fpath) as res_file:
                query_id_col, subj_id_col, idy_col, len_col, score_col = None, None, None, None, None
                for line in res_file:
                    fs = line.split()
                    if line.startswith('#'):
                        refs_for_query = 0
                        # Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
                        if 'Fields' in line:
                            fs = line.strip().split('Fields: ')[-1].split(', ')
                            query_id_col = fs.index('query id')
                            subj_id_col = fs.index('subject id')
                            idy_col = fs.index('% identity')
                            len_col = fs.index('alignment length')
                            score_col = fs.index('bit score')
                        continue
                    # Hit lines cannot be interpreted until a "Fields" header
                    # has told us where the columns are.
                    if score_col is None or refs_for_query >= max_entries or len(fs) <= score_col:
                        continue
                    query_id = fs[query_id_col]
                    organism_id = fs[subj_id_col]
                    idy = float(fs[idy_col])
                    length = int(fs[len_col])
                    score = float(fs[score_col])
                    if not (idy >= qconfig.identity_threshold and length >= qconfig.min_length
                            and score >= qconfig.min_bitscore):
                        continue
                    seqname, taxons = parse_organism_id(organism_id)
                    if not seqname:
                        continue
                    name_parts = seqname.split('_')
                    if len(name_parts) < 2 or 'uncultured' in seqname:
                        continue
                    species_name = name_parts[0] + '_' + name_parts[1]
                    if refs_for_query == 0:
                        if species_name not in assembly_species:
                            assembly_scores.append((seqname, query_id, score))
                            if taxons:
                                taxons_for_krona[correct_name(seqname)] = taxons
                            assembly_species.append(species_name)
                            refs_for_query += 1
                        else:
                            # Distinct loop names: on Python 2 comprehension
                            # variables leak and would overwrite the current
                            # hit's seqname/query_id/score.
                            seq_scores = [(known_seqname, known_query_id, known_score)
                                          for known_seqname, known_query_id, known_score in assembly_scores
                                          if species_name in known_seqname]
                            if seq_scores and score > seq_scores[0][2]:
                                assembly_scores.remove(seq_scores[0])
                                assembly_scores.append((seqname, query_id, score))
                                if taxons:
                                    taxons_for_krona[correct_name(seqname)] = taxons
                                refs_for_query += 1
                    else:
                        # further hits of the same query are remembered as
                        # possible replacements
                        if seqname not in replacement_dict[query_id]:
                            replacement_dict[query_id].append(seqname)
                            refs_for_query += 1

        # Rank by bit score (ties by name) before truncating; a plain tuple sort
        # would order the (seqname, query_id, score) entries by name instead.
        assembly_scores = sorted(assembly_scores, key=lambda entry: (entry[2], entry[0]), reverse=True)
        assembly_scores = assembly_scores[:qconfig.max_references]
        for entry in assembly_scores:
            # report a reference only once, for the first label that found it
            if not any(entry[0] in species_list for species_list in species_by_assembly.values()):
                species_scores.append(entry)
        species_by_assembly[label] = [entry[0] for entry in assembly_scores]

    if not species_scores:
        return None, None, None
    return species_scores, species_by_assembly, replacement_dict
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    """Predict genes in every assembly with the matching GeneMark flavour.

    MetaGeneMark is used when ``meta`` is true, GeneMarkS when ``prokaryote``
    is true, and GeneMark-ES otherwise.

    Args:
        fasta_fpaths: assembly FASTA paths to process.
        gene_lengths: forwarded unchanged to ``predict_genes``.
        out_dirpath: output directory; created (with a ``tmp`` subdirectory)
            when missing.
        prokaryote: selects GeneMarkS and is forwarded to ``predict_genes``.
        meta: selects MetaGeneMark.

    Returns:
        A dict mapping assembly label to the first item of its
        ``predict_genes`` result, or ``None`` when the tool cannot be used
        (license limitations, unsupported platform, failed installation).
    """
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(' Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
        return

    successful = install_genemark(os.path.join(qconfig.LIBS_LOCATION, 'genemark', qconfig.platform_name))
    if not successful:
        return

    if not os.path.isdir(out_dirpath):
        os.mkdir(out_dirpath)
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    # At least one job, so an empty input list neither divides by zero below
    # nor asks joblib for zero workers.
    n_jobs = max(1, min(len(fasta_fpaths), qconfig.max_threads))
    num_threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
        index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function,
        prokaryote, num_threads) for index, fasta_fpath in enumerate(fasta_fpaths))

    genes_by_labels = dict()
    # saving results
    for i, fasta_path in enumerate(fasta_fpaths):
        report = reporting.get(fasta_path)
        label = qutils.label_from_fpath(fasta_path)
        genes_by_labels[label], unique_count, count = results[i]
        if unique_count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
        if count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, count)
        if unique_count is None and count is None:
            # The size hint is added only for GeneMark-ES on inputs below 2000000 bytes.
            logger.error(' ' + qutils.index_to_str(i) + 'Failed predicting genes in ' + label + '. ' +
                         ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                          if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000 else ''))

    if not qconfig.debug:
        # remove every directory whose path starts with the tmp directory path
        for dirpath in glob.iglob(tmp_dirpath + '*'):
            if os.path.isdir(dirpath):
                shutil.rmtree(dirpath)

    logger.main_info('Done.')
    return genes_by_labels
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, features_dict, operons_fpaths,
       detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Analyze how well each aligned assembly covers the reference and its features.

    Loads the genomic features and operons, runs ``process_single_file`` for
    every assembly, writes the ``genome_info.txt`` summary table into
    ``genome_stats_dirpath``, fills the per-assembly reports and, depending
    on ``qconfig``, saves HTML data and draws plots.

    Args:
        ref_fpath: path to the reference FASTA file.
        aligned_contigs_fpaths: paths of the assemblies to analyze.
        output_dirpath: directory handed to the HTML saver and the FRC plot.
        features_dict: maps a feature kind to the file holding such features.
        operons_fpaths: files with operons; may be empty.
        detailed_contigs_reports_dirpath: directory containing the aligner
            output sub-directory (``qconfig.minimap_output_dirname``).
        genome_stats_dirpath: directory for ``genome_info.txt`` and plots;
            created when missing.

    Returns:
        The list of ``FeatureContainer`` objects, or ``None`` when
        ``process_single_file`` produced no result for any assembly.

    Note:
        ``ref_lengths_by_contigs`` is not defined in this function; it is
        presumably a module-level dict and is updated in place.
    """
    coords_dirpath = os.path.join(detailed_contigs_reports_dirpath, qconfig.minimap_output_dirname)
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        coords_dirpath = os.path.join(coords_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    genome_size, reference_chromosomes, ns_by_chromosomes = fastaparser.get_genome_stats(ref_fpath)

    # for cumulative plots:
    files_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_unsorted_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}
    files_unsorted_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # RESULTS file; the with-block closes it on every exit path, including
    # exceptions raised while parsing features or processing assemblies.
    result_fpath = os.path.join(genome_stats_dirpath, 'genome_info.txt')
    with open(result_fpath, 'w') as res_file:
        containers = []
        for feature, feature_fpath in features_dict.items():
            containers.append(FeatureContainer([feature_fpath], feature))
        if not features_dict:
            logger.notice('No file with genomic features were provided. '
                          'Use the --features option if you want to specify it.\n', indent=' ')
        if operons_fpaths:
            containers.append(FeatureContainer(operons_fpaths, 'operon'))
        else:
            logger.notice('No file with operons were provided. '
                          'Use the -O option if you want to specify it.', indent=' ')

        for container in containers:
            if not container.fpaths:
                continue

            for fpath in container.fpaths:
                container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

            if len(container.region_list) == 0:
                logger.warning('No genomic features of type "' + container.kind + '" were loaded.',
                               indent=' ')
                res_file.write('Genomic features of type "' + container.kind + '" loaded: ' +
                               'None' + '\n')
            else:
                logger.info(' Loaded ' + str(len(container.region_list)) +
                            ' genomic features of type "' + container.kind + '"')
                res_file.write('Genomic features of type "' + container.kind + '" loaded: ' +
                               str(len(container.region_list)) + '\n')
                container.chr_names_dict = chromosomes_names_dict(
                    container.kind, container.region_list, list(reference_chromosomes.keys()))

        ref_genes_num, ref_operons_num = None, None
        for contigs_fpath in aligned_contigs_fpaths:
            report = reporting.get(contigs_fpath)
            genomic_features = 0
            for container in containers:
                if container.kind == 'operon':
                    ref_operons_num = len(container.region_list)
                    report.add_field(reporting.Fields.REF_OPERONS, len(container.region_list))
                else:
                    genomic_features += len(container.region_list)
            if genomic_features:
                ref_genes_num = genomic_features
                report.add_field(reporting.Fields.REF_GENES, genomic_features)

        # process all contig files
        num_nf_errors = logger._num_nf_errors
        # joblib rejects n_jobs == 0, which min() yields for an empty list.
        n_jobs = max(1, min(len(aligned_contigs_fpaths), qconfig.max_threads))
        if is_python2():
            from joblib2 import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        if not qconfig.memory_efficient:
            process_results = Parallel(n_jobs=n_jobs)(
                delayed(process_single_file)(
                    contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                    reference_chromosomes, ns_by_chromosomes, containers)
                for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
        else:
            process_results = [
                process_single_file(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                                    reference_chromosomes, ns_by_chromosomes, containers)
                for index, contigs_fpath in enumerate(aligned_contigs_fpaths)]
        # A None result is counted as one non-fatal error.
        num_nf_errors += len([res for res in process_results if res is None])
        logger._num_nf_errors = num_nf_errors

        # Pair every result with its assembly BEFORE dropping the failed
        # ones; filtering first would shift the remaining results onto the
        # wrong assemblies.
        analyzed = [(fpath, res) for fpath, res in zip(aligned_contigs_fpaths, process_results)
                    if res]
        if not analyzed:
            logger.main_info('Genome analyzer failed for all the assemblies.')
            return
        analyzed_fpaths = [pair[0] for pair in analyzed]
        ref_lengths = [pair[1][0] for pair in analyzed]

        for ref in reference_chromosomes:
            ref_lengths_by_contigs[ref] = [lengths[ref] for lengths in ref_lengths]

        res_file.write('reference chromosomes:\n')
        for chr_name, chr_len in reference_chromosomes.items():
            aligned_len = max(ref_lengths_by_contigs[chr_name])
            res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, ' +
                           'total length without N\'s: ' +
                           str(chr_len - len(ns_by_chromosomes[chr_name])) +
                           ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
        res_file.write('\n')
        res_file.write('total genome size: ' + str(genome_size) + '\n\n')
        res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
        res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

        # header
        res_file.write('\n\n')
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial',
                          'operons', 'partial'))
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
        res_file.write('=' * 120 + '\n')

        for contigs_fpath, single_result in analyzed:
            (results, unsorted_features_in_contigs, features_in_contigs,
             unsorted_operons_in_contigs, operons_in_contigs) = single_result[1]
            assembly_name = qutils.name_from_fpath(contigs_fpath)

            files_features_in_contigs[contigs_fpath] = features_in_contigs
            files_unsorted_features_in_contigs[contigs_fpath] = unsorted_features_in_contigs
            files_operons_in_contigs[contigs_fpath] = operons_in_contigs
            files_unsorted_operons_in_contigs[contigs_fpath] = unsorted_operons_in_contigs
            full_found_genes.append(sum(features_in_contigs))
            full_found_operons.append(sum(operons_in_contigs))

            gaps_count = results["gaps_count"]
            genes_full = results[reporting.Fields.GENES + "_full"]
            genes_part = results[reporting.Fields.GENES + "_partial"]
            operons_full = results[reporting.Fields.OPERONS + "_full"]
            operons_part = results[reporting.Fields.OPERONS + "_partial"]

            report = reporting.get(contigs_fpath)
            res_file.write('%-25s| %-10s| %-12s| %-10s|'
                           % (assembly_name[:24],
                              report.get_field(reporting.Fields.MAPPEDGENOME),
                              report.get_field(reporting.Fields.DUPLICATION_RATIO),
                              gaps_count))
            genome_mapped.append(float(report.get_field(reporting.Fields.MAPPEDGENOME)))

            for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                        (reporting.Fields.OPERONS, operons_full, operons_part)]:
                if full is None and part is None:
                    res_file.write(' %-10s| %-10s|' % ('-', '-'))
                else:
                    res_file.write(' %-10s| %-10s|' % (full, part))
                    report.add_field(field, '%s + %s part' % (full, part))
            res_file.write('\n')

    # Only the assemblies that were actually analyzed have entries in the
    # dicts and lists above, so only those are passed on below.
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if ref_genes_num:
            html_saver.save_features_in_contigs(output_dirpath, analyzed_fpaths, 'features',
                                                files_features_in_contigs, ref_genes_num)
        if ref_operons_num:
            html_saver.save_features_in_contigs(output_dirpath, analyzed_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        from quast_libs.ca_utils.misc import contigs_aligned_lengths
        if ref_genes_num:
            plotter.genes_operons_plot(ref_genes_num, analyzed_fpaths, files_features_in_contigs,
                                       genome_stats_dirpath + '/features_cumulative_plot',
                                       'genomic features')
            plotter.frc_plot(output_dirpath, ref_fpath, analyzed_fpaths, contigs_aligned_lengths,
                             files_unsorted_features_in_contigs,
                             genome_stats_dirpath + '/features_frcurve_plot', 'genomic features')
            plotter.histogram(analyzed_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_features_histogram',
                              '# complete genomic features')
        if ref_operons_num:
            plotter.genes_operons_plot(ref_operons_num, analyzed_fpaths, files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot',
                                       'operons')
            plotter.frc_plot(output_dirpath, ref_fpath, analyzed_fpaths, contigs_aligned_lengths,
                             files_unsorted_operons_in_contigs,
                             genome_stats_dirpath + '/operons_frcurve_plot', 'operons')
            plotter.histogram(analyzed_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram',
                              '# complete operons')
        plotter.histogram(analyzed_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
    return containers
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    """Predict genes in every assembly with the matching GeneMark flavour.

    MetaGeneMark is selected when ``meta`` is true, GeneMarkS when
    ``prokaryote`` is true, and GeneMark-ES otherwise. Every assembly is
    passed to ``predict_genes`` (through joblib unless
    ``qconfig.memory_efficient`` is set) and the returned counts are stored
    in the report of that assembly.

    Args:
        fasta_fpaths: paths of the assemblies to process.
        gene_lengths: forwarded unchanged to ``predict_genes``.
        out_dirpath: output directory; created when missing. A ``tmp``
            sub-directory is created inside it.
        prokaryote: selects GeneMarkS (when ``meta`` is false) and is
            forwarded to ``predict_genes``.
        meta: selects MetaGeneMark.

    Returns:
        A dict mapping each assembly label to the first element of the
        ``predict_genes`` result for that assembly, or ``None`` when the run
        is skipped or ``is_license_valid`` returns a false value.
    """
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(' Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
        return
    if not install_genemark():
        logger.warning(' Can\'t copy the license key to ~/.gm_key, skipping gene prediction.')
        return

    if not os.path.isdir(out_dirpath):
        os.mkdir(out_dirpath)
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    # At least one worker: with an empty input list min() gives 0, which
    # would make the division below raise ZeroDivisionError.
    n_jobs = max(1, min(len(fasta_fpaths), qconfig.max_threads))
    # Threads left for each single prediction job.
    num_threads = max(1, qconfig.max_threads // n_jobs)

    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(
            delayed(predict_genes)(index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath,
                                   tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
            for index, fasta_fpath in enumerate(fasta_fpaths))
    else:
        # Memory-efficient mode runs the predictions one after another.
        results = [
            predict_genes(index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath,
                          tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
            for index, fasta_fpath in enumerate(fasta_fpaths)]

    if not is_license_valid(out_dirpath, fasta_fpaths):
        return

    # saving results
    genes_by_labels = dict()
    for i, fasta_path in enumerate(fasta_fpaths):
        report = reporting.get(fasta_path)
        label = qutils.label_from_fpath(fasta_path)
        genes_by_labels[label], unique_count, full_genes, partial_genes = results[i]

        if unique_count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
        if full_genes is not None:
            # One "<full> + <partial> part" entry per pair of counts.
            genes = ['%s + %s part' % (full_cnt, partial_cnt)
                     for full_cnt, partial_cnt in zip(full_genes, partial_genes)]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique_count is None and full_genes is None:
            # Both values missing means the prediction failed for this file.
            too_small_for_es = tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000
            logger.error(
                ' ' + qutils.index_to_str(i) + 'Failed predicting genes in ' + label + '. ' +
                ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                 if too_small_for_es else ''))

    if not qconfig.debug:
        # Removes every directory whose path starts with tmp_dirpath.
        for dirpath in glob.iglob(tmp_dirpath + '*'):
            if os.path.isdir(dirpath):
                shutil.rmtree(dirpath)

    logger.main_info('Done.')
    return genes_by_labels
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, genes_fpaths, operons_fpaths,
       detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Analyze how well each aligned assembly covers the reference, its genes and operons.

    Reads the reference chromosomes, loads genes and operons, runs
    ``process_single_file`` for every assembly, writes the
    ``genome_info.txt`` summary table into ``genome_stats_dirpath``, stores
    genome fraction, duplication ratio and gene/operon counts in the
    per-assembly reports and, depending on ``qconfig``, saves HTML data and
    draws plots.

    Args:
        ref_fpath: path to the reference FASTA file.
        aligned_contigs_fpaths: paths of the assemblies to analyze.
        output_dirpath: directory handed to the HTML saver and the FRC plot.
        genes_fpaths: files with genes; may be empty.
        operons_fpaths: files with operons; may be empty.
        detailed_contigs_reports_dirpath: directory containing the
            ``nucmer_output`` sub-directory.
        genome_stats_dirpath: directory for ``genome_info.txt`` and plots;
            created when missing.

    Returns:
        ``[genes_container, operons_container]``, or ``None`` when
        ``process_single_file`` produced no result for any assembly.

    Note:
        ``ref_lengths_by_contigs`` is not defined in this function; it is
        presumably a module-level dict and is updated in place.
    """
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    # Chromosome name (first word of the FASTA header) -> sequence length.
    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # for cumulative plots:
    files_genes_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_unsorted_genes_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}
    files_unsorted_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # RESULTS file; the with-block closes it on every exit path, including
    # exceptions raised while parsing features or processing assemblies.
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    with open(result_fpath, 'w') as res_file:
        genes_container = FeatureContainer(genes_fpaths, 'gene')
        operons_container = FeatureContainer(operons_fpaths, 'operon')
        for container in [genes_container, operons_container]:
            if not container.fpaths:
                logger.notice('No file with ' + container.kind + 's provided. '
                              'Use the -' + container.kind[0].capitalize() + ' option '
                              'if you want to specify it.', indent=' ')
                continue

            for fpath in container.fpaths:
                container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

            if len(container.region_list) == 0:
                logger.warning('No ' + container.kind + 's were loaded.', indent=' ')
                res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
            else:
                logger.info(' Loaded ' + str(len(container.region_list)) + ' ' +
                            container.kind + 's')
                res_file.write(container.kind + 's loaded: ' +
                               str(len(container.region_list)) + '\n')
                container.chr_names_dict = chromosomes_names_dict(
                    container.kind, container.region_list, list(reference_chromosomes.keys()))

        for contigs_fpath in aligned_contigs_fpaths:
            report = reporting.get(contigs_fpath)
            if genes_container.fpaths:
                report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
            if operons_container.fpaths:
                report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

        # process all contig files
        num_nf_errors = logger._num_nf_errors
        # joblib rejects n_jobs == 0, which min() yields for an empty list.
        n_jobs = max(1, min(len(aligned_contigs_fpaths), qconfig.max_threads))
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        if not qconfig.memory_efficient:
            process_results = Parallel(n_jobs=n_jobs)(
                delayed(process_single_file)(
                    contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                    reference_chromosomes, genes_container, operons_container)
                for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
        else:
            process_results = [
                process_single_file(contigs_fpath, index, nucmer_path_dirpath,
                                    genome_stats_dirpath, reference_chromosomes,
                                    genes_container, operons_container)
                for index, contigs_fpath in enumerate(aligned_contigs_fpaths)]
        # A None result is counted as one non-fatal error.
        num_nf_errors += len([res for res in process_results if res is None])
        logger._num_nf_errors = num_nf_errors

        # Pair every result with its assembly BEFORE dropping the failed
        # ones; filtering first would shift the remaining results onto the
        # wrong assemblies.
        analyzed = [(fpath, res) for fpath, res in zip(aligned_contigs_fpaths, process_results)
                    if res]
        if not analyzed:
            logger.main_info('Genome analyzer failed for all the assemblies.')
            return
        analyzed_fpaths = [pair[0] for pair in analyzed]
        ref_lengths = [pair[1][0] for pair in analyzed]

        for ref in reference_chromosomes:
            ref_lengths_by_contigs[ref] = [lengths[ref] for lengths in ref_lengths]

        res_file.write('reference chromosomes:\n')
        for chr_name, chr_len in reference_chromosomes.items():
            aligned_len = max(ref_lengths_by_contigs[chr_name])
            res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) +
                           ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
        res_file.write('\n')
        res_file.write('total genome size: ' + str(genome_size) + '\n\n')
        res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
        res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

        # header
        res_file.write('\n\n')
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial',
                          'operons', 'partial'))
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
        res_file.write('=' * 120 + '\n')

        for contigs_fpath, single_result in analyzed:
            (results, unsorted_genes_in_contigs, genes_in_contigs,
             unsorted_operons_in_contigs, operons_in_contigs) = single_result[1]
            assembly_name = qutils.name_from_fpath(contigs_fpath)

            files_genes_in_contigs[contigs_fpath] = genes_in_contigs
            files_unsorted_genes_in_contigs[contigs_fpath] = unsorted_genes_in_contigs
            files_operons_in_contigs[contigs_fpath] = operons_in_contigs
            files_unsorted_operons_in_contigs[contigs_fpath] = unsorted_operons_in_contigs
            full_found_genes.append(sum(genes_in_contigs))
            full_found_operons.append(sum(operons_in_contigs))

            covered_bp = results["covered_bp"]
            gaps_count = results["gaps_count"]
            genes_full = results[reporting.Fields.GENES + "_full"]
            genes_part = results[reporting.Fields.GENES + "_partial"]
            operons_full = results[reporting.Fields.OPERONS + "_full"]
            operons_part = results[reporting.Fields.OPERONS + "_partial"]

            report = reporting.get(contigs_fpath)
            # NOTE(review): both divisions below raise ZeroDivisionError when
            # genome_size or covered_bp is 0 - confirm that
            # process_single_file never reports such a result.
            genome_fraction = float(covered_bp) * 100 / float(genome_size)
            duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                                 report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                                 report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                                 report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                                ((genome_fraction / 100.0) * float(genome_size))

            res_file.write('%-25s| %-10s| %-12s| %-10s|'
                           % (assembly_name[:24], '%3.5f%%' % genome_fraction,
                              '%1.5f' % duplication_ratio, gaps_count))
            report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
            report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
            genome_mapped.append(genome_fraction)

            for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                        (reporting.Fields.OPERONS, operons_full, operons_part)]:
                if full is None and part is None:
                    res_file.write(' %-10s| %-10s|' % ('-', '-'))
                else:
                    res_file.write(' %-10s| %-10s|' % (full, part))
                    report.add_field(field, '%s + %s part' % (full, part))
            res_file.write('\n')

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # Only the assemblies that were actually analyzed have entries in the
    # dicts and lists above, so only those are passed on below.
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, analyzed_fpaths, 'genes',
                                                files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, analyzed_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        from quast_libs.ca_utils.misc import contigs_aligned_lengths
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), analyzed_fpaths,
                                       files_genes_in_contigs,
                                       genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.frc_plot(output_dirpath, ref_fpath, analyzed_fpaths, contigs_aligned_lengths,
                             files_unsorted_genes_in_contigs,
                             genome_stats_dirpath + '/genes_frcurve_plot', 'genes')
            plotter.histogram(analyzed_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_genes_histogram',
                              '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), analyzed_fpaths,
                                       files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot',
                                       'operons')
            plotter.frc_plot(output_dirpath, ref_fpath, analyzed_fpaths, contigs_aligned_lengths,
                             files_unsorted_operons_in_contigs,
                             genome_stats_dirpath + '/operons_frcurve_plot', 'operons')
            plotter.histogram(analyzed_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram',
                              '# complete operons')
        plotter.histogram(analyzed_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
    return [genes_container, operons_container]