def check_emem_functionality(logger):
    """Smoke-test the E-MEM aligner on the two bundled test contig files.

    Returns True straight away when E-MEM is not the selected aligner.
    Otherwise a single-threaded test alignment is run; a non-zero exit code
    is reported to the user and recorded by creating the
    ``e_mem_failed_compilation_flag`` file. The temporary files of the test
    run are removed and the result of ``compile_aligner(logger)`` is returned.
    """
    if not is_emem_aligner():
        return True

    logger.debug('Checking correctness of E-MEM compilation...')
    test_dirpath = create_nucmer_output_dir(qconfig.output_dirpath)
    test_prefix = join(test_dirpath, 'test')
    exit_code = run_nucmer(
        test_prefix,
        options_parser.test_contigs_fpaths[0],
        options_parser.test_contigs_fpaths[1],
        '/dev/null',
        '/dev/null',
        0,
        emem_threads=1)

    if exit_code != 0:
        logger.main_info('E-MEM does not work properly. QUAST will try to recompile contig aligner software.')
        # An empty flag file marks the failed E-MEM build.
        flag_file = open(e_mem_failed_compilation_flag, 'w')
        flag_file.close()

    # NOTE(review): cleanup and (re)compilation are taken to run on both the
    # success and the failure path -- confirm against the original layout.
    clean_tmp_files(test_prefix)
    return compile_aligner(logger)
def check_emem_functionality(logger):
    """Smoke-test the E-MEM aligner and fall back to Nucmer when it fails.

    Returns True straight away when E-MEM is not the selected aligner.
    Otherwise a single-threaded test alignment of the two bundled test contig
    files is run. On a non-zero exit code the failure is reported, the aligner
    selection is reset, ``qconfig.force_nucmer`` is switched on and the
    ``e_mem_failed_compilation_flag`` file is created. The temporary files of
    the test run are removed and the result of ``compile_aligner(logger)`` is
    returned.
    """
    if not is_emem_aligner():
        return True

    logger.debug('Checking correctness of E-MEM compilation...')
    test_dirpath = create_nucmer_output_dir(qconfig.output_dirpath)
    test_prefix = join(test_dirpath, 'test')
    exit_code = run_nucmer(
        test_prefix,
        options_parser.test_contigs_fpaths[0],
        options_parser.test_contigs_fpaths[1],
        '/dev/null',
        '/dev/null',
        0,
        emem_threads=1)

    if exit_code != 0:
        failure_message = (
            'Preinstalled E-MEM does not work properly.'
            if get_installed_emem()
            else 'E-MEM does not work properly. QUAST will try to use Nucmer.')
        logger.main_info(failure_message)
        # NOTE(review): the fallback steps below are taken to apply to both
        # the preinstalled and the bundled E-MEM case -- confirm.
        reset_aligner_selection()
        qconfig.force_nucmer = True
        safe_create(e_mem_failed_compilation_flag, logger, is_required=True)

    clean_tmp_files(test_prefix)
    return compile_aligner(logger)
def do(ref_fpath, contigs_fpaths, output_dirpath):
    """Run GAGE for every assembly and copy its metrics into the reports.

    Args:
        ref_fpath: path of the reference passed on to ``run_gage``.
        contigs_fpaths: paths of the assemblies to evaluate.
        output_dirpath: directory in which the ``gage`` sub-directory with
            all GAGE output is created.

    Returns:
        None. The function returns early (after logging an error) when the
        aligner or GAGE cannot be compiled, or when no GAGE run returned 0.
    """
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    gage_tool_path = os.path.join(gage_dirpath, 'getCorrectnessStats.sh')

    logger.print_timestamp()
    logger.main_info('Running GAGE...')

    # Labels searched for in the GAGE stdout log, in the order in which they
    # are consumed. Labels repeat, so matching is strictly sequential.
    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs',
               'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    # Report field for the label at the same index in ``metrics``.
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
                            reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
                            reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES,
                            reporting.Fields.GAGE_MISSINGASMBLYBASES, reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
                            reporting.Fields.GAGE_DUPREFBASES, reporting.Fields.GAGE_COMPRESSEDREFBASES,
                            reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY,
                            reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS,
                            reporting.Fields.GAGE_LONGINDELS, reporting.Fields.GAGE_INVERSIONS,
                            reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION,
                            reporting.Fields.GAGE_NUMCORCONTIGS, reporting.Fields.GAGE_CORASMBLYSIZE,
                            reporting.Fields.GAGE_MINCORCONTIG, reporting.Fields.GAGE_MAXCORCOTING,
                            reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    if not compile_aligner(logger) or (not all_required_java_classes_exist(gage_dirpath) and not compile_gage()):
        logger.error('GAGE module was not installed properly, so it is disabled and you cannot use --gage.')
        return

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    if not qconfig.memory_efficient:
        return_codes = Parallel(n_jobs=n_jobs)(
            delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
            for i, contigs_fpath in enumerate(contigs_fpaths))
    else:
        # Memory-efficient mode processes the assemblies one after another.
        return_codes = [
            run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
            for i, contigs_fpath in enumerate(contigs_fpaths)]

    # Only give up when not a single run succeeded.
    if 0 not in return_codes:
        logger.error('Error occurred while GAGE was processing assemblies.'
                     ' See GAGE error logs for details: %s' % os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    # Find metrics for the total report.
    for contigs_fpath in contigs_fpaths:
        corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
        report = reporting.get(contigs_fpath)
        log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')

        # The context manager closes the log even if parsing a line or
        # adding a field raises.
        with open(log_out_fpath, 'r') as logfile_out:
            cur_metric_id = 0
            for line in logfile_out:
                metric = metrics[cur_metric_id]
                if metric not in line:
                    continue
                if metric.startswith('N50'):
                    # Split on 'N50:' rather than on the first colon
                    # (presumably the N50 line holds other colons -- TODO confirm).
                    value = line.split(metric + ':')[1].strip()
                else:
                    value = line.split(':')[1].strip()
                report.add_field(metrics_in_reporting[cur_metric_id], value)
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
return any(c in args for c in cmds) if 'bdist_wheel' in sys.argv: raise RuntimeError( "This setup.py does not support wheels. setup.py install will be run automatically..." ) if abspath(dirname(__file__)) != abspath(os.getcwd()): logger.error('Please change to ' + dirname(__file__) + ' before running setup.py') sys.exit() if cmd_in(['clean', 'sdist']): logger.info('Cleaning up binary files...') compile_aligner(logger, only_clean=True) compile_gnuplot(logger, only_clean=True) compile_glimmer(logger, only_clean=True) compile_gage(only_clean=True) compile_bwa(logger, only_clean=True) compile_bedtools(logger, only_clean=True) for fpath in [fn for fn in glob(join(quast_package, '*.pyc'))]: os.remove(fpath) for fpath in [ fn for fn in glob(join(quast_package, 'html_saver', '*.pyc')) ]: os.remove(fpath) for fpath in [ fn for fn in glob(join(quast_package, 'site_packages', '*', '*.pyc')) ]:
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Align every assembly to the reference and save the alignment reports.

    Args:
        reference: reference passed to ``get_genome_stats`` and to each
            ``align_and_analyze`` job.
        contigs_fpaths: paths of the assemblies to analyze.
        is_cyclic: forwarded unchanged to ``align_and_analyze``.
        output_dir: directory for the analyzer output (created if missing).
        old_contigs_fpaths: paths paired one-to-one with ``contigs_fpaths``
            and forwarded to ``align_and_analyze``.
        bed_fpath: optional path forwarded to ``align_and_analyze``.

    Returns:
        A pair ``(aligner_statuses, aligned_lengths_per_fpath)`` of dicts
        keyed by contigs path. When the aligner cannot be compiled, every
        status is ``AlignerStatus.FAILED`` and the second item is ``None``.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    if not compile_aligner(logger):
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [AlignerStatus.FAILED] * len(contigs_fpaths))), None

    num_nf_errors = logger._num_nf_errors
    create_minimap_output_dir(output_dir)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    genome_size, reference_chromosomes, ns_by_chromosomes = get_genome_stats(reference, skip_ns=True)
    # In memory-efficient mode each job gets the full thread budget;
    # otherwise the budget is split evenly between the jobs.
    if qconfig.memory_efficient:
        threads = qconfig.max_threads
    else:
        threads = max(1, qconfig.max_threads // n_jobs)

    args = [(is_cyclic, i, contigs_fpath, output_dir, reference, reference_chromosomes, ns_by_chromosomes,
             old_contigs_fpath, bed_fpath, threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths))]
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = \
        run_parallel(align_and_analyze, args, n_jobs)

    reports = []
    aligner_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))
    any_ok = AlignerStatus.OK in aligner_statuses.values()

    if any_ok and qconfig.is_combined_ref:
        save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == AlignerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference, genome_size))
        elif statuses[index] == AlignerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if any_ok:
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict(zip(contigs_fpaths, misassemblies_in_contigs))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths,
                             misassemblies_in_contigs, join(output_dir, 'misassemblies_frcurve_plot'),
                             'misassemblies')

    status_values = list(aligner_statuses.values())
    oks = status_values.count(AlignerStatus.OK)
    not_aligned = status_values.count(AlignerStatus.NOT_ALIGNED)
    failed = status_values.count(AlignerStatus.FAILED)
    errors = status_values.count(AlignerStatus.ERROR)
    problems = not_aligned + failed + errors
    total = len(aligner_statuses)
    logger._num_nf_errors = num_nf_errors + errors

    # Deliberately three independent checks rather than an if/elif chain:
    # for an empty input more than one of them holds.
    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info('Done for ' + str(total - problems) + ' out of ' + str(total) +
                         '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return aligner_statuses, aligned_lengths_per_fpath
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Align every assembly to the reference and save the alignment reports.

    Assemblies are processed either as parallel joblib jobs (one per
    assembly) or one after another with ``parallel_by_chr=True``, depending
    on ``qconfig.splitted_ref`` and ``qconfig.memory_efficient``.

    Returns:
        A pair ``(nucmer_statuses, aligned_lengths_per_fpath)`` of dicts
        keyed by contigs path. When the aligner cannot be compiled, every
        status is ``NucmerStatus.FAILED`` and the second item is ``None``.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    if not compile_aligner(logger):
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        all_failed = dict((fpath, NucmerStatus.FAILED) for fpath in contigs_fpaths)
        return all_failed, None

    if qconfig.draw_plots:
        compile_gnuplot(logger, only_clean=False)

    num_nf_errors = logger._num_nf_errors
    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    # One joblib job per assembly unless memory-efficient mode is on or the
    # split reference has more parts than there are assemblies.
    if not qconfig.splitted_ref and not qconfig.memory_efficient:
        job_per_assembly = True
    else:
        job_per_assembly = (len(contigs_fpaths) >= len(qconfig.splitted_ref)
                            and not qconfig.memory_efficient)

    fpath_pairs = list(zip(contigs_fpaths, old_contigs_fpaths))
    if job_per_assembly:
        outputs = Parallel(n_jobs=n_jobs)(
            delayed(align_and_analyze)(is_cyclic, idx, contigs_fpath, output_dir, reference,
                                       old_contigs_fpath, bed_fpath, threads=threads)
            for idx, (contigs_fpath, old_contigs_fpath) in enumerate(fpath_pairs))
    else:
        outputs = [
            align_and_analyze(is_cyclic, idx, contigs_fpath, output_dir, reference,
                              old_contigs_fpath, bed_fpath, parallel_by_chr=True,
                              threads=qconfig.max_threads)
            for idx, (contigs_fpath, old_contigs_fpath) in enumerate(fpath_pairs)]

    # Transpose the per-assembly 5-tuples into five parallel lists.
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = (
        [output[column] for output in outputs] for column in range(5))

    reports = []
    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))
    any_ok = NucmerStatus.OK in nucmer_statuses.values()

    if any_ok and qconfig.is_combined_ref:
        save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for idx, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        status = statuses[idx]
        if status == NucmerStatus.OK:
            reports.append(save_result(results[idx], report, fname, reference))
        elif status == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[idx], report)

    if any_ok:
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict(
                (fpath, misassemblies_in_contigs[idx]) for idx, fpath in enumerate(contigs_fpaths))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths,
                             misassemblies_in_contigs, join(output_dir, 'misassemblies_frcurve_plot'),
                             'misassemblies')

    status_list = list(nucmer_statuses.values())
    oks = status_list.count(NucmerStatus.OK)
    errors = status_list.count(NucmerStatus.ERROR)
    problems = (status_list.count(NucmerStatus.NOT_ALIGNED)
                + status_list.count(NucmerStatus.FAILED)
                + errors)
    total = len(nucmer_statuses)
    logger._num_nf_errors = num_nf_errors + errors

    # Three independent checks, not an if/elif chain.
    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info('Done for ' + str(total - problems) + ' out of ' + str(total) +
                         '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return nucmer_statuses, aligned_lengths_per_fpath
args = sys.argv[1:]


def cmd_in(cmds):
    """Return True if any of the command names in ``cmds`` is among ``args``."""
    return any(c in args for c in cmds)


# The cleanup below removes 'build', 'dist' and the egg-info directory by
# relative path, so the script has to be started from its own directory.
if abspath(dirname(__file__)) != abspath(os.getcwd()):
    logger.error('Please change to ' + dirname(__file__) + ' before running setup.py')
    # Exit with a non-zero status so that callers can detect the failure.
    sys.exit(1)


if cmd_in(['clean', 'sdist']):
    logger.info('Cleaning up binary files...')
    compile_aligner(logger, only_clean=True)
    compile_glimmer(logger, only_clean=True)
    compile_gage(only_clean=True)
    compile_bwa(logger, only_clean=True)
    compile_bedtools(logger, only_clean=True)

    # Remove compiled Python files from the package and its sub-packages.
    pyc_patterns = [
        join(quast_package, '*.pyc'),
        join(quast_package, 'html_saver', '*.pyc'),
        join(quast_package, 'site_packages', '*', '*.pyc'),
    ]
    for pattern in pyc_patterns:
        for fpath in glob(pattern):
            os.remove(fpath)

    # 'clean' additionally wipes the packaging artefacts.
    if cmd_in(['clean']):
        for artefact_dirpath in ('build', 'dist', name + '.egg-info'):
            if isdir(artefact_dirpath):
                shutil.rmtree(artefact_dirpath)
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Align every assembly to the reference and save the alignment reports.

    With ``qconfig.test`` set and E-MEM selected, the aligner is additionally
    checked through ``check_emem_functionality`` before any alignment starts.
    Assemblies are processed either as parallel joblib jobs (one per
    assembly) or one after another with ``parallel_by_chr=True``, depending
    on ``qconfig.splitted_ref`` and ``qconfig.memory_efficient``.

    Returns:
        A pair ``(nucmer_statuses, aligned_lengths_per_fpath)`` of dicts
        keyed by contigs path. When the aligner is unusable, every status is
        ``NucmerStatus.FAILED`` and the second item is ``None``.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    aligner_ready = compile_aligner(logger)
    if qconfig.test and is_emem_aligner():
        aligner_ready = check_emem_functionality(logger)
    if not aligner_ready:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        all_failed = dict((fpath, NucmerStatus.FAILED) for fpath in contigs_fpaths)
        return all_failed, None

    if qconfig.draw_plots:
        compile_gnuplot(logger, only_clean=False)

    num_nf_errors = logger._num_nf_errors
    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = 1 if qconfig.memory_efficient else max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    # One joblib job per assembly unless memory-efficient mode is on or the
    # split reference has more parts than there are assemblies.
    if not qconfig.splitted_ref and not qconfig.memory_efficient:
        job_per_assembly = True
    else:
        job_per_assembly = (len(contigs_fpaths) >= len(qconfig.splitted_ref)
                            and not qconfig.memory_efficient)

    fpath_pairs = list(zip(contigs_fpaths, old_contigs_fpaths))
    if job_per_assembly:
        outputs = Parallel(n_jobs=n_jobs)(
            delayed(align_and_analyze)(is_cyclic, idx, contigs_fpath, output_dir, reference,
                                       old_contigs_fpath, bed_fpath, threads=threads)
            for idx, (contigs_fpath, old_contigs_fpath) in enumerate(fpath_pairs))
    else:
        outputs = [
            align_and_analyze(is_cyclic, idx, contigs_fpath, output_dir, reference,
                              old_contigs_fpath, bed_fpath, parallel_by_chr=True,
                              threads=qconfig.max_threads)
            for idx, (contigs_fpath, old_contigs_fpath) in enumerate(fpath_pairs)]

    # Transpose the per-assembly 5-tuples into five parallel lists.
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = (
        [output[column] for output in outputs] for column in range(5))

    reports = []
    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))
    any_ok = NucmerStatus.OK in nucmer_statuses.values()

    if any_ok and qconfig.is_combined_ref:
        save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for idx, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        status = statuses[idx]
        if status == NucmerStatus.OK:
            reports.append(save_result(results[idx], report, fname, reference))
        elif status == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[idx], report)

    if any_ok:
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict(
                (fpath, misassemblies_in_contigs[idx]) for idx, fpath in enumerate(contigs_fpaths))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths,
                             misassemblies_in_contigs, join(output_dir, 'misassemblies_frcurve_plot'),
                             'misassemblies')

    status_list = list(nucmer_statuses.values())
    oks = status_list.count(NucmerStatus.OK)
    errors = status_list.count(NucmerStatus.ERROR)
    problems = (status_list.count(NucmerStatus.NOT_ALIGNED)
                + status_list.count(NucmerStatus.FAILED)
                + errors)
    total = len(nucmer_statuses)
    logger._num_nf_errors = num_nf_errors + errors

    # Three independent checks, not an if/elif chain.
    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info('Done for ' + str(total - problems) + ' out of ' + str(total) +
                         '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        # NOTE(review): the --test hint is taken to belong to the
        # "everything failed" case -- confirm against the original layout.
        if not qconfig.test and is_emem_aligner():
            logger.warning('Please rerun QUAST using --test option to ensure that E-MEM aligner works properly.')

    return nucmer_statuses, aligned_lengths_per_fpath
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Align every assembly to the reference and save the alignment reports.

    With ``qconfig.test`` set and E-MEM selected, the aligner is additionally
    checked through ``check_emem_functionality`` before any alignment starts.
    Without a split reference the assemblies always run as parallel joblib
    jobs; with one, they run in parallel only when there are at least as many
    assemblies as reference parts and memory-efficient mode is off.

    Returns:
        A pair ``(nucmer_statuses, aligned_lengths_per_fpath)`` of dicts
        keyed by contigs path. When the aligner is unusable, every status is
        ``NucmerStatus.FAILED`` and the second item is ``None``.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    num_nf_errors = logger._num_nf_errors

    aligner_ready = compile_aligner(logger)
    if qconfig.test and is_emem_aligner():
        aligner_ready = check_emem_functionality(logger)
    if not aligner_ready:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        all_failed = dict((fpath, NucmerStatus.FAILED) for fpath in contigs_fpaths)
        return all_failed, None

    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = 1 if qconfig.memory_efficient else max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    if not qconfig.splitted_ref:
        job_per_assembly = True
    else:
        job_per_assembly = (len(contigs_fpaths) >= len(qconfig.splitted_ref)
                            and not qconfig.memory_efficient)

    fpath_pairs = list(zip(contigs_fpaths, old_contigs_fpaths))
    if job_per_assembly:
        outputs = Parallel(n_jobs=n_jobs)(
            delayed(align_and_analyze)(is_cyclic, idx, contigs_fpath, output_dir, reference,
                                       old_contigs_fpath, bed_fpath, threads=threads)
            for idx, (contigs_fpath, old_contigs_fpath) in enumerate(fpath_pairs))
    else:
        outputs = [
            align_and_analyze(is_cyclic, idx, contigs_fpath, output_dir, reference,
                              old_contigs_fpath, bed_fpath, parallel_by_chr=True,
                              threads=qconfig.max_threads)
            for idx, (contigs_fpath, old_contigs_fpath) in enumerate(fpath_pairs)]

    # Split the per-assembly tuples into three parallel lists.
    statuses = [output[0] for output in outputs]
    results = [output[1] for output in outputs]
    aligned_lengths = [output[2] for output in outputs]

    reports = []
    for idx, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        status = statuses[idx]
        if status == NucmerStatus.OK:
            reports.append(save_result(results[idx], report, fname))
        elif status == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[idx], report)

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        if qconfig.draw_plots:
            from . import plotter
            plotter.draw_misassembl_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        # NOTE(review): the combined-reference stats are taken to be saved
        # only when at least one assembly aligned -- confirm.
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    status_list = list(nucmer_statuses.values())
    oks = status_list.count(NucmerStatus.OK)
    errors = status_list.count(NucmerStatus.ERROR)
    problems = (status_list.count(NucmerStatus.NOT_ALIGNED)
                + status_list.count(NucmerStatus.FAILED)
                + errors)
    total = len(nucmer_statuses)
    logger._num_nf_errors = num_nf_errors + errors

    # Three independent checks, not an if/elif chain.
    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info('Done for ' + str(total - problems) + ' out of ' + str(total) +
                         '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        # NOTE(review): the --test hint is taken to belong to the
        # "everything failed" case -- confirm against the original layout.
        if not qconfig.test and is_emem_aligner():
            logger.warning('Please rerun QUAST using --test option to ensure that E-MEM aligner works properly.')

    return nucmer_statuses, aligned_lengths_per_fpath
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Align every assembly to the reference and save the alignment reports.

    One ``align_and_analyze`` job per assembly is dispatched through
    ``run_parallel``; the results are written into the per-assembly reports
    and, when at least one assembly aligned, into the misassembly/unaligned
    summaries and plots.

    Returns:
        A pair ``(aligner_statuses, aligned_lengths_per_fpath)`` of dicts
        keyed by contigs path. When the aligner cannot be compiled, every
        status is ``AlignerStatus.FAILED`` and the second item is ``None``.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    if not compile_aligner(logger):
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        all_failed = dict((fpath, AlignerStatus.FAILED) for fpath in contigs_fpaths)
        return all_failed, None

    num_nf_errors = logger._num_nf_errors
    create_minimap_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)

    genome_size, reference_chromosomes, ns_by_chromosomes = get_genome_stats(reference, skip_ns=True)
    # Memory-efficient mode hands the whole thread budget to each job.
    if qconfig.memory_efficient:
        threads = qconfig.max_threads

    job_args = []
    for idx, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
        job_args.append((is_cyclic, idx, contigs_fpath, output_dir, reference, reference_chromosomes,
                         ns_by_chromosomes, old_contigs_fpath, bed_fpath, threads))
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = \
        run_parallel(align_and_analyze, job_args, n_jobs)

    reports = []
    aligner_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))
    any_ok = AlignerStatus.OK in aligner_statuses.values()

    if any_ok and qconfig.is_combined_ref:
        save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for idx, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        status = statuses[idx]
        if status == AlignerStatus.OK:
            reports.append(save_result(results[idx], report, fname, reference, genome_size))
        elif status == AlignerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[idx], report)

    if any_ok:
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict(
                (fpath, misassemblies_in_contigs[idx]) for idx, fpath in enumerate(contigs_fpaths))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths,
                             misassemblies_in_contigs, join(output_dir, 'misassemblies_frcurve_plot'),
                             'misassemblies')

    status_list = list(aligner_statuses.values())
    oks = status_list.count(AlignerStatus.OK)
    errors = status_list.count(AlignerStatus.ERROR)
    problems = (status_list.count(AlignerStatus.NOT_ALIGNED)
                + status_list.count(AlignerStatus.FAILED)
                + errors)
    total = len(aligner_statuses)
    logger._num_nf_errors = num_nf_errors + errors

    # Three independent checks, not an if/elif chain.
    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info('Done for ' + str(total - problems) + ' out of ' + str(total) +
                         '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return aligner_statuses, aligned_lengths_per_fpath
def do(ref_fpath, contigs_fpaths, output_dirpath):
    """Run GAGE for every assembly and copy its metrics into the reports.

    Args:
        ref_fpath: path of the reference passed on to ``run_gage``.
        contigs_fpaths: paths of the assemblies to evaluate.
        output_dirpath: directory in which the ``gage`` sub-directory with
            all GAGE output is created.

    Returns:
        None. The function returns early (after logging an error) when the
        aligner or GAGE cannot be compiled, or when no GAGE run returned 0.
    """
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    gage_tool_path = os.path.join(gage_dirpath, 'getCorrectnessStats.sh')

    logger.print_timestamp()
    logger.main_info('Running GAGE...')

    # Labels searched for in the GAGE stdout log, in the order in which they
    # are consumed. Labels repeat, so matching is strictly sequential.
    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs',
               'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    # Report field for the label at the same index in ``metrics``.
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
                            reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
                            reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES,
                            reporting.Fields.GAGE_MISSINGASMBLYBASES, reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
                            reporting.Fields.GAGE_DUPREFBASES, reporting.Fields.GAGE_COMPRESSEDREFBASES,
                            reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY,
                            reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS,
                            reporting.Fields.GAGE_LONGINDELS, reporting.Fields.GAGE_INVERSIONS,
                            reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION,
                            reporting.Fields.GAGE_NUMCORCONTIGS, reporting.Fields.GAGE_CORASMBLYSIZE,
                            reporting.Fields.GAGE_MINCORCONTIG, reporting.Fields.GAGE_MAXCORCOTING,
                            reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    if not compile_aligner(logger) or (not all_required_java_classes_exist(gage_dirpath) and not compile_gage()):
        logger.error('GAGE module was not installed properly, so it is disabled and you cannot use --gage.')
        return

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(
        delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    # Only give up when not a single run succeeded.
    if 0 not in return_codes:
        logger.error('Error occurred while GAGE was processing assemblies.'
                     ' See GAGE error logs for details: %s' % os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    # Find metrics for the total report.
    for contigs_fpath in contigs_fpaths:
        corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
        report = reporting.get(contigs_fpath)
        log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')

        # The context manager closes the log even if parsing a line or
        # adding a field raises.
        with open(log_out_fpath, 'r') as logfile_out:
            cur_metric_id = 0
            for line in logfile_out:
                metric = metrics[cur_metric_id]
                if metric not in line:
                    continue
                if metric.startswith('N50'):
                    # Split on 'N50:' rather than on the first colon
                    # (presumably the N50 line holds other colons -- TODO confirm).
                    value = line.split(metric + ':')[1].strip()
                else:
                    value = line.split(':')[1].strip()
                report.add_field(metrics_in_reporting[cur_metric_id], value)
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
def do(reference, contigs_fpaths, cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Align every assembly to the reference and save the alignment reports.

    For a combined reference, a per-assembly table of interspecies
    translocations between references is additionally written to
    ``interspecies_translocations_by_refs_<assembly>.info`` in ``output_dir``.

    Args:
        reference: reference forwarded to each ``align_and_analyze`` call.
        contigs_fpaths: paths of the assemblies to analyze.
        cyclic: forwarded unchanged to ``align_and_analyze``.
        output_dir: directory for the analyzer output (created if missing).
        old_contigs_fpaths: paths paired one-to-one with ``contigs_fpaths``
            and forwarded to ``align_and_analyze``.
        bed_fpath: optional path forwarded to ``align_and_analyze``.

    Returns:
        A pair ``(nucmer_statuses, aligned_lengths_per_fpath)`` of dicts
        keyed by contigs path. When the aligner cannot be compiled, every
        status is ``NucmerStatus.FAILED`` and the second item is ``None``.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    num_nf_errors = logger._num_nf_errors

    if not compile_aligner(logger):
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if qconfig.memory_efficient:
        threads = 1
    else:
        threads = max(int(qconfig.max_threads / n_jobs), 1)

    from joblib import Parallel, delayed
    fpath_pairs = list(zip(contigs_fpaths, old_contigs_fpaths))
    # With a split reference, per-assembly jobs are used only when there are
    # at least as many assemblies as reference parts and memory-efficient
    # mode is off; otherwise each assembly is parallelised by chromosome.
    if not qconfig.splitted_ref or (
            len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient):
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(
            delayed(align_and_analyze)(cyclic, i, contigs_fpath, output_dir, reference,
                                       old_contigs_fpath, bed_fpath, threads=threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(fpath_pairs))
    else:
        statuses_results_lengths_tuples = [
            align_and_analyze(cyclic, i, contigs_fpath, output_dir, reference,
                              old_contigs_fpath, bed_fpath, parallel_by_chr=True,
                              threads=qconfig.max_threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(fpath_pairs)]

    # Split the per-assembly tuples into three parallel lists.
    statuses = [x[0] for x in statuses_results_lengths_tuples]
    results = [x[1] for x in statuses_results_lengths_tuples]
    aligned_lengths = [x[2] for x in statuses_results_lengths_tuples]
    reports = []

    if qconfig.is_combined_ref:
        ref_misassemblies = [result['istranslocations_by_refs'] if result else [] for result in results]
        for fpath, translocations in zip(contigs_fpaths, ref_misassemblies):
            if not translocations:
                continue
            assembly_name = qutils.name_from_fpath(fpath)
            all_refs = sorted(set(ref_labels_by_chromosomes.values()))

            # First row numbers the references; each following row holds the
            # translocation counts for one reference.
            all_rows = [{'metricName': 'References',
                         'values': list(range(1, len(all_refs) + 1))}]
            for k in all_refs:
                row = {'metricName': k, 'values': []}
                for ref in all_refs:
                    if ref == k or ref not in translocations:
                        row['values'].append(None)
                    else:
                        row['values'].append(translocations[ref][k])
                all_rows.append(row)

            misassembly_by_ref_fpath = join(
                output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
            # Explicit, context-managed writes replace ``print >> open(...)``,
            # which left the handles unclosed and fails on Python 3. The
            # header must be on disk before print_file appends the table.
            with open(misassembly_by_ref_fpath, 'w') as out_f:
                out_f.write('Number of interspecies translocations by references: \n\n')
            print_file(all_rows, misassembly_by_ref_fpath, append_to_existing_file=True)
            with open(misassembly_by_ref_fpath, 'a') as out_f:
                out_f.write('\nReferences: \n')
                for ref_num, ref in enumerate(all_refs):
                    out_f.write(str(ref_num + 1) + ' - ' + ref + '\n')
            logger.info(' Information about interspecies translocations by references for %s is saved to %s'
                        % (assembly_name, misassembly_by_ref_fpath))

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        if qconfig.draw_plots:
            import plotter
            plotter.draw_misassembl_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')

    # list() keeps .count() working where dict.values() returns a view.
    status_values = list(nucmer_statuses.values())
    oks = status_values.count(NucmerStatus.OK)
    not_aligned = status_values.count(NucmerStatus.NOT_ALIGNED)
    failed = status_values.count(NucmerStatus.FAILED)
    errors = status_values.count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    total = len(nucmer_statuses)
    logger._num_nf_errors = num_nf_errors + errors

    # Three independent checks, not an if/elif chain.
    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info('Done for ' + str(total - problems) + ' out of ' + str(total) +
                         '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return nucmer_statuses, aligned_lengths_per_fpath