def save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger):
    """Write per-assembly interspecies translocation tables and plot their totals.

    For every assembly whose result has a non-empty 'istranslocations_by_refs'
    entry, a reference-by-reference table is written to
    'interspecies_translocations_by_refs_<assembly name>.info' inside output_dir.
    Per-reference totals (translocations plus 'potential_misassemblies_by_refs')
    are collected per assembly and, when qconfig.draw_plots is set, handed to
    plotter.draw_meta_summary_plot.

    results -- one entry per assembly; a falsy entry means "no data"
    contigs_fpaths -- assembly paths, index-aligned with results
    ref_labels_by_chromosomes -- mapping whose values are the reference labels
    output_dir -- directory receiving the .info files and the plot
    logger -- object providing info()
    """
    translocations_per_asm = [res['istranslocations_by_refs'] if res else [] for res in results]
    potential_per_asm = [res['potential_misassemblies_by_refs'] if res else [] for res in results]
    all_refs = sorted(set(ref_labels_by_chromosomes.values()))

    # First row is the header; one row per assembly is appended below.
    summary_rows = [{'metricName': 'References', 'values': all_refs}]

    if translocations_per_asm:
        for asm_idx, fpath in enumerate(contigs_fpaths):
            totals = []
            summary_rows.append({'metricName': qutils.label_from_fpath(fpath), 'values': totals})
            translocations = translocations_per_asm[asm_idx]
            if not translocations:
                continue

            assembly_name = qutils.name_from_fpath(fpath)
            # References are shown by their 1-based number in the table header.
            table = [{'metricName': 'References', 'values': list(range(1, len(all_refs) + 1))}]
            for row_ref in all_refs:
                counts = []
                for col_ref in all_refs:
                    if col_ref != row_ref and col_ref in translocations:
                        counts.append(translocations[col_ref][row_ref])
                    else:
                        counts.append(None)
                found = sum(count for count in counts if count)
                totals.append(max(0, found + potential_per_asm[asm_idx][row_ref]))
                table.append({'metricName': row_ref, 'values': counts})

            info_fpath = os.path.join(output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
            with open(info_fpath, 'w') as info_file:
                info_file.write('Number of interspecies translocations by references: \n')
            print_file(table, info_fpath, append_to_existing_file=True)
            # Legend mapping the reference numbers used in the table to their labels.
            with open(info_fpath, 'a') as info_file:
                info_file.write('References:\n')
                for ref_num, ref in enumerate(all_refs, 1):
                    info_file.write(str(ref_num) + ' - ' + ref + '\n')
            logger.info(' Information about interspecies translocations by references for %s is saved to %s' %
                        (assembly_name, info_fpath))

    misassemblies = []
    if qconfig.draw_plots:
        from quast_libs import plotter
        aligned_contigs_labels = []
        # Iterate over a copy so rows without values can be dropped from the list itself.
        for row in summary_rows[1:]:
            if not row['values']:
                summary_rows.remove(row)
                continue
            aligned_contigs_labels.append(row['metricName'])
        # Transpose: one list per reference holding the value of every remaining assembly.
        for ref_idx in range(len(all_refs)):
            misassemblies.append([row['values'][ref_idx] for row in summary_rows[1:] if row['values']])
        is_translocations_plot_fpath = os.path.join(output_dir, 'intergenomic_misassemblies.' + qconfig.plot_extension)
        plotter.draw_meta_summary_plot('', output_dir, aligned_contigs_labels, all_refs, summary_rows,
                                       misassemblies, is_translocations_plot_fpath,
                                       title='Intergenomic misassemblies (found and supposed)',
                                       reverse=False, yaxis_title=None, print_all_refs=True)
def do(html_fpath, output_dirpath, combined_output_dirpath, output_dirpath_per_ref, metrics, misassembly_metrics, ref_names):
    """Save per-metric summaries (TXT, TEX, TSV and plots) across all references.

    For every non-tuple metric the per-reference results are collected with
    get_results_for_metric(), transposed so that each row is a reference and
    each column an assembly, written as text/TSV/TeX tables and plotted.
    For the misassemblies metric, an additional per-assembly plot is drawn from
    misassembly_metrics and, when qconfig.html_report is set, the plot data is
    saved through html_saver.save_meta_misassemblies.

    Output goes into the 'TXT', 'TEX', 'TSV' and upper-cased plot-extension
    subdirectories of output_dirpath, which are created when missing.
    """
    labels = get_labels(combined_output_dirpath, qconfig.report_prefix + '.tsv')
    contigs_num = len(labels)
    plots_dirname = qconfig.plot_extension.upper()

    for subdir_name in ['TXT', plots_dirname, 'TEX', 'TSV']:
        subdir_path = os.path.join(output_dirpath, subdir_name)
        if not os.path.isdir(subdir_path):
            os.mkdir(subdir_path)

    for metric in metrics:
        if isinstance(metric, tuple):
            continue

        file_stem = metric.replace(' ', '_')
        summary_txt_fpath = os.path.join(output_dirpath, 'TXT', file_stem + '.txt')
        summary_tex_fpath = os.path.join(output_dirpath, 'TEX', file_stem + '.tex')
        summary_tsv_fpath = os.path.join(output_dirpath, 'TSV', file_stem + '.tsv')
        summary_plot_fpath = os.path.join(output_dirpath, plots_dirname, file_stem)

        results, all_rows, cur_ref_names = get_results_for_metric(
            ref_names, metric, contigs_num, labels, output_dirpath_per_ref,
            qconfig.transposed_report_prefix + '.tsv')
        if not results or not results[0]:
            continue
        if not cur_ref_names:
            continue

        # all_rows[0] is the header row (reference names); the rest are assemblies.
        transposed_table = [{
            'metricName': 'Assemblies',
            'values': [asm_row['metricName'] for asm_row in all_rows[1:]],
        }]
        for ref_idx in range(len(all_rows[0]['values'])):
            per_assembly = [asm_row['values'][ref_idx] for asm_row in all_rows[1:]]
            transposed_table.append({
                'metricName': all_rows[0]['values'][ref_idx],  # name of reference
                'values': per_assembly,
            })

        print_file(transposed_table, summary_txt_fpath)
        reporting.save_tsv(summary_tsv_fpath, transposed_table)
        reporting.save_tex(summary_tex_fpath, transposed_table)

        reverse = reporting.get_quality(metric) == reporting.Fields.Quality.MORE_IS_BETTER

        if metric in (reporting.Fields.TOTALLEN,
                      reporting.Fields.TOTALLENS__FOR_1000_THRESHOLD,
                      reporting.Fields.TOTALLENS__FOR_10000_THRESHOLD,
                      reporting.Fields.TOTALLENS__FOR_50000_THRESHOLD):
            y_label = 'Total length'
        elif metric == reporting.Fields.TOTAL_ALIGNED_LEN:
            y_label = 'Aligned length'
        elif metric in (reporting.Fields.LARGCONTIG, reporting.Fields.N50,
                        reporting.Fields.NGA50, reporting.Fields.MIS_EXTENSIVE_BASES):
            y_label = 'Contig length'
        elif metric == reporting.Fields.LARGALIGN:
            y_label = 'Alignment length'
        else:
            y_label = None

        plotter.draw_meta_summary_plot(html_fpath, output_dirpath, labels, cur_ref_names, results,
                                       summary_plot_fpath, title=metric, reverse=reverse,
                                       yaxis_title=y_label, print_all_refs=True, logger=logger)

        if metric != reporting.Fields.MISASSEMBL:
            continue

        mis_results = []
        report_fname = os.path.join('contigs_reports',
                                    qconfig.transposed_report_prefix + '_misassemblies.tsv')
        # The trailing "not aligned" pseudo-reference is excluded from the misassemblies plots.
        if ref_names[-1] == qconfig.not_aligned_name:
            cur_ref_names = ref_names[:-1]
        for misassembly_metric in misassembly_metrics:
            # The metric name is passed without its reporting.Fields.TAB prefix.
            results, all_rows, cur_ref_names = get_results_for_metric(
                cur_ref_names, misassembly_metric[len(reporting.Fields.TAB):],
                contigs_num, labels, output_dirpath_per_ref, report_fname)
            if results:
                mis_results.append(results)
        if not mis_results:
            continue

        json_points = []
        for contig_num in range(contigs_num):
            plot_fpath = os.path.join(output_dirpath, plots_dirname,
                                      qutils.slugify(labels[contig_num]) + '_misassemblies')
            json_points.append(plotter.draw_meta_summary_misassemblies_plot(
                mis_results, cur_ref_names, contig_num, plot_fpath, title=labels[contig_num]))
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            if ref_names[-1] == qconfig.not_aligned_name:
                cur_ref_names = ref_names[:-1]
            if json_points:
                html_saver.save_meta_misassemblies(html_fpath, output_dirpath, json_points,
                                                   labels, cur_ref_names)

    logger.main_info('')
    logger.main_info(' Text versions of reports and plots for each metric (for all references and assemblies) are saved to ' + output_dirpath + '/')
def save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger):
    """Write per-assembly interspecies translocation tables and plot their totals.

    For each assembly the module-level intergenomic_misassemblies_by_asm mapping
    is filled with the per-reference 'misassemblies_by_ref' entries (empty lists
    when the assembly has none). Assemblies with a non-empty
    'istranslocations_by_refs' entry additionally get a reference-by-reference
    table in 'interspecies_translocations_by_refs_<assembly name>.info' inside
    output_dir. When qconfig.draw_plots is set, the per-reference totals
    (translocations plus the count of Misassembly.POSSIBLE_MISASSEMBLIES) are
    plotted with plotter.draw_meta_summary_plot.

    Returns None; returns early, before plotting, when results is empty.
    """
    istranslocations_by_asm = [res['istranslocations_by_refs'] if res else None for res in results]
    misassemblies_by_asm = [res['misassemblies_by_ref'] if res else None for res in results]

    # Unique reference labels in first-seen order, sorted unless the input order is requested.
    all_refs = []
    for ref_label in ref_labels_by_chromosomes.values():
        if ref_label not in all_refs:
            all_refs.append(ref_label)
    if not qconfig.use_input_ref_order:
        all_refs.sort()

    # First row is the header; one row per assembly is appended below.
    summary_rows = [{'metricName': 'References', 'values': all_refs}]
    if not istranslocations_by_asm:
        return

    for asm_idx, fpath in enumerate(contigs_fpaths):
        label = qutils.label_from_fpath(fpath)
        totals = []
        summary_rows.append({'metricName': label, 'values': totals})
        translocations = istranslocations_by_asm[asm_idx]
        asm_misassemblies = misassemblies_by_asm[asm_idx]

        per_ref = defaultdict(list)
        intergenomic_misassemblies_by_asm[label] = per_ref
        for ref in all_refs:
            per_ref[ref] = asm_misassemblies[ref] if asm_misassemblies else []

        if not translocations:
            continue

        assembly_name = qutils.name_from_fpath(fpath)
        # References are shown by their 1-based number in the table header.
        table = [{'metricName': 'References', 'values': list(range(1, len(all_refs) + 1))}]
        for ref in all_refs:
            counts = []
            for second_ref in all_refs:
                # NOTE(review): membership is tested for second_ref while the lookup is keyed
                # by ref first -- presumably every reference is a top-level key; confirm.
                if ref != second_ref and second_ref in translocations:
                    counts.append(translocations[ref][second_ref])
                else:
                    counts.append(None)
            if asm_misassemblies:
                possible_misassemblies = asm_misassemblies[ref].count(Misassembly.POSSIBLE_MISASSEMBLIES)
            else:
                possible_misassemblies = 0
            found = max(0, sum(count for count in counts if count))
            totals.append(found + possible_misassemblies)
            table.append({'metricName': ref, 'values': counts})

        info_fpath = os.path.join(output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
        with open(info_fpath, 'w') as info_file:
            info_file.write('Number of interspecies translocations by references: \n')
        print_file(table, info_fpath, append_to_existing_file=True)
        # Legend mapping the reference numbers used in the table to their labels.
        with open(info_fpath, 'a') as info_file:
            info_file.write('References:\n')
            for ref_num, ref in enumerate(all_refs, 1):
                info_file.write(str(ref_num) + ' - ' + ref + '\n')
        logger.info(' Information about interspecies translocations by references for %s is saved to %s' %
                    (assembly_name, info_fpath))

    misassemblies = []
    if qconfig.draw_plots:
        from quast_libs import plotter
        aligned_contigs_labels = []
        # Iterate over a copy so rows without values can be dropped from the list itself.
        for row in summary_rows[1:]:
            if not row['values']:
                summary_rows.remove(row)
                continue
            aligned_contigs_labels.append(row['metricName'])
        # Transpose: one list per reference holding the value of every remaining assembly.
        for ref_idx in range(len(all_refs)):
            misassemblies.append([row['values'][ref_idx] for row in summary_rows[1:] if row['values']])
        is_translocations_plot_fpath = os.path.join(output_dir, 'intergenomic_misassemblies')
        plotter.draw_meta_summary_plot('', output_dir, aligned_contigs_labels, all_refs, misassemblies,
                                       is_translocations_plot_fpath,
                                       title='Intergenomic misassemblies (found and supposed)',
                                       reverse=False, yaxis_title=None, print_all_refs=True, logger=logger)
def save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes,
                            output_dir, logger):
    """Save interspecies translocation statistics for a combined reference.

    Side effects visible in this function:
      * intergenomic_misassemblies_by_asm[<assembly label>] is replaced by a
        defaultdict holding, per reference, the assembly's
        'misassemblies_by_ref' entry (or an empty list);
      * for each assembly with a non-empty 'istranslocations_by_refs' entry a
        file 'interspecies_translocations_by_refs_<assembly name>.info' is
        written to output_dir;
      * when qconfig.draw_plots is set, plotter.draw_meta_summary_plot is
        called with the per-reference totals of every assembly that has any.

    Nothing is returned. If results is empty the function stops right after
    building the reference list, without writing files or plotting.
    """
    istranslocations_by_asm = []
    misassemblies_by_asm = []
    for result in results:
        istranslocations_by_asm.append(result['istranslocations_by_refs'] if result else None)
    for result in results:
        misassemblies_by_asm.append(result['misassemblies_by_ref'] if result else None)

    # De-duplicate labels while keeping first-seen order.
    all_refs = []
    for candidate in ref_labels_by_chromosomes.values():
        if candidate in all_refs:
            continue
        all_refs.append(candidate)
    if not qconfig.use_input_ref_order:
        all_refs.sort()

    rows_by_asm = []
    rows_by_asm.append({'metricName': 'References', 'values': all_refs})
    if not istranslocations_by_asm:
        return

    for i, fpath in enumerate(contigs_fpaths):
        label = qutils.label_from_fpath(fpath)
        asm_row = {'metricName': label, 'values': []}
        rows_by_asm.append(asm_row)
        istranslocations_by_ref = istranslocations_by_asm[i]
        intergenomic_misassemblies_by_asm[label] = defaultdict(list)
        for ref in all_refs:
            if misassemblies_by_asm[i]:
                intergenomic_misassemblies_by_asm[label][ref] = misassemblies_by_asm[i][ref]
            else:
                intergenomic_misassemblies_by_asm[label][ref] = []

        if istranslocations_by_ref:
            assembly_name = qutils.name_from_fpath(fpath)
            numbered_header = {
                'metricName': 'References',
                'values': [position + 1 for position, _ in enumerate(all_refs)],
            }
            matrix_rows = [numbered_header]
            for ref in all_refs:
                # None marks the diagonal and references absent from the mapping.
                cells = [
                    None
                    if (ref == second_ref or second_ref not in istranslocations_by_ref)
                    else istranslocations_by_ref[ref][second_ref]
                    for second_ref in all_refs
                ]
                misassemblies_by_ref = misassemblies_by_asm[i]
                possible_misassemblies = (
                    misassemblies_by_ref[ref].count(Misassembly.POSSIBLE_MISASSEMBLIES)
                    if misassemblies_by_ref else 0)
                istranslocations = max(0, sum([cell for cell in cells if cell]))
                asm_row['values'].append(istranslocations + possible_misassemblies)
                matrix_rows.append({'metricName': ref, 'values': cells})

            out_fpath = os.path.join(
                output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
            with open(out_fpath, 'w') as out_file:
                out_file.write('Number of interspecies translocations by references: \n')
            print_file(matrix_rows, out_fpath, append_to_existing_file=True)
            with open(out_fpath, 'a') as out_file:
                out_file.write('References:\n')
                for position, ref in enumerate(all_refs):
                    out_file.write(str(position + 1) + ' - ' + ref + '\n')
            logger.info(
                ' Information about interspecies translocations by references for %s is saved to %s'
                % (assembly_name, out_fpath))

    misassemblies = []
    if qconfig.draw_plots:
        from quast_libs import plotter
        aligned_contigs_labels = []
        # The slice is a copy, so removing from rows_by_asm while looping is safe.
        for row in rows_by_asm[1:]:
            if row['values']:
                aligned_contigs_labels.append(row['metricName'])
            else:
                rows_by_asm.remove(row)
        for ref_index, _ in enumerate(all_refs):
            cur_results = [row['values'][ref_index]
                           for row in rows_by_asm[1:] if row['values']]
            misassemblies.append(cur_results)
        is_translocations_plot_fpath = os.path.join(output_dir, 'intergenomic_misassemblies')
        plotter.draw_meta_summary_plot(
            '', output_dir, aligned_contigs_labels, all_refs, misassemblies,
            is_translocations_plot_fpath,
            title='Intergenomic misassemblies (found and supposed)',
            reverse=False, yaxis_title=None, print_all_refs=True, logger=logger)
def do(html_fpath, output_dirpath, combined_output_dirpath, output_dirpath_per_ref, metrics, misassembl_metrics, ref_names):
    """Write summary tables and plots for every metric across all references.

    Each non-tuple entry of metrics is looked up with get_results_for_metric();
    the rows are transposed (one row per reference, one column per assembly)
    and saved as TXT, TSV and TeX, then plotted. For the misassemblies metric a
    plot per assembly is drawn from misassembl_metrics, and its data is passed
    to html_saver.save_meta_misassemblies when qconfig.html_report is set.

    The 'TXT', 'TEX', 'TSV' and upper-cased plot-extension subdirectories of
    output_dirpath are created when they do not exist.
    """
    labels = get_labels(combined_output_dirpath, qconfig.report_prefix + '.tsv')
    contigs_num = len(labels)
    plots_dirname = qconfig.plot_extension.upper()

    for ext in ['TXT', plots_dirname, 'TEX', 'TSV']:
        ext_dirpath = os.path.join(output_dirpath, ext)
        if not os.path.isdir(ext_dirpath):
            os.mkdir(ext_dirpath)

    for metric in metrics:
        if isinstance(metric, tuple):
            continue

        base_name = metric.replace(' ', '_')
        summary_txt_fpath = os.path.join(output_dirpath, 'TXT', base_name + '.txt')
        summary_tex_fpath = os.path.join(output_dirpath, 'TEX', base_name + '.tex')
        summary_tsv_fpath = os.path.join(output_dirpath, 'TSV', base_name + '.tsv')
        summary_png_fpath = os.path.join(output_dirpath, plots_dirname,
                                         base_name + '.' + qconfig.plot_extension)

        results, all_rows, cur_ref_names = get_results_for_metric(
            ref_names, metric, contigs_num, labels, output_dirpath_per_ref,
            qconfig.transposed_report_prefix + '.tsv')
        if not results or not results[0]:
            continue
        if not cur_ref_names:
            continue

        # Row 0 of all_rows carries the reference names; the other rows are assemblies.
        assemblies_row = {
            'metricName': 'Assemblies',
            'values': [all_rows[row_idx]['metricName'] for row_idx in range(1, len(all_rows))],
        }
        transposed_table = [assemblies_row]
        for col_idx in range(len(all_rows[0]['values'])):
            column = [all_rows[row_idx]['values'][col_idx] for row_idx in range(1, len(all_rows))]
            ref_name = all_rows[0]['values'][col_idx]  # name of reference
            transposed_table.append({'metricName': ref_name, 'values': column})

        print_file(transposed_table, summary_txt_fpath)
        reporting.save_tsv(summary_tsv_fpath, transposed_table)
        reporting.save_tex(summary_tex_fpath, transposed_table)

        reverse = reporting.get_quality(metric) == reporting.Fields.Quality.MORE_IS_BETTER

        if metric == reporting.Fields.TOTALLEN:
            y_label = 'Total length '
        elif metric == reporting.Fields.TOTAL_ALIGNED_LEN:
            y_label = 'Aligned length '
        elif metric in (reporting.Fields.LARGCONTIG, reporting.Fields.N50,
                        reporting.Fields.NGA50, reporting.Fields.MIS_EXTENSIVE_BASES):
            y_label = 'Contig length '
        elif metric == reporting.Fields.LARGALIGN:
            y_label = 'Alignment length '
        else:
            y_label = None

        plotter.draw_meta_summary_plot(html_fpath, output_dirpath, labels, cur_ref_names, all_rows,
                                       results, summary_png_fpath, title=metric, reverse=reverse,
                                       yaxis_title=y_label)

        if metric != reporting.Fields.MISASSEMBL:
            continue

        report_fname = os.path.join('contigs_reports',
                                    qconfig.transposed_report_prefix + '_misassemblies.tsv')
        # The trailing "not aligned" pseudo-reference is left out of the misassemblies plots.
        if ref_names[-1] == qconfig.not_aligned_name:
            cur_ref_names = ref_names[:-1]
        mis_results = []
        for misassembl_metric in misassembl_metrics:
            # The metric name is passed without its reporting.Fields.TAB prefix.
            metric_name = misassembl_metric[len(reporting.Fields.TAB):]
            results, all_rows, cur_ref_names = get_results_for_metric(
                cur_ref_names, metric_name, contigs_num, labels, output_dirpath_per_ref, report_fname)
            if results:
                mis_results.append(results)
        if not mis_results:
            continue

        json_points = []
        for contig_num in range(contigs_num):
            contig_label = labels[contig_num]
            plot_fpath = os.path.join(output_dirpath, plots_dirname,
                                      qutils.slugify(contig_label) + '_misassemblies')
            json_points.append(plotter.draw_meta_summary_misassembl_plot(
                mis_results, cur_ref_names, contig_num, plot_fpath, title=contig_label))
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            if ref_names[-1] == qconfig.not_aligned_name:
                cur_ref_names = ref_names[:-1]
            if json_points:
                html_saver.save_meta_misassemblies(html_fpath, output_dirpath, json_points,
                                                   labels, cur_ref_names)

    logger.main_info('')
    logger.main_info(' Text versions of reports and plots for each metric (for all references and assemblies) are saved to ' + output_dirpath + '/')
def do(reference, contigs_fpaths, cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    """Align every assembly to the reference and save the resulting reports.

    Steps, in order:
      1. create output_dir if needed and make sure the aligner is available
         (compile_aligner);
      2. run align_and_analyze() for each assembly, either in parallel over
         assemblies (joblib) or one assembly at a time with parallel_by_chr=True;
      3. for a combined reference (qconfig.is_combined_ref), write an
         'interspecies_translocations_by_refs_<assembly name>.info' file for
         every assembly that has 'istranslocations_by_refs' data;
      4. store per-assembly results in the reports, save the misassemblies and
         unaligned reports and, when qconfig.draw_plots is set, draw the
         misassemblies plot;
      5. log a summary and add the number of NucmerStatus.ERROR assemblies to
         logger._num_nf_errors.

    Returns a tuple (nucmer_statuses, aligned_lengths_per_fpath), both dicts
    keyed by contigs path. If compile_aligner() fails, every assembly is mapped
    to NucmerStatus.FAILED and the second element is None.
    """
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    num_nf_errors = logger._num_nf_errors

    if not compile_aligner(logger):
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if qconfig.memory_efficient:
        threads = 1
    else:
        threads = max(int(qconfig.max_threads / n_jobs), 1)

    from joblib import Parallel, delayed
    # Assemblies are processed in parallel unless the reference is split and either there
    # are fewer assemblies than reference parts or memory-efficient mode is on; in that
    # case each assembly is processed in turn with parallel_by_chr=True.
    parallel_by_assembly = (not qconfig.splitted_ref or
                            (len(contigs_fpaths) >= len(qconfig.splitted_ref) and
                             not qconfig.memory_efficient))
    if parallel_by_assembly:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(
            delayed(align_and_analyze)(cyclic, i, contigs_fpath, output_dir, reference,
                                       old_contigs_fpath, bed_fpath, threads=threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        statuses_results_lengths_tuples = []
        for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
            statuses_results_lengths_tuples.append(
                align_and_analyze(cyclic, i, contigs_fpath, output_dir, reference,
                                  old_contigs_fpath, bed_fpath,
                                  parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping
    statuses = [x[0] for x in statuses_results_lengths_tuples]
    results = [x[1] for x in statuses_results_lengths_tuples]
    aligned_lengths = [x[2] for x in statuses_results_lengths_tuples]
    reports = []

    if qconfig.is_combined_ref:
        ref_misassemblies = [result['istranslocations_by_refs'] if result else [] for result in results]
        if ref_misassemblies:
            # The reference list does not depend on the assembly, so it is built once.
            all_refs = sorted(set(ref_labels_by_chromosomes.values()))
            for i, fpath in enumerate(contigs_fpaths):
                if not ref_misassemblies[i]:
                    continue
                assembly_name = qutils.name_from_fpath(fpath)
                # References are shown by their 1-based number in the table header.
                all_rows = [{'metricName': 'References',
                             'values': list(range(1, len(all_refs) + 1))}]
                for k in all_refs:
                    row = {'metricName': k, 'values': []}
                    for ref in all_refs:
                        if ref == k or ref not in ref_misassemblies[i]:
                            row['values'].append(None)
                        else:
                            row['values'].append(ref_misassemblies[i][ref][k])
                    all_rows.append(row)

                misassembly_by_ref_fpath = join(output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
                # Files are opened through context managers so every handle is closed
                # (and flushed) before print_file() appends to the same path. The extra
                # '\n' reproduces the newline the former print statements added.
                with open(misassembly_by_ref_fpath, 'w') as misassembly_by_ref_file:
                    misassembly_by_ref_file.write('Number of interspecies translocations by references: \n\n')
                print_file(all_rows, misassembly_by_ref_fpath, append_to_existing_file=True)

                with open(misassembly_by_ref_fpath, 'a') as misassembly_by_ref_file:
                    misassembly_by_ref_file.write('\nReferences: \n')
                    for ref_num, ref in enumerate(all_refs, 1):
                        misassembly_by_ref_file.write(str(ref_num) + ' - ' + ref + '\n')
                logger.info(' Information about interspecies translocations by references for %s is saved to %s' %
                            (assembly_name, misassembly_by_ref_fpath))

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))

    # Materialise the values once: dict.values() has no count() on Python 3.
    status_values = list(nucmer_statuses.values())

    if NucmerStatus.OK in status_values:
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        if qconfig.draw_plots:
            import plotter
            plotter.draw_misassembl_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')

    oks = status_values.count(NucmerStatus.OK)
    not_aligned = status_values.count(NucmerStatus.NOT_ALIGNED)
    failed = status_values.count(NucmerStatus.FAILED)
    errors = status_values.count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    total = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == total:
        logger.main_info('Done.')
    if oks < total and problems < total:
        logger.main_info('Done for ' + str(total - problems) + ' out of ' + str(total) +
                         '. For the rest, only basic stats are going to be evaluated.')
    if problems == total:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return nucmer_statuses, aligned_lengths_per_fpath