def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    """Compute basic per-assembly statistics and write the related reports and plots.

    For every assembly this reads the contigs, collects contig lengths, the
    number of 'N' characters and (when contig names carry a ``_cov_<number>``
    suffix) a coverage histogram, then stores N50/L50, N75/L75, NG50/LG50,
    NG75/LG75, contig count, largest contig, total length, GC and uncalled-base
    fields in the assembly's report.  It also sets ``qconfig.min_difference``
    and hands the collected data to ``html_saver`` and ``plotter``.

    Args:
        ref_fpath: path to the reference FASTA file, or a falsy value when no
            reference is given (``qconfig.estimated_reference_size`` is then
            used as the reference length, if set).
        contigs_fpaths: sequence of paths to the assembly FASTA files.
        output_dirpath: directory for the plot files; created if missing.
        results_dir: directory passed through to ``html_saver`` and
            ``plotter.Nx_plot``.

    Returns:
        None.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    if ref_fpath:
        reference_lengths = sorted(
            fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(), reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(ref_fpath)

        logger.info(' Reference genome:')
        # The conditional expression must be parenthesised: without the
        # parentheses it swallows the whole concatenation and the message
        # degenerates to just 'undefined' when the GC value is None.
        logger.info(' ' + os.path.basename(ref_fpath) +
                    ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) +
                    ', GC % = ' + ('%.2f' % reference_GC if reference_GC is not None else 'undefined'))
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(' Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                           ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).')
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info(' Estimated reference length = ' + str(reference_length))

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for index, contigs_fpath in enumerate(contigs_fpaths):
        # coverage_hist[c] accumulates the total length of contigs whose name
        # reports an (integer-truncated) coverage of c.
        coverage_hist = coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq):
                # Register each assembly at most once.
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            cov_matches = cov_pattern.findall(name)
            if cov_matches:
                cov = int(float(cov_matches[0]))
                if len(coverage_hist) <= cov:
                    # Grow the histogram so that index `cov` exists.
                    coverage_hist += [0] * (cov - len(coverage_hist) + 1)
                coverage_hist[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
    num_contigs = max([len(lengths) for lengths in lists_of_lengths])

    # With very many contigs, sum consecutive groups of `multiplicator`
    # lengths so that roughly qconfig.max_points values are saved for the
    # HTML report.
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [
            [sum(lengths[(i - 1) * multiplicator:i * multiplicator])
             for i in range(1, max_points)
             if (i * multiplicator) < len(lengths)]
            for lengths in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [
                sum(reference_lengths[(i - 1) * multiplicator:i * multiplicator])
                if (i * multiplicator) < len(reference_lengths)
                else sum(reference_lengths[(i - 1) * multiplicator:])
                for i in range(1, max_points)
            ] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        # The last bucket of every assembly collects whatever is left over.
        for corr_lengths, lengths in zip(corr_lists_of_lengths, lists_of_lengths):
            last_index = len(corr_lengths)
            corr_lengths.append(sum(lengths[last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]

    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng50, lg50 = None, None
        ng75, lg75 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)

        # Guard the division: an assembly whose contigs are all empty has a
        # total length of 0.
        if total_length != 0:
            ns_per_100kbp = float(number_of_Ns) * 100000.0 / float(total_length)
        else:
            ns_per_100kbp = None

        # Both conditional expressions are parenthesised so that only the
        # formatted value (not the whole message) falls back to 'undefined'.
        logger.info(' ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' %.2f' % ns_per_100kbp if ns_per_100kbp is not None else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        # NOTE(review): the source this was restored from had lost its
        # indentation; the fields from TOTALLEN to UNCALLED_PERCENT are assumed
        # to belong to this `if` -- confirm against the project history.
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT,
                             ('%.2f' % ns_per_100kbp if ns_per_100kbp is not None else None))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    # Same list object as list_of_GC_distributions: the reference distribution
    # (if any) is appended in place and its position is remembered.
    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref,
                                list_of_GC_contigs_distributions, reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                    join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                        join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length] * len(contigs_fpaths))

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                join(output_dirpath, 'cumulative_plot'), 'Cumulative length')
        if not qconfig.no_gc:
            ########################################################################
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                    join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(
                    contigs_fpath, GC_distribution,
                    join(output_dirpath, qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot'))

        # NOTE(review): assumed to be part of the draw_plots branch (source
        # indentation was lost) -- confirm.
        if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)

    logger.main_info('Done.')
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    """Compute basic per-assembly statistics and write the related reports and plots.

    For every assembly this reads the contigs, collects contig lengths, the
    number of 'N' characters and (when contig names carry a ``_cov_<number>``
    suffix) a coverage histogram, then stores N50/L50, N75/L75, NG50/LG50,
    NG75/LG75, contig count, largest contig, total length, GC and uncalled-base
    fields in the assembly's report.  It also sets ``qconfig.min_difference``
    and hands the collected data to ``html_saver`` and ``plotter``.  When a
    reference is given, GC files for Icarus and Circos are written through
    ``save_icarus_GC`` / ``save_circos_GC`` depending on the qconfig flags.

    Args:
        ref_fpath: path to the reference FASTA file, or a falsy value when no
            reference is given (``qconfig.estimated_reference_size`` is then
            used as the reference length, if set).
        contigs_fpaths: sequence of paths to the assembly FASTA files.
        output_dirpath: directory for the plot and GC files; created if missing.
        results_dir: directory passed through to ``html_saver`` and
            ``plotter.Nx_plot``.

    Returns:
        A tuple ``(icarus_gc_fpath, circos_gc_fpath)``; each element is the
        path handed to the corresponding ``save_*_GC`` call, or None when that
        file was not requested.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    icarus_gc_fpath = None
    circos_gc_fpath = None
    if ref_fpath:
        reference_lengths = sorted(
            fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(), reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(ref_fpath)
        if qconfig.create_icarus_html or qconfig.draw_plots:
            icarus_gc_fpath = join(output_dirpath, 'gc.icarus.txt')
            save_icarus_GC(ref_fpath, icarus_gc_fpath)
        if qconfig.draw_plots:
            circos_gc_fpath = join(output_dirpath, 'gc.circos.txt')
            save_circos_GC(ref_fpath, reference_length, circos_gc_fpath)

        logger.info(' Reference genome:')
        # The conditional expression must be parenthesised: without the
        # parentheses it swallows the whole concatenation and the message
        # degenerates to just 'undefined' when the GC value is None.
        logger.info(' ' + os.path.basename(ref_fpath) +
                    ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) +
                    ', GC % = ' + ('%.2f' % reference_GC if reference_GC is not None else 'undefined'))
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(' Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                           ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).')
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info(' Estimated reference length = ' + str(reference_length))

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for index, contigs_fpath in enumerate(contigs_fpaths):
        # coverage_hist[c] accumulates the total length of contigs whose name
        # reports an (integer-truncated) coverage of c.
        coverage_hist = coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq):
                # Register each assembly at most once.
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            cov_matches = cov_pattern.findall(name)
            if cov_matches:
                cov = int(float(cov_matches[0]))
                if len(coverage_hist) <= cov:
                    # Grow the histogram so that index `cov` exists.
                    coverage_hist += [0] * (cov - len(coverage_hist) + 1)
                coverage_hist[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
    num_contigs = max([len(lengths) for lengths in lists_of_lengths])

    # With very many contigs, sum consecutive groups of `multiplicator`
    # lengths so that roughly qconfig.max_points values are saved for the
    # HTML report.
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [
            [sum(lengths[(i - 1) * multiplicator:i * multiplicator])
             for i in range(1, max_points)
             if (i * multiplicator) < len(lengths)]
            for lengths in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [
                sum(reference_lengths[(i - 1) * multiplicator:i * multiplicator])
                if (i * multiplicator) < len(reference_lengths)
                else sum(reference_lengths[(i - 1) * multiplicator:])
                for i in range(1, max_points)
            ] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        # The last bucket of every assembly collects whatever is left over.
        for corr_lengths, lengths in zip(corr_lists_of_lengths, lists_of_lengths):
            last_index = len(corr_lengths)
            corr_lengths.append(sum(lengths[last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]

    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng50, lg50 = None, None
        ng75, lg75 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)

        # Guard the division: an assembly whose contigs are all empty has a
        # total length of 0.
        if total_length != 0:
            ns_per_100kbp = float(number_of_Ns) * 100000.0 / float(total_length)
        else:
            ns_per_100kbp = None

        # Both conditional expressions are parenthesised so that only the
        # formatted value (not the whole message) falls back to 'undefined'.
        logger.info(' ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' %.2f' % ns_per_100kbp if ns_per_100kbp is not None else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        # NOTE(review): the source this was restored from had lost its
        # indentation; the fields from TOTALLEN to UNCALLED_PERCENT are assumed
        # to belong to this `if` -- confirm against the project history.
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT,
                             ('%.2f' % ns_per_100kbp if ns_per_100kbp is not None else None))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    # Same list object as list_of_GC_distributions: the reference distribution
    # (if any) is appended in place and its position is remembered.
    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref,
                                list_of_GC_contigs_distributions, reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                    join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                        join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length] * len(contigs_fpaths))

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                join(output_dirpath, 'cumulative_plot'), 'Cumulative length')
        if not qconfig.no_gc:
            ########################################################################
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                    join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(
                    contigs_fpath, GC_distribution,
                    join(output_dirpath, qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot'))

        # NOTE(review): assumed to be part of the draw_plots branch (source
        # indentation was lost) -- confirm.
        if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)

    logger.main_info('Done.')
    return icarus_gc_fpath, circos_gc_fpath