def save_GC_info(results_dirpath, contigs_fpaths, list_of_GC_distributions, reference_index):
    """Save GC info as JSON and register the resulting file under 'gcInfos'.

    All arguments are forwarded unchanged to ``json_saver.save_GC_info``.
    When that call returns a falsy value, nothing is registered.
    """
    gc_json_fpath = json_saver.save_GC_info(
        results_dirpath, contigs_fpaths, list_of_GC_distributions, reference_index)
    if not gc_json_fpath:
        return
    append(results_dirpath, gc_json_fpath, 'gcInfos')
def save_GC_info(results_dirpath, contigs_fpaths, list_of_GC_distributions,
                 list_of_GC_contigs_distributions, reference_index):
    """Save GC info as JSON and register the resulting file under 'gcInfos'.

    All arguments (including the per-contig GC distributions) are forwarded
    unchanged to ``json_saver.save_GC_info``. When that call returns a falsy
    value, nothing is registered.
    """
    gc_json_fpath = json_saver.save_GC_info(
        results_dirpath, contigs_fpaths, list_of_GC_distributions,
        list_of_GC_contigs_distributions, reference_index)
    if not gc_json_fpath:
        return
    append(results_dirpath, gc_json_fpath, 'gcInfos')
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Compute basic assembly statistics, fill the reports and draw the plots.

    Steps, in order:
      1. Determine the reference length (from ``ref_fpath`` or from
         ``qconfig.estimated_reference_size``) and save the reference lengths.
      2. Read every assembly, collecting contig lengths, the number of 'N'
         characters and a per-coverage length histogram parsed from
         ``_cov_<number>`` in contig names.
      3. Down-sample the length lists when there are at least
         ``2 * qconfig.max_points`` contigs, and save them (JSON / HTML).
      4. Compute N50/L50/N75/L75 (and the NG/LG variants when a reference
         length is known) and add them to each assembly's report.
      5. Save GC info and draw Nx/NGx, cumulative, GC content and coverage
         plots.

    :param ref_fpath: path to the reference FASTA file, or a falsy value.
    :param contigs_fpaths: paths to the assembly FASTA files.
    :param output_dirpath: directory for plot files; created if missing.
    :param json_output_dir: directory for JSON output; JSON saving is skipped
        when falsy.
    :param results_dir: directory passed to the HTML saver and to
        ``plotter.Nx_plot``.

    Side effects visible here: sets ``qconfig.min_difference`` and may append
    to ``qconfig.potential_scaffolds_assemblies``.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    if ref_fpath:
        reference_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values()
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info(' Reference genome:')
        logger.info(' ' + os.path.basename(ref_fpath) +
                    ', Reference length = ' + str(reference_length) +
                    ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info(' Estimated reference length = ' + str(reference_length))

    if reference_lengths:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_lengths(json_output_dir, reference_lengths)
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_([\d\.]+)')
    for index, contigs_fpath in enumerate(contigs_fpaths):
        # coverage[c] accumulates the total length of contigs whose name
        # reports integer coverage c.
        coverage = coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            # Run the regex once per contig; the first match is what
            # findall(name)[0] would have returned.
            cov_match = cov_pattern.search(name)
            if cov_match:
                cov = int(float(cov_match.group(1)))
                if len(coverage) <= cov:
                    coverage.extend([0] * (cov - len(coverage) + 1))
                coverage[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])

    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        # Too many contigs to plot individually: merge every `multiplicator`
        # consecutive (sorted) contigs into one point.
        multiplicator = int(num_contigs / qconfig.max_points)
        # Floor division keeps max_points an int, as range() requires.
        max_points = num_contigs // multiplicator
        lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
        corr_lists_of_lengths = [
            [sum(lengths[((i - 1) * multiplicator):(i * multiplicator)])
             for i in range(1, max_points) if (i * multiplicator) < len(lengths)]
            for lengths in lists_of_lengths]
        # The last point of each assembly collects everything not binned yet.
        for binned, lengths in zip(corr_lists_of_lengths, lists_of_lengths):
            binned.append(sum(lengths[len(binned) * multiplicator:]))
    else:
        corr_lists_of_lengths = lists_of_lengths

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, corr_lists_of_lengths)
        json_saver.save_tick_x(json_output_dir, multiplicator)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    largest_contig = 0
    import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)

        # Undefined (None) for a zero-length assembly instead of dividing by 0.
        if total_length != 0:
            ns_per_100_kbp = '%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))
        else:
            ns_per_100_kbp = None

        # The conditional is parenthesised so that only the last item, not
        # the whole message, becomes 'undefined' for an empty assembly.
        logger.info(' ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' ' + ns_per_100_kbp if ns_per_100_kbp is not None else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        # NOTE(review): the extent of this branch was reconstructed from
        # whitespace-flattened source; confirm that the length-dependent
        # fields below are meant to be skipped for assemblies without contigs.
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC,
                                 ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT, ns_per_100_kbp)
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    # Same list object as list_of_GC_distributions; the reference
    # distribution (if any) is appended as its last element.
    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths,
                                list_of_GC_distributions_with_ref, reference_index)

    if qconfig.html_report and not qconfig.is_combined_ref:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths,
                                list_of_GC_distributions_with_ref, reference_index)

    import plotter
    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                    output_dirpath + '/Nx_plot', 'Nx', [], json_output_dir=json_output_dir)
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                        output_dirpath + '/NGx_plot', 'NGx',
                        [reference_length] * len(contigs_fpaths), json_output_dir=json_output_dir)

    # NOTE(review): the extent of this branch was reconstructed from
    # whitespace-flattened source; confirm the coverage histograms belong
    # under draw_plots.
    if qconfig.draw_plots:
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                output_dirpath + '/cumulative_plot', 'Cumulative length')
        if not qconfig.is_combined_ref:
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                    output_dirpath + '/GC_content_plot')

        if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)

    logger.main_info('Done.')
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Compute basic assembly statistics, fill the reports and draw the plots.

    Steps, in order:
      1. Determine the reference length and fragment count (from
         ``ref_fpath``) or the estimated length (from
         ``qconfig.estimated_reference_size``); warn when the reference has
         more than 30 fragments and ``qconfig.check_for_fragmented_ref`` is
         not set.
      2. Read every assembly, collecting contig lengths, the number of 'N'
         characters and a per-coverage length histogram parsed from
         ``_cov_<number>`` in contig names.
      3. Down-sample the contig (and reference) length lists when there are
         at least ``2 * qconfig.max_points`` contigs, and save them
         (JSON / HTML).
      4. Compute N50/L50/N75/L75 (and the NG/LG variants when a reference
         length is known) and add them to each assembly's report.
      5. Save GC info (unless ``qconfig.no_gc``) and draw Nx/NGx, cumulative,
         GC content and coverage plots.

    :param ref_fpath: path to the reference FASTA file, or a falsy value.
    :param contigs_fpaths: paths to the assembly FASTA files.
    :param output_dirpath: directory for plot files; created if missing.
    :param json_output_dir: directory for JSON output; JSON saving is skipped
        when falsy.
    :param results_dir: directory passed to the HTML saver and to
        ``plotter.Nx_plot``.

    Side effects visible here: sets ``qconfig.min_difference`` and may append
    to ``qconfig.potential_scaffolds_assemblies``.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    if ref_fpath:
        reference_lengths = sorted(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(),
                                   reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info(' Reference genome:')
        logger.info(' ' + os.path.basename(ref_fpath) + ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) +
                    ', GC % = ' + '%.2f' % reference_GC)
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(' Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                           ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).')
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info(' Estimated reference length = ' + str(reference_length))

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_([\d\.]+)')
    for index, contigs_fpath in enumerate(contigs_fpaths):
        # coverage[c] accumulates the total length of contigs whose name
        # reports integer coverage c.
        coverage = coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            # Run the regex once per contig; the first match is what
            # findall(name)[0] would have returned.
            cov_match = cov_pattern.search(name)
            if cov_match:
                cov = int(float(cov_match.group(1)))
                if len(coverage) <= cov:
                    coverage.extend([0] * (cov - len(coverage) + 1))
                coverage[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])

    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        # Too many contigs to plot individually: merge every `multiplicator`
        # consecutive (sorted) contigs into one point.
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
        corr_lists_of_lengths = [
            [sum(lengths[((i - 1) * multiplicator):(i * multiplicator)])
             for i in range(1, max_points) if (i * multiplicator) < len(lengths)]
            for lengths in lists_of_lengths]
        if len(reference_lengths) > 1:
            # Bin the reference fragments with the same step as the contigs.
            reference_lengths = [
                sum(reference_lengths[((i - 1) * multiplicator):(i * multiplicator)])
                if (i * multiplicator) < len(reference_lengths)
                else sum(reference_lengths[((i - 1) * multiplicator):])
                for i in range(1, max_points)
            ] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        # The last point of each assembly collects everything not binned yet.
        for binned, lengths in zip(corr_lists_of_lengths, lists_of_lengths):
            binned.append(sum(lengths[len(binned) * multiplicator:]))
    else:
        corr_lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, corr_lists_of_lengths)
        json_saver.save_tick_x(json_output_dir, multiplicator)

    if reference_lengths:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_lengths(json_output_dir, reference_lengths)
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    largest_contig = 0
    from . import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)

        # Undefined (None) for a zero-length assembly instead of dividing by 0.
        if total_length != 0:
            ns_per_100_kbp = '%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))
        else:
            ns_per_100_kbp = None

        # The conditional is parenthesised so that only the last item, not
        # the whole message, becomes 'undefined' for an empty assembly.
        logger.info(' ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' ' + ns_per_100_kbp if ns_per_100_kbp is not None else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        # NOTE(review): the extent of this branch was reconstructed from
        # whitespace-flattened source; confirm that the length-dependent
        # fields below are meant to be skipped for assemblies without contigs.
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC,
                                 ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT, ns_per_100_kbp)
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    # Same list object as list_of_GC_distributions; the reference
    # distribution (if any) is appended as its last element.
    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if json_output_dir and not qconfig.no_gc:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths,
                                list_of_GC_distributions_with_ref, reference_index)

    if qconfig.html_report and not qconfig.is_combined_ref and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths,
                                list_of_GC_distributions_with_ref, reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                    output_dirpath + '/Nx_plot', 'Nx', [], json_output_dir=json_output_dir)
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                        output_dirpath + '/NGx_plot', 'NGx',
                        [reference_length] * len(contigs_fpaths), json_output_dir=json_output_dir)

    # NOTE(review): the extent of this branch was reconstructed from
    # whitespace-flattened source; confirm the coverage histograms belong
    # under draw_plots.
    if qconfig.draw_plots:
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                output_dirpath + '/cumulative_plot', 'Cumulative length')
        if not qconfig.is_combined_ref and not qconfig.no_gc:
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                    output_dirpath + '/GC_content_plot')

        if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)

    logger.main_info('Done.')