def draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath):
    total_len = dict()
    contigs_dict = dict()

    contigs_with_coverage = [contigs_fpath for contigs_fpath in contigs_fpaths if coverage_dict[contigs_fpath]]
    for contigs_fpath in contigs_fpaths:
        total_len[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.TOTALLEN)
        contigs_dict[contigs_fpath] = reporting.get(contigs_fpath).get_field(reporting.Fields.CONTIGS)
    cov_values = [coverage_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]
    num_contigs = [contigs_dict[contigs_fpath] for contigs_fpath in contigs_with_coverage]

    common_coverage_values, bin_size, low_threshold, high_threshold, max_cov = binning_coverage(cov_values, num_contigs)
    histogram_title = 'Coverage histogram (bin size: ' + str(bin_size) + 'x)'
    plotter.coverage_histogram(contigs_with_coverage, common_coverage_values, output_dirpath + '/coverage_histogram',
                               histogram_title, bin_size=bin_size, max_cov=max_cov,
                               low_threshold=low_threshold, high_threshold=high_threshold)
    for contigs_fpath in contigs_with_coverage:
        coverage_values, bin_size, low_threshold, high_threshold, max_cov = \
            binning_coverage([coverage_dict[contigs_fpath]], [contigs_dict[contigs_fpath]])
        label = qutils.label_from_fpath(contigs_fpath)
        corr_label = qutils.label_from_fpath_for_fname(contigs_fpath)
        histogram_title = label + ' coverage histogram (bin size: ' + str(bin_size) + 'x)'
        histogram_fpath = os.path.join(output_dirpath, corr_label + '_coverage_histogram')
        plotter.coverage_histogram([contigs_fpath], coverage_values, histogram_fpath, histogram_title,
                                   draw_bars=True, bin_size=bin_size, max_cov=max_cov,
                                   low_threshold=low_threshold, high_threshold=high_threshold)
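# A rough, standalone illustration of the fixed-bin histogram idea that draw_coverage_histograms()
# delegates to binning_coverage()/plotter.coverage_histogram(): coverage values are grouped into
# bins of `bin_size`x and counted per bin. This is a simplified sketch, not QUAST's binning logic.
from collections import Counter

def simple_coverage_histogram(coverage_values, bin_size=10):
    # map each coverage value to the lower edge of its bin, e.g. 37x -> 30x for bin_size=10
    return Counter((int(cov) // bin_size) * bin_size for cov in coverage_values)

# Example: simple_coverage_histogram([5, 12, 14, 37, 41], bin_size=10)
# -> Counter({10: 2, 0: 1, 30: 1, 40: 1})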
def get_assemblies_data(contigs_fpaths, icarus_dirpath, stdout_pattern, nx_marks):
    assemblies_n50 = defaultdict(dict)
    assemblies_data = ''
    assemblies_data += 'var assemblies_links = {};\n'
    assemblies_data += 'var assemblies_len = {};\n'
    assemblies_data += 'var assemblies_contigs = {};\n'
    assemblies_data += 'var assemblies_misassemblies = {};\n'
    assemblies_data += 'var assemblies_n50 = {};\n'
    assemblies_contig_size_data = ''
    for contigs_fpath in contigs_fpaths:
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        report = reporting.get(contigs_fpath)
        l = report.get_field(reporting.Fields.TOTALLEN)
        contigs = report.get_field(reporting.Fields.CONTIGS)
        n50 = report.get_field(reporting.Fields.N50)
        if stdout_pattern:
            contig_stdout_fpath = stdout_pattern % qutils.label_from_fpath_for_fname(contigs_fpath) + '.stdout'
            contig_stdout_fpath = qutils.relpath(contig_stdout_fpath, icarus_dirpath)
            assemblies_data += 'assemblies_links["' + assembly_label + '"] = "' + contig_stdout_fpath + '";\n'
        assemblies_contig_size_data += 'assemblies_len["' + assembly_label + '"] = ' + str(l) + ';\n'
        assemblies_contig_size_data += 'assemblies_contigs["' + assembly_label + '"] = ' + str(contigs) + ';\n'
        assemblies_contig_size_data += 'assemblies_n50["' + assembly_label + '"] = "' + str(n50) + '";\n'
        for nx in nx_marks:
            assemblies_n50[assembly_label][nx] = report.get_field(nx)
    return assemblies_data, assemblies_contig_size_data, assemblies_n50
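# For reference, the JavaScript emitted by get_assemblies_data() follows the string templates above;
# the assembly label and values below are purely illustrative:
#   var assemblies_links = {};
#   assemblies_links["assembly_1"] = "../assembly_1.stdout";
#   assemblies_len["assembly_1"] = 4639675;
#   assemblies_contigs["assembly_1"] = 120;
#   assemblies_n50["assembly_1"] = "85000";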
def calculate_ave_read_support(combined_output_dirpath, assemblies):
    unique_contigs_fpath = os.path.join(combined_output_dirpath, 'contigs_reports',
                                        qconfig.unique_contigs_fname_pattern)
    for assembly in assemblies:
        aligned_contigs_by_ref = dict()
        assembly_label = qutils.label_from_fpath(assembly.fpath)
        corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
        with open(unique_contigs_fpath % corr_assembly_label) as in_f:
            for line in in_f:
                ref_name, contig_len, contig_cov = line.strip().split('\t')
                aligned_contigs_by_ref.setdefault(ref_name, []).append((float(contig_len), float(contig_cov)))
        for ref_name, contigs in aligned_contigs_by_ref.items():
            ref_cov = sum(contig_cov * aligned_len for (aligned_len, contig_cov) in contigs)
            ref_cov /= sum(aligned_len for (aligned_len, contig_cov) in contigs)
            corr_assembly_label = qutils.label_from_fpath_for_fname(assembly.fpath)
            ref_contigs_fpath = os.path.join(os.path.dirname(assembly.fpath),
                                             corr_assembly_label + '_to_' + ref_name + '.fasta')
            qconfig.assembly_labels_by_fpath[ref_contigs_fpath] = assembly_label
            report = reporting.get(ref_contigs_fpath, ref_name=ref_name)
            report.add_field(reporting.Fields.AVE_READ_SUPPORT, '%.2f' % ref_cov)
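# Worked example of the length-weighted average read support computed above,
# using illustrative (aligned_len, contig_cov) pairs:
contigs_for_ref = [(1000.0, 30.0), (3000.0, 10.0)]
ref_cov = sum(cov * length for length, cov in contigs_for_ref) / sum(length for length, _ in contigs_for_ref)
# (1000*30 + 3000*10) / (1000 + 3000) = 60000 / 4000 = 15.0, reported as '15.00'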
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(
            delayed(predict_genes)(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths))
    else:
        results = [predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
                   for index, contigs_fpath in enumerate(contigs_fpaths)]

    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label], unique, full_genes, partial_genes = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if full_genes is not None:
            genes = ['%s + %s part' % (full_cnt, partial_cnt)
                     for full_cnt, partial_cnt in zip(full_genes, partial_genes)]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique is None and full_genes is None:
            logger.error('Failed running Glimmer for %s. ' % label +
                         ('Run with the --debug option to see the command line.' if not qconfig.debug else ''))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
def do(contigs_fpaths, output_dir, logger):
    logger.print_timestamp()
    logger.info('Running Barrnap...')

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    log_fpath = join(output_dir, 'barrnap.log')
    logger.info('Logging to ' + log_fpath + '...')
    kingdom = 'bac' if qconfig.prokaryote else 'euk'
    gff_fpaths = [join(output_dir, qutils.label_from_fpath_for_fname(contigs_fpath) + '.rna.gff')
                  for contigs_fpath in contigs_fpaths]

    barrnap_args = [(contigs_fpath, gff_fpath, log_fpath, threads, kingdom)
                    for contigs_fpath, gff_fpath in zip(contigs_fpaths, gff_fpaths)]
    run_parallel(run, barrnap_args, qconfig.max_threads)

    # nothing to report if no GFF file was produced for any assembly
    if not any(os.path.isfile(gff_fpath) for gff_fpath in gff_fpaths):
        logger.info('Failed predicting the location of ribosomal RNA genes.')
        return

    # saving results
    for index, (contigs_fpath, gff_fpath) in enumerate(zip(contigs_fpaths, gff_fpaths)):
        report = reporting.get(contigs_fpath)
        if not os.path.isfile(gff_fpath):
            logger.error('Failed running Barrnap for ' + contigs_fpath + '. See ' + log_fpath + ' for information.')
            continue
        genes = parse_gff(open(gff_fpath), 'rrna')
        part_count = len([gene for gene in genes if 'product' in gene.attributes and 'partial' in gene.attributes['product']])
        total_count = len(genes)
        report.add_field(reporting.Fields.RNA_GENES, '%s + %s part' % (total_count - part_count, part_count))
        logger.info(' ' + qutils.index_to_str(index) + ' Ribosomal RNA genes = ' + str(total_count))
        logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + gff_fpath)

    logger.info('Done.')
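# Illustration of the partial/complete rRNA counting above, using plain objects in place of
# parsed GFF records (parse_gff and its record type are QUAST internals; the Gene stand-in
# below is purely for this example):
class Gene(object):
    def __init__(self, product):
        self.attributes = {'product': product}

genes = [Gene('16S ribosomal RNA'), Gene('23S ribosomal RNA (partial)'), Gene('5S ribosomal RNA')]
part_count = len([g for g in genes if 'product' in g.attributes and 'partial' in g.attributes['product']])
total_count = len(genes)
print('%s + %s part' % (total_count - part_count, part_count))  # -> "2 + 1 part"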
def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath):
    from quast_libs import reporting

    ref_reads_stats = None
    if ref_fpath:
        ref_name = qutils.name_from_fpath(ref_fpath)
        stats_fpath = join(output_dir, ref_name + '.stat')
        if isfile(stats_fpath):
            ref_reads_stats = parse_reads_stats(stats_fpath)
            if int(ref_reads_stats['mapped']) == 0:
                logger.info(' BWA: nothing aligned for reference.')

    # process all contigs files
    for index, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        stats_fpath = join(output_dir, assembly_name + '.stat')
        if ref_reads_stats:
            report.add_field(reporting.Fields.REF_MAPPED_READS, ref_reads_stats['mapped'])
            report.add_field(reporting.Fields.REF_MAPPED_READS_PCNT, ref_reads_stats['mapped_pcnt'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS, ref_reads_stats['paired'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS_PCNT, ref_reads_stats['paired_pcnt'])
            report.add_field(reporting.Fields.REF_SINGLETONS, ref_reads_stats['singletons'])
            report.add_field(reporting.Fields.REF_SINGLETONS_PCNT, ref_reads_stats['singletons_pcnt'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS, ref_reads_stats['misjoint'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS_PCNT, ref_reads_stats['misjoint_pcnt'])
            report.add_field(reporting.Fields.REF_DEPTH, ref_reads_stats['depth'])
            if ref_reads_stats['coverage_thresholds'] and \
                    len(ref_reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
                report.add_field(reporting.Fields.REF_COVERAGE__FOR_THRESHOLDS,
                                 [ref_reads_stats['coverage_thresholds'][i]
                                  for i, threshold in enumerate(qconfig.coverage_thresholds)])
                report.add_field(reporting.Fields.REF_COVERAGE_1X_THRESHOLD, ref_reads_stats['coverage_thresholds'][0])
        if not isfile(stats_fpath):
            continue
        reads_stats = parse_reads_stats(stats_fpath)
        report.add_field(reporting.Fields.TOTAL_READS, reads_stats['total'])
        report.add_field(reporting.Fields.LEFT_READS, reads_stats['left'])
        report.add_field(reporting.Fields.RIGHT_READS, reads_stats['right'])
        report.add_field(reporting.Fields.MAPPED_READS, reads_stats['mapped'])
        report.add_field(reporting.Fields.MAPPED_READS_PCNT, reads_stats['mapped_pcnt'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS, reads_stats['paired'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS_PCNT, reads_stats['paired_pcnt'])
        if int(reads_stats['mapped']) == 0:
            logger.info(' ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.')
        report.add_field(reporting.Fields.SINGLETONS, reads_stats['singletons'])
        report.add_field(reporting.Fields.SINGLETONS_PCNT, reads_stats['singletons_pcnt'])
        report.add_field(reporting.Fields.MISJOINT_READS, reads_stats['misjoint'])
        report.add_field(reporting.Fields.MISJOINT_READS_PCNT, reads_stats['misjoint_pcnt'])
        report.add_field(reporting.Fields.DEPTH, reads_stats['depth'])
        if reads_stats['coverage_thresholds'] and \
                len(reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
            report.add_field(reporting.Fields.COVERAGE__FOR_THRESHOLDS,
                             [reads_stats['coverage_thresholds'][i]
                              for i, threshold in enumerate(qconfig.coverage_thresholds)])
            report.add_field(reporting.Fields.COVERAGE_1X_THRESHOLD, reads_stats['coverage_thresholds'][0])
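# The field assignments above imply the dict shape parse_reads_stats() is expected to return.
# The keys are taken directly from the accesses in add_statistics_to_report(); the actual parser
# lives elsewhere in QUAST, and the values below are placeholders for illustration only:
example_reads_stats = {
    'total': '1000000', 'left': '500000', 'right': '500000',
    'mapped': '980000', 'mapped_pcnt': '98.0',
    'paired': '950000', 'paired_pcnt': '95.0',
    'singletons': '10000', 'singletons_pcnt': '1.0',
    'misjoint': '5000', 'misjoint_pcnt': '0.5',
    'depth': '50',
    'coverage_thresholds': ['99.0'],  # one value per entry in qconfig.coverage_thresholds
}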
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(
            delayed(predict_genes)(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
            for index, contigs_fpath in enumerate(contigs_fpaths))
    else:
        results = [predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
                   for index, contigs_fpath in enumerate(contigs_fpaths)]

    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label], unique, full_genes, partial_genes = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if full_genes is not None:
            genes = ['%s + %s part' % (full_cnt, partial_cnt)
                     for full_cnt, partial_cnt in zip(full_genes, partial_genes)]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique is None and full_genes is None:
            logger.error('Failed running Glimmer for %s. ' % label +
                         ('Run with the --debug option to see the command line.' if not qconfig.debug else ''))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    tool_exec_fpath = compile_glimmer(logger)
    if not tool_exec_fpath:
        return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    parallel_args = [(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tool_exec_fpath, tmp_dirpath)
                     for index, contigs_fpath in enumerate(contigs_fpaths)]
    genes_list, unique, full_genes, partial_genes = run_parallel(predict_genes, parallel_args, n_jobs)

    genes_by_labels = dict()
    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        label = qutils.label_from_fpath(contigs_fpath)
        genes_by_labels[label] = genes_list[i]
        if unique[i] is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique[i])
        if full_genes[i] is not None:
            genes = ['%s + %s part' % (full_cnt, partial_cnt)
                     for full_cnt, partial_cnt in zip(full_genes[i], partial_genes[i])]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique[i] is None and full_genes[i] is None:
            logger.error('Failed running Glimmer for %s. ' % label +
                         ('Run with the --debug option to see the command line.' if not qconfig.debug else ''))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
    return genes_by_labels
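# A minimal, self-contained sketch of the run_parallel contract assumed by the caller above:
# the helper applies a worker to each argument tuple and transposes the per-assembly result
# tuples into per-field lists, which is what lets the caller unpack
# genes_list, unique, full_genes, partial_genes. This is an illustration only, not QUAST's
# actual qutils.run_parallel implementation.
from multiprocessing import Pool

def run_parallel_sketch(worker, args_tuples, n_jobs=1):
    if n_jobs > 1:
        with Pool(n_jobs) as pool:
            results = pool.starmap(worker, args_tuples)
    else:
        results = [worker(*args) for args in args_tuples]
    # transpose [(a1, b1, ...), (a2, b2, ...)] -> ([a1, a2], [b1, b2], ...)
    return tuple(map(list, zip(*results)))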
def js_data_gen(assemblies, contigs_fpaths, chromosomes_length, output_dirpath, structures_by_labels,
                contigs_by_assemblies, ambiguity_alignments_by_labels=None, contig_names_by_refs=None,
                ref_fpath=None, stdout_pattern=None, features_data=None, gc_fpath=None, cov_fpath=None,
                physical_cov_fpath=None, json_output_dir=None):
    chr_names = []
    if chromosomes_length and assemblies:
        chr_to_aligned_blocks = OrderedDict()
        chr_names = list(chromosomes_length.keys())
        for assembly in assemblies.assemblies:
            chr_to_aligned_blocks[assembly.label] = defaultdict(list)
            similar_correct = 0
            similar_misassembled = 0
            for align in assembly.alignments:
                chr_to_aligned_blocks[assembly.label][align.ref_name].append(align)
                if align.similar:
                    if align.misassembled:
                        similar_misassembled += 1
                    else:
                        similar_correct += 1
            report = reporting.get(assembly.fpath)
            report.add_field(reporting.Fields.SIMILAR_CONTIGS, similar_correct)
            report.add_field(reporting.Fields.SIMILAR_MIS_BLOCKS, similar_misassembled)

    main_menu_fpath = os.path.join(output_dirpath, qconfig.icarus_html_fname)
    output_all_files_dir_path = os.path.join(output_dirpath, qconfig.icarus_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)

    chr_full_names, contig_names_by_refs = group_references(chr_names, contig_names_by_refs, chromosomes_length, ref_fpath)

    cov_data, max_depth = parse_cov_fpath(cov_fpath, chr_names, chr_full_names, contig_names_by_refs)
    physical_cov_data, physical_max_depth = parse_cov_fpath(physical_cov_fpath, chr_names, chr_full_names, contig_names_by_refs)
    gc_data, max_gc = parse_cov_fpath(gc_fpath, chr_names, chr_full_names, contig_names_by_refs)

    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    nx_marks = [reporting.Fields.N50, reporting.Fields.N75, reporting.Fields.NG50, reporting.Fields.NG75]

    assemblies_data, assemblies_contig_size_data, assemblies_n50 = \
        get_assemblies_data(contigs_fpaths, output_all_files_dir_path, stdout_pattern, nx_marks)

    ref_contigs_dict = {}
    chr_lengths_dict = {}

    ref_data = 'var references_by_id = {};\n'
    chr_names_by_id = dict((chrom, str(i)) for i, chrom in enumerate(chr_names))
    for chrom, i in chr_names_by_id.items():
        ref_data += 'references_by_id["' + str(i) + '"] = "' + chrom + '";\n'
    for i, chr in enumerate(chr_full_names):
        if contig_names_by_refs:
            ref_contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
        elif len(chr_full_names) == 1:
            ref_contigs = chr_names
        else:
            ref_contigs = [chr]
        ref_contigs_dict[chr] = ref_contigs
        chr_lengths_dict[chr] = [0] + [chromosomes_length[contig] for contig in ref_contigs]

    num_misassemblies = defaultdict(int)
    aligned_bases_by_chr = defaultdict(list)
    aligned_assemblies = defaultdict(set)
    for i, chr in enumerate(chr_full_names):
        ref_contigs = ref_contigs_dict[chr]
        chr_lengths = chr_lengths_dict[chr]
        chr_size = sum([chromosomes_length[contig] for contig in ref_contigs])
        chr_sizes[chr] = chr_size
        num_contigs[chr] = len(ref_contigs)
        data_str = []
        data_str.append('var chromosomes_len = {};')
        for ref_contig in ref_contigs:
            l = chromosomes_length[ref_contig]
            data_str.append('chromosomes_len["' + ref_contig + '"] = ' + str(l) + ';')
            aligned_bases_by_chr[chr].extend(aligned_bases[ref_contig])

        cov_data_str = format_cov_data(chr, cov_data, 'coverage_data', max_depth, 'reads_max_depth') if cov_data else None
        physical_cov_data_str = format_cov_data(chr, physical_cov_data, 'physical_coverage_data',
                                                physical_max_depth, 'physical_max_depth') if physical_cov_data else None
        gc_data_str = format_cov_data(chr, gc_data, 'gc_data', 100, 'max_gc') if gc_data else None

        alignment_viewer_fpath, ref_data_str, contigs_structure_str, additional_assemblies_data, ms_selectors, \
            num_misassemblies[chr], aligned_assemblies[chr] = \
            prepare_alignment_data_for_one_ref(chr, chr_full_names, chr_names_by_id, ref_contigs, data_str,
                                               chr_to_aligned_blocks, structures_by_labels, contigs_by_assemblies,
                                               ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
                                               cov_data_str=cov_data_str, physical_cov_data_str=physical_cov_data_str,
                                               gc_data_str=gc_data_str, contig_names_by_refs=contig_names_by_refs,
                                               output_dir_path=output_all_files_dir_path)

        ref_name = qutils.name_from_fpath(ref_fpath)
        save_alignment_data_for_one_ref(chr, ref_contigs, ref_name, json_output_dir, alignment_viewer_fpath,
                                        ref_data_str, ms_selectors, ref_data=ref_data, features_data=features_data,
                                        assemblies_data=assemblies_data, contigs_structure_str=contigs_structure_str,
                                        additional_assemblies_data=additional_assemblies_data)

    contigs_sizes_str, too_many_contigs = get_contigs_data(contigs_by_assemblies, nx_marks, assemblies_n50,
                                                           structures_by_labels, contig_names_by_refs,
                                                           chr_names, chr_full_names)
    all_data = assemblies_data + assemblies_contig_size_data + contigs_sizes_str
    save_contig_size_html(output_all_files_dir_path, json_output_dir, too_many_contigs, all_data)

    icarus_links = defaultdict(list)
    if len(chr_full_names) > 1:
        chr_link = qconfig.icarus_html_fname
        icarus_links["links"].append(chr_link)
        icarus_links["links_names"].append(qconfig.icarus_link)

    main_menu_template_fpath = html_saver.get_real_path(qconfig.icarus_menu_template_fname)
    main_data_dict = dict()

    labels = [qconfig.assembly_labels_by_fpath[contigs_fpath] for contigs_fpath in contigs_fpaths]
    main_data_dict['assemblies'] = labels
    html_saver.save_icarus_data(json_output_dir, ', '.join(labels), 'assemblies')

    contig_size_browser_fpath = os.path.join(qconfig.icarus_dirname, qconfig.contig_size_viewer_fname)
    main_data_dict['contig_size_html'] = contig_size_browser_fpath
    html_saver.save_icarus_data(json_output_dir, contig_size_browser_fpath, 'contig_size_html')
    if not chr_names:
        icarus_links["links"].append(contig_size_browser_fpath)
        icarus_links["links_names"].append(qconfig.icarus_link)

    if chr_full_names and (len(chr_full_names) > 1 or qconfig.is_combined_ref):
        main_data_dict['table_references'] = {'references': []}
        num_aligned_assemblies = [len(aligned_assemblies[chr]) for chr in chr_full_names]
        is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
        if is_unaligned_asm_exists:
            main_data_dict['table_references']['th_assemblies'] = True
        for chr in sorted(chr_full_names, key=natural_sort):
            chr_link, chr_name, chr_genome, chr_size, tooltip = \
                get_info_by_chr(chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths, contig_names_by_refs,
                                one_chromosome=len(chr_full_names) == 1)
            reference_dict = dict()
            reference_dict['chr_link'] = chr_link
            reference_dict['tooltip'] = tooltip
            reference_dict['chr_name'] = os.path.basename(chr_name)
            reference_dict['num_contigs'] = str(num_contigs[chr])
            reference_dict['chr_size'] = format_long_numbers(chr_size)
            if is_unaligned_asm_exists:
                reference_dict['num_assemblies'] = str(len(aligned_assemblies[chr]))
            reference_dict['chr_gf'] = '%.3f' % chr_genome
            reference_dict['num_misassemblies'] = str(num_misassemblies[chr])
            main_data_dict['table_references']['references'].append(reference_dict)
        html_saver.save_icarus_data(json_output_dir, main_data_dict['table_references'], 'table_references', as_text=False)
    else:
        if chr_full_names:
            chr = chr_full_names[0]
            chr_link, chr_name, chr_genome, chr_size, tooltip = \
                get_info_by_chr(chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths, contig_names_by_refs,
                                one_chromosome=True)
            main_data_dict['one_reference'] = dict()
            main_data_dict['one_reference']['alignment_link'] = chr_link
            main_data_dict['one_reference']['ref_fpath'] = os.path.basename(ref_fpath)
            main_data_dict['one_reference']['ref_fragments'] = str(num_contigs[chr])
            main_data_dict['one_reference']['ref_size'] = format_long_numbers(chr_size)
            main_data_dict['one_reference']['ref_gf'] = '%.3f' % chr_genome
            main_data_dict['one_reference']['num_misassemblies'] = str(num_misassemblies[chr])
            icarus_links["links"].append(chr_link)
            icarus_links["links_names"].append(qconfig.icarus_link)
            html_saver.save_icarus_data(json_output_dir, main_data_dict['one_reference'], 'menu_reference', as_text=False)
    html_saver.save_icarus_html(main_menu_template_fpath, main_menu_fpath, main_data_dict)
    html_saver.save_icarus_links(output_dirpath, icarus_links)
    return main_menu_fpath
def do(reference, contigs_fpaths, cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    num_nf_errors = logger._num_nf_errors

    if not compile_aligner(logger):
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if qconfig.memory_efficient:
        threads = 1
    else:
        threads = max(int(qconfig.max_threads / n_jobs), 1)
    from joblib import Parallel, delayed
    if not qconfig.splitted_ref:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
            cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        if len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient:
            statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
                cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
                for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
        else:
            statuses_results_lengths_tuples = []
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
                statuses_results_lengths_tuples.append(align_and_analyze(
                    cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath,
                    parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping
    statuses, results, aligned_lengths = [x[0] for x in statuses_results_lengths_tuples], \
                                         [x[1] for x in statuses_results_lengths_tuples], \
                                         [x[2] for x in statuses_results_lengths_tuples]
    reports = []

    if qconfig.is_combined_ref:
        ref_misassemblies = [result['istranslocations_by_refs'] if result else [] for result in results]
        if ref_misassemblies:
            for i, fpath in enumerate(contigs_fpaths):
                if ref_misassemblies[i]:
                    assembly_name = qutils.name_from_fpath(fpath)
                    all_rows = []
                    all_refs = sorted(list(set([ref for ref in ref_labels_by_chromosomes.values()])))
                    row = {'metricName': 'References', 'values': [ref_num + 1 for ref_num in range(len(all_refs))]}
                    all_rows.append(row)
                    for k in all_refs:
                        row = {'metricName': k, 'values': []}
                        for ref in all_refs:
                            if ref == k or ref not in ref_misassemblies[i]:
                                row['values'].append(None)
                            else:
                                row['values'].append(ref_misassemblies[i][ref][k])
                        all_rows.append(row)
                    misassembly_by_ref_fpath = join(output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
                    print >> open(misassembly_by_ref_fpath, 'w'), 'Number of interspecies translocations by references: \n'
                    print_file(all_rows, misassembly_by_ref_fpath, append_to_existing_file=True)
                    print >> open(misassembly_by_ref_fpath, 'a'), '\nReferences: '
                    for ref_num, ref in enumerate(all_refs):
                        print >> open(misassembly_by_ref_fpath, 'a'), str(ref_num + 1) + ' - ' + ref
                    logger.info(' Information about interspecies translocations by references for %s is saved to %s' %
                                (assembly_name, misassembly_by_ref_fpath))

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        if qconfig.draw_plots:
            import plotter
            plotter.draw_misassembl_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')

    oks = nucmer_statuses.values().count(NucmerStatus.OK)
    not_aligned = nucmer_statuses.values().count(NucmerStatus.NOT_ALIGNED)
    failed = nucmer_statuses.values().count(NucmerStatus.FAILED)
    errors = nucmer_statuses.values().count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) +
                         '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return nucmer_statuses, aligned_lengths_per_fpath
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, features_dict, operons_fpaths,
       detailed_contigs_reports_dirpath, genome_stats_dirpath):
    coords_dirpath = os.path.join(detailed_contigs_reports_dirpath, qconfig.minimap_output_dirname)
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        coords_dirpath = os.path.join(coords_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    genome_size, reference_chromosomes, ns_by_chromosomes = fastaparser.get_genome_stats(ref_fpath)

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = os.path.join(genome_stats_dirpath, 'genome_info.txt')
    res_file = open(result_fpath, 'w')

    containers = []
    for feature, feature_fpath in features_dict.items():
        containers.append(FeatureContainer([feature_fpath], feature))
    if not features_dict:
        logger.notice('No file with genomic features was provided. '
                      'Use the --features option if you want to specify it.\n', indent=' ')
    if operons_fpaths:
        containers.append(FeatureContainer(operons_fpaths, 'operon'))
    else:
        logger.notice('No file with operons was provided. '
                      'Use the -O option if you want to specify it.', indent=' ')
    for container in containers:
        if not container.fpaths:
            continue
        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)
        if len(container.region_list) == 0:
            logger.warning('No genomic features of type "' + container.kind + '" were loaded.', indent=' ')
            res_file.write('Genomic features of type "' + container.kind + '" loaded: ' + 'None' + '\n')
        else:
            logger.info(' Loaded ' + str(len(container.region_list)) + ' genomic features of type "' + container.kind + '"')
            res_file.write('Genomic features of type "' + container.kind + '" loaded: ' + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list,
                                                              list(reference_chromosomes.keys()))

    ref_genes_num, ref_operons_num = None, None
    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        genomic_features = 0
        for container in containers:
            if container.kind == 'operon':
                ref_operons_num = len(container.region_list)
                report.add_field(reporting.Fields.REF_OPERONS, len(container.region_list))
            else:
                genomic_features += len(container.region_list)
        if genomic_features:
            ref_genes_num = genomic_features
            report.add_field(reporting.Fields.REF_GENES, genomic_features)

    # for cumulative plots:
    files_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_unsorted_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}
    files_unsorted_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    parallel_run_args = [(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                          reference_chromosomes, ns_by_chromosomes, containers)
                         for index, contigs_fpath in enumerate(aligned_contigs_fpaths)]
    ref_lengths, results_genes_operons_tuples = run_parallel(process_single_file, parallel_run_args, n_jobs,
                                                             filter_results=True)
    num_nf_errors += len(aligned_contigs_fpaths) - len(ref_lengths)
    logger._num_nf_errors = num_nf_errors
    if not ref_lengths:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [ref_lengths[i][ref] for i in range(len(ref_lengths))]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.items():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, ' +
                       'total length without N\'s: ' + str(chr_len - len(ns_by_chromosomes[chr_name])) +
                       ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('=' * 120 + '\n')

    for contigs_fpath, (results, unsorted_features_in_contigs, features_in_contigs,
                        unsorted_operons_in_contigs, operons_in_contigs) \
            in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_features_in_contigs[contigs_fpath] = features_in_contigs
        files_unsorted_features_in_contigs[contigs_fpath] = unsorted_features_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        files_unsorted_operons_in_contigs[contigs_fpath] = unsorted_operons_in_contigs
        full_found_genes.append(sum(features_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
                       % (assembly_name[:24], report.get_field(reporting.Fields.MAPPEDGENOME),
                          report.get_field(reporting.Fields.DUPLICATION_RATIO), gaps_count))
        genome_mapped.append(float(report.get_field(reporting.Fields.MAPPEDGENOME)))

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                    (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')

    res_file.close()

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if ref_genes_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'features',
                                                files_features_in_contigs, ref_genes_num)
        if ref_operons_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        from quast_libs.ca_utils.misc import contigs_aligned_lengths
        if ref_genes_num:
            plotter.genes_operons_plot(ref_genes_num, aligned_contigs_fpaths, files_features_in_contigs,
                                       genome_stats_dirpath + '/features_cumulative_plot', 'genomic features')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths,
                             files_unsorted_features_in_contigs,
                             genome_stats_dirpath + '/features_frcurve_plot', 'genomic features')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_features_histogram', '# complete genomic features')
        if ref_operons_num:
            plotter.genes_operons_plot(ref_operons_num, aligned_contigs_fpaths, files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths,
                             files_unsorted_operons_in_contigs,
                             genome_stats_dirpath + '/operons_frcurve_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram', '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram', 'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
    return containers
def save_result(result, report, fname, ref_fpath):
    region_misassemblies = result['region_misassemblies']
    misassemblies_by_ref = result['misassemblies_by_ref']
    region_struct_variations = result['region_struct_variations']
    misassemblies_matched_sv = result['misassemblies_matched_sv']
    misassembled_contigs = result['misassembled_contigs']
    misassembled_bases = result['misassembled_bases']
    misassembly_internal_overlap = result['misassembly_internal_overlap']
    unaligned = result['unaligned']
    partially_unaligned = result['partially_unaligned']
    partially_unaligned_bases = result['partially_unaligned_bases']
    fully_unaligned_bases = result['fully_unaligned_bases']
    ambiguous_contigs = result['ambiguous_contigs']
    ambiguous_contigs_extra_bases = result['ambiguous_contigs_extra_bases']
    SNPs = result['SNPs']
    indels_list = result['indels_list']
    total_aligned_bases = result['total_aligned_bases']
    half_unaligned_with_misassembly = result['half_unaligned_with_misassembly']

    report.add_field(reporting.Fields.MISLOCAL, region_misassemblies.count(Misassembly.LOCAL))
    report.add_field(reporting.Fields.MISASSEMBL, region_misassemblies.count(Misassembly.RELOCATION) +
                     region_misassemblies.count(Misassembly.INVERSION) +
                     region_misassemblies.count(Misassembly.TRANSLOCATION) +
                     region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.MISCONTIGS, len(misassembled_contigs))
    report.add_field(reporting.Fields.MISCONTIGSBASES, misassembled_bases)
    report.add_field(reporting.Fields.MISINTERNALOVERLAP, misassembly_internal_overlap)
    if qconfig.bed:
        report.add_field(reporting.Fields.STRUCT_VARIATIONS, misassemblies_matched_sv)
    report.add_field(reporting.Fields.UNALIGNED, '%d + %d part' % (unaligned, partially_unaligned))
    report.add_field(reporting.Fields.UNALIGNEDBASES, (fully_unaligned_bases + partially_unaligned_bases))
    report.add_field(reporting.Fields.AMBIGUOUS, ambiguous_contigs)
    report.add_field(reporting.Fields.AMBIGUOUSEXTRABASES, ambiguous_contigs_extra_bases)
    report.add_field(reporting.Fields.MISMATCHES, SNPs)
    # different types of indels:
    if indels_list is not None:
        report.add_field(reporting.Fields.INDELS, len(indels_list))
        report.add_field(reporting.Fields.INDELSBASES, sum(indels_list))
        report.add_field(reporting.Fields.MIS_SHORT_INDELS, len([i for i in indels_list if i <= qconfig.SHORT_INDEL_THRESHOLD]))
        report.add_field(reporting.Fields.MIS_LONG_INDELS, len([i for i in indels_list if i > qconfig.SHORT_INDEL_THRESHOLD]))

    if total_aligned_bases:
        report.add_field(reporting.Fields.SUBSERROR, "%.2f" % (float(SNPs) * 100000.0 / float(total_aligned_bases)))
        report.add_field(reporting.Fields.INDELSERROR, "%.2f" % (float(report.get_field(reporting.Fields.INDELS)) *
                                                                 100000.0 / float(total_aligned_bases)))

    # for misassemblies report:
    report.add_field(reporting.Fields.MIS_ALL_EXTENSIVE, region_misassemblies.count(Misassembly.RELOCATION) +
                     region_misassemblies.count(Misassembly.INVERSION) +
                     region_misassemblies.count(Misassembly.TRANSLOCATION) +
                     region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.MIS_RELOCATION, region_misassemblies.count(Misassembly.RELOCATION))
    report.add_field(reporting.Fields.MIS_TRANSLOCATION, region_misassemblies.count(Misassembly.TRANSLOCATION))
    report.add_field(reporting.Fields.MIS_INVERTION, region_misassemblies.count(Misassembly.INVERSION))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_CONTIGS, len(misassembled_contigs))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_BASES, misassembled_bases)
    report.add_field(reporting.Fields.MIS_LOCAL, region_misassemblies.count(Misassembly.LOCAL))

    if qconfig.is_combined_ref:
        report.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS, region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, region_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
        report.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES, region_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        all_references = sorted(list(set([ref for ref in ref_labels_by_chromosomes.values()])))
        for ref_name in all_references:
            subreport = reporting.get(fname, ref_name=ref_name)
            ref_misassemblies = misassemblies_by_ref[ref_name]
            subreport.add_field(reporting.Fields.MIS_ALL_EXTENSIVE, ref_misassemblies.count(Misassembly.RELOCATION) +
                                ref_misassemblies.count(Misassembly.INVERSION) +
                                ref_misassemblies.count(Misassembly.TRANSLOCATION) +
                                ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_RELOCATION, ref_misassemblies.count(Misassembly.RELOCATION))
            subreport.add_field(reporting.Fields.MIS_TRANSLOCATION, ref_misassemblies.count(Misassembly.TRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_INVERTION, ref_misassemblies.count(Misassembly.INVERSION))
            subreport.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_LOCAL, ref_misassemblies.count(Misassembly.LOCAL))
            subreport.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES, ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
            subreport.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
            if fname not in qconfig.dict_of_broken_scaffolds:
                subreport.add_field(reporting.Fields.MIS_SCAFFOLDS_GAP, ref_misassemblies.count(Misassembly.SCAFFOLD_GAP))
            if qconfig.check_for_fragmented_ref:
                subreport.add_field(reporting.Fields.MIS_FRAGMENTED, ref_misassemblies.count(Misassembly.FRAGMENTED))
    elif intergenomic_misassemblies_by_asm:
        label = qutils.label_from_fpath(fname)
        ref_name = qutils.name_from_fpath(ref_fpath)
        ref_misassemblies = intergenomic_misassemblies_by_asm[label][ref_name]
        report.add_field(reporting.Fields.MIS_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(reporting.Fields.POSSIBLE_MISASSEMBLIES, ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        report.add_field(reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
    if fname not in qconfig.dict_of_broken_scaffolds:
        report.add_field(reporting.Fields.MIS_SCAFFOLDS_GAP, region_misassemblies.count(Misassembly.SCAFFOLD_GAP))
    if qconfig.check_for_fragmented_ref:
        report.add_field(reporting.Fields.MIS_FRAGMENTED, region_misassemblies.count(Misassembly.FRAGMENTED))

    # for unaligned report:
    report.add_field(reporting.Fields.UNALIGNED_FULL_CNTGS, unaligned)
    report.add_field(reporting.Fields.UNALIGNED_FULL_LENGTH, fully_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_PART_CNTGS, partially_unaligned)
    report.add_field(reporting.Fields.UNALIGNED_PART_LENGTH, partially_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_MISASSEMBLED_CTGS, half_unaligned_with_misassembly)
    return report
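# Worked example of the per-100-kbp error rate computed in save_result():
# SUBSERROR = SNPs * 100000 / total_aligned_bases, with illustrative numbers
SNPs, total_aligned_bases = 2500, 5000000
print("%.2f" % (float(SNPs) * 100000.0 / float(total_aligned_bases)))  # -> 50.00 mismatches per 100 kbp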
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None

    if qconfig.draw_plots:
        compile_gnuplot(logger, only_clean=False)

    num_nf_errors = logger._num_nf_errors
    create_nucmer_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.splitted_ref and not qconfig.memory_efficient:
        statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
            is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
    else:
        if len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient:
            statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)(
                is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads)
                for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)))
        else:
            statuses_results_lengths_tuples = []
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)):
                statuses_results_lengths_tuples.append(align_and_analyze(
                    is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath,
                    parallel_by_chr=True, threads=qconfig.max_threads))

    # unzipping
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = \
        [[x[i] for x in statuses_results_lengths_tuples] for i in range(5)]
    reports = []

    nucmer_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    if NucmerStatus.OK in nucmer_statuses.values():
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == NucmerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference))
        elif statuses[index] == NucmerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if NucmerStatus.OK in nucmer_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i]) for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths,
                             misassemblies_in_contigs, join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies')

    oks = list(nucmer_statuses.values()).count(NucmerStatus.OK)
    not_aligned = list(nucmer_statuses.values()).count(NucmerStatus.NOT_ALIGNED)
    failed = list(nucmer_statuses.values()).count(NucmerStatus.FAILED)
    errors = list(nucmer_statuses.values()).count(NucmerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(nucmer_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) +
                         '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return nucmer_statuses, aligned_lengths_per_fpath
def main(args): check_dirpath( qconfig.QUAST_HOME, 'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n.' + 'Please, put QUAST in a different directory, then try again.\n', exit_code=3) if not args: qconfig.usage(stream=sys.stderr) sys.exit(1) try: import imp imp.reload(qconfig) imp.reload(qutils) except: reload(qconfig) reload(qutils) try: locale.setlocale(locale.LC_ALL, 'en_US.utf8') except Exception: try: locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') except Exception: logger.warning('Python locale settings can\'t be changed') quast_path = [os.path.realpath(__file__)] quast_py_args, contigs_fpaths = parse_options(logger, quast_path + args) output_dirpath, ref_fpath, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname) logger.main_info() logger.print_params() ######################################################################## from quast_libs import reporting reports = reporting.reports try: import imp imp.reload(reporting) except: reload(reporting) reporting.reports = reports reporting.assembly_fpaths = [] from quast_libs import plotter # Do not remove this line! It would lead to a warning in matplotlib. if qconfig.is_combined_ref: corrected_dirpath = os.path.join(output_dirpath, '..', qconfig.corrected_dirname) else: if os.path.isdir(corrected_dirpath): shutil.rmtree(corrected_dirpath) os.mkdir(corrected_dirpath) qconfig.set_max_threads(logger) check_reads_fpaths(logger) # PROCESSING REFERENCE if ref_fpath: logger.main_info() logger.main_info('Reference:') original_ref_fpath = ref_fpath ref_fpath = qutils.correct_reference(ref_fpath, corrected_dirpath) if qconfig.optimal_assembly: if not qconfig.pacbio_reads and not qconfig.nanopore_reads and not qconfig.mate_pairs: logger.warning( 'Optimal assembly cannot be created. It requires mate-pairs or long reads (Pacbio SMRT or Oxford Nanopore).' ) else: optimal_assembly_fpath = optimal_assembly.do( ref_fpath, original_ref_fpath, os.path.join(output_dirpath, qconfig.optimal_assembly_basename)) if optimal_assembly_fpath is not None: contigs_fpaths.insert(0, optimal_assembly_fpath) labels.insert(0, 'Optimal') labels = qutils.process_labels(contigs_fpaths, labels) else: ref_fpath = '' # PROCESSING CONTIGS logger.main_info() logger.main_info('Contigs:') contigs_fpaths, old_contigs_fpaths = qutils.correct_contigs( contigs_fpaths, corrected_dirpath, labels, reporting) for contigs_fpath in contigs_fpaths: report = reporting.get(contigs_fpath) report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath)) qconfig.assemblies_num = len(contigs_fpaths) cov_fpath = qconfig.cov_fpath physical_cov_fpath = qconfig.phys_cov_fpath if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_sam or qconfig.sam_fpaths or qconfig.bam_fpaths: bed_fpath, cov_fpath, physical_cov_fpath = reads_analyzer.do( ref_fpath, contigs_fpaths, os.path.join(output_dirpath, qconfig.reads_stats_dirname), external_logger=logger) qconfig.bed = bed_fpath if not contigs_fpaths: logger.error( "None of the assembly files contains correct contigs. 
" "Please, provide different files or decrease --min-contig threshold.", fake_if_nested_run=True) return 4 if qconfig.used_colors and qconfig.used_ls: for i, label in enumerate(labels): plotter_data.dict_color_and_ls[label] = (qconfig.used_colors[i], qconfig.used_ls[i]) qconfig.assemblies_fpaths = contigs_fpaths # Where all pdfs will be saved all_pdf_fpath = None if qconfig.draw_plots and plotter.can_draw_plots: all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname) if qconfig.json_output_dirpath: from quast_libs.html_saver import json_saver if json_saver.simplejson_error: qconfig.json_output_dirpath = None ######################################################################## ### Stats and plots ######################################################################## from quast_libs import basic_stats icarus_gc_fpath, circos_gc_fpath = basic_stats.do( ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'), output_dirpath) if qconfig.large_genome and ref_fpath: unique_kmers.do(os.path.join(output_dirpath, 'basic_stats'), ref_fpath, contigs_fpaths, logger) aligned_contigs_fpaths = [] aligned_lengths_lists = [] contig_alignment_plot_fpath = None icarus_html_fpath = None circos_png_fpath = None if ref_fpath: ######################################################################## ### former PLANTAKOLYA, PLANTAGORA ######################################################################## from quast_libs import contigs_analyzer is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref aligner_statuses, aligned_lengths_per_fpath = contigs_analyzer.do( ref_fpath, contigs_fpaths, is_cyclic, os.path.join(output_dirpath, 'contigs_reports'), old_contigs_fpaths, qconfig.bed) for contigs_fpath in contigs_fpaths: if aligner_statuses[ contigs_fpath] == contigs_analyzer.AlignerStatus.OK: aligned_contigs_fpaths.append(contigs_fpath) aligned_lengths_lists.append( aligned_lengths_per_fpath[contigs_fpath]) # Before continue evaluating, check if aligner didn't skip all of the contigs files. 
detailed_contigs_reports_dirpath = None features_containers = None if len(aligned_contigs_fpaths) and ref_fpath: detailed_contigs_reports_dirpath = os.path.join( output_dirpath, 'contigs_reports') ######################################################################## ### NAx and NGAx ("aligned Nx and NGx") ######################################################################## from quast_libs import aligned_stats aligned_stats.do(ref_fpath, aligned_contigs_fpaths, output_dirpath, aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats')) ######################################################################## ### GENOME_ANALYZER ######################################################################## from quast_libs import genome_analyzer features_containers = genome_analyzer.do( ref_fpath, aligned_contigs_fpaths, output_dirpath, qconfig.features, qconfig.operons, detailed_contigs_reports_dirpath, os.path.join(output_dirpath, 'genome_stats')) genes_by_labels = None if qconfig.gene_finding: if qconfig.glimmer: ######################################################################## ### Glimmer ######################################################################## from quast_libs import glimmer genes_by_labels = glimmer.do( contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes')) if not qconfig.glimmer or qconfig.test: ######################################################################## ### GeneMark ######################################################################## from quast_libs import genemark genes_by_labels = genemark.do( contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'), qconfig.prokaryote, qconfig.metagenemark) else: logger.main_info("") logger.notice( "Genes are not predicted by default. Use --gene-finding option to enable it." 
) if qconfig.rna_gene_finding: run_barrnap.do(contigs_fpaths, os.path.join(output_dirpath, 'predicted_genes'), logger) if qconfig.run_busco and not qconfig.is_combined_ref: if qconfig.platform_name == 'macosx': logger.main_info("") logger.warning("BUSCO can be run on Linux only") elif sys.version[0:3] == '2.5': logger.main_info("") logger.warning( "BUSCO does not support Python versions older than 2.6.") else: from quast_libs import run_busco run_busco.do(contigs_fpaths, os.path.join(output_dirpath, qconfig.busco_dirname), logger) ######################################################################## reports_fpaths, transposed_reports_fpaths = reporting.save_total( output_dirpath) ######################################################################## ### LARGE DRAWING TASKS ######################################################################## if qconfig.draw_plots or qconfig.create_icarus_html: logger.print_timestamp() logger.main_info('Creating large visual summaries...') logger.main_info( 'This may take a while: press Ctrl-C to skip this step..') try: if detailed_contigs_reports_dirpath: report_for_icarus_fpath_pattern = os.path.join( detailed_contigs_reports_dirpath, qconfig.icarus_report_fname_pattern) stdout_pattern = os.path.join( detailed_contigs_reports_dirpath, qconfig.contig_report_fname_pattern) else: report_for_icarus_fpath_pattern = None stdout_pattern = None draw_alignment_plots = qconfig.draw_svg or qconfig.create_icarus_html draw_circos_plot = qconfig.draw_plots and ref_fpath and len( aligned_contigs_fpaths) and not qconfig.space_efficient number_of_steps = sum([ int(bool(value)) for value in [draw_alignment_plots, draw_circos_plot, all_pdf_fpath] ]) if draw_alignment_plots: ######################################################################## ### VISUALIZE CONTIG ALIGNMENT ######################################################################## logger.main_info(' 1 of %d: Creating Icarus viewers...' % number_of_steps) from quast_libs import icarus icarus_html_fpath, contig_alignment_plot_fpath = icarus.do( contigs_fpaths, report_for_icarus_fpath_pattern, output_dirpath, ref_fpath, stdout_pattern=stdout_pattern, features=features_containers, cov_fpath=cov_fpath, physical_cov_fpath=physical_cov_fpath, gc_fpath=icarus_gc_fpath, json_output_dir=qconfig.json_output_dirpath, genes_by_labels=genes_by_labels) if draw_circos_plot: logger.main_info( ' %d of %d: Creating Circos plots...' % (2 if draw_alignment_plots else 1, number_of_steps)) from quast_libs import circos circos_png_fpath, circos_legend_fpath = circos.do( ref_fpath, contigs_fpaths, report_for_icarus_fpath_pattern, circos_gc_fpath, features_containers, cov_fpath, os.path.join(output_dirpath, 'circos'), logger) if all_pdf_fpath: # full report in PDF format: all tables and plots logger.main_info( ' %d of %d: Creating PDF with all tables and plots...' 
% (number_of_steps, number_of_steps)) plotter.fill_all_pdf_file(all_pdf_fpath) logger.main_info('Done') except KeyboardInterrupt: logger.main_info('..step skipped!') if all_pdf_fpath and os.path.isfile(all_pdf_fpath): os.remove(all_pdf_fpath) ######################################################################## ### TOTAL REPORT ######################################################################## logger.print_timestamp() logger.main_info('RESULTS:') logger.main_info(' Text versions of total report are saved to ' + reports_fpaths) logger.main_info( ' Text versions of transposed total report are saved to ' + transposed_reports_fpaths) if qconfig.html_report: from quast_libs.html_saver import html_saver html_saver.save_colors(output_dirpath, contigs_fpaths, plotter_data.dict_color_and_ls) html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath) if all_pdf_fpath and os.path.isfile(all_pdf_fpath): logger.main_info(' PDF version (tables and plots) is saved to ' + all_pdf_fpath) if circos_png_fpath: logger.main_info( ' Circos plot is saved to %s (the annotation is in %s). Circos configuration file is saved to %s' % (circos_png_fpath, circos_legend_fpath, circos_png_fpath.replace('.png', '.conf'))) if icarus_html_fpath: logger.main_info(' Icarus (contig browser) is saved to %s' % icarus_html_fpath) if qconfig.draw_svg and contig_alignment_plot_fpath: logger.main_info(' Contig alignment plot is saved to %s' % contig_alignment_plot_fpath) cleanup(corrected_dirpath) return logger.finish_up(check_test=qconfig.test)
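# --- illustrative sketch (not QUAST code) ---------------------------------------------
# A minimal, self-contained sketch of the step-counting idea used in the "LARGE DRAWING
# TASKS" block above: each optional step (Icarus, Circos, PDF) contributes one unit to
# number_of_steps, and the "%d of %d" progress labels shift depending on which steps are
# enabled. The step names and the print-based logging below are assumptions for the demo.
def plan_steps(draw_alignment_plots, draw_circos_plot, all_pdf_fpath):
    enabled = [('Creating Icarus viewers...', draw_alignment_plots),
               ('Creating Circos plots...', draw_circos_plot),
               ('Creating PDF with all tables and plots...', bool(all_pdf_fpath))]
    number_of_steps = sum(int(bool(flag)) for _, flag in enabled)
    step_idx = 0
    for name, flag in enabled:
        if flag:
            step_idx += 1
            print('  %d of %d: %s' % (step_idx, number_of_steps, name))

plan_steps(True, False, '/tmp/report.pdf')   # prints "1 of 2: ..." and "2 of 2: ..."
# ---------------------------------------------------------------------------------------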
def do(output_dir, ref_fpath, contigs_fpaths, logger): logger.print_timestamp() kmer_len = qconfig.unique_kmer_len logger.main_info('Running analysis based on unique ' + str(kmer_len) + '-mers...') checked_assemblies = [] for contigs_fpath in contigs_fpaths: label = qutils.label_from_fpath_for_fname(contigs_fpath) if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath): kmc_stats_fpath = join(output_dir, label + '.stat') stats_content = open(kmc_stats_fpath).read().split('\n') if len(stats_content) < 1: continue logger.info(' Using existing results for ' + label + '... ') report = reporting.get(contigs_fpath) report.add_field( reporting.Fields.KMER_COMPLETENESS, '%.2f' % float(stats_content[0].strip().split(': ')[-1])) if len(stats_content) >= 7: corr_len = int(stats_content[1].strip().split(': ')[-1]) mis_len = int(stats_content[2].strip().split(': ')[-1]) undef_len = int(stats_content[3].strip().split(': ')[-1]) total_len = int(stats_content[4].strip().split(': ')[-1]) translocations = int(stats_content[5].strip().split(': ')[-1]) relocations = int(stats_content[6].strip().split(': ')[-1]) report.add_field(reporting.Fields.KMER_CORR_LENGTH, '%.2f' % (corr_len * 100.0 / total_len)) report.add_field(reporting.Fields.KMER_MIS_LENGTH, '%.2f' % (mis_len * 100.0 / total_len)) report.add_field(reporting.Fields.KMER_UNDEF_LENGTH, '%.2f' % (undef_len * 100.0 / total_len)) report.add_field(reporting.Fields.KMER_TRANSLOCATIONS, translocations) report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations) report.add_field(reporting.Fields.KMER_MISASSEMBLIES, translocations + relocations) checked_assemblies.append(contigs_fpath) contigs_fpaths = [ fpath for fpath in contigs_fpaths if fpath not in checked_assemblies ] if len(contigs_fpaths) == 0: save_kmers(output_dir) logger.info('Done.') return if qconfig.platform_name == 'linux_32': logger.warning(' Sorry, can\'t run KMC on this platform, skipping...') return None kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC', ['kmc', 'kmc_tools'], logger) global kmc_bin_fpath global kmc_tools_fpath kmc_bin_fpath = download_external_tool('kmc', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True) kmc_tools_fpath = download_external_tool('kmc_tools', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True) if not exists(kmc_bin_fpath) or not exists( kmc_tools_fpath) or not compile_minimap(logger): logger.warning(' Sorry, can\'t run KMC, skipping...') return None logger.info(' Running KMC on reference...') if not isdir(output_dir): os.makedirs(output_dir) log_fpath = join(output_dir, 'kmc.log') err_fpath = join(output_dir, 'kmc.err') open(log_fpath, 'w').close() open(err_fpath, 'w').close() tmp_dirpath = join(output_dir, 'tmp') if not isdir(tmp_dirpath): os.makedirs(tmp_dirpath) ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, kmer_len, log_fpath, err_fpath) unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath) if not unique_kmers: logger.warning('KMC failed, check ' + log_fpath + ' and ' + err_fpath + '. 
Skipping...') return logger.info(' Analyzing assemblies completeness...') kmc_out_fpaths = [] for id, contigs_fpath in enumerate(contigs_fpaths): assembly_label = qutils.label_from_fpath(contigs_fpath) logger.info(' ' + qutils.index_to_str(id) + assembly_label) report = reporting.get(contigs_fpath) kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, kmer_len, log_fpath, err_fpath) intersect_out_fpath = intersect_kmers( tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath) matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath) completeness = matched_kmers * 100.0 / unique_kmers report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness) kmc_out_fpaths.append(intersect_out_fpath) logger.info(' Analyzing assemblies correctness...') ref_contigs = [name for name, _ in read_fasta(ref_fpath)] logger.info(' Downsampling k-mers...') ref_kmers, downsampled_kmers_fpath = downsample_kmers( tmp_dirpath, ref_fpath, ref_kmc_out_fpath, kmer_len, log_fpath, err_fpath) for id, (contigs_fpath, kmc_db_fpath) in enumerate(zip(contigs_fpaths, kmc_out_fpaths)): assembly_label = qutils.label_from_fpath(contigs_fpath) logger.info(' ' + qutils.index_to_str(id) + assembly_label) report = reporting.get(contigs_fpath) corr_len = None mis_len = None undef_len = None translocations, relocations = None, None total_len = 0 contig_lens = dict() for name, seq in read_fasta(contigs_fpath): total_len += len(seq) contig_lens[name] = len(seq) if len(ref_contigs) > MAX_REF_CONTIGS_NUM: logger.warning( 'Reference is too fragmented. Scaffolding accuracy will not be assessed.' ) else: corr_len = 0 mis_len = 0 kmers_by_contig, kmers_pos_by_contig = align_kmers( tmp_dirpath, contigs_fpath, downsampled_kmers_fpath, err_fpath, qconfig.max_threads) is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref cyclic_ref_lens = report.get_field( reporting.Fields.REFLEN) if is_cyclic else None translocations = 0 relocations = 0 with open( join( tmp_dirpath, qutils.label_from_fpath_for_fname(contigs_fpath) + '.misjoins.txt'), 'w') as out: for contig in kmers_by_contig.keys(): contig_markers = [] prev_pos, prev_ref_pos, prev_chrom, marker = None, None, None, None for pos, kmer in sorted(zip(kmers_pos_by_contig[contig], kmers_by_contig[contig]), key=lambda x: x[0]): ref_chrom, ref_pos = ref_kmers[kmer] if prev_pos and prev_chrom: if prev_chrom == ref_chrom and abs( abs(pos - prev_pos) / abs(ref_pos - prev_ref_pos) - 1) <= 0.05: marker = (pos, ref_pos, ref_chrom) elif marker: contig_markers.append(marker) pos, ref_pos, ref_chrom, marker = None, None, None, None prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom if marker: contig_markers.append(marker) prev_pos, prev_ref_pos, prev_chrom = None, None, None is_misassembled = False for marker in contig_markers: pos, ref_pos, ref_chrom = marker if prev_pos and prev_chrom: if ref_chrom != prev_chrom: translocations += 1 out.write( 'Translocation in %s: %s %d | %s %d\n' % (contig, prev_chrom, prev_pos, ref_chrom, pos)) is_misassembled = True elif _get_dist_inconstistency( pos, prev_pos, ref_pos, prev_ref_pos, cyclic_ref_lens) > EXT_RELOCATION_SIZE: relocations += 1 out.write( 'Relocation in %s: %d (%d) | %d (%d)\n' % (contig, prev_pos, prev_ref_pos, pos, ref_pos)) is_misassembled = True prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom if is_misassembled: mis_len += contig_lens[contig] elif len(contig_markers) > 0: corr_len += contig_lens[contig] undef_len = total_len - corr_len - mis_len 
        # guard against the fragmented-reference branch above, where these stay None
        report.add_field(reporting.Fields.KMER_CORR_LENGTH, '%.2f' % (corr_len * 100.0 / total_len) if corr_len is not None else None)
        report.add_field(reporting.Fields.KMER_MIS_LENGTH, '%.2f' % (mis_len * 100.0 / total_len) if mis_len is not None else None)
        report.add_field(reporting.Fields.KMER_UNDEF_LENGTH, '%.2f' % (undef_len * 100.0 / total_len) if undef_len is not None else None)
        report.add_field(reporting.Fields.KMER_TRANSLOCATIONS, translocations)
        report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations)
        report.add_field(reporting.Fields.KMER_MISASSEMBLIES, translocations + relocations if translocations is not None else None)
        create_kmc_stats_file(output_dir, contigs_fpath, ref_fpath,
                              report.get_field(reporting.Fields.KMER_COMPLETENESS),
                              corr_len, mis_len, undef_len, total_len, translocations, relocations)
    save_kmers(output_dir)
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
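# --- illustrative sketch (not QUAST code) ---------------------------------------------
# Toy reimplementation of the marker-walk classification used above: consecutive k-mer
# markers of a contig are compared, a change of reference chromosome counts as a
# translocation, and a large disagreement between the distance on the contig and the
# distance on the reference counts as a relocation. The threshold value and the purely
# linear distance check are simplifying assumptions; the real code also handles circular
# references via cyclic_ref_lens.
EXT_RELOCATION_SIZE_SKETCH = 100000  # assumed threshold, stands in for EXT_RELOCATION_SIZE

def classify_misjoins(markers):
    """markers: list of (contig_pos, ref_pos, ref_chrom) sorted by contig_pos."""
    translocations = relocations = 0
    prev_pos = prev_ref_pos = prev_chrom = None
    for pos, ref_pos, ref_chrom in markers:
        if prev_chrom is not None:
            if ref_chrom != prev_chrom:
                translocations += 1
            elif abs(abs(pos - prev_pos) - abs(ref_pos - prev_ref_pos)) > EXT_RELOCATION_SIZE_SKETCH:
                relocations += 1
        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
    return translocations, relocations

# A contig whose middle jumps to another chromosome and whose tail skips ~500 kbp:
print(classify_misjoins([(1000, 5000, 'chr1'), (2000, 6000, 'chr1'),
                         (3000, 900000, 'chr2'), (4000, 1401000, 'chr2')]))  # -> (1, 1)
# ---------------------------------------------------------------------------------------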
def save_result(result, report, fname, ref_fpath, genome_size): region_misassemblies = result['region_misassemblies'] misassemblies_by_ref = result['misassemblies_by_ref'] misassembled_contigs = result['misassembled_contigs'] misassembled_bases = result['misassembled_bases'] misassembly_internal_overlap = result['misassembly_internal_overlap'] unaligned = result['unaligned'] partially_unaligned = result['partially_unaligned'] partially_unaligned_bases = result['partially_unaligned_bases'] fully_unaligned_bases = result['fully_unaligned_bases'] ambiguous_contigs = result['ambiguous_contigs'] ambiguous_contigs_extra_bases = result['ambiguous_contigs_extra_bases'] SNPs = result['SNPs'] indels_list = result['indels_list'] aligned_ref_bases = result['aligned_ref_bases'] aligned_assembly_bases = result['aligned_assembly_bases'] half_unaligned_with_misassembly = result['half_unaligned_with_misassembly'] report.add_field(reporting.Fields.MISLOCAL, region_misassemblies.count(Misassembly.LOCAL)) report.add_field( reporting.Fields.MISASSEMBL, region_misassemblies.count(Misassembly.RELOCATION) + region_misassemblies.count(Misassembly.INVERSION) + region_misassemblies.count(Misassembly.TRANSLOCATION) + region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION)) report.add_field(reporting.Fields.MISCONTIGS, len(misassembled_contigs)) report.add_field(reporting.Fields.MISCONTIGSBASES, misassembled_bases) report.add_field(reporting.Fields.MISINTERNALOVERLAP, misassembly_internal_overlap) if qconfig.bed: report.add_field(reporting.Fields.STRUCT_VARIATIONS, region_misassemblies.count(Misassembly.MATCHED_SV)) if qconfig.large_genome: report.add_field(reporting.Fields.POTENTIAL_MGE, region_misassemblies.count(Misassembly.POTENTIAL_MGE)) report.add_field(reporting.Fields.UNALIGNED, '%d + %d part' % (unaligned, partially_unaligned)) report.add_field(reporting.Fields.UNALIGNEDBASES, (fully_unaligned_bases + partially_unaligned_bases)) report.add_field(reporting.Fields.AMBIGUOUS, ambiguous_contigs) report.add_field(reporting.Fields.AMBIGUOUSEXTRABASES, ambiguous_contigs_extra_bases) report.add_field(reporting.Fields.MISMATCHES, SNPs) # different types of indels: if indels_list is not None: report.add_field(reporting.Fields.INDELS, len(indels_list)) report.add_field(reporting.Fields.INDELSBASES, sum(indels_list)) report.add_field( reporting.Fields.MIS_SHORT_INDELS, len([i for i in indels_list if i <= qconfig.SHORT_INDEL_THRESHOLD])) report.add_field( reporting.Fields.MIS_LONG_INDELS, len([i for i in indels_list if i > qconfig.SHORT_INDEL_THRESHOLD])) if aligned_ref_bases: genome_fraction = aligned_ref_bases * 100.0 / genome_size duplication_ratio = float( aligned_assembly_bases + misassembly_internal_overlap + ambiguous_contigs_extra_bases) / aligned_ref_bases report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction) report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio) report.add_field( reporting.Fields.SUBSERROR, "%.2f" % (float(SNPs) * 100000.0 / float(aligned_assembly_bases))) report.add_field( reporting.Fields.INDELSERROR, "%.2f" % (float(report.get_field(reporting.Fields.INDELS)) * 100000.0 / float(aligned_assembly_bases))) # for misassemblies report: report.add_field( reporting.Fields.MIS_ALL_EXTENSIVE, region_misassemblies.count(Misassembly.RELOCATION) + region_misassemblies.count(Misassembly.INVERSION) + region_misassemblies.count(Misassembly.TRANSLOCATION) + region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION)) 
report.add_field(reporting.Fields.MIS_RELOCATION, region_misassemblies.count(Misassembly.RELOCATION)) report.add_field(reporting.Fields.MIS_TRANSLOCATION, region_misassemblies.count(Misassembly.TRANSLOCATION)) report.add_field(reporting.Fields.MIS_INVERTION, region_misassemblies.count(Misassembly.INVERSION)) report.add_field(reporting.Fields.MIS_EXTENSIVE_CONTIGS, len(misassembled_contigs)) report.add_field(reporting.Fields.MIS_EXTENSIVE_BASES, misassembled_bases) report.add_field(reporting.Fields.MIS_LOCAL, region_misassemblies.count(Misassembly.LOCAL)) # special case for separating contig and scaffold misassemblies report.add_field( reporting.Fields.SCF_MIS_ALL_EXTENSIVE, region_misassemblies.count(Misassembly.SCF_RELOCATION) + region_misassemblies.count(Misassembly.SCF_INVERSION) + region_misassemblies.count(Misassembly.SCF_TRANSLOCATION) + region_misassemblies.count(Misassembly.SCF_INTERSPECTRANSLOCATION)) report.add_field(reporting.Fields.SCF_MIS_RELOCATION, region_misassemblies.count(Misassembly.SCF_RELOCATION)) report.add_field(reporting.Fields.SCF_MIS_TRANSLOCATION, region_misassemblies.count(Misassembly.SCF_TRANSLOCATION)) report.add_field(reporting.Fields.SCF_MIS_INVERTION, region_misassemblies.count(Misassembly.SCF_INVERSION)) report.add_field( reporting.Fields.CTG_MIS_ALL_EXTENSIVE, report.get_field(reporting.Fields.MIS_ALL_EXTENSIVE) - report.get_field(reporting.Fields.SCF_MIS_ALL_EXTENSIVE)) report.add_field( reporting.Fields.CTG_MIS_RELOCATION, region_misassemblies.count(Misassembly.RELOCATION) - region_misassemblies.count(Misassembly.SCF_RELOCATION)) report.add_field( reporting.Fields.CTG_MIS_TRANSLOCATION, region_misassemblies.count(Misassembly.TRANSLOCATION) - region_misassemblies.count(Misassembly.SCF_TRANSLOCATION)) report.add_field( reporting.Fields.CTG_MIS_INVERTION, region_misassemblies.count(Misassembly.INVERSION) - region_misassemblies.count(Misassembly.SCF_INVERSION)) if qconfig.is_combined_ref: report.add_field( reporting.Fields.MIS_ISTRANSLOCATIONS, region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION)) report.add_field( reporting.Fields.SCF_MIS_ISTRANSLOCATIONS, region_misassemblies.count(Misassembly.SCF_INTERSPECTRANSLOCATION)) report.add_field( reporting.Fields.CTG_MIS_ISTRANSLOCATIONS, region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION) - region_misassemblies.count(Misassembly.SCF_INTERSPECTRANSLOCATION)) report.add_field( reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, region_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS)) report.add_field( reporting.Fields.POSSIBLE_MISASSEMBLIES, region_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES)) all_references = sorted( list(set([ref for ref in ref_labels_by_chromosomes.values()]))) for ref_name in all_references: subreport = reporting.get(fname, ref_name=ref_name) ref_misassemblies = misassemblies_by_ref[ref_name] subreport.add_field( reporting.Fields.MIS_ALL_EXTENSIVE, ref_misassemblies.count(Misassembly.RELOCATION) + ref_misassemblies.count(Misassembly.INVERSION) + ref_misassemblies.count(Misassembly.TRANSLOCATION) + ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION)) subreport.add_field( reporting.Fields.MIS_RELOCATION, ref_misassemblies.count(Misassembly.RELOCATION)) subreport.add_field( reporting.Fields.MIS_TRANSLOCATION, ref_misassemblies.count(Misassembly.TRANSLOCATION)) subreport.add_field(reporting.Fields.MIS_INVERTION, ref_misassemblies.count(Misassembly.INVERSION)) subreport.add_field( reporting.Fields.MIS_ISTRANSLOCATIONS, 
ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION)) subreport.add_field(reporting.Fields.MIS_LOCAL, ref_misassemblies.count(Misassembly.LOCAL)) subreport.add_field( reporting.Fields.POSSIBLE_MISASSEMBLIES, ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES)) subreport.add_field( reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS)) if fname not in qconfig.dict_of_broken_scaffolds: subreport.add_field( reporting.Fields.MIS_SCAFFOLDS_GAP, ref_misassemblies.count(Misassembly.SCAFFOLD_GAP)) subreport.add_field( reporting.Fields.MIS_LOCAL_SCAFFOLDS_GAP, ref_misassemblies.count(Misassembly.LOCAL_SCAFFOLD_GAP)) if qconfig.check_for_fragmented_ref: subreport.add_field( reporting.Fields.MIS_FRAGMENTED, ref_misassemblies.count(Misassembly.FRAGMENTED)) elif intergenomic_misassemblies_by_asm: label = qutils.label_from_fpath(fname) ref_name = qutils.name_from_fpath(ref_fpath) ref_misassemblies = intergenomic_misassemblies_by_asm[label][ref_name] report.add_field( reporting.Fields.MIS_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION)) report.add_field( reporting.Fields.POSSIBLE_MISASSEMBLIES, ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES)) report.add_field( reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS, ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS)) if fname not in qconfig.dict_of_broken_scaffolds: report.add_field(reporting.Fields.MIS_SCAFFOLDS_GAP, region_misassemblies.count(Misassembly.SCAFFOLD_GAP)) report.add_field( reporting.Fields.MIS_LOCAL_SCAFFOLDS_GAP, region_misassemblies.count(Misassembly.LOCAL_SCAFFOLD_GAP)) if qconfig.check_for_fragmented_ref: report.add_field(reporting.Fields.MIS_FRAGMENTED, region_misassemblies.count(Misassembly.FRAGMENTED)) # for unaligned report: report.add_field(reporting.Fields.UNALIGNED_FULL_CNTGS, unaligned) report.add_field(reporting.Fields.UNALIGNED_FULL_LENGTH, fully_unaligned_bases) report.add_field(reporting.Fields.UNALIGNED_PART_CNTGS, partially_unaligned) report.add_field(reporting.Fields.UNALIGNED_PART_LENGTH, partially_unaligned_bases) report.add_field(reporting.Fields.UNALIGNED_MISASSEMBLED_CTGS, half_unaligned_with_misassembly) return report
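# --- illustrative sketch (not QUAST code) ---------------------------------------------
# Worked toy example of the summary formulas applied in save_result() above: genome
# fraction, duplication ratio, and per-100-kbp error rates. All numbers are invented.
genome_size = 5000000
aligned_ref_bases = 4600000          # reference positions covered by alignments
aligned_assembly_bases = 4900000     # assembly bases aligned somewhere
misassembly_internal_overlap = 20000
ambiguous_contigs_extra_bases = 30000
snps, indels = 2500, 300

genome_fraction = aligned_ref_bases * 100.0 / genome_size
duplication_ratio = float(aligned_assembly_bases + misassembly_internal_overlap +
                          ambiguous_contigs_extra_bases) / aligned_ref_bases
subs_per_100kbp = snps * 100000.0 / aligned_assembly_bases
indels_per_100kbp = indels * 100000.0 / aligned_assembly_bases

print('%.3f %%  %.3f  %.2f  %.2f' % (genome_fraction, duplication_ratio,
                                     subs_per_100kbp, indels_per_100kbp))
# -> 92.000 %  1.076  51.02  6.12
# ---------------------------------------------------------------------------------------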
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir): logger.print_timestamp() logger.main_info("Running Basic statistics processor...") if not os.path.isdir(output_dirpath): os.mkdir(output_dirpath) reference_length = None reference_lengths = [] reference_fragments = None icarus_gc_fpath = None circos_gc_fpath = None if ref_fpath: reference_lengths = sorted(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(), reverse=True) reference_fragments = len(reference_lengths) reference_length = sum(reference_lengths) reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(ref_fpath) if qconfig.create_icarus_html or qconfig.draw_plots: icarus_gc_fpath = join(output_dirpath, 'gc.icarus.txt') save_icarus_GC(ref_fpath, icarus_gc_fpath) if qconfig.draw_plots: circos_gc_fpath = join(output_dirpath, 'gc.circos.txt') save_circos_GC(ref_fpath, reference_length, circos_gc_fpath) logger.info(' Reference genome:') logger.info(' ' + os.path.basename(ref_fpath) + ', length = ' + str(reference_length) + ', num fragments = ' + str(reference_fragments) + ', GC % = ' + '%.2f' % reference_GC if reference_GC is not None else 'undefined') if reference_fragments > 30 and not qconfig.check_for_fragmented_ref: logger.warning(' Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.' ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).') elif qconfig.estimated_reference_size: reference_length = qconfig.estimated_reference_size reference_lengths = [reference_length] logger.info(' Estimated reference length = ' + str(reference_length)) logger.info(' Contig files: ') lists_of_lengths = [] numbers_of_Ns = [] coverage_dict = dict() cov_pattern = re.compile(r'_cov_(\d+\.?\d*)') for id, contigs_fpath in enumerate(contigs_fpaths): coverage_dict[contigs_fpath] = [] assembly_label = qutils.label_from_fpath(contigs_fpath) logger.info(' ' + qutils.index_to_str(id) + assembly_label) # lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath)) list_of_length = [] number_of_Ns = 0 is_potential_scaffold = False for (name, seq) in fastaparser.read_fasta(contigs_fpath): list_of_length.append(len(seq)) number_of_Ns += seq.count('N') if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq): is_potential_scaffold = True qconfig.potential_scaffolds_assemblies.append(assembly_label) if cov_pattern.findall(name): cov = int(float(cov_pattern.findall(name)[0])) if len(coverage_dict[contigs_fpath]) <= cov: coverage_dict[contigs_fpath] += [0] * (cov - len(coverage_dict[contigs_fpath]) + 1) coverage_dict[contigs_fpath][cov] += len(seq) lists_of_lengths.append(list_of_length) numbers_of_Ns.append(number_of_Ns) lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths] num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths]) multiplicator = 1 if num_contigs >= (qconfig.max_points * 2): import math multiplicator = int(num_contigs / qconfig.max_points) max_points = num_contigs // multiplicator corr_lists_of_lengths = [[sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)]) for i in range(1, max_points) if (i * multiplicator) < len(list_of_length)] for list_of_length in lists_of_lengths] if len(reference_lengths) > 1: reference_lengths = [sum(reference_lengths[((i - 1) * multiplicator):(i * multiplicator)]) if (i * multiplicator) < len(reference_lengths) else sum(reference_lengths[((i - 1) * 
multiplicator):]) for i in range(1, max_points)] + [sum(reference_lengths[(max_points - 1) * multiplicator:])] for num_list in range(len(corr_lists_of_lengths)): last_index = len(corr_lists_of_lengths[num_list]) corr_lists_of_lengths[num_list].append(sum(lists_of_lengths[num_list][last_index * multiplicator:])) else: corr_lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths] if reference_lengths: # Saving for an HTML report if qconfig.html_report: from quast_libs.html_saver import html_saver html_saver.save_reference_lengths(results_dir, reference_lengths) if qconfig.html_report: from quast_libs.html_saver import html_saver html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths) html_saver.save_tick_x(results_dir, multiplicator) ######################################################################## logger.info(' Calculating N50 and L50...') list_of_GC_distributions = [] list_of_GC_contigs_distributions = [] largest_contig = 0 from . import N50 for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)): report = reporting.get(contigs_fpath) n50, l50 = N50.N50_and_L50(lengths_list) ng50, lg50 = None, None if reference_length: ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length) n75, l75 = N50.N50_and_L50(lengths_list, 75) ng75, lg75 = None, None if reference_length: ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75) total_length = sum(lengths_list) total_GC, GC_distribution, GC_contigs_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc) list_of_GC_distributions.append(GC_distribution) list_of_GC_contigs_distributions.append(GC_contigs_distribution) logger.info(' ' + qutils.index_to_str(id) + qutils.label_from_fpath(contigs_fpath) + \ ', N50 = ' + str(n50) + \ ', L50 = ' + str(l50) + \ ', Total length = ' + str(total_length) + \ ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') + \ ', # N\'s per 100 kbp = ' + ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)) if total_length != 0 else 'undefined') report.add_field(reporting.Fields.N50, n50) report.add_field(reporting.Fields.L50, l50) if reference_length and not qconfig.is_combined_ref: report.add_field(reporting.Fields.NG50, ng50) report.add_field(reporting.Fields.LG50, lg50) report.add_field(reporting.Fields.N75, n75) report.add_field(reporting.Fields.L75, l75) if reference_length and not qconfig.is_combined_ref: report.add_field(reporting.Fields.NG75, ng75) report.add_field(reporting.Fields.LG75, lg75) report.add_field(reporting.Fields.CONTIGS, len(lengths_list)) if lengths_list: report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list)) largest_contig = max(largest_contig, max(lengths_list)) report.add_field(reporting.Fields.TOTALLEN, total_length) if not qconfig.is_combined_ref: report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None)) report.add_field(reporting.Fields.UNCALLED, number_of_Ns) report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)))) if ref_fpath: report.add_field(reporting.Fields.REFLEN, int(reference_length)) report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments) if not qconfig.is_combined_ref: report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None)) elif reference_length: report.add_field(reporting.Fields.ESTREFLEN, int(reference_length)) import math qconfig.min_difference = 
math.ceil((largest_contig / 1000) / 600) # divide on height of plot list_of_GC_distributions_with_ref = list_of_GC_distributions reference_index = None if ref_fpath: reference_index = len(list_of_GC_distributions_with_ref) list_of_GC_distributions_with_ref.append(reference_GC_distribution) if qconfig.html_report and not qconfig.no_gc: from quast_libs.html_saver import html_saver html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref, list_of_GC_contigs_distributions, reference_index) ######################################################################## # Drawing Nx and NGx plots... plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'Nx_plot'), 'Nx', []) if reference_length and not qconfig.is_combined_ref: plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'NGx_plot'), 'NGx', [reference_length for i in range(len(contigs_fpaths))]) if qconfig.draw_plots: ########################################################################import plotter # Drawing cumulative plot... plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'cumulative_plot'), 'Cumulative length') if not qconfig.no_gc: ######################################################################## # Drawing GC content plot... plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, join(output_dirpath, 'GC_content_plot')) for contigs_fpath, GC_distribution in zip(contigs_fpaths, list_of_GC_contigs_distributions): plotter.contigs_GC_content_plot(contigs_fpath, GC_distribution, join(output_dirpath, qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot')) if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths): draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath) logger.main_info('Done.') return icarus_gc_fpath, circos_gc_fpath
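# --- illustrative sketch (not QUAST code) ---------------------------------------------
# Minimal stand-in for the Nx/NGx statistics computed via the N50 module above (this is
# not the quast_libs.N50 implementation): N50/L50 use the assembly's own total length as
# the base, while NG50/LG50 use the reference length instead.
def nx_and_lx(sorted_lengths, base_length, x=50):
    """sorted_lengths must be sorted in descending order."""
    threshold = base_length * x / 100.0
    running = 0
    for i, length in enumerate(sorted_lengths, start=1):
        running += length
        if running >= threshold:
            return length, i           # (Nx, Lx)
    return None, None                   # assembly too short to reach the threshold

lengths = sorted([400, 300, 200, 100], reverse=True)   # toy contig lengths, total 1000
print(nx_and_lx(lengths, sum(lengths)))                 # N50, L50              -> (300, 2)
print(nx_and_lx(lengths, 1500))                         # NG50, LG50 (1500 bp ref) -> (200, 3)
# ---------------------------------------------------------------------------------------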
def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath): from quast_libs import reporting ref_reads_stats = None ref_lap_score = None if ref_fpath: ref_name = qutils.name_from_fpath(ref_fpath) stats_fpath = join(output_dir, ref_name + '.stat') if isfile(stats_fpath): ref_reads_stats = parse_reads_stats(stats_fpath) if int(ref_reads_stats['mapped']) == 0: logger.info(' BWA: nothing aligned for reference.') lap_out_fpath = get_safe_fpath(output_dir, ref_name + '.lap.out') if is_non_empty_file(lap_out_fpath): with open(lap_out_fpath) as f: l = f.readline() ref_lap_score = float(l.split()[0]) if l else None # process all contigs files for index, contigs_fpath in enumerate(contigs_fpaths): report = reporting.get(contigs_fpath) assembly_name = qutils.name_from_fpath(contigs_fpath) assembly_label = qutils.label_from_fpath(contigs_fpath) stats_fpath = join(output_dir, assembly_name + '.stat') if ref_reads_stats: report.add_field(reporting.Fields.REF_MAPPED_READS, ref_reads_stats['mapped']) report.add_field(reporting.Fields.REF_MAPPED_READS_PCNT, ref_reads_stats['mapped_pcnt']) report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS, ref_reads_stats['paired']) report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS_PCNT, ref_reads_stats['paired_pcnt']) report.add_field(reporting.Fields.REF_SINGLETONS, ref_reads_stats['singletons']) report.add_field(reporting.Fields.REF_SINGLETONS_PCNT, ref_reads_stats['singletons_pcnt']) report.add_field(reporting.Fields.REF_MISJOINT_READS, ref_reads_stats['misjoint']) report.add_field(reporting.Fields.REF_MISJOINT_READS_PCNT, ref_reads_stats['misjoint_pcnt']) report.add_field(reporting.Fields.REF_DEPTH, ref_reads_stats['depth']) if ref_reads_stats['coverage_thresholds'] and len(ref_reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds): report.add_field(reporting.Fields.REF_COVERAGE__FOR_THRESHOLDS, [ref_reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)]) report.add_field(reporting.Fields.REF_COVERAGE_1X_THRESHOLD, ref_reads_stats['coverage_thresholds'][0]) if not isfile(stats_fpath): continue reads_stats = parse_reads_stats(stats_fpath) report.add_field(reporting.Fields.TOTAL_READS, reads_stats['total']) report.add_field(reporting.Fields.LEFT_READS, reads_stats['left']) report.add_field(reporting.Fields.RIGHT_READS, reads_stats['right']) report.add_field(reporting.Fields.MAPPED_READS, reads_stats['mapped']) report.add_field(reporting.Fields.MAPPED_READS_PCNT, reads_stats['mapped_pcnt']) report.add_field(reporting.Fields.PROPERLY_PAIRED_READS, reads_stats['paired']) report.add_field(reporting.Fields.PROPERLY_PAIRED_READS_PCNT, reads_stats['paired_pcnt']) if int(reads_stats['mapped']) == 0: logger.info(' ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.') report.add_field(reporting.Fields.SINGLETONS, reads_stats['singletons']) report.add_field(reporting.Fields.SINGLETONS_PCNT, reads_stats['singletons_pcnt']) report.add_field(reporting.Fields.MISJOINT_READS, reads_stats['misjoint']) report.add_field(reporting.Fields.MISJOINT_READS_PCNT, reads_stats['misjoint_pcnt']) report.add_field(reporting.Fields.DEPTH, reads_stats['depth']) if reads_stats['coverage_thresholds'] and len(reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds): report.add_field(reporting.Fields.COVERAGE__FOR_THRESHOLDS, [reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)]) 
            report.add_field(reporting.Fields.COVERAGE_1X_THRESHOLD, reads_stats['coverage_thresholds'][0])
        lap_out_fpath = get_safe_fpath(output_dir, assembly_name + '.lap.out')
        if is_non_empty_file(lap_out_fpath):
            with open(lap_out_fpath) as f:
                l = f.readline()
                lap_score = float(l.split()[0]) if l else None
            report.add_field(reporting.Fields.LAP_SCORE, ('%.3f' % lap_score if lap_score is not None else None))
        report.add_field(reporting.Fields.REF_LAP_SCORE, ('%.3f' % ref_lap_score if ref_lap_score is not None else None))
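# --- illustrative sketch (not QUAST code) ---------------------------------------------
# Small sketch of the LAP score handling above: the score is read as the first
# whitespace-separated token on the first line of a '<assembly>.lap.out' file and
# formatted to three decimals, with None when the file is missing or empty.
# The file path used in the call is a made-up example.
import os

def read_lap_score(lap_out_fpath):
    if not os.path.isfile(lap_out_fpath) or os.path.getsize(lap_out_fpath) == 0:
        return None
    with open(lap_out_fpath) as f:
        line = f.readline()
    return float(line.split()[0]) if line.strip() else None

score = read_lap_score('assembly.lap.out')   # hypothetical path
print('%.3f' % score if score is not None else None)
# ---------------------------------------------------------------------------------------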
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None): if not os.path.isdir(output_dir): os.mkdir(output_dir) logger.print_timestamp() logger.main_info('Running Contig analyzer...') success_compilation = compile_aligner(logger) if not success_compilation: logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.') return dict(zip(contigs_fpaths, [AlignerStatus.FAILED] * len(contigs_fpaths))), None num_nf_errors = logger._num_nf_errors create_minimap_output_dir(output_dir) n_jobs = min(len(contigs_fpaths), qconfig.max_threads) threads = max(1, qconfig.max_threads // n_jobs) genome_size, reference_chromosomes, ns_by_chromosomes = get_genome_stats(reference, skip_ns=True) threads = qconfig.max_threads if qconfig.memory_efficient else threads args = [(is_cyclic, i, contigs_fpath, output_dir, reference, reference_chromosomes, ns_by_chromosomes, old_contigs_fpath, bed_fpath, threads) for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths))] statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = run_parallel(align_and_analyze, args, n_jobs) reports = [] aligner_statuses = dict(zip(contigs_fpaths, statuses)) aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths)) misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs)) if AlignerStatus.OK in aligner_statuses.values(): if qconfig.is_combined_ref: save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger) for index, fname in enumerate(contigs_fpaths): report = reporting.get(fname) if statuses[index] == AlignerStatus.OK: reports.append(save_result(results[index], report, fname, reference, genome_size)) elif statuses[index] == AlignerStatus.NOT_ALIGNED: save_result_for_unaligned(results[index], report) if AlignerStatus.OK in aligner_statuses.values(): reporting.save_misassemblies(output_dir) reporting.save_unaligned(output_dir) from . import plotter if qconfig.draw_plots: plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies') if qconfig.draw_plots or qconfig.html_report: misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i]) for i in range(len(contigs_fpaths))) plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths, misassemblies_in_contigs, join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies') oks = list(aligner_statuses.values()).count(AlignerStatus.OK) not_aligned = list(aligner_statuses.values()).count(AlignerStatus.NOT_ALIGNED) failed = list(aligner_statuses.values()).count(AlignerStatus.FAILED) errors = list(aligner_statuses.values()).count(AlignerStatus.ERROR) problems = not_aligned + failed + errors all = len(aligner_statuses) logger._num_nf_errors = num_nf_errors + errors if oks == all: logger.main_info('Done.') if oks < all and problems < all: logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) + '. For the rest, only basic stats are going to be evaluated.') if problems == all: logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.') return aligner_statuses, aligned_lengths_per_fpath
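# --- illustrative sketch (not QUAST code) ---------------------------------------------
# Standalone sketch of the status bookkeeping at the end of do() above: statuses are
# tallied and the closing message depends on how many assemblies aligned successfully.
# AlignerStatus here is a local stand-in enum, not the one used by the real module.
from collections import Counter
from enum import Enum

class AlignerStatus(Enum):
    OK = 0
    NOT_ALIGNED = 1
    FAILED = 2
    ERROR = 3

def summarize(aligner_statuses):
    counts = Counter(aligner_statuses.values())
    oks = counts[AlignerStatus.OK]
    problems = counts[AlignerStatus.NOT_ALIGNED] + counts[AlignerStatus.FAILED] + counts[AlignerStatus.ERROR]
    total = len(aligner_statuses)
    if oks == total:
        return 'Done.'
    if problems == total:
        return 'Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.'
    return 'Done for %d out of %d. For the rest, only basic stats are going to be evaluated.' % (total - problems, total)

print(summarize({'a.fasta': AlignerStatus.OK, 'b.fasta': AlignerStatus.NOT_ALIGNED}))
# ---------------------------------------------------------------------------------------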
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath, aligned_lengths_lists, aligned_stats_dirpath): if not os.path.isdir(aligned_stats_dirpath): os.mkdir(aligned_stats_dirpath) ######################################################################## report_dict = {'header': []} for contigs_fpath in aligned_contigs_fpaths: report_dict[qutils.name_from_fpath(contigs_fpath)] = [] ######################################################################## logger.print_timestamp() logger.main_info('Running NA-NGA calculation...') ref_chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath) reference_length = sum(ref_chr_lengths.values()) assembly_lengths = [] for contigs_fpath in aligned_contigs_fpaths: assembly_lengths.append(sum(fastaparser.get_chr_lengths_from_fastafile(contigs_fpath).values())) for i, (contigs_fpath, lens, assembly_len) in enumerate( zip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)): na50 = N50.NG50(lens, assembly_len) na75 = N50.NG50(lens, assembly_len, 75) la50 = N50.LG50(lens, assembly_len) la75 = N50.LG50(lens, assembly_len, 75) if not qconfig.is_combined_ref: nga50 = N50.NG50(lens, reference_length) nga75 = N50.NG50(lens, reference_length, 75) lga50 = N50.LG50(lens, reference_length) lga75 = N50.LG50(lens, reference_length, 75) logger.info(' ' + qutils.index_to_str(i) + qutils.label_from_fpath(contigs_fpath) + ', Largest alignment = ' + str(max(lens)) + ', NA50 = ' + str(na50) + (', NGA50 = ' + str(nga50) if not qconfig.is_combined_ref and nga50 else '') + ', LA50 = ' + str(la50) + (', LGA50 = ' + str(lga50) if not qconfig.is_combined_ref and lga50 else '')) report = reporting.get(contigs_fpath) report.add_field(reporting.Fields.LARGALIGN, max(lens)) report.add_field(reporting.Fields.TOTAL_ALIGNED_LEN, sum(lens)) report.add_field(reporting.Fields.NA50, na50) report.add_field(reporting.Fields.NA75, na75) report.add_field(reporting.Fields.LA50, la50) report.add_field(reporting.Fields.LA75, la75) if not qconfig.is_combined_ref: report.add_field(reporting.Fields.NGA50, nga50) report.add_field(reporting.Fields.NGA75, nga75) report.add_field(reporting.Fields.LGA50, lga50) report.add_field(reporting.Fields.LGA75, lga75) ######################################################################## num_contigs = max([len(aligned_lengths_lists[i]) for i in range(len(aligned_lengths_lists))]) if json_output_dirpath: from quast_libs.html_saver import json_saver json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths, assembly_lengths) # saving to html if qconfig.html_report: from quast_libs.html_saver import html_saver html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths) if qconfig.draw_plots: # Drawing cumulative plot (aligned contigs)... plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists, os.path.join(aligned_stats_dirpath, 'cumulative_plot'), 'Cumulative length (aligned contigs)') # Drawing NAx and NGAx plots... 
        plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NAx_plot', 'NAx', assembly_lengths,
                        json_output_dir=json_output_dirpath)
        if not qconfig.is_combined_ref:
            plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths, aligned_lengths_lists,
                            aligned_stats_dirpath + '/NGAx_plot', 'NGAx',
                            [reference_length for i in range(len(aligned_contigs_fpaths))],
                            json_output_dir=json_output_dirpath)
    logger.main_info('Done.')
    return report_dict
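# --- illustrative sketch (not QUAST code) ---------------------------------------------
# Brief sketch of the NA50 vs NGA50 distinction used above: both take the list of aligned
# block lengths, but NA50 is computed against the assembly's own length while NGA50 is
# computed against the reference length. The helper is a simplified stand-in for
# quast_libs.N50.NG50; all numbers are invented.
def ng50(lengths, base_length):
    running, threshold = 0, base_length / 2.0
    for length in sorted(lengths, reverse=True):
        running += length
        if running >= threshold:
            return length
    return None

aligned_lens = [350, 250, 150, 50]   # toy aligned blocks, 800 bp in total
assembly_len = 1000                  # toy assembly length (includes unaligned bases)
reference_len = 1400                 # toy reference length
print(ng50(aligned_lens, assembly_len))    # NA50  -> 250
print(ng50(aligned_lens, reference_len))   # NGA50 -> 150
# ---------------------------------------------------------------------------------------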
def do(output_dir, ref_fpath, contigs_fpaths, logger): logger.print_timestamp() logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...') checked_assemblies = [] for contigs_fpath in contigs_fpaths: label = qutils.label_from_fpath_for_fname(contigs_fpath) if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath): kmc_stats_fpath = join(output_dir, label + '.stat') stats_content = open(kmc_stats_fpath).read().split('\n') if len(stats_content) < 1: continue logger.info(' Using existing results for ' + label + '... ') report = reporting.get(contigs_fpath) report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % float(stats_content[0].strip().split(': ')[-1])) if len(stats_content) >= 5: len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1]) len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1]) len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1]) total_len = int(stats_content[4].strip().split(': ')[-1]) report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len)) report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len)) report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len)) checked_assemblies.append(contigs_fpath) contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies] if len(contigs_fpaths) == 0: logger.info('Done.') return if qconfig.platform_name == 'linux_32': logger.warning(' Sorry, can\'t run KMC on this platform, skipping...') return None kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC', ['kmc', 'kmc_tools'], logger) global kmc_bin_fpath global kmc_tools_fpath kmc_bin_fpath = download_external_tool('kmc', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True) kmc_tools_fpath = download_external_tool('kmc_tools', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True) if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath) or not compile_minimap(logger): logger.warning(' Sorry, can\'t run KMC, skipping...') return None logger.info('Running KMC on reference...') log_fpath = join(output_dir, 'kmc.log') err_fpath = join(output_dir, 'kmc.err') open(log_fpath, 'w').close() open(err_fpath, 'w').close() tmp_dirpath = join(output_dir, 'tmp') if not isdir(tmp_dirpath): os.makedirs(tmp_dirpath) ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath) unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath) if not unique_kmers: return logger.info('Analyzing assemblies completeness...') kmc_out_fpaths = [] for contigs_fpath in contigs_fpaths: report = reporting.get(contigs_fpath) kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath) intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath) matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath) completeness = matched_kmers * 100.0 / unique_kmers report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness) kmc_out_fpaths.append(intersect_out_fpath) logger.info('Analyzing assemblies accuracy...') if len(kmc_out_fpaths) > 1: shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath) else: shared_kmc_db = kmc_out_fpaths[0] kmer_fraction = 0.001 ref_contigs = [name for name, _ in read_fasta(ref_fpath)] ref_kmc_dbs = [] if len(ref_contigs) 
<= MAX_REF_CONTIGS_NUM: shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, ref_fpath, shared_kmc_db, log_fpath, err_fpath, kmer_fraction=kmer_fraction) for name, seq in read_fasta(ref_fpath): seq_kmc_db = seq_to_kmc_db(tmp_dirpath, log_fpath, err_fpath, seq=seq, name=name, is_ref=True, intersect_with=shared_downsampled_kmc_db) ref_kmc_dbs.append((name, seq_kmc_db)) for contigs_fpath in contigs_fpaths: report = reporting.get(contigs_fpath) len_map_to_one_chrom = None len_map_to_multi_chrom = None len_map_to_none_chrom = None total_len = 0 long_contigs = [] contig_lens = dict() contig_markers = defaultdict(list) label = qutils.label_from_fpath_for_fname(contigs_fpath) list_files_fpath = join(tmp_dirpath, label + '_files.txt') with open(list_files_fpath, 'w') as list_files: for name, seq in read_fasta(contigs_fpath): total_len += len(seq) contig_lens[name] = len(seq) if len(seq) >= MIN_CONTIGS_LEN: long_contigs.append(len(seq)) tmp_contig_fpath = join(tmp_dirpath, name + '.fasta') with open(tmp_contig_fpath, 'w') as out_f: out_f.write('>%s\n' % name) out_f.write('%s\n' % seq) list_files.write(tmp_contig_fpath + '\n') if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5: logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.') elif len(ref_contigs) > MAX_REF_CONTIGS_NUM: logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.') else: len_map_to_one_chrom = 0 len_map_to_multi_chrom = 0 filtered_fpath = join(tmp_dirpath, label + '.filtered.fasta') filter_contigs(list_files_fpath, filtered_fpath, shared_kmc_db, log_fpath, err_fpath, min_kmers=MIN_MARKERS) filtered_list_files_fpath = join(tmp_dirpath, label + '_files.filtered.txt') with open(filtered_list_files_fpath, 'w') as list_files: for name, _ in read_fasta(filtered_fpath): tmp_contig_fpath = join(tmp_dirpath, name + '.fasta') list_files.write(tmp_contig_fpath + '\n') for ref_name, ref_kmc_db in ref_kmc_dbs: tmp_filtered_fpath = join(tmp_dirpath, ref_name + '.filtered.fasta') filter_contigs(filtered_list_files_fpath, tmp_filtered_fpath, ref_kmc_db, log_fpath, err_fpath, min_kmers=MIN_MISJOIN_MARKERS) if exists(tmp_filtered_fpath): for name, _ in read_fasta(tmp_filtered_fpath): contig_markers[name].append(ref_name) for name, chr_markers in contig_markers.items(): if len(chr_markers) == 1: len_map_to_one_chrom += contig_lens[name] else: len_map_to_multi_chrom += contig_lens[name] len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len)) report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len)) report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len)) create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath, report.get_field(reporting.Fields.KMER_COMPLETENESS), len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len) if not qconfig.debug: shutil.rmtree(tmp_dirpath) logger.info('Done.')
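# --- illustrative sketch (not QUAST code) ---------------------------------------------
# Toy sketch of the scaffold placement bookkeeping above: each contig accumulates the
# reference chromosomes whose marker k-mers it matched, and its length is then attributed
# to the "one chromosome", "multiple chromosomes", or "no chromosome" buckets.
# All data below are invented.
from collections import defaultdict

contig_lens = {'scaffold_1': 120000, 'scaffold_2': 80000, 'scaffold_3': 30000}
contig_markers = defaultdict(list, {
    'scaffold_1': ['chr1'],            # markers from a single chromosome
    'scaffold_2': ['chr1', 'chr2'],    # markers from two chromosomes -> possible misjoin
})                                     # scaffold_3 matched no markers at all

len_one = sum(contig_lens[c] for c, chroms in contig_markers.items() if len(chroms) == 1)
len_multi = sum(contig_lens[c] for c, chroms in contig_markers.items() if len(chroms) > 1)
total_len = sum(contig_lens.values())
len_none = total_len - len_one - len_multi

for label, value in [('one chrom', len_one), ('multi chrom', len_multi), ('none', len_none)]:
    print('%s: %.2f%%' % (label, value * 100.0 / total_len))
# ---------------------------------------------------------------------------------------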
def do(contigs_fpaths, output_dir, logger): logger.print_timestamp() logger.info('Running BUSCO...') compilation_success = True augustus_dirpath = download_augustus(logger) if not augustus_dirpath: compilation_success = False elif not compile_tool('Augustus', augustus_dirpath, [join('bin', 'augustus')], logger=logger): compilation_success = False if compilation_success and not download_blast_binaries( logger=logger, filenames=blast_filenames): compilation_success = False if not compilation_success: logger.info('Failed finding conservative genes.') return if not os.path.isdir(output_dir): os.makedirs(output_dir) tmp_dir = join(output_dir, 'tmp') if not os.path.isdir(tmp_dir): os.makedirs(tmp_dir) n_jobs = min(len(contigs_fpaths), qconfig.max_threads) busco_threads = max(1, qconfig.max_threads // n_jobs) clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote, is_fungus=qconfig.is_fungus) if not clade_dirpath: logger.info('Failed finding conservative genes.') return config_fpath = make_config(output_dir, tmp_dir, busco_threads, clade_dirpath, augustus_dirpath) logger.info('Logs and results will be saved under ' + output_dir + '...') os.environ['BUSCO_CONFIG_FILE'] = config_fpath os.environ['AUGUSTUS_CONFIG_PATH'] = copy_augustus_contigs( augustus_dirpath, tmp_dir) if not os.environ['AUGUSTUS_CONFIG_PATH']: logger.error( 'Augustus configs not found, failed to run BUSCO without them.') busco_args = [[ contigs_fpath, qutils.label_from_fpath_for_fname(contigs_fpath) ] for contigs_fpath in contigs_fpaths] summary_fpaths = run_parallel(busco_main_handler, busco_args, qconfig.max_threads) if not any(fpath for fpath in summary_fpaths): logger.error( 'Failed running BUSCO for all the assemblies. See log files in ' + output_dir + ' for information.') return # saving results for i, contigs_fpath in enumerate(contigs_fpaths): report = reporting.get(contigs_fpath) if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]): total_buscos, part_buscos, complete_buscos = 0, 0, 0 with open(summary_fpaths[i]) as f: for line in f: if 'Complete BUSCOs' in line: complete_buscos = int(line.split()[0]) elif 'Fragmented' in line: part_buscos = int(line.split()[0]) elif 'Total' in line: total_buscos = int(line.split()[0]) if total_buscos != 0: report.add_field( reporting.Fields.BUSCO_COMPLETE, ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos))) report.add_field(reporting.Fields.BUSCO_PART, ('%.2f' % (float(part_buscos) * 100.0 / total_buscos))) shutil.copy(summary_fpaths[i], output_dir) else: logger.error('Failed running BUSCO for ' + contigs_fpath + '. See the log for detailed information.') if not qconfig.debug: cleanup(output_dir) logger.info('Done.')
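# --- illustrative sketch (not QUAST code) ---------------------------------------------
# Sketch of the short-summary parsing done above: the 'Complete BUSCOs', 'Fragmented' and
# 'Total' lines each start with a count, and the report stores complete/fragmented as
# percentages of the total. The summary lines below are a shortened, made-up example of
# the BUSCO short_summary layout.
def parse_busco_summary(lines):
    total = part = complete = 0
    for line in lines:
        if 'Complete BUSCOs' in line:
            complete = int(line.split()[0])
        elif 'Fragmented' in line:
            part = int(line.split()[0])
        elif 'Total' in line:
            total = int(line.split()[0])
    if total == 0:
        return None, None
    return complete * 100.0 / total, part * 100.0 / total

summary = ['    120   Complete BUSCOs (C)',
           '    10    Fragmented BUSCOs (F)',
           '    148   Total BUSCO groups searched']
complete_pct, part_pct = parse_busco_summary(summary)
print('%.2f %.2f' % (complete_pct, part_pct))   # -> 81.08 6.76
# ---------------------------------------------------------------------------------------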
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None): if not os.path.isdir(output_dir): os.mkdir(output_dir) logger.print_timestamp() logger.main_info('Running Contig analyzer...') num_nf_errors = logger._num_nf_errors success_compilation = compile_aligner(logger) if qconfig.test and is_emem_aligner(): success_compilation = check_emem_functionality(logger) if not success_compilation: logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.') return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None create_nucmer_output_dir(output_dir) n_jobs = min(len(contigs_fpaths), qconfig.max_threads) if qconfig.memory_efficient: threads = 1 else: threads = max(1, qconfig.max_threads // n_jobs) if is_python2(): from joblib import Parallel, delayed else: from joblib3 import Parallel, delayed if not qconfig.splitted_ref: statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)( is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads) for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths))) else: if len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient: statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)( is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads) for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths))) else: statuses_results_lengths_tuples = [] for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)): statuses_results_lengths_tuples.append(align_and_analyze( is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, parallel_by_chr=True, threads=qconfig.max_threads)) # unzipping statuses, results, aligned_lengths = [x[0] for x in statuses_results_lengths_tuples], \ [x[1] for x in statuses_results_lengths_tuples], \ [x[2] for x in statuses_results_lengths_tuples] reports = [] for index, fname in enumerate(contigs_fpaths): report = reporting.get(fname) if statuses[index] == NucmerStatus.OK: reports.append(save_result(results[index], report, fname)) elif statuses[index] == NucmerStatus.NOT_ALIGNED: save_result_for_unaligned(results[index], report) nucmer_statuses = dict(zip(contigs_fpaths, statuses)) aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths)) if NucmerStatus.OK in nucmer_statuses.values(): reporting.save_misassemblies(output_dir) reporting.save_unaligned(output_dir) if qconfig.draw_plots: from . import plotter plotter.draw_misassembl_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies') if qconfig.is_combined_ref: save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger) oks = list(nucmer_statuses.values()).count(NucmerStatus.OK) not_aligned = list(nucmer_statuses.values()).count(NucmerStatus.NOT_ALIGNED) failed = list(nucmer_statuses.values()).count(NucmerStatus.FAILED) errors = list(nucmer_statuses.values()).count(NucmerStatus.ERROR) problems = not_aligned + failed + errors all = len(nucmer_statuses) logger._num_nf_errors = num_nf_errors + errors if oks == all: logger.main_info('Done.') if oks < all and problems < all: logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) + '. 
For the rest, only basic stats are going to be evaluated.') if problems == all: logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.') if not qconfig.test and is_emem_aligner(): logger.warning('Please rerun QUAST using --test option to ensure that E-MEM aligner works properly.') return nucmer_statuses, aligned_lengths_per_fpath
def main(args): check_dirpath( qconfig.QUAST_HOME, "You are trying to run it from " + str(qconfig.QUAST_HOME) + "\n." + "Please, put QUAST in a different directory, then try again.\n", exit_code=3, ) if not args: qconfig.usage() sys.exit(0) try: import imp imp.reload(qconfig) except: reload(qconfig) try: locale.setlocale(locale.LC_ALL, "en_US.utf8") except Exception: try: locale.setlocale(locale.LC_ALL, "en_US.UTF-8") except Exception: logger.warning("Python locale settings can't be changed") quast_path = [os.path.realpath(__file__)] quast_py_args, contigs_fpaths = parse_options(logger, quast_path + args) output_dirpath, ref_fpath, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname) logger.main_info() logger.print_params() ######################################################################## from quast_libs import reporting reports = reporting.reports try: import imp imp.reload(reporting) except: reload(reporting) reporting.reports = reports reporting.assembly_fpaths = [] from quast_libs import plotter # Do not remove this line! It would lead to a warning in matplotlib. if qconfig.is_combined_ref: corrected_dirpath = os.path.join(output_dirpath, "..", qconfig.corrected_dirname) else: if os.path.isdir(corrected_dirpath): shutil.rmtree(corrected_dirpath) os.mkdir(corrected_dirpath) qconfig.set_max_threads(logger) # PROCESSING REFERENCE if ref_fpath: logger.main_info() logger.main_info("Reference:") ref_fpath = qutils.correct_reference(ref_fpath, corrected_dirpath) else: ref_fpath = "" # PROCESSING CONTIGS logger.main_info() logger.main_info("Contigs:") contigs_fpaths, old_contigs_fpaths = qutils.correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting) for contigs_fpath in contigs_fpaths: report = reporting.get(contigs_fpath) report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath)) qconfig.assemblies_num = len(contigs_fpaths) reads_fpaths = [] cov_fpath = qconfig.cov_fpath physical_cov_fpath = qconfig.phys_cov_fpath if qconfig.forward_reads: reads_fpaths.append(qconfig.forward_reads) if qconfig.reverse_reads: reads_fpaths.append(qconfig.reverse_reads) if (reads_fpaths or qconfig.sam or qconfig.bam) and ref_fpath: bed_fpath, cov_fpath, physical_cov_fpath = reads_analyzer.do( ref_fpath, contigs_fpaths, reads_fpaths, None, os.path.join(output_dirpath, qconfig.variation_dirname), external_logger=logger, sam_fpath=qconfig.sam, bam_fpath=qconfig.bam, bed_fpath=qconfig.bed, ) qconfig.bed = bed_fpath if not contigs_fpaths: logger.error( "None of the assembly files contains correct contigs. 
" "Please, provide different files or decrease --min-contig threshold.", fake_if_nested_run=True, ) return 4 if qconfig.used_colors and qconfig.used_ls: for i, label in enumerate(labels): plotter.dict_color_and_ls[label] = (qconfig.used_colors[i], qconfig.used_ls[i]) qconfig.assemblies_fpaths = contigs_fpaths # Where all pdfs will be saved all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname) all_pdf_file = None if qconfig.draw_plots and plotter.can_draw_plots: try: from matplotlib.backends.backend_pdf import PdfPages all_pdf_file = PdfPages(all_pdf_fpath) except: all_pdf_file = None if qconfig.json_output_dirpath: from quast_libs.html_saver import json_saver if json_saver.simplejson_error: json_output_dirpath = None ######################################################################## ### Stats and plots ######################################################################## from quast_libs import basic_stats basic_stats.do( ref_fpath, contigs_fpaths, os.path.join(output_dirpath, "basic_stats"), qconfig.json_output_dirpath, output_dirpath, ) aligned_contigs_fpaths = [] aligned_lengths_lists = [] contig_alignment_plot_fpath = None icarus_html_fpath = None if ref_fpath: ######################################################################## ### former PLANTAKOLYA, PLANTAGORA ######################################################################## from quast_libs import contigs_analyzer is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do( ref_fpath, contigs_fpaths, is_cyclic, os.path.join(output_dirpath, "contigs_reports"), old_contigs_fpaths, qconfig.bed, ) for contigs_fpath in contigs_fpaths: if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK: aligned_contigs_fpaths.append(contigs_fpath) aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath]) # Before continue evaluating, check if nucmer didn't skip all of the contigs files. 
detailed_contigs_reports_dirpath = None features_containers = None if len(aligned_contigs_fpaths) and ref_fpath: detailed_contigs_reports_dirpath = os.path.join(output_dirpath, "contigs_reports") ######################################################################## ### NAx and NGAx ("aligned Nx and NGx") ######################################################################## from quast_libs import aligned_stats aligned_stats.do( ref_fpath, aligned_contigs_fpaths, output_dirpath, qconfig.json_output_dirpath, aligned_lengths_lists, os.path.join(output_dirpath, "aligned_stats"), ) ######################################################################## ### GENOME_ANALYZER ######################################################################## from quast_libs import genome_analyzer features_containers = genome_analyzer.do( ref_fpath, aligned_contigs_fpaths, output_dirpath, qconfig.json_output_dirpath, qconfig.genes, qconfig.operons, detailed_contigs_reports_dirpath, os.path.join(output_dirpath, "genome_stats"), ) if qconfig.with_gage: ######################################################################## ### GAGE ######################################################################## if not ref_fpath: logger.warning("GAGE can't be run without a reference and will be skipped.") else: from quast_libs import gage gage.do(ref_fpath, contigs_fpaths, output_dirpath) genes_by_labels = None if qconfig.gene_finding: if qconfig.glimmer: ######################################################################## ### Glimmer ######################################################################## from quast_libs import glimmer genes_by_labels = glimmer.do( contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, "predicted_genes") ) else: ######################################################################## ### GeneMark ######################################################################## from quast_libs import genemark genes_by_labels = genemark.do( contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, "predicted_genes"), qconfig.prokaryote, qconfig.meta, ) else: logger.main_info("") logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.") ######################################################################## reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath) ######################################################################## ### LARGE DRAWING TASKS ######################################################################## if qconfig.draw_plots or qconfig.create_icarus_html: logger.print_timestamp() logger.main_info("Creating large visual summaries...") logger.main_info("This may take a while: press Ctrl-C to skip this step..") try: if detailed_contigs_reports_dirpath: report_for_icarus_fpath_pattern = os.path.join( detailed_contigs_reports_dirpath, qconfig.icarus_report_fname_pattern ) stdout_pattern = os.path.join(detailed_contigs_reports_dirpath, qconfig.contig_report_fname_pattern) else: report_for_icarus_fpath_pattern = None stdout_pattern = None draw_alignment_plots = qconfig.draw_svg or qconfig.create_icarus_html number_of_steps = sum([int(bool(value)) for value in [draw_alignment_plots, all_pdf_file]]) if draw_alignment_plots: ######################################################################## ### VISUALIZE CONTIG ALIGNMENT ######################################################################## logger.main_info(" 1 of %d: Creating Icarus viewers..." 
% number_of_steps) from quast_libs import icarus icarus_html_fpath, contig_alignment_plot_fpath = icarus.do( contigs_fpaths, report_for_icarus_fpath_pattern, output_dirpath, ref_fpath, stdout_pattern=stdout_pattern, features=features_containers, cov_fpath=cov_fpath, physical_cov_fpath=physical_cov_fpath, json_output_dir=qconfig.json_output_dirpath, genes_by_labels=genes_by_labels, ) if all_pdf_file: # full report in PDF format: all tables and plots logger.main_info( " %d of %d: Creating PDF with all tables and plots..." % (number_of_steps, number_of_steps) ) plotter.fill_all_pdf_file(all_pdf_file) logger.main_info("Done") except KeyboardInterrupt: logger.main_info("..step skipped!") os.remove(all_pdf_fpath) ######################################################################## ### TOTAL REPORT ######################################################################## logger.print_timestamp() logger.main_info("RESULTS:") logger.main_info(" Text versions of total report are saved to " + reports_fpaths) logger.main_info(" Text versions of transposed total report are saved to " + transposed_reports_fpaths) if qconfig.json_output_dirpath: json_saver.save_total_report(qconfig.json_output_dirpath, qconfig.min_contig, ref_fpath) if qconfig.html_report: from quast_libs.html_saver import html_saver html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls) html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath) if os.path.isfile(all_pdf_fpath): logger.main_info(" PDF version (tables and plots) is saved to " + all_pdf_fpath) if icarus_html_fpath: logger.main_info(" Icarus (contig browser) is saved to %s" % icarus_html_fpath) if qconfig.draw_svg and contig_alignment_plot_fpath: logger.main_info(" Contig alignment plot is saved to %s" % contig_alignment_plot_fpath) cleanup(corrected_dirpath) return logger.finish_up(check_test=qconfig.test)
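# --- Illustrative sketch (not part of QUAST): the PdfPages pattern used above. ---
# The PDF step opens a matplotlib.backends.backend_pdf.PdfPages object and the plotter
# appends one figure per page before closing it. A minimal standalone version, assuming
# matplotlib is installed; the plot contents and titles here are made up for illustration.
def _example_save_all_plots_to_pdf(pdf_fpath):
    import matplotlib
    matplotlib.use('Agg')  # render without a display
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    pdf = PdfPages(pdf_fpath)
    for title in ['Cumulative length', 'GC content']:  # hypothetical plot titles
        fig = plt.figure()
        plt.plot([1, 2, 3], [3, 2, 1])
        plt.title(title)
        pdf.savefig(fig)  # one page per figure
        plt.close(fig)
    pdf.close()  # the PDF file is finalized on close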
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique 101-mers...')
    addsitedir(jellyfish_python_dirpath)
    try:
        compile_jellyfish(logger)
        import jellyfish
        try:
            import imp
            imp.reload(jellyfish)
        except:
            reload(jellyfish)
        jellyfish.MerDNA.k(KMERS_LEN)
    except:
        logger.warning('Failed unique 101-mers analysis.')
        return

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_jf_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            jf_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(jf_stats_fpath).read().split('\n')
            if len(stats_content) < 4:
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS,
                             '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                             '%.2f' % float(stats_content[1].strip().split(': ')[-1]))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                             '%.2f' % float(stats_content[2].strip().split(': ')[-1]))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                             '%.2f' % float(stats_content[3].strip().split(': ')[-1]))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    logger.info('Running Jellyfish on reference...')
    jf_out_fpath = join(output_dir, basename(ref_fpath) + '.jf')
    qutils.call_subprocess([jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s', str(getsize(ref_fpath)),
                            '-o', jf_out_fpath, '-t', str(qconfig.max_threads), ref_fpath])
    ref_kmers = jellyfish.ReadMerFile(jf_out_fpath)
    os.remove(jf_out_fpath)

    logger.info('Running Jellyfish on assemblies...')
    contigs_kmers = []
    for contigs_fpath in contigs_fpaths:
        jf_out_fpath = join(output_dir, basename(contigs_fpath) + '.jf')
        qutils.call_subprocess([jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s', str(getsize(contigs_fpath)),
                                '-o', jf_out_fpath, '-t', str(qconfig.max_threads), contigs_fpath])
        contigs_kmers.append(jellyfish.QueryMerFile(jf_out_fpath))
        os.remove(jf_out_fpath)

    logger.info('Analyzing completeness and accuracy of assemblies...')
    unique_kmers = 0
    matched_kmers = defaultdict(int)
    shared_kmers = set()
    kmer_i = 0
    for kmer, count in ref_kmers:
        unique_kmers += 1
        matches = 0
        for idx in range(len(contigs_fpaths)):
            if contigs_kmers[idx][kmer]:
                matched_kmers[idx] += 1
                matches += 1
        if matches == len(contigs_fpaths):
            if kmer_i % 100 == 0:
                shared_kmers.add(str(kmer))
            kmer_i += 1

    for idx, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        completeness = matched_kmers[idx] * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)

    shared_kmers_by_chrom = dict()
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    for name, seq in ref_contigs.items():
        seq_kmers = jellyfish.string_mers(seq)
        for kmer in seq_kmers:
            if str(kmer) in shared_kmers:
                shared_kmers_by_chrom[str(kmer)] = name

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = 0
        len_map_to_multi_chrom = 0
        total_len = 0
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            seq_kmers = jellyfish.string_mers(seq)
            chrom_markers = []
            for kmer in seq_kmers:
                kmer_str = str(kmer)
                if kmer_str in shared_kmers_by_chrom:
                    chrom = shared_kmers_by_chrom[kmer_str]
                    chrom_markers.append(chrom)
            if len(chrom_markers) < MIN_MARKERS:
                continue
            if len(set(chrom_markers)) == 1:
                len_map_to_one_chrom += len(seq)
            else:
                len_map_to_multi_chrom += len(seq)
        len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
        create_jf_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                             report.get_field(reporting.Fields.KMER_COMPLETENESS),
                             len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom)

    logger.info('Done.')
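# --- Illustrative sketch (not part of QUAST): the k-mer completeness idea above, ---
# computed with plain Python sets instead of Jellyfish. Helper name and inputs are
# hypothetical; like the '-U 1' option above, only k-mers that occur exactly once in
# the reference are counted, and reverse complements are ignored here for brevity.
def _example_kmer_completeness(ref_seq, asm_seq, k=101):
    from collections import Counter
    ref_counts = Counter(ref_seq[i:i + k] for i in range(len(ref_seq) - k + 1))
    ref_unique = set(kmer for kmer, cnt in ref_counts.items() if cnt == 1)  # unique in reference
    asm_kmers = set(asm_seq[i:i + k] for i in range(len(asm_seq) - k + 1))
    if not ref_unique:
        return 0.0
    return 100.0 * len(ref_unique & asm_kmers) / len(ref_unique)  # percentage, as in the report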
def main(args):
    check_dirpath(qconfig.QUAST_HOME, 'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n.' +
                  'Please, put QUAST in a different directory, then try again.\n', exit_code=3)

    if not args:
        qconfig.usage(stream=sys.stderr)
        sys.exit(1)

    try:
        import imp
        imp.reload(qconfig)
        imp.reload(qutils)
    except:
        reload(qconfig)
        reload(qutils)

    try:
        locale.setlocale(locale.LC_ALL, 'en_US.utf8')
    except Exception:
        try:
            locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
        except Exception:
            logger.warning('Python locale settings can\'t be changed')

    quast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, quast_path + args)
    output_dirpath, ref_fpath, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
    logger.main_info()
    logger.print_params()

    ########################################################################
    from quast_libs import reporting
    reports = reporting.reports
    try:
        import imp
        imp.reload(reporting)
    except:
        reload(reporting)
    reporting.reports = reports
    reporting.assembly_fpaths = []
    from quast_libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, '..', qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    qconfig.set_max_threads(logger)
    check_reads_fpaths(logger)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        original_ref_fpath = ref_fpath
        ref_fpath = qutils.correct_reference(ref_fpath, corrected_dirpath)
        if qconfig.ideal_assembly:
            ideal_assembly_fpath = ideal_assembly.do(ref_fpath, original_ref_fpath,
                                                     os.path.join(output_dirpath, qconfig.ideal_assembly_basename))
            if ideal_assembly_fpath is not None:
                contigs_fpaths.insert(0, ideal_assembly_fpath)
                labels.insert(0, 'IDEAL ASSEMBLY')
                labels = qutils.process_labels(contigs_fpaths, labels)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    contigs_fpaths, old_contigs_fpaths = qutils.correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    cov_fpath = qconfig.cov_fpath
    physical_cov_fpath = qconfig.phys_cov_fpath
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam or qconfig.sam_fpaths or qconfig.bam_fpaths:
        bed_fpath, cov_fpath, physical_cov_fpath = reads_analyzer.do(
            ref_fpath, contigs_fpaths, os.path.join(output_dirpath, qconfig.reads_stats_dirname),
            external_logger=logger)
        qconfig.bed = bed_fpath

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.",
                     fake_if_nested_run=True)
        return 4

    if qconfig.used_colors and qconfig.used_ls:
        for i, label in enumerate(labels):
            plotter_data.dict_color_and_ls[label] = (qconfig.used_colors[i], qconfig.used_ls[i])

    qconfig.assemblies_fpaths = contigs_fpaths

    # Where all pdfs will be saved
    all_pdf_fpath = None
    if qconfig.draw_plots and plotter.can_draw_plots:
        all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)

    if qconfig.json_output_dirpath:
        from quast_libs.html_saver import json_saver
        if json_saver.simplejson_error:
            qconfig.json_output_dirpath = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from quast_libs import basic_stats
    icarus_gc_fpath, circos_gc_fpath = basic_stats.do(ref_fpath, contigs_fpaths,
                                                      os.path.join(output_dirpath, 'basic_stats'), output_dirpath)

    if qconfig.large_genome and ref_fpath:
        unique_kmers.do(os.path.join(output_dirpath, 'basic_stats'), ref_fpath, contigs_fpaths, logger)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    icarus_html_fpath = None
    circos_png_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from quast_libs import contigs_analyzer
        is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, is_cyclic, os.path.join(output_dirpath, 'contigs_reports'),
            old_contigs_fpaths, qconfig.bed)
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    features_containers = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from quast_libs import aligned_stats
        aligned_stats.do(ref_fpath, aligned_contigs_fpaths, output_dirpath,
                         aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from quast_libs import genome_analyzer
        features_containers = genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, qconfig.genes, qconfig.operons,
            detailed_contigs_reports_dirpath, os.path.join(output_dirpath, 'genome_stats'))

    genes_by_labels = None
    if qconfig.gene_finding:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from quast_libs import glimmer
            genes_by_labels = glimmer.do(contigs_fpaths, qconfig.genes_lengths,
                                         os.path.join(output_dirpath, 'predicted_genes'))
        if not qconfig.glimmer or qconfig.test:
            ########################################################################
            ### GeneMark
            ########################################################################
            from quast_libs import genemark
            genes_by_labels = genemark.do(contigs_fpaths, qconfig.genes_lengths,
                                          os.path.join(output_dirpath, 'predicted_genes'),
                                          qconfig.prokaryote, qconfig.metagenemark)
    else:
        logger.main_info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")

    if qconfig.rna_gene_finding:
        run_barrnap.do(contigs_fpaths, os.path.join(output_dirpath, 'predicted_genes'), logger)

    if qconfig.run_busco and not qconfig.is_combined_ref:
        if qconfig.platform_name == 'macosx':
            logger.main_info("")
            logger.warning("BUSCO can be run on Linux only")
        elif sys.version[0:3] == '2.5':
            logger.main_info("")
            logger.warning("BUSCO does not support Python versions older than 2.6.")
        else:
            from quast_libs import run_busco
            run_busco.do(contigs_fpaths, os.path.join(output_dirpath, qconfig.busco_dirname), logger)

    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots or qconfig.create_icarus_html:
        logger.print_timestamp()
        logger.main_info('Creating large visual summaries...')
        logger.main_info('This may take a while: press Ctrl-C to skip this step..')
        try:
            if detailed_contigs_reports_dirpath:
                report_for_icarus_fpath_pattern = os.path.join(detailed_contigs_reports_dirpath,
                                                               qconfig.icarus_report_fname_pattern)
                stdout_pattern = os.path.join(detailed_contigs_reports_dirpath, qconfig.contig_report_fname_pattern)
            else:
                report_for_icarus_fpath_pattern = None
                stdout_pattern = None
            draw_alignment_plots = qconfig.draw_svg or qconfig.create_icarus_html
            draw_circos_plot = qconfig.draw_plots and ref_fpath and len(aligned_contigs_fpaths) and not qconfig.space_efficient
            number_of_steps = sum([int(bool(value)) for value in [draw_alignment_plots, draw_circos_plot, all_pdf_fpath]])
            if draw_alignment_plots:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info(' 1 of %d: Creating Icarus viewers...' % number_of_steps)
                from quast_libs import icarus
                icarus_html_fpath, contig_alignment_plot_fpath = icarus.do(
                    contigs_fpaths, report_for_icarus_fpath_pattern, output_dirpath, ref_fpath,
                    stdout_pattern=stdout_pattern, features=features_containers, cov_fpath=cov_fpath,
                    physical_cov_fpath=physical_cov_fpath, gc_fpath=icarus_gc_fpath,
                    json_output_dir=qconfig.json_output_dirpath, genes_by_labels=genes_by_labels)

            if draw_circos_plot:
                logger.main_info(' %d of %d: Creating Circos plots...' %
                                 (2 if draw_alignment_plots else 1, number_of_steps))
                from quast_libs import circos
                circos_png_fpath, circos_legend_fpath = circos.do(ref_fpath, contigs_fpaths,
                                                                  report_for_icarus_fpath_pattern, circos_gc_fpath,
                                                                  features_containers, cov_fpath,
                                                                  os.path.join(output_dirpath, 'circos'), logger)

            if all_pdf_fpath:
                # full report in PDF format: all tables and plots
                logger.main_info(' %d of %d: Creating PDF with all tables and plots...'
                                 % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_fpath)
            logger.main_info('Done')
        except KeyboardInterrupt:
            logger.main_info('..step skipped!')
            if all_pdf_fpath and os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info(' Text versions of total report are saved to ' + reports_fpaths)
    logger.main_info(' Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths, plotter_data.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath)

    if all_pdf_fpath and os.path.isfile(all_pdf_fpath):
        logger.main_info(' PDF version (tables and plots) is saved to ' + all_pdf_fpath)

    if circos_png_fpath:
        logger.main_info(' Circos plot is saved to %s (the annotation is in %s). Circos configuration file is saved to %s' %
                         (circos_png_fpath, circos_legend_fpath, circos_png_fpath.replace('.png', '.conf')))

    if icarus_html_fpath:
        logger.main_info(' Icarus (contig browser) is saved to %s' % icarus_html_fpath)

    if qconfig.draw_svg and contig_alignment_plot_fpath:
        logger.main_info(' Contig alignment plot is saved to %s' % contig_alignment_plot_fpath)

    cleanup(corrected_dirpath)
    return logger.finish_up(check_test=qconfig.test)
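# --- Illustrative sketch (not part of QUAST): the step counting and "Ctrl-C to skip" ---
# pattern used by the large drawing block above: the enabled outputs are counted to number
# the progress messages, and the whole block runs inside try/except KeyboardInterrupt so
# an interrupt skips the drawing step instead of aborting the run. Step names are hypothetical.
def _example_optional_drawing(draw_icarus, draw_circos, draw_pdf):
    number_of_steps = sum(int(bool(v)) for v in [draw_icarus, draw_circos, draw_pdf])
    try:
        step = 1
        for enabled, name in [(draw_icarus, 'Icarus'), (draw_circos, 'Circos'), (draw_pdf, 'PDF')]:
            if enabled:
                print('%d of %d: creating %s...' % (step, number_of_steps, name))
                step += 1  # the heavy rendering would happen here
        print('Done')
    except KeyboardInterrupt:
        print('..step skipped!')  # partial outputs would be removed here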
def do(contigs_fpaths, output_dir, logger):
    logger.print_timestamp()
    logger.info('Running BUSCO...')

    compilation_success = True
    augustus_dirpath = download_augustus(logger)
    if not augustus_dirpath:
        compilation_success = False
    elif not compile_tool('Augustus', augustus_dirpath, [join('bin', 'augustus')], logger=logger):
        compilation_success = False

    if compilation_success and not download_blast_binaries(logger=logger, filenames=blast_filenames):
        compilation_success = False

    if not compilation_success:
        logger.info('Failed finding conservative genes.')
        return

    set_augustus_dir(augustus_dirpath)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    tmp_dir = join(output_dir, 'tmp')
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    busco_threads = max(1, qconfig.max_threads // n_jobs)

    clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote, is_fungus=qconfig.is_fungus)
    if not clade_dirpath:
        logger.info('Failed finding conservative genes.')
        return

    log_fpath = join(output_dir, 'busco.log')
    logger.info('Logging to ' + log_fpath + '...')

    busco_args = [(['-i', contigs_fpath,
                    '-o', qutils.label_from_fpath_for_fname(contigs_fpath),
                    '-l', clade_dirpath,
                    '-m', 'genome', '-f', '-z',
                    '-c', str(busco_threads),
                    '-t', tmp_dir,
                    '--augustus_parameters=\'--AUGUSTUS_CONFIG_PATH=' + join(augustus_dirpath, 'config') + '\''],
                   output_dir) for contigs_fpath in contigs_fpaths]
    summary_fpaths = run_parallel(busco.main, busco_args, qconfig.max_threads)
    if not any(fpath for fpath in summary_fpaths):
        logger.error('Failed running BUSCO for all the assemblies. See ' + log_fpath + ' for information.')
        return

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]):
            total_buscos, part_buscos, complete_buscos = 0, 0, 0
            with open(summary_fpaths[i]) as f:
                for line in f:
                    if 'Complete BUSCOs' in line:
                        complete_buscos = int(line.split()[0])
                    elif 'Fragmented' in line:
                        part_buscos = int(line.split()[0])
                    elif 'Total' in line:
                        total_buscos = int(line.split()[0])
            if total_buscos != 0:
                report.add_field(reporting.Fields.BUSCO_COMPLETE,
                                 ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos)))
                report.add_field(reporting.Fields.BUSCO_PART,
                                 ('%.2f' % (float(part_buscos) * 100.0 / total_buscos)))
        else:
            logger.error('Failed running BUSCO for ' + contigs_fpath + '. See ' + log_fpath + ' for information.')
    logger.info('Done.')
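# --- Illustrative sketch (not part of QUAST): the short-summary parsing done above. ---
# The loop looks for lines containing 'Complete BUSCOs', 'Fragmented' and 'Total' and takes
# the leading count from each; a tiny standalone version on a made-up summary snippet
# (the exact BUSCO summary wording differs between versions):
def _example_parse_busco_summary(lines):
    total, complete, fragmented = 0, 0, 0
    for line in lines:
        if 'Complete BUSCOs' in line:
            complete = int(line.split()[0])
        elif 'Fragmented' in line:
            fragmented = int(line.split()[0])
        elif 'Total' in line:
            total = int(line.split()[0])
    if total == 0:
        return None
    return {'complete_pct': 100.0 * complete / total, 'partial_pct': 100.0 * fragmented / total}

# e.g. _example_parse_busco_summary(['120 Complete BUSCOs', '6 Fragmented BUSCOs',
#                                    '130 Total BUSCO groups searched'])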
def js_data_gen(assemblies, contigs_fpaths, chromosomes_length, output_dirpath, structures_by_labels,
                contigs_by_assemblies, ambiguity_alignments_by_labels=None, contig_names_by_refs=None,
                ref_fpath=None, stdout_pattern=None, features_data=None, cov_fpath=None,
                physical_cov_fpath=None, json_output_dir=None):
    chr_names = []
    if chromosomes_length and assemblies:
        chr_to_aligned_blocks = OrderedDict()
        chr_names = list(chromosomes_length.keys())
        for assembly in assemblies.assemblies:
            chr_to_aligned_blocks[assembly.label] = defaultdict(list)
            similar_correct = 0
            similar_misassembled = 0
            for align in assembly.alignments:
                chr_to_aligned_blocks[assembly.label][align.ref_name].append(align)
                if align.similar:
                    if align.misassembled:
                        similar_misassembled += 1
                    else:
                        similar_correct += 1
            report = reporting.get(assembly.fpath)
            report.add_field(reporting.Fields.SIMILAR_CONTIGS, similar_correct)
            report.add_field(reporting.Fields.SIMILAR_MIS_BLOCKS, similar_misassembled)

    main_menu_fpath = os.path.join(output_dirpath, qconfig.icarus_html_fname)
    output_all_files_dir_path = os.path.join(output_dirpath, qconfig.icarus_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)

    chr_full_names, contig_names_by_refs = group_references(chr_names, contig_names_by_refs, chromosomes_length, ref_fpath)

    cov_data, not_covered, max_depth = parse_cov_fpath(cov_fpath, chr_names, chr_full_names, contig_names_by_refs)
    physical_cov_data, not_covered, physical_max_depth = parse_cov_fpath(physical_cov_fpath, chr_names,
                                                                         chr_full_names, contig_names_by_refs)

    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    nx_marks = [reporting.Fields.N50, reporting.Fields.N75, reporting.Fields.NG50, reporting.Fields.NG75]

    assemblies_data, assemblies_contig_size_data, assemblies_n50 = get_assemblies_data(
        contigs_fpaths, output_all_files_dir_path, stdout_pattern, nx_marks)

    ref_contigs_dict = {}
    chr_lengths_dict = {}

    ref_data = 'var references_by_id = {};\n'
    chr_names_by_id = dict((chrom, str(i)) for i, chrom in enumerate(chr_names))
    for chrom, i in chr_names_by_id.items():
        ref_data += 'references_by_id["' + str(i) + '"] = "' + chrom + '";\n'
    for i, chr in enumerate(chr_full_names):
        if contig_names_by_refs:
            ref_contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
        elif len(chr_full_names) == 1:
            ref_contigs = chr_names
        else:
            ref_contigs = [chr]
        ref_contigs_dict[chr] = ref_contigs
        chr_lengths_dict[chr] = [0] + [chromosomes_length[contig] for contig in ref_contigs]

    num_misassemblies = defaultdict(int)
    aligned_bases_by_chr = defaultdict(list)
    aligned_assemblies = defaultdict(set)

    for i, chr in enumerate(chr_full_names):
        ref_contigs = ref_contigs_dict[chr]
        chr_lengths = chr_lengths_dict[chr]
        chr_size = sum([chromosomes_length[contig] for contig in ref_contigs])
        chr_sizes[chr] = chr_size
        num_contigs[chr] = len(ref_contigs)
        data_str = []
        data_str.append('var chromosomes_len = {};')
        for ref_contig in ref_contigs:
            l = chromosomes_length[ref_contig]
            data_str.append('chromosomes_len["' + ref_contig + '"] = ' + str(l) + ';')
            aligned_bases_by_chr[chr].extend(aligned_bases[ref_contig])

        cov_data_str = format_cov_data(cov_data, max_depth, chr, 'coverage_data', 'reads_max_depth') if cov_data else None
        physical_cov_data_str = format_cov_data(physical_cov_data, physical_max_depth, chr,
                                                'physical_coverage_data', 'physical_max_depth') \
            if physical_cov_data else None

        alignment_viewer_fpath, ref_data_str, contigs_structure_str, additional_assemblies_data, ms_selectors, \
            num_misassemblies[chr], aligned_assemblies[chr] = \
            prepare_alignment_data_for_one_ref(chr, chr_full_names, chr_names_by_id, ref_contigs, data_str,
                                               chr_to_aligned_blocks, structures_by_labels, contigs_by_assemblies,
                                               ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
                                               cov_data_str=cov_data_str, physical_cov_data_str=physical_cov_data_str,
                                               contig_names_by_refs=contig_names_by_refs,
                                               output_dir_path=output_all_files_dir_path)

        ref_name = qutils.name_from_fpath(ref_fpath)
        save_alignment_data_for_one_ref(chr, ref_contigs, ref_name, json_output_dir, alignment_viewer_fpath,
                                        ref_data_str, ms_selectors, ref_data=ref_data, features_data=features_data,
                                        assemblies_data=assemblies_data, contigs_structure_str=contigs_structure_str,
                                        additional_assemblies_data=additional_assemblies_data)

    contigs_sizes_str, too_many_contigs = get_contigs_data(contigs_by_assemblies, nx_marks, assemblies_n50,
                                                           structures_by_labels, contig_names_by_refs,
                                                           chr_names, chr_full_names)
    all_data = assemblies_data + assemblies_contig_size_data + contigs_sizes_str
    save_contig_size_html(output_all_files_dir_path, json_output_dir, too_many_contigs, all_data)

    icarus_links = defaultdict(list)
    if len(chr_full_names) > 1:
        chr_link = qconfig.icarus_html_fname
        icarus_links["links"].append(chr_link)
        icarus_links["links_names"].append(qconfig.icarus_link)

    main_menu_template_fpath = html_saver.get_real_path(qconfig.icarus_menu_template_fname)
    main_data_dict = dict()

    labels = [qconfig.assembly_labels_by_fpath[contigs_fpath] for contigs_fpath in contigs_fpaths]
    main_data_dict['assemblies'] = labels
    html_saver.save_icarus_data(json_output_dir, ', '.join(labels), 'assemblies')

    contig_size_browser_fpath = os.path.join(qconfig.icarus_dirname, qconfig.contig_size_viewer_fname)
    main_data_dict['contig_size_html'] = contig_size_browser_fpath
    html_saver.save_icarus_data(json_output_dir, contig_size_browser_fpath, 'contig_size_html')
    if not chr_names:
        icarus_links["links"].append(contig_size_browser_fpath)
        icarus_links["links_names"].append(qconfig.icarus_link)

    if chr_full_names and (len(chr_full_names) > 1 or qconfig.is_combined_ref):
        main_data_dict['table_references'] = {'references': []}
        num_aligned_assemblies = [len(aligned_assemblies[chr]) for chr in chr_full_names]
        is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
        if is_unaligned_asm_exists:
            main_data_dict['table_references']['th_assemblies'] = True
        for chr in sorted(chr_full_names):
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
                chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths, contig_names_by_refs,
                one_chromosome=len(chr_full_names) == 1)
            reference_dict = dict()
            reference_dict['chr_link'] = chr_link
            reference_dict['tooltip'] = tooltip
            reference_dict['chr_name'] = os.path.basename(chr_name)
            reference_dict['num_contigs'] = str(num_contigs[chr])
            reference_dict['chr_size'] = format_long_numbers(chr_size)
            if is_unaligned_asm_exists:
                reference_dict['num_assemblies'] = str(len(aligned_assemblies[chr]))
            reference_dict['chr_gf'] = '%.3f' % chr_genome
            reference_dict['num_misassemblies'] = str(num_misassemblies[chr])
            main_data_dict['table_references']['references'].append(reference_dict)
        html_saver.save_icarus_data(json_output_dir, main_data_dict['table_references'], 'table_references', as_text=False)
    else:
        if chr_full_names:
            chr = chr_full_names[0]
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
                chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths, contig_names_by_refs, one_chromosome=True)
            main_data_dict['one_reference'] = dict()
            main_data_dict['one_reference']['alignment_link'] = chr_link
            main_data_dict['one_reference']['ref_fpath'] = os.path.basename(ref_fpath)
            main_data_dict['one_reference']['ref_fragments'] = str(num_contigs[chr])
            main_data_dict['one_reference']['ref_size'] = format_long_numbers(chr_size)
            main_data_dict['one_reference']['ref_gf'] = '%.3f' % chr_genome
            main_data_dict['one_reference']['num_misassemblies'] = str(num_misassemblies[chr])
            icarus_links["links"].append(chr_link)
            icarus_links["links_names"].append(qconfig.icarus_link)
            html_saver.save_icarus_data(json_output_dir, main_data_dict['one_reference'], 'menu_reference', as_text=False)
    html_saver.save_icarus_html(main_menu_template_fpath, main_menu_fpath, main_data_dict)
    html_saver.save_icarus_links(output_dirpath, icarus_links)
    if json_output_dir:
        json_saver.save_icarus_links(json_output_dir, icarus_links)

    return main_menu_fpath
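# --- Illustrative sketch (not part of QUAST): the JS-embedding pattern used above, ---
# where Python values are serialized as 'var name = {}; name["key"] = value;' lines and
# later written into the Icarus HTML. A minimal standalone version (names are hypothetical):
def _example_dict_to_js(var_name, values):
    lines = ['var %s = {};' % var_name]
    for key, value in values.items():
        if isinstance(value, str):
            lines.append('%s["%s"] = "%s";' % (var_name, key, value))
        else:
            lines.append('%s["%s"] = %s;' % (var_name, key, value))
    return '\n'.join(lines) + '\n'

# _example_dict_to_js('chromosomes_len', {'chr1': 248956422}) produces the same shape of
# output as the data_str lines built in js_data_gen above.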
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(' Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
    elif not install_genemark():
        logger.warning(' Can\'t copy the license key to ~/.gm_key, skipping gene prediction.')
    else:
        if not os.path.isdir(out_dirpath):
            os.mkdir(out_dirpath)
        tmp_dirpath = os.path.join(out_dirpath, 'tmp')
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)

        n_jobs = min(len(fasta_fpaths), qconfig.max_threads)
        num_threads = max(1, qconfig.max_threads // n_jobs)
        if is_python2():
            from joblib2 import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        if not qconfig.memory_efficient:
            results = Parallel(n_jobs=n_jobs)(
                delayed(predict_genes)(index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath,
                                       gmhmm_p_function, prokaryote, num_threads)
                for index, fasta_fpath in enumerate(fasta_fpaths))
        else:
            results = [predict_genes(index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath,
                                     gmhmm_p_function, prokaryote, num_threads)
                       for index, fasta_fpath in enumerate(fasta_fpaths)]

        if not is_license_valid(out_dirpath, fasta_fpaths):
            return

        genes_by_labels = dict()
        # saving results
        for i, fasta_path in enumerate(fasta_fpaths):
            report = reporting.get(fasta_path)
            label = qutils.label_from_fpath(fasta_path)
            genes_by_labels[label], unique_count, full_genes, partial_genes = results[i]
            if unique_count is not None:
                report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
            if full_genes is not None:
                genes = ['%s + %s part' % (full_cnt, partial_cnt)
                         for full_cnt, partial_cnt in zip(full_genes, partial_genes)]
                report.add_field(reporting.Fields.PREDICTED_GENES, genes)
            if unique_count is None and full_genes is None:
                logger.error(' ' + qutils.index_to_str(i) + 'Failed predicting genes in ' + label + '. ' +
                             ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                              if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000 else ''))

        if not qconfig.debug:
            for dirpath in glob.iglob(tmp_dirpath + '*'):
                if os.path.isdir(dirpath):
                    shutil.rmtree(dirpath)

        logger.main_info('Done.')
        return genes_by_labels
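# --- Illustrative sketch (not part of QUAST): the thread-budget split used above. ---
# QUAST ships its own joblib2/joblib3 copies; with the public joblib package the same
# "one worker per assembly, remaining threads per worker" pattern looks like this
# (process_one is a hypothetical stand-in for predict_genes or align_and_analyze):
def _example_run_per_assembly(inputs, max_threads, process_one):
    from joblib import Parallel, delayed
    n_jobs = min(len(inputs), max_threads)           # one job per input, capped by the thread budget
    threads_per_job = max(1, max_threads // n_jobs)  # leftover threads go to each external tool
    return Parallel(n_jobs=n_jobs)(
        delayed(process_one)(i, item, threads_per_job) for i, item in enumerate(inputs))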
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    logger.print_timestamp()
    logger.main_info('Running Contig analyzer...')
    success_compilation = compile_aligner(logger)
    if not success_compilation:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')
        return dict(zip(contigs_fpaths, [AlignerStatus.FAILED] * len(contigs_fpaths))), None

    num_nf_errors = logger._num_nf_errors
    create_minimap_output_dir(output_dir)
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    threads = max(1, qconfig.max_threads // n_jobs)

    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    genome_size, reference_chromosomes, ns_by_chromosomes = get_genome_stats(reference, skip_ns=True)
    threads = qconfig.max_threads if qconfig.memory_efficient else threads
    args = [(is_cyclic, i, contigs_fpath, output_dir, reference, reference_chromosomes, ns_by_chromosomes,
             old_contigs_fpath, bed_fpath, threads)
            for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths))]
    statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs = run_parallel(
        align_and_analyze, args, n_jobs)
    reports = []

    aligner_statuses = dict(zip(contigs_fpaths, statuses))
    aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths))
    misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs))

    if AlignerStatus.OK in aligner_statuses.values():
        if qconfig.is_combined_ref:
            save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger)

    for index, fname in enumerate(contigs_fpaths):
        report = reporting.get(fname)
        if statuses[index] == AlignerStatus.OK:
            reports.append(save_result(results[index], report, fname, reference, genome_size))
        elif statuses[index] == AlignerStatus.NOT_ALIGNED:
            save_result_for_unaligned(results[index], report)

    if AlignerStatus.OK in aligner_statuses.values():
        reporting.save_misassemblies(output_dir)
        reporting.save_unaligned(output_dir)
        from . import plotter
        if qconfig.draw_plots:
            plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies')
        if qconfig.draw_plots or qconfig.html_report:
            misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i])
                                            for i in range(len(contigs_fpaths)))
            plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths,
                             misassemblies_in_contigs, join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies')

    oks = list(aligner_statuses.values()).count(AlignerStatus.OK)
    not_aligned = list(aligner_statuses.values()).count(AlignerStatus.NOT_ALIGNED)
    failed = list(aligner_statuses.values()).count(AlignerStatus.FAILED)
    errors = list(aligner_statuses.values()).count(AlignerStatus.ERROR)
    problems = not_aligned + failed + errors
    all = len(aligner_statuses)

    logger._num_nf_errors = num_nf_errors + errors

    if oks == all:
        logger.main_info('Done.')
    if oks < all and problems < all:
        logger.main_info('Done for ' + str(all - problems) + ' out of ' + str(all) +
                         '. For the rest, only basic stats are going to be evaluated.')
    if problems == all:
        logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.')

    return aligner_statuses, aligned_lengths_per_fpath
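# --- Illustrative sketch (not part of QUAST): the status tally that drives the ---
# 'Done.' / 'Done for N out of M' / 'Failed ...' messages above. Plain strings stand in
# for the AlignerStatus enum, and NOT_ALIGNED/FAILED/ERROR are collapsed into "problems":
def _example_summarize_statuses(statuses):
    oks = sum(1 for s in statuses if s == 'OK')
    problems = len(statuses) - oks
    if problems == 0:
        return 'Done.'
    if oks == 0:
        return 'Failed aligning the contigs for all the assemblies.'
    return 'Done for %d out of %d.' % (oks, len(statuses))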
def do(ref_fpath, contigs_fpaths, output_dirpath):
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    # suffixes for files with report tables in plain text and tab separated formats
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(gage_dirpath, 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running GAGE...')

    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs',
               'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
                            reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
                            reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES,
                            reporting.Fields.GAGE_MISSINGASMBLYBASES, reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
                            reporting.Fields.GAGE_DUPREFBASES, reporting.Fields.GAGE_COMPRESSEDREFBASES,
                            reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY, reporting.Fields.GAGE_SNPS,
                            reporting.Fields.GAGE_SHORTINDELS, reporting.Fields.GAGE_LONGINDELS,
                            reporting.Fields.GAGE_INVERSIONS, reporting.Fields.GAGE_RELOCATION,
                            reporting.Fields.GAGE_TRANSLOCATION, reporting.Fields.GAGE_NUMCORCONTIGS,
                            reporting.Fields.GAGE_CORASMBLYSIZE, reporting.Fields.GAGE_MINCORCONTIG,
                            reporting.Fields.GAGE_MAXCORCOTING, reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    if not compile_aligner(logger) or (not all_required_java_classes_exist(gage_dirpath) and not compile_gage()):
        logger.error('GAGE module was not installed properly, so it is disabled and you cannot use --gage.')
        return

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(
        delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    if 0 not in return_codes:
        logger.error('Error occurred while GAGE was processing assemblies.'
                     ' See GAGE error logs for details: %s' % os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for i, contigs_fpath in enumerate(contigs_fpaths):
        corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
        logfile_out = open(log_out_fpath, 'r')
        cur_metric_id = 0
        for line in logfile_out:
            if metrics[cur_metric_id] in line:
                if metrics[cur_metric_id].startswith('N50'):
                    report.add_field(metrics_in_reporting[cur_metric_id],
                                     line.split(metrics[cur_metric_id] + ':')[1].strip())
                else:
                    report.add_field(metrics_in_reporting[cur_metric_id], line.split(':')[1].strip())
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break
        logfile_out.close()

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
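# --- Illustrative sketch (not part of QUAST): the sequential metric scan used above. ---
# getCorrectnessStats.sh prints its metrics in a fixed order, so the parser walks the
# expected names one by one and takes the value after ':'. The toy log lines below are made up.
def _example_parse_ordered_metrics(lines, expected=('Genome Size', 'Assembly Size', 'SNPs')):
    values, cur = {}, 0
    for line in lines:
        if cur < len(expected) and expected[cur] in line:
            values[expected[cur]] = line.split(':')[1].strip()
            cur += 1
    return values

# _example_parse_ordered_metrics(['Genome Size: 4641652', 'Assembly Size: 4603788', 'SNPs: 12'])
# -> {'Genome Size': '4641652', 'Assembly Size': '4603788', 'SNPs': '12'}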
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if len(stats_content) < 1:
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS,
                             '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            if len(stats_content) >= 5:
                len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1])
                len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1])
                len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                                 '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                                 '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                                 '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath):
        logger.warning(' Sorry, can\'t run KMC on this platform, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return

    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('Analyzing assemblies accuracy...')
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]

    kmer_fraction = 100 if getsize(ref_fpath) < 500 * 1024 ** 2 else 1000
    shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, shared_kmc_db, log_fpath, err_fpath,
                                                 kmer_fraction=kmer_fraction)

    shared_kmers_by_chrom = dict()
    shared_kmers_fpath = join(tmp_dirpath, 'shared_kmers.txt')
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    with open(shared_kmers_fpath, 'w') as out_f:
        for name, seq in ref_contigs.items():
            seq_kmers = get_string_kmers(tmp_dirpath, log_fpath, err_fpath, seq=seq,
                                         intersect_with=shared_downsampled_kmc_db)
            for kmer_i, kmer in enumerate(seq_kmers):
                shared_kmers_by_chrom[str(kmer)] = name
                out_f.write('>' + str(kmer_i) + '\n')
                out_f.write(kmer + '\n')

    shared_kmc_db = count_kmers(tmp_dirpath, shared_kmers_fpath, log_fpath, err_fpath)
    ref_kmc_dbs = []
    for ref_name, ref_seq in ref_contigs.items():
        ref_contig_fpath = join(tmp_dirpath, ref_name + '.fa')
        if not is_non_empty_file(ref_contig_fpath):
            with open(ref_contig_fpath, 'w') as out_f:
                out_f.write(ref_seq)
        ref_kmc_db = count_kmers(tmp_dirpath, ref_contig_fpath, log_fpath, err_fpath)
        ref_shared_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, shared_kmc_db], log_fpath, err_fpath)
        ref_kmc_dbs.append((ref_name, ref_shared_kmc_db))

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)
            if len(seq) >= MIN_CONTIGS_LEN:
                long_contigs.append(len(seq))

        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_kmc_dbs) > MAX_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            for name, seq in read_fasta(contigs_fpath):
                if len(seq) < MIN_CONTIGS_LEN:
                    continue
                tmp_contig_fpath = join(tmp_dirpath, name + '.fa')
                with open(tmp_contig_fpath, 'w') as out_tmp_f:
                    out_tmp_f.write(seq)
                contig_kmc_db = count_kmers(tmp_dirpath, tmp_contig_fpath, log_fpath, err_fpath)
                intersect_all_ref_kmc_db = intersect_kmers(tmp_dirpath, [contig_kmc_db, shared_kmc_db],
                                                           log_fpath, err_fpath)
                kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_all_ref_kmc_db, log_fpath, err_fpath)
                if kmers_cnt < MIN_MARKERS:
                    continue
                for ref_name, ref_kmc_db in ref_kmc_dbs:
                    intersect_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, intersect_all_ref_kmc_db],
                                                       log_fpath, err_fpath)
                    kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_kmc_db, log_fpath, err_fpath)
                    if kmers_cnt:
                        contig_markers[name].append(ref_name)
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                             '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                             '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                             '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                              report.get_field(reporting.Fields.KMER_COMPLETENESS),
                              len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
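# --- Illustrative sketch (not part of QUAST): the marker-based classification above. ---
# Each sufficiently long contig collects "marker" k-mers shared with individual reference
# chromosomes; contigs whose markers all point to one chromosome count towards the
# "one chromosome" length, the rest towards "multiple", and everything else towards "none".
# The inputs below are toy data.
def _example_classify_scaffolds(contig_markers, contig_lens):
    one_chrom, multi_chrom = 0, 0
    for name, markers in contig_markers.items():
        if len(set(markers)) == 1:
            one_chrom += contig_lens[name]
        else:
            multi_chrom += contig_lens[name]
    none_chrom = sum(contig_lens.values()) - one_chrom - multi_chrom
    return one_chrom, multi_chrom, none_chrom

# _example_classify_scaffolds({'s1': ['chr1'], 's2': ['chr1', 'chr2']},
#                             {'s1': 100000, 's2': 80000, 's3': 5000})
# -> (100000, 80000, 5000)   # s3 had too few markers and is counted as "none"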
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath, genes_fpaths, operons_fpaths,
       detailed_contigs_reports_dirpath, genome_stats_dirpath):
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')

    genes_container = FeatureContainer(genes_fpaths, 'gene')
    operons_container = FeatureContainer(operons_fpaths, 'operon')
    for container in [genes_container, operons_container]:
        if not container.fpaths:
            logger.notice('No file with ' + container.kind + 's provided. '
                          'Use the -' + container.kind[0].capitalize() + ' option '
                          'if you want to specify it.', indent=' ')
            continue
        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)
        if len(container.region_list) == 0:
            logger.warning('No ' + container.kind + 's were loaded.', indent=' ')
            res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
        else:
            logger.info(' Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
            res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list,
                                                              reference_chromosomes.keys())

    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        if genes_container.fpaths:
            report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
        if operons_container.fpaths:
            report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

    # for cumulative plots:
    files_genes_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    process_results = Parallel(n_jobs=n_jobs)(
        delayed(process_single_file)(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                                     reference_chromosomes, genes_container, operons_container)
        for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
    num_nf_errors += len([res for res in process_results if res is None])
    logger._num_nf_errors = num_nf_errors
    process_results = [res for res in process_results if res]
    if not process_results:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    ref_lengths = [process_results[i][0] for i in range(len(process_results))]
    results_genes_operons_tuples = [process_results[i][1] for i in range(len(process_results))]
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [ref_lengths[i][ref] for i in range(len(ref_lengths))]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.iteritems():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, maximal covered length: ' +
                       str(aligned_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('================================================================================================================\n')

    for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(aligned_contigs_fpaths,
                                                                              results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_genes_in_contigs[contigs_fpath] = genes_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        full_found_genes.append(sum(genes_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        covered_bp = results["covered_bp"]
        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)
        genome_fraction = float(covered_bp) * 100 / float(genome_size)
        duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                             report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                             report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                             report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                            ((genome_fraction / 100.0) * float(genome_size))

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
                       % (assembly_name[:24], '%3.5f%%' % genome_fraction, '%1.5f' % duplication_ratio, gaps_count))

        report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
        genome_mapped.append(genome_fraction)

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                    (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None
    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'genes',
                                                files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'genes',
                                                files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths,
                                       files_genes_in_contigs,
                                       genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_genes_histogram', '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths,
                                       files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram', '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram', 'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
    return [genes_container, operons_container]
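# --- Illustrative sketch (not part of QUAST): the two per-assembly ratios computed above. ---
# Genome fraction is the share of reference bases covered by alignments; duplication ratio
# divides the aligned assembly bases (total length plus internal-overlap and ambiguous extra
# bases, minus unaligned bases) by the covered part of the reference, which equals
# (genome_fraction / 100) * genome_size in the code above. Plain-number version:
def _example_genome_fraction_and_duplication(covered_bp, genome_size, total_len,
                                             mis_internal_overlap, ambiguous_extra_bases,
                                             unaligned_bases):
    genome_fraction = 100.0 * covered_bp / genome_size
    aligned_assembly_bases = total_len + mis_internal_overlap + ambiguous_extra_bases - unaligned_bases
    duplication_ratio = aligned_assembly_bases / (covered_bp * 1.0)
    return genome_fraction, duplication_ratio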
def do(ref_fpath, contigs_fpaths, output_dirpath):
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    # suffixes for files with report tables in plain text and tab separated formats
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(gage_dirpath, 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running GAGE...')

    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs',
               'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
                            reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
                            reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES,
                            reporting.Fields.GAGE_MISSINGASMBLYBASES, reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
                            reporting.Fields.GAGE_DUPREFBASES, reporting.Fields.GAGE_COMPRESSEDREFBASES,
                            reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY, reporting.Fields.GAGE_SNPS,
                            reporting.Fields.GAGE_SHORTINDELS, reporting.Fields.GAGE_LONGINDELS,
                            reporting.Fields.GAGE_INVERSIONS, reporting.Fields.GAGE_RELOCATION,
                            reporting.Fields.GAGE_TRANSLOCATION, reporting.Fields.GAGE_NUMCORCONTIGS,
                            reporting.Fields.GAGE_CORASMBLYSIZE, reporting.Fields.GAGE_MINCORCONTIG,
                            reporting.Fields.GAGE_MAXCORCOTING, reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    if not compile_aligner(logger) or (not all_required_java_classes_exist(gage_dirpath) and not compile_gage()):
        logger.error('GAGE module was not installed properly, so it is disabled and you cannot use --gage.')
        return

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    if not qconfig.memory_efficient:
        return_codes = Parallel(n_jobs=n_jobs)(
            delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
            for i, contigs_fpath in enumerate(contigs_fpaths))
    else:
        return_codes = [run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
                        for i, contigs_fpath in enumerate(contigs_fpaths)]

    if 0 not in return_codes:
        logger.error('Error occurred while GAGE was processing assemblies.'
                     ' See GAGE error logs for details: %s' % os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for i, contigs_fpath in enumerate(contigs_fpaths):
        corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
        logfile_out = open(log_out_fpath, 'r')
        cur_metric_id = 0
        for line in logfile_out:
            if metrics[cur_metric_id] in line:
                if metrics[cur_metric_id].startswith('N50'):
                    report.add_field(metrics_in_reporting[cur_metric_id],
                                     line.split(metrics[cur_metric_id] + ':')[1].strip())
                else:
                    report.add_field(metrics_in_reporting[cur_metric_id], line.split(':')[1].strip())
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break
        logfile_out.close()

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
def main(args):
    if ' ' in qconfig.QUAST_HOME:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True, exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    quast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, quast_path + args)
    output_dirpath, ref_fpath, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.main_info()
    logger.print_params()

    ########################################################################
    from quast_libs import reporting
    reports = reporting.reports
    reload(reporting)
    reporting.reports = reports
    reporting.assembly_fpaths = []
    from quast_libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, '..', qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        ref_fpath = qutils.correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')

    contigs_fpaths, old_contigs_fpaths = qutils.correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    reads_fpaths = []
    cov_fpath = []
    physical_cov_fpath = []
    if qconfig.forward_reads:
        reads_fpaths.append(qconfig.forward_reads)
    if qconfig.reverse_reads:
        reads_fpaths.append(qconfig.reverse_reads)
    if (reads_fpaths or qconfig.sam or qconfig.bam) and ref_fpath:
        bed_fpath, cov_fpath, physical_cov_fpath = reads_analyzer.do(
            ref_fpath, contigs_fpaths, reads_fpaths, None,
            os.path.join(output_dirpath, qconfig.variation_dirname), external_logger=logger,
            sam_fpath=qconfig.sam, bam_fpath=qconfig.bam, bed_fpath=qconfig.bed)
        qconfig.bed = bed_fpath

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.",
                     fake_if_nested_run=True)
        return 4

    if qconfig.used_colors and qconfig.used_ls:
        for i, label in enumerate(labels):
            plotter.dict_color_and_ls[label] = (qconfig.used_colors[i], qconfig.used_ls[i])

    qconfig.assemblies_fpaths = contigs_fpaths

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from quast_libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None
    if qconfig.draw_plots and plotter.can_draw_plots:
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except:
            all_pdf_file = None

    if qconfig.json_output_dirpath:
        from quast_libs.html_saver import json_saver
        if json_saver.simplejson_error:
            json_output_dirpath = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from quast_libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   qconfig.json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    icarus_html_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from quast_libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote, os.path.join(output_dirpath, 'contigs_reports'),
            old_contigs_fpaths, qconfig.bed)
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    features_containers = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from quast_libs import aligned_stats
        aligned_stats.do(ref_fpath, aligned_contigs_fpaths, output_dirpath, qconfig.json_output_dirpath,
                         aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from quast_libs import genome_analyzer
        features_containers = genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, qconfig.json_output_dirpath,
            qconfig.genes, qconfig.operons, detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, 'genome_stats'))

    genes_by_labels = None
    if qconfig.gene_finding:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from quast_libs import glimmer
            genes_by_labels = glimmer.do(contigs_fpaths, qconfig.genes_lengths,
                                         os.path.join(output_dirpath, 'predicted_genes'))
        else:
            ########################################################################
            ### GeneMark
            ########################################################################
            from quast_libs import genemark
            genes_by_labels = genemark.do(contigs_fpaths, qconfig.genes_lengths,
                                          os.path.join(output_dirpath, 'predicted_genes'),
                                          qconfig.prokaryote, qconfig.meta)
    else:
        logger.main_info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")

    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots or qconfig.create_icarus_html:
        logger.print_timestamp()
        logger.main_info('Creating large visual summaries...')
        logger.main_info('This may take a while: press Ctrl-C to skip this step..')
        try:
            if detailed_contigs_reports_dirpath:
                report_for_icarus_fpath_pattern = os.path.join(detailed_contigs_reports_dirpath,
                                                               qconfig.icarus_report_fname_pattern)
                stdout_pattern = os.path.join(detailed_contigs_reports_dirpath, qconfig.contig_report_fname_pattern)
            else:
                report_for_icarus_fpath_pattern = None
                stdout_pattern = None
            draw_alignment_plots = qconfig.draw_svg or qconfig.create_icarus_html
            number_of_steps = sum([int(bool(value)) for value in [draw_alignment_plots, all_pdf_file]])
            if draw_alignment_plots:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info(' 1 of %d: Creating Icarus viewers...'
% number_of_steps) from quast_libs import icarus icarus_html_fpath, contig_alignment_plot_fpath = icarus.do( contigs_fpaths, report_for_icarus_fpath_pattern, output_dirpath, ref_fpath, stdout_pattern=stdout_pattern, features=features_containers, cov_fpath=cov_fpath, physical_cov_fpath=physical_cov_fpath, json_output_dir=qconfig.json_output_dirpath, genes_by_labels=genes_by_labels) if all_pdf_file: # full report in PDF format: all tables and plots logger.main_info(' %d of %d: Creating PDF with all tables and plots...' % (number_of_steps, number_of_steps)) plotter.fill_all_pdf_file(all_pdf_file) logger.main_info('Done') except KeyboardInterrupt: logger.main_info('..step skipped!') os.remove(all_pdf_fpath) ######################################################################## ### TOTAL REPORT ######################################################################## logger.print_timestamp() logger.main_info('RESULTS:') logger.main_info(' Text versions of total report are saved to ' + reports_fpaths) logger.main_info(' Text versions of transposed total report are saved to ' + transposed_reports_fpaths) if qconfig.json_output_dirpath: json_saver.save_total_report(qconfig.json_output_dirpath, qconfig.min_contig, ref_fpath) if qconfig.html_report: from quast_libs.html_saver import html_saver html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls) html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath) if os.path.isfile(all_pdf_fpath): logger.main_info(' PDF version (tables and plots) is saved to ' + all_pdf_fpath) if icarus_html_fpath: logger.main_info(' Icarus (contig browser) is saved to %s' % icarus_html_fpath) if qconfig.draw_svg and contig_alignment_plot_fpath: logger.main_info(' Contig alignment plot is saved to %s' % contig_alignment_plot_fpath) cleanup(corrected_dirpath) return logger.finish_up(check_test=qconfig.test)
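# A small sketch (not QUAST code) of the 'press Ctrl-C to skip this step' pattern used
# for the large drawing tasks above: run the optional slow step inside a
# KeyboardInterrupt handler and discard partial output. The isfile() guard is an
# addition of this sketch; names are illustrative.
import os

def run_skippable_step(step_fn, partial_output_fpath=None):
    try:
        step_fn()
        print('Done')
    except KeyboardInterrupt:
        print('..step skipped!')
        if partial_output_fpath and os.path.isfile(partial_output_fpath):
            os.remove(partial_output_fpath)

# run_skippable_step(lambda: None)  # prints 'Done'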
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta): logger.print_timestamp() if LICENSE_LIMITATIONS_MODE: logger.warning("GeneMark tool can't be started because of license limitations!") return if meta: tool_name = 'MetaGeneMark' tool_dirname = 'genemark' gmhmm_p_function = gmhmm_p_metagenomic elif prokaryote: tool_name = 'GeneMarkS' tool_dirname = 'genemark' gmhmm_p_function = gmhmm_p_everyGC else: tool_name = 'GeneMark-ES' tool_dirname = 'genemark-es' gmhmm_p_function = gm_es logger.main_info('Running %s...' % tool_name) tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name) if not os.path.exists(tool_dirpath): logger.warning(' Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name) else: successful = install_genemark(os.path.join(qconfig.LIBS_LOCATION, 'genemark', qconfig.platform_name)) if not successful: return if not os.path.isdir(out_dirpath): os.mkdir(out_dirpath) tmp_dirpath = os.path.join(out_dirpath, 'tmp') if not os.path.isdir(tmp_dirpath): os.mkdir(tmp_dirpath) n_jobs = min(len(fasta_fpaths), qconfig.max_threads) num_threads = max(1, qconfig.max_threads // n_jobs) if is_python2(): from joblib import Parallel, delayed else: from joblib3 import Parallel, delayed results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)( index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function, prokaryote, num_threads) for index, fasta_fpath in enumerate(fasta_fpaths)) genes_by_labels = dict() # saving results for i, fasta_path in enumerate(fasta_fpaths): report = reporting.get(fasta_path) label = qutils.label_from_fpath(fasta_path) genes_by_labels[label], unique_count, count = results[i] if unique_count is not None: report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count) if count is not None: report.add_field(reporting.Fields.PREDICTED_GENES, count) if unique_count is None and count is None: logger.error(' ' + qutils.index_to_str(i) + 'Failed predicting genes in ' + label + '. ' + ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).' if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000 else '')) if not qconfig.debug: for dirpath in glob.iglob(tmp_dirpath + '*'): if os.path.isdir(dirpath): shutil.rmtree(dirpath) logger.main_info('Done.') return genes_by_labels
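# A minimal sketch (not QUAST code) of the three-way tool selection above:
# metagenome mode -> MetaGeneMark, prokaryote -> GeneMarkS, otherwise (eukaryote)
# -> GeneMark-ES. The function name is illustrative.
def select_genemark_variant(meta, prokaryote):
    """Return (tool_name, tool_dirname) for the given flags."""
    if meta:
        return 'MetaGeneMark', 'genemark'
    if prokaryote:
        return 'GeneMarkS', 'genemark'
    return 'GeneMark-ES', 'genemark-es'

# select_genemark_variant(meta=False, prokaryote=False) -> ('GeneMark-ES', 'genemark-es')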
def do(reference, contigs_fpaths, is_cyclic, output_dir, old_contigs_fpaths, bed_fpath=None): if not os.path.isdir(output_dir): os.mkdir(output_dir) logger.print_timestamp() logger.main_info('Running Contig analyzer...') success_compilation = compile_aligner(logger) if qconfig.test and is_emem_aligner(): success_compilation = check_emem_functionality(logger) if not success_compilation: logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.') return dict(zip(contigs_fpaths, [NucmerStatus.FAILED] * len(contigs_fpaths))), None if qconfig.draw_plots: compile_gnuplot(logger, only_clean=False) num_nf_errors = logger._num_nf_errors create_nucmer_output_dir(output_dir) n_jobs = min(len(contigs_fpaths), qconfig.max_threads) if qconfig.memory_efficient: threads = 1 else: threads = max(1, qconfig.max_threads // n_jobs) if is_python2(): from joblib import Parallel, delayed else: from joblib3 import Parallel, delayed if not qconfig.splitted_ref and not qconfig.memory_efficient: statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)( is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads) for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths))) else: if len(contigs_fpaths) >= len(qconfig.splitted_ref) and not qconfig.memory_efficient: statuses_results_lengths_tuples = Parallel(n_jobs=n_jobs)(delayed(align_and_analyze)( is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, threads=threads) for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths))) else: statuses_results_lengths_tuples = [] for i, (contigs_fpath, old_contigs_fpath) in enumerate(zip(contigs_fpaths, old_contigs_fpaths)): statuses_results_lengths_tuples.append(align_and_analyze( is_cyclic, i, contigs_fpath, output_dir, reference, old_contigs_fpath, bed_fpath, parallel_by_chr=True, threads=qconfig.max_threads)) # unzipping statuses, results, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs =\ [[x[i] for x in statuses_results_lengths_tuples] for i in range(5)] reports = [] nucmer_statuses = dict(zip(contigs_fpaths, statuses)) aligned_lengths_per_fpath = dict(zip(contigs_fpaths, aligned_lengths)) misc.contigs_aligned_lengths = dict(zip(contigs_fpaths, aligned_lengths_by_contigs)) if NucmerStatus.OK in nucmer_statuses.values(): if qconfig.is_combined_ref: save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger) for index, fname in enumerate(contigs_fpaths): report = reporting.get(fname) if statuses[index] == NucmerStatus.OK: reports.append(save_result(results[index], report, fname, reference)) elif statuses[index] == NucmerStatus.NOT_ALIGNED: save_result_for_unaligned(results[index], report) if NucmerStatus.OK in nucmer_statuses.values(): reporting.save_misassemblies(output_dir) reporting.save_unaligned(output_dir) from . 
import plotter if qconfig.draw_plots: plotter.draw_misassemblies_plot(reports, join(output_dir, 'misassemblies_plot'), 'Misassemblies') if qconfig.draw_plots or qconfig.html_report: misassemblies_in_contigs = dict((contigs_fpaths[i], misassemblies_in_contigs[i]) for i in range(len(contigs_fpaths))) plotter.frc_plot(dirname(output_dir), reference, contigs_fpaths, misc.contigs_aligned_lengths, misassemblies_in_contigs, join(output_dir, 'misassemblies_frcurve_plot'), 'misassemblies') oks = list(nucmer_statuses.values()).count(NucmerStatus.OK) not_aligned = list(nucmer_statuses.values()).count(NucmerStatus.NOT_ALIGNED) failed = list(nucmer_statuses.values()).count(NucmerStatus.FAILED) errors = list(nucmer_statuses.values()).count(NucmerStatus.ERROR) problems = not_aligned + failed + errors total = len(nucmer_statuses) logger._num_nf_errors = num_nf_errors + errors if oks == total: logger.main_info('Done.') if oks < total and problems < total: logger.main_info('Done for ' + str(total - problems) + ' out of ' + str(total) + '. For the rest, only basic stats are going to be evaluated.') if problems == total: logger.main_info('Failed aligning the contigs for all the assemblies. Only basic stats are going to be evaluated.') if not qconfig.test and is_emem_aligner(): logger.warning('Please rerun QUAST using --test option to ensure that E-MEM aligner works properly.') return nucmer_statuses, aligned_lengths_per_fpath
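# A tiny sketch (not QUAST code) of the 'unzipping' idiom above, which turns the list of
# per-assembly 5-tuples returned by align_and_analyze into five parallel lists
# (statuses, results, aligned lengths, ...). Names are illustrative.
def transpose_tuples(tuples, width):
    """[(a1, b1, ...), (a2, b2, ...)] -> [[a1, a2, ...], [b1, b2, ...], ...]"""
    return [[t[i] for t in tuples] for i in range(width)]

# transpose_tuples([(1, 'x'), (2, 'y')], 2) -> [[1, 2], ['x', 'y']]
# (equivalent to list(map(list, zip(*tuples))) when every tuple has the same width)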
def do(contigs_fpaths, output_dir, logger): logger.print_timestamp() logger.info('Running BUSCO...') compilation_success = True augustus_dirpath = download_augustus(logger) if not augustus_dirpath: compilation_success = False elif not compile_tool('Augustus', augustus_dirpath, [join('bin', 'augustus')], logger=logger): compilation_success = False if compilation_success and not download_blast_binaries( logger=logger, filenames=blast_filenames): compilation_success = False if not compilation_success: logger.info('Failed finding conservative genes.') return if not os.path.isdir(output_dir): os.makedirs(output_dir) tmp_dir = join(output_dir, 'tmp') if not os.path.isdir(tmp_dir): os.makedirs(tmp_dir) n_jobs = min(len(contigs_fpaths), qconfig.max_threads) busco_threads = max(1, qconfig.max_threads // n_jobs) clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote, is_fungus=qconfig.is_fungus) if not clade_dirpath: logger.info('Failed finding conservative genes.') return config_fpath = make_config(output_dir, tmp_dir, busco_threads, clade_dirpath, augustus_dirpath) logger.info('Logs and results will be saved under ' + output_dir + '...') os.environ['BUSCO_CONFIG_FILE'] = config_fpath os.environ['AUGUSTUS_CONFIG_PATH'] = copy_augustus_configs( augustus_dirpath, tmp_dir) if not os.environ['AUGUSTUS_CONFIG_PATH']: logger.error( 'Augustus configs not found, failed to run BUSCO without them.') busco_args = [[ contigs_fpath, qutils.label_from_fpath_for_fname(contigs_fpath) ] for contigs_fpath in contigs_fpaths] summary_fpaths = run_parallel(busco_main_handler, busco_args, qconfig.max_threads) if not any(fpath for fpath in summary_fpaths): logger.error( 'Failed running BUSCO for all the assemblies. See log files in ' + output_dir + ' for information ' '(rerun with --debug to keep all intermediate files).') return # saving results zero_output_for_all = True for i, contigs_fpath in enumerate(contigs_fpaths): report = reporting.get(contigs_fpath) if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]): total_buscos, part_buscos, complete_buscos = 0, 0, 0 with open(summary_fpaths[i]) as f: for line in f: if 'Complete BUSCOs' in line: complete_buscos = int(line.split()[0]) elif 'Fragmented' in line: part_buscos = int(line.split()[0]) elif 'Total' in line: total_buscos = int(line.split()[0]) if total_buscos != 0: report.add_field( reporting.Fields.BUSCO_COMPLETE, ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos))) report.add_field(reporting.Fields.BUSCO_PART, ('%.2f' % (float(part_buscos) * 100.0 / total_buscos))) if complete_buscos + part_buscos > 0: zero_output_for_all = False shutil.copy(summary_fpaths[i], output_dir) else: logger.error( 'Failed running BUSCO for ' + contigs_fpath + '. See the log for detailed information' ' (rerun with --debug to keep all intermediate files).') if zero_output_for_all: logger.warning( 'BUSCO did not fail explicitly but found nothing for all assemblies! ' 'Possible reasons and workarounds:\n' ' 1. Provided assemblies are so small that they do not contain even a single partial BUSCO gene. Not likely but may happen -- nothing to worry then.\n' ' 2. Incorrect lineage database was used. To run with fungi DB use --fungus, to run with eukaryota DB use --eukaryote, otherwise BUSCO uses bacteria DB.\n' ' 3. Problem with BUSCO dependencies, most likely Augustus. 
Check that the binaries in ' + augustus_dirpath + '/bin/ are working properly.\n' ' If something is wrong with Augustus, you may try to install it yourself (https://github.com/Gaius-Augustus/Augustus) and add "augustus" binary to PATH.\n' ' 4. Some other problem with BUSCO. Check the logs (you may need to rerun QUAST with --debug to see all intermediate files).\n' ' If you cannot solve the problem yourself, post an issue at https://github.com/ablab/quast/issues or write to [email protected]' ) if not qconfig.debug: cleanup(output_dir) logger.info('Done.')
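# A stand-alone sketch (not QUAST code) of the BUSCO short-summary parsing above: pick up
# the 'Complete BUSCOs', 'Fragmented' and 'Total' count lines and convert them to
# percentages. The example lines below are made up but follow the count-first layout
# the code above relies on (line.split()[0]).
def parse_busco_summary(lines):
    total, fragmented, complete = 0, 0, 0
    for line in lines:
        if 'Complete BUSCOs' in line:
            complete = int(line.split()[0])
        elif 'Fragmented' in line:
            fragmented = int(line.split()[0])
        elif 'Total' in line:
            total = int(line.split()[0])
    if total == 0:
        return None, None  # nothing found; the caller above treats this as zero output
    return complete * 100.0 / total, fragmented * 100.0 / total

# parse_busco_summary(['10 Complete BUSCOs (C)', '2 Fragmented BUSCOs (F)',
#                      '20 Total BUSCO groups searched'])  # -> (50.0, 10.0)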
def do(contigs_fpaths, output_dir, logger): logger.print_timestamp() logger.info('Running BUSCO...') compilation_success = True augustus_dirpath = download_augustus(logger) if not augustus_dirpath: compilation_success = False elif not compile_tool('Augustus', augustus_dirpath, [join('bin', 'augustus')], logger=logger): compilation_success = False if compilation_success and not download_blast_binaries( logger=logger, filenames=blast_filenames): compilation_success = False if not compilation_success: logger.info('Failed finding conservative genes.') return set_augustus_dir(augustus_dirpath) if not os.path.isdir(output_dir): os.makedirs(output_dir) tmp_dir = join(output_dir, 'tmp') if not os.path.isdir(tmp_dir): os.makedirs(tmp_dir) n_jobs = min(len(contigs_fpaths), qconfig.max_threads) busco_threads = max(1, qconfig.max_threads // n_jobs) clade_dirpath = download_db(logger, is_prokaryote=qconfig.prokaryote) if not clade_dirpath: logger.info('Failed finding conservative genes.') return log_fpath = join(output_dir, 'busco.log') logger.info('Logging to ' + log_fpath + '...') busco_args = [([ '-i', contigs_fpath, '-o', qutils.label_from_fpath_for_fname(contigs_fpath), '-l', clade_dirpath, '-m', 'genome', '-f', '-z', '-c', str(busco_threads), '-t', tmp_dir, '--augustus_parameters=\'--AUGUSTUS_CONFIG_PATH=' + join(augustus_dirpath, 'config') + '\'' ], output_dir) for contigs_fpath in contigs_fpaths] summary_fpaths = run_parallel(busco.main, busco_args, qconfig.max_threads) if not any(fpath for fpath in summary_fpaths): logger.error('Failed running BUSCO for all the assemblies. See ' + log_fpath + ' for information.') return # saving results for i, contigs_fpath in enumerate(contigs_fpaths): report = reporting.get(contigs_fpath) if summary_fpaths[i] and os.path.isfile(summary_fpaths[i]): total_buscos, part_buscos, complete_buscos = 0, 0, 0 with open(summary_fpaths[i]) as f: for line in f: if 'Complete BUSCOs' in line: complete_buscos = int(line.split()[0]) elif 'Fragmented' in line: part_buscos = int(line.split()[0]) elif 'Total' in line: total_buscos = int(line.split()[0]) if total_buscos != 0: report.add_field( reporting.Fields.BUSCO_COMPLETE, ('%.2f' % (float(complete_buscos) * 100.0 / total_buscos))) report.add_field(reporting.Fields.BUSCO_PART, ('%.2f' % (float(part_buscos) * 100.0 / total_buscos))) else: logger.error('Failed running BUSCO for ' + contigs_fpath + '. See ' + log_fpath + ' for information.') logger.info('Done.')
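# A minimal sketch (not QUAST code) of the thread budgeting used here and in several
# other steps: split the global --threads limit between assemblies processed in
# parallel, giving each job at least one thread. Assumes n_assemblies >= 1 and
# max_threads >= 1; names are illustrative.
def split_threads(n_assemblies, max_threads):
    n_jobs = min(n_assemblies, max_threads)
    threads_per_job = max(1, max_threads // n_jobs)
    return n_jobs, threads_per_job

# split_threads(3, 8) -> (3, 2); split_threads(10, 4) -> (4, 1)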
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath, aligned_lengths_lists, aligned_stats_dirpath): if not os.path.isdir(aligned_stats_dirpath): os.mkdir(aligned_stats_dirpath) ######################################################################## report_dict = {'header': []} for contigs_fpath in aligned_contigs_fpaths: report_dict[qutils.name_from_fpath(contigs_fpath)] = [] ######################################################################## logger.print_timestamp() logger.main_info('Running NA-NGA calculation...') ref_chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath) reference_length = sum(ref_chr_lengths.values()) assembly_lengths = [] for contigs_fpath in aligned_contigs_fpaths: assembly_lengths.append(sum(fastaparser.get_chr_lengths_from_fastafile(contigs_fpath).values())) import N50 for i, (contigs_fpath, lens, assembly_len) in enumerate( itertools.izip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)): na50 = N50.NG50(lens, assembly_len) na75 = N50.NG50(lens, assembly_len, 75) la50 = N50.LG50(lens, assembly_len) la75 = N50.LG50(lens, assembly_len, 75) if not qconfig.is_combined_ref: nga50 = N50.NG50(lens, reference_length) nga75 = N50.NG50(lens, reference_length, 75) lga50 = N50.LG50(lens, reference_length) lga75 = N50.LG50(lens, reference_length, 75) logger.info(' ' + qutils.index_to_str(i) + qutils.label_from_fpath(contigs_fpath) + ', Largest alignment = ' + str(max(lens)) + ', NA50 = ' + str(na50) + (', NGA50 = ' + str(nga50) if not qconfig.is_combined_ref and nga50 else '') + ', LA50 = ' + str(la50) + (', LGA50 = ' + str(lga50) if not qconfig.is_combined_ref and lga50 else '')) report = reporting.get(contigs_fpath) report.add_field(reporting.Fields.LARGALIGN, max(lens)) report.add_field(reporting.Fields.TOTAL_ALIGNED_LEN, sum(lens)) report.add_field(reporting.Fields.NA50, na50) report.add_field(reporting.Fields.NA75, na75) report.add_field(reporting.Fields.LA50, la50) report.add_field(reporting.Fields.LA75, la75) if not qconfig.is_combined_ref: report.add_field(reporting.Fields.NGA50, nga50) report.add_field(reporting.Fields.NGA75, nga75) report.add_field(reporting.Fields.LGA50, lga50) report.add_field(reporting.Fields.LGA75, lga75) ######################################################################## num_contigs = max([len(aligned_lengths_lists[i]) for i in range(len(aligned_lengths_lists))]) if json_output_dirpath: from quast_libs.html_saver import json_saver json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths, assembly_lengths) # saving to html if qconfig.html_report: from quast_libs.html_saver import html_saver html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths) import plotter if qconfig.draw_plots: # Drawing cumulative plot (aligned contigs)... plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists, os.path.join(aligned_stats_dirpath, 'cumulative_plot'), 'Cumulative length (aligned contigs)') # Drawing NAx and NGAx plots... 
plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths, aligned_lengths_lists, aligned_stats_dirpath + '/NAx_plot', 'NAx', assembly_lengths, json_output_dir=json_output_dirpath) if not qconfig.is_combined_ref: plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths, aligned_lengths_lists, aligned_stats_dirpath + '/NGAx_plot', 'NGAx', [reference_length for i in range(len(aligned_contigs_fpaths))], json_output_dir=json_output_dirpath) logger.main_info('Done.') return report_dict
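# A stand-alone sketch (not QUAST code) of the Nx/Lx arithmetic behind NA50/NGA50 above:
# both use the same computation over aligned-block lengths, differing only in the
# denominator (assembly length for NAx/LAx, reference length for NGAx/LGAx).
def nx_and_lx(lengths, total_length, percent=50):
    """Smallest block length L such that blocks >= L cover `percent`% of
    `total_length`, plus the number of blocks needed to reach that coverage."""
    threshold = total_length * percent / 100.0
    covered = 0
    for count, length in enumerate(sorted(lengths, reverse=True), start=1):
        covered += length
        if covered >= threshold:
            return length, count
    return None, None  # blocks do not cover the required fraction of total_length

# nx_and_lx([80, 70, 50, 40], total_length=300) -> (70, 2)
# nx_and_lx([80, 70, 50, 40], total_length=1000) -> (None, None)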
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath, genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath): nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output') from quast_libs import search_references_meta if search_references_meta.is_quast_first_run: nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw') logger.print_timestamp() logger.main_info('Running Genome analyzer...') if not os.path.isdir(genome_stats_dirpath): os.mkdir(genome_stats_dirpath) reference_chromosomes = {} genome_size = 0 for name, seq in fastaparser.read_fasta(ref_fpath): chr_name = name.split()[0] chr_len = len(seq) genome_size += chr_len reference_chromosomes[chr_name] = chr_len # reading genome size # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0] # reading reference name # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome # ref_file = open(reference, 'r') # reference_name = ref_file.readline().split()[0][1:] # ref_file.close() # RESULTS file result_fpath = genome_stats_dirpath + '/genome_info.txt' res_file = open(result_fpath, 'w') genes_container = FeatureContainer(genes_fpaths, 'gene') operons_container = FeatureContainer(operons_fpaths, 'operon') for container in [genes_container, operons_container]: if not container.fpaths: logger.notice('No file with ' + container.kind + 's provided. ' 'Use the -' + container.kind[0].capitalize() + ' option ' 'if you want to specify it.', indent=' ') continue for fpath in container.fpaths: container.region_list += genes_parser.get_genes_from_file(fpath, container.kind) if len(container.region_list) == 0: logger.warning('No ' + container.kind + 's were loaded.', indent=' ') res_file.write(container.kind + 's loaded: ' + 'None' + '\n') else: logger.info(' Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's') res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n') container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list, list(reference_chromosomes.keys())) for contigs_fpath in aligned_contigs_fpaths: report = reporting.get(contigs_fpath) if genes_container.fpaths: report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list)) if operons_container.fpaths: report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list)) # for cumulative plots: files_genes_in_contigs = {} # "filename" : [ genes in sorted contigs (see below) ] files_operons_in_contigs = {} # for histograms genome_mapped = [] full_found_genes = [] full_found_operons = [] # process all contig files num_nf_errors = logger._num_nf_errors n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads) if is_python2(): from joblib import Parallel, delayed else: from joblib3 import Parallel, delayed process_results = Parallel(n_jobs=n_jobs)(delayed(process_single_file)( contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath, reference_chromosomes, genes_container, operons_container) for index, contigs_fpath in enumerate(aligned_contigs_fpaths)) num_nf_errors += len([res for res in process_results if res is None]) logger._num_nf_errors = num_nf_errors process_results = [res for res in process_results if res] if not process_results: logger.main_info('Genome analyzer failed for all the assemblies.') res_file.close() return ref_lengths = [process_results[i][0] for i in range(len(process_results))] results_genes_operons_tuples = [process_results[i][1] for 
i in range(len(process_results))] for ref in reference_chromosomes: ref_lengths_by_contigs[ref] = [ref_lengths[i][ref] for i in range(len(ref_lengths))] res_file.write('reference chromosomes:\n') for chr_name, chr_len in reference_chromosomes.items(): aligned_len = max(ref_lengths_by_contigs[chr_name]) res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n') res_file.write('\n') res_file.write('total genome size: ' + str(genome_size) + '\n\n') res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n') res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n') # header # header res_file.write('\n\n') res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial')) res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons')) res_file.write('================================================================================================================\n') for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(aligned_contigs_fpaths, results_genes_operons_tuples): assembly_name = qutils.name_from_fpath(contigs_fpath) files_genes_in_contigs[contigs_fpath] = genes_in_contigs files_operons_in_contigs[contigs_fpath] = operons_in_contigs full_found_genes.append(sum(genes_in_contigs)) full_found_operons.append(sum(operons_in_contigs)) covered_bp = results["covered_bp"] gaps_count = results["gaps_count"] genes_full = results[reporting.Fields.GENES + "_full"] genes_part = results[reporting.Fields.GENES + "_partial"] operons_full = results[reporting.Fields.OPERONS + "_full"] operons_part = results[reporting.Fields.OPERONS + "_partial"] report = reporting.get(contigs_fpath) genome_fraction = float(covered_bp) * 100 / float(genome_size) duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) + report.get_field(reporting.Fields.MISINTERNALOVERLAP) + report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) - report.get_field(reporting.Fields.UNALIGNEDBASES)) /\ ((genome_fraction / 100.0) * float(genome_size)) res_file.write('%-25s| %-10s| %-12s| %-10s|' % (assembly_name[:24], '%3.5f%%' % genome_fraction, '%1.5f' % duplication_ratio, gaps_count)) report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction) report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio) genome_mapped.append(genome_fraction) for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part), (reporting.Fields.OPERONS, operons_full, operons_part)]: if full is None and part is None: res_file.write(' %-10s| %-10s|' % ('-', '-')) else: res_file.write(' %-10s| %-10s|' % (full, part)) report.add_field(field, '%s + %s part' % (full, part)) res_file.write('\n') res_file.close() if genes_container.region_list: ref_genes_num = len(genes_container.region_list) else: ref_genes_num = None if operons_container.region_list: ref_operons_num = len(operons_container.region_list) else: ref_operons_num = None # saving json if json_output_dirpath: if genes_container.region_list: json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num) if operons_container.region_list: json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num) if 
qconfig.html_report: from quast_libs.html_saver import html_saver if genes_container.region_list: html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num) if operons_container.region_list: html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num) if qconfig.draw_plots: # cumulative plots: from . import plotter if genes_container.region_list: plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths, files_genes_in_contigs, genome_stats_dirpath + '/genes_cumulative_plot', 'genes') plotter.histogram(aligned_contigs_fpaths, full_found_genes, genome_stats_dirpath + '/complete_genes_histogram', '# complete genes') if operons_container.region_list: plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths, files_operons_in_contigs, genome_stats_dirpath + '/operons_cumulative_plot', 'operons') plotter.histogram(aligned_contigs_fpaths, full_found_operons, genome_stats_dirpath + '/complete_operons_histogram', '# complete operons') plotter.histogram(aligned_contigs_fpaths, genome_mapped, genome_stats_dirpath + '/genome_fraction_histogram', 'Genome fraction, %', top_value=100) logger.main_info('Done.') return [genes_container, operons_container]
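# A stand-alone sketch (not QUAST code) of the duplication-ratio arithmetic above:
# aligned assembly bases divided by the reference bases they cover. The parameter
# names mirror the report fields used in the expression, not a public API.
def duplication_ratio(total_len, mis_internal_overlap, ambiguous_extra_bases,
                      unaligned_bases, genome_fraction_percent, genome_size):
    aligned_assembly_bases = (total_len + mis_internal_overlap
                              + ambiguous_extra_bases - unaligned_bases)
    covered_reference_bases = (genome_fraction_percent / 100.0) * genome_size
    return aligned_assembly_bases / covered_reference_bases

# duplication_ratio(1050000, 0, 0, 50000, 95.0, 1000000) -> ~1.053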
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir): logger.print_timestamp() logger.main_info("Running Basic statistics processor...") if not os.path.isdir(output_dirpath): os.mkdir(output_dirpath) reference_length = None reference_lengths = [] reference_fragments = None if ref_fpath: reference_lengths = sorted( fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(), reverse=True) reference_fragments = len(reference_lengths) reference_length = sum(reference_lengths) reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content( ref_fpath) logger.info(' Reference genome:') logger.info(' ' + os.path.basename(ref_fpath) + ', length = ' + str(reference_length) + ', num fragments = ' + str(reference_fragments) + ', GC % = ' + '%.2f' % reference_GC if reference_GC is not None else 'undefined') if reference_fragments > 30 and not qconfig.check_for_fragmented_ref: logger.warning( ' Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.' ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).' ) elif qconfig.estimated_reference_size: reference_length = qconfig.estimated_reference_size reference_lengths = [reference_length] logger.info(' Estimated reference length = ' + str(reference_length)) logger.info(' Contig files: ') lists_of_lengths = [] numbers_of_Ns = [] coverage_dict = dict() cov_pattern = re.compile(r'_cov_(\d+\.?\d*)') for id, contigs_fpath in enumerate(contigs_fpaths): coverage_dict[contigs_fpath] = [] assembly_label = qutils.label_from_fpath(contigs_fpath) logger.info(' ' + qutils.index_to_str(id) + assembly_label) # lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath)) list_of_length = [] number_of_Ns = 0 is_potential_scaffold = False for (name, seq) in fastaparser.read_fasta(contigs_fpath): list_of_length.append(len(seq)) number_of_Ns += seq.count('N') if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold( seq): is_potential_scaffold = True qconfig.potential_scaffolds_assemblies.append(assembly_label) if cov_pattern.findall(name): cov = int(float(cov_pattern.findall(name)[0])) if len(coverage_dict[contigs_fpath]) <= cov: coverage_dict[contigs_fpath] += [0] * ( cov - len(coverage_dict[contigs_fpath]) + 1) coverage_dict[contigs_fpath][cov] += len(seq) lists_of_lengths.append(list_of_length) numbers_of_Ns.append(number_of_Ns) lists_of_lengths = [ sorted(list, reverse=True) for list in lists_of_lengths ] num_contigs = max( [len(list_of_length) for list_of_length in lists_of_lengths]) multiplicator = 1 if num_contigs >= (qconfig.max_points * 2): import math multiplicator = int(num_contigs / qconfig.max_points) max_points = num_contigs // multiplicator corr_lists_of_lengths = [[ sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)]) for i in range(1, max_points) if (i * multiplicator) < len(list_of_length) ] for list_of_length in lists_of_lengths] if len(reference_lengths) > 1: reference_lengths = [ sum(reference_lengths[( (i - 1) * multiplicator):(i * multiplicator)]) if (i * multiplicator) < len(reference_lengths) else sum( reference_lengths[((i - 1) * multiplicator):]) for i in range(1, max_points) ] + [sum(reference_lengths[(max_points - 1) * multiplicator:])] for num_list in range(len(corr_lists_of_lengths)): last_index = len(corr_lists_of_lengths[num_list]) corr_lists_of_lengths[num_list].append( sum(lists_of_lengths[num_list][last_index * multiplicator:])) else: 
corr_lists_of_lengths = [ sorted(list, reverse=True) for list in lists_of_lengths ] if reference_lengths: # Saving for an HTML report if qconfig.html_report: from quast_libs.html_saver import html_saver html_saver.save_reference_lengths(results_dir, reference_lengths) if qconfig.html_report: from quast_libs.html_saver import html_saver html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths) html_saver.save_tick_x(results_dir, multiplicator) ######################################################################## logger.info(' Calculating N50 and L50...') list_of_GC_distributions = [] list_of_GC_contigs_distributions = [] largest_contig = 0 from . import N50 for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate( zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)): report = reporting.get(contigs_fpath) n50, l50 = N50.N50_and_L50(lengths_list) ng50, lg50 = None, None if reference_length: ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length) n75, l75 = N50.N50_and_L50(lengths_list, 75) ng75, lg75 = None, None if reference_length: ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75) total_length = sum(lengths_list) total_GC, GC_distribution, GC_contigs_distribution = GC_content( contigs_fpath, skip=qconfig.no_gc) list_of_GC_distributions.append(GC_distribution) list_of_GC_contigs_distributions.append(GC_contigs_distribution) logger.info(' ' + qutils.index_to_str(id) + qutils.label_from_fpath(contigs_fpath) + \ ', N50 = ' + str(n50) + \ ', L50 = ' + str(l50) + \ ', Total length = ' + str(total_length) + \ ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') + \ ', # N\'s per 100 kbp = ' + ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)) if total_length != 0 else 'undefined') report.add_field(reporting.Fields.N50, n50) report.add_field(reporting.Fields.L50, l50) if reference_length and not qconfig.is_combined_ref: report.add_field(reporting.Fields.NG50, ng50) report.add_field(reporting.Fields.LG50, lg50) report.add_field(reporting.Fields.N75, n75) report.add_field(reporting.Fields.L75, l75) if reference_length and not qconfig.is_combined_ref: report.add_field(reporting.Fields.NG75, ng75) report.add_field(reporting.Fields.LG75, lg75) report.add_field(reporting.Fields.CONTIGS, len(lengths_list)) if lengths_list: report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list)) largest_contig = max(largest_contig, max(lengths_list)) report.add_field(reporting.Fields.TOTALLEN, total_length) if not qconfig.is_combined_ref: report.add_field( reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None)) report.add_field(reporting.Fields.UNCALLED, number_of_Ns) report.add_field( reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)))) if ref_fpath: report.add_field(reporting.Fields.REFLEN, int(reference_length)) report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments) if not qconfig.is_combined_ref: report.add_field( reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None)) elif reference_length: report.add_field(reporting.Fields.ESTREFLEN, int(reference_length)) import math qconfig.min_difference = math.ceil( (largest_contig / 1000) / 600) # divide on height of plot list_of_GC_distributions_with_ref = list_of_GC_distributions reference_index = None if ref_fpath: reference_index = len(list_of_GC_distributions_with_ref) list_of_GC_distributions_with_ref.append(reference_GC_distribution) if 
qconfig.html_report and not qconfig.no_gc: from quast_libs.html_saver import html_saver html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref, list_of_GC_contigs_distributions, reference_index) ######################################################################## # Drawing Nx and NGx plots... plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'Nx_plot'), 'Nx', []) if reference_length and not qconfig.is_combined_ref: plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'NGx_plot'), 'NGx', [reference_length for i in range(len(contigs_fpaths))]) if qconfig.draw_plots: ########################################################################import plotter # Drawing cumulative plot... plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'cumulative_plot'), 'Cumulative length') if not qconfig.no_gc: ######################################################################## # Drawing GC content plot... plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, join(output_dirpath, 'GC_content_plot')) for contigs_fpath, GC_distribution in zip( contigs_fpaths, list_of_GC_contigs_distributions): plotter.contigs_GC_content_plot( contigs_fpath, GC_distribution, join( output_dirpath, qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot')) if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths): draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath) logger.main_info('Done.')
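# A stand-alone sketch (not QUAST code) of how per-contig coverage is harvested above
# from SPAdes-style contig names (the '_cov_<float>' suffix) into a histogram indexed
# by integer coverage and weighted by contig length. Names are illustrative.
import re

COV_PATTERN = re.compile(r'_cov_(\d+\.?\d*)')

def add_contig_to_coverage_histogram(histogram, contig_name, contig_length):
    """histogram[c] accumulates the total length of contigs whose name reports coverage ~c."""
    found = COV_PATTERN.findall(contig_name)
    if not found:
        return histogram
    cov = int(float(found[0]))
    if len(histogram) <= cov:
        histogram += [0] * (cov - len(histogram) + 1)
    histogram[cov] += contig_length
    return histogram

# add_contig_to_coverage_histogram([], 'NODE_1_length_5000_cov_12.7', 5000)
# -> histogram with histogram[12] == 5000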
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, features_dict, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath): coords_dirpath = os.path.join(detailed_contigs_reports_dirpath, qconfig.minimap_output_dirname) from quast_libs import search_references_meta if search_references_meta.is_quast_first_run: coords_dirpath = os.path.join(coords_dirpath, 'raw') logger.print_timestamp() logger.main_info('Running Genome analyzer...') if not os.path.isdir(genome_stats_dirpath): os.mkdir(genome_stats_dirpath) genome_size, reference_chromosomes, ns_by_chromosomes = fastaparser.get_genome_stats( ref_fpath) # reading genome size # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0] # reading reference name # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome # ref_file = open(reference, 'r') # reference_name = ref_file.readline().split()[0][1:] # ref_file.close() # RESULTS file result_fpath = os.path.join(genome_stats_dirpath, 'genome_info.txt') res_file = open(result_fpath, 'w') containers = [] for feature, feature_fpath in features_dict.items(): containers.append(FeatureContainer([feature_fpath], feature)) if not features_dict: logger.notice( 'No file with genomic features were provided. ' 'Use the --features option if you want to specify it.\n', indent=' ') if operons_fpaths: containers.append(FeatureContainer(operons_fpaths, 'operon')) else: logger.notice( 'No file with operons were provided. ' 'Use the -O option if you want to specify it.', indent=' ') for container in containers: if not container.fpaths: continue for fpath in container.fpaths: container.region_list += genes_parser.get_genes_from_file( fpath, container.kind) if len(container.region_list) == 0: logger.warning('No genomic features of type "' + container.kind + '" were loaded.', indent=' ') res_file.write('Genomic features of type "' + container.kind + '" loaded: ' + 'None' + '\n') else: logger.info(' Loaded ' + str(len(container.region_list)) + ' genomic features of type "' + container.kind + '"') res_file.write('Genomic features of type "' + container.kind + '" loaded: ' + str(len(container.region_list)) + '\n') container.chr_names_dict = chromosomes_names_dict( container.kind, container.region_list, list(reference_chromosomes.keys())) ref_genes_num, ref_operons_num = None, None for contigs_fpath in aligned_contigs_fpaths: report = reporting.get(contigs_fpath) genomic_features = 0 for container in containers: if container.kind == 'operon': ref_operons_num = len(container.region_list) report.add_field(reporting.Fields.REF_OPERONS, len(container.region_list)) else: genomic_features += len(container.region_list) if genomic_features: ref_genes_num = genomic_features report.add_field(reporting.Fields.REF_GENES, genomic_features) # for cumulative plots: files_features_in_contigs = { } # "filename" : [ genes in sorted contigs (see below) ] files_unsorted_features_in_contigs = { } # "filename" : [ genes in sorted contigs (see below) ] files_operons_in_contigs = {} files_unsorted_operons_in_contigs = {} # for histograms genome_mapped = [] full_found_genes = [] full_found_operons = [] # process all contig files num_nf_errors = logger._num_nf_errors n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads) if is_python2(): from joblib2 import Parallel, delayed else: from joblib3 import Parallel, delayed if not qconfig.memory_efficient: process_results = Parallel(n_jobs=n_jobs)( delayed(process_single_file)( contigs_fpath, index, coords_dirpath, 
genome_stats_dirpath, reference_chromosomes, ns_by_chromosomes, containers) for index, contigs_fpath in enumerate(aligned_contigs_fpaths)) else: process_results = [ process_single_file(contigs_fpath, index, coords_dirpath, genome_stats_dirpath, reference_chromosomes, ns_by_chromosomes, containers) for index, contigs_fpath in enumerate(aligned_contigs_fpaths) ] num_nf_errors += len([res for res in process_results if res is None]) logger._num_nf_errors = num_nf_errors process_results = [res for res in process_results if res] if not process_results: logger.main_info('Genome analyzer failed for all the assemblies.') res_file.close() return ref_lengths = [process_results[i][0] for i in range(len(process_results))] results_genes_operons_tuples = [ process_results[i][1] for i in range(len(process_results)) ] for ref in reference_chromosomes: ref_lengths_by_contigs[ref] = [ ref_lengths[i][ref] for i in range(len(ref_lengths)) ] res_file.write('reference chromosomes:\n') for chr_name, chr_len in reference_chromosomes.items(): aligned_len = max(ref_lengths_by_contigs[chr_name]) res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, ' + 'total length without N\'s: ' + str(chr_len - len(ns_by_chromosomes[chr_name])) + ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n') res_file.write('\n') res_file.write('total genome size: ' + str(genome_size) + '\n\n') res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n') res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n') # header # header res_file.write('\n\n') res_file.write( '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial')) res_file.write( '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons')) res_file.write('=' * 120 + '\n') for contigs_fpath, (results, unsorted_features_in_contigs, features_in_contigs, unsorted_operons_in_contigs, operons_in_contigs)\ in zip(aligned_contigs_fpaths, results_genes_operons_tuples): assembly_name = qutils.name_from_fpath(contigs_fpath) files_features_in_contigs[contigs_fpath] = features_in_contigs files_unsorted_features_in_contigs[ contigs_fpath] = unsorted_features_in_contigs files_operons_in_contigs[contigs_fpath] = operons_in_contigs files_unsorted_operons_in_contigs[ contigs_fpath] = unsorted_operons_in_contigs full_found_genes.append(sum(features_in_contigs)) full_found_operons.append(sum(operons_in_contigs)) gaps_count = results["gaps_count"] genes_full = results[reporting.Fields.GENES + "_full"] genes_part = results[reporting.Fields.GENES + "_partial"] operons_full = results[reporting.Fields.OPERONS + "_full"] operons_part = results[reporting.Fields.OPERONS + "_partial"] report = reporting.get(contigs_fpath) res_file.write( '%-25s| %-10s| %-12s| %-10s|' % (assembly_name[:24], report.get_field( reporting.Fields.MAPPEDGENOME), report.get_field(reporting.Fields.DUPLICATION_RATIO), gaps_count)) genome_mapped.append( float(report.get_field(reporting.Fields.MAPPEDGENOME))) for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part), (reporting.Fields.OPERONS, operons_full, operons_part)]: if full is None and part is None: res_file.write(' %-10s| %-10s|' % ('-', '-')) else: res_file.write(' %-10s| %-10s|' % (full, part)) report.add_field(field, '%s + %s part' % (full, part)) res_file.write('\n') res_file.close() if qconfig.html_report: from quast_libs.html_saver 
import html_saver if ref_genes_num: html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'features', files_features_in_contigs, ref_genes_num) if ref_operons_num: html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num) if qconfig.draw_plots: # cumulative plots: from . import plotter from quast_libs.ca_utils.misc import contigs_aligned_lengths if ref_genes_num: plotter.genes_operons_plot( ref_genes_num, aligned_contigs_fpaths, files_features_in_contigs, genome_stats_dirpath + '/features_cumulative_plot', 'genomic features') plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths, files_unsorted_features_in_contigs, genome_stats_dirpath + '/features_frcurve_plot', 'genomic features') plotter.histogram( aligned_contigs_fpaths, full_found_genes, genome_stats_dirpath + '/complete_features_histogram', '# complete genomic features') if ref_operons_num: plotter.genes_operons_plot( ref_operons_num, aligned_contigs_fpaths, files_operons_in_contigs, genome_stats_dirpath + '/operons_cumulative_plot', 'operons') plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths, files_unsorted_operons_in_contigs, genome_stats_dirpath + '/operons_frcurve_plot', 'operons') plotter.histogram( aligned_contigs_fpaths, full_found_operons, genome_stats_dirpath + '/complete_operons_histogram', '# complete operons') plotter.histogram(aligned_contigs_fpaths, genome_mapped, genome_stats_dirpath + '/genome_fraction_histogram', 'Genome fraction, %', top_value=100) logger.main_info('Done.') return containers
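# A minimal sketch (not QUAST code) of the per-chromosome summary written above: each
# reference sequence's total length, its length without N's, and the best covered
# length across assemblies. Assumes ns_by_chromosomes maps a chromosome name to a
# collection of N positions and per_assembly_ref_lengths holds one
# {chromosome: covered_length} dict per assembly; names are illustrative.
def summarize_reference_coverage(reference_chromosomes, ns_by_chromosomes,
                                 per_assembly_ref_lengths):
    rows = []
    for chr_name, chr_len in reference_chromosomes.items():
        best_covered = max(lengths.get(chr_name, 0)
                           for lengths in per_assembly_ref_lengths)
        rows.append((chr_name, chr_len,
                     chr_len - len(ns_by_chromosomes.get(chr_name, ())),
                     best_covered))
    return rows

# summarize_reference_coverage({'chr1': 100}, {'chr1': {10, 11}}, [{'chr1': 80}, {'chr1': 90}])
# -> [('chr1', 100, 98, 90)]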