def align_contigs(output_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath, index, threads,
                  log_out_fpath, log_err_fpath):
    """Align contigs to the reference with minimap, reusing earlier results when possible.

    Alignments are looked for in this order:
      1. coords left by the combined_reference stage (symlinked to ``output_fpath``);
      2. alignments from a previous run that pass ``check_successful_check``;
      3. a fresh ``run_minimap`` call whose output is filtered by
         ``parse_minimap_output``.

    Progress messages are written to ``log_out_fpath`` (truncated on entry);
    ``log_err_fpath`` is handed over to ``run_minimap``.

    Returns an ``AlignerStatus``: ``OK`` on success, ``ERROR`` when minimap exits
    with a non-zero code, ``FAILED`` when it produced no output file and
    ``NOT_ALIGNED`` when the output file is empty.
    """
    successful_check_fpath = out_basename + '.sf'

    # The context manager guarantees the log is flushed and closed on every
    # return path below.
    with open(log_out_fpath, 'w') as log_out_f:
        log_out_f.write('Aligning contigs to reference...\n')

        # Special case: if there is a need to reuse alignments from the combined_reference stage
        if qconfig.alignments_for_reuse_dirpath is not None and \
                os.path.isdir(qconfig.alignments_for_reuse_dirpath):
            _, coords_to_reuse_fname, _, _ = get_aux_out_fpaths(os.path.basename(out_basename))
            coords_to_reuse_fpath = os.path.join(qconfig.alignments_for_reuse_dirpath, coords_to_reuse_fname)
            if isfile(coords_to_reuse_fpath):
                # symlink coords.filtered from combined_reference stage to coords in the current run.
                # islink() also catches a dangling link, which isfile() reports as missing
                # and which would otherwise make os.symlink() fail.
                if isfile(output_fpath) or os.path.islink(output_fpath):
                    os.remove(output_fpath)
                os.symlink(os.path.relpath(coords_to_reuse_fpath, os.path.dirname(output_fpath)),
                           output_fpath)
                log_out_f.write('\tReusing alignments from the combined_reference stage...\n')
                logger.info(' ' + qutils.index_to_str(index) +
                            'Reusing alignments from the combined_reference stage... ')
                return AlignerStatus.OK
        # Nothing was reused: switch the reuse mode off.
        qconfig.alignments_for_reuse_dirpath = None

        # Checking if there are existing previous alignments.
        # If they exist, using them to save time.
        if isfile(successful_check_fpath) and isfile(output_fpath):
            if check_successful_check(successful_check_fpath, old_contigs_fpath, ref_fpath):
                log_out_f.write('\tUsing existing alignments...\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Using existing alignments... ')
                return AlignerStatus.OK

        log_out_f.write('\tAligning contigs to the reference\n')
        logger.info(' ' + qutils.index_to_str(index) + 'Aligning contigs to the reference')

        tmp_output_fpath = output_fpath + '_tmp'
        exit_code = run_minimap(tmp_output_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, threads)
        if exit_code != 0:
            return AlignerStatus.ERROR

        if not isfile(tmp_output_fpath):
            return AlignerStatus.FAILED
        if not is_non_empty_file(tmp_output_fpath):
            return AlignerStatus.NOT_ALIGNED

        create_successful_check(successful_check_fpath, old_contigs_fpath, ref_fpath)
        log_out_f.write('Filtering alignments...\n')
        parse_minimap_output(tmp_output_fpath, output_fpath)
        return AlignerStatus.OK
def draw_mummer_plot(logger, nucmer_fpath, delta_fpath, index, log_out_f, log_err_f):
    """Render a MUMmer dot plot as a self-contained HTML file.

    Runs ``mummerplot`` on ``delta_fpath``, then gnuplot on the generated
    ``<nucmer_fpath>.gp`` script, and finally copies ``<nucmer_fpath>.html``
    (passed through ``_embed_css_and_scripts``) to
    ``<grandparent dir of nucmer_fpath>/<basename>_mummerplot.html``.

    ``log_out_f`` / ``log_err_f`` are open file objects receiving the tools'
    output. Nothing is returned; a notice is logged when the final plot file
    does not exist.
    """
    import os  # local import: only needed for os.devnull

    output_dirpath = dirname(dirname(nucmer_fpath))
    mummer_plot_fpath = join(output_dirpath, basename(nucmer_fpath) + '_mummerplot.html')
    return_code = qutils.call_subprocess(
        [bin_fpath('mummerplot'), '--html', '--layout', '-p', nucmer_fpath, delta_fpath],
        stdout=log_out_f,
        stderr=log_err_f,
        indent=' ' + qutils.index_to_str(index))
    if return_code == 0:
        plot_script_fpath = nucmer_fpath + '.gp'
        temp_plot_fpath = nucmer_fpath + '.html'
        if isfile(plot_script_fpath) and isfile(gnuplot_exec_fpath()):
            # gnuplot's stdout is discarded; os.devnull is the portable spelling of
            # '/dev/null' and the handle is closed as soon as gnuplot finishes.
            with open(os.devnull, 'w') as devnull:
                qutils.call_subprocess(
                    [gnuplot_exec_fpath(), plot_script_fpath],
                    stdout=devnull, stderr=log_err_f,
                    indent=' ' + qutils.index_to_str(index))
            if isfile(temp_plot_fpath):
                with open(temp_plot_fpath) as template_file:
                    html = template_file.read()
                html = _embed_css_and_scripts(html)
                with open(mummer_plot_fpath, 'w') as f_html:
                    f_html.write(html)
                logger.info(' ' + qutils.index_to_str(index) + 'MUMmer plot saved to ' + mummer_plot_fpath)

    if not isfile(mummer_plot_fpath):
        logger.notice(qutils.index_to_str(index) + ' MUMmer plot cannot be created.\n')
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function,
                  prokaryote, num_threads):
    """Predict genes for one assembly with a GeneMark runner and summarise them.

    ``gmhmm_p_function`` does the actual prediction; its error log goes to
    ``<out_dirpath>/<label>_genemark.stderr``. When genes are found they are
    written to a GFF file (and to FASTA if ``OUTPUT_FASTA`` is set).

    Returns ``(genes, unique_count, count)`` where ``count[k]`` is the number
    of genes with ``end - start`` greater than ``gene_lengths[k]``. Both
    counters are ``None`` when the runner returns a falsy result.
    """
    label = qutils.label_from_fpath_for_fname(contigs_fpath)
    logger.info(' ' + qutils.index_to_str(index) + label)

    stderr_fpath = os.path.join(out_dirpath, label + '_genemark.stderr')
    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, stderr_fpath, index, tmp_dirpath, num_threads)

    # Nothing to report without predicted genes.
    if not genes:
        return genes, None, None

    tool_name = "genemark"
    gz_suffix = '' if qconfig.no_gzip else '.gz'
    gff_fpath = os.path.join(out_dirpath, label + '_' + tool_name + '_genes.gff' + gz_suffix)
    add_genes_to_gff(genes, gff_fpath, prokaryote)
    if OUTPUT_FASTA:
        fasta_fpath = os.path.join(out_dirpath, label + '_' + tool_name + '_genes.fasta')
        add_genes_to_fasta(genes, fasta_fpath)

    # One counter per length threshold: genes strictly longer than it.
    count = []
    for min_length in gene_lengths:
        count.append(sum(1 for gene in genes if gene.end - gene.start > min_length))

    # A gene is identified by its sequence, falling back to its name.
    unique_count = len(set(gene.seq or gene.name for gene in genes))
    total_count = len(genes)

    logger.info(' ' + qutils.index_to_str(index) + ' Genes = ' + str(unique_count) + ' unique, ' +
                str(total_count) + ' total')
    logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + gff_fpath)
    return genes, unique_count, count
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE shell script for one assembly.

    The script's stdout/stderr are written to
    ``gage_<label>.stdout`` / ``gage_<label>.stderr`` inside
    ``gage_results_dirpath`` (both truncated first).

    Returns the exit code reported by ``qutils.call_subprocess``.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stderr')
    logger.info(' ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')

    # Context managers close both logs even if the subprocess call raises.
    with open(log_out_fpath, 'w') as log_out_f, open(log_err_fpath, 'w') as log_err_f:
        return_code = qutils.call_subprocess(
            ['sh', gage_tool_path, abspath(ca_utils.misc.contig_aligner_dirpath), reference,
             contigs_fpath, tmp_dir, str(qconfig.min_contig)],
            stdout=log_out_f,
            stderr=log_err_f,
            indent=' ' + qutils.index_to_str(i),
            only_if_debug=False)

    if return_code != 0:
        logger.info(' ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info(' ' + qutils.index_to_str(i) + 'Done.')
    return return_code
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function,
                  prokaryote, num_threads):
    """Run a GeneMark predictor on one assembly and collect gene statistics.

    The human-readable label is used for logging, the file-name-safe label
    for all output paths. Prediction itself is delegated to
    ``gmhmm_p_function``.

    Returns ``(genes, unique_count, count)``; ``unique_count`` and ``count``
    are ``None`` if the predictor returned a falsy value, otherwise ``count``
    holds, per entry of ``gene_lengths``, how many genes have
    ``end - start`` above that value.
    """
    shown_label = qutils.label_from_fpath(contigs_fpath)
    fname_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(index) + shown_label)

    genes = gmhmm_p_function(
        tool_dirpath, contigs_fpath,
        os.path.join(out_dirpath, fname_label + '_genemark.stderr'),
        index, tmp_dirpath, num_threads)

    unique_count = None
    count = None  # [None] * len(gene_lengths)
    if genes:
        tool_name = "genemark"
        stem = fname_label + '_' + tool_name
        out_gff_fpath = os.path.join(
            out_dirpath, stem + '_genes.gff' + ('' if qconfig.no_gzip else '.gz'))
        add_genes_to_gff(genes, out_gff_fpath, prokaryote)
        if OUTPUT_FASTA:
            add_genes_to_fasta(genes, os.path.join(out_dirpath, stem + '_genes.fasta'))

        # Genes strictly longer than each requested threshold.
        gene_spans = [gene.end - gene.start for gene in genes]
        count = [sum(1 for span in gene_spans if span > threshold) for threshold in gene_lengths]

        # Uniqueness is judged by sequence when present, else by name.
        unique_ids = set()
        for gene in genes:
            unique_ids.add(gene.seq or gene.name)
        unique_count = len(unique_ids)
        total_count = len(genes)

        logger.info(' ' + qutils.index_to_str(index) + ' Genes = ' + str(unique_count) + ' unique, ' +
                    str(total_count) + ' total')
        logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + out_gff_fpath)

    return genes, unique_count, count
def align_contigs(output_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath, index, threads,
                  log_out_fpath, log_err_fpath):
    """Align contigs to the reference with minimap unless valid alignments already exist.

    Existing alignments are reused when both ``<out_basename>.sf`` and
    ``output_fpath`` exist and ``check_successful_check`` accepts them.
    Otherwise ``run_minimap`` is executed and its raw output is filtered into
    ``output_fpath`` by ``parse_minimap_output``.

    Progress is written to ``log_out_fpath`` (truncated on entry);
    ``log_err_fpath`` is passed on to ``run_minimap``.

    Returns an ``AlignerStatus``: ``OK``, ``ERROR`` (non-zero minimap exit
    code), ``FAILED`` (no output file) or ``NOT_ALIGNED`` (empty output file).
    """
    successful_check_fpath = out_basename + '.sf'

    # Opening the log in a ``with`` block closes it on every return path.
    with open(log_out_fpath, 'w') as log_out_f:
        log_out_f.write('Aligning contigs to reference...\n')

        # Checking if there are existing previous alignments.
        # If they exist, using them to save time.
        using_existing_alignments = False
        if isfile(successful_check_fpath) and isfile(output_fpath):
            if check_successful_check(successful_check_fpath, old_contigs_fpath, ref_fpath):
                log_out_f.write('\tUsing existing alignments...\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Using existing alignments... ')
                using_existing_alignments = True

        if not using_existing_alignments:
            log_out_f.write('\tAligning contigs to the reference\n')
            logger.info(' ' + qutils.index_to_str(index) + 'Aligning contigs to the reference')

            tmp_output_fpath = output_fpath + '_tmp'
            exit_code = run_minimap(tmp_output_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, threads)
            if exit_code != 0:
                return AlignerStatus.ERROR

            if not isfile(tmp_output_fpath):
                return AlignerStatus.FAILED
            if not is_non_empty_file(tmp_output_fpath):
                return AlignerStatus.NOT_ALIGNED

            create_successful_check(successful_check_fpath, old_contigs_fpath, ref_fpath)
            log_out_f.write('Filtering alignments...\n')
            parse_minimap_output(tmp_output_fpath, output_fpath)
        return AlignerStatus.OK
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath):
    """Run GlimmerHMM on one assembly and log a short summary.

    Output and stderr files are named ``<label>_glimmer`` and
    ``<label>_glimmer.stderr`` inside ``out_dirpath``, where ``<label>`` is
    the file-name-safe assembly label.

    Returns ``(genes, unique, full_genes, partial_genes)`` exactly as
    produced by ``glimmerHMM``.
    """
    shown_label = qutils.label_from_fpath(contigs_fpath)
    fname_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(index) + shown_label)

    glimmer_stem = os.path.join(out_dirpath, fname_label + '_glimmer')
    stderr_fpath = os.path.join(out_dirpath, fname_label + '_glimmer.stderr')

    result = glimmerHMM(tool_dirpath, contigs_fpath, glimmer_stem, gene_lengths, stderr_fpath,
                        tmp_dirpath, index)
    gff_path, genes, unique, total, full_genes, partial_genes = result

    # The summary is logged only when a GFF file was produced.
    if gff_path:
        for message in (' Genes = ' + str(unique) + ' unique, ' + str(total) + ' total',
                        ' Predicted genes (GFF): ' + gff_path):
            logger.info(' ' + qutils.index_to_str(index) + message)

    return genes, unique, full_genes, partial_genes
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE shell script for one assembly and return its exit code.

    stdout and stderr of the script are captured in ``gage_<label>.stdout``
    and ``gage_<label>.stderr`` under ``gage_results_dirpath``; both files
    are truncated before the run.
    """
    assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_label + '.stderr')
    logger.info(' ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')

    # ``with`` guarantees both logs are closed, including when the call raises.
    with open(log_out_fpath, 'w') as log_out_f, open(log_err_fpath, 'w') as log_err_f:
        return_code = qutils.call_subprocess(
            ['sh', gage_tool_path, abspath(ca_utils.misc.contig_aligner_dirpath), reference,
             contigs_fpath, tmp_dir, str(qconfig.min_contig)],
            stdout=log_out_f,
            stderr=log_err_f,
            indent=' ' + qutils.index_to_str(i),
            only_if_debug=False)

    if return_code != 0:
        logger.info(' ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info(' ' + qutils.index_to_str(i) + 'Done.')
    return return_code
def do(contigs_fpaths, output_dir, logger):
    """Predict ribosomal RNA genes with Barrnap for every assembly and report them.

    For each path in ``contigs_fpaths`` a ``<label>.rna.gff`` file is
    expected in ``output_dir`` after ``run`` has been executed in parallel.
    Every GFF that exists is parsed and the ``RNA_GENES`` report field is
    filled with ``"<complete> + <partial> part"``.

    Nothing is returned. Assemblies without a GFF file are reported via
    ``logger.error`` and skipped.
    """
    logger.print_timestamp()
    logger.info('Running Barrnap...')

    # max(1, ...) keeps the division below safe when no assemblies were given.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    threads = max(1, qconfig.max_threads // n_jobs)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    log_fpath = join(output_dir, 'barrnap.log')
    logger.info('Logging to ' + log_fpath + '...')

    kingdom = 'bac' if qconfig.prokaryote else 'euk'
    gff_fpaths = [
        join(output_dir, qutils.label_from_fpath_for_fname(contigs_fpath) + '.rna.gff')
        for contigs_fpath in contigs_fpaths
    ]

    barrnap_args = [
        (contigs_fpath, gff_fpath, log_fpath, threads, kingdom)
        for contigs_fpath, gff_fpath in zip(contigs_fpaths, gff_fpaths)
    ]
    run_parallel(run, barrnap_args, qconfig.max_threads)

    # The paths themselves are always non-empty strings, so success has to be
    # judged by whether any of the files was actually produced.
    if not any(os.path.isfile(fpath) for fpath in gff_fpaths):
        logger.info('Failed predicting the location of ribosomal RNA genes.')
        return

    # saving results
    for index, (contigs_fpath, gff_fpath) in enumerate(zip(contigs_fpaths, gff_fpaths)):
        report = reporting.get(contigs_fpath)

        # Check before opening: a missing GFF must be reported, not crash the loop.
        if not os.path.isfile(gff_fpath):
            logger.error('Failed running Barrnap for ' + contigs_fpath + '. See ' + log_fpath + ' for information.')
            continue

        with open(gff_fpath) as gff_file:
            genes = parse_gff(gff_file, 'rrna')
            part_count = len([
                gene for gene in genes
                if 'product' in gene.attributes and 'partial' in gene.attributes['product']
            ])
            total_count = len(genes)

        report.add_field(
            reporting.Fields.RNA_GENES,
            '%s + %s part' % (total_count - part_count, part_count))

        logger.info(' ' + qutils.index_to_str(index) + ' Ribosomal RNA genes = ' + str(total_count))
        logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + gff_fpath)

    logger.info('Done.')
def run_minimap(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads):
    """Run minimap to align ``contigs_fpath`` against ``ref_fpath``.

    In AGV mode the call is delegated to ``run_minimap_agv``. Otherwise the
    preset is ``asm5`` when ``qconfig.min_IDY >= 95`` and the reference is not
    a combined one, else ``asm10``; the extra tuning options are dropped for
    large genomes.

    The aligner's stdout is written to ``out_fpath`` (truncated) and its
    stderr is appended to ``log_err_fpath``. Returns the exit code.
    """
    if qconfig.is_agv_mode:
        return run_minimap_agv(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads)

    preset = 'asm5' if qconfig.min_IDY >= 95 and not qconfig.is_combined_ref else 'asm10'
    # -s -- min CIGAR score, -z -- affects how often to stop alignment extension, -B -- mismatch penalty
    # -O -- gap penalty, -r -- max gap size
    mask_level = '1' if qconfig.is_combined_ref else '0.9'
    num_alignments = '100' if qconfig.is_combined_ref else '50'
    additional_options = [
        '-B5', '-O4,16', '--no-long-join', '-r', str(qconfig.MAX_INDEL_LENGTH),
        '-N', num_alignments, '-s', str(qconfig.min_alignment), '-z', '200'
    ]
    cmdline = [minimap_fpath(), '-c', '-x', preset] + \
              (additional_options if not qconfig.large_genome else []) + \
              ['--mask-level', mask_level, '--min-occ', '200', '-g', '2500', '--score-N', '2', '--cs',
               '-t', str(max_threads), ref_fpath, contigs_fpath]

    # Both handles are closed as soon as the aligner has finished.
    with open(out_fpath, 'w') as out_f, open(log_err_fpath, 'a') as err_f:
        return qutils.call_subprocess(cmdline, stdout=out_f, stderr=err_f,
                                      indent=' ' + qutils.index_to_str(index))
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Predict genes with GeneMarkS: train with ``gmsn.pl``, then run ``gmhmmp``.

    A fresh temporary directory is created under ``tmp_dirpath`` for the
    intermediate files. Tool output is written to ``err_fpath`` (truncated
    for the first step, appended to for the second). ``num_threads`` is
    accepted but not used here.

    Returns the list of parsed genes (possibly empty), or ``None`` when
    ``gmsn.pl`` exits with a non-zero code. The temporary directory is
    removed on every exit path unless ``qconfig.debug`` is set.
    """
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)
    try:
        tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
        fasta_name = qutils.name_from_fpath(fasta_fpath)
        # Close the log before it is reopened in append mode below.
        with open(err_fpath, 'w') as err_file:
            return_code = qutils.call_subprocess(
                ['perl', tool_exec_fpath, '--name', fasta_name, '--clean', '--out', tmp_dirpath, fasta_fpath],
                stdout=err_file,
                stderr=err_file,
                indent=' ' + qutils.index_to_str(index))
        if return_code != 0:
            return None

        genes = []
        tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
        sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
        out_fpath = sub_fasta_fpath + '.gmhmm'
        heu_fpath = os.path.join(tmp_dirpath, fasta_name + '_hmm_heuristic.mod')
        with open(err_fpath, 'a') as err_file:
            ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath, out_fpath, err_file, index)
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))
        return genes
    finally:
        # Intermediate files are kept only for debugging.
        if not qconfig.debug:
            shutil.rmtree(tmp_dirpath)
def run_minimap_agb(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads):
    """Run minimap2 with the option set used for AGB.

    The ``asm20`` preset is used; ``--mask-level`` is ``1`` when
    ``qconfig.min_IDY < 95`` and ``0.9`` otherwise.

    stdout goes to ``out_fpath`` (truncated), stderr is appended to
    ``log_err_fpath``. Returns the aligner's exit code.
    """
    # run minimap2 for AGB
    mask_level = '1' if qconfig.min_IDY < 95 else '0.9'
    cmdline = [minimap_fpath(), '-cx', 'asm20', '--mask-level', mask_level, '-N', '100',
               '--score-N', '0', '-E', '1,0', '-f', '200', '--cs', '-t', str(max_threads),
               ref_fpath, contigs_fpath]

    # Handles are released deterministically once the subprocess is done.
    with open(out_fpath, 'w') as out_f, open(log_err_fpath, 'a') as err_f:
        return qutils.call_subprocess(cmdline, stdout=out_f, stderr=err_f,
                                      indent=' ' + qutils.index_to_str(index))
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Run the two-step GeneMarkS pipeline (``gmsn.pl`` then ``gmhmmp``) on a FASTA file.

    Intermediate files live in a temporary directory created inside
    ``tmp_dirpath``. Output of both tools is collected in ``err_fpath``:
    the file is truncated for ``gmsn.pl`` and appended to for ``gmhmmp``.
    ``num_threads`` is part of the common runner signature and is unused.

    Returns a list of genes parsed from the ``gmhmmp`` output (empty if that
    step failed), or ``None`` if ``gmsn.pl`` returned a non-zero exit code.
    Unless ``qconfig.debug`` is set, the temporary directory is always
    deleted before returning.
    """
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)
    try:
        tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
        fasta_name = qutils.name_from_fpath(fasta_fpath)
        # ``with`` closes the handle before the file is reopened for appending.
        with open(err_fpath, 'w') as err_file:
            return_code = qutils.call_subprocess([
                'perl', tool_exec_fpath, '--name', fasta_name, '--clean', '--out',
                tmp_dirpath, fasta_fpath
            ], stdout=err_file, stderr=err_file, indent=' ' + qutils.index_to_str(index))
        if return_code != 0:
            return None

        genes = []
        tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
        sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
        out_fpath = sub_fasta_fpath + '.gmhmm'
        heu_fpath = os.path.join(tmp_dirpath, fasta_name + '_hmm_heuristic.mod')
        with open(err_fpath, 'a') as err_file:
            ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath, out_fpath, err_file, index)
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))
        return genes
    finally:
        # Keep the intermediate files only when debugging.
        if not qconfig.debug:
            shutil.rmtree(tmp_dirpath)
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Predict genes with GeneMark-ES (``gmes_petap.pl``).

    The working directory is ``tmp_dirpath`` with the FASTA name appended by
    plain string concatenation; it is created if missing. The tool's stdout
    and stderr are written to ``err_fpath`` (truncated). ``--fungus`` is
    added when ``qconfig.is_fungus`` is set.

    Returns the genes parsed from every ``*gtf`` file name found under the
    working directory, or ``None`` when the tool exits with a non-zero code.
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')
    tmp_dirpath += qutils.name_from_fpath(fasta_fpath)
    # The log is opened (and truncated) before the directory is prepared,
    # and is closed as soon as the tool has finished.
    with open(err_fpath, 'w') as err_file:
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)
        return_code = qutils.call_subprocess([
            'perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores', str(num_threads),
            '--sequence', fasta_fpath, '--out', tmp_dirpath
        ] + (['--fungus'] if qconfig.is_fungus else []),
            stdout=err_file, stderr=err_file, indent=' ' + qutils.index_to_str(index))
    if return_code != 0:
        return None

    genes = []
    fnames = [
        fname for (path, dirs, files) in os.walk(tmp_dirpath) for fname in files
    ]
    # NOTE(review): names are collected recursively but joined with the top
    # directory only -- presumably all GTF files sit directly in it; confirm.
    for fname in fnames:
        if fname.endswith('gtf'):
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
def run(contig_path, tmp_path):
    """Run the gene-finding executable on one contig file and return its exit code.

    ``err_path``, ``tool_exec``, ``trained_dir`` and ``index`` are not
    parameters; they are read from the surrounding scope. Both output streams
    of the tool are appended to ``err_path``.
    """
    with open(err_path, 'a') as log:
        command = [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path]
        return qutils.call_subprocess(
            command,
            stdout=log,
            stderr=log,
            indent=' ' + qutils.index_to_str(index) + ' ')
def run(contig_path, tmp_path):
    """Invoke the gene finder for ``contig_path``, writing results to ``tmp_path``.

    Relies on ``err_path``, ``tool_exec_fpath``, ``trained_dir`` and ``index``
    from the enclosing scope. stdout and stderr of the tool are both appended
    to ``err_path``. Returns the exit code from ``qutils.call_subprocess``.
    """
    with open(err_path, 'a') as log:
        args = [tool_exec_fpath, contig_path, '-d', trained_dir, '-g', '-o', tmp_path]
        exit_code = qutils.call_subprocess(
            args, stdout=log, stderr=log,
            indent=' ' + qutils.index_to_str(index) + ' ')
    return exit_code
def run_nucmer(prefix, ref_fpath, contigs_fpath, log_out_fpath, log_err_fpath, index, max_threads):
    """Run ``nucmer`` to align ``contigs_fpath`` against ``ref_fpath``.

    Output files are prefixed with ``prefix``; ``qconfig.min_cluster`` is used
    for both ``-c`` and ``-l`` and ``max_threads`` for ``-t``. The subprocess
    receives a copy of the current environment.

    stdout/stderr are appended to ``log_out_fpath`` / ``log_err_fpath``.
    Returns the exit code.
    """
    nucmer_cmdline = [bin_fpath('nucmer'), '-c', str(qconfig.min_cluster),
                      '-l', str(qconfig.min_cluster), '--maxmatch', '-p', prefix, '-t', str(max_threads)]
    env = os.environ.copy()
    nucmer_cmdline += [ref_fpath, contigs_fpath]

    # Both logs are closed as soon as nucmer has finished.
    with open(log_out_fpath, 'a') as log_out_f, open(log_err_fpath, 'a') as log_err_f:
        return qutils.call_subprocess(nucmer_cmdline, stdout=log_out_f, stderr=log_err_f,
                                      indent=' ' + qutils.index_to_str(index), env=env)
def gmhmm_p(tool_exec, fasta_fpath, heu_fpath, out_fpath, err_file, index):
    """Run GeneMark.hmm on ``fasta_fpath`` using the heuristic model ``heu_fpath``.

    Equivalent command line:
        prompt> gmhmmp -d -a -p 0 -m <heu_fpath> -o <out_fpath> <fasta_fpath>

    Both stdout and stderr of the tool are sent to the open file ``err_file``.
    Returns True only if the tool exits with code 0 and ``out_fpath`` exists.
    """
    command = [tool_exec, '-d', '-a', '-p', '0', '-m', heu_fpath, '-o', out_fpath, fasta_fpath]
    exit_code = qutils.call_subprocess(
        command,
        stdout=err_file,
        stderr=err_file,
        indent=' ' + qutils.index_to_str(index))
    if exit_code != 0:
        return False
    return os.path.isfile(out_fpath)
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath):
    """Run GlimmerHMM on a single assembly and log how many genes were found.

    Results are written next to ``<out_dirpath>/<label>_glimmer`` and the
    tool's error output to ``<out_dirpath>/<label>_glimmer.stderr``, with
    ``<label>`` being the file-name-safe assembly label.

    Returns ``(genes, unique, cnt)`` as delivered by ``glimmerHMM``.
    """
    shown_label = qutils.label_from_fpath(contigs_fpath)
    fname_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(index) + shown_label)

    glimmer_stem = os.path.join(out_dirpath, fname_label + '_glimmer')
    stderr_fpath = os.path.join(out_dirpath, fname_label + '_glimmer.stderr')

    glimmer_result = glimmerHMM(tool_dirpath, contigs_fpath, glimmer_stem, gene_lengths,
                                stderr_fpath, tmp_dirpath, index)
    gff_path, genes, unique, total, cnt = glimmer_result

    # Without a GFF file there is nothing to summarise.
    if not gff_path:
        return genes, unique, cnt

    logger.info(' ' + qutils.index_to_str(index) + ' Genes = ' + str(unique) + ' unique, ' +
                str(total) + ' total')
    logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + gff_path)
    return genes, unique, cnt
def run_nucmer(prefix, ref_fpath, contigs_fpath, log_out_fpath, log_err_fpath, index, emem_threads=1):
    """Run ``nucmer`` on a reference/contigs pair and return its exit code.

    ``qconfig.min_cluster`` is passed for both ``-c`` and ``-l``. The thread
    option ``-t emem_threads`` is added only when ``is_emem_aligner()`` is
    true. stdout/stderr are appended to ``log_out_fpath`` / ``log_err_fpath``.
    """
    # additional GAGE params of Nucmer: '-l', '30', '-banded'
    nucmer_cmdline = [bin_fpath('nucmer'), '-c', str(qconfig.min_cluster),
                      '-l', str(qconfig.min_cluster), '--maxmatch', '-p', prefix]
    if is_emem_aligner():
        nucmer_cmdline += ['-t', str(emem_threads)]
    nucmer_cmdline += [ref_fpath, contigs_fpath]

    # Close both log handles deterministically after the run.
    with open(log_out_fpath, 'a') as log_out_f, open(log_err_fpath, 'a') as log_err_f:
        return qutils.call_subprocess(nucmer_cmdline, stdout=log_out_f, stderr=log_err_f,
                                      indent=' ' + qutils.index_to_str(index))
def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath):
    """Copy read-mapping statistics from ``*.stat`` files into the assembly reports.

    If ``ref_fpath`` is given and ``<output_dir>/<ref name>.stat`` exists, its
    statistics are added (as ``REF_*`` fields) to the report of every
    assembly. For each assembly, ``<output_dir>/<assembly name>.stat`` is
    parsed when present and its values are added as the per-assembly fields.

    Coverage-threshold fields are added only when the parsed list is
    non-empty and has as many entries as ``qconfig.coverage_thresholds``.
    Nothing is returned.
    """
    from quast_libs import reporting

    ref_reads_stats = None
    if ref_fpath:
        ref_stats_fpath = join(output_dir, qutils.name_from_fpath(ref_fpath) + '.stat')
        if isfile(ref_stats_fpath):
            ref_reads_stats = parse_reads_stats(ref_stats_fpath)
            if int(ref_reads_stats['mapped']) == 0:
                logger.info(' BWA: nothing aligned for reference.')

    # process all contigs files
    for index, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        stats_fpath = join(output_dir, assembly_name + '.stat')

        if ref_reads_stats:
            # (report field, key in the parsed statistics), in reporting order
            ref_fields = (
                (reporting.Fields.REF_MAPPED_READS, 'mapped'),
                (reporting.Fields.REF_MAPPED_READS_PCNT, 'mapped_pcnt'),
                (reporting.Fields.REF_PROPERLY_PAIRED_READS, 'paired'),
                (reporting.Fields.REF_PROPERLY_PAIRED_READS_PCNT, 'paired_pcnt'),
                (reporting.Fields.REF_SINGLETONS, 'singletons'),
                (reporting.Fields.REF_SINGLETONS_PCNT, 'singletons_pcnt'),
                (reporting.Fields.REF_MISJOINT_READS, 'misjoint'),
                (reporting.Fields.REF_MISJOINT_READS_PCNT, 'misjoint_pcnt'),
                (reporting.Fields.REF_DEPTH, 'depth'),
            )
            for field, key in ref_fields:
                report.add_field(field, ref_reads_stats[key])

            ref_coverage = ref_reads_stats['coverage_thresholds']
            if ref_coverage and len(ref_coverage) == len(qconfig.coverage_thresholds):
                report.add_field(reporting.Fields.REF_COVERAGE__FOR_THRESHOLDS,
                                 [ref_coverage[i] for i in range(len(qconfig.coverage_thresholds))])
                report.add_field(reporting.Fields.REF_COVERAGE_1X_THRESHOLD, ref_coverage[0])

        if not isfile(stats_fpath):
            continue

        reads_stats = parse_reads_stats(stats_fpath)

        # Fields reported before the "nothing aligned" message ...
        leading_fields = (
            (reporting.Fields.TOTAL_READS, 'total'),
            (reporting.Fields.LEFT_READS, 'left'),
            (reporting.Fields.RIGHT_READS, 'right'),
            (reporting.Fields.MAPPED_READS, 'mapped'),
            (reporting.Fields.MAPPED_READS_PCNT, 'mapped_pcnt'),
            (reporting.Fields.PROPERLY_PAIRED_READS, 'paired'),
            (reporting.Fields.PROPERLY_PAIRED_READS_PCNT, 'paired_pcnt'),
        )
        for field, key in leading_fields:
            report.add_field(field, reads_stats[key])

        if int(reads_stats['mapped']) == 0:
            logger.info(' ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.')

        # ... and the ones reported after it.
        trailing_fields = (
            (reporting.Fields.SINGLETONS, 'singletons'),
            (reporting.Fields.SINGLETONS_PCNT, 'singletons_pcnt'),
            (reporting.Fields.MISJOINT_READS, 'misjoint'),
            (reporting.Fields.MISJOINT_READS_PCNT, 'misjoint_pcnt'),
            (reporting.Fields.DEPTH, 'depth'),
        )
        for field, key in trailing_fields:
            report.add_field(field, reads_stats[key])

        coverage = reads_stats['coverage_thresholds']
        if coverage and len(coverage) == len(qconfig.coverage_thresholds):
            report.add_field(reporting.Fields.COVERAGE__FOR_THRESHOLDS,
                             [coverage[i] for i in range(len(qconfig.coverage_thresholds))])
            report.add_field(reporting.Fields.COVERAGE_1X_THRESHOLD, coverage[0])
def run_nucmer(prefix, ref_fpath, contigs_fpath, log_out_fpath, log_err_fpath, index, emem_threads=1):
    """Run ``nucmer`` on a reference/contigs pair and return its exit code.

    When ``is_emem_aligner()`` is true, ``-t emem_threads`` is added and, if
    ``get_installed_emem()`` returns a path, that path is passed via
    ``--emem``. stdout/stderr are appended to ``log_out_fpath`` /
    ``log_err_fpath``.
    """
    # additional GAGE params of Nucmer: '-l', '30', '-banded'
    nucmer_cmdline = [bin_fpath('nucmer'), '-c', str(qconfig.min_cluster),
                      '-l', str(qconfig.min_cluster), '--maxmatch', '-p', prefix]
    if is_emem_aligner():
        nucmer_cmdline += ['-t', str(emem_threads)]
        installed_emem_fpath = get_installed_emem()
        if installed_emem_fpath:
            nucmer_cmdline += ['--emem', installed_emem_fpath]
    nucmer_cmdline += [ref_fpath, contigs_fpath]

    # The log handles no longer outlive the subprocess call.
    with open(log_out_fpath, 'a') as log_out_f, open(log_err_fpath, 'a') as log_err_f:
        return qutils.call_subprocess(nucmer_cmdline, stdout=log_out_f, stderr=log_err_f,
                                      indent=' ' + qutils.index_to_str(index))
def run_nucmer(prefix, ref_fpath, contigs_fpath, log_out_fpath, log_err_fpath, index, emem_threads=1):
    """Run ``nucmer`` on a reference/contigs pair and return its exit code.

    The subprocess gets a copy of the current environment. When
    ``is_emem_aligner()`` is true, ``--emem`` and ``-t emem_threads`` are
    added; if ``get_installed_emem()`` returns a path, it is passed via
    ``--emempath`` and ``NUCMER_E_MEM_OUTPUT_DIRPATH`` is set to the
    directory of ``prefix`` in that environment copy.

    stdout/stderr are appended to ``log_out_fpath`` / ``log_err_fpath``.
    """
    # additional GAGE params of Nucmer: '-l', '30', '-banded'
    nucmer_cmdline = [bin_fpath('nucmer'), '-c', str(qconfig.min_cluster),
                      '-l', str(qconfig.min_cluster), '--maxmatch', '-p', prefix]
    env = os.environ.copy()
    if is_emem_aligner():
        nucmer_cmdline += ['--emem']
        nucmer_cmdline += ['-t', str(emem_threads)]
        installed_emem_fpath = get_installed_emem()
        if installed_emem_fpath:
            env['NUCMER_E_MEM_OUTPUT_DIRPATH'] = dirname(prefix)
            nucmer_cmdline += ['--emempath', installed_emem_fpath]
    nucmer_cmdline += [ref_fpath, contigs_fpath]

    # Both logs are closed once nucmer has returned.
    with open(log_out_fpath, 'a') as log_out_f, open(log_err_fpath, 'a') as log_err_f:
        return qutils.call_subprocess(nucmer_cmdline, stdout=log_out_f, stderr=log_err_f,
                                      indent=' ' + qutils.index_to_str(index), env=env)
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Predict genes with GeneMark-ES (``gmes_petap.pl``).

    The working directory is ``tmp_dirpath`` with the FASTA name appended by
    plain string concatenation; it is created if missing. The tool's stdout
    and stderr are written to ``err_fpath`` (truncated).

    Returns the genes parsed from every ``*gtf`` file name found under the
    working directory, or ``None`` when the tool exits with a non-zero code.
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')
    tmp_dirpath += qutils.name_from_fpath(fasta_fpath)
    # The log is truncated first, then the directory is prepared; the handle
    # is closed right after the tool finishes.
    with open(err_fpath, 'w') as err_file:
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)
        return_code = qutils.call_subprocess(
            ['perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores', str(num_threads),
             '--sequence', fasta_fpath, '--out', tmp_dirpath],
            stdout=err_file,
            stderr=err_file,
            indent=' ' + qutils.index_to_str(index))
    if return_code != 0:
        return None

    genes = []
    fnames = [fname for (path, dirs, files) in os.walk(tmp_dirpath) for fname in files]
    # NOTE(review): names come from a recursive walk but are joined with the
    # top directory only -- presumably the GTF files are not nested; confirm.
    for fname in fnames:
        if fname.endswith('gtf'):
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
def run_minimap(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads):
    """Run minimap to align ``contigs_fpath`` against ``ref_fpath``.

    In AGB mode the call is delegated to ``run_minimap_agb``. Otherwise the
    preset follows ``qconfig.min_IDY``: ``asm20`` below 90, ``asm10`` below
    95, ``asm5`` from 95 up. The extra tuning options are dropped for large
    genomes.

    The aligner's stdout is written to ``out_fpath`` (truncated) and its
    stderr is appended to ``log_err_fpath``. Returns the exit code.
    """
    if qconfig.is_agb_mode:
        return run_minimap_agb(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads)

    if qconfig.min_IDY < 90:
        preset = 'asm20'
    elif qconfig.min_IDY < 95:
        preset = 'asm10'
    else:
        preset = 'asm5'
    # -s -- min CIGAR score, -z -- affects how often to stop alignment extension, -B -- mismatch penalty
    # -O -- gap penalty, -r -- max gap size
    mask_level = '1' if qconfig.is_combined_ref else '0.9'
    num_alignments = '100' if qconfig.is_combined_ref else '50'
    additional_options = ['-B5', '-O4,16', '--no-long-join', '-r', str(qconfig.MAX_INDEL_LENGTH),
                          '-N', num_alignments, '-s', str(qconfig.min_alignment), '-z', '200']
    cmdline = [minimap_fpath(), '-c', '-x', preset] + \
              (additional_options if not qconfig.large_genome else []) + \
              ['--mask-level', mask_level, '--min-occ', '200', '-g', '2500', '--score-N', '2', '--cs',
               '-t', str(max_threads), ref_fpath, contigs_fpath]

    # Output and log handles are closed as soon as the aligner exits.
    with open(out_fpath, 'w') as out_f, open(log_err_fpath, 'a') as err_f:
        return qutils.call_subprocess(cmdline, stdout=out_f, stderr=err_f,
                                      indent=' ' + qutils.index_to_str(index))
def align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                  parallel_by_chr, threads, log_out_fpath, log_err_fpath):
    """Align contigs to the reference with nucmer and post-process the result.

    Existing alignments are reused when the ``.sf`` marker, the coords file
    and (if ``qconfig.show_snps``) the SNPs file are present and
    ``check_nucmer_successful_check`` accepts them. Otherwise nucmer is run
    (per reference part when ``qconfig.splitted_ref`` is set), followed by
    ``delta-filter``, ``show-coords`` and optionally ``show-snps``.

    ``log_out_fpath`` and ``log_err_fpath`` are truncated on entry and closed
    on every return path.

    Returns a ``NucmerStatus``: ``OK``, ``ERROR`` (a tool failed), ``FAILED``
    (no coords file) or ``NOT_ALIGNED`` (last coords line has fewer than 13
    fields).
    """
    with open(log_out_fpath, 'w') as log_out_f, open(log_err_fpath, 'w') as log_err_f:
        return _align_contigs_with_nucmer(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                          parallel_by_chr, threads, log_out_fpath, log_err_fpath,
                                          log_out_f, log_err_f)


def _align_contigs_with_nucmer(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                               parallel_by_chr, threads, log_out_fpath, log_err_fpath,
                               log_out_f, log_err_f):
    # Body of align_contigs(); log_out_f / log_err_f are the already opened log files.
    nucmer_successful_check_fpath = nucmer_fpath + '.sf'
    delta_fpath = nucmer_fpath + '.delta'
    filtered_delta_fpath = nucmer_fpath + '.fdelta'

    coords_fpath, _, _, show_snps_fpath, _ = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    log_out_f.write('Aligning contigs to reference...\n')

    # Checking if there are existing previous nucmer alignments.
    # If they exist, using them to save time.
    using_existing_alignments = False
    if isfile(nucmer_successful_check_fpath) and isfile(coords_fpath) and \
            (isfile(show_snps_fpath) or isfile(show_snps_fpath + '.gz') or not qconfig.show_snps):
        if check_nucmer_successful_check(nucmer_successful_check_fpath, old_contigs_fpath, ref_fpath):
            log_out_f.write('\tUsing existing alignments...\n')
            logger.info(' ' + qutils.index_to_str(index) + 'Using existing alignments... ')
            using_existing_alignments = True

    if using_existing_alignments:
        return NucmerStatus.OK

    log_out_f.write('\tAligning contigs to the reference\n')
    logger.info(' ' + qutils.index_to_str(index) + 'Aligning contigs to the reference')

    if not qconfig.splitted_ref:
        nucmer_exit_code = run_nucmer(nucmer_fpath, ref_fpath, contigs_fpath,
                                      log_out_fpath, log_err_fpath, index, threads)
        if nucmer_exit_code != 0:
            return NucmerStatus.ERROR
    else:
        prefixes_and_chr_files = [(nucmer_fpath + "_" + basename(chr_fname), chr_fname)
                                  for chr_fname in qconfig.splitted_ref]

        # Daemonic processes are not allowed to have children,
        # so if we are already one of parallel processes
        # (i.e. daemonic) we can't start new daemonic processes
        if parallel_by_chr and not qconfig.memory_efficient:
            n_jobs = min(qconfig.max_threads, len(prefixes_and_chr_files))
            threads = max(1, threads // n_jobs)
        else:
            n_jobs = 1
            threads = 1
        if n_jobs > 1:
            logger.info(' ' + 'Aligning to different chromosomes in parallel'
                        ' (' + str(n_jobs) + ' threads)')

        # processing each chromosome separately (if we can)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        nucmer_exit_codes = Parallel(n_jobs=n_jobs)(delayed(run_nucmer)(
            prefix, chr_file, contigs_fpath, log_out_fpath, log_err_fpath + "_part%d" % (i + 1), index, threads)
            for i, (prefix, chr_file) in enumerate(prefixes_and_chr_files))

        log_err_f.write("Stderr outputs for reference parts are in:\n")
        for part_no in range(1, len(prefixes_and_chr_files) + 1):
            log_err_f.write(log_err_fpath + "_part%d" % part_no + '\n')
        log_err_f.write("\n")

        # Give up only if every single part failed.
        if 0 not in nucmer_exit_codes:
            return NucmerStatus.ERROR

        # filling common delta file
        with open(delta_fpath, 'w') as delta_file:
            delta_file.write(ref_fpath + " " + contigs_fpath + "\n")
            delta_file.write("NUCMER\n")
            for i, (prefix, chr_fname) in enumerate(prefixes_and_chr_files):
                if nucmer_exit_codes[i] != 0:
                    logger.warning(' ' + qutils.index_to_str(index) +
                                   'Failed aligning contigs %s to reference part %s! Skipping this part. ' %
                                   (qutils.label_from_fpath(contigs_fpath), chr_fname) +
                                   ('Run with the --debug flag to see additional information.'
                                    if not qconfig.debug else ''))
                    continue

                chr_delta_fpath = prefix + '.delta'
                if isfile(chr_delta_fpath):
                    with open(chr_delta_fpath) as chr_delta_file:
                        # skip the two header lines of the per-part delta file
                        chr_delta_file.readline()
                        chr_delta_file.readline()
                        for line in chr_delta_file:
                            delta_file.write(line)

    # By default: filtering by IDY% = 95 (as GAGE did)
    with open(filtered_delta_fpath, 'w') as filtered_delta_f:
        return_code = qutils.call_subprocess(
            [bin_fpath('delta-filter'), '-i', str(qconfig.min_IDY), '-l', str(qconfig.min_alignment), delta_fpath],
            stdout=filtered_delta_f,
            stderr=log_err_f,
            indent=' ' + qutils.index_to_str(index))

    if return_code != 0:
        log_err_f.write(qutils.index_to_str(index) + ' Delta filter failed for ' + contigs_fpath + '\n')
        return NucmerStatus.ERROR

    shutil.move(filtered_delta_fpath, delta_fpath)

    tmp_coords_fpath = coords_fpath + '_tmp'

    with open(tmp_coords_fpath, 'w') as tmp_coords_out_f:
        return_code = qutils.call_subprocess(
            [bin_fpath('show-coords'), delta_fpath],
            stdout=tmp_coords_out_f,
            stderr=log_err_f,
            indent=' ' + qutils.index_to_str(index))
    if return_code != 0:
        log_err_f.write(qutils.index_to_str(index) + ' Show-coords failed for ' + contigs_fpath + '\n')
        return NucmerStatus.ERROR

    # removing waste lines from coords file: keep only the last two header
    # lines (up to and including the '=====' separator) and everything after
    with open(coords_fpath, 'w') as coords_file, open(tmp_coords_fpath) as tmp_coords_file:
        header = []
        for line in tmp_coords_file:
            header.append(line)
            if line.startswith('====='):
                break
        coords_file.write(header[-2])
        coords_file.write(header[-1])
        for line in tmp_coords_file:
            coords_file.write(line)

    if not isfile(coords_fpath):
        return NucmerStatus.FAILED
    with open(coords_fpath) as coords_file:
        last_line = coords_file.readlines()[-1]
    if len(last_line.split()) < 13:
        return NucmerStatus.NOT_ALIGNED

    if qconfig.show_snps:
        # show-snps reads the coords without their two header lines from stdin
        headless_coords_fpath = coords_fpath + '.headless'
        with open(coords_fpath) as coords_file, open(headless_coords_fpath, 'w') as headless_coords_f:
            coords_file.readline()
            coords_file.readline()
            headless_coords_f.write(coords_file.read())

        with open(headless_coords_fpath) as headless_coords_f, open(show_snps_fpath, 'w') as show_snps_f:
            return_code = qutils.call_subprocess(
                [bin_fpath('show-snps'), '-S', '-T', '-H', delta_fpath],
                stdin=headless_coords_f,
                stdout=show_snps_f,
                stderr=log_err_f,
                indent=' ' + qutils.index_to_str(index))
        if return_code != 0:
            log_err_f.write(qutils.index_to_str(index) + ' Show-snps failed for ' + contigs_fpath + '\n')
            return NucmerStatus.ERROR

    create_nucmer_successful_check(nucmer_successful_check_fpath, old_contigs_fpath, ref_fpath)
    return NucmerStatus.OK
def align_single_file(fpath, main_output_dir, output_dirpath, log_path, err_fpath, max_threads,
                      sam_fpath=None, bam_fpath=None, index=None, required_files=None,
                      is_reference=False, alignment_only=False, using_reads='all'):
    """Align reads to ``fpath`` (or reuse existing SAM/BAM files) and collect read statistics.

    Existing SAM/BAM/flagstat files are reused when the chromosome names could be
    recovered from them; otherwise BWA is run on ``qconfig.reads_fpaths``.

    :param fpath: FASTA file (assembly or reference) to align reads against.
    :param main_output_dir: passed through to ``align_reads`` / ``merge_sam_files``.
    :param output_dirpath: directory for SAM/BAM files; the working directory while BWA runs.
    :param log_path: log location mentioned in the error message when BWA fails.
    :param err_fpath: file that receives stderr of the external tools (opened in append mode).
    :param max_threads: thread count handed to the external tools.
    :param sam_fpath: optional existing SAM file; derived from ``bam_fpath`` or ``fpath`` if omitted.
    :param bam_fpath: optional existing BAM file; derived from ``sam_fpath`` if omitted.
    :param index: assembly index used to build the log prefix (``None`` means no prefix).
    :param required_files: paths that must all exist for the early "reuse" shortcut.
        When a list is passed and the SAM file becomes required, it is appended to
        that same list (the caller's list is mutated, as before).
    :param is_reference: selects the reference-specific log messages.
    :param alignment_only: skip flag statistics, coverage analysis and LAP score.
    :param using_reads: reads subset label; any value other than ``'all'`` changes the
        SAM/BAM file names.
    :return: ``(correct_chr_names, sam_fpath, bam_fpath)``, or ``(None, None, None)``
        when nothing can be aligned or reused.
    """
    filename = qutils.name_from_fpath(fpath)
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or join(output_dirpath, filename + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    if using_reads != 'all':
        sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.sam')
        bam_fpath = sam_fpath.replace('.sam', '.bam')
    if alignment_only or (is_reference and required_files and
                          any(f.endswith('bed') for f in required_files)):
        # alignment_only may be set while required_files is still None;
        # appending to None used to raise AttributeError.
        if required_files is None:
            required_files = []
        required_files.append(sam_fpath)

    stats_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.stat')
    index_str = qutils.index_to_str(index) if index is not None else ''

    reads_fpaths = qconfig.reads_fpaths
    correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath,
                                                     reads_fpaths, logger, is_reference)
    can_reuse = correct_chr_names is not None
    if not can_reuse and not reads_fpaths:
        return None, None, None

    # Shortcut: everything needed is already on disk.
    if correct_chr_names and (not required_files or
                              all(isfile(required_fpath) for required_fpath in required_files)):
        if not alignment_only:
            if isfile(stats_fpath):
                logger.info(' ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
            elif isfile(bam_fpath):
                # Close both handles as soon as the subprocess has finished.
                with open(stats_fpath, 'w') as stats_f, open(err_fpath, 'a') as err_f:
                    qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t',
                                            str(max_threads), bam_fpath],
                                           stdout=stats_f, stderr=err_f)
                analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath,
                                 stats_fpath, err_fpath, logger)
            calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath,
                           filename, err_fpath)
        if isfile(stats_fpath) or alignment_only:
            return correct_chr_names, sam_fpath, bam_fpath

    logger.info(' ' + index_str + 'Pre-processing reads...')
    if is_non_empty_file(sam_fpath) and can_reuse:
        logger.info(' ' + index_str + 'Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath,
                                                         reads_fpaths, logger, is_reference)
    elif is_non_empty_file(bam_fpath) and can_reuse:
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
        # NOTE(review): this call uses qconfig.max_threads, not the max_threads argument.
        sambamba_view(bam_fpath, sam_fpath, qconfig.max_threads, err_fpath, logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath,
                                                         reads_fpaths, logger, is_reference)

    if (not correct_chr_names or not is_non_empty_file(sam_fpath)) and reads_fpaths:
        if is_reference:
            logger.info(' Running BWA for reference...')
        else:
            logger.info(' ' + index_str + 'Running BWA...')
        # use absolute paths because we will change workdir
        fpath = abspath(fpath)
        sam_fpath = abspath(sam_fpath)

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        try:
            bwa_index(fpath, err_fpath, logger)
            sam_fpaths = align_reads(fpath, sam_fpath, using_reads, main_output_dir, err_fpath,
                                     max_threads)

            if len(sam_fpaths) > 1:
                merge_sam_files(sam_fpaths, sam_fpath, bam_fpath, main_output_dir, max_threads,
                                err_fpath)
            elif len(sam_fpaths) == 1:
                shutil.move(sam_fpaths[0], sam_fpath)
                sambamba_view(sam_fpath, bam_fpath, max_threads, err_fpath, logger,
                              filter_rule=None)

            logger.info(' ' + index_str + 'Done.')
        finally:
            # Restore the working directory even if one of the tools raised.
            os.chdir(prev_dir)

        if not is_non_empty_file(sam_fpath):
            logger.error(' Failed running BWA for ' + fpath + '. See ' + log_path +
                         ' for information.')
            return None, None, None
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath,
                                                         reads_fpaths, logger, is_reference)
    elif not correct_chr_names or not is_non_empty_file(sam_fpath):
        return None, None, None

    if is_reference:
        logger.info(' Sorting SAM-file for reference...')
    else:
        logger.info(' ' + index_str + 'Sorting SAM-file...')

    if can_reuse and is_non_empty_file(bam_fpath) and all_read_names_correct(sam_fpath):
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
    else:
        correct_sam_fpath = join(output_dirpath, filename + '.correct.sam')  # write in output dir
        sam_fpath = clean_read_names(sam_fpath, correct_sam_fpath)
        sambamba_view(correct_sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

    qutils.assert_file_exists(bam_fpath, 'bam file')
    if not alignment_only:
        if isfile(stats_fpath):
            logger.info(' ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
        elif isfile(bam_fpath):
            with open(stats_fpath, 'w') as stats_f, open(err_fpath, 'a') as err_f:
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t',
                                        str(max_threads), bam_fpath],
                                       stdout=stats_f, stderr=err_f)
            analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath,
                             err_fpath, logger)
        calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename,
                       err_fpath)
        if is_reference:
            logger.info(' Analysis for reference is finished.')
        else:
            logger.info(' ' + index_str + 'Analysis is finished.')
    return correct_chr_names, sam_fpath, bam_fpath
def align_single_file(fpath, main_output_dir, output_dirpath, log_path, err_fpath, max_threads,
                      sam_fpath=None, bam_fpath=None, index=None, required_files=None,
                      is_reference=False, alignment_only=False, using_reads='all'):
    """Align reads to ``fpath`` (or reuse existing SAM/BAM files) and collect read statistics.

    Existing SAM/BAM/flagstat files are reused when the chromosome names could be
    recovered from them; otherwise BWA is run on ``qconfig.reads_fpaths``.

    :param fpath: FASTA file (assembly or reference) to align reads against.
    :param main_output_dir: passed through to ``align_reads``.
    :param output_dirpath: directory for SAM/BAM files; the working directory while BWA runs.
    :param log_path: log location mentioned in the error message when BWA fails.
    :param err_fpath: file that receives stderr of the external tools (opened in append mode).
    :param max_threads: thread count handed to the external tools.
    :param sam_fpath: optional existing SAM file; derived from ``bam_fpath`` or ``fpath`` if omitted.
    :param bam_fpath: optional existing BAM file; derived from ``sam_fpath`` if omitted.
    :param index: assembly index used to build the log prefix (``None`` means no prefix).
    :param required_files: paths that must all exist for the early "reuse" shortcut.
        When a list is passed and the SAM file becomes required, it is appended to
        that same list (the caller's list is mutated, as before).
    :param is_reference: selects the reference-specific log messages.
    :param alignment_only: skip flag statistics and coverage analysis.
    :param using_reads: reads subset label; it is part of the ``.correct.sam`` file name
        and, for any value other than ``'all'``, of the SAM/BAM file names.
    :return: ``(correct_chr_names, sam_fpath, bam_fpath)``, or ``(None, None, None)``
        when nothing can be aligned or reused.
    """
    filename = qutils.name_from_fpath(fpath)
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or join(output_dirpath, filename + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    if using_reads != 'all':
        sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.sam')
        bam_fpath = sam_fpath.replace('.sam', '.bam')
    if alignment_only or (is_reference and required_files and
                          any(f.endswith('bed') for f in required_files)):
        # alignment_only may be set while required_files is still None;
        # appending to None used to raise AttributeError.
        if required_files is None:
            required_files = []
        required_files.append(sam_fpath)

    stats_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.stat')
    index_str = qutils.index_to_str(index) if index is not None else ''

    reads_fpaths = qconfig.reads_fpaths
    correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath,
                                                     reads_fpaths, logger, is_reference)
    can_reuse = correct_chr_names is not None
    if not can_reuse and not reads_fpaths:
        return None, None, None

    # Shortcut: everything needed is already on disk.
    if correct_chr_names and (not required_files or
                              all(isfile(required_fpath) for required_fpath in required_files)):
        if not alignment_only:
            if isfile(stats_fpath):
                logger.info(' ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
            elif isfile(bam_fpath):
                # Close both handles as soon as the subprocess has finished.
                with open(stats_fpath, 'w') as stats_f, open(err_fpath, 'a') as err_f:
                    qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t',
                                            str(max_threads), bam_fpath],
                                           stdout=stats_f, stderr=err_f)
                analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath,
                                 stats_fpath, err_fpath, logger)
        if isfile(stats_fpath) or alignment_only:
            return correct_chr_names, sam_fpath, bam_fpath

    logger.info(' ' + index_str + 'Pre-processing reads...')
    if is_non_empty_file(sam_fpath) and can_reuse:
        logger.info(' ' + index_str + 'Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath,
                                                         reads_fpaths, logger, is_reference)
    elif is_non_empty_file(bam_fpath) and can_reuse:
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
        # NOTE(review): this call uses qconfig.max_threads, not the max_threads argument.
        sambamba_view(bam_fpath, sam_fpath, qconfig.max_threads, err_fpath, logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath,
                                                         reads_fpaths, logger, is_reference)

    if (not correct_chr_names or not is_non_empty_file(sam_fpath)) and reads_fpaths:
        if is_reference:
            logger.info(' Running BWA for reference...')
        else:
            logger.info(' ' + index_str + 'Running BWA...')
        # use absolute paths because we will change workdir
        fpath = abspath(fpath)
        sam_fpath = abspath(sam_fpath)

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        try:
            bwa_index(fpath, err_fpath, logger)
            sam_fpaths = align_reads(fpath, sam_fpath, using_reads, main_output_dir, err_fpath,
                                     max_threads)

            if len(sam_fpaths) > 1:
                merge_sam_files(sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath)
            elif len(sam_fpaths) == 1:
                shutil.move(sam_fpaths[0], sam_fpath)
                # If a BAM file named after the single SAM file exists, move it as well.
                tmp_bam_fpath = sam_fpaths[0].replace('.sam', '.bam')
                if is_non_empty_file(tmp_bam_fpath):
                    shutil.move(tmp_bam_fpath, bam_fpath)

            logger.info(' ' + index_str + 'Done.')
        finally:
            # Restore the working directory even if one of the tools raised.
            os.chdir(prev_dir)

        if not is_non_empty_file(sam_fpath):
            logger.error(' Failed running BWA for ' + fpath + '. See ' + log_path +
                         ' for information.')
            return None, None, None
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath,
                                                         reads_fpaths, logger, is_reference)
    elif not correct_chr_names or not is_non_empty_file(sam_fpath):
        return None, None, None

    if is_reference:
        logger.info(' Sorting SAM-file for reference...')
    else:
        logger.info(' ' + index_str + 'Sorting SAM-file...')

    if can_reuse and is_non_empty_file(bam_fpath) and all_read_names_correct(sam_fpath):
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
    else:
        # write in output dir
        correct_sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.correct.sam')
        sam_fpath = clean_read_names(sam_fpath, correct_sam_fpath)
        sambamba_view(correct_sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

    qutils.assert_file_exists(bam_fpath, 'bam file')
    if not alignment_only:
        if isfile(stats_fpath):
            logger.info(' ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
        elif isfile(bam_fpath):
            with open(stats_fpath, 'w') as stats_f, open(err_fpath, 'a') as err_f:
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t',
                                        str(max_threads), bam_fpath],
                                       stdout=stats_f, stderr=err_f)
            analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath,
                             err_fpath, logger)
        if is_reference:
            logger.info(' Analysis for reference is finished.')
        else:
            logger.info(' ' + index_str + 'Analysis is finished.')
    return correct_chr_names, sam_fpath, bam_fpath
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    """Compute basic statistics (N50/L50, GC, N's, lengths) for every assembly and draw plots.

    Results are stored through ``reporting.get(contigs_fpath).add_field(...)``; the
    function also sets ``qconfig.min_difference`` and may append labels to
    ``qconfig.potential_scaffolds_assemblies``.

    :param ref_fpath: reference FASTA path, or a falsy value when there is no reference
        (``qconfig.estimated_reference_size`` is then used if set).
    :param contigs_fpaths: assembly FASTA paths.
    :param output_dirpath: directory for plots and GC files (created if missing).
    :param results_dir: directory handed to the HTML saver and the Nx plotter.
    :return: ``(icarus_gc_fpath, circos_gc_fpath)``; each is ``None`` unless the
        corresponding reference GC file was written.
    """
    import math  # function-local import, as in the original implementation

    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    icarus_gc_fpath = None
    circos_gc_fpath = None
    if ref_fpath:
        reference_lengths = sorted(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(),
                                   reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = \
            GC_content(ref_fpath)
        if qconfig.create_icarus_html or qconfig.draw_plots:
            icarus_gc_fpath = join(output_dirpath, 'gc.icarus.txt')
            save_icarus_GC(ref_fpath, icarus_gc_fpath)
        if qconfig.draw_plots:
            circos_gc_fpath = join(output_dirpath, 'gc.circos.txt')
            save_circos_GC(ref_fpath, reference_length, circos_gc_fpath)

        logger.info(' Reference genome:')
        # The conditional must be parenthesised: without the parentheses it applied to
        # the whole concatenation and the message collapsed to just 'undefined'.
        logger.info(' ' + os.path.basename(ref_fpath) + ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) + ', GC % = ' +
                    ('%.2f' % reference_GC if reference_GC is not None else 'undefined'))
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(' Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                           ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).')
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info(' Estimated reference length = ' + str(reference_length))

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for assembly_idx, contigs_fpath in enumerate(contigs_fpaths):
        coverage_hist = coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(assembly_idx) + assembly_label)

        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            # Contig names containing "_cov_<number>" feed the coverage histogram:
            # index = integer coverage, value = total length at that coverage.
            cov_matches = cov_pattern.findall(name)
            if cov_matches:
                cov = int(float(cov_matches[0]))
                if len(coverage_hist) <= cov:
                    coverage_hist += [0] * (cov - len(coverage_hist) + 1)
                coverage_hist[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
    num_contigs = max([len(lengths) for lengths in lists_of_lengths])

    # With too many contigs, consecutive lengths are summed into bins of
    # `multiplicator` elements before being saved for the HTML report.
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [
            [sum(lengths[((i - 1) * multiplicator):(i * multiplicator)])
             for i in range(1, max_points) if (i * multiplicator) < len(lengths)]
            for lengths in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [
                sum(reference_lengths[((i - 1) * multiplicator):(i * multiplicator)])
                if (i * multiplicator) < len(reference_lengths)
                else sum(reference_lengths[((i - 1) * multiplicator):])
                for i in range(1, max_points)
            ] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        # The last bin collects everything that did not fit into a full bin.
        for binned_lengths, lengths in zip(corr_lists_of_lengths, lists_of_lengths):
            last_index = len(binned_lengths)
            binned_lengths.append(sum(lengths[last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]

    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for assembly_idx, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(contigs_fpath,
                                                                        skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)

        # Undefined (None) for an empty assembly instead of dividing by zero.
        if total_length != 0:
            ns_per_100kbp = float(number_of_Ns) * 100000.0 / float(total_length)
        else:
            ns_per_100kbp = None

        logger.info(' ' + qutils.index_to_str(assembly_idx) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' %.2f' % ns_per_100kbp if ns_per_100kbp is not None else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC,
                                 ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT,
                             ('%.2f' % ns_per_100kbp if ns_per_100kbp is not None else None))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC,
                                 ('%.2f' % reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    # Same list object as list_of_GC_distributions; the reference entry is appended to it.
    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref,
                                list_of_GC_contigs_distributions, reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                    join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths,
                        lists_of_lengths, join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length] * len(contigs_fpaths))

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                join(output_dirpath, 'cumulative_plot'), 'Cumulative length')
        if not qconfig.no_gc:
            ########################################################################
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                    join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(contigs_fpaths,
                                                      list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(
                    contigs_fpath, GC_distribution,
                    join(output_dirpath,
                         qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot'))

        if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)

    logger.main_info('Done.')
    return icarus_gc_fpath, circos_gc_fpath
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    """Run the appropriate GeneMark flavour on every assembly and report predicted genes.

    :param fasta_fpaths: assembly FASTA paths.
    :param gene_lengths: passed through to ``predict_genes``.
    :param out_dirpath: output directory (created if missing); a ``tmp`` subdirectory is
        created inside it and removed afterwards unless ``qconfig.debug`` is set.
    :param prokaryote: selects GeneMarkS (when ``meta`` is false); otherwise GeneMark-ES.
    :param meta: selects MetaGeneMark; takes precedence over ``prokaryote``.
    :return: dict mapping assembly label to the first element of its ``predict_genes``
        result, or ``None`` when gene prediction was skipped (license limitations,
        unsupported platform, or failed GeneMark installation).
    """
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(' Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
        return

    successful = install_genemark(os.path.join(qconfig.LIBS_LOCATION, 'genemark',
                                               qconfig.platform_name))
    if not successful:
        return

    if not os.path.isdir(out_dirpath):
        os.mkdir(out_dirpath)
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    # At least one job: an empty fasta_fpaths used to give n_jobs == 0 and a
    # ZeroDivisionError in the thread computation below.
    n_jobs = max(1, min(len(fasta_fpaths), qconfig.max_threads))
    num_threads = max(1, qconfig.max_threads // n_jobs)
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_genes)(index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath,
                               tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
        for index, fasta_fpath in enumerate(fasta_fpaths))

    genes_by_labels = dict()
    # saving results
    for i, fasta_path in enumerate(fasta_fpaths):
        report = reporting.get(fasta_path)
        label = qutils.label_from_fpath(fasta_path)
        genes_by_labels[label], unique_count, count = results[i]
        if unique_count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
        if count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, count)
        if unique_count is None and count is None:
            # The size hint is only added for small inputs given to GeneMark-ES.
            logger.error(' ' + qutils.index_to_str(i) + 'Failed predicting genes in ' + label + '. ' +
                         ('File may be too small for GeneMark-ES. Try to use GeneMarkS instead (remove --eukaryote option).'
                          if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000 else ''))

    if not qconfig.debug:
        # Removes every directory whose path starts with the tmp directory path.
        for dirpath in glob.iglob(tmp_dirpath + '*'):
            if os.path.isdir(dirpath):
                shutil.rmtree(dirpath)

    logger.main_info('Done.')
    return genes_by_labels
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    """Align one assembly to the reference and analyze the resulting alignments.

    :param is_cyclic: passed through to ``analyze_contigs``.
    :param index: assembly index used for log prefixes.
    :param contigs_fpath: assembly FASTA path.
    :param output_dirpath: directory for per-assembly reports and logs.
    :param ref_fpath: reference FASTA path.
    :param old_contigs_fpath: passed through to ``align_contigs``.
    :param bed_fpath: not used inside this function.
    :param parallel_by_chr: passed through to ``align_contigs``.
    :param threads: passed through to ``align_contigs``.
    :return: ``(status, result, aligned_lengths, misassemblies_in_contigs,
        aligned_lengths_by_contigs)``. When alignment did not succeed the tuple is
        ``(status, {}, [], [], [])``.
    """
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        report_basename = qconfig.contig_report_fname_pattern % corr_assembly_label
        log_out_fpath = join(output_dirpath, report_basename + '.stdout')
        log_err_fpath = join(output_dirpath, report_basename + '.stderr')
        icarus_out_fpath = join(output_dirpath,
                                qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, report_basename + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, report_basename + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous',
                          'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error(' ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.'
                              if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                # A space was missing between the path and "doesn't exist".
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' +
                                contigs_fpath + ':' + coords_fpath + ' doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' +
                            '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' +
                                contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' +
                            '\'' + assembly_label + '\'.')
        # These two files were opened above and used to leak on this early return.
        icarus_out_f.close()
        misassembly_f.close()
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    coords_filtered_file = open(coords_filtered_fpath, 'w')
    with open(coords_fpath) as coords_file:
        # Copy the two header lines into the filtered coords file.
        coords_filtered_file.write(coords_file.readline())
        coords_filtered_file.write(coords_file.readline())
        for line in coords_file:
            if line.strip() == '':
                break
            assert line[0] != '='
            # Store the parsed alignments grouped by contig name
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_lens = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        ref_lens[name] = len(seq)
        log_out_f.write('\tLoaded [%s]\n' % name)

    # Loading the SNP calls
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')

    used_snps_file = None
    snps = {}
    if qconfig.show_snps:
        prev_fields = None
        snps_file = open_gzipsafe(show_snps_fpath)
        try:
            for line in snps_file:
                fields = line.split()
                # Skip blank lines and lines that do not start with a position.
                if not fields or not fields[0].isdigit():
                    continue
                # Skip a line identical to the previously accepted one.
                if prev_fields and fields == prev_fields:
                    continue
                ref = fields[10]
                ctg = fields[11]
                pos = int(fields[0])
                loc = int(fields[3])

                snp = SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=fields[1], ctg_nucl=fields[2])
                snps.setdefault(ref, {}).setdefault(ctg, {}).setdefault(pos, []).append(snp)
                prev_fields = fields
        finally:
            snps_file.close()
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq_len in ref_lens.items():
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f,
                         coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, \
        misassemblies_in_contigs, aligned_lengths_by_contigs = \
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns,
                        ref_features, ref_lens, is_cyclic)

    # if qconfig.large_genome:
    #     log_out_f.write('Analyzing large blocks...\n')
    #     large_misassembly_fpath = add_suffix(misassembly_fpath, 'large_blocks') if not qconfig.space_efficient else '/dev/null'
    #     ca_large_output = CAOutput(stdout_f=log_out_f, misassembly_f=open(large_misassembly_fpath, 'w'),
    #                                coords_filtered_f=coords_filtered_file, used_snps_f=open('/dev/null', 'w'), icarus_out_f=open('/dev/null', 'w'))
    #     min_alignment, extensive_mis_threshold = qconfig.min_alignment, qconfig.extensive_misassembly_threshold
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = qconfig.LARGE_MIN_ALIGNMENT, qconfig.LARGE_EXTENSIVE_MIS_THRESHOLD
    #     result.update(analyze_contigs(ca_large_output, contigs_fpath, '/dev/null', '/dev/null',
    #                                   aligns, ref_features, ref_lens, is_cyclic, large_misassemblies_search=True)[0])
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = min_alignment, extensive_mis_threshold

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps,
                                   total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        # outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs]
        fastaparser.write_fasta(
            join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath,
                                    qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' +
                     qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once instead of once per contig.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    for contig in set([align.contig for align in chr_aligns]):
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Each contig is considered only for the first chromosome it is seen on.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        name_match = len_cov_pattern.search(contig)
                        if name_match:
                            contig_len, contig_cov = name_match.groups()
                            # Written only when more than 90% of the contig length is aligned.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' +
                                                       contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, \
            aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, \
            aligned_lengths_by_contigs
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    """Predict genes in every assembly with the matching GeneMark tool.

    The tool is picked from the flags: ``meta`` selects MetaGeneMark,
    otherwise ``prokaryote`` selects GeneMarkS, otherwise GeneMark-ES.
    ``predict_genes`` is called once per FASTA file (through joblib unless
    ``qconfig.memory_efficient`` is set) and the counts it returns are added
    to the report of the corresponding assembly.

    Returns a dict keyed by assembly label holding the first item of each
    ``predict_genes`` result.  Returns None when prediction is skipped:
    license limitations mode, no tool directory for this platform, the
    license key could not be installed, or ``is_license_valid`` fails after
    the run.
    """
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning(
            "GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname,
                                qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(
            ' Sorry, can\'t use %s on this platform, skipping gene prediction.'
            % tool_name)
        return
    if not install_genemark():
        logger.warning(
            ' Can\'t copy the license key to ~/.gm_key, skipping gene prediction.'
        )
        return

    if not os.path.isdir(out_dirpath):
        os.mkdir(out_dirpath)
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    # Always keep at least one job: with an empty input list (or
    # max_threads == 0) the division below would otherwise raise
    # ZeroDivisionError.
    n_jobs = max(1, min(len(fasta_fpaths), qconfig.max_threads))
    num_threads = max(1, qconfig.max_threads // n_jobs)

    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed

    if not qconfig.memory_efficient:
        results = Parallel(n_jobs=n_jobs)(
            delayed(predict_genes)(index, fasta_fpath, gene_lengths,
                                   out_dirpath, tool_dirpath, tmp_dirpath,
                                   gmhmm_p_function, prokaryote, num_threads)
            for index, fasta_fpath in enumerate(fasta_fpaths))
    else:
        # Sequential run: one prediction at a time in this process.
        results = [
            predict_genes(index, fasta_fpath, gene_lengths, out_dirpath,
                          tool_dirpath, tmp_dirpath, gmhmm_p_function,
                          prokaryote, num_threads)
            for index, fasta_fpath in enumerate(fasta_fpaths)
        ]

    if not is_license_valid(out_dirpath, fasta_fpaths):
        return

    # saving results
    genes_by_labels = dict()
    for i, fasta_path in enumerate(fasta_fpaths):
        report = reporting.get(fasta_path)
        label = qutils.label_from_fpath(fasta_path)
        predicted, unique_count, full_genes, partial_genes = results[i]
        genes_by_labels[label] = predicted

        if unique_count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE,
                             unique_count)
        if full_genes is not None:
            genes = [
                '%s + %s part' % (full_cnt, partial_cnt)
                for full_cnt, partial_cnt in zip(full_genes, partial_genes)
            ]
            report.add_field(reporting.Fields.PREDICTED_GENES, genes)
        if unique_count is None and full_genes is None:
            # The hint is only shown for GeneMark-ES on files below 2000000 bytes.
            if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000:
                hint = ('File may be too small for GeneMark-ES. Try to use '
                        'GeneMarkS instead (remove --eukaryote option).')
            else:
                hint = ''
            logger.error(' ' + qutils.index_to_str(i) +
                         'Failed predicting genes in ' + label + '. ' + hint)

    if not qconfig.debug:
        # Remove every directory whose path starts with the tmp dir path.
        for dirpath in glob.iglob(tmp_dirpath + '*'):
            if os.path.isdir(dirpath):
                shutil.rmtree(dirpath)

    logger.main_info('Done.')
    return genes_by_labels
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      reference_chromosomes, ns_by_chromosomes, old_contigs_fpath,
                      bed_fpath, threads=1):
    """Align one assembly to the reference and analyze the alignments.

    Runs ``align_contigs``, parses the resulting coords file into ``Mapping``
    objects grouped by contig, then calls ``analyze_contigs`` and
    ``analyze_coverage`` and prints the results.  When
    ``qconfig.space_efficient`` is set all log/report files are redirected to
    '/dev/null'.  ``bed_fpath`` is accepted but not used here.

    Returns a 5-tuple ``(status, result, aligned_lengths,
    misassemblies_in_contigs, aligned_lengths_by_contigs)``.  If the aligner
    status is not OK the tuple is ``(status, {}, [], [], [])``.
    """
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        report_basename = qconfig.contig_report_fname_pattern % corr_assembly_label
        log_out_fpath = join(output_dirpath, report_basename + '.stdout')
        log_err_fpath = join(output_dirpath, report_basename + '.stderr')
        icarus_out_fpath = join(output_dirpath,
                                qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, report_basename + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, report_basename + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY',
                          'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = \
        get_aux_out_fpaths(out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath, contigs_fpath,
                           old_contigs_fpath, index, threads, log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        # Nothing else will use these handles on this path, so release them
        # here instead of leaking them.
        icarus_out_f.close()
        misassembly_f.close()
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error(' ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.'
                              if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' +
                                contigs_fpath + ':' + coords_fpath + ' doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' +
                            '\'' + assembly_label + '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' +
                                contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' +
                            '\'' + assembly_label + '\'.')
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')

    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_features = {}

    # The whole reference is used: one region per chromosome (only logged).
    total_reg_len = 0
    total_regions = 0
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f,
                         coords_filtered_f=open(coords_filtered_fpath, 'w'),
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, \
        misassemblies_in_contigs, aligned_lengths_by_contigs = \
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath,
                        aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(ref_aligns, reference_chromosomes,
                                                        ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    result.update({'SNPs': total_indels_info.mismatches,
                   'indels_list': total_indels_info.indels_list,
                   'total_aligned_bases': total_aligned_bases})
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(
            join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'),
            fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath,
                                    qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' +
                     qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once; captures the length and coverage values from the contig name.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV row per chromosome: its name, then every aligned contig.
                    alignment_tsv_f.write(chr_name)
                    for contig in set([align.contig for align in chr_aligns]):
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Each contig is considered only for the first chromosome it shows up in.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        match = len_cov_pattern.search(contig)
                        if match:
                            contig_len, contig_cov = match.groups()
                            # Written only when more than 90% of the contig is aligned here.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) +
                                                       '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, \
            aligned_lengths_by_contigs
    return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, \
        aligned_lengths_by_contigs
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      reference_chromosomes, ns_by_chromosomes, old_contigs_fpath,
                      bed_fpath, threads=1):
    """Align one assembly to the reference and analyze the alignments.

    Runs ``align_contigs``, parses the resulting coords file into ``Mapping``
    objects grouped by contig, then calls ``analyze_contigs`` and
    ``analyze_coverage`` and prints the results.  When
    ``qconfig.space_efficient`` is set all log/report files are redirected to
    '/dev/null'.  ``bed_fpath`` is accepted but not used here.

    Returns a 5-tuple ``(status, result, aligned_lengths,
    misassemblies_in_contigs, aligned_lengths_by_contigs)``.  If the aligner
    status is not OK the tuple is ``(status, {}, [], [], [])``.
    """
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        report_basename = qconfig.contig_report_fname_pattern % corr_assembly_label
        log_out_fpath = join(output_dirpath, report_basename + '.stdout')
        log_err_fpath = join(output_dirpath, report_basename + '.stderr')
        icarus_out_fpath = join(output_dirpath,
                                qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, report_basename + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, report_basename + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY',
                          'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = \
        get_aux_out_fpaths(out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath, contigs_fpath,
                           old_contigs_fpath, index, threads, log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        # Nothing else will use these handles on this path, so release them
        # here instead of leaking them.
        icarus_out_f.close()
        misassembly_f.close()
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error(' ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.'
                              if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' +
                                contigs_fpath + ':' + coords_fpath + ' doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' +
                            '\'' + assembly_label + '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' +
                                contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' +
                            '\'' + assembly_label + '\'.')
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')

    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_features = {}

    # The whole reference is used: one region per chromosome (only logged).
    total_reg_len = 0
    total_regions = 0
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f,
                         coords_filtered_f=open(coords_filtered_fpath, 'w'),
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, \
        misassemblies_in_contigs, aligned_lengths_by_contigs = \
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath,
                        aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(ref_aligns, reference_chromosomes,
                                                        ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    result.update({'SNPs': total_indels_info.mismatches,
                   'indels_list': total_indels_info.indels_list,
                   'total_aligned_bases': total_aligned_bases})
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(
            join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'),
            fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath,
                                    qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' +
                     qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once; captures the length and coverage values from the contig name.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV row per chromosome: its name, then every aligned contig.
                    alignment_tsv_f.write(chr_name)
                    for contig in set([align.contig for align in chr_aligns]):
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Each contig is considered only for the first chromosome it shows up in.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        match = len_cov_pattern.search(contig)
                        if match:
                            contig_len, contig_cov = match.groups()
                            # Written only when more than 90% of the contig is aligned here.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) +
                                                       '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, \
            aligned_lengths_by_contigs
    return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, \
        aligned_lengths_by_contigs
def process_single_file(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                        reference_chromosomes, ns_by_chromosomes, containers):
    """Compute reference coverage, gaps and found genomic features for one assembly.

    Reads the coords file of the assembly (the '.filtered' one unless
    ``qconfig.use_all_alignments`` is set), marks every reference position
    covered by an alignment, clears the positions listed in
    ``ns_by_chromosomes`` and, if ``qconfig.analyze_gaps`` is set, writes the
    uncovered stretches of at least ``qconfig.min_gap_size`` positions to a
    gaps file.  For every container with a non-empty ``region_list`` the
    completely and partially covered regions are written to a
    '_genomic_features_<kind>.txt' file.

    Returns ``(ref_lengths, (results, unsorted_features_in_contigs,
    features_in_contigs, unsorted_operons_in_contigs, operons_in_contigs))``.
    The plain ``*_in_contigs`` lists follow contigs sorted by decreasing
    length, the ``unsorted_*`` ones follow the order of the FASTA file.
    Returns ``(None, None)`` if the coords file is missing or mentions a
    chromosome that is not in ``reference_chromosomes``.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = defaultdict(int)
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    coords_base_fpath = os.path.join(coords_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        coords_fpath = coords_base_fpath
    else:
        coords_fpath = coords_base_fpath + '.filtered'

    if not os.path.isfile(coords_fpath):
        logger.error('File with alignment coords (' + coords_fpath +
                     ') not found! Try to restart QUAST.', indent=' ')
        return None, None

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    # =====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

    # Per chromosome: one flag per position; index 0 is never set by the loops below.
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    sorted_contig_tuples = sorted(enumerate(contig_tuples), key=lambda x: len(x[1][1]),
                                  reverse=True)
    sorted_contigs_names = []
    contigs_order = []  # contigs_order[rank] == index of that contig in the FASTA file
    for idx, (name, _) in sorted_contig_tuples:
        sorted_contigs_names.append(name)
        contigs_order.append(idx)

    # for cumulative plots: i-th element is the number of genes in i-th (sorted) contig
    features_in_contigs = [0] * len(sorted_contigs_names)
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(containers)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []

    with open(coords_fpath) as coordfile:
        for line in coordfile:
            # Split each line once instead of once per extracted field.
            sections = line.split('|')
            ref_coords = sections[0].split()
            s1, e1 = int(ref_coords[0]), int(ref_coords[1])
            contig_coords = sections[1].split()
            s2, e2 = int(contig_coords[0]), int(contig_coords[1])
            fields = line.split()
            contig_name = fields[12]
            chr_name = fields[11]

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" +
                             coords_base_fpath + ") "
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                # Same shape as the other error return of this function.
                return None, None

            if gene_searching_enabled:
                aligned_blocks_by_contig_name[contig_name].append(
                    AlignedBlock(seqname=chr_name, start=s1, end=e1, contig=contig_name,
                                 start_in_contig=s2, end_in_contig=e2))
            chr_mapping = genome_mapping[chr_name]
            for i in range(s1, e1 + 1):
                chr_mapping[i] = 1

    for chr_name in genome_mapping.keys():
        chr_mapping = genome_mapping[chr_name]
        for i in ns_by_chromosomes[chr_name]:
            chr_mapping[i] = 0
        ref_lengths[chr_name] = sum(chr_mapping)

    if qconfig.space_efficient and coords_fpath.endswith('.filtered'):
        os.remove(coords_fpath)

    # counting genome coverage and gaps number
    gaps_count = 0
    if qconfig.analyze_gaps:
        if not qconfig.space_efficient:
            gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt')
        else:
            gaps_fpath = '/dev/null'
        with open(gaps_fpath, 'w') as gaps_file:
            for chr_name, chr_len in reference_chromosomes.items():
                gaps_file.write(chr_name + '\n')
                chr_mapping = genome_mapping[chr_name]
                cur_gap_size = 0
                for i in range(1, chr_len + 1):
                    # A covered position or one listed in ns_by_chromosomes ends the current gap.
                    if chr_mapping[i] == 1 or i in ns_by_chromosomes[chr_name]:
                        if cur_gap_size >= qconfig.min_gap_size:
                            gaps_count += 1
                            gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                        cur_gap_size = 0
                    else:
                        cur_gap_size += 1
                # Gap that runs up to the end of the chromosome.
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')

    results["gaps_count"] = gaps_count
    results[reporting.Fields.GENES + "_full"] = None
    results[reporting.Fields.GENES + "_partial"] = None
    results[reporting.Fields.OPERONS + "_full"] = None
    results[reporting.Fields.OPERONS + "_partial"] = None

    # finding genes and operons
    for container in containers:
        if not container.region_list:
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath,
                                   corr_assembly_label + '_genomic_features_' +
                                   container.kind.lower() + '.txt')
        # The context manager closes the file even if a region fails to process.
        with open(found_fpath, 'w') as found_file:
            found_file.write('%s\t\t%s\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type', 'Contig'))
            found_file.write('=' * 50 + '\n')

            for region in container.region_list:
                # 0 - gene is not found,
                # 1 - gene is found,
                # 2 - part of gene is found
                found_state = 0
                gene_blocks = []
                if region.id is None:
                    region.id = '# ' + str(region.number + 1)
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if cur_block.seqname != region.seqname:
                            continue
                        if region.end <= cur_block.start or cur_block.end <= region.start:
                            continue
                        elif cur_block.start <= region.start and region.end <= cur_block.end:
                            if found_state == 2:  # already found as partial gene
                                total_partial -= 1
                            found_state = 1
                            total_full += 1
                            contig_info = cur_block.format_gene_info(region)
                            found_file.write('%s\t\t%d\t%d\tcomplete\t%s\n' %
                                             (region.id, region.start, region.end, contig_info))
                            # inc number of found genes/operons in id-th contig
                            if container.kind == 'operon':
                                operons_in_contigs[contig_id] += 1
                            else:
                                features_in_contigs[contig_id] += 1
                            cur_feature_is_found = True
                            break
                        elif min(region.end, cur_block.end) - max(region.start, cur_block.start) \
                                >= qconfig.min_gene_overlap:
                            if found_state == 0:
                                found_state = 2
                                total_partial += 1
                            gene_blocks.append(cur_block)
                    # A complete hit stops the search over the remaining contigs too.
                    if cur_feature_is_found:
                        break

                # adding info about partially found genes/operons
                if found_state == 2:  # partial gene/operon
                    contig_info = ','.join([block.format_gene_info(region)
                                            for block in sorted(gene_blocks,
                                                                key=lambda block: block.start)])
                    found_file.write('%s\t\t%d\t%d\tpartial\t%s\n' %
                                     (region.id, region.start, region.end, contig_info))

        if container.kind == 'operon':
            results[reporting.Fields.OPERONS + "_full"] = total_full
            results[reporting.Fields.OPERONS + "_partial"] = total_partial
        else:
            # Non-operon containers are accumulated into the same gene counters.
            if results[reporting.Fields.GENES + "_full"] is None:
                results[reporting.Fields.GENES + "_full"] = 0
                results[reporting.Fields.GENES + "_partial"] = 0
            results[reporting.Fields.GENES + "_full"] += total_full
            results[reporting.Fields.GENES + "_partial"] += total_partial

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')

    # The counters are indexed by sorted rank, while contigs_order[rank] is the
    # position in the FASTA file, so the counts have to be scattered to that
    # position (indexing the counters with contigs_order mixes contigs up).
    unsorted_features_in_contigs = [0] * len(contigs_order)
    unsorted_operons_in_contigs = [0] * len(contigs_order)
    for rank, orig_idx in enumerate(contigs_order):
        unsorted_features_in_contigs[orig_idx] = features_in_contigs[rank]
        unsorted_operons_in_contigs[orig_idx] = operons_in_contigs[rank]

    return ref_lengths, (results, unsorted_features_in_contigs, features_in_contigs,
                         unsorted_operons_in_contigs, operons_in_contigs)
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Compute NA/NGA/LA/LGA statistics of aligned contigs and plot them.

    For every assembly the NA50/NA75/LA50/LA75 values are computed against the
    assembly length and, unless ``qconfig.is_combined_ref`` is set, the
    NGA/LGA counterparts against the total reference length.  The values are
    stored in the assembly report; assembly lengths are saved for the JSON and
    HTML reports when those are enabled, and the plots are produced through
    ``plotter``.

    Returns a dict with an empty list under 'header' and under the name of
    every assembly.
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    report_dict = {'header': []}
    for contigs_fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(contigs_fpath)] = []

    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values())
    assembly_lengths = []
    for contigs_fpath in aligned_contigs_fpaths:
        chr_lengths = fastaparser.get_chr_lengths_from_fastafile(contigs_fpath)
        assembly_lengths.append(sum(chr_lengths.values()))

    per_assembly = zip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)
    for i, (contigs_fpath, lens, assembly_len) in enumerate(per_assembly):
        na50 = N50.NG50(lens, assembly_len)
        na75 = N50.NG50(lens, assembly_len, 75)
        la50 = N50.LG50(lens, assembly_len)
        la75 = N50.LG50(lens, assembly_len, 75)
        # Reference-based values are only defined without a combined reference.
        has_single_ref = not qconfig.is_combined_ref
        if has_single_ref:
            nga50 = N50.NG50(lens, reference_length)
            nga75 = N50.NG50(lens, reference_length, 75)
            lga50 = N50.LG50(lens, reference_length)
            lga75 = N50.LG50(lens, reference_length, 75)

        message = (' ' + qutils.index_to_str(i) + qutils.label_from_fpath(contigs_fpath) +
                   ', Largest alignment = ' + str(max(lens)) +
                   ', NA50 = ' + str(na50))
        if has_single_ref and nga50:
            message += ', NGA50 = ' + str(nga50)
        message += ', LA50 = ' + str(la50)
        if has_single_ref and lga50:
            message += ', LGA50 = ' + str(lga50)
        logger.info(message)

        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        report.add_field(reporting.Fields.TOTAL_ALIGNED_LEN, sum(lens))
        report.add_field(reporting.Fields.NA50, na50)
        report.add_field(reporting.Fields.NA75, na75)
        report.add_field(reporting.Fields.LA50, la50)
        report.add_field(reporting.Fields.LA75, la75)
        if has_single_ref:
            report.add_field(reporting.Fields.NGA50, nga50)
            report.add_field(reporting.Fields.NGA75, nga75)
            report.add_field(reporting.Fields.LGA50, lga50)
            report.add_field(reporting.Fields.LGA75, lga75)

    # Largest number of aligned blocks among the assemblies.
    num_contigs = max([len(lengths) for lengths in aligned_lengths_lists])

    if json_output_dirpath:
        from quast_libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths,
                                         assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths,
                                         assembly_lengths)

    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
                                'Cumulative length (aligned contigs)')

    # Drawing NAx and NGAx plots...
    too_many_points = num_contigs > qconfig.max_points
    plotter.Nx_plot(output_dirpath, too_many_points, aligned_contigs_fpaths,
                    aligned_lengths_lists, aligned_stats_dirpath + '/NAx_plot', 'NAx',
                    assembly_lengths, json_output_dir=json_output_dirpath)
    if not qconfig.is_combined_ref:
        # Every assembly is measured against the same reference length.
        reference_lengths = [reference_length] * len(aligned_contigs_fpaths)
        plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points,
                        aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NGAx_plot', 'NGAx', reference_lengths,
                        json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
def process_single_file(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                        reference_chromosomes, ns_by_chromosomes, containers):
    """Compute reference coverage, gaps and found genomic features for one assembly.

    Reads the coords file of the assembly (the '.filtered' one unless
    ``qconfig.use_all_alignments`` is set), marks every reference position
    covered by an alignment, clears the positions listed in
    ``ns_by_chromosomes`` and, if ``qconfig.analyze_gaps`` is set, writes the
    uncovered stretches of at least ``qconfig.min_gap_size`` positions to a
    gaps file.  For every container with a non-empty ``region_list`` the
    completely and partially covered regions are written to a
    '_genomic_features_<kind>.txt' file.

    Returns ``(ref_lengths, (results, unsorted_features_in_contigs,
    features_in_contigs, unsorted_operons_in_contigs, operons_in_contigs))``.
    The plain ``*_in_contigs`` lists follow contigs sorted by decreasing
    length, the ``unsorted_*`` ones follow the order of the FASTA file.
    Returns None if the coords file is missing or mentions a chromosome that
    is not in ``reference_chromosomes``.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = defaultdict(int)
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    coords_base_fpath = os.path.join(coords_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        coords_fpath = coords_base_fpath
    else:
        coords_fpath = coords_base_fpath + '.filtered'

    if not os.path.isfile(coords_fpath):
        logger.error('File with alignment coords (' + coords_fpath +
                     ') not found! Try to restart QUAST.', indent=' ')
        return None

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    # =====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

    # Per chromosome: one flag per position; index 0 is never set by the loops below.
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    sorted_contig_tuples = sorted(enumerate(contig_tuples), key=lambda x: len(x[1][1]),
                                  reverse=True)
    sorted_contigs_names = []
    contigs_order = []  # contigs_order[rank] == index of that contig in the FASTA file
    for idx, (name, _) in sorted_contig_tuples:
        sorted_contigs_names.append(name)
        contigs_order.append(idx)

    # for cumulative plots: i-th element is the number of genes in i-th (sorted) contig
    features_in_contigs = [0] * len(sorted_contigs_names)
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(containers)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []

    with open(coords_fpath) as coordfile:
        for line in coordfile:
            # Split each line once instead of once per extracted field.
            sections = line.split('|')
            ref_coords = sections[0].split()
            s1, e1 = int(ref_coords[0]), int(ref_coords[1])
            contig_coords = sections[1].split()
            s2, e2 = int(contig_coords[0]), int(contig_coords[1])
            fields = line.split()
            contig_name = fields[12]
            chr_name = fields[11]

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" +
                             coords_base_fpath + ") "
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                return None

            if gene_searching_enabled:
                aligned_blocks_by_contig_name[contig_name].append(
                    AlignedBlock(seqname=chr_name, start=s1, end=e1, contig=contig_name,
                                 start_in_contig=s2, end_in_contig=e2))
            chr_mapping = genome_mapping[chr_name]
            if s2 == 0 and e2 == 0:
                # special case: circular genome, contig starts on the end of a
                # chromosome and ends in the beginning
                for i in range(s1, len(chr_mapping)):
                    chr_mapping[i] = 1
                for i in range(1, e1 + 1):
                    chr_mapping[i] = 1
            else:  # if s1 <= e1:
                for i in range(s1, e1 + 1):
                    chr_mapping[i] = 1

    for chr_name in genome_mapping.keys():
        chr_mapping = genome_mapping[chr_name]
        for i in ns_by_chromosomes[chr_name]:
            chr_mapping[i] = 0
        ref_lengths[chr_name] = sum(chr_mapping)

    if qconfig.space_efficient and coords_fpath.endswith('.filtered'):
        os.remove(coords_fpath)

    # counting genome coverage and gaps number
    gaps_count = 0
    if qconfig.analyze_gaps:
        if not qconfig.space_efficient:
            gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt')
        else:
            gaps_fpath = '/dev/null'
        with open(gaps_fpath, 'w') as gaps_file:
            for chr_name, chr_len in reference_chromosomes.items():
                gaps_file.write(chr_name + '\n')
                chr_mapping = genome_mapping[chr_name]
                cur_gap_size = 0
                for i in range(1, chr_len + 1):
                    # A covered position or one listed in ns_by_chromosomes ends the current gap.
                    if chr_mapping[i] == 1 or i in ns_by_chromosomes[chr_name]:
                        if cur_gap_size >= qconfig.min_gap_size:
                            gaps_count += 1
                            gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                        cur_gap_size = 0
                    else:
                        cur_gap_size += 1
                # Gap that runs up to the end of the chromosome.
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')

    results["gaps_count"] = gaps_count
    results[reporting.Fields.GENES + "_full"] = None
    results[reporting.Fields.GENES + "_partial"] = None
    results[reporting.Fields.OPERONS + "_full"] = None
    results[reporting.Fields.OPERONS + "_partial"] = None

    # finding genes and operons
    for container in containers:
        if not container.region_list:
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath,
                                   corr_assembly_label + '_genomic_features_' +
                                   container.kind.lower() + '.txt')
        # The context manager closes the file even if a region fails to process.
        with open(found_fpath, 'w') as found_file:
            found_file.write('%s\t\t%s\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type', 'Contig'))
            found_file.write('=' * 50 + '\n')

            for region in container.region_list:
                # 0 - gene is not found,
                # 1 - gene is found,
                # 2 - part of gene is found
                found_state = 0
                gene_blocks = []
                if region.id is None:
                    region.id = '# ' + str(region.number + 1)
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if cur_block.seqname != region.seqname:
                            continue

                        # computing circular genomes
                        if cur_block.start > cur_block.end:
                            # NOTE(review): blocks are built above with ``contig=...``;
                            # confirm that AlignedBlock really exposes ``contig_name``.
                            # NOTE(review): the first part ends at ``region.end + 1``
                            # rather than at the chromosome end -- confirm intent.
                            blocks = [
                                AlignedBlock(seqname=cur_block.seqname, start=cur_block.start,
                                             end=region.end + 1, contig=cur_block.contig_name,
                                             start_in_contig=cur_block.start_in_contig),
                                AlignedBlock(seqname=cur_block.seqname, start=1,
                                             end=cur_block.end, contig=cur_block.contig_name,
                                             end_in_contig=cur_block.end_in_contig)
                            ]
                            if cur_block.start_in_contig < cur_block.end_in_contig:
                                blocks[0].end_in_contig = blocks[0].start_in_contig + \
                                    (blocks[0].end - blocks[0].start)
                                blocks[1].start_in_contig = blocks[0].end_in_contig + 1
                            else:
                                blocks[0].end_in_contig = blocks[0].start_in_contig - \
                                    (blocks[1].end - blocks[1].start)
                                blocks[1].start_in_contig = blocks[0].end_in_contig - 1
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                continue
                            elif block.start <= region.start and region.end <= block.end:
                                if found_state == 2:  # already found as partial gene
                                    total_partial -= 1
                                found_state = 1
                                total_full += 1
                                contig_info = block.format_gene_info(region)
                                found_file.write('%s\t\t%d\t%d\tcomplete\t%s\n' %
                                                 (region.id, region.start, region.end, contig_info))
                                # inc number of found genes/operons in id-th contig
                                if container.kind == 'operon':
                                    operons_in_contigs[contig_id] += 1
                                else:
                                    features_in_contigs[contig_id] += 1
                                cur_feature_is_found = True
                                break
                            elif min(region.end, block.end) - max(region.start, block.start) \
                                    >= qconfig.min_gene_overlap:
                                if found_state == 0:
                                    found_state = 2
                                    total_partial += 1
                                gene_blocks.append(block)
                        # A complete hit stops the scan of this contig ...
                        if cur_feature_is_found:
                            break
                    # ... and of the remaining contigs.
                    if cur_feature_is_found:
                        break

                # adding info about partially found genes/operons
                if found_state == 2:  # partial gene/operon
                    contig_info = ','.join([block.format_gene_info(region)
                                            for block in sorted(gene_blocks,
                                                                key=lambda block: block.start)])
                    found_file.write('%s\t\t%d\t%d\tpartial\t%s\n' %
                                     (region.id, region.start, region.end, contig_info))

        if container.kind == 'operon':
            results[reporting.Fields.OPERONS + "_full"] = total_full
            results[reporting.Fields.OPERONS + "_partial"] = total_partial
        else:
            # Non-operon containers are accumulated into the same gene counters.
            if results[reporting.Fields.GENES + "_full"] is None:
                results[reporting.Fields.GENES + "_full"] = 0
                results[reporting.Fields.GENES + "_partial"] = 0
            results[reporting.Fields.GENES + "_full"] += total_full
            results[reporting.Fields.GENES + "_partial"] += total_partial

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')

    # The counters are indexed by sorted rank, while contigs_order[rank] is the
    # position in the FASTA file, so the counts have to be scattered to that
    # position (indexing the counters with contigs_order mixes contigs up).
    unsorted_features_in_contigs = [0] * len(contigs_order)
    unsorted_operons_in_contigs = [0] * len(contigs_order)
    for rank, orig_idx in enumerate(contigs_order):
        unsorted_features_in_contigs[orig_idx] = features_in_contigs[rank]
        unsorted_operons_in_contigs[orig_idx] = operons_in_contigs[rank]

    return ref_lengths, (results, unsorted_features_in_contigs, features_in_contigs,
                         unsorted_operons_in_contigs, operons_in_contigs)
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    """Compute reference coverage, gaps and found genes/operons for one assembly.

    The assembly's coords file is read from ``nucmer_path_dirpath``; every
    reference position covered by an alignment is marked, then gaps and
    gene/operon hits are written to files in ``genome_stats_dirpath``.

    Args:
        contigs_fpath: path to the assembly FASTA file.
        index: assembly index, used only for log message prefixes.
        nucmer_path_dirpath: directory holding ``<label>.coords[.filtered]``.
        genome_stats_dirpath: directory for the gaps / genes / operons files.
        reference_chromosomes: mapping of chromosome name -> length.
        genes_container: object exposing ``region_list`` and ``chr_names_dict``.
        operons_container: same interface as ``genes_container``.

    Returns:
        ``None`` if the coords file is missing or mentions a chromosome that
        is not in ``reference_chromosomes``; otherwise the tuple
        ``(ref_lengths, (results, genes_in_contigs, operons_in_contigs))``.
    """
    def region_label(region):
        # Regions without an explicit id are reported by their 1-based number.
        region_id = str(region.id)
        if region_id == 'None':
            region_id = '# ' + str(region.number + 1)
        return region_id

    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = {}
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
                     indent=' ')
        return None

    # Per-chromosome coverage bitmap; index 0 is unused (coordinates are 1-based).
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    # FASTA entries are (name, seq) tuples; contigs are ordered longest first.
    contig_tuples = sorted(fastaparser.read_fasta(contigs_fpath),
                           key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    # For cumulative plots: i-th element is the number of genes in i-th contig.
    genes_in_contigs = [0] * len(sorted_contigs_names)
    operons_in_contigs = [0] * len(sorted_contigs_names)

    gene_searching_enabled = len(genes_container.region_list) or len(operons_container.region_list)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')

    # For gene finding: contig_name --> list of AlignedBlock.
    aligned_blocks_by_contig_name = {}
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []

    # EXAMPLE of the coords file:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    # =====================================================================================
    #   338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #   374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #
    # The ``with`` block guarantees the handle is released on the early
    # ``return None`` below as well as on the normal path.
    with open(nucmer_fpath, 'r') as coordfile:
        for line in coordfile:  # skip everything up to the '====' separator
            if line.startswith('='):
                break

        for line in coordfile:
            if line.strip() == '':
                break
            pipe_columns = line.split('|')
            tokens = line.split()
            ref_span = pipe_columns[0].split()
            s1 = int(ref_span[0])
            e1 = int(ref_span[1])
            contig_span = pipe_columns[1].split()
            s2 = int(contig_span[0])
            e2 = int(contig_span[1])
            contig_name = tokens[12].strip()
            chr_name = tokens[11].strip()

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") "
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                return None

            if gene_searching_enabled:
                aligned_blocks_by_contig_name[contig_name].append(
                    AlignedBlock(seqname=chr_name, start=s1, end=e1))

            chr_mapping = genome_mapping[chr_name]
            if s2 == 0 and e2 == 0:
                # Special case: circular genome, contig starts on the end of
                # a chromosome and ends in the beginning.
                for i in range(s1, len(chr_mapping)):
                    chr_mapping[i] = 1
                for i in range(1, e1 + 1):
                    chr_mapping[i] = 1
            else:
                for i in range(s1, e1 + 1):
                    chr_mapping[i] = 1

    if qconfig.space_efficient and nucmer_fpath.endswith('.filtered'):
        os.remove(nucmer_fpath)

    # Counting genome coverage and gaps number.
    covered_bp = 0
    gaps_count = 0
    if qconfig.space_efficient:
        gaps_fpath = os.devnull  # gaps are counted but not stored
    else:
        gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt')
    with open(gaps_fpath, 'w') as gaps_file:
        for chr_name, chr_len in reference_chromosomes.items():
            gaps_file.write(chr_name + '\n')
            chr_mapping = genome_mapping[chr_name]
            cur_gap_size = 0
            aligned_len = 0
            for i in range(1, chr_len + 1):
                if chr_mapping[i] == 1:
                    if cur_gap_size >= qconfig.min_gap_size:
                        gaps_count += 1
                        gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                    aligned_len += 1
                    covered_bp += 1
                    cur_gap_size = 0
                else:
                    cur_gap_size += 1
            ref_lengths[chr_name] = aligned_len
            # A gap may also run up to the very end of the chromosome.
            if cur_gap_size >= qconfig.min_gap_size:
                gaps_count += 1
                gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # Finding genes and operons.
    for container, feature_in_contigs, field, suffix in [
            (genes_container, genes_in_contigs, reporting.Fields.GENES, '_genes.txt'),
            (operons_container, operons_in_contigs, reporting.Fields.OPERONS, '_operons.txt')]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + suffix)
        with open(found_fpath, 'w') as found_file:
            found_file.write('%s\t\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type'))
            found_file.write('=========================================\n')

            # 0 - gene is not found,
            # 1 - gene is found,
            # 2 - part of gene is found
            found_list = [0] * len(container.region_list)
            for i, region in enumerate(container.region_list):
                found_list[i] = 0
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if container.chr_names_dict[region.seqname] != cur_block.seqname:
                            continue

                        # Computing circular genomes: a block with start > end
                        # is split into two linear blocks.
                        if cur_block.start > cur_block.end:
                            blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start, end=region.end + 1),
                                      AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                continue
                            elif block.start <= region.start and region.end <= block.end:
                                if found_list[i] == 2:  # already found as partial gene
                                    total_partial -= 1
                                found_list[i] = 1
                                total_full += 1
                                found_file.write('%s\t\t%d\t%d\tcomplete\n'
                                                 % (region_label(region), region.start, region.end))
                                # inc number of found genes/operons in id-th contig
                                feature_in_contigs[contig_id] += 1
                                cur_feature_is_found = True
                                break
                            elif found_list[i] == 0 and \
                                    min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                                found_list[i] = 2
                                total_partial += 1
                        if cur_feature_is_found:
                            break
                    if cur_feature_is_found:
                        break

                # Adding info about partially found genes/operons.
                if found_list[i] == 2:
                    found_file.write('%s\t\t%d\t%d\tpartial\n'
                                     % (region_label(region), region.start, region.end))

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')

    return ref_lengths, (results, genes_in_contigs, operons_in_contigs)
def _parse_lap_score(lap_out_fpath):
    """Return the LAP score stored as the first token of the file's first line.

    Returns ``None`` when the first line is missing or holds no tokens
    (e.g. a blank line), instead of failing on an empty token list.
    """
    with open(lap_out_fpath) as lap_file:
        tokens = lap_file.readline().split()
    return float(tokens[0]) if tokens else None


def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath):
    """Copy read-mapping statistics from ``*.stat`` / ``*.lap.out`` files into reports.

    Args:
        output_dir: directory holding ``<name>.stat`` and ``<name>.lap.out`` files.
        contigs_fpaths: assembly file paths; one report is filled per path.
        ref_fpath: reference file path, or a falsy value when there is none.

    Reference statistics (when available) are added to every assembly report.
    Assembly statistics are added only if the assembly's ``.stat`` file
    exists; assemblies without one are skipped after the reference fields.
    """
    from quast_libs import reporting

    fields = reporting.Fields

    def add_coverage_fields(report, stats, thresholds_field, first_threshold_field):
        # Coverage fields are reported only when the parsed list matches the
        # configured thresholds one-to-one.
        thresholds = stats['coverage_thresholds']
        if thresholds and len(thresholds) == len(qconfig.coverage_thresholds):
            report.add_field(thresholds_field, list(thresholds))
            report.add_field(first_threshold_field, thresholds[0])

    ref_reads_stats = None
    ref_lap_score = None
    if ref_fpath:
        ref_name = qutils.name_from_fpath(ref_fpath)
        ref_stats_fpath = join(output_dir, ref_name + '.stat')
        if isfile(ref_stats_fpath):
            ref_reads_stats = parse_reads_stats(ref_stats_fpath)
            if int(ref_reads_stats['mapped']) == 0:
                logger.info(' BWA: nothing aligned for reference.')
        ref_lap_out_fpath = get_safe_fpath(output_dir, ref_name + '.lap.out')
        if is_non_empty_file(ref_lap_out_fpath):
            ref_lap_score = _parse_lap_score(ref_lap_out_fpath)

    # process all contigs files
    for index, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        stats_fpath = join(output_dir, assembly_name + '.stat')

        if ref_reads_stats:
            for field, key in (
                    (fields.REF_MAPPED_READS, 'mapped'),
                    (fields.REF_MAPPED_READS_PCNT, 'mapped_pcnt'),
                    (fields.REF_PROPERLY_PAIRED_READS, 'paired'),
                    (fields.REF_PROPERLY_PAIRED_READS_PCNT, 'paired_pcnt'),
                    (fields.REF_SINGLETONS, 'singletons'),
                    (fields.REF_SINGLETONS_PCNT, 'singletons_pcnt'),
                    (fields.REF_MISJOINT_READS, 'misjoint'),
                    (fields.REF_MISJOINT_READS_PCNT, 'misjoint_pcnt'),
                    (fields.REF_DEPTH, 'depth')):
                report.add_field(field, ref_reads_stats[key])
            add_coverage_fields(report, ref_reads_stats,
                                fields.REF_COVERAGE__FOR_THRESHOLDS,
                                fields.REF_COVERAGE_1X_THRESHOLD)

        if not isfile(stats_fpath):
            continue

        reads_stats = parse_reads_stats(stats_fpath)
        for field, key in (
                (fields.TOTAL_READS, 'total'),
                (fields.LEFT_READS, 'left'),
                (fields.RIGHT_READS, 'right'),
                (fields.MAPPED_READS, 'mapped'),
                (fields.MAPPED_READS_PCNT, 'mapped_pcnt'),
                (fields.PROPERLY_PAIRED_READS, 'paired'),
                (fields.PROPERLY_PAIRED_READS_PCNT, 'paired_pcnt')):
            report.add_field(field, reads_stats[key])
        if int(reads_stats['mapped']) == 0:
            logger.info(' ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.')
        for field, key in (
                (fields.SINGLETONS, 'singletons'),
                (fields.SINGLETONS_PCNT, 'singletons_pcnt'),
                (fields.MISJOINT_READS, 'misjoint'),
                (fields.MISJOINT_READS_PCNT, 'misjoint_pcnt'),
                (fields.DEPTH, 'depth')):
            report.add_field(field, reads_stats[key])
        add_coverage_fields(report, reads_stats,
                            fields.COVERAGE__FOR_THRESHOLDS,
                            fields.COVERAGE_1X_THRESHOLD)

        lap_out_fpath = get_safe_fpath(output_dir, assembly_name + '.lap.out')
        if is_non_empty_file(lap_out_fpath):
            lap_score = _parse_lap_score(lap_out_fpath)
            report.add_field(fields.LAP_SCORE,
                             ('%.3f' % lap_score if lap_score is not None else None))
        report.add_field(fields.REF_LAP_SCORE,
                         ('%.3f' % ref_lap_score if ref_lap_score is not None else None))
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Compute NA/NGA/LA/LGA statistics for aligned contigs and emit plots.

    Args:
        ref_fpath: reference FASTA path.
        aligned_contigs_fpaths: assembly FASTA paths.
        output_dirpath: directory passed to the HTML saver and ``Nx_plot``.
        json_output_dirpath: JSON output directory, or a falsy value to skip.
        aligned_lengths_lists: per-assembly lists of aligned block lengths,
            in the same order as ``aligned_contigs_fpaths``.
        aligned_stats_dirpath: directory for plots; created if missing.

    Returns:
        A dict with a ``'header'`` key and one key per assembly name, each
        mapped to an empty list.
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    report_dict = {'header': []}
    for fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(fpath)] = []

    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values())
    assembly_lengths = [
        sum(fastaparser.get_chr_lengths_from_fastafile(fpath).values())
        for fpath in aligned_contigs_fpaths
    ]

    import N50
    per_assembly = itertools.izip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)
    for i, (contigs_fpath, lens, assembly_len) in enumerate(per_assembly):
        na50 = N50.NG50(lens, assembly_len)
        na75 = N50.NG50(lens, assembly_len, 75)
        la50 = N50.LG50(lens, assembly_len)
        la75 = N50.LG50(lens, assembly_len, 75)
        # Reference-based metrics are skipped for a combined reference.
        if not qconfig.is_combined_ref:
            nga50 = N50.NG50(lens, reference_length)
            nga75 = N50.NG50(lens, reference_length, 75)
            lga50 = N50.LG50(lens, reference_length)
            lga75 = N50.LG50(lens, reference_length, 75)

        message = ' ' + qutils.index_to_str(i) + qutils.label_from_fpath(contigs_fpath)
        message += ', Largest alignment = ' + str(max(lens))
        message += ', NA50 = ' + str(na50)
        if not qconfig.is_combined_ref and nga50:
            message += ', NGA50 = ' + str(nga50)
        message += ', LA50 = ' + str(la50)
        if not qconfig.is_combined_ref and lga50:
            message += ', LGA50 = ' + str(lga50)
        logger.info(message)

        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        report.add_field(reporting.Fields.TOTAL_ALIGNED_LEN, sum(lens))
        for field, value in ((reporting.Fields.NA50, na50),
                             (reporting.Fields.NA75, na75),
                             (reporting.Fields.LA50, la50),
                             (reporting.Fields.LA75, la75)):
            report.add_field(field, value)
        if not qconfig.is_combined_ref:
            for field, value in ((reporting.Fields.NGA50, nga50),
                                 (reporting.Fields.NGA75, nga75),
                                 (reporting.Fields.LGA50, lga50),
                                 (reporting.Fields.LGA75, lga75)):
                report.add_field(field, value)

    num_contigs = max([len(lengths) for lengths in aligned_lengths_lists])

    if json_output_dirpath:
        from quast_libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    import plotter
    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
                                'Cumulative length (aligned contigs)')

    # Drawing NAx and NGAx plots...
    # NOTE(review): these calls are kept outside the draw_plots guard because
    # Nx_plot is handed the report/JSON output directories -- confirm against
    # the original layout of this function.
    reduce_points = num_contigs > qconfig.max_points
    plotter.Nx_plot(output_dirpath, reduce_points, aligned_contigs_fpaths, aligned_lengths_lists,
                    aligned_stats_dirpath + '/NAx_plot', 'NAx',
                    assembly_lengths, json_output_dir=json_output_dirpath)
    if not qconfig.is_combined_ref:
        plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths,
                        aligned_lengths_lists, aligned_stats_dirpath + '/NGAx_plot', 'NGAx',
                        [reference_length] * len(aligned_contigs_fpaths),
                        json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
def align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                  parallel_by_chr, threads, log_out_fpath, log_err_fpath):
    """Align contigs to the reference with nucmer and build the coords/SNPs files.

    If a success marker (``<nucmer_fpath>.sf``) and the expected output files
    exist and the marker check passes, the previous alignments are reused and
    nothing is recomputed. Otherwise the function runs, in order: nucmer
    (whole reference or one run per reference part), delta-filter,
    optionally the mummer plot, show-coords, and optionally show-snps; on
    success it writes a fresh success marker.

    Args:
        nucmer_fpath: common prefix of all nucmer output files.
        ref_fpath: reference file path.
        contigs_fpath: contigs file passed to nucmer.
        old_contigs_fpath: contigs path recorded in / compared with the
            success marker.
        index: assembly index, used for log message prefixes.
        parallel_by_chr: allow aligning reference parts in parallel.
        threads: number of threads available to this call.
        log_out_fpath: path of the stdout log (truncated here).
        log_err_fpath: path of the stderr log (truncated here).

    Returns:
        A ``NucmerStatus`` value: ``OK``, ``ERROR`` (a tool exited non-zero),
        ``FAILED`` (no coords file produced) or ``NOT_ALIGNED`` (last coords
        line has fewer than 13 whitespace-separated fields).
    """
    # NOTE(review): log_out_f and log_err_f are never closed explicitly on any
    # return path of this function.
    log_out_f = open(log_out_fpath, 'w')
    log_err_f = open(log_err_fpath, 'w')

    nucmer_successful_check_fpath = nucmer_fpath + '.sf'
    delta_fpath = nucmer_fpath + '.delta'
    filtered_delta_fpath = nucmer_fpath + '.fdelta'

    coords_fpath, _, _, show_snps_fpath, _ = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    log_out_f.write('Aligning contigs to reference...\n')

    # Checking if there are existing previous nucmer alignments.
    # If they exist, using them to save time.
    # The SNPs file (plain or gzipped) is required only when show_snps is on.
    using_existing_alignments = False
    if isfile(nucmer_successful_check_fpath) and isfile(coords_fpath) and \
            (isfile(show_snps_fpath) or isfile(show_snps_fpath + '.gz') or not qconfig.show_snps):
        if check_nucmer_successful_check(nucmer_successful_check_fpath, old_contigs_fpath, ref_fpath):
            log_out_f.write('\tUsing existing alignments...\n')
            logger.info(' ' + qutils.index_to_str(index) + 'Using existing alignments... ')
            using_existing_alignments = True

    if not using_existing_alignments:
        log_out_f.write('\tAligning contigs to the reference\n')
        logger.info(' ' + qutils.index_to_str(index) + 'Aligning contigs to the reference')

        if not qconfig.splitted_ref:
            # Single nucmer run against the whole reference.
            nucmer_exit_code = run_nucmer(nucmer_fpath, ref_fpath, contigs_fpath,
                                          log_out_fpath, log_err_fpath, index, threads)
            if nucmer_exit_code != 0:
                return NucmerStatus.ERROR
        else:
            # One (output prefix, reference part file) pair per reference part.
            prefixes_and_chr_files = [(nucmer_fpath + "_" + basename(chr_fname), chr_fname)
                                      for chr_fname in qconfig.splitted_ref]

            # Daemonic processes are not allowed to have children,
            # so if we are already one of parallel processes
            # (i.e. daemonic) we can't start new daemonic processes
            if parallel_by_chr and not qconfig.memory_efficient:
                n_jobs = min(qconfig.max_threads, len(prefixes_and_chr_files))
                # Split the available threads evenly between the jobs.
                threads = max(1, threads // n_jobs)
            else:
                n_jobs = 1
                threads = 1
            if n_jobs > 1:
                logger.info(' ' + 'Aligning to different chromosomes in parallel'
                                  ' (' + str(n_jobs) + ' threads)')

            # processing each chromosome separately (if we can)
            if is_python2():
                from joblib import Parallel, delayed
            else:
                from joblib3 import Parallel, delayed
            # Each part gets its own stderr log: <log_err_fpath>_part<N>.
            if not qconfig.memory_efficient:
                nucmer_exit_codes = Parallel(n_jobs=n_jobs)(delayed(run_nucmer)(
                    prefix, chr_file, contigs_fpath, log_out_fpath, log_err_fpath + "_part%d" % (i + 1), index, threads)
                    for i, (prefix, chr_file) in enumerate(prefixes_and_chr_files))
            else:
                nucmer_exit_codes = [run_nucmer(prefix, chr_file, contigs_fpath, log_out_fpath, log_err_fpath + "_part%d" % (i + 1), index, threads)
                                     for i, (prefix, chr_file) in enumerate(prefixes_and_chr_files)]

            log_err_f.write("Stderr outputs for reference parts are in:\n")
            for i in range(len(prefixes_and_chr_files)):
                log_err_f.write(log_err_fpath + "_part%d" % (i + 1) + '\n')
            log_err_f.write("\n")

            # It is an error only if every part failed; failed parts are
            # skipped below as long as at least one part succeeded.
            if 0 not in nucmer_exit_codes:
                return NucmerStatus.ERROR
            else:
                # filling common delta file
                delta_file = open(delta_fpath, 'w')
                delta_file.write(ref_fpath + " " + contigs_fpath + "\n")
                delta_file.write("NUCMER\n")
                for i, (prefix, chr_fname) in enumerate(prefixes_and_chr_files):
                    if nucmer_exit_codes[i] != 0:
                        logger.warning(' ' + qutils.index_to_str(index) +
                                       'Failed aligning contigs %s to reference part %s! Skipping this part. ' % (qutils.label_from_fpath(contigs_fpath),
                                       chr_fname) + ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
                        continue

                    chr_delta_fpath = prefix + '.delta'
                    if isfile(chr_delta_fpath):
                        chr_delta_file = open(chr_delta_fpath)
                        # Skip the part's own two header lines; the common
                        # header was already written above.
                        chr_delta_file.readline()
                        chr_delta_file.readline()
                        for line in chr_delta_file:
                            delta_file.write(line)
                        chr_delta_file.close()

                delta_file.close()

        # By default: filtering by IDY% = 95 (as GAGE did)
        # NOTE(review): the handle opened for stdout here (and in the two
        # call_subprocess calls below) is never closed explicitly.
        return_code = qutils.call_subprocess(
            [bin_fpath('delta-filter'), '-i', str(qconfig.min_IDY), '-l', str(qconfig.min_alignment), delta_fpath],
            stdout=open(filtered_delta_fpath, 'w'),
            stderr=log_err_f,
            indent=' ' + qutils.index_to_str(index))

        if return_code != 0:
            log_err_f.write(qutils.index_to_str(index) + ' Delta filter failed for ' + contigs_fpath + '\n')
            return NucmerStatus.ERROR

        # The filtered delta replaces the unfiltered one.
        shutil.move(filtered_delta_fpath, delta_fpath)

        if qconfig.draw_plots:
            draw_mummer_plot(logger, nucmer_fpath, delta_fpath, index, log_out_f, log_err_f)

        tmp_coords_fpath = coords_fpath + '_tmp'

        return_code = qutils.call_subprocess(
            [bin_fpath('show-coords'), delta_fpath],
            stdout=open(tmp_coords_fpath, 'w'),
            stderr=log_err_f,
            indent=' ' + qutils.index_to_str(index))
        if return_code != 0:
            log_err_f.write(qutils.index_to_str(index) + ' Show-coords failed for ' + contigs_fpath + '\n')
            return NucmerStatus.ERROR

        # removing waste lines from coords file
        # Only the last two header lines (the one before the '=====' separator
        # and the separator itself) are kept, followed by all data lines.
        # NOTE(review): header[-2] raises IndexError if fewer than two lines
        # were read before the loop ended.
        coords_file = open(coords_fpath, 'w')
        header = []
        tmp_coords_file = open(tmp_coords_fpath)
        for line in tmp_coords_file:
            header.append(line)
            if line.startswith('====='):
                break
        coords_file.write(header[-2])
        coords_file.write(header[-1])
        for line in tmp_coords_file:
            coords_file.write(line)
        coords_file.close()
        tmp_coords_file.close()

        if not isfile(coords_fpath):
            return NucmerStatus.FAILED
        # A last line with fewer than 13 fields means there are no alignment
        # rows after the header.
        # NOTE(review): the file opened here is not closed explicitly, and
        # readlines()[-1] raises IndexError on an empty file.
        if len(open(coords_fpath).readlines()[-1].split()) < 13:
            return NucmerStatus.NOT_ALIGNED

        if qconfig.show_snps:
            # show-snps reads the coords from stdin without the two header
            # lines, so a headless copy is written and reopened for reading.
            with open(coords_fpath) as coords_file:
                headless_coords_fpath = coords_fpath + '.headless'
                headless_coords_f = open(headless_coords_fpath, 'w')
                coords_file.readline()
                coords_file.readline()
                headless_coords_f.write(coords_file.read())
                headless_coords_f.close()
                headless_coords_f = open(headless_coords_fpath)

            return_code = qutils.call_subprocess(
                [bin_fpath('show-snps'), '-S', '-T', '-H', delta_fpath],
                stdin=headless_coords_f,
                stdout=open(show_snps_fpath, 'w'),
                stderr=log_err_f,
                indent=' ' + qutils.index_to_str(index))
            if return_code != 0:
                log_err_f.write(qutils.index_to_str(index) + ' Show-snps failed for ' + contigs_fpath + '\n')
                return NucmerStatus.ERROR

        # Record success so the next run can reuse these alignments.
        create_nucmer_successful_check(nucmer_successful_check_fpath, old_contigs_fpath, ref_fpath)
    return NucmerStatus.OK
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the unique k-mer based analysis (completeness and correctness).

    For assemblies whose previous results pass ``check_kmc_successful_check``
    the stored ``<label>.stat`` values are loaded into the report. For the
    remaining assemblies the function counts k-mers with KMC, computes the
    percentage of reference k-mers found in each assembly, and (unless the
    reference has more than ``MAX_REF_CONTIGS_NUM`` sequences) classifies
    contigs as correct / misassembled / undefined from the order of
    downsampled k-mers.

    Args:
        output_dir: directory for ``kmc.log``, ``kmc.err``, the ``tmp``
            working directory and the per-assembly stats files.
        ref_fpath: reference FASTA path.
        contigs_fpaths: assembly FASTA paths.
        logger: logger used for all messages of this stage.

    Returns:
        Always ``None``; results are stored through ``reporting`` and
        ``create_kmc_stats_file``.
    """
    logger.print_timestamp()
    kmer_len = qconfig.unique_kmer_len
    logger.main_info('Running analysis based on unique ' + str(kmer_len) + '-mers...')

    # Reuse stored results where the success check passes.
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            # NOTE(review): the file handle is not closed explicitly, and
            # str.split('\n') always returns at least one element, so the
            # length check below can never trigger.
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if len(stats_content) < 1:
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            # Every stats line is read as '<text>: <value>'; only the value is used.
            report.add_field(
                reporting.Fields.KMER_COMPLETENESS,
                '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            # Lines 1..6 hold the correctness values; they are used only
            # when at least seven lines are present.
            if len(stats_content) >= 7:
                corr_len = int(stats_content[1].strip().split(': ')[-1])
                mis_len = int(stats_content[2].strip().split(': ')[-1])
                undef_len = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                translocations = int(stats_content[5].strip().split(': ')[-1])
                relocations = int(stats_content[6].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_CORR_LENGTH,
                                 '%.2f' % (corr_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_MIS_LENGTH,
                                 '%.2f' % (mis_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_UNDEF_LENGTH,
                                 '%.2f' % (undef_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_TRANSLOCATIONS, translocations)
                report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations)
                report.add_field(reporting.Fields.KMER_MISASSEMBLIES,
                                 translocations + relocations)
            checked_assemblies.append(contigs_fpath)

    # Only assemblies without reusable results are processed below.
    contigs_fpaths = [
        fpath for fpath in contigs_fpaths if fpath not in checked_assemblies
    ]
    if len(contigs_fpaths) == 0:
        save_kmers(output_dir)
        logger.info('Done.')
        return

    if qconfig.platform_name == 'linux_32':
        logger.warning(' Sorry, can\'t run KMC on this platform, skipping...')
        return None

    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC', ['kmc', 'kmc_tools'], logger)
    # The tool paths are stored in module-level globals.
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc', kmc_dirpath, 'KMC',
                                           platform_specific=True, is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools', kmc_dirpath, 'KMC',
                                             platform_specific=True, is_executable=True)
    # Both KMC binaries must exist and compile_minimap must succeed.
    if not exists(kmc_bin_fpath) or not exists(
            kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning(' Sorry, can\'t run KMC, skipping...')
        return None

    logger.info(' Running KMC on reference...')
    if not isdir(output_dir):
        os.makedirs(output_dir)
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    # Create/truncate both log files.
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, kmer_len, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        logger.warning('KMC failed, check ' + log_fpath + ' and ' + err_fpath + '. Skipping...')
        return

    # Completeness: share of the reference k-mers also present in the assembly.
    logger.info(' Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info(' ' + qutils.index_to_str(id) + assembly_label)

        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, kmer_len, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(
            tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info(' Analyzing assemblies correctness...')
    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    logger.info(' Downsampling k-mers...')
    ref_kmers, downsampled_kmers_fpath = downsample_kmers(
        tmp_dirpath, ref_fpath, ref_kmc_out_fpath, kmer_len, log_fpath, err_fpath)
    # NOTE(review): kmc_db_fpath is unpacked but not used in the loop body.
    for id, (contigs_fpath, kmc_db_fpath) in enumerate(zip(contigs_fpaths, kmc_out_fpaths)):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info(' ' + qutils.index_to_str(id) + assembly_label)

        report = reporting.get(contigs_fpath)
        # These stay None when the correctness analysis is skipped.
        corr_len = None
        mis_len = None
        undef_len = None
        translocations, relocations = None, None
        total_len = 0
        contig_lens = dict()
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)

        if len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning(
                'Reference is too fragmented. Scaffolding accuracy will not be assessed.'
            )
        else:
            corr_len = 0
            mis_len = 0
            kmers_by_contig, kmers_pos_by_contig = align_kmers(
                tmp_dirpath, contigs_fpath, downsampled_kmers_fpath, err_fpath,
                qconfig.max_threads)
            is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
            cyclic_ref_lens = report.get_field(
                reporting.Fields.REFLEN) if is_cyclic else None
            translocations = 0
            relocations = 0
            with open(
                    join(
                        tmp_dirpath,
                        qutils.label_from_fpath_for_fname(contigs_fpath) +
                        '.misjoins.txt'), 'w') as out:
                for contig in kmers_by_contig.keys():
                    # Pass 1: walk the k-mers in contig order and collect
                    # "markers". A marker is set when two consecutive k-mers
                    # are on the same reference sequence and their contig
                    # distance is within 5% of their reference distance.
                    contig_markers = []
                    prev_pos, prev_ref_pos, prev_chrom, marker = None, None, None, None
                    for pos, kmer in sorted(zip(kmers_pos_by_contig[contig],
                                                kmers_by_contig[contig]),
                                            key=lambda x: x[0]):
                        ref_chrom, ref_pos = ref_kmers[kmer]
                        # NOTE(review): truthiness test, so a previous
                        # position of 0 is treated like "no previous k-mer";
                        # the division raises ZeroDivisionError if
                        # ref_pos == prev_ref_pos.
                        if prev_pos and prev_chrom:
                            if prev_chrom == ref_chrom and abs(
                                    abs(pos - prev_pos) / abs(ref_pos - prev_ref_pos) - 1) <= 0.05:
                                marker = (pos, ref_pos, ref_chrom)
                            elif marker:
                                # Chain broken: store the current marker and
                                # reset, so the next k-mer starts a new chain.
                                contig_markers.append(marker)
                                pos, ref_pos, ref_chrom, marker = None, None, None, None
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if marker:
                        contig_markers.append(marker)

                    # Pass 2: compare consecutive markers to detect
                    # translocations (different reference sequence) and
                    # relocations (distance inconsistency above
                    # EXT_RELOCATION_SIZE).
                    prev_pos, prev_ref_pos, prev_chrom = None, None, None
                    is_misassembled = False
                    for marker in contig_markers:
                        pos, ref_pos, ref_chrom = marker
                        if prev_pos and prev_chrom:
                            if ref_chrom != prev_chrom:
                                translocations += 1
                                out.write(
                                    'Translocation in %s: %s %d | %s %d\n' %
                                    (contig, prev_chrom, prev_pos, ref_chrom, pos))
                                is_misassembled = True
                            elif _get_dist_inconstistency(
                                    pos, prev_pos, ref_pos, prev_ref_pos,
                                    cyclic_ref_lens) > EXT_RELOCATION_SIZE:
                                relocations += 1
                                out.write(
                                    'Relocation in %s: %d (%d) | %d (%d)\n' %
                                    (contig, prev_pos, prev_ref_pos, pos, ref_pos))
                                is_misassembled = True
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    # Contigs with markers but no misjoin count as correct;
                    # contigs without markers fall into the undefined part.
                    if is_misassembled:
                        mis_len += contig_lens[contig]
                    elif len(contig_markers) > 0:
                        corr_len += contig_lens[contig]
            undef_len = total_len - corr_len - mis_len
            # NOTE(review): total_len == 0 (no sequences read) would raise
            # ZeroDivisionError in the percentages below.
            report.add_field(reporting.Fields.KMER_CORR_LENGTH,
                             '%.2f' % (corr_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_MIS_LENGTH,
                             '%.2f' % (mis_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_UNDEF_LENGTH,
                             '%.2f' % (undef_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_TRANSLOCATIONS, translocations)
            report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations)
            report.add_field(reporting.Fields.KMER_MISASSEMBLIES,
                             translocations + relocations)

        # Persist the results for reuse by a later run.
        create_kmc_stats_file(
            output_dir, contigs_fpath, ref_fpath,
            report.get_field(reporting.Fields.KMER_COMPLETENESS), corr_len,
            mis_len, undef_len, total_len, translocations, relocations)
    save_kmers(output_dir)
    # The working directory is kept only in debug mode.
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    """Compute reference coverage, gaps and found genes/operons for one assembly.

    The assembly's coords file is read from ``nucmer_path_dirpath``; every
    reference position covered by an alignment is marked, then gaps and
    gene/operon hits are written to files in ``genome_stats_dirpath``.

    Works on both Python 2 and Python 3: uses ``dict.items()`` and
    ``file.write()`` rather than ``iteritems()`` and ``print >>file``.

    Args:
        contigs_fpath: path to the assembly FASTA file.
        index: assembly index, used only for log message prefixes.
        nucmer_path_dirpath: directory holding ``<label>.coords[.filtered]``.
        genome_stats_dirpath: directory for the gaps / genes / operons files.
        reference_chromosomes: mapping of chromosome name -> length.
        genes_container: object exposing ``region_list`` and ``chr_names_dict``.
        operons_container: same interface as ``genes_container``.

    Returns:
        ``None`` if the coords file is missing or mentions a chromosome that
        is not in ``reference_chromosomes``; otherwise the tuple
        ``(ref_lengths, (results, genes_in_contigs, operons_in_contigs))``.
    """
    def region_label(region):
        # Regions without an explicit id are reported by their 1-based number.
        region_id = str(region.id)
        if region_id == 'None':
            region_id = '# ' + str(region.number + 1)
        return region_id

    assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = {}
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, assembly_label + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
                     indent=' ')
        return None

    # Per-chromosome coverage bitmap; index 0 is unused (coordinates are 1-based).
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    # FASTA entries are (name, seq) tuples; contigs are ordered longest first.
    contig_tuples = sorted(fastaparser.read_fasta(contigs_fpath),
                           key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    # For cumulative plots: i-th element is the number of genes in i-th contig.
    genes_in_contigs = [0] * len(sorted_contigs_names)
    operons_in_contigs = [0] * len(sorted_contigs_names)

    # For gene finding: contig_name --> list of AlignedBlock.
    aligned_blocks_by_contig_name = {}
    for name in sorted_contigs_names:
        aligned_blocks_by_contig_name[name] = []

    # Data lines follow the '====' separator. Each one is parsed as a
    # parenthesised tuple whose items are separated by ', ': items 0-3 are
    # S1, E1, S2, E2, item 7 is the quoted reference name and item 8 the
    # quoted contig name.
    #
    # The ``with`` block guarantees the handle is released on the early
    # ``return None`` below as well as on the normal path.
    with open(nucmer_fpath, 'r') as coordfile:
        for line in coordfile:  # skip everything up to the '====' separator
            if line.startswith('='):
                break

        for line in coordfile:
            if line.strip() == '':
                break
            items = line.strip('()\n').split(', ')
            s1 = int(items[0])
            e1 = int(items[1])
            s2 = int(items[2])
            e2 = int(items[3])
            contig_name = items[8].strip('\'')
            chr_name = items[7].strip('\'')

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") "
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                return None

            aligned_blocks_by_contig_name[contig_name].append(
                AlignedBlock(seqname=chr_name, start=s1, end=e1))

            chr_mapping = genome_mapping[chr_name]
            if s2 == 0 and e2 == 0:
                # Special case: circular genome, contig starts on the end of
                # a chromosome and ends in the beginning.
                for i in range(s1, len(chr_mapping)):
                    chr_mapping[i] = 1
                for i in range(1, e1 + 1):
                    chr_mapping[i] = 1
            else:
                for i in range(s1, e1 + 1):
                    chr_mapping[i] = 1

    # Counting genome coverage and gaps number.
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, assembly_label + '_gaps.txt')
    with open(gaps_fpath, 'w') as gaps_file:
        for chr_name, chr_len in reference_chromosomes.items():
            gaps_file.write(chr_name + '\n')
            chr_mapping = genome_mapping[chr_name]
            cur_gap_size = 0
            aligned_len = 0
            for i in range(1, chr_len + 1):
                if chr_mapping[i] == 1:
                    if cur_gap_size >= qconfig.min_gap_size:
                        gaps_count += 1
                        gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                    aligned_len += 1
                    covered_bp += 1
                    cur_gap_size = 0
                else:
                    cur_gap_size += 1
            ref_lengths[chr_name] = aligned_len
            # A gap may also run up to the very end of the chromosome.
            if cur_gap_size >= qconfig.min_gap_size:
                gaps_count += 1
                gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # Finding genes and operons.
    for container, feature_in_contigs, field, suffix in [
            (genes_container, genes_in_contigs, reporting.Fields.GENES, '_genes.txt'),
            (operons_container, operons_in_contigs, reporting.Fields.OPERONS, '_operons.txt')]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, assembly_label + suffix)
        with open(found_fpath, 'w') as found_file:
            found_file.write('%s\t\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type'))
            found_file.write('=========================================\n')

            # 0 - gene is not found,
            # 1 - gene is found,
            # 2 - part of gene is found
            found_list = [0] * len(container.region_list)
            for i, region in enumerate(container.region_list):
                found_list[i] = 0
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if container.chr_names_dict[region.seqname] != cur_block.seqname:
                            continue

                        # Computing circular genomes: a block with start > end
                        # is split into two linear blocks.
                        if cur_block.start > cur_block.end:
                            blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start, end=region.end + 1),
                                      AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                continue
                            elif block.start <= region.start and region.end <= block.end:
                                if found_list[i] == 2:  # already found as partial gene
                                    total_partial -= 1
                                found_list[i] = 1
                                total_full += 1
                                found_file.write('%s\t\t%d\t%d\tcomplete\n'
                                                 % (region_label(region), region.start, region.end))
                                # inc number of found genes/operons in id-th contig
                                feature_in_contigs[contig_id] += 1
                                cur_feature_is_found = True
                                break
                            elif found_list[i] == 0 and \
                                    min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                                found_list[i] = 2
                                total_partial += 1
                        if cur_feature_is_found:
                            break
                    if cur_feature_is_found:
                        break

                # Adding info about partially found genes/operons.
                if found_list[i] == 2:
                    found_file.write('%s\t\t%d\t%d\tpartial\n'
                                     % (region_label(region), region.start, region.end))

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')

    return ref_lengths, (results, genes_in_contigs, operons_in_contigs)
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    """Compute basic assembly statistics and emit report fields and plots.

    For every assembly in ``contigs_fpaths`` the contig lengths, the number
    of ``N`` characters and, when contig names carry a ``_cov_<number>``
    tag, a per-coverage length histogram are collected.  N50/L50, N75/L75,
    their reference-based NG/LG counterparts, total length, GC content and
    the rate of ``N`` characters are stored through
    ``reporting.get(contigs_fpath).add_field``.  Length and GC data are
    handed to ``html_saver`` (when ``qconfig.html_report`` is set) and to
    ``plotter``.

    Args:
        ref_fpath: path to the reference FASTA, or a falsy value when there
            is no reference (``qconfig.estimated_reference_size`` is then
            used as the reference length if it is set).
        contigs_fpaths: paths of the assembly FASTA files.
        output_dirpath: directory for the plot files; created if missing.
        results_dir: passed through to ``html_saver`` and
            ``plotter.Nx_plot``.

    Returns:
        None.  As side effects ``qconfig.min_difference`` is set and
        ``qconfig.potential_scaffolds_assemblies`` may be appended to.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    if ref_fpath:
        reference_lengths = sorted(
            fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(),
            reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = \
            GC_content(ref_fpath)

        logger.info(' Reference genome:')
        # The conditional is parenthesised on purpose: without the
        # parentheses it applies to the whole concatenation and the entire
        # message degrades to just 'undefined' when the GC value is None.
        logger.info(' ' + os.path.basename(ref_fpath) +
                    ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) +
                    ', GC % = ' +
                    ('%.2f' % reference_GC if reference_GC is not None else 'undefined'))
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(
                ' Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).'
            )
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info(' Estimated reference length = ' + str(reference_length))

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for idx, contigs_fpath in enumerate(contigs_fpaths):
        coverage_hist = coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(idx) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for name, seq in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if (not qconfig.scaffolds and not is_potential_scaffold
                    and qutils.is_scaffold(seq)):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            cov_matches = cov_pattern.findall(name)
            if cov_matches:
                # Bucket the contig length by the integer part of its coverage,
                # growing the histogram so that index `cov` exists.
                cov = int(float(cov_matches[0]))
                if len(coverage_hist) <= cov:
                    coverage_hist.extend([0] * (cov - len(coverage_hist) + 1))
                coverage_hist[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
    num_contigs = max([len(lengths) for lengths in lists_of_lengths])

    # When there are too many contigs to plot individually, consecutive
    # groups of `multiplicator` lengths are summed into a single point.
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [
            [sum(lengths[((i - 1) * multiplicator):(i * multiplicator)])
             for i in range(1, max_points)
             if (i * multiplicator) < len(lengths)]
            for lengths in lists_of_lengths
        ]
        if len(reference_lengths) > 1:
            reference_lengths = [
                sum(reference_lengths[((i - 1) * multiplicator):(i * multiplicator)])
                if (i * multiplicator) < len(reference_lengths)
                else sum(reference_lengths[((i - 1) * multiplicator):])
                for i in range(1, max_points)
            ] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        # The last point of every assembly collects all remaining lengths.
        for corr_lengths, lengths in zip(corr_lists_of_lengths, lists_of_lengths):
            last_index = len(corr_lengths)
            corr_lengths.append(sum(lengths[last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [sorted(lengths, reverse=True)
                                 for lengths in lists_of_lengths]

    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths,
                                        corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for idx, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(
            contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)

        # None marks "cannot be computed" (zero total length) so that
        # neither the log line nor the report field divides by zero.
        if total_length != 0:
            ns_per_100kbp = float(number_of_Ns) * 100000.0 / float(total_length)
        else:
            ns_per_100kbp = None

        # As above, the conditional must be parenthesised, otherwise the
        # whole message is replaced by 'undefined' for an empty assembly.
        logger.info(' ' + qutils.index_to_str(idx) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' %.2f' % ns_per_100kbp if ns_per_100kbp is not None else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(
                    reporting.Fields.GC,
                    ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(
                reporting.Fields.UNCALLED_PERCENT,
                ('%.2f' % ns_per_100kbp if ns_per_100kbp is not None else None))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(
                    reporting.Fields.REFGC,
                    ('%.2f' % reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil(
        (largest_contig / 1000) / 600)  # divide on height of plot

    # The reference distribution (if any) goes last; reference_index records
    # its position for the consumers below.
    list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths,
                                list_of_GC_distributions_with_ref,
                                list_of_GC_contigs_distributions,
                                reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points,
                    contigs_fpaths, lists_of_lengths,
                    join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points,
                        contigs_fpaths, lists_of_lengths,
                        join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length] * len(contigs_fpaths))

    if qconfig.draw_plots:
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                join(output_dirpath, 'cumulative_plot'),
                                'Cumulative length')
        if not qconfig.no_gc:
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths,
                                    list_of_GC_distributions_with_ref,
                                    join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(
                    contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(
                    contigs_fpath, GC_distribution,
                    join(output_dirpath,
                         qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot'))

        if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)

    logger.main_info('Done.')
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    """Align one assembly to the reference and analyze the alignments.

    Runs ``align_contigs``; on success parses the coords file, loads the
    reference sequences and (when ``qconfig.show_snps`` is set) the SNP
    calls, then delegates to ``analyze_contigs``, ``analyze_coverage`` and
    ``print_results``.  Unless ``qconfig.space_efficient`` is set, the
    misassembled contigs are also written to a ``.mis_contigs.fa`` file.
    For a combined reference, per-chromosome alignment and unique-contig
    tables are written as well.

    Args:
        is_cyclic: forwarded to ``analyze_contigs``.
        index: assembly index, used only to prefix log messages.
        contigs_fpath: path to the assembly FASTA.
        output_dirpath: directory for logs and report files.
        ref_fpath: path to the reference FASTA.
        old_contigs_fpath: forwarded to ``align_contigs``.
        bed_fpath: not used by this function.
        parallel_by_chr: forwarded to ``align_contigs``.
        threads: forwarded to ``align_contigs``.

    Returns:
        A 5-tuple ``(status, result, aligned_lengths,
        misassemblies_in_contigs, aligned_lengths_by_contigs)``.  When the
        alignment step does not return ``NucmerStatus.OK`` the tuple is
        ``(status, {}, [], [], [])``.
    """
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        report_prefix = qconfig.contig_report_fname_pattern % corr_assembly_label
        log_out_fpath = join(output_dirpath, report_prefix + '.stdout')
        log_err_fpath = join(output_dirpath, report_prefix + '.stderr')
        icarus_out_fpath = join(output_dirpath,
                                qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, report_prefix + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, report_prefix + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig',
                          'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    # NOTE(review): these positional arguments and the NucmerStatus values do
    # not line up with the align_contigs definition at the top of this file
    # (output_fpath, out_basename, ... returning AlignerStatus) - confirm
    # which definition is actually in effect for this call.
    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error(' ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.'
                              if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' +
                                contigs_fpath + ':' + coords_fpath + ' doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' +
                            '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' +
                                contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' +
                            '\'' + assembly_label + '\'.')
        # These two handles were opened above and nothing else will close
        # them on this early-exit path.
        icarus_out_f.close()
        misassembly_f.close()
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')

    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        coords_filtered_file = open(coords_filtered_fpath, 'w')
        # The first two lines are copied verbatim to the filtered file.
        coords_filtered_file.write(coords_file.readline())
        coords_filtered_file.write(coords_file.readline())
        for line in coords_file:
            if line.strip() == '':
                break
            assert line[0] != '='
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    references = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        references[name] = seq
        log_out_f.write('\tLoaded [%s]\n' % name)

    # Loading the SNP calls
    used_snps_file = None
    snps = {}
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')
        prev_fields = None
        for line in open_gzipsafe(show_snps_fpath):
            fields = line.split()
            # Skip blank lines and anything not starting with a position.
            if not fields or not fields[0].isdigit():
                continue
            # Skip a record identical to the previous one.
            if prev_fields and fields == prev_fields:
                continue
            ref = fields[10]
            ctg = fields[11]
            pos = int(fields[0])
            loc = int(fields[3])
            snps.setdefault(ref, {}).setdefault(ctg, {}).setdefault(pos, []).append(
                SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=fields[1], ctg_nucl=fields[2]))
            prev_fields = fields
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any): every reference sequence is used as one
    # whole region.
    regions = {}
    ref_lens = {}
    total_reg_len = 0
    total_regions = 0
    # TODO: gff
    for name, seq in references.items():
        regions.setdefault(name, []).append([1, len(seq)])
        ref_lens[name] = len(seq)
        total_regions += 1
        total_reg_len += ref_lens[name]
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f,
                         coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, \
        misassemblies_in_contigs, aligned_lengths_by_contigs = \
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath,
                        aligns, ref_features, ref_lens, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features,
                                   snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath,
                           total_indels_info, result)

    if not qconfig.space_efficient:
        # outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs]
        fastaparser.write_fasta(
            join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'),
            fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath,
                                    qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' +
                     qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once; contig names are matched against it in the loop below.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One row per chromosome: its name, then the contigs aligned to it.
                    alignment_tsv_f.write(chr_name)
                    contigs = set(align.contig for align in chr_aligns)
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Each contig is considered only for the first
                        # chromosome it is encountered on.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        len_cov_match = len_cov_pattern.search(contig)
                        if len_cov_match:
                            contig_len, contig_cov = len_cov_match.groups()
                            # Written only when more than 90% of the length
                            # stated in the contig name is aligned here.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) +
                                                       '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, \
            misassemblies_in_contigs, aligned_lengths_by_contigs
    return NucmerStatus.OK, result, aligned_lengths, \
        misassemblies_in_contigs, aligned_lengths_by_contigs