def partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template, labels):
    """Split the contigs of every assembly among the references they align to.

    Returns a list of (ref_fpath, assemblies ordered by label) pairs and the
    assemblies built from contigs that aligned to no reference.
    """
    # one (initially empty) bucket per reference, keyed by reference name
    buckets = dict((qutils.name_from_fpath(fpath), []) for fpath in ref_fpaths)
    n_jobs = min(qconfig.max_threads, len(assemblies))
    job_args = [(asm, buckets, corrected_dirpath, alignments_fpath_template)
                for asm in assemblies]
    assemblies_dicts, not_aligned_assemblies = run_parallel(
        parallel_partition_contigs, job_args, n_jobs)
    partitioned = []
    for ref_fpath in ref_fpaths:
        ref_name = qutils.name_from_fpath(ref_fpath)
        # merge per-job results for this reference into one unordered set
        merged = {asm for job in assemblies_dicts for asm in job[ref_name]}
        ordered = []
        for label in labels:  # restore the user-specified assembly order
            for assembly in merged:
                if assembly.label == label:
                    ordered.append(assembly)
                    break
        partitioned.append((ref_fpath, ordered))
    return partitioned, not_aligned_assemblies
def partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template, labels):
    """Split the contigs of every assembly among references (joblib variant).

    Returns a list of (ref_fpath, assemblies ordered by label) pairs and the
    list of not-aligned assemblies, one entry per input assembly.
    """
    # per-reference empty buckets, keyed by reference name; shared by all jobs
    buckets = dict((qutils.name_from_fpath(fpath), []) for fpath in ref_fpaths)
    n_jobs = min(qconfig.max_threads, len(assemblies))
    # pick the joblib port matching the running interpreter
    if is_python2():
        from joblib2 import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    job_results = Parallel(n_jobs=n_jobs)(
        delayed(parallel_partition_contigs)(asm, buckets, corrected_dirpath,
                                            alignments_fpath_template)
        for asm in assemblies)
    per_job_dicts = [res[0] for res in job_results]
    partitioned = []
    for ref_fpath in ref_fpaths:
        ref_name = qutils.name_from_fpath(ref_fpath)
        merged = {a for job in per_job_dicts for a in job[ref_name]}
        ordered = []
        for label in labels:  # keep the user's assembly ordering
            for assembly in merged:
                if assembly.label == label:
                    ordered.append(assembly)
                    break
        partitioned.append((ref_fpath, ordered))
    not_aligned = [res[1] for res in job_results]
    return partitioned, not_aligned
def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath):
    """Transfer BWA read-alignment statistics into the QUAST report fields.

    Parses the per-assembly (and optional per-reference) ``*.stat`` files in
    *output_dir* and fills the corresponding ``reporting.Fields`` entries.
    """
    from quast_libs import reporting
    ref_reads_stats = None
    if ref_fpath:
        ref_name = qutils.name_from_fpath(ref_fpath)
        stats_fpath = join(output_dir, ref_name + '.stat')
        if isfile(stats_fpath):
            ref_reads_stats = parse_reads_stats(stats_fpath)
            if int(ref_reads_stats['mapped']) == 0:
                logger.info(' BWA: nothing aligned for reference.')
    # process all contigs files
    for index, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        stats_fpath = join(output_dir, assembly_name + '.stat')
        if ref_reads_stats:
            # reference-level statistics are replicated into every assembly row
            report.add_field(reporting.Fields.REF_MAPPED_READS, ref_reads_stats['mapped'])
            report.add_field(reporting.Fields.REF_MAPPED_READS_PCNT, ref_reads_stats['mapped_pcnt'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS, ref_reads_stats['paired'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS_PCNT, ref_reads_stats['paired_pcnt'])
            report.add_field(reporting.Fields.REF_SINGLETONS, ref_reads_stats['singletons'])
            report.add_field(reporting.Fields.REF_SINGLETONS_PCNT, ref_reads_stats['singletons_pcnt'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS, ref_reads_stats['misjoint'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS_PCNT, ref_reads_stats['misjoint_pcnt'])
            report.add_field(reporting.Fields.REF_DEPTH, ref_reads_stats['depth'])
            # only report coverage when every configured threshold has a value
            if ref_reads_stats['coverage_thresholds'] and len(ref_reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
                report.add_field(reporting.Fields.REF_COVERAGE__FOR_THRESHOLDS,
                                 [ref_reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
                report.add_field(reporting.Fields.REF_COVERAGE_1X_THRESHOLD, ref_reads_stats['coverage_thresholds'][0])
        # skip assemblies that have no stats file at all
        if not isfile(stats_fpath):
            continue
        reads_stats = parse_reads_stats(stats_fpath)
        report.add_field(reporting.Fields.TOTAL_READS, reads_stats['total'])
        report.add_field(reporting.Fields.LEFT_READS, reads_stats['left'])
        report.add_field(reporting.Fields.RIGHT_READS, reads_stats['right'])
        report.add_field(reporting.Fields.MAPPED_READS, reads_stats['mapped'])
        report.add_field(reporting.Fields.MAPPED_READS_PCNT, reads_stats['mapped_pcnt'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS, reads_stats['paired'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS_PCNT, reads_stats['paired_pcnt'])
        if int(reads_stats['mapped']) == 0:
            logger.info(' ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.')
        report.add_field(reporting.Fields.SINGLETONS, reads_stats['singletons'])
        report.add_field(reporting.Fields.SINGLETONS_PCNT, reads_stats['singletons_pcnt'])
        report.add_field(reporting.Fields.MISJOINT_READS, reads_stats['misjoint'])
        report.add_field(reporting.Fields.MISJOINT_READS_PCNT, reads_stats['misjoint_pcnt'])
        report.add_field(reporting.Fields.DEPTH, reads_stats['depth'])
        if reads_stats['coverage_thresholds'] and len(reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
            report.add_field(reporting.Fields.COVERAGE__FOR_THRESHOLDS,
                             [reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
            report.add_field(reporting.Fields.COVERAGE_1X_THRESHOLD, reads_stats['coverage_thresholds'][0])
def prepare_regular_quast_args(quast_py_args, combined_output_dirpath):
    """Rewrite the combined-reference CLI args for the per-reference QUAST runs."""
    # options whose values must be rebuilt below
    for opt in ['--contig-thresholds', '--sv-bed']:
        remove_from_quast_py_args(quast_py_args, opt, arg=True)
    # flags that only make sense for the combined run
    for opt in ['-s', '--scaffolds', '--combined-ref']:
        remove_from_quast_py_args(quast_py_args, opt)

    quast_py_args += ['--no-check-meta']
    # keep only thresholds not below the minimal contig length
    qconfig.contig_thresholds = ','.join(
        str(threshold) for threshold in qconfig.contig_thresholds
        if threshold >= qconfig.min_contig)
    if not qconfig.contig_thresholds:
        qconfig.contig_thresholds = 'None'
    quast_py_args += ['--contig-thresholds', qconfig.contig_thresholds]

    reads_stats_dirpath = os.path.join(combined_output_dirpath, qconfig.reads_stats_dirname)
    reference_name = qutils.name_from_fpath(qconfig.combined_ref_name)
    # default the BED/coverage paths to the combined-run outputs
    qconfig.bed = qconfig.bed or os.path.join(reads_stats_dirpath, reference_name + '.bed')
    qconfig.cov_fpath = qconfig.cov_fpath or os.path.join(reads_stats_dirpath, reference_name + '.cov')
    qconfig.phys_cov_fpath = qconfig.phys_cov_fpath or os.path.join(reads_stats_dirpath, reference_name + '.physical.cov')
    # re-add the file-based options only when the files actually contain data
    if qconfig.bed and is_non_empty_file(qconfig.bed):
        quast_py_args += ['--sv-bed', qconfig.bed]
    if qconfig.cov_fpath and is_non_empty_file(qconfig.cov_fpath):
        quast_py_args += ['--cov', qconfig.cov_fpath]
    if qconfig.phys_cov_fpath and is_non_empty_file(qconfig.phys_cov_fpath):
        quast_py_args += ['--phys-cov', qconfig.phys_cov_fpath]
def search_sv_with_gridss(main_ref_fpath, bam_fpath, meta_ref_fpaths, output_dirpath, err_fpath):
    """Detect structural variations with GRIDSS and save them to a BED file.

    Returns the BED path, or None when Java 1.8+ or Rscript is unavailable.
    """
    logger.info(' Searching structural variations with GRIDSS...')
    ref_label = qutils.name_from_fpath(main_ref_fpath)
    final_bed_fpath = join(output_dirpath, ref_label + '_' + qconfig.sv_bed_fname)
    if isfile(final_bed_fpath):
        logger.info(' Using existing file: ' + final_bed_fpath)
        return final_bed_fpath
    # GRIDSS needs both a recent JVM and Rscript on PATH
    if not get_path_to_program('java') or not check_java_version(1.8):
        logger.warning('Java 1.8 or later is required to run GRIDSS. Please install it and rerun QUAST.')
        return None
    if not get_path_to_program('Rscript'):
        logger.warning('R is required to run GRIDSS. Please install it and rerun QUAST.')
        return None
    if meta_ref_fpaths:
        # metaQUAST mode: process every sub-reference in parallel, then merge
        job_count = min(len(meta_ref_fpaths), qconfig.max_threads)
        threads_per_job = max(1, qconfig.max_threads // job_count)
        job_args = [(cur_ref_fpath, output_dirpath, err_fpath, threads_per_job)
                    for cur_ref_fpath in meta_ref_fpaths]
        bed_fpaths = run_parallel(process_one_ref, job_args, job_count, filter_results=True)
        if bed_fpaths:
            qutils.cat_files(bed_fpaths, final_bed_fpath)
    else:
        process_one_ref(main_ref_fpath, output_dirpath, err_fpath, qconfig.max_threads,
                        bam_fpath=bam_fpath, bed_fpath=final_bed_fpath)
    logger.info(' Saving to: ' + final_bed_fpath)
    return final_bed_fpath
def save_total_report(output_dirpath, min_contig, ref_fpath):
    """Serialize the total report (plus per-reference subreports for combined refs)."""
    from quast_libs import reporting
    asm_names = [qutils.label_from_fpath(fpath) for fpath in reporting.assembly_fpaths]
    report = reporting.table(reporting.Fields.grouped_order)
    subreports = []
    ref_names = []
    if qconfig.is_combined_ref and ref_labels_by_chromosomes:
        # combined reference: one sub-table per underlying reference
        ref_names = sorted(set(ref_labels_by_chromosomes.values()))
        subreports = [reporting.table(reporting.Fields.grouped_order, ref_name=name)
                      for name in ref_names]
    now = datetime.datetime.now()
    payload = {
        'date': now.strftime('%d %B %Y, %A, %H:%M:%S'),
        'assembliesNames': asm_names,
        'referenceName': qutils.name_from_fpath(ref_fpath) if ref_fpath else '',
        'order': list(range(len(asm_names))),
        'report': report,
        'subreferences': ref_names,
        'subreports': subreports,
        'minContig': min_contig,
    }
    return save(join(output_dirpath, total_report_fname), payload)
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Predict genes with GeneMarkS: train a model (gmsn.pl), then run gmhmmp.

    Returns a list of parsed genes, or None if model training failed.
    """
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
    fasta_name = qutils.name_from_fpath(fasta_fpath)
    # fix: open err_fpath in a context manager -- the original handle was never
    # closed, leaking a file descriptor on every call (incl. the early return)
    with open(err_fpath, 'w') as err_file:
        return_code = qutils.call_subprocess(
            ['perl', tool_exec_fpath, '--name', fasta_name, '--clean', '--out', tmp_dirpath, fasta_fpath],
            stdout=err_file, stderr=err_file,
            indent=' ' + qutils.index_to_str(index))
    if return_code != 0:
        return None
    genes = []
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
    sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
    out_fpath = sub_fasta_fpath + '.gmhmm'
    heu_fpath = os.path.join(tmp_dirpath, fasta_name + '_hmm_heuristic.mod')
    with open(err_fpath, 'a') as err_file:
        ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath, out_fpath, err_file, index)
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))
    if not qconfig.debug:
        # temporary training dir is only kept in debug mode
        shutil.rmtree(tmp_dirpath)
    return genes
def _run_quast_per_ref(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies,
                       total_num_notifications, is_parallel_run=False):
    """Run quast.py on the contigs aligned to a single reference.

    Returns (ref_name, json_text, total_num_notifications); the first two are
    None when nothing aligned to this reference.
    """
    ref_name = qutils.name_from_fpath(ref_fpath)
    # guard clause: nothing aligned to this reference
    if not ref_assemblies:
        logger.main_info('\nNo contigs were aligned to the reference ' + ref_name + ', skipping..')
        return None, None, total_num_notifications
    output_dirpath = os.path.join(output_dirpath_per_ref, ref_name)
    run_name = 'for the contigs aligned to ' + ref_name
    logger.main_info('\nStarting quast.py ' + run_name + '... (logging to ' +
                     os.path.join(output_dirpath, qconfig.LOGGER_DEFAULT_NAME) + '.log)')
    return_code, total_num_notifications = _start_quast_main(
        quast_py_args,
        assemblies=ref_assemblies,
        reference_fpath=ref_fpath,
        output_dirpath=output_dirpath,
        num_notifications_tuple=total_num_notifications,
        is_parallel_run=is_parallel_run)
    json_text = None
    if qconfig.html_report:
        from quast_libs.html_saver import json_saver
        json_text = json_saver.json_text
    return ref_name, json_text, total_num_notifications
def get(assembly_fpath, ref_name=None):
    """Return (creating on first use) the Report for an assembly/reference pair."""
    # default the reference name from the globally configured reference
    if not ref_name and qconfig.reference:
        ref_name = qutils.name_from_fpath(qconfig.reference)
    if assembly_fpath not in assembly_fpaths:
        assembly_fpaths.append(assembly_fpath)
    key = (os.path.abspath(assembly_fpath), ref_name)
    return reports.setdefault(key, Report(qutils.label_from_fpath(assembly_fpath)))
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Run GeneMarkS gene prediction: gmsn.pl training followed by gmhmmp.

    Returns the parsed gene list, or None when the training step fails.
    """
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
    fasta_name = qutils.name_from_fpath(fasta_fpath)
    # fix: the original opened err_fpath without ever closing it, leaking the
    # handle on every invocation; a with-block closes it on all paths
    with open(err_fpath, 'w') as err_file:
        return_code = qutils.call_subprocess([
            'perl', tool_exec_fpath, '--name', fasta_name, '--clean', '--out',
            tmp_dirpath, fasta_fpath
        ], stdout=err_file, stderr=err_file,
            indent=' ' + qutils.index_to_str(index))
    if return_code != 0:
        return None
    genes = []
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
    sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
    out_fpath = sub_fasta_fpath + '.gmhmm'
    heu_fpath = os.path.join(tmp_dirpath, fasta_name + '_hmm_heuristic.mod')
    with open(err_fpath, 'a') as err_file:
        ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath, out_fpath, err_file, index)
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))
    if not qconfig.debug:
        # keep the scratch dir only when debugging
        shutil.rmtree(tmp_dirpath)
    return genes
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Predict genes with GeneMark-ES (gmes_petap.pl) and parse its GTF output.

    Returns the list of parsed genes, or None when the tool exits non-zero.
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')
    # per-input scratch directory (suffix keeps parallel runs apart)
    tmp_dirpath += qutils.name_from_fpath(fasta_fpath)
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)
    # fix: the err file handle was opened and never closed (leaked on every
    # call, including the early-return path); use a context manager
    with open(err_fpath, 'w') as err_file:
        return_code = qutils.call_subprocess([
            'perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores',
            str(num_threads), '--sequence', fasta_fpath, '--out', tmp_dirpath
        ] + (['--fungus'] if qconfig.is_fungus else []),
            stdout=err_file, stderr=err_file,
            indent=' ' + qutils.index_to_str(index))
    if return_code != 0:
        return None
    genes = []
    # collect every produced *gtf file (names gathered across the whole tree)
    fnames = [fname for (path, dirs, files) in os.walk(tmp_dirpath) for fname in files]
    for fname in fnames:
        if fname.endswith('gtf'):
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
def get_unique_covered_regions(ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath, use_long_reads=False):
    # Mask repeats in the reference with Red, keep repeats at least as long as
    # the insert size, re-map them with minimap2 and subtract repeat/uncovered
    # regions from the reference. Returns (unique_covered_regions,
    # repeats_regions) or (None, None) when repeat masking failed.
    red_genome_dir = os.path.join(tmp_dir, 'tmp_red')
    if isdir(red_genome_dir):
        shutil.rmtree(red_genome_dir)
    os.makedirs(red_genome_dir)
    ref_name = qutils.name_from_fpath(ref_fpath)
    ref_symlink = os.path.join(red_genome_dir, ref_name + '.fa')  ## Red recognizes only *.fa files
    if os.path.islink(ref_symlink):
        os.remove(ref_symlink)
    os.symlink(ref_fpath, ref_symlink)
    logger.info(' ' + 'Running repeat masking tool...')
    repeats_fpath = os.path.join(tmp_dir, ref_name + '.rpt')
    if is_non_empty_file(repeats_fpath):
        return_code = 0
        logger.info(' ' + 'Using existing file ' + repeats_fpath + '...')
    else:
        return_code = qutils.call_subprocess([binary_fpath, '-gnm', red_genome_dir, '-rpt', tmp_dir, '-frm', '2', '-min', '5'],
                                             stdout=open(log_fpath, 'w'), stderr=open(log_fpath, 'w'), indent=' ')
    if return_code == 0 and repeats_fpath and exists(repeats_fpath):
        long_repeats_fpath = os.path.join(tmp_dir, qutils.name_from_fpath(ref_fpath) + '.long.rpt')
        with open(long_repeats_fpath, 'w') as out:
            with open(repeats_fpath) as in_f:
                for line in in_f:
                    l = line.split('\t')
                    # keep only repeats long enough to confuse read pairing
                    repeat_len = int(l[2]) - int(l[1])
                    if repeat_len >= insert_size:
                        # NOTE(review): drops the first character of each kept
                        # line -- presumably stripping a leading marker from
                        # Red's .rpt format; confirm against Red's output spec
                        out.write(line[1:])
        repeats_fasta_fpath = os.path.join(tmp_dir, qutils.name_from_fpath(ref_fpath) + '.fasta')
        coords_fpath = os.path.join(tmp_dir, qutils.name_from_fpath(ref_fpath) + '.rpt.coords.txt')
        if not is_non_empty_file(coords_fpath):
            # remove a possibly stale FASTA index before running getfasta
            fasta_index_fpath = ref_fpath + '.fai'
            if exists(fasta_index_fpath):
                os.remove(fasta_index_fpath)
            qutils.call_subprocess([bedtools_fpath('bedtools'), 'getfasta', '-fi', ref_fpath, '-bed', long_repeats_fpath, '-fo', repeats_fasta_fpath],
                                   stderr=open(log_fpath, 'w'), indent=' ')
            # map the extracted repeat sequences back onto the reference
            cmdline = [minimap_fpath(), '-c', '-x', 'asm10', '-N', '50', '--mask-level', '1', '--no-long-join', '-r', '100',
                       '-t', str(qconfig.max_threads), '-z', '200', ref_fpath, repeats_fasta_fpath]
            qutils.call_subprocess(cmdline, stdout=open(coords_fpath, 'w'), stderr=open(log_fpath, 'a'))
        filtered_repeats_fpath, repeats_regions = check_repeats_instances(coords_fpath, long_repeats_fpath, use_long_reads)
        unique_covered_regions = remove_repeat_regions(ref_fpath, filtered_repeats_fpath, uncovered_fpath)
        return unique_covered_regions, repeats_regions
    return None, None
def align_reference(ref_fpath, output_dir, using_reads='all', calculate_coverage=False):
    # Align reads against the reference, optionally estimate the insert size
    # and compute coverage/uncovered regions. Returns
    # (sam_fpath, bam_fpath, uncovered_fpath), or (None, None) when alignment
    # was required but produced no SAM file.
    required_files = []
    ref_name = qutils.name_from_fpath(ref_fpath)
    cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    if using_reads != 'all':
        # per-library run: suffix the outputs so different runs do not clash
        cov_fpath = add_suffix(cov_fpath, using_reads)
        uncovered_fpath = add_suffix(uncovered_fpath, using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    if not is_non_empty_file(insert_size_fpath) and (using_reads == 'all' or using_reads == 'pe'):
        required_files.append(insert_size_fpath)
    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)
    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')
    correct_chr_names, sam_fpath, bam_fpath = align_single_file(ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                                                qconfig.max_threads, sam_fpath=qconfig.reference_sam,
                                                                bam_fpath=qconfig.reference_bam, required_files=required_files,
                                                                is_reference=True, alignment_only=True, using_reads=using_reads)
    if not qconfig.optimal_assembly_insert_size or qconfig.optimal_assembly_insert_size == 'auto':
        if using_reads == 'pe' and sam_fpath:
            insert_size, std_dev = calculate_insert_size(sam_fpath, output_dir, ref_name)
            if not insert_size:
                logger.info(' Failed calculating insert size.')
            else:
                qconfig.optimal_assembly_insert_size = insert_size
        elif using_reads == 'all' and is_non_empty_file(insert_size_fpath):
            # reuse the insert size cached by a previous run; best-effort only
            try:
                insert_size = int(open(insert_size_fpath).readline())
                if insert_size:
                    qconfig.optimal_assembly_insert_size = insert_size
            except:
                pass
    # everything already existed -- nothing more to compute
    if not required_files:
        return sam_fpath, bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info(' Failed detecting uncovered regions.')
        return None, None
    if calculate_coverage:
        bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
        bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))
        if is_non_empty_file(bam_sorted_fpath):
            logger.info(' Using existing sorted BAM-file: ' + bam_sorted_fpath)
        else:
            # keep only mapped reads, then sort for coverage computation
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger, filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
    if not is_non_empty_file(uncovered_fpath) and calculate_coverage:
        get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_fpath,
                     correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return sam_fpath, bam_fpath, uncovered_fpath
def save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger):
    # Build per-assembly tables of interspecies translocations between the
    # sub-references of a combined reference, write them to .info files and
    # (when plotting is enabled) draw a summary plot.
    ref_misassemblies = [result['istranslocations_by_refs'] if result else [] for result in results]
    potential_misassemblies_by_refs = [result['potential_misassemblies_by_refs'] if result else [] for result in results]
    all_refs = sorted(list(set([ref for ref in ref_labels_by_chromosomes.values()])))
    misassemblies_by_refs_rows = []
    row = {'metricName': 'References', 'values': all_refs}
    misassemblies_by_refs_rows.append(row)
    if ref_misassemblies:
        for i, fpath in enumerate(contigs_fpaths):
            row = {'metricName': qutils.label_from_fpath(fpath), 'values': []}
            misassemblies_by_refs_rows.append(row)
            if ref_misassemblies[i]:
                assembly_name = qutils.name_from_fpath(fpath)
                all_rows = []
                # header row: references are numbered 1..N in the .info file
                row = {'metricName': 'References', 'values': [ref_num + 1 for ref_num in range(len(all_refs))]}
                all_rows.append(row)
                for k in all_refs:
                    row = {'metricName': k, 'values': []}
                    for ref in all_refs:
                        # no self-translocations; None marks pairs with no data
                        if ref == k or ref not in ref_misassemblies[i]:
                            row['values'].append(None)
                        else:
                            row['values'].append(ref_misassemblies[i][ref][k])
                    # per-reference total: found plus potential misassemblies
                    misassemblies_by_refs_rows[-1]['values'].append(max(0, sum([r for r in row['values'] if r]) + potential_misassemblies_by_refs[i][k]))
                    all_rows.append(row)
                misassembly_by_ref_fpath = os.path.join(output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
                with open(misassembly_by_ref_fpath, 'w') as misassembly_by_ref_file:
                    misassembly_by_ref_file.write('Number of interspecies translocations by references: \n')
                print_file(all_rows, misassembly_by_ref_fpath, append_to_existing_file=True)
                # append the legend mapping reference numbers to names
                with open(misassembly_by_ref_fpath, 'a') as misassembly_by_ref_file:
                    misassembly_by_ref_file.write('References:\n')
                    for ref_num, ref in enumerate(all_refs):
                        misassembly_by_ref_file.write(str(ref_num + 1) + ' - ' + ref + '\n')
                logger.info(' Information about interspecies translocations by references for %s is saved to %s' % (assembly_name, misassembly_by_ref_fpath))
    misassemblies = []
    if qconfig.draw_plots:
        from quast_libs import plotter
        # drop assemblies that produced no values; iterate over a slice copy
        # because rows are removed from the underlying list
        aligned_contigs_labels = []
        for row in misassemblies_by_refs_rows[1:]:
            if row['values']:
                aligned_contigs_labels.append(row['metricName'])
            else:
                misassemblies_by_refs_rows.remove(row)
        # transpose: collect one column of values per reference
        for i in range(len(all_refs)):
            cur_results = []
            for row in misassemblies_by_refs_rows[1:]:
                if row['values']:
                    cur_results.append(row['values'][i])
            misassemblies.append(cur_results)
        is_translocations_plot_fpath = os.path.join(output_dir, 'intergenomic_misassemblies.' + qconfig.plot_extension)
        plotter.draw_meta_summary_plot('', output_dir, aligned_contigs_labels, all_refs,
                                       misassemblies_by_refs_rows, misassemblies, is_translocations_plot_fpath,
                                       title='Intergenomic misassemblies (found and supposed)', reverse=False,
                                       yaxis_title=None, print_all_refs=True)
def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
    # Append one reference sequence to the corrected FASTA and register its
    # chromosome under the reference label. Returns (corr_seq_name,
    # corr_seq_fpath) or (None, None) when the sequence fails validation.
    seq_fname = ref_name
    seq_fname += ref_fasta_ext
    if total_references > 1:
        # multi-sequence reference: keep appending to the last corrected file
        corr_seq_fpath = corrected_ref_fpaths[-1]
    else:
        corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
        corrected_ref_fpaths.append(corr_seq_fpath)
    corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
    if not qconfig.no_check:
        corr_seq = correct_seq(seq, ref_fpath)
        if not corr_seq:
            return None, None
    # NOTE(review): the original `seq` (not `corr_seq`) is written even when
    # correct_seq() ran -- presumably correct_seq only validates rather than
    # transforms; confirm before relying on this
    fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
    contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
    chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))
    return corr_seq_name, corr_seq_fpath
def align_reference(ref_fpath, output_dir, using_reads='all'):
    # Align reads against the reference, cache the SAM/BAM paths in qconfig,
    # estimate the paired-end insert size and compute uncovered regions.
    # Returns (bam_fpath, uncovered_fpath), or (None, None) when alignment
    # was required but produced no SAM file.
    required_files = []
    ref_name = qutils.name_from_fpath(ref_fpath)
    cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    if using_reads != 'all':
        # per-library run: suffix the outputs so different runs do not clash
        cov_fpath = add_suffix(cov_fpath, using_reads)
        uncovered_fpath = add_suffix(uncovered_fpath, using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    if not is_non_empty_file(insert_size_fpath) and (using_reads == 'all' or using_reads == 'paired_end'):
        required_files.append(insert_size_fpath)
    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)
    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')
    correct_chr_names, sam_fpath, bam_fpath = align_single_file(ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                                                qconfig.max_threads, sam_fpath=qconfig.reference_sam,
                                                                bam_fpath=qconfig.reference_bam, required_files=required_files,
                                                                is_reference=True, alignment_only=True, using_reads=using_reads)
    # remember the alignment files for later stages
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not qconfig.ideal_assembly_insert_size or qconfig.ideal_assembly_insert_size == 'auto':
        if using_reads == 'paired_end' and sam_fpath:
            insert_size = calculate_insert_size(sam_fpath, output_dir, ref_name)
            if not insert_size:
                logger.info(' Failed calculating insert size.')
            else:
                qconfig.ideal_assembly_insert_size = insert_size
    # everything already existed -- nothing more to compute
    if not required_files:
        return bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info(' Failed detecting uncovered regions.')
        return None, None
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))
    if is_non_empty_file(bam_sorted_fpath):
        logger.info(' Using existing sorted BAM-file: ' + bam_sorted_fpath)
    else:
        # keep only mapped reads, then sort for coverage computation
        sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger, filter_rule='not unmapped')
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
    if not is_non_empty_file(uncovered_fpath):
        get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_fpath,
                     correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return bam_fpath, uncovered_fpath
def group_references(chr_names, contig_names_by_refs, chromosomes_length, ref_fpath):
    """Decide how chromosomes are grouped into 'full' reference names.

    Returns (chr_full_names, contig_names_by_refs).
    """
    if contig_names_by_refs:
        # keep the first-seen order of reference labels, dropping duplicates
        seen = set()
        chr_full_names = []
        for ref in (contig_names_by_refs[contig] for contig in chr_names):
            if ref not in seen:
                seen.add(ref)
                chr_full_names.append(ref)
    elif sum(chromosomes_length.values()) < qconfig.MAX_SIZE_FOR_COMB_PLOT and len(chr_names) > 1:
        # small genome: collapse all chromosomes into one combined entry
        chr_full_names = [qutils.name_from_fpath(ref_fpath)]
    else:
        # each chromosome becomes its own group, mapped to itself
        contig_names_by_refs = dict()
        chr_full_names = chr_names
        for name in chr_names:
            contig_names_by_refs[name] = name
    return chr_full_names, contig_names_by_refs
def save_total_report(output_dirpath, min_contig, ref_fpath):
    """Write the total assembly report JSON into the output directory."""
    from quast_libs import reporting
    names = [qutils.label_from_fpath(fpath) for fpath in reporting.assembly_fpaths]
    table = reporting.table(reporting.Fields.grouped_order)
    now = datetime.datetime.now()
    content = {
        'date': now.strftime('%d %B %Y, %A, %H:%M:%S'),
        'assembliesNames': names,
        'referenceName': qutils.name_from_fpath(ref_fpath) if ref_fpath else '',
        'order': list(range(len(names))),
        'report': table,
        'minContig': min_contig,
        # scaffold candidates are omitted entirely when the list is empty
        'assembliesWithNs': qconfig.potential_scaffolds_assemblies if qconfig.potential_scaffolds_assemblies else None,
    }
    return save(join(output_dirpath, total_report_fname), content)
def partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template, labels):
    """Distribute the contigs of all assemblies among their references.

    Returns (list of (ref_fpath, assemblies ordered by label), list of
    not-aligned assemblies).
    """
    # one empty bucket per reference name; handed to every parallel job
    buckets = dict((qutils.name_from_fpath(fpath), []) for fpath in ref_fpaths)
    n_jobs = min(qconfig.max_threads, len(assemblies))
    # joblib backend depends on the interpreter major version
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    job_results = Parallel(n_jobs=n_jobs)(
        delayed(parallel_partition_contigs)(asm, buckets, corrected_dirpath, alignments_fpath_template)
        for asm in assemblies)
    per_job_dicts = [res[0] for res in job_results]
    partitioned = []
    for ref_fpath in ref_fpaths:
        ref_name = qutils.name_from_fpath(ref_fpath)
        merged = {a for job in per_job_dicts for a in job[ref_name]}
        in_label_order = []
        for label in labels:  # preserve the user-given assembly ordering
            for assembly in merged:
                if assembly.label == label:
                    in_label_order.append(assembly)
                    break
        partitioned.append((ref_fpath, in_label_order))
    not_aligned = [res[1] for res in job_results]
    return partitioned, not_aligned
def save_total_report(output_dirpath, min_contig, ref_fpath):
    """Write the total assembly report JSON into the output directory.

    Returns whatever save() returns for the written report file.
    """
    from quast_libs import reporting
    # fix: map() returns a one-shot iterator on Python 3; the enumerate() in
    # the dict below exhausted it, so 'assembliesNames' serialized as empty.
    # Materialize the labels into a list instead.
    asm_names = [qutils.label_from_fpath(fpath) for fpath in reporting.assembly_fpaths]
    report = reporting.table(reporting.Fields.grouped_order)
    t = datetime.datetime.now()
    return save(join(output_dirpath, total_report_fname), {
        'date': t.strftime('%d %B %Y, %A, %H:%M:%S'),
        'assembliesNames': asm_names,
        'referenceName': qutils.name_from_fpath(ref_fpath) if ref_fpath else '',
        'order': [i for i, _ in enumerate(asm_names)],
        'report': report,
        'minContig': min_contig,
        # scaffold candidates are omitted entirely when the list is empty
        'assembliesWithNs': qconfig.potential_scaffolds_assemblies if qconfig.potential_scaffolds_assemblies else None,
    })
def _start_quast_main(args, assemblies, reference_fpath=None, output_dirpath=None, num_notifications_tuple=None,
                      labels=None, run_regular_quast=False, is_combined_ref=False, is_parallel_run=False):
    """Build a quast.py command line and invoke quast.main() in-process.

    Returns (return_code, num_notifications_tuple); the notification tuple is
    accumulated across runs. For combined-ref runs, `labels` and `assemblies`
    are updated in place from qconfig.
    """
    args = args[:]
    args.extend([asm.fpath for asm in assemblies])
    if reference_fpath:
        args.append('-R')
        args.append(reference_fpath)
    if output_dirpath:
        args.append('-o')
        args.append(output_dirpath)
    args.append('--labels')

    def quote(line):
        # labels containing spaces must survive later re-parsing
        if ' ' in line:
            line = '"%s"' % line
        return line

    args.append(quote(', '.join([asm.label for asm in assemblies])))
    import quast
    # fix: use importlib.reload with a narrow except instead of the deprecated
    # `imp` module and a bare `except:` that swallowed every error (including
    # KeyboardInterrupt); matches the convention used elsewhere in the project
    try:
        import importlib
        importlib.reload(quast)
    except (ImportError, AttributeError):
        reload(quast)  # Python 2 builtin fallback
    quast.logger.set_up_console_handler(indent_val=1, debug=qconfig.debug)
    if not run_regular_quast:
        reference_name = os.path.basename(qutils.name_from_fpath(reference_fpath)) if reference_fpath else None
        quast.logger.set_up_metaquast(is_parallel_run=is_parallel_run, ref_name=reference_name)
    if is_combined_ref:
        logger.info_to_file('(logging to ' + os.path.join(output_dirpath, qconfig.LOGGER_DEFAULT_NAME + '.log)'))
    return_code = quast.main(args)
    if num_notifications_tuple:
        cur_num_notifications = quast.logger.get_numbers_of_notifications()
        num_notifications_tuple = list(map(sum, zip(num_notifications_tuple, cur_num_notifications)))
    if is_combined_ref:
        # the combined-ref run may relabel assemblies; propagate to the caller
        labels[:] = [qconfig.assembly_labels_by_fpath[fpath] for fpath in qconfig.assemblies_fpaths]
        assemblies[:] = [Assembly(fpath, qconfig.assembly_labels_by_fpath[fpath]) for fpath in qconfig.assemblies_fpaths]
    return return_code, num_notifications_tuple
def _start_quast_main(args, assemblies, reference_fpath=None, output_dirpath=None, num_notifications_tuple=None,
                      labels=None, run_regular_quast=False, is_combined_ref=False, is_parallel_run=False):
    # Build a quast.py command line for the given assemblies and invoke
    # quast.main() in-process, accumulating notification counters. For
    # combined-ref runs, `labels` and `assemblies` are updated in place.
    args = args[:]
    args.extend([asm.fpath for asm in assemblies])
    if reference_fpath:
        args.append('-R')
        args.append(reference_fpath)
    if output_dirpath:
        args.append('-o')
        args.append(output_dirpath)
    args.append('--labels')

    def quote(line):
        # labels containing spaces must survive later re-parsing
        if ' ' in line:
            line = '"%s"' % line
        return line

    args.append(quote(', '.join([asm.label for asm in assemblies])))
    import quast
    try:
        # Python 3 path: reload via importlib
        import importlib
        importlib.reload(quast)
    except (ImportError, AttributeError):
        # Python 2 fallback: builtin reload
        reload(quast)
    quast.logger.set_up_console_handler(indent_val=1, debug=qconfig.debug)
    if not run_regular_quast:
        reference_name = os.path.basename(qutils.name_from_fpath(reference_fpath)) if reference_fpath else None
        quast.logger.set_up_metaquast(is_parallel_run=is_parallel_run, ref_name=reference_name)
    if is_combined_ref:
        logger.info_to_file('(logging to ' + os.path.join(output_dirpath, qconfig.LOGGER_DEFAULT_NAME + '.log)'))
    return_code = quast.main(args)
    if num_notifications_tuple:
        cur_num_notifications = quast.logger.get_numbers_of_notifications()
        num_notifications_tuple = list(map(sum, zip(num_notifications_tuple, cur_num_notifications)))
    if is_combined_ref:
        # the combined-ref run may relabel assemblies; propagate to the caller
        labels[:] = [qconfig.assembly_labels_by_fpath[fpath] for fpath in qconfig.assemblies_fpaths]
        assemblies[:] = [Assembly(fpath, qconfig.assembly_labels_by_fpath[fpath]) for fpath in qconfig.assemblies_fpaths]
    return return_code, num_notifications_tuple
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Call structural variants for one reference with GRIDSS and return the BED file path.

    Reuses any existing BED / sorted-BAM / VCF artifacts found on disk.
    cur_ref_fpath: reference FASTA; bam_fpath: optional pre-existing BAM
    (otherwise a <ref_name>.sam/.bam pair in output_dirpath is assumed).
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        # drop unmapped reads, then coordinate-sort for GRIDSS
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger, filter_rule='not unmapped')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                               stderr=open(err_fpath, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)
    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # start from a clean working dir: GRIDSS may refuse to resume a partial run
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        env = os.environ.copy()
        env["PATH"] += os.pathsep + bwa_dirpath  # GRIDSS shells out to bwa
        bwa_index(cur_ref_fpath, err_fpath, logger)
        qutils.call_subprocess(['java', '-ea', '-Xmx' + str(max_mem) + 'g', '-Dsamjdk.create_index=true',
                                '-Dsamjdk.use_async_io_read_samtools=true', '-Dsamjdk.use_async_io_write_samtools=true',
                                '-Dsamjdk.use_async_io_write_tribble=true', '-cp', get_gridss_fpath(), 'gridss.CallVariants',
                                'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                                'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'),
                                'REFERENCE_SEQUENCE=' + cur_ref_fpath, 'WORKER_THREADS=' + str(max_threads),
                                'WORKING_DIR=' + vcf_output_dirpath],
                               stderr=open(err_fpath, 'a'), logger=logger, env=env)
    if is_non_empty_file(vcf_fpath):
        # convert VCF breakends to BEDPE and reformat into the final BED
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        qutils.call_subprocess(['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                                'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath,
                                'R=' + cur_ref_fpath, 'INCLUDE_HEADER=TRUE'],
                               stderr=open(err_fpath, 'a'), logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Call structural variants for one reference with GRIDSS and return the BED file path.

    Variant that keeps only properly-paired mapped reads for GRIDSS.
    Reuses any existing BED / sorted-BAM / VCF artifacts found on disk.
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        # keep mapped, properly-paired reads only, then coordinate-sort
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger, filter_rule='not unmapped and proper_pair')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                               stderr=open(err_fpath, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)
    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # start from a clean working dir: GRIDSS may refuse to resume a partial run
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        env = os.environ.copy()
        env["PATH"] += os.pathsep + bwa_dirpath  # GRIDSS shells out to bwa
        bwa_index(cur_ref_fpath, err_fpath, logger)
        qutils.call_subprocess(['java', '-ea', '-Xmx' + str(max_mem) + 'g', '-Dsamjdk.create_index=true',
                                '-Dsamjdk.use_async_io_read_samtools=true', '-Dsamjdk.use_async_io_write_samtools=true',
                                '-Dsamjdk.use_async_io_write_tribble=true', '-cp', get_gridss_fpath(), 'gridss.CallVariants',
                                'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                                'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'),
                                'REFERENCE_SEQUENCE=' + cur_ref_fpath, 'WORKER_THREADS=' + str(max_threads),
                                'WORKING_DIR=' + vcf_output_dirpath],
                               stderr=open(err_fpath, 'a'), logger=logger, env=env)
    if is_non_empty_file(vcf_fpath):
        # convert VCF breakends to BEDPE and reformat into the final BED
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        qutils.call_subprocess(['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                                'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath,
                                'R=' + cur_ref_fpath, 'INCLUDE_HEADER=TRUE'],
                               stderr=open(err_fpath, 'a'), logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
def process_one_ref(cur_ref_fpath, output_dirpath, err_path, bed_fpath=None):
    """Call structural variants for one reference with Manta and return the BED file path.

    Returns None when the input SAM is too small for Manta or Manta's
    configuration step failed; reuses existing artifacts where possible.
    """
    ref = qutils.name_from_fpath(cur_ref_fpath)
    ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
    ref_bam_fpath = os.path.join(output_dirpath, ref + '.bam')
    ref_bamsorted_fpath = os.path.join(output_dirpath, ref + '.sorted.bam')
    ref_bed_fpath = bed_fpath if bed_fpath else os.path.join(output_dirpath, ref + '.bed')
    if os.path.getsize(ref_sam_fpath) < 1024 * 1024:  # TODO: make it better (small files will cause Manta crush -- "not enough reads...")
        logger.info('  SAM file is too small for Manta (%d Kb), skipping..' % (os.path.getsize(ref_sam_fpath) // 1024))
        return None
    if is_non_empty_file(ref_bed_fpath):
        logger.info('  Using existing Manta BED-file: ' + ref_bed_fpath)
        return ref_bed_fpath
    if not os.path.exists(ref_bamsorted_fpath):
        # SAM -> BAM, then coordinate-sort for Manta
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-S', '-f', 'bam',
                                ref_sam_fpath], stdout=open(ref_bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), ref_bam_fpath,
                                '-o', ref_bamsorted_fpath], stderr=open(err_path, 'a'), logger=logger)
    if not is_non_empty_file(ref_bamsorted_fpath + '.bai'):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', ref_bamsorted_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)
    vcfoutput_dirpath = os.path.join(output_dirpath, ref + '_manta')
    found_SV_fpath = os.path.join(vcfoutput_dirpath, 'results/variants/diploidSV.vcf.gz')
    unpacked_SV_fpath = found_SV_fpath + '.unpacked'
    if not is_non_empty_file(found_SV_fpath):
        # Manta requires a clean run directory; recreate it
        if os.path.exists(vcfoutput_dirpath):
            shutil.rmtree(vcfoutput_dirpath, ignore_errors=True)
        os.makedirs(vcfoutput_dirpath)
        # step 1: configManta.py generates runWorkflow.py in the run dir
        qutils.call_subprocess([config_manta_fpath, '--normalBam', ref_bamsorted_fpath,
                                '--referenceFasta', cur_ref_fpath, '--runDir', vcfoutput_dirpath],
                               stdout=open(err_path, 'a'), stderr=open(err_path, 'a'), logger=logger)
        if not os.path.exists(os.path.join(vcfoutput_dirpath, 'runWorkflow.py')):
            return None  # configuration failed
        env = os.environ.copy()
        env['LC_ALL'] = 'C'  # stable locale for the workflow scripts
        # step 2: run the generated workflow locally
        qutils.call_subprocess([os.path.join(vcfoutput_dirpath, 'runWorkflow.py'), '-m', 'local', '-j',
                                str(qconfig.max_threads)], stderr=open(err_path, 'a'), logger=logger, env=env)
    if not is_non_empty_file(unpacked_SV_fpath):
        cmd = 'gunzip -c %s' % found_SV_fpath
        qutils.call_subprocess(shlex.split(cmd), stdout=open(unpacked_SV_fpath, 'w'),
                               stderr=open(err_path, 'a'), logger=logger)
    from quast_libs.manta import vcfToBedpe
    vcfToBedpe.vcfToBedpe(open(unpacked_SV_fpath), open(ref_bed_fpath, 'w'))
    return ref_bed_fpath
def process_one_ref(cur_ref_fpath, output_dirpath, err_path, bed_fpath=None):
    """Call structural variants for one reference with Manta and return the BED file path.

    Variant that resolves the Manta binary via get_manta_fpath() and uses the
    ra_utils copy of vcfToBedpe. Returns None when the input SAM is too small
    for Manta or Manta's configuration step failed.
    """
    ref = qutils.name_from_fpath(cur_ref_fpath)
    ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
    ref_bam_fpath = os.path.join(output_dirpath, ref + '.bam')
    ref_bamsorted_fpath = os.path.join(output_dirpath, ref + '.sorted.bam')
    ref_bed_fpath = bed_fpath if bed_fpath else os.path.join(output_dirpath, ref + '.bed')
    if os.path.getsize(ref_sam_fpath) < 1024 * 1024:  # TODO: make it better (small files will cause Manta crush -- "not enough reads...")
        logger.info('  SAM file is too small for Manta (%d Kb), skipping..' % (os.path.getsize(ref_sam_fpath) // 1024))
        return None
    if is_non_empty_file(ref_bed_fpath):
        logger.info('  Using existing Manta BED-file: ' + ref_bed_fpath)
        return ref_bed_fpath
    if not os.path.exists(ref_bamsorted_fpath):
        # SAM -> BAM, then coordinate-sort for Manta
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-S', '-f', 'bam',
                                ref_sam_fpath], stdout=open(ref_bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), ref_bam_fpath,
                                '-o', ref_bamsorted_fpath], stderr=open(err_path, 'a'), logger=logger)
    if not is_non_empty_file(ref_bamsorted_fpath + '.bai'):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', ref_bamsorted_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)
    vcfoutput_dirpath = os.path.join(output_dirpath, ref + '_manta')
    found_SV_fpath = os.path.join(vcfoutput_dirpath, 'results/variants/diploidSV.vcf.gz')
    unpacked_SV_fpath = found_SV_fpath + '.unpacked'
    if not is_non_empty_file(found_SV_fpath):
        # Manta requires a clean run directory; recreate it
        if os.path.exists(vcfoutput_dirpath):
            shutil.rmtree(vcfoutput_dirpath, ignore_errors=True)
        os.makedirs(vcfoutput_dirpath)
        # step 1: configuration generates runWorkflow.py in the run dir
        qutils.call_subprocess([get_manta_fpath(), '--normalBam', ref_bamsorted_fpath,
                                '--referenceFasta', cur_ref_fpath, '--runDir', vcfoutput_dirpath],
                               stdout=open(err_path, 'a'), stderr=open(err_path, 'a'), logger=logger)
        if not os.path.exists(os.path.join(vcfoutput_dirpath, 'runWorkflow.py')):
            return None  # configuration failed
        env = os.environ.copy()
        env['LC_ALL'] = 'C'  # stable locale for the workflow scripts
        # step 2: run the generated workflow locally
        qutils.call_subprocess([os.path.join(vcfoutput_dirpath, 'runWorkflow.py'), '-m', 'local', '-j',
                                str(qconfig.max_threads)], stderr=open(err_path, 'a'), logger=logger, env=env)
    if not is_non_empty_file(unpacked_SV_fpath):
        cmd = 'gunzip -c %s' % found_SV_fpath
        qutils.call_subprocess(shlex.split(cmd), stdout=open(unpacked_SV_fpath, 'w'),
                               stderr=open(err_path, 'a'), logger=logger)
    from quast_libs.ra_utils import vcfToBedpe
    vcfToBedpe.vcfToBedpe(open(unpacked_SV_fpath), open(ref_bed_fpath, 'w'))
    return ref_bed_fpath
def run_aligner(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    """Align one group of read files of a given type against ref_fpath.

    read_fpaths: list of single file paths (str) or (read1, read2) pairs.
    out_sam_fpaths: mutated in place — each produced SAM path is appended.
    reads_type: 'pe', 'pacbio', 'nanopore', or other single-end type; long
        reads go through minimap2, everything else through bwa mem.
    Side effect: for paired-end reads, the estimated insert size is stored in
    qconfig.optimal_assembly_insert_size and written next to output_dir.
    """
    bwa_cmd = bwa_fpath('bwa') + ' mem -t ' + str(max_threads)
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            # single file: long reads via minimap2 presets, otherwise bwa mem
            # (-p enables smart pairing for interleaved paired-end input)
            if reads_type == 'pacbio' or reads_type == 'nanopore':
                if reads_type == 'pacbio':
                    preset = ' -ax map-pb '
                else:
                    preset = ' -ax map-ont '
                cmdline = minimap_fpath() + ' -t ' + str(max_threads) + preset + ref_fpath + ' ' + reads
            else:
                cmdline = bwa_cmd + (' -p ' if reads_type == 'pe' else ' ') + ref_fpath + ' ' + reads
        else:
            read1, read2 = reads
            cmdline = bwa_cmd + ' ' + ref_fpath + ' ' + read1 + ' ' + read2
        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        bam_fpath = output_fpath.replace('.sam', '.bam')
        if not is_non_empty_file(output_fpath):
            qutils.call_subprocess(shlex.split(cmdline), stdout=open(output_fpath, 'w'),
                                   stderr=open(err_fpath, 'a'), logger=logger)
        if not is_non_empty_file(bam_fpath):
            # fix: the original nested an identical second
            # `if not is_non_empty_file(bam_fpath)` check here — redundant,
            # since nothing changes the file between the two checks
            sambamba_view(output_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
            if reads_type == 'pe':
                # remove PCR/optical duplicates, then replace the original BAM
                bam_dedup_fpath = add_suffix(bam_fpath, 'dedup')
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads),
                                        '--tmpdir', output_dir, bam_fpath, bam_dedup_fpath],
                                       stderr=open(err_fpath, 'a'), logger=logger)
                if exists(bam_dedup_fpath):
                    shutil.move(bam_dedup_fpath, bam_fpath)
        if reads_type == 'pe':
            insert_size, std_dev = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
            if insert_size < qconfig.optimal_assembly_max_IS:
                insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        # keep the largest plausible insert size and persist it for reuse
        qconfig.optimal_assembly_insert_size = max(insert_sizes)
        ref_name = qutils.name_from_fpath(ref_fpath)
        insert_size_fpath = join(output_dir, '..', ref_name + '.is.txt')
        with open(insert_size_fpath, 'w') as out:
            out.write(str(qconfig.optimal_assembly_insert_size))
def save_total_report(output_dirpath, min_contig, ref_fpath):
    """Serialize the overall report (and per-reference subreports when a
    combined reference is in use) to the total-report JSON file.

    Returns whatever save() returns for the written file.
    """
    from quast_libs import reporting

    assembly_names = [qutils.label_from_fpath(fpath) for fpath in reporting.assembly_fpaths]
    main_report = reporting.table(reporting.Fields.grouped_order)

    sub_ref_names = []
    sub_ref_reports = []
    if qconfig.is_combined_ref and ref_labels_by_chromosomes:
        # one subreport per distinct reference among the combined chromosomes
        sub_ref_names = sorted(set(ref_labels_by_chromosomes.values()))
        sub_ref_reports = [reporting.table(reporting.Fields.grouped_order, ref_name=name)
                           for name in sub_ref_names]

    now = datetime.datetime.now()
    payload = {
        'date': now.strftime('%d %B %Y, %A, %H:%M:%S'),
        'assembliesNames': assembly_names,
        'referenceName': qutils.name_from_fpath(ref_fpath) if ref_fpath else '',
        'order': [i for i, _ in enumerate(assembly_names)],
        'report': main_report,
        'subreferences': sub_ref_names,
        'subreports': sub_ref_reports,
        'minContig': min_contig,
    }
    return save(join(output_dirpath, total_report_fname), payload)
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Run GeneMark-ES (gmes_petap.pl) on one FASTA file and collect predicted genes.

    index: assembly index used only for log indentation.
    tmp_dirpath: base temp dir; the FASTA name is appended to keep parallel
        runs separate (plain concatenation, preserved from the original).
    Returns a list of parsed genes, or None if the tool exited non-zero.
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')
    tmp_dirpath += qutils.name_from_fpath(fasta_fpath)
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)
    # fix: use a context manager so the log file is always closed
    # (the original opened err_fpath and never closed the handle)
    with open(err_fpath, 'w') as err_file:
        return_code = qutils.call_subprocess(
            ['perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores', str(num_threads),
             '--sequence', fasta_fpath, '--out', tmp_dirpath],
            stdout=err_file, stderr=err_file, indent='  ' + qutils.index_to_str(index))
    if return_code != 0:
        return None
    genes = []
    # GeneMark-ES writes its predictions as GTF files somewhere under tmp_dirpath
    fnames = [fname for (path, dirs, files) in os.walk(tmp_dirpath) for fname in files]
    for fname in fnames:
        if fname.endswith('gtf'):
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
def _run_quast_per_ref(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, total_num_notifications, is_parallel_run=False):
    """Launch a nested quast.py run on the contigs aligned to one reference.

    Returns (ref_name, json_text, total_num_notifications); the first two are
    None when no contigs were aligned to this reference.
    """
    ref_name = qutils.name_from_fpath(ref_fpath)

    # Guard clause: nothing aligned to this reference -> nothing to run.
    if not ref_assemblies:
        logger.main_info('\nNo contigs were aligned to the reference ' + ref_name + ', skipping..')
        return None, None, total_num_notifications

    output_dirpath = os.path.join(output_dirpath_per_ref, ref_name)
    run_name = 'for the contigs aligned to ' + ref_name
    logger.main_info('\nStarting quast.py ' + run_name + '... (logging to ' +
                     os.path.join(output_dirpath, qconfig.LOGGER_DEFAULT_NAME) + '.log)')

    return_code, total_num_notifications = _start_quast_main(
        quast_py_args,
        assemblies=ref_assemblies,
        reference_fpath=ref_fpath,
        output_dirpath=output_dirpath,
        num_notifications_tuple=total_num_notifications,
        is_parallel_run=is_parallel_run)

    json_text = None
    if qconfig.html_report:
        from quast_libs.html_saver import json_saver
        json_text = json_saver.json_text
    return ref_name, json_text, total_num_notifications
def analyse_coverage(output_dirpath, fpath, chr_names, bam_fpath, stats_fpath, err_fpath, logger):
    """Compute average depth and per-threshold genome coverage from a BAM file.

    Converts the BAM to BED, runs genome coverage, then parses the resulting
    depth histogram and appends the summary lines to stats_fpath.
    """
    filename = qutils.name_from_fpath(fpath)
    bed_fpath = bam_to_bed(output_dirpath, filename, bam_fpath, err_fpath, logger)
    chr_len_fpath = get_chr_len_fpath(fpath, chr_names)
    cov_fpath = join(output_dirpath, filename + '.genomecov')
    calculate_genome_cov(bed_fpath, cov_fpath, chr_len_fpath, err_fpath, logger, print_all_positions=False)

    avg_depth = 0
    coverage_for_thresholds = [0 for threshold in qconfig.coverage_thresholds]
    with open(cov_fpath) as f:
        for line in f:
            l = line.split()  # genome; depth; number of bases; size of genome; fraction of bases with depth
            depth, genome_fraction = int(l[1]), float(l[4])
            # only whole-genome summary rows contribute to the aggregates
            if l[0] == 'genome':
                # weighted mean over the depth histogram
                avg_depth += depth * genome_fraction
                for i, threshold in enumerate(qconfig.coverage_thresholds):
                    if depth >= threshold:
                        coverage_for_thresholds[i] += genome_fraction

    with open(stats_fpath, 'a') as out_f:
        out_f.write('%s depth\n' % int(avg_depth))
        for i, threshold in enumerate(qconfig.coverage_thresholds):
            out_f.write('%.2f coverage >= %sx\n' % (coverage_for_thresholds[i] * 100, threshold))
def align_ideal_assembly(ref_fpath, assembly_fpath, output_dir, log_fpath, err_fpath):
    """Map an (ideal) assembly back to the reference and compute uncovered regions.

    Returns the path to the '<assembly>.cov.uncovered' file listing reference
    regions not covered by the assembly; reuses existing BAM files when present.
    """
    sam_fpath = join(output_dir, basename(assembly_fpath) + '.sam')
    bam_fpath = sam_fpath.replace('.sam', '.bam')
    bam_mapped_fpath = add_suffix(bam_fpath, 'mapped')
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    if not is_non_empty_file(bam_fpath):
        # align the assembly contigs with bwa mem, then convert SAM -> BAM
        bwa_index(ref_fpath, err_fpath, logger)
        qutils.call_subprocess([bwa_fpath('bwa'), 'mem', '-t', str(qconfig.max_threads), ref_fpath, assembly_fpath],
                               stdout=open(sam_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                '-S', sam_fpath],
                               stdout=open(bam_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
    if not is_non_empty_file(bam_sorted_fpath):
        # keep only mapped records, then coordinate-sort
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                '-F', 'not unmapped', bam_fpath],
                               stdout=open(bam_mapped_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
    cov_fpath = join(output_dir, basename(assembly_fpath) + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    ref_name = qutils.name_from_fpath(ref_fpath)
    correct_chr_names = get_correct_names_for_chroms(output_dir, ref_fpath, sam_fpath, err_fpath, assembly_fpath, logger)
    # create_cov_files=False: only the uncovered-regions file is needed here
    get_coverage(output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_fpath, err_fpath, correct_chr_names,
                 cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return uncovered_fpath
def run_processing_reads(contigs_fpaths, main_ref_fpath, meta_ref_fpaths, ref_labels, temp_output_dir, output_dir, log_path, err_fpath):
    """Align reads to assemblies (and reference), then derive SV BED and coverage files.

    Returns (bed_fpath, cov_fpath, physical_cov_fpath); each may be None when
    the corresponding analysis was skipped or failed. Heavy side effects:
    mutates qconfig.sam_fpaths/bam_fpaths/reference_sam/reference_bam and the
    module-level ref_sam_fpaths mapping.
    """
    required_files = []
    bed_fpath, cov_fpath, physical_cov_fpath = None, None, None
    if main_ref_fpath:
        ref_name = qutils.name_from_fpath(main_ref_fpath)
        bed_fpath = qconfig.bed or join(output_dir, ref_name + '.bed')
        cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
        physical_cov_fpath = qconfig.phys_cov_fpath or join(output_dir, ref_name + '.physical.cov')
        required_files = [bed_fpath, cov_fpath, physical_cov_fpath]

        # decide up front whether SV search and coverage calculation are needed
        if qconfig.no_sv:
            logger.info('  Will not search Structural Variations (--fast or --no-sv is specified)')
            bed_fpath = None
        elif is_non_empty_file(bed_fpath):
            logger.info('  Using existing BED-file: ' + bed_fpath)
        elif not qconfig.forward_reads and not qconfig.interlaced_reads:
            if not qconfig.reference_sam and not qconfig.reference_bam:
                logger.info('  Will not search Structural Variations (needs paired-end reads)')
                bed_fpath = None
                qconfig.no_sv = True
        if qconfig.create_icarus_html:
            if is_non_empty_file(cov_fpath):
                is_correct_file = check_cov_file(cov_fpath)
                if is_correct_file:
                    logger.info('  Using existing reads coverage file: ' + cov_fpath)
            if is_non_empty_file(physical_cov_fpath):
                logger.info('  Using existing physical coverage file: ' + physical_cov_fpath)
        else:
            logger.info('  Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
            cov_fpath = None
            physical_cov_fpath = None
        # everything already exists (or is disabled) -> nothing to (re)build
        if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
                (not qconfig.create_icarus_html or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
            required_files = []

    # one alignment job per assembly, plus one for the reference itself
    n_jobs = min(qconfig.max_threads, len(contigs_fpaths) + 1)
    max_threads_per_job = max(1, qconfig.max_threads // n_jobs)
    sam_fpaths = qconfig.sam_fpaths or [None] * len(contigs_fpaths)
    bam_fpaths = qconfig.bam_fpaths or [None] * len(contigs_fpaths)
    parallel_align_args = [(contigs_fpath, output_dir, temp_output_dir, log_path, err_fpath, max_threads_per_job,
                            sam_fpaths[index], bam_fpaths[index], index)
                           for index, contigs_fpath in enumerate(contigs_fpaths)]
    if main_ref_fpath:
        # the reference job carries extra args: required_files and is_reference=True
        parallel_align_args.append((main_ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                    max_threads_per_job, qconfig.reference_sam, qconfig.reference_bam, None, required_files, True))
    correct_chr_names, sam_fpaths, bam_fpaths = run_parallel(align_single_file, parallel_align_args, n_jobs)
    qconfig.sam_fpaths = sam_fpaths[:len(contigs_fpaths)]
    qconfig.bam_fpaths = bam_fpaths[:len(contigs_fpaths)]
    add_statistics_to_report(output_dir, contigs_fpaths, main_ref_fpath)
    save_reads(output_dir)
    if not main_ref_fpath:
        return None, None, None

    # the reference job was appended last, so its results are at index -1
    correct_chr_names = correct_chr_names[-1]
    sam_fpath, bam_fpath = sam_fpaths[-1], bam_fpaths[-1]
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not required_files:
        return bed_fpath, cov_fpath, physical_cov_fpath
    if not all([sam_fpath, bam_fpath]):
        logger.info('  Failed searching structural variations.')
        return None, None, None
    sam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(sam_fpath, 'sorted'))
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))
    if is_non_empty_file(sam_sorted_fpath):
        logger.info('  Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        if not is_non_empty_file(bam_sorted_fpath):
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger, filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        sambamba_view(bam_sorted_fpath, sam_sorted_fpath, qconfig.max_threads, err_fpath, logger)
    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(temp_output_dir, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                                                     log_path, err_fpath, correct_chr_names, cov_fpath, physical_cov_fpath)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info('  Splitting SAM-file by references...')
        # collect the SAM header lines and per-sequence lengths
        headers = []
        seq_lengths = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_lengths[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        ref_files = {}
        if meta_ref_fpaths:
            # prepare one per-reference SAM file with a filtered @SQ header;
            # None marks references whose split file already exists
            global ref_sam_fpaths
            for cur_ref_fpath in meta_ref_fpaths:
                cur_ref_name = qutils.name_from_fpath(cur_ref_fpath)
                ref_sam_fpath = join(temp_output_dir, cur_ref_name + '.sam')
                ref_sam_fpaths[cur_ref_fpath] = ref_sam_fpath
                if is_non_empty_file(ref_sam_fpath):
                    logger.info('  Using existing split SAM-file for %s: %s' % (cur_ref_name, ref_sam_fpath))
                    ref_files[cur_ref_name] = None
                else:
                    ref_sam_file = open(ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        ref_sam_file.write(headers[0] + '\n')
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == cur_ref_name:
                            ref_sam_file.write(h + '\n')
                    ref_sam_file.write(headers[-1] + '\n')
                    ref_files[cur_ref_name] = ref_sam_file
                    need_ref_splitting = True

        trivial_deletions_fpath = \
            search_trivial_deletions(temp_output_dir, sam_sorted_fpath, ref_files, ref_labels, seq_lengths, need_ref_splitting)
        if get_gridss_fpath() and isfile(get_gridss_fpath()):
            # NOTE(review): bare 'except: pass' — GRIDSS is treated as strictly
            # best-effort, but this also hides unexpected errors
            try:
                gridss_sv_fpath = search_sv_with_gridss(main_ref_fpath, bam_mapped_fpath, meta_ref_fpaths, temp_output_dir, err_fpath)
                qutils.cat_files([gridss_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except:
                pass
        if isfile(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info('  Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info('  No structural variations were found.')
            else:
                logger.main_info('  Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info('  Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info('  Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
def align_single_file(fpath, main_output_dir, output_dirpath, log_path, err_fpath, max_threads, sam_fpath=None, bam_fpath=None,
                      index=None, required_files=None, is_reference=False, alignment_only=False, using_reads='all'):
    """Align reads to one FASTA (assembly or reference) and post-process the result.

    Reuses existing SAM/BAM/statistics files whenever they are usable.
    Returns (correct_chr_names, sam_fpath, bam_fpath), or (None, None, None)
    on failure. Side effects: runs external tools, changes and restores the
    working directory during the BWA step, and appends to required_files.
    """
    filename = qutils.name_from_fpath(fpath)
    # resolve the SAM/BAM pair: derive the missing one from the given one
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or join(output_dirpath, filename + '.sam')
        bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    if using_reads != 'all':
        # a subset of read types gets its own file pair
        sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.sam')
        bam_fpath = sam_fpath.replace('.sam', '.bam')
    if alignment_only or (is_reference and required_files and any(f.endswith('bed') for f in required_files)):
        required_files.append(sam_fpath)

    stats_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.stat')
    index_str = qutils.index_to_str(index) if index is not None else ''
    reads_fpaths = qconfig.reads_fpaths
    correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    can_reuse = correct_chr_names is not None
    if not can_reuse and not reads_fpaths:
        return None, None, None
    # fast path: everything required already exists on disk
    if correct_chr_names and (not required_files or all(isfile(fpath) for fpath in required_files)):
        if not alignment_only:
            if isfile(stats_fpath):
                logger.info('  ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
            elif isfile(bam_fpath):
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                                       stdout=open(stats_fpath, 'w'), stderr=open(err_fpath, 'a'))
                analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
            calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename, err_fpath)
        if isfile(stats_fpath) or alignment_only:
            return correct_chr_names, sam_fpath, bam_fpath

    logger.info('  ' + index_str + 'Pre-processing reads...')
    # try to reuse an existing SAM or BAM before falling back to a fresh alignment
    if is_non_empty_file(sam_fpath) and can_reuse:
        logger.info('  ' + index_str + 'Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif is_non_empty_file(bam_fpath) and can_reuse:
        logger.info('  ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
        sambamba_view(bam_fpath, sam_fpath, qconfig.max_threads, err_fpath, logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    if (not correct_chr_names or not is_non_empty_file(sam_fpath)) and reads_fpaths:
        if is_reference:
            logger.info('  Running BWA for reference...')
        else:
            logger.info('  ' + index_str + 'Running BWA...')
        # use absolute paths because we will change workdir
        fpath = abspath(fpath)
        sam_fpath = abspath(sam_fpath)
        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        bwa_index(fpath, err_fpath, logger)
        sam_fpaths = align_reads(fpath, sam_fpath, using_reads, main_output_dir, err_fpath, max_threads)
        if len(sam_fpaths) > 1:
            merge_sam_files(sam_fpaths, sam_fpath, bam_fpath, main_output_dir, max_threads, err_fpath)
        elif len(sam_fpaths) == 1:
            shutil.move(sam_fpaths[0], sam_fpath)
            sambamba_view(sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

        logger.info('  ' + index_str + 'Done.')
        os.chdir(prev_dir)
        if not is_non_empty_file(sam_fpath):
            logger.error('  Failed running BWA for ' + fpath + '. See ' + log_path + ' for information.')
            return None, None, None
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif not correct_chr_names or not is_non_empty_file(sam_fpath):
        return None, None, None
    if is_reference:
        logger.info('  Sorting SAM-file for reference...')
    else:
        logger.info('  ' + index_str + 'Sorting SAM-file...')
    if can_reuse and is_non_empty_file(bam_fpath) and all_read_names_correct(sam_fpath):
        logger.info('  ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
    else:
        correct_sam_fpath = join(output_dirpath, filename + '.correct.sam')  # write in output dir
        sam_fpath = clean_read_names(sam_fpath, correct_sam_fpath)
        sambamba_view(correct_sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

    qutils.assert_file_exists(bam_fpath, 'bam file')
    if not alignment_only:
        if isfile(stats_fpath):
            logger.info('  ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
        elif isfile(bam_fpath):
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                                   stdout=open(stats_fpath, 'w'), stderr=open(err_fpath, 'a'))
            analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename, err_fpath)
        if is_reference:
            logger.info('  Analysis for reference is finished.')
        else:
            logger.info('  ' + index_str + 'Analysis is finished.')
    return correct_chr_names, sam_fpath, bam_fpath
def save_result(result, report, fname, ref_fpath, genome_size):
    """Transfer the per-assembly alignment analysis results into a reporting object.

    Unpacks the `result` dict produced by the contig-alignment analysis and
    writes each metric into `report` via `add_field`.  Handles three report
    sections: misassemblies, unaligned contigs, and (for combined references)
    per-reference subreports.

    Args:
        result: dict of analysis outputs (misassembly lists/counters, unaligned
            stats, SNP/indel data, aligned-base totals).
        report: reporting object accepting add_field/get_field calls.
        fname: assembly file name/path used to look up subreports and
            broken-scaffold bookkeeping.
        ref_fpath: reference path; used to resolve the reference name in the
            intergenomic-misassemblies branch.
        genome_size: total reference length, denominator for genome fraction.

    Returns:
        The same `report` object, populated.
    """
    # --- unpack the analysis result dict into locals ---
    region_misassemblies = result['region_misassemblies']
    misassemblies_by_ref = result['misassemblies_by_ref']
    misassembled_contigs = result['misassembled_contigs']
    misassembled_bases = result['misassembled_bases']
    misassembly_internal_overlap = result['misassembly_internal_overlap']
    unaligned = result['unaligned']
    partially_unaligned = result['partially_unaligned']
    partially_unaligned_bases = result['partially_unaligned_bases']
    fully_unaligned_bases = result['fully_unaligned_bases']
    ambiguous_contigs = result['ambiguous_contigs']
    ambiguous_contigs_extra_bases = result['ambiguous_contigs_extra_bases']
    SNPs = result['SNPs']
    indels_list = result['indels_list']
    aligned_ref_bases = result['aligned_ref_bases']
    aligned_assembly_bases = result['aligned_assembly_bases']
    half_unaligned_with_misassembly = result['half_unaligned_with_misassembly']

    # --- headline misassembly counters ---
    report.add_field(reporting.Fields.MISLOCAL, region_misassemblies.count(Misassembly.LOCAL))
    # "extensive" misassemblies = relocations + inversions + (interspecies) translocations
    report.add_field(
        reporting.Fields.MISASSEMBL,
        region_misassemblies.count(Misassembly.RELOCATION) +
        region_misassemblies.count(Misassembly.INVERSION) +
        region_misassemblies.count(Misassembly.TRANSLOCATION) +
        region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.MISCONTIGS, len(misassembled_contigs))
    report.add_field(reporting.Fields.MISCONTIGSBASES, misassembled_bases)
    report.add_field(reporting.Fields.MISINTERNALOVERLAP, misassembly_internal_overlap)
    if qconfig.bed:
        # structural variations are only detected when a BED file was provided
        report.add_field(reporting.Fields.STRUCT_VARIATIONS, region_misassemblies.count(Misassembly.MATCHED_SV))
    if qconfig.large_genome:
        report.add_field(reporting.Fields.POTENTIAL_MGE, region_misassemblies.count(Misassembly.POTENTIAL_MGE))

    # --- unaligned / ambiguous contig stats ---
    report.add_field(reporting.Fields.UNALIGNED, '%d + %d part' % (unaligned, partially_unaligned))
    report.add_field(reporting.Fields.UNALIGNEDBASES, (fully_unaligned_bases + partially_unaligned_bases))
    report.add_field(reporting.Fields.AMBIGUOUS, ambiguous_contigs)
    report.add_field(reporting.Fields.AMBIGUOUSEXTRABASES, ambiguous_contigs_extra_bases)
    report.add_field(reporting.Fields.MISMATCHES, SNPs)
    # different types of indels:
    if indels_list is not None:
        report.add_field(reporting.Fields.INDELS, len(indels_list))
        report.add_field(reporting.Fields.INDELSBASES, sum(indels_list))
        report.add_field(
            reporting.Fields.MIS_SHORT_INDELS,
            len([i for i in indels_list if i <= qconfig.SHORT_INDEL_THRESHOLD]))
        report.add_field(
            reporting.Fields.MIS_LONG_INDELS,
            len([i for i in indels_list if i > qconfig.SHORT_INDEL_THRESHOLD]))

    if aligned_ref_bases:
        # genome fraction / duplication ratio / per-100-kbp error rates
        genome_fraction = aligned_ref_bases * 100.0 / genome_size
        duplication_ratio = float(
            aligned_assembly_bases +
            misassembly_internal_overlap +
            ambiguous_contigs_extra_bases) / aligned_ref_bases
        report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
        report.add_field(
            reporting.Fields.SUBSERROR,
            "%.2f" % (float(SNPs) * 100000.0 / float(aligned_assembly_bases)))
        report.add_field(
            reporting.Fields.INDELSERROR,
            "%.2f" % (float(report.get_field(reporting.Fields.INDELS)) * 100000.0 / float(aligned_assembly_bases)))

    # for misassemblies report:
    report.add_field(
        reporting.Fields.MIS_ALL_EXTENSIVE,
        region_misassemblies.count(Misassembly.RELOCATION) +
        region_misassemblies.count(Misassembly.INVERSION) +
        region_misassemblies.count(Misassembly.TRANSLOCATION) +
        region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.MIS_RELOCATION, region_misassemblies.count(Misassembly.RELOCATION))
    report.add_field(reporting.Fields.MIS_TRANSLOCATION, region_misassemblies.count(Misassembly.TRANSLOCATION))
    report.add_field(reporting.Fields.MIS_INVERTION, region_misassemblies.count(Misassembly.INVERSION))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_CONTIGS, len(misassembled_contigs))
    report.add_field(reporting.Fields.MIS_EXTENSIVE_BASES, misassembled_bases)
    report.add_field(reporting.Fields.MIS_LOCAL, region_misassemblies.count(Misassembly.LOCAL))
    # special case for separating contig and scaffold misassemblies
    report.add_field(
        reporting.Fields.SCF_MIS_ALL_EXTENSIVE,
        region_misassemblies.count(Misassembly.SCF_RELOCATION) +
        region_misassemblies.count(Misassembly.SCF_INVERSION) +
        region_misassemblies.count(Misassembly.SCF_TRANSLOCATION) +
        region_misassemblies.count(Misassembly.SCF_INTERSPECTRANSLOCATION))
    report.add_field(reporting.Fields.SCF_MIS_RELOCATION, region_misassemblies.count(Misassembly.SCF_RELOCATION))
    report.add_field(reporting.Fields.SCF_MIS_TRANSLOCATION, region_misassemblies.count(Misassembly.SCF_TRANSLOCATION))
    report.add_field(reporting.Fields.SCF_MIS_INVERTION, region_misassemblies.count(Misassembly.SCF_INVERSION))
    # contig-level counts are scaffold totals subtracted from overall totals
    report.add_field(
        reporting.Fields.CTG_MIS_ALL_EXTENSIVE,
        report.get_field(reporting.Fields.MIS_ALL_EXTENSIVE) -
        report.get_field(reporting.Fields.SCF_MIS_ALL_EXTENSIVE))
    report.add_field(
        reporting.Fields.CTG_MIS_RELOCATION,
        region_misassemblies.count(Misassembly.RELOCATION) -
        region_misassemblies.count(Misassembly.SCF_RELOCATION))
    report.add_field(
        reporting.Fields.CTG_MIS_TRANSLOCATION,
        region_misassemblies.count(Misassembly.TRANSLOCATION) -
        region_misassemblies.count(Misassembly.SCF_TRANSLOCATION))
    report.add_field(
        reporting.Fields.CTG_MIS_INVERTION,
        region_misassemblies.count(Misassembly.INVERSION) -
        region_misassemblies.count(Misassembly.SCF_INVERSION))

    if qconfig.is_combined_ref:
        # combined-reference run: also fill interspecies-translocation fields
        # and one subreport per individual reference
        report.add_field(
            reporting.Fields.MIS_ISTRANSLOCATIONS,
            region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(
            reporting.Fields.SCF_MIS_ISTRANSLOCATIONS,
            region_misassemblies.count(Misassembly.SCF_INTERSPECTRANSLOCATION))
        report.add_field(
            reporting.Fields.CTG_MIS_ISTRANSLOCATIONS,
            region_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION) -
            region_misassemblies.count(Misassembly.SCF_INTERSPECTRANSLOCATION))
        report.add_field(
            reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS,
            region_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
        report.add_field(
            reporting.Fields.POSSIBLE_MISASSEMBLIES,
            region_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        all_references = sorted(
            list(set([ref for ref in ref_labels_by_chromosomes.values()])))
        for ref_name in all_references:
            # per-reference subreport with the same misassembly breakdown
            subreport = reporting.get(fname, ref_name=ref_name)
            ref_misassemblies = misassemblies_by_ref[ref_name]
            subreport.add_field(
                reporting.Fields.MIS_ALL_EXTENSIVE,
                ref_misassemblies.count(Misassembly.RELOCATION) +
                ref_misassemblies.count(Misassembly.INVERSION) +
                ref_misassemblies.count(Misassembly.TRANSLOCATION) +
                ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
            subreport.add_field(
                reporting.Fields.MIS_RELOCATION,
                ref_misassemblies.count(Misassembly.RELOCATION))
            subreport.add_field(
                reporting.Fields.MIS_TRANSLOCATION,
                ref_misassemblies.count(Misassembly.TRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_INVERTION,
                                ref_misassemblies.count(Misassembly.INVERSION))
            subreport.add_field(
                reporting.Fields.MIS_ISTRANSLOCATIONS,
                ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
            subreport.add_field(reporting.Fields.MIS_LOCAL,
                                ref_misassemblies.count(Misassembly.LOCAL))
            subreport.add_field(
                reporting.Fields.POSSIBLE_MISASSEMBLIES,
                ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
            subreport.add_field(
                reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS,
                ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
            if fname not in qconfig.dict_of_broken_scaffolds:
                # scaffold-gap fields are meaningless for artificially broken scaffolds
                subreport.add_field(
                    reporting.Fields.MIS_SCAFFOLDS_GAP,
                    ref_misassemblies.count(Misassembly.SCAFFOLD_GAP))
                subreport.add_field(
                    reporting.Fields.MIS_LOCAL_SCAFFOLDS_GAP,
                    ref_misassemblies.count(Misassembly.LOCAL_SCAFFOLD_GAP))
            if qconfig.check_for_fragmented_ref:
                subreport.add_field(
                    reporting.Fields.MIS_FRAGMENTED,
                    ref_misassemblies.count(Misassembly.FRAGMENTED))
    elif intergenomic_misassemblies_by_asm:
        # single-reference run inside a meta analysis: pull precomputed
        # intergenomic misassemblies for this assembly/reference pair
        label = qutils.label_from_fpath(fname)
        ref_name = qutils.name_from_fpath(ref_fpath)
        ref_misassemblies = intergenomic_misassemblies_by_asm[label][ref_name]
        report.add_field(
            reporting.Fields.MIS_ISTRANSLOCATIONS,
            ref_misassemblies.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(
            reporting.Fields.POSSIBLE_MISASSEMBLIES,
            ref_misassemblies.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        report.add_field(
            reporting.Fields.CONTIGS_WITH_ISTRANSLOCATIONS,
            ref_misassemblies.count(Misassembly.POTENTIALLY_MIS_CONTIGS))
    if fname not in qconfig.dict_of_broken_scaffolds:
        report.add_field(reporting.Fields.MIS_SCAFFOLDS_GAP,
                         region_misassemblies.count(Misassembly.SCAFFOLD_GAP))
        report.add_field(
            reporting.Fields.MIS_LOCAL_SCAFFOLDS_GAP,
            region_misassemblies.count(Misassembly.LOCAL_SCAFFOLD_GAP))
    if qconfig.check_for_fragmented_ref:
        report.add_field(reporting.Fields.MIS_FRAGMENTED,
                         region_misassemblies.count(Misassembly.FRAGMENTED))
    # for unaligned report:
    report.add_field(reporting.Fields.UNALIGNED_FULL_CNTGS, unaligned)
    report.add_field(reporting.Fields.UNALIGNED_FULL_LENGTH, fully_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_PART_CNTGS, partially_unaligned)
    report.add_field(reporting.Fields.UNALIGNED_PART_LENGTH, partially_unaligned_bases)
    report.add_field(reporting.Fields.UNALIGNED_MISASSEMBLED_CTGS, half_unaligned_with_misassembly)
    return report
def correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=False):
    """Validate and rewrite each reference FASTA, and build one combined reference.

    Each sequence is renamed to '<ref_name>_<seq_name>' (deduplicated within a
    reference), checked for correctness unless qconfig.no_check is set, written
    to a per-reference corrected FASTA, and appended to the combined reference.
    Bad references are either skipped (when `downloaded_refs` is True, with
    bookkeeping rolled back) or cause a fatal error.

    Note: `ref_fpaths` is modified in place — skipped references are removed.

    Args:
        ref_fpaths: list of reference FASTA paths (mutated in place).
        corrected_dirpath: directory for corrected FASTA output.
        downloaded_refs: if True, bad references are skipped instead of fatal.

    Returns:
        (corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths)
        where chromosomes_by_refs maps reference name -> [(seq_name, length), ...].
    """
    corrected_ref_fpaths = []

    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)

    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # Write one (possibly corrected) sequence into this reference's
        # corrected FASTA and register it in the chromosome bookkeeping.
        seq_fname = ref_name
        seq_fname += ref_fasta_ext

        if total_references > 1:
            # subsequent sequences of the same reference go to the same file
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name

        if not qconfig.no_check:
            corr_seq = correct_seq(seq, ref_fpath)
            if not corr_seq:
                # invalid sequence -> signal the caller to abandon this reference
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    # NOTE(review): a previous hand-rolled loop computing ref_names via
    # splitext_for_fasta_file was dead code — process_labels is authoritative.
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            # truncate so that '<ref_name>_<seq_name>' fits the name limit
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(uniq_seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath) + '')
            fastaparser.write_fasta(combined_ref_fpath, fastaparser.read_fasta(corr_seq_fpath), 'a')
        elif downloaded_refs:
            logger.warning('Skipping ' + ref_fpath + ' because it'
                           ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!')
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            corrected_ref_fpaths.pop()
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error('Reference file ' + ref_fpath +
                         ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                         exit_with_code=1)

    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if len(chromosomes_by_refs) > 0:
        logger.main_info(' All references were combined in ' + qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Compute and report NAx/NGAx/LAx/LGAx statistics for aligned contigs.

    For each assembly, NAx/LAx use the assembly's own total length as the
    denominator, while NGAx/LGAx use the total reference length (skipped for
    combined references, where a single reference length is meaningless).
    Also saves assembly lengths for JSON/HTML reports and draws cumulative
    and NAx/NGAx plots when plotting is enabled.

    Args:
        ref_fpath: reference FASTA path.
        aligned_contigs_fpaths: assembly FASTA paths that had alignments.
        output_dirpath: main output directory (for HTML saving).
        json_output_dirpath: JSON output directory or falsy to skip JSON.
        aligned_lengths_lists: per-assembly lists of aligned-block lengths,
            parallel to aligned_contigs_fpaths.
        aligned_stats_dirpath: directory for the aligned-stats plots.

    Returns:
        report_dict: dict with a 'header' key plus one (empty) entry per
        assembly name.
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    for contigs_fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(contigs_fpath)] = []

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    ref_chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath)
    reference_length = sum(ref_chr_lengths.values())
    assembly_lengths = []
    for contigs_fpath in aligned_contigs_fpaths:
        assembly_lengths.append(sum(fastaparser.get_chr_lengths_from_fastafile(contigs_fpath).values()))

    for i, (contigs_fpath, lens, assembly_len) in enumerate(
            zip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)):
        # NAx/LAx: denominator is the assembly length
        na50 = N50.NG50(lens, assembly_len)
        na75 = N50.NG50(lens, assembly_len, 75)
        la50 = N50.LG50(lens, assembly_len)
        la75 = N50.LG50(lens, assembly_len, 75)
        if not qconfig.is_combined_ref:
            # NGAx/LGAx: denominator is the reference length; these locals are
            # only defined here, but every later use is guarded by the same
            # condition (short-circuit), so they are never read unbound.
            nga50 = N50.NG50(lens, reference_length)
            nga75 = N50.NG50(lens, reference_length, 75)
            lga50 = N50.LG50(lens, reference_length)
            lga75 = N50.LG50(lens, reference_length, 75)

        logger.info(' ' + qutils.index_to_str(i) + qutils.label_from_fpath(contigs_fpath) +
                    ', Largest alignment = ' + str(max(lens)) +
                    ', NA50 = ' + str(na50) +
                    (', NGA50 = ' + str(nga50) if not qconfig.is_combined_ref and nga50 else '') +
                    ', LA50 = ' + str(la50) +
                    (', LGA50 = ' + str(lga50) if not qconfig.is_combined_ref and lga50 else ''))
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        report.add_field(reporting.Fields.TOTAL_ALIGNED_LEN, sum(lens))
        report.add_field(reporting.Fields.NA50, na50)
        report.add_field(reporting.Fields.NA75, na75)
        report.add_field(reporting.Fields.LA50, la50)
        report.add_field(reporting.Fields.LA75, la75)
        if not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NGA50, nga50)
            report.add_field(reporting.Fields.NGA75, nga75)
            report.add_field(reporting.Fields.LGA50, lga50)
            report.add_field(reporting.Fields.LGA75, lga75)

    ########################################################################
    # largest number of aligned blocks across assemblies; used to decide
    # whether plots exceed qconfig.max_points
    num_contigs = max([len(aligned_lengths_lists[i]) for i in range(len(aligned_lengths_lists))])

    if json_output_dirpath:
        from quast_libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
                                'Cumulative length (aligned contigs)')

        # Drawing NAx and NGAx plots...
        plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths,
                        aligned_lengths_lists, aligned_stats_dirpath + '/NAx_plot', 'NAx',
                        assembly_lengths, json_output_dir=json_output_dirpath)
        if not qconfig.is_combined_ref:
            plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths,
                            aligned_lengths_lists, aligned_stats_dirpath + '/NGAx_plot', 'NGAx',
                            [reference_length for i in range(len(aligned_contigs_fpaths))],
                            json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
def main(args):
    """MetaQUAST entry point: orchestrate the full multi-reference pipeline.

    Pipeline stages (in order):
      1. Parse options, set up output/corrected directories.
      2. Correct provided references (or search/download them from SILVA/NCBI
         when none are given); fall back to regular QUAST if still none.
      3. Optional reads analysis to produce SV BED / coverage files.
      4. Run quast.py once against the combined reference.
      5. For downloaded references, drop those with low genome fraction and
         rerun against the filtered combined reference.
      6. Partition contigs per reference and run quast.py per reference,
         plus once for contigs aligned nowhere.
      7. Summarize all runs (summary tables, HTML report, Icarus viewer).

    Args:
        args: command-line arguments (without the program name).

    Returns:
        Exit status from logger.finish_up, or 4 when no correct contigs exist.
    """
    check_dirpath(qconfig.QUAST_HOME, 'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '.\n' +
                  'Please, put QUAST in a different directory, then try again.\n', exit_code=3)

    if not args:
        qconfig.usage(meta=True)
        sys.exit(0)

    metaquast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, metaquast_path + args, is_metaquast=True)
    output_dirpath, ref_fpaths, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    html_report = qconfig.html_report
    test_mode = qconfig.test

    # Directories
    output_dirpath, _, _ = qutils.set_up_output_dir(
        output_dirpath, None, not output_dirpath,
        save_json=False)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    qconfig.set_max_threads(logger)
    qutils.logger = logger

    ########################################################################

    from quast_libs import reporting
    # reload so repeated in-process runs start from a clean reporting state
    # (imp.reload on py3, builtin reload on py2)
    try:
        import imp
        imp.reload(reporting)
    except:
        reload(reporting)
    from quast_libs import plotter

    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES
    if ref_fpaths:
        logger.main_info()
        logger.main_info('Reference(s):')

        corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
            correct_meta_references(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    qconfig.no_check_meta = True
    assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
    if not assemblies:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.")
        return 4

    # Running QUAST(s)
    quast_py_args += ['--meta']
    downloaded_refs = False

    # SEARCHING REFERENCES
    if not ref_fpaths:
        logger.main_info()
        if qconfig.max_references == 0:
            logger.notice("Maximum number of references (--max-ref-number) is set to 0, search in SILVA 16S rRNA database is disabled")
        else:
            if qconfig.references_txt:
                logger.main_info("List of references was provided, starting to download reference genomes from NCBI...")
            else:
                logger.main_info("No references are provided, starting to search for reference genomes in SILVA 16S rRNA database "
                                 "and to download them from NCBI...")
            downloaded_dirpath = os.path.join(output_dirpath, qconfig.downloaded_dirname)
            if not os.path.isdir(downloaded_dirpath):
                os.mkdir(downloaded_dirpath)
            corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
            ref_fpaths = search_references_meta.do(assemblies, labels, downloaded_dirpath, corrected_dirpath, qconfig.references_txt)
            if ref_fpaths:
                search_references_meta.is_quast_first_run = True
                if not qconfig.references_txt:
                    downloaded_refs = True
                logger.main_info()
                logger.main_info('Downloaded reference(s):')
                corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    correct_meta_references(ref_fpaths, corrected_dirpath)
            elif test_mode and not ref_fpaths:
                logger.error('Failed to download or setup SILVA 16S rRNA database for working without '
                             'references on metagenome datasets!', to_stderr=True, exit_with_code=4)

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.main_info()
        logger.notice('No references are provided, starting regular QUAST with MetaGeneMark gene finder')
        _start_quast_main(quast_py_args, assemblies=assemblies, output_dirpath=output_dirpath, run_regular_quast=True)
        exit(0)

    # Running combined reference
    combined_output_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name)

    reads_fpaths = []
    if qconfig.forward_reads:
        reads_fpaths.append(qconfig.forward_reads)
    if qconfig.reverse_reads:
        reads_fpaths.append(qconfig.reverse_reads)
    cov_fpath = qconfig.cov_fpath
    physical_cov_fpath = qconfig.phys_cov_fpath
    if (reads_fpaths or qconfig.sam or qconfig.bam) and ref_fpaths:
        # reads analysis yields SV BED + coverage files reused by sub-runs
        bed_fpath, cov_fpath, physical_cov_fpath = reads_analyzer.do(
            combined_ref_fpath, contigs_fpaths, reads_fpaths, corrected_ref_fpaths,
            os.path.join(combined_output_dirpath, qconfig.variation_dirname),
            external_logger=logger, sam_fpath=qconfig.sam, bam_fpath=qconfig.bam, bed_fpath=qconfig.bed)
        qconfig.bed = bed_fpath

    # forward derived artifacts to the child quast.py invocations
    if qconfig.bed:
        quast_py_args += ['--sv-bed']
        quast_py_args += [qconfig.bed]
    if cov_fpath:
        quast_py_args += ['--cov']
        quast_py_args += [cov_fpath]
    if physical_cov_fpath:
        quast_py_args += ['--phys-cov']
        quast_py_args += [physical_cov_fpath]
    if qconfig.sam:
        quast_py_args += ['--sam']
        quast_py_args += [qconfig.sam]
    if qconfig.bam:
        quast_py_args += ['--bam']
        quast_py_args += [qconfig.bam]

    quast_py_args += ['--combined-ref']
    if qconfig.draw_plots or qconfig.html_report:
        if plotter.dict_color_and_ls:
            # keep assembly colors/line styles consistent across all sub-runs
            colors_and_ls = [plotter.dict_color_and_ls[asm.label] for asm in assemblies]
            quast_py_args += ['--colors']
            quast_py_args += [','.join([style[0] for style in colors_and_ls])]
            quast_py_args += ['--ls']
            quast_py_args += [','.join([style[1] for style in colors_and_ls])]
    run_name = 'for the combined reference'
    logger.main_info()
    logger.main_info('Starting quast.py ' + run_name + '...')
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    # accumulated (notices, warnings, non-fatal errors) over all sub-runs
    total_num_notifications = (total_num_notices, total_num_warnings, total_num_nf_errors)
    if qconfig.html_report:
        from quast_libs.html_saver import json_saver
        json_texts = []
    else:
        json_texts = None
    if qconfig.unique_mapping:
        ambiguity_opts = []
    else:
        ambiguity_opts = ["--ambiguity-usage", 'all']
    return_code, total_num_notifications, assemblies, labels = \
        _start_quast_main(quast_py_args + ambiguity_opts,
                          assemblies=assemblies,
                          reference_fpath=combined_ref_fpath,
                          output_dirpath=combined_output_dirpath,
                          num_notifications_tuple=total_num_notifications,
                          is_first_run=True)
    if json_texts is not None:
        json_texts.append(json_saver.json_text)
    search_references_meta.is_quast_first_run = False

    genome_info_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name, 'genome_stats')
    genome_info_fpath = os.path.join(genome_info_dirpath, 'genome_info.txt')
    if not os.path.exists(genome_info_fpath):
        # combined run produced no genome stats: nothing aligned to anything
        logger.main_info('')
        if not downloaded_refs:
            msg = 'Try to restart MetaQUAST with another references.'
        else:
            msg = 'Try to use option --max-ref-number to change maximum number of references (per each assembly) to download.'
        logger.main_info('Failed aligning the contigs for all the references. ' + msg)
        logger.main_info('')
        cleanup(corrected_dirpath)
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if downloaded_refs:
        logger.main_info()
        logger.main_info('Excluding downloaded references with low genome fraction from further analysis..')
        corr_ref_fpaths = get_downloaded_refs_with_alignments(genome_info_fpath, ref_fpaths, chromosomes_by_refs)
        if corr_ref_fpaths and corr_ref_fpaths != ref_fpaths:
            # some references were filtered out: rebuild the combined
            # reference and redo the combined run from scratch
            logger.main_info()
            logger.main_info('Filtered reference(s):')
            os.remove(combined_ref_fpath)
            contigs_analyzer.ref_labels_by_chromosomes = {}
            corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names = \
                correct_meta_references(corr_ref_fpaths, corrected_dirpath)
            run_name = 'for the corrected combined reference'
            logger.main_info()
            logger.main_info('Starting quast.py ' + run_name + '...')
            return_code, total_num_notifications, assemblies, labels = \
                _start_quast_main(quast_py_args + ambiguity_opts,
                                  assemblies=assemblies,
                                  reference_fpath=combined_ref_fpath,
                                  output_dirpath=combined_output_dirpath,
                                  num_notifications_tuple=total_num_notifications,
                                  is_first_run=True)
            if json_texts is not None:
                # replace the stale combined-run JSON with the rerun's output
                json_texts = json_texts[:-1]
                json_texts.append(json_saver.json_text)
        elif corr_ref_fpaths == ref_fpaths:
            logger.main_info('All downloaded references have genome fraction more than 10%. Nothing was excluded.')
        else:
            logger.main_info('All downloaded references have low genome fraction. Nothing was excluded for now.')

    if qconfig.calculate_read_support:
        calculate_ave_read_support(combined_output_dirpath, assemblies)

    # per-reference runs use already-corrected inputs; strip options that
    # only make sense for the combined first pass
    for arg in args:
        if arg in ('-s', "--scaffolds"):
            quast_py_args.remove(arg)
    quast_py_args += ['--no-check-meta']
    qconfig.contig_thresholds = ','.join([str(threshold) for threshold in qconfig.contig_thresholds if threshold > qconfig.min_contig])
    if not qconfig.contig_thresholds:
        qconfig.contig_thresholds = 'None'
    quast_py_args = remove_from_quast_py_args(quast_py_args, '--contig-thresholds', qconfig.contig_thresholds)
    quast_py_args += ['--contig-thresholds']
    quast_py_args += [qconfig.contig_thresholds]
    quast_py_args.remove('--combined-ref')

    logger.main_info()
    logger.main_info('Partitioning contigs into bins aligned to each reference..')

    assemblies_by_reference, not_aligned_assemblies = partition_contigs(
        assemblies, corrected_ref_fpaths, corrected_dirpath,
        os.path.join(combined_output_dirpath, 'contigs_reports', 'alignments_%s.tsv'), labels)

    ref_names = []
    output_dirpath_per_ref = os.path.join(output_dirpath, qconfig.per_ref_dirname)
    for ref_fpath, ref_assemblies in assemblies_by_reference:
        ref_name = qutils.name_from_fpath(ref_fpath)
        logger.main_info('')
        if not ref_assemblies:
            logger.main_info('No contigs were aligned to the reference ' + ref_name + ', skipping..')
        else:
            ref_names.append(ref_name)
            run_name = 'for the contigs aligned to ' + ref_name
            logger.main_info('Starting quast.py ' + run_name)

            return_code, total_num_notifications = _start_quast_main(
                quast_py_args,
                assemblies=ref_assemblies,
                reference_fpath=ref_fpath,
                output_dirpath=os.path.join(output_dirpath_per_ref, ref_name),
                num_notifications_tuple=total_num_notifications)
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    # Finally running for the contigs that has not been aligned to any reference
    no_unaligned_contigs = True
    for assembly in not_aligned_assemblies:
        if os.path.isfile(assembly.fpath) and os.stat(assembly.fpath).st_size != 0:
            no_unaligned_contigs = False
            break

    run_name = 'for the contigs not aligned anywhere'
    logger.main_info()
    if no_unaligned_contigs:
        logger.main_info('Skipping quast.py ' + run_name + ' (everything is aligned!)')
    else:
        logger.main_info('Starting quast.py ' + run_name + '...')

        return_code, total_num_notifications = _start_quast_main(
            quast_py_args,
            assemblies=not_aligned_assemblies,
            output_dirpath=os.path.join(output_dirpath, qconfig.not_aligned_name),
            num_notifications_tuple=total_num_notifications)

        if return_code not in [0, 4]:
            logger.error('Error running quast.py for the contigs not aligned anywhere')
        elif return_code == 4:  # no unaligned contigs, i.e. everything aligned
            no_unaligned_contigs = True
        if not no_unaligned_contigs:
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    if ref_names:
        logger.print_timestamp()
        logger.main_info("Summarizing results...")

        summary_output_dirpath = os.path.join(output_dirpath, qconfig.meta_summary_dir)
        if not os.path.isdir(summary_output_dirpath):
            os.makedirs(summary_output_dirpath)
        if html_report and json_texts:
            from quast_libs.html_saver import html_saver
            html_summary_report_fpath = html_saver.init_meta_report(output_dirpath)
        else:
            html_summary_report_fpath = None
        from quast_libs import create_meta_summary
        metrics_for_plots = reporting.Fields.main_metrics
        misassembl_metrics = [reporting.Fields.MIS_RELOCATION, reporting.Fields.MIS_TRANSLOCATION,
                              reporting.Fields.MIS_INVERTION, reporting.Fields.MIS_ISTRANSLOCATIONS]
        if no_unaligned_contigs:
            full_ref_names = ref_names
        else:
            full_ref_names = ref_names + [qconfig.not_aligned_name]
        create_meta_summary.do(html_summary_report_fpath, summary_output_dirpath, combined_output_dirpath,
                               output_dirpath_per_ref, metrics_for_plots, misassembl_metrics, full_ref_names)
        if html_report and json_texts:
            html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls, meta=True)
            if qconfig.create_icarus_html:
                icarus_html_fpath = html_saver.create_meta_icarus(output_dirpath, ref_names)
                logger.main_info(' Icarus (contig browser) is saved to %s' % icarus_html_fpath)
            html_saver.create_meta_report(output_dirpath, json_texts)

    cleanup(corrected_dirpath)
    logger.main_info('')
    logger.main_info('MetaQUAST finished.')
    return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)
def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels, reads_fpaths, output_dirpath, res_path,
                         log_path, err_path, sam_fpath=None, bam_fpath=None, bed_fpath=None):
    """Map reads to the reference and produce SV (BED) and coverage files.

    Reuses existing SAM/BAM/BED/coverage files whenever they are present and
    usable; otherwise aligns the reads with BWA, converts/sorts/filters the
    alignment with sambamba, scans the sorted SAM for "trivial deletions"
    (long zero-covered fragments) and, if Manta is available, merges its SV
    calls into the final BED file.

    Parameters:
      main_ref_fpath     -- (combined) reference FASTA path.
      meta_ref_fpaths    -- per-reference FASTA paths (meta mode) or falsy;
                            when set, the SAM is additionally split per reference.
      ref_labels         -- dict mapping chromosome name -> reference name.
      reads_fpaths       -- read files; exactly two (forward/reverse) are
                            required for the BWA branch.
      output_dirpath     -- working dir for intermediate SAM/BAM files.
      res_path           -- dir receiving the final BED/coverage results.
      log_path, err_path -- log files appended to by subprocesses.
      sam_fpath, bam_fpath, bed_fpath -- optional pre-existing inputs.

    Returns:
      (bed_fpath, cov_fpath, physical_cov_fpath); each element may be None
      when the corresponding step was disabled, skipped, or failed.
    """
    ref_name = qutils.name_from_fpath(main_ref_fpath)
    # Derive SAM/BAM names from one another when only one of them was given.
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or os.path.join(output_dirpath, ref_name + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    sam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(sam_fpath, 'sorted'))
    bam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(bam_fpath, 'sorted'))
    bed_fpath = bed_fpath or os.path.join(res_path, ref_name + '.bed')
    cov_fpath = os.path.join(res_path, ref_name + '.cov')
    physical_cov_fpath = os.path.join(res_path, ref_name + '.physical.cov')
    # Decide which outputs are wanted at all, honoring qconfig switches.
    if qconfig.no_sv:
        logger.info(' Will not search Structural Variations (--fast or --no-sv is specified)')
        bed_fpath = None
    elif is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
    if qconfig.create_icarus_html:
        if is_non_empty_file(cov_fpath):
            is_correct_file = check_cov_file(cov_fpath)
            if is_correct_file:
                logger.info(' Using existing reads coverage file: ' + cov_fpath)
        if is_non_empty_file(physical_cov_fpath):
            logger.info(' Using existing physical coverage file: ' + physical_cov_fpath)
    else:
        logger.info(' Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
        cov_fpath = None
        physical_cov_fpath = None
    # Early exit: everything requested already exists.
    if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
            (not qconfig.create_icarus_html or
             (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
        return bed_fpath, cov_fpath, physical_cov_fpath

    logger.info(' ' + 'Pre-processing reads...')
    # correct_chr_names doubles as a "reusable alignment found" flag below.
    correct_chr_names = None
    if is_non_empty_file(sam_fpath):
        logger.info(' Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, main_ref_fpath,
                                                         sam_fpath, err_path, reads_fpaths)
    elif is_non_empty_file(bam_fpath):
        logger.info(' Using existing BAM-file: ' + bam_fpath)
        # Convert BAM back to SAM so the rest of the pipeline can parse it.
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_fpath],
                               stdout=open(sam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, main_ref_fpath,
                                                         sam_fpath, err_path, reads_fpaths)
    if not correct_chr_names and reads_fpaths:
        logger.info(' Running BWA...')
        # use absolute paths because we will change workdir
        sam_fpath = os.path.abspath(sam_fpath)
        abs_reads_fpaths = []
        for reads_fpath in reads_fpaths:
            abs_reads_fpaths.append(os.path.abspath(reads_fpath))

        # BWA branch requires exactly one forward + one reverse reads file.
        if len(abs_reads_fpaths) != 2:
            logger.error(' You should specify files with forward and reverse reads.')
            logger.info(' Failed searching structural variations.')
            return None, None, None

        if not qconfig.no_check:
            if not paired_reads_names_are_equal(reads_fpaths, logger):
                logger.error(' Read names are discordant, skipping reads analysis!')
                logger.info(' Failed searching structural variations.')
                return None, None, None

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        cmd = [bwa_fpath('bwa'), 'index', '-p', ref_name, main_ref_fpath]
        if os.path.getsize(main_ref_fpath) > 2 * 1024 ** 3:  # if reference size bigger than 2GB
            cmd += ['-a', 'bwtsw']
        qutils.call_subprocess(cmd, stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)

        cmd = bwa_fpath('bwa') + ' mem -t ' + str(qconfig.max_threads) + ' ' + ref_name + ' ' + \
              abs_reads_fpaths[0] + ' ' + abs_reads_fpaths[1]
        qutils.call_subprocess(shlex.split(cmd), stdout=open(sam_fpath, 'w'),
                               stderr=open(err_path, 'a'), logger=logger)
        logger.info(' Done.')
        os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error(' Failed running BWA for the reference. See ' + log_path + ' for information.')
            logger.info(' Failed searching structural variations.')
            return None, None, None
    elif not correct_chr_names:
        # No reusable alignment and no reads to (re)align with.
        logger.info(' Failed searching structural variations.')
        return None, None, None

    logger.info(' Sorting SAM-file...')
    if (is_non_empty_file(sam_sorted_fpath) and all_read_names_correct(sam_sorted_fpath)) and is_non_empty_file(bam_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        correct_sam_fpath = os.path.join(output_dirpath, ref_name + '.sam.correct')  # write in output dir
        clean_read_names(sam_fpath, correct_sam_fpath)
        bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
        # SAM -> BAM, dropping unmapped reads.
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                '-F', 'not unmapped', '-S', correct_sam_fpath],
                               stdout=open(bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), '-o', bam_sorted_fpath,
                                bam_fpath], stderr=open(err_path, 'a'), logger=logger)
        # Sorted BAM -> sorted SAM for the text-based deletion scan below.
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_sorted_fpath],
                               stdout=open(sam_sorted_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)

    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(output_dirpath, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                                                     log_path, err_path, cov_fpath, physical_cov_fpath, correct_chr_names)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info(' Splitting SAM-file by references...')
        # Parse SAM header: collect header lines and per-sequence lengths.
        headers = []
        seq_name_length = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_name_length[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        if meta_ref_fpaths:
            # Open one SAM file per reference (None = reuse existing split file).
            ref_files = {}
            for cur_ref_fpath in meta_ref_fpaths:
                ref = qutils.name_from_fpath(cur_ref_fpath)
                new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
                if is_non_empty_file(new_ref_sam_fpath):
                    logger.info(' Using existing split SAM-file for %s: %s' % (ref, new_ref_sam_fpath))
                    ref_files[ref] = None
                else:
                    new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        new_ref_sam_file.write(headers[0] + '\n')
                    chrs = []
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == ref:
                            new_ref_sam_file.write(h + '\n')
                            chrs.append(seq_name)
                    new_ref_sam_file.write(headers[-1] + '\n')
                    ref_files[ref] = new_ref_sam_file
                    need_ref_splitting = True
        deletions = []
        trivial_deletions_fpath = os.path.join(output_dirpath, qconfig.trivial_deletions_fname)
        logger.info(' Looking for trivial deletions (long zero-covered fragments)...')
        need_trivial_deletions = True
        if os.path.exists(trivial_deletions_fpath):
            need_trivial_deletions = False
            logger.info(' Using existing file: ' + trivial_deletions_fpath)
        if need_trivial_deletions or need_ref_splitting:
            # Single pass over the sorted SAM: a small state machine per
            # reference tracks a candidate deletion (cur_deletion) while
            # optionally also routing lines into the per-reference files.
            with open(sam_sorted_fpath) as sam_file:
                cur_deletion = None
                for line in sam_file:
                    mapping = Mapping.parse(line)
                    if mapping:
                        if mapping.ref == '*':
                            continue
                        # common case: continue current deletion (potential) on the same reference
                        if cur_deletion and cur_deletion.ref == mapping.ref:
                            if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                                # just passed 0-covered fragment
                                if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                    cur_deletion.set_next_bad(mapping)
                                    if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                        cur_deletion.set_next_good(mapping)
                                        if cur_deletion.is_valid():
                                            deletions.append(cur_deletion)
                                        cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                # continue region BEFORE 0-covered fragment
                                elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_prev_good(mapping)
                                else:
                                    cur_deletion.set_prev_bad(mapping)
                            else:  # previous mapping was in region AFTER 0-covered fragment
                                # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                                if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                    if cur_deletion.is_valid():  # add previous fragment's deletion if needed
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_bad(position=cur_deletion.next_bad_end)
                                # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                else:
                                    cur_deletion.set_next_bad_end(mapping)
                        # special case: just started or just switched to the next reference
                        else:
                            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                                cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                            cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                        if need_ref_splitting:
                            cur_ref = ref_labels[mapping.ref]
                            # Keep only pairs whose mate maps to the same reference.
                            if mapping.ref_next.strip() == '=' or cur_ref == ref_labels[mapping.ref_next]:
                                if ref_files[cur_ref] is not None:
                                    ref_files[cur_ref].write(line)
                # Flush the last pending deletion at end of file.
                if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                    cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                    if cur_deletion.is_valid():
                        deletions.append(cur_deletion)
            if need_ref_splitting:
                for ref_handler in ref_files.values():
                    if ref_handler is not None:
                        ref_handler.close()
        if need_trivial_deletions:
            logger.info(' Trivial deletions: %d found' % len(deletions))
            logger.info(' Saving to: ' + trivial_deletions_fpath)
            with open(trivial_deletions_fpath, 'w') as f:
                for deletion in deletions:
                    f.write(str(deletion) + '\n')
        if get_manta_fpath() and isfile(get_manta_fpath()):
            # Best-effort: any Manta failure falls back to trivial deletions only.
            try:
                manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path)
                qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except:
                pass
        if os.path.exists(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    # Final reporting of what was actually produced.
    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info(' Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info(' No structural variations were found.')
            else:
                logger.main_info(' Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(' Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info(' Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
def js_data_gen(assemblies, contigs_fpaths, chromosomes_length, output_dirpath, structures_by_labels,
                contigs_by_assemblies, ambiguity_alignments_by_labels=None, contig_names_by_refs=None, ref_fpath=None,
                stdout_pattern=None, features_data=None, cov_fpath=None, physical_cov_fpath=None, json_output_dir=None):
    """Generate the Icarus contig-browser HTML pages and their JS data.

    Builds per-reference alignment viewers, the contig-size viewer, and the
    Icarus main menu. Also updates the QUAST report (SIMILAR_CONTIGS /
    SIMILAR_MIS_BLOCKS fields) as a side effect while grouping alignments.

    NOTE(review): a second `js_data_gen` with an extra `gc_fpath` parameter
    appears later in this file and, if both are in the same module, shadows
    this one — confirm which variant is intended to be live.

    Returns the path of the generated Icarus main-menu HTML file.
    """
    chr_names = []
    if chromosomes_length and assemblies:
        chr_to_aligned_blocks = OrderedDict()
        chr_names = list(chromosomes_length.keys())
        # Group each assembly's alignments by reference chromosome and count
        # "similar" blocks for the report.
        for assembly in assemblies.assemblies:
            chr_to_aligned_blocks[assembly.label] = defaultdict(list)
            similar_correct = 0
            similar_misassembled = 0
            for align in assembly.alignments:
                chr_to_aligned_blocks[assembly.label][align.ref_name].append(align)
                if align.similar:
                    if align.misassembled:
                        similar_misassembled += 1
                    else:
                        similar_correct += 1
            report = reporting.get(assembly.fpath)
            report.add_field(reporting.Fields.SIMILAR_CONTIGS, similar_correct)
            report.add_field(reporting.Fields.SIMILAR_MIS_BLOCKS, similar_misassembled)

    main_menu_fpath = os.path.join(output_dirpath, qconfig.icarus_html_fname)
    output_all_files_dir_path = os.path.join(output_dirpath, qconfig.icarus_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)

    chr_full_names, contig_names_by_refs = group_references(chr_names, contig_names_by_refs,
                                                            chromosomes_length, ref_fpath)

    cov_data, not_covered, max_depth = parse_cov_fpath(cov_fpath, chr_names, chr_full_names, contig_names_by_refs)
    physical_cov_data, not_covered, physical_max_depth = parse_cov_fpath(physical_cov_fpath, chr_names,
                                                                         chr_full_names, contig_names_by_refs)

    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    nx_marks = [reporting.Fields.N50, reporting.Fields.N75, reporting.Fields.NG50, reporting.Fields.NG75]

    assemblies_data, assemblies_contig_size_data, assemblies_n50 = get_assemblies_data(
        contigs_fpaths, output_all_files_dir_path, stdout_pattern, nx_marks)

    ref_contigs_dict = {}
    chr_lengths_dict = {}

    # JS snippet mapping numeric ids back to chromosome names.
    ref_data = 'var references_by_id = {};\n'
    chr_names_by_id = dict((chrom, str(i)) for i, chrom in enumerate(chr_names))
    for chrom, i in chr_names_by_id.items():
        ref_data += 'references_by_id["' + str(i) + '"] = "' + chrom + '";\n'
    # Determine which chromosomes belong to each (possibly grouped) reference.
    for i, chr in enumerate(chr_full_names):
        if contig_names_by_refs:
            ref_contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
        elif len(chr_full_names) == 1:
            ref_contigs = chr_names
        else:
            ref_contigs = [chr]
        ref_contigs_dict[chr] = ref_contigs
        chr_lengths_dict[chr] = [0] + [chromosomes_length[contig] for contig in ref_contigs]

    num_misassemblies = defaultdict(int)
    aligned_bases_by_chr = defaultdict(list)
    aligned_assemblies = defaultdict(set)

    # Per-reference pass: emit one alignment-viewer page per reference.
    for i, chr in enumerate(chr_full_names):
        ref_contigs = ref_contigs_dict[chr]
        chr_lengths = chr_lengths_dict[chr]
        chr_size = sum([chromosomes_length[contig] for contig in ref_contigs])
        chr_sizes[chr] = chr_size
        num_contigs[chr] = len(ref_contigs)
        data_str = []
        data_str.append('var chromosomes_len = {};')
        for ref_contig in ref_contigs:
            l = chromosomes_length[ref_contig]
            data_str.append('chromosomes_len["' + ref_contig + '"] = ' + str(l) + ';')
            aligned_bases_by_chr[chr].extend(aligned_bases[ref_contig])
        cov_data_str = format_cov_data(cov_data, max_depth, chr, 'coverage_data', 'reads_max_depth') if cov_data else None
        physical_cov_data_str = format_cov_data(physical_cov_data, physical_max_depth, chr, 'physical_coverage_data', 'physical_max_depth') \
            if physical_cov_data else None

        alignment_viewer_fpath, ref_data_str, contigs_structure_str, additional_assemblies_data, ms_selectors, num_misassemblies[chr], aligned_assemblies[chr] = \
            prepare_alignment_data_for_one_ref(chr, chr_full_names, chr_names_by_id, ref_contigs, data_str, chr_to_aligned_blocks,
                                               structures_by_labels, contigs_by_assemblies,
                                               ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
                                               cov_data_str=cov_data_str, physical_cov_data_str=physical_cov_data_str,
                                               contig_names_by_refs=contig_names_by_refs,
                                               output_dir_path=output_all_files_dir_path)

        ref_name = qutils.name_from_fpath(ref_fpath)
        save_alignment_data_for_one_ref(chr, ref_contigs, ref_name, json_output_dir, alignment_viewer_fpath,
                                        ref_data_str, ms_selectors, ref_data=ref_data, features_data=features_data,
                                        assemblies_data=assemblies_data, contigs_structure_str=contigs_structure_str,
                                        additional_assemblies_data=additional_assemblies_data)

    contigs_sizes_str, too_many_contigs = get_contigs_data(contigs_by_assemblies, nx_marks, assemblies_n50,
                                                           structures_by_labels, contig_names_by_refs,
                                                           chr_names, chr_full_names)
    all_data = assemblies_data + assemblies_contig_size_data + contigs_sizes_str
    save_contig_size_html(output_all_files_dir_path, json_output_dir, too_many_contigs, all_data)

    icarus_links = defaultdict(list)
    if len(chr_full_names) > 1:
        chr_link = qconfig.icarus_html_fname
        icarus_links["links"].append(chr_link)
        icarus_links["links_names"].append(qconfig.icarus_link)

    main_menu_template_fpath = html_saver.get_real_path(qconfig.icarus_menu_template_fname)
    main_data_dict = dict()

    labels = [qconfig.assembly_labels_by_fpath[contigs_fpath] for contigs_fpath in contigs_fpaths]
    main_data_dict['assemblies'] = labels
    html_saver.save_icarus_data(json_output_dir, ', '.join(labels), 'assemblies')

    contig_size_browser_fpath = os.path.join(qconfig.icarus_dirname, qconfig.contig_size_viewer_fname)
    main_data_dict['contig_size_html'] = contig_size_browser_fpath
    html_saver.save_icarus_data(json_output_dir, contig_size_browser_fpath, 'contig_size_html')
    if not chr_names:
        icarus_links["links"].append(contig_size_browser_fpath)
        icarus_links["links_names"].append(qconfig.icarus_link)

    # Multi-reference (or combined-reference) mode: build a references table;
    # otherwise show the single reference directly in the menu.
    if chr_full_names and (len(chr_full_names) > 1 or qconfig.is_combined_ref):
        main_data_dict['table_references'] = {'references': []}
        num_aligned_assemblies = [len(aligned_assemblies[chr]) for chr in chr_full_names]
        # Show the per-reference assemblies column only when counts differ.
        is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
        if is_unaligned_asm_exists:
            main_data_dict['table_references']['th_assemblies'] = True
        for chr in sorted(chr_full_names):
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
                chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths, contig_names_by_refs,
                one_chromosome=len(chr_full_names) == 1)
            reference_dict = dict()
            reference_dict['chr_link'] = chr_link
            reference_dict['tooltip'] = tooltip
            reference_dict['chr_name'] = os.path.basename(chr_name)
            reference_dict['num_contigs'] = str(num_contigs[chr])
            reference_dict['chr_size'] = format_long_numbers(chr_size)
            if is_unaligned_asm_exists:
                reference_dict['num_assemblies'] = str(len(aligned_assemblies[chr]))
            reference_dict['chr_gf'] = '%.3f' % chr_genome
            reference_dict['num_misassemblies'] = str(num_misassemblies[chr])
            main_data_dict['table_references']['references'].append(reference_dict)
        html_saver.save_icarus_data(json_output_dir, main_data_dict['table_references'], 'table_references', as_text=False)
    else:
        if chr_full_names:
            chr = chr_full_names[0]
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
                chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths, contig_names_by_refs, one_chromosome=True)
            main_data_dict['one_reference'] = dict()
            main_data_dict['one_reference']['alignment_link'] = chr_link
            main_data_dict['one_reference']['ref_fpath'] = os.path.basename(ref_fpath)
            main_data_dict['one_reference']['ref_fragments'] = str(num_contigs[chr])
            main_data_dict['one_reference']['ref_size'] = format_long_numbers(chr_size)
            main_data_dict['one_reference']['ref_gf'] = '%.3f' % chr_genome
            main_data_dict['one_reference']['num_misassemblies'] = str(num_misassemblies[chr])
            icarus_links["links"].append(chr_link)
            icarus_links["links_names"].append(qconfig.icarus_link)
            html_saver.save_icarus_data(json_output_dir, main_data_dict['one_reference'], 'menu_reference', as_text=False)
    html_saver.save_icarus_html(main_menu_template_fpath, main_menu_fpath, main_data_dict)
    html_saver.save_icarus_links(output_dirpath, icarus_links)
    if json_output_dir:
        json_saver.save_icarus_links(json_output_dir, icarus_links)

    return main_menu_fpath
def js_data_gen(assemblies, contigs_fpaths, chromosomes_length, output_dirpath, structures_by_labels,
                contigs_by_assemblies, ambiguity_alignments_by_labels=None, contig_names_by_refs=None, ref_fpath=None,
                stdout_pattern=None, features_data=None, gc_fpath=None, cov_fpath=None, physical_cov_fpath=None,
                json_output_dir=None):
    """Generate the Icarus contig-browser HTML pages and their JS data.

    Variant with GC-content support (`gc_fpath`); here `parse_cov_fpath`
    returns 2-tuples and `format_cov_data` takes (chr, data, name, max, key).

    NOTE(review): this redefines the earlier `js_data_gen` in this file and
    shadows it if both live in the same module — confirm the duplication is
    intentional (e.g. versions kept side by side).

    Returns the path of the generated Icarus main-menu HTML file.
    """
    chr_names = []
    if chromosomes_length and assemblies:
        chr_to_aligned_blocks = OrderedDict()
        chr_names = list(chromosomes_length.keys())
        # Group each assembly's alignments by reference chromosome and count
        # "similar" blocks for the report.
        for assembly in assemblies.assemblies:
            chr_to_aligned_blocks[assembly.label] = defaultdict(list)
            similar_correct = 0
            similar_misassembled = 0

            for align in assembly.alignments:
                chr_to_aligned_blocks[assembly.label][align.ref_name].append(align)
                if align.similar:
                    if align.misassembled:
                        similar_misassembled += 1
                    else:
                        similar_correct += 1
            report = reporting.get(assembly.fpath)
            report.add_field(reporting.Fields.SIMILAR_CONTIGS, similar_correct)
            report.add_field(reporting.Fields.SIMILAR_MIS_BLOCKS, similar_misassembled)

    main_menu_fpath = os.path.join(output_dirpath, qconfig.icarus_html_fname)
    output_all_files_dir_path = os.path.join(output_dirpath, qconfig.icarus_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)

    chr_full_names, contig_names_by_refs = group_references(chr_names, contig_names_by_refs,
                                                            chromosomes_length, ref_fpath)

    cov_data, max_depth = parse_cov_fpath(cov_fpath, chr_names, chr_full_names, contig_names_by_refs)
    physical_cov_data, physical_max_depth = parse_cov_fpath(physical_cov_fpath, chr_names, chr_full_names,
                                                            contig_names_by_refs)
    gc_data, max_gc = parse_cov_fpath(gc_fpath, chr_names, chr_full_names, contig_names_by_refs)

    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    nx_marks = [reporting.Fields.N50, reporting.Fields.N75, reporting.Fields.NG50, reporting.Fields.NG75]

    assemblies_data, assemblies_contig_size_data, assemblies_n50 = get_assemblies_data(
        contigs_fpaths, output_all_files_dir_path, stdout_pattern, nx_marks)

    ref_contigs_dict = {}
    chr_lengths_dict = {}

    # JS snippet mapping numeric ids back to chromosome names.
    ref_data = 'var references_by_id = {};\n'
    chr_names_by_id = dict((chrom, str(i)) for i, chrom in enumerate(chr_names))
    for chrom, i in chr_names_by_id.items():
        ref_data += 'references_by_id["' + str(i) + '"] = "' + chrom + '";\n'
    # Determine which chromosomes belong to each (possibly grouped) reference.
    for i, chr in enumerate(chr_full_names):
        if contig_names_by_refs:
            ref_contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
        elif len(chr_full_names) == 1:
            ref_contigs = chr_names
        else:
            ref_contigs = [chr]
        ref_contigs_dict[chr] = ref_contigs
        chr_lengths_dict[chr] = [0] + [chromosomes_length[contig] for contig in ref_contigs]

    num_misassemblies = defaultdict(int)
    aligned_bases_by_chr = defaultdict(list)
    aligned_assemblies = defaultdict(set)

    # Per-reference pass: emit one alignment-viewer page per reference.
    for i, chr in enumerate(chr_full_names):
        ref_contigs = ref_contigs_dict[chr]
        chr_lengths = chr_lengths_dict[chr]
        chr_size = sum([chromosomes_length[contig] for contig in ref_contigs])
        chr_sizes[chr] = chr_size
        num_contigs[chr] = len(ref_contigs)
        data_str = []
        data_str.append('var chromosomes_len = {};')
        for ref_contig in ref_contigs:
            l = chromosomes_length[ref_contig]
            data_str.append('chromosomes_len["' + ref_contig + '"] = ' + str(l) + ';')
            aligned_bases_by_chr[chr].extend(aligned_bases[ref_contig])
        cov_data_str = format_cov_data(chr, cov_data, 'coverage_data', max_depth, 'reads_max_depth') if cov_data else None
        physical_cov_data_str = format_cov_data(chr, physical_cov_data, 'physical_coverage_data', physical_max_depth, 'physical_max_depth') \
            if physical_cov_data else None
        # GC percentage is bounded by 100, hence the fixed max value.
        gc_data_str = format_cov_data(chr, gc_data, 'gc_data', 100, 'max_gc') if gc_data else None

        alignment_viewer_fpath, ref_data_str, contigs_structure_str, additional_assemblies_data, ms_selectors, num_misassemblies[chr], aligned_assemblies[chr] = \
            prepare_alignment_data_for_one_ref(chr, chr_full_names, chr_names_by_id, ref_contigs, data_str, chr_to_aligned_blocks,
                                               structures_by_labels, contigs_by_assemblies,
                                               ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
                                               cov_data_str=cov_data_str, physical_cov_data_str=physical_cov_data_str,
                                               gc_data_str=gc_data_str,
                                               contig_names_by_refs=contig_names_by_refs,
                                               output_dir_path=output_all_files_dir_path)

        ref_name = qutils.name_from_fpath(ref_fpath)
        save_alignment_data_for_one_ref(chr, ref_contigs, ref_name, json_output_dir, alignment_viewer_fpath,
                                        ref_data_str, ms_selectors, ref_data=ref_data, features_data=features_data,
                                        assemblies_data=assemblies_data, contigs_structure_str=contigs_structure_str,
                                        additional_assemblies_data=additional_assemblies_data)

    contigs_sizes_str, too_many_contigs = get_contigs_data(contigs_by_assemblies, nx_marks, assemblies_n50,
                                                           structures_by_labels, contig_names_by_refs,
                                                           chr_names, chr_full_names)
    all_data = assemblies_data + assemblies_contig_size_data + contigs_sizes_str
    save_contig_size_html(output_all_files_dir_path, json_output_dir, too_many_contigs, all_data)

    icarus_links = defaultdict(list)
    if len(chr_full_names) > 1:
        chr_link = qconfig.icarus_html_fname
        icarus_links["links"].append(chr_link)
        icarus_links["links_names"].append(qconfig.icarus_link)

    main_menu_template_fpath = html_saver.get_real_path(qconfig.icarus_menu_template_fname)
    main_data_dict = dict()

    labels = [qconfig.assembly_labels_by_fpath[contigs_fpath] for contigs_fpath in contigs_fpaths]
    main_data_dict['assemblies'] = labels
    html_saver.save_icarus_data(json_output_dir, ', '.join(labels), 'assemblies')

    contig_size_browser_fpath = os.path.join(qconfig.icarus_dirname, qconfig.contig_size_viewer_fname)
    main_data_dict['contig_size_html'] = contig_size_browser_fpath
    html_saver.save_icarus_data(json_output_dir, contig_size_browser_fpath, 'contig_size_html')
    if not chr_names:
        icarus_links["links"].append(contig_size_browser_fpath)
        icarus_links["links_names"].append(qconfig.icarus_link)

    # Multi-reference (or combined-reference) mode: build a references table;
    # otherwise show the single reference directly in the menu.
    if chr_full_names and (len(chr_full_names) > 1 or qconfig.is_combined_ref):
        main_data_dict['table_references'] = {'references': []}
        num_aligned_assemblies = [len(aligned_assemblies[chr]) for chr in chr_full_names]
        # Show the per-reference assemblies column only when counts differ.
        is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
        if is_unaligned_asm_exists:
            main_data_dict['table_references']['th_assemblies'] = True
        for chr in sorted(chr_full_names, key=natural_sort):
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
                chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths, contig_names_by_refs,
                one_chromosome=len(chr_full_names) == 1)
            reference_dict = dict()
            reference_dict['chr_link'] = chr_link
            reference_dict['tooltip'] = tooltip
            reference_dict['chr_name'] = os.path.basename(chr_name)
            reference_dict['num_contigs'] = str(num_contigs[chr])
            reference_dict['chr_size'] = format_long_numbers(chr_size)
            if is_unaligned_asm_exists:
                reference_dict['num_assemblies'] = str(len(aligned_assemblies[chr]))
            reference_dict['chr_gf'] = '%.3f' % chr_genome
            reference_dict['num_misassemblies'] = str(num_misassemblies[chr])
            main_data_dict['table_references']['references'].append(reference_dict)
        html_saver.save_icarus_data(json_output_dir, main_data_dict['table_references'], 'table_references', as_text=False)
    else:
        if chr_full_names:
            chr = chr_full_names[0]
            chr_link, chr_name, chr_genome, chr_size, tooltip = get_info_by_chr(
                chr, aligned_bases_by_chr, chr_sizes, contigs_fpaths, contig_names_by_refs, one_chromosome=True)
            main_data_dict['one_reference'] = dict()
            main_data_dict['one_reference']['alignment_link'] = chr_link
            main_data_dict['one_reference']['ref_fpath'] = os.path.basename(ref_fpath)
            main_data_dict['one_reference']['ref_fragments'] = str(num_contigs[chr])
            main_data_dict['one_reference']['ref_size'] = format_long_numbers(chr_size)
            main_data_dict['one_reference']['ref_gf'] = '%.3f' % chr_genome
            main_data_dict['one_reference']['num_misassemblies'] = str(num_misassemblies[chr])
            icarus_links["links"].append(chr_link)
            icarus_links["links_names"].append(qconfig.icarus_link)
            html_saver.save_icarus_data(json_output_dir, main_data_dict['one_reference'], 'menu_reference', as_text=False)
    html_saver.save_icarus_html(main_menu_template_fpath, main_menu_fpath, main_data_dict)
    html_saver.save_icarus_links(output_dirpath, icarus_links)
    return main_menu_fpath
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      reference_chromosomes, ns_by_chromosomes, old_contigs_fpath, bed_fpath, threads=1):
    """Align one assembly to the reference and analyze the alignments.

    Runs the contig aligner, parses the resulting coords file, analyzes
    contigs (misassemblies) and coverage/SNPs, prints results into the
    per-assembly log, and — in combined-reference mode — writes per-reference
    alignment TSV and unique-contigs files.

    Parameters:
      is_cyclic             -- treat reference as circular (passed to analysis).
      index                 -- assembly index used only for log prefixes.
      contigs_fpath         -- corrected assembly FASTA path.
      output_dirpath        -- dir for reports/logs/auxiliary files.
      ref_fpath             -- reference FASTA path.
      reference_chromosomes -- dict: chromosome name -> length.
      ns_by_chromosomes     -- per-chromosome N positions (passed to coverage analysis).
      old_contigs_fpath     -- original (pre-correction) assembly path, for the aligner.
      bed_fpath             -- unused here beyond the signature; kept for callers.
      threads               -- aligner thread count.

    Returns a 5-tuple: (AlignerStatus, result dict, aligned_lengths,
    misassemblies_in_contigs, aligned_lengths_by_contigs). On aligner failure
    the last four are empty ({} and []s).
    """
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    # All per-assembly outputs go to /dev/null in space-efficient mode.
    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath,
                           index, threads, log_out_fpath, log_err_fpath)
    # Any non-OK status aborts with empty results after logging the reason.
    if status != AlignerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error(' ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ':' +
                                coords_fpath + 'doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_features = {}

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    # Without a regions file, every chromosome counts as one whole region.
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f,
                         coords_filtered_f=open(coords_filtered_fpath, 'w'), icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs = \
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns,
                        ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(ref_aligns, reference_chromosomes,
                                                        ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {'SNPs': total_indels_info.mismatches, 'indels_list': total_indels_info.indels_list,
                 'total_aligned_bases': total_aligned_bases}
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        # Per-reference bookkeeping for combined-reference (meta) runs:
        # alignments_*.tsv lists contigs per chromosome; unique_contigs lists
        # SPAdes-style contigs (name encodes length/coverage) that are mostly
        # (>90% of their length) aligned to a single reference.
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            # Extract length/coverage from SPAdes-like contig names.
                            len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
                            if len_cov_pattern.findall(contig):
                                contig_len = len_cov_pattern.findall(contig)[0][0]
                                contig_cov = len_cov_pattern.findall(contig)[0][1]
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
def align_single_file(fpath, main_output_dir, output_dirpath, log_path, err_fpath, max_threads, sam_fpath=None, bam_fpath=None,
                      index=None, required_files=None, is_reference=False, alignment_only=False, using_reads='all'):
    """Align reads against a single FASTA file (an assembly or the reference) with BWA
    and post-process the result into sorted SAM/BAM plus flag statistics and coverage.

    Returns a 3-tuple ``(correct_chr_names, sam_fpath, bam_fpath)`` on success,
    or ``(None, None, None)`` when alignment is impossible or fails.

    Existing SAM/BAM/stat files are reused whenever they are consistent with the
    current inputs (checked via get_correct_names_for_chroms / all_read_names_correct).
    """
    filename = qutils.name_from_fpath(fpath)
    # Derive SAM/BAM paths from whichever of them the caller supplied.
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or join(output_dirpath, filename + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    # A restricted read set gets its own SAM/BAM pair so it never clobbers the 'all' files.
    if using_reads != 'all':
        sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.sam')
        bam_fpath = sam_fpath.replace('.sam', '.bam')
    if alignment_only or (is_reference and required_files and any(f.endswith('bed') for f in required_files)):
        required_files.append(sam_fpath)

    stats_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.stat')
    index_str = qutils.index_to_str(index) if index is not None else ''

    reads_fpaths = qconfig.reads_fpaths
    correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    # can_reuse == the existing SAM is consistent with this FASTA; without reads we cannot rebuild it.
    can_reuse = correct_chr_names is not None
    if not can_reuse and not reads_fpaths:
        return None, None, None
    if correct_chr_names and (not required_files or all(isfile(fpath) for fpath in required_files)):
        # Fast path: everything required already exists — optionally refresh stats/coverage and return.
        if not alignment_only:
            if isfile(stats_fpath):
                logger.info(' ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
            elif isfile(bam_fpath):
                # NOTE(review): the open() handles passed as stdout/stderr are never
                # explicitly closed — relies on GC; consider a with-statement.
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                                       stdout=open(stats_fpath, 'w'), stderr=open(err_fpath, 'a'))
            analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        if isfile(stats_fpath) or alignment_only:
            return correct_chr_names, sam_fpath, bam_fpath

    logger.info(' ' + index_str + 'Pre-processing reads...')
    if is_non_empty_file(sam_fpath) and can_reuse:
        logger.info(' ' + index_str + 'Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif is_non_empty_file(bam_fpath) and can_reuse:
        # Only the BAM survives: regenerate the SAM from it, then re-check chromosome names.
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
        sambamba_view(bam_fpath, sam_fpath, qconfig.max_threads, err_fpath, logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    if (not correct_chr_names or not is_non_empty_file(sam_fpath)) and reads_fpaths:
        # Nothing reusable — run BWA from scratch.
        if is_reference:
            logger.info(' Running BWA for reference...')
        else:
            logger.info(' ' + index_str + 'Running BWA...')
        # use absolute paths because we will change workdir
        fpath = abspath(fpath)
        sam_fpath = abspath(sam_fpath)
        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        bwa_index(fpath, err_fpath, logger)
        sam_fpaths = align_reads(fpath, sam_fpath, using_reads, main_output_dir, err_fpath, max_threads)
        # align_reads may produce one SAM per read library; merge or just rename.
        if len(sam_fpaths) > 1:
            merge_sam_files(sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath)
        elif len(sam_fpaths) == 1:
            shutil.move(sam_fpaths[0], sam_fpath)
            tmp_bam_fpath = sam_fpaths[0].replace('.sam', '.bam')
            if is_non_empty_file(tmp_bam_fpath):
                shutil.move(tmp_bam_fpath, bam_fpath)
        logger.info(' ' + index_str + 'Done.')
        os.chdir(prev_dir)
        if not is_non_empty_file(sam_fpath):
            logger.error(' Failed running BWA for ' + fpath + '. See ' + log_path + ' for information.')
            return None, None, None
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif not correct_chr_names or not is_non_empty_file(sam_fpath):
        # No reusable alignment and no reads to build one.
        return None, None, None

    if is_reference:
        logger.info(' Sorting SAM-file for reference...')
    else:
        logger.info(' ' + index_str + 'Sorting SAM-file...')
    if can_reuse and is_non_empty_file(bam_fpath) and all_read_names_correct(sam_fpath):
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
    else:
        correct_sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.correct.sam')  # write in output dir
        # Normalize read names, then convert the cleaned SAM to BAM.
        sam_fpath = clean_read_names(sam_fpath, correct_sam_fpath)
        sambamba_view(correct_sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

    qutils.assert_file_exists(bam_fpath, 'bam file')
    if not alignment_only:
        if isfile(stats_fpath):
            logger.info(' ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
        elif isfile(bam_fpath):
            # NOTE(review): same unclosed stdout/stderr handles as above.
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                                   stdout=open(stats_fpath, 'w'), stderr=open(err_fpath, 'a'))
        analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        if is_reference:
            logger.info(' Analysis for reference is finished.')
        else:
            logger.info(' ' + index_str + 'Analysis is finished.')
    return correct_chr_names, sam_fpath, bam_fpath
def main(args):
    """MetaQUAST entry point.

    Orchestrates the whole run: option parsing, reference correction (or
    download from SILVA/NCBI when none are given), the combined-reference
    quast.py run, per-reference runs (optionally in parallel), the run for
    contigs aligned nowhere, and the final summary/report generation.

    Returns the value of logger.finish_up(...) on normal completion, 4 when
    no assembly file contains usable contigs, and calls exit() when falling
    back to a regular (reference-less) QUAST run.
    """
    check_dirpath(qconfig.QUAST_HOME, 'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '.\n' +
                  'Please, put QUAST in a different directory, then try again.\n', exit_code=3)

    if not args:
        qconfig.usage(stream=sys.stderr)
        sys.exit(1)

    metaquast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, metaquast_path + args)
    output_dirpath, ref_fpaths, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    html_report = qconfig.html_report
    test_mode = qconfig.test

    # Directories
    output_dirpath, _, _ = qutils.set_up_output_dir(
        output_dirpath, None, not output_dirpath,
        save_json=False)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    qconfig.set_max_threads(logger)
    qutils.logger = logger

    ########################################################################

    from quast_libs import reporting
    # reporting holds per-run global state; reload it so repeated in-process
    # invocations start clean. importlib.reload is Python 3; builtin reload is the
    # Python 2 fallback.
    try:
        import importlib
        importlib.reload(reporting)
    except (ImportError, AttributeError):
        reload(reporting)
    from quast_libs import plotter

    # Start from an empty corrected-files directory every run.
    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES
    if ref_fpaths:
        logger.main_info()
        logger.main_info('Reference(s):')

        corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
            correct_meta_references(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    qconfig.no_check_meta = True
    assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
    if not assemblies:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.")
        return 4

    # Running QUAST(s)
    if qconfig.gene_finding:
        quast_py_args += ['--mgm']
    if qconfig.min_IDY is None:  # special case: user not specified min-IDY, so we need to use MetaQUAST default value
        quast_py_args += ['--min-identity', str(qconfig.META_MIN_IDY)]

    if qconfig.reuse_combined_alignments:
        reuse_combined_alignments = True
    else:
        reuse_combined_alignments = False

    downloaded_refs = False

    # SEARCHING REFERENCES
    # No references given: try to find/download them (SILVA 16S search or an
    # explicit --references-txt list fetched from NCBI).
    if not ref_fpaths:
        logger.main_info()
        if qconfig.max_references == 0:
            logger.notice("Maximum number of references (--max-ref-number) is set to 0, search in SILVA 16S rRNA database is disabled")
        else:
            if qconfig.references_txt:
                logger.main_info("List of references was provided, starting to download reference genomes from NCBI...")
            else:
                logger.main_info("No references are provided, starting to search for reference genomes in SILVA 16S rRNA database "
                                 "and to download them from NCBI...")
            downloaded_dirpath = os.path.join(output_dirpath, qconfig.downloaded_dirname)
            if not os.path.isdir(downloaded_dirpath):
                os.mkdir(downloaded_dirpath)
            corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)
            ref_fpaths = search_references_meta.do(assemblies, labels, downloaded_dirpath, corrected_dirpath, qconfig.references_txt)
            if ref_fpaths:
                search_references_meta.is_quast_first_run = True
                if not qconfig.references_txt:
                    downloaded_refs = True
                logger.main_info()
                logger.main_info('Downloaded reference(s):')
                corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=True)
            elif test_mode and not ref_fpaths:
                logger.error('Failed to download or setup SILVA 16S rRNA database for working without '
                             'references on metagenome datasets!', to_stderr=True, exit_with_code=4)

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.main_info()
        logger.notice('No references are provided, starting regular QUAST with MetaGeneMark gene finder')
        assemblies = [Assembly(fpath, qutils.label_from_fpath(fpath)) for fpath in contigs_fpaths]
        _start_quast_main(quast_py_args, assemblies=assemblies, output_dirpath=output_dirpath, run_regular_quast=True)
        # NOTE(review): builtin exit() rather than sys.exit() — works, but
        # sys.exit is the conventional form in scripts.
        exit(0)

    # Running combined reference
    combined_output_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name)
    qconfig.reference = combined_ref_fpath

    if qconfig.bed:
        quast_py_args += ['--sv-bed']
        quast_py_args += [qconfig.bed]

    quast_py_args += ['--combined-ref']
    if qconfig.draw_plots or qconfig.html_report:
        if plotter_data.dict_color_and_ls:
            # Fix colors/line styles per assembly so per-reference plots match the combined run.
            colors_and_ls = [plotter_data.dict_color_and_ls[asm.label] for asm in assemblies]
            quast_py_args += ['--colors']
            quast_py_args += [','.join([style[0] for style in colors_and_ls])]
            quast_py_args += ['--ls']
            quast_py_args += [','.join([style[1] for style in colors_and_ls])]
    run_name = 'for the combined reference'
    logger.main_info()
    logger.main_info('Starting quast.py ' + run_name + '...')
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    total_num_notifications = (total_num_notices, total_num_warnings, total_num_nf_errors)
    if qconfig.html_report:
        from quast_libs.html_saver import json_saver
        json_texts = []
    else:
        json_texts = None
    if qconfig.unique_mapping:
        ambiguity_opts = []
    else:
        ambiguity_opts = ["--ambiguity-usage", 'all']
    return_code, total_num_notifications = \
        _start_quast_main(quast_py_args + ambiguity_opts,
                          labels=labels,
                          assemblies=assemblies,
                          reference_fpath=combined_ref_fpath,
                          output_dirpath=combined_output_dirpath,
                          num_notifications_tuple=total_num_notifications,
                          is_combined_ref=True)

    if json_texts is not None:
        json_texts.append(json_saver.json_text)
    search_references_meta.is_quast_first_run = False

    genome_info_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name, 'genome_stats')
    genome_info_fpath = os.path.join(genome_info_dirpath, 'genome_info.txt')
    if not os.path.exists(genome_info_fpath):
        # Combined run produced no genome stats: nothing aligned to any reference.
        logger.main_info('')
        if not downloaded_refs:
            msg = 'Try to restart MetaQUAST with another references.'
        else:
            msg = 'Try to use option --max-ref-number to change maximum number of references (per each assembly) to download.'
        logger.main_info('Failed aligning the contigs for all the references. ' + msg)
        logger.main_info('')
        cleanup(corrected_dirpath)
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if downloaded_refs and return_code == 0:
        # Drop downloaded references that barely matched, then redo the combined run.
        logger.main_info()
        logger.main_info('Excluding downloaded references with low genome fraction from further analysis..')
        corr_ref_fpaths = get_downloaded_refs_with_alignments(genome_info_fpath, ref_fpaths, chromosomes_by_refs)
        if corr_ref_fpaths and corr_ref_fpaths != ref_fpaths:
            logger.main_info()
            logger.main_info('Filtered reference(s):')
            os.remove(combined_ref_fpath)
            contigs_analyzer.ref_labels_by_chromosomes = OrderedDict()
            corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names = \
                correct_meta_references(corr_ref_fpaths, corrected_dirpath)
            assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
            run_name = 'for the corrected combined reference'
            logger.main_info()
            logger.main_info('Starting quast.py ' + run_name + '...')
            return_code, total_num_notifications = \
                _start_quast_main(quast_py_args + ambiguity_opts,
                                  labels=labels,
                                  assemblies=assemblies,
                                  reference_fpath=combined_ref_fpath,
                                  output_dirpath=combined_output_dirpath,
                                  num_notifications_tuple=total_num_notifications,
                                  is_combined_ref=True)
            if json_texts is not None:
                # Replace the superseded combined-run report with the corrected one.
                json_texts = json_texts[:-1]
                json_texts.append(json_saver.json_text)
        elif corr_ref_fpaths == ref_fpaths:
            logger.main_info('All downloaded references have genome fraction more than 10%. Nothing was excluded.')
        else:
            logger.main_info('All downloaded references have low genome fraction. Nothing was excluded for now.')

    if return_code != 0:
        logger.main_info('MetaQUAST finished.')
        return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)

    if qconfig.calculate_read_support:
        calculate_ave_read_support(combined_output_dirpath, assemblies)

    prepare_regular_quast_args(quast_py_args, combined_output_dirpath, reuse_combined_alignments)
    logger.main_info()
    logger.main_info('Partitioning contigs into bins aligned to each reference..')

    # Split each assembly into per-reference bins using the combined-run alignments.
    assemblies_by_reference, not_aligned_assemblies = partition_contigs(
        assemblies, corrected_ref_fpaths, corrected_dirpath,
        os.path.join(combined_output_dirpath, qconfig.detailed_contigs_reports_dirname, 'alignments_%s.tsv'), labels)

    output_dirpath_per_ref = os.path.join(output_dirpath, qconfig.per_ref_dirname)
    if not qconfig.memory_efficient and \
            len(assemblies_by_reference) > len(assemblies) and len(assemblies) < qconfig.max_threads:
        # Many references, few assemblies: run one quast.py per reference in parallel,
        # giving each run a fair share of the thread budget.
        logger.main_info()
        logger.main_info('Run QUAST on different references in parallel..')
        threads_per_ref = max(1, qconfig.max_threads // len(assemblies_by_reference))
        quast_py_args += ['--memory-efficient']
        quast_py_args += ['-t', str(threads_per_ref)]

        num_notifications = (0, 0, 0)
        parallel_run_args = [(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, num_notifications, True)
                             for ref_fpath, ref_assemblies in assemblies_by_reference]
        ref_names, ref_json_texts, ref_notifications = \
            run_parallel(_run_quast_per_ref, parallel_run_args, qconfig.max_threads, filter_results=True)
        per_ref_num_notifications = list(map(sum, zip(*ref_notifications)))
        total_num_notifications = list(map(sum, zip(total_num_notifications, per_ref_num_notifications)))
        if json_texts is not None:
            json_texts.extend(ref_json_texts)
        # Undo the per-reference-only options before any later quast.py invocation.
        quast_py_args.remove('--memory-efficient')
        quast_py_args = remove_from_quast_py_args(quast_py_args, '-t', str(threads_per_ref))
    else:
        ref_names = []
        for ref_fpath, ref_assemblies in assemblies_by_reference:
            ref_name, json_text, total_num_notifications = \
                _run_quast_per_ref(quast_py_args, output_dirpath_per_ref, ref_fpath, ref_assemblies, total_num_notifications)
            if not ref_name:
                continue
            ref_names.append(ref_name)
            if json_texts is not None:
                json_texts.append(json_text)

    # Finally running for the contigs that has not been aligned to any reference
    no_unaligned_contigs = True
    for assembly in not_aligned_assemblies:
        if os.path.isfile(assembly.fpath) and os.stat(assembly.fpath).st_size != 0:
            no_unaligned_contigs = False
            break

    run_name = 'for the contigs not aligned anywhere'
    logger.main_info()
    if no_unaligned_contigs:
        logger.main_info('Skipping quast.py ' + run_name + ' (everything is aligned!)')
    else:
        logger.main_info('Starting quast.py ' + run_name + '... (logging to ' +
                         os.path.join(output_dirpath, qconfig.not_aligned_name, qconfig.LOGGER_DEFAULT_NAME + '.log)'))
        return_code, total_num_notifications = _start_quast_main(quast_py_args + ['-t', str(qconfig.max_threads)],
                                                                 assemblies=not_aligned_assemblies,
                                                                 output_dirpath=os.path.join(output_dirpath, qconfig.not_aligned_name),
                                                                 num_notifications_tuple=total_num_notifications)

        if return_code not in [0, 4]:
            logger.error('Error running quast.py for the contigs not aligned anywhere')
        elif return_code == 4:  # no unaligned contigs, i.e. everything aligned
            no_unaligned_contigs = True
        if not no_unaligned_contigs:
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    if ref_names:
        # Build the cross-reference summary tables, plots and (optionally) HTML/Icarus.
        logger.print_timestamp()
        logger.main_info("Summarizing results...")

        summary_output_dirpath = os.path.join(output_dirpath, qconfig.meta_summary_dir)
        if not os.path.isdir(summary_output_dirpath):
            os.makedirs(summary_output_dirpath)
        if html_report and json_texts:
            from quast_libs.html_saver import html_saver
            html_summary_report_fpath = html_saver.init_meta_report(output_dirpath)
        else:
            html_summary_report_fpath = None
        from quast_libs import create_meta_summary
        metrics_for_plots = reporting.Fields.main_metrics
        misassembly_metrics = [reporting.Fields.MIS_RELOCATION, reporting.Fields.MIS_TRANSLOCATION, reporting.Fields.MIS_INVERTION,
                               reporting.Fields.MIS_ISTRANSLOCATIONS]
        if no_unaligned_contigs:
            full_ref_names = [qutils.name_from_fpath(ref_fpath) for ref_fpath in corrected_ref_fpaths]
        else:
            full_ref_names = [qutils.name_from_fpath(ref_fpath) for ref_fpath in corrected_ref_fpaths] + [qconfig.not_aligned_name]
        create_meta_summary.do(html_summary_report_fpath, summary_output_dirpath, combined_output_dirpath,
                               output_dirpath_per_ref, metrics_for_plots, misassembly_metrics, full_ref_names)
        if html_report and json_texts:
            html_saver.save_colors(output_dirpath, contigs_fpaths, plotter_data.dict_color_and_ls, meta=True)
            if qconfig.create_icarus_html:
                icarus_html_fpath = html_saver.create_meta_icarus(output_dirpath, ref_names)
                logger.main_info(' Icarus (contig browser) is saved to %s' % icarus_html_fpath)
            html_saver.create_meta_report(output_dirpath, json_texts)

    cleanup(corrected_dirpath)
    logger.main_info('')
    logger.main_info('MetaQUAST finished.')
    return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath, genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Genome analyzer: compute genome fraction, duplication ratio, gaps and
    gene/operon coverage for every aligned assembly.

    Writes a plain-text summary to <genome_stats_dirpath>/genome_info.txt,
    fills per-assembly reporting fields, and optionally emits JSON/HTML data
    and plots. Returns [genes_container, operons_container], or None when the
    analysis fails for all assemblies.
    """
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        # First (combined-reference) run keeps raw nucmer output in a subfolder.
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    # Map chromosome name (first token of the FASTA header) -> length, and total size.
    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]

    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')

    # Load annotated genes/operons (if provided) and index them by chromosome.
    genes_container = FeatureContainer(genes_fpaths, 'gene')
    operons_container = FeatureContainer(operons_fpaths, 'operon')
    for container in [genes_container, operons_container]:
        if not container.fpaths:
            logger.notice('No file with ' + container.kind + 's provided. '
                          'Use the -' + container.kind[0].capitalize() + ' option '
                          'if you want to specify it.', indent=' ')
            continue
        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)
        if len(container.region_list) == 0:
            logger.warning('No ' + container.kind + 's were loaded.', indent=' ')
            res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
        else:
            logger.info(' Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
            res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
        container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list, list(reference_chromosomes.keys()))

    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        if genes_container.fpaths:
            report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
        if operons_container.fpaths:
            report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

    # for cumulative plots:
    files_genes_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    # NOTE(review): elsewhere in this codebase the Python 2 branch imports
    # 'joblib2' — here it imports 'joblib'; verify which bundled copy exists.
    if is_python2():
        from joblib import Parallel, delayed
    else:
        from joblib3 import Parallel, delayed
    process_results = Parallel(n_jobs=n_jobs)(delayed(process_single_file)(
        contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
        reference_chromosomes, genes_container, operons_container)
        for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
    # A None result means that assembly failed; count it as a non-fatal error.
    num_nf_errors += len([res for res in process_results if res is None])
    logger._num_nf_errors = num_nf_errors
    process_results = [res for res in process_results if res]
    if not process_results:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    ref_lengths = [process_results[i][0] for i in range(len(process_results))]
    results_genes_operons_tuples = [process_results[i][1] for i in range(len(process_results))]
    for ref in reference_chromosomes:
        # NOTE(review): ref_lengths_by_contigs is not defined in this function —
        # presumably a module-level dict shared with other code; confirm it is
        # (re)initialized before each call.
        ref_lengths_by_contigs[ref] = [ref_lengths[i][ref] for i in range(len(ref_lengths))]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.items():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('================================================================================================================\n')

    for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_genes_in_contigs[contigs_fpath] = genes_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        full_found_genes.append(sum(genes_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        covered_bp = results["covered_bp"]
        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)
        genome_fraction = float(covered_bp) * 100 / float(genome_size)
        # Duplication ratio: aligned assembly bases (total minus unaligned, plus
        # internal-overlap and ambiguous-extra corrections) over covered reference bases.
        duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                             report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                             report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                             report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                            ((genome_fraction / 100.0) * float(genome_size))

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
                       % (assembly_name[:24], '%3.5f%%' % genome_fraction, '%1.5f' % duplication_ratio, gaps_count))

        report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
        genome_mapped.append(genome_fraction)

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                    (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None
    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths, files_genes_in_contigs,
                                       genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes, genome_stats_dirpath + '/complete_genes_histogram',
                              '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths, files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons, genome_stats_dirpath + '/complete_operons_histogram',
                              '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped, genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
    return [genes_container, operons_container]
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath, aligned_lengths_lists, aligned_stats_dirpath):
    """Compute NAx/NGAx statistics (NA50/75, NGA50/75 and their L-counterparts)
    from aligned-block lengths, fill the per-assembly reports and draw plots.

    NAx uses the assembly length as denominator; NGAx uses the reference length
    (skipped for the combined reference, where a joint reference length is
    meaningless). Returns report_dict (header + one empty row list per assembly).
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    for contigs_fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(contigs_fpath)] = []

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    ref_chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath)
    reference_length = sum(ref_chr_lengths.values())
    assembly_lengths = []
    for contigs_fpath in aligned_contigs_fpaths:
        assembly_lengths.append(sum(fastaparser.get_chr_lengths_from_fastafile(contigs_fpath).values()))

    import N50
    # BUG FIX: itertools.izip does not exist on Python 3 (AttributeError at
    # runtime); the builtin zip behaves identically here on both Python 2 and 3.
    for i, (contigs_fpath, lens, assembly_len) in enumerate(
            zip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)):
        na50 = N50.NG50(lens, assembly_len)
        na75 = N50.NG50(lens, assembly_len, 75)
        la50 = N50.LG50(lens, assembly_len)
        la75 = N50.LG50(lens, assembly_len, 75)
        if not qconfig.is_combined_ref:
            # NGAx is only meaningful against a single real reference.
            nga50 = N50.NG50(lens, reference_length)
            nga75 = N50.NG50(lens, reference_length, 75)
            lga50 = N50.LG50(lens, reference_length)
            lga75 = N50.LG50(lens, reference_length, 75)

        # The conditional parts are skipped (short-circuit) for the combined ref,
        # so nga50/lga50 are never evaluated when undefined.
        logger.info(' ' + qutils.index_to_str(i) + qutils.label_from_fpath(contigs_fpath) +
                    ', Largest alignment = ' + str(max(lens)) +
                    ', NA50 = ' + str(na50) +
                    (', NGA50 = ' + str(nga50) if not qconfig.is_combined_ref and nga50 else '') +
                    ', LA50 = ' + str(la50) +
                    (', LGA50 = ' + str(lga50) if not qconfig.is_combined_ref and lga50 else ''))
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        report.add_field(reporting.Fields.TOTAL_ALIGNED_LEN, sum(lens))
        report.add_field(reporting.Fields.NA50, na50)
        report.add_field(reporting.Fields.NA75, na75)
        report.add_field(reporting.Fields.LA50, la50)
        report.add_field(reporting.Fields.LA75, la75)
        if not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NGA50, nga50)
            report.add_field(reporting.Fields.NGA75, nga75)
            report.add_field(reporting.Fields.LGA50, lga50)
            report.add_field(reporting.Fields.LGA75, lga75)

    ########################################################################
    num_contigs = max([len(aligned_lengths_lists[i]) for i in range(len(aligned_lengths_lists))])

    if json_output_dirpath:
        from quast_libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    import plotter
    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
                                'Cumulative length (aligned contigs)')

        # Drawing NAx and NGAx plots...
    plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths, aligned_lengths_lists,
                    aligned_stats_dirpath + '/NAx_plot', 'NAx', assembly_lengths, json_output_dir=json_output_dirpath)
    if not qconfig.is_combined_ref:
        plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points, aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NGAx_plot', 'NGAx',
                        [reference_length for i in range(len(aligned_contigs_fpaths))], json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath, old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1): nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath) assembly_label = qutils.label_from_fpath(contigs_fpath) corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath) nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label) logger.info(' ' + qutils.index_to_str(index) + assembly_label) if not qconfig.space_efficient: log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout') log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr') icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label) misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info') unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info') else: log_out_fpath = '/dev/null' log_err_fpath = '/dev/null' icarus_out_fpath = '/dev/null' misassembly_fpath = '/dev/null' unaligned_info_fpath = '/dev/null' icarus_out_f = open(icarus_out_fpath, 'w') icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group'] icarus_out_f.write('\t'.join(icarus_header_cols) + '\n') misassembly_f = open(misassembly_fpath, 'w') if not qconfig.space_efficient: logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath + ' and ' + os.path.basename(log_err_fpath) + '...') else: logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.') coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \ get_nucmer_aux_out_fpaths(nucmer_fpath) nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index, parallel_by_chr, threads, log_out_fpath, log_err_fpath) if 
nucmer_status != NucmerStatus.OK: with open(log_err_fpath, 'a') as log_err_f: if nucmer_status == NucmerStatus.ERROR: logger.error(' ' + qutils.index_to_str(index) + 'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) + ' to the reference (non-zero exit code). ' + ('Run with the --debug flag to see additional information.' if not qconfig.debug else '')) elif nucmer_status == NucmerStatus.FAILED: log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ':' + coords_fpath + 'doesn\'t exist.\n') logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.') elif nucmer_status == NucmerStatus.NOT_ALIGNED: log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n') logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.') clean_tmp_files(nucmer_fpath) return nucmer_status, {}, [], [], [] log_out_f = open(log_out_fpath, 'a') # Loading the alignment files log_out_f.write('Parsing coords...\n') aligns = {} coords_file = open(coords_fpath) coords_filtered_file = open(coords_filtered_fpath, 'w') coords_filtered_file.write(coords_file.readline()) coords_filtered_file.write(coords_file.readline()) for line in coords_file: if line.strip() == '': break assert line[0] != '=' #Clear leading spaces from nucmer output #Store nucmer lines in an array mapping = Mapping.from_line(line) aligns.setdefault(mapping.contig, []).append(mapping) # Loading the reference sequences log_out_f.write('Loading reference...\n') # TODO: move up ref_lens = {} ref_features = {} for name, seq in fastaparser.read_fasta(ref_fpath): name = name.split()[0] # no spaces in reference header ref_lens[name] = len(seq) log_out_f.write('\tLoaded [%s]\n' % name) #Loading the SNP calls if qconfig.show_snps: log_out_f.write('Loading SNPs...\n') used_snps_file = None snps = {} if qconfig.show_snps: prev_line = None for line in open_gzipsafe(show_snps_fpath): 
#print "$line"; line = line.split() if not line[0].isdigit(): continue if prev_line and line == prev_line: continue ref = line[10] ctg = line[11] pos = int(line[0]) # Kolya: python don't convert int<->str types automatically loc = int(line[3]) # Kolya: same as above # if (! exists $line[11]) { die "Malformed line in SNP file. Please check that show-snps has completed succesfully.\n$line\n[$line[9]][$line[10]][$line[11]]\n"; } if pos in snps.setdefault(ref, {}).setdefault(ctg, {}): snps.setdefault(ref, {}).setdefault(ctg, {})[pos].append(SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2])) else: snps.setdefault(ref, {}).setdefault(ctg, {})[pos] = [SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2])] prev_line = line used_snps_file = open_gzipsafe(used_snps_fpath, 'w') # Loading the regions (if any) regions = {} total_reg_len = 0 total_regions = 0 # # TODO: gff # log_out_f.write('Loading regions...\n') # log_out_f.write('\tNo regions given, using whole reference.\n') for name, seq_len in ref_lens.items(): regions.setdefault(name, []).append([1, seq_len]) total_regions += 1 total_reg_len += seq_len log_out_f.write('\tTotal Regions: %d\n' % total_regions) log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len) ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file, used_snps_f=used_snps_file, icarus_out_f=icarus_out_f) log_out_f.write('Analyzing contigs...\n') result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\ analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic) # if qconfig.large_genome: # log_out_f.write('Analyzing large blocks...\n') # large_misassembly_fpath = add_suffix(misassembly_fpath, 'large_blocks') if not qconfig.space_efficient else '/dev/null' # ca_large_output = CAOutput(stdout_f=log_out_f, 
misassembly_f=open(large_misassembly_fpath, 'w'), # coords_filtered_f=coords_filtered_file, used_snps_f=open('/dev/null', 'w'), icarus_out_f=open('/dev/null', 'w')) # min_alignment, extensive_mis_threshold = qconfig.min_alignment, qconfig.extensive_misassembly_threshold # qconfig.min_alignment, qconfig.extensive_misassembly_threshold = qconfig.LARGE_MIN_ALIGNMENT, qconfig.LARGE_EXTENSIVE_MIS_THRESHOLD # result.update(analyze_contigs(ca_large_output, contigs_fpath, '/dev/null', '/dev/null', # aligns, ref_features, ref_lens, is_cyclic, large_misassemblies_search=True)[0]) # qconfig.min_alignment, qconfig.extensive_misassembly_threshold = min_alignment, extensive_mis_threshold log_out_f.write('Analyzing coverage...\n') if qconfig.show_snps: log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n') result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info)) result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result) if not qconfig.space_efficient: ## outputting misassembled contigs to separate file fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath) if name in misassembled_contigs.keys()] fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta) if qconfig.is_combined_ref: alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv') unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label) logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath)) used_contigs = set() with open(unique_contigs_fpath, 'w') as unique_contigs_f: with open(alignment_tsv_fpath, 'w') as alignment_tsv_f: for chr_name, aligns in ref_aligns.items(): alignment_tsv_f.write(chr_name) contigs = set([align.contig for align in aligns]) for contig in contigs: alignment_tsv_f.write('\t' + contig) if qconfig.is_combined_ref: ref_name 
= ref_labels_by_chromosomes[chr_name] align_by_contigs = defaultdict(int) for align in aligns: align_by_contigs[align.contig] += align.len2 for contig, aligned_len in align_by_contigs.items(): if contig in used_contigs: continue used_contigs.add(contig) len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)') if len_cov_pattern.findall(contig): contig_len = len_cov_pattern.findall(contig)[0][0] contig_cov = len_cov_pattern.findall(contig)[0][1] if aligned_len / float(contig_len) > 0.9: unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n') alignment_tsv_f.write('\n') close_handlers(ca_output) logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.') logger.debug('') clean_tmp_files(nucmer_fpath) if not qconfig.no_gzip: compress_nucmer_output(logger, nucmer_fpath) if not ref_aligns: return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs else: return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, features_dict, operons_fpaths,
       detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Run the genome analyzer over all aligned assemblies.

    Loads genomic features and operons, computes per-assembly genome statistics
    in parallel, writes the plain-text ``genome_info.txt`` summary table and,
    depending on qconfig flags, HTML feature data and cumulative/FRC/histogram
    plots.

    Returns the list of FeatureContainer objects on success, or None (bare
    return) if the analysis failed for every assembly.
    """
    coords_dirpath = os.path.join(detailed_contigs_reports_dirpath, qconfig.minimap_output_dirname)
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        # first run of meta-QUAST keeps raw alignments in a subdirectory
        coords_dirpath = os.path.join(coords_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    genome_size, reference_chromosomes, ns_by_chromosomes = fastaparser.get_genome_stats(ref_fpath)

    # Plain-text RESULTS file with the summary table written below.
    result_fpath = os.path.join(genome_stats_dirpath, 'genome_info.txt')
    res_file = open(result_fpath, 'w')

    # One FeatureContainer per feature kind, plus one for operons (if given).
    containers = []
    for feature, feature_fpath in features_dict.items():
        containers.append(FeatureContainer([feature_fpath], feature))
    if not features_dict:
        logger.notice('No file with genomic features were provided. '
                      'Use the --features option if you want to specify it.\n', indent=' ')
    if operons_fpaths:
        containers.append(FeatureContainer(operons_fpaths, 'operon'))
    else:
        logger.notice('No file with operons were provided. '
                      'Use the -O option if you want to specify it.', indent=' ')
    for container in containers:
        if not container.fpaths:
            continue

        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

        if len(container.region_list) == 0:
            logger.warning('No genomic features of type "' + container.kind + '" were loaded.', indent=' ')
            res_file.write('Genomic features of type "' + container.kind + '" loaded: ' + 'None' + '\n')
        else:
            logger.info(' Loaded ' + str(len(container.region_list)) + ' genomic features of type "' + container.kind + '"')
            res_file.write('Genomic features of type "' + container.kind + '" loaded: ' + str(len(container.region_list)) + '\n')
            # map feature chromosome names onto reference chromosome names
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list, list(reference_chromosomes.keys()))

    # Record reference feature/operon totals in every assembly's report.
    ref_genes_num, ref_operons_num = None, None
    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        genomic_features = 0
        for container in containers:
            if container.kind == 'operon':
                ref_operons_num = len(container.region_list)
                report.add_field(reporting.Fields.REF_OPERONS, len(container.region_list))
            else:
                genomic_features += len(container.region_list)
        if genomic_features:
            ref_genes_num = genomic_features
            report.add_field(reporting.Fields.REF_GENES, genomic_features)

    # for cumulative plots:
    files_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_unsorted_features_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}
    files_unsorted_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # Process all contig files in parallel; filter_results drops failed runs.
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    parallel_run_args = [(contigs_fpath, index, coords_dirpath, genome_stats_dirpath, reference_chromosomes,
                          ns_by_chromosomes, containers) for index, contigs_fpath in enumerate(aligned_contigs_fpaths)]
    ref_lengths, results_genes_operons_tuples = run_parallel(process_single_file, parallel_run_args, n_jobs, filter_results=True)
    # assemblies dropped by filter_results count as non-fatal errors
    num_nf_errors += len(aligned_contigs_fpaths) - len(ref_lengths)
    logger._num_nf_errors = num_nf_errors
    if not ref_lengths:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    # NOTE(review): ref_lengths_by_contigs is not defined in this function —
    # presumably a module-level dict initialized elsewhere; confirm, otherwise
    # this raises NameError on first use.
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [ref_lengths[i][ref] for i in range(len(ref_lengths))]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.items():
        aligned_len = max(ref_lengths_by_contigs[chr_name])  # best covered length over all assemblies
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) + ' bp, ' +
                       'total length without N\'s: ' + str(chr_len - len(ns_by_chromosomes[chr_name])) +
                       ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

    # table header (two rows of column titles + separator)
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('=' * 120 + '\n')

    # One table row per assembly; also collect data for plots/histograms.
    for contigs_fpath, (results, unsorted_features_in_contigs, features_in_contigs, unsorted_operons_in_contigs, operons_in_contigs)\
            in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_features_in_contigs[contigs_fpath] = features_in_contigs
        files_unsorted_features_in_contigs[contigs_fpath] = unsorted_features_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        files_unsorted_operons_in_contigs[contigs_fpath] = unsorted_operons_in_contigs
        full_found_genes.append(sum(features_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
                       % (assembly_name[:24], report.get_field(reporting.Fields.MAPPEDGENOME),
                          report.get_field(reporting.Fields.DUPLICATION_RATIO), gaps_count))
        genome_mapped.append(float(report.get_field(reporting.Fields.MAPPEDGENOME)))

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                    (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if ref_genes_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'features',
                                                files_features_in_contigs, ref_genes_num)
        if ref_operons_num:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        from quast_libs.ca_utils.misc import contigs_aligned_lengths
        if ref_genes_num:
            plotter.genes_operons_plot(ref_genes_num, aligned_contigs_fpaths, files_features_in_contigs,
                                       genome_stats_dirpath + '/features_cumulative_plot', 'genomic features')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths,
                             files_unsorted_features_in_contigs,
                             genome_stats_dirpath + '/features_frcurve_plot', 'genomic features')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_features_histogram', '# complete genomic features')
        if ref_operons_num:
            plotter.genes_operons_plot(ref_operons_num, aligned_contigs_fpaths, files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.frc_plot(output_dirpath, ref_fpath, aligned_contigs_fpaths, contigs_aligned_lengths,
                             files_unsorted_operons_in_contigs,
                             genome_stats_dirpath + '/operons_frcurve_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram', '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram', 'Genome fraction, %', top_value=100)
    logger.main_info('Done.')
    return containers
def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels, reads_fpaths, output_dirpath, res_path,
                         log_path, err_path, sam_fpath=None, bam_fpath=None, bed_fpath=None):
    """Map reads to the reference and search for structural variations.

    Reuses any existing SAM/BAM/BED/coverage files it finds; otherwise runs
    BWA to map the reads, sambamba to convert/sort, and then scans the sorted
    SAM for long zero-covered fragments ("trivial deletions"), optionally
    combining them with Manta SV calls into a BED file.

    Returns a (bed_fpath, cov_fpath, physical_cov_fpath) tuple; each element
    may be None if the corresponding step was skipped or failed.
    """
    ref_name = qutils.name_from_fpath(main_ref_fpath)
    # Derive SAM/BAM paths from whichever input the caller supplied.
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or os.path.join(output_dirpath, ref_name + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    sam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(sam_fpath, 'sorted'))
    bam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(bam_fpath, 'sorted'))
    bed_fpath = bed_fpath or os.path.join(res_path, ref_name + '.bed')
    cov_fpath = os.path.join(res_path, ref_name + '.cov')
    physical_cov_fpath = os.path.join(res_path, ref_name + '.physical.cov')
    if qconfig.no_sv:
        logger.info(' Will not search Structural Variations (--fast or --no-sv is specified)')
        bed_fpath = None
    elif is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
    if qconfig.create_icarus_html:
        if is_non_empty_file(cov_fpath):
            is_correct_file = check_cov_file(cov_fpath)
            if is_correct_file:
                logger.info(' Using existing reads coverage file: ' + cov_fpath)
        if is_non_empty_file(physical_cov_fpath):
            logger.info(' Using existing physical coverage file: ' + physical_cov_fpath)
    else:
        logger.info(' Will not calculate coverage (--no-icarus or --space-efficient is specified)')
        cov_fpath = None
        physical_cov_fpath = None
    # Early exit: everything we were asked for already exists.
    if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
            (qconfig.space_efficient or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
        return bed_fpath, cov_fpath, physical_cov_fpath

    logger.info(' ' + 'Pre-processing reads...')
    logger.info(' ' + 'Logging to %s...' % err_path)
    # Figure out reference chromosome naming; stays None if mapping is missing.
    correct_chr_names = None
    if is_non_empty_file(sam_fpath):
        logger.info(' Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, main_ref_fpath, sam_fpath, err_path, reads_fpaths)
    elif is_non_empty_file(bam_fpath):
        logger.info(' Using existing BAM-file: ' + bam_fpath)
        # NOTE(review): file handles opened inline for stdout/stderr are never
        # explicitly closed (here and below) — relies on GC/interpreter exit.
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_fpath],
                               stdout=open(sam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, main_ref_fpath, sam_fpath, err_path, reads_fpaths)
    if not correct_chr_names and reads_fpaths:
        logger.info(' Running BWA...')
        # use absolute paths because we will change workdir
        sam_fpath = os.path.abspath(sam_fpath)
        abs_reads_fpaths = []
        for reads_fpath in reads_fpaths:
            abs_reads_fpaths.append(os.path.abspath(reads_fpath))

        # only paired-end (forward + reverse) input is supported here
        if len(abs_reads_fpaths) != 2:
            logger.error(' You should specify files with forward and reverse reads.')
            logger.info(' Failed searching structural variations.')
            return None, None, None

        if not qconfig.no_check:
            if not paired_reads_names_are_equal(reads_fpaths, logger):
                logger.error(' Read names are discordant, skipping reads analysis!')
                logger.info(' Failed searching structural variations.')
                return None, None, None

        # Run 'bwa index' + 'bwa mem' inside output_dirpath, then restore cwd.
        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        cmd = [bwa_fpath('bwa'), 'index', '-p', ref_name, main_ref_fpath]
        if os.path.getsize(main_ref_fpath) > 2 * 1024 ** 3:  # if reference size bigger than 2GB
            cmd += ['-a', 'bwtsw']
        qutils.call_subprocess(cmd, stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)

        cmd = bwa_fpath('bwa') + ' mem -t ' + str(qconfig.max_threads) + ' ' + ref_name + ' ' + \
              abs_reads_fpaths[0] + ' ' + abs_reads_fpaths[1]
        qutils.call_subprocess(shlex.split(cmd), stdout=open(sam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        logger.info(' Done.')
        os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error(' Failed running BWA for the reference. See ' + log_path + ' for information.')
            logger.info(' Failed searching structural variations.')
            return None, None, None
    elif not correct_chr_names:
        # no existing alignment and no reads to map with
        logger.info(' Failed searching structural variations.')
        return None, None, None
    logger.info(' Sorting SAM-file...')
    if (is_non_empty_file(sam_sorted_fpath) and all_read_names_correct(sam_sorted_fpath)) and is_non_empty_file(bam_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        correct_sam_fpath = os.path.join(output_dirpath, ref_name + '.sam.correct')  # write in output dir
        clean_read_names(sam_fpath, correct_sam_fpath)
        bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
        # SAM -> BAM (mapped reads only) -> sorted BAM -> sorted SAM
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                '-F', 'not unmapped', '-S', correct_sam_fpath],
                               stdout=open(bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), '-o', bam_sorted_fpath,
                                bam_fpath], stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_sorted_fpath],
                               stdout=open(sam_sorted_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)

    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(output_dirpath, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                                                     log_path, err_path, cov_fpath, physical_cov_fpath, correct_chr_names)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info(' Splitting SAM-file by references...')
        # Parse SAM header: remember all header lines plus per-sequence lengths.
        headers = []
        seq_name_length = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_name_length[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        if meta_ref_fpaths:
            # For meta mode: open one per-reference SAM file with a filtered header.
            ref_files = {}
            for cur_ref_fpath in meta_ref_fpaths:
                ref = qutils.name_from_fpath(cur_ref_fpath)
                new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
                if is_non_empty_file(new_ref_sam_fpath):
                    logger.info(' Using existing split SAM-file for %s: %s' % (ref, new_ref_sam_fpath))
                    ref_files[ref] = None  # None marks "already done, do not rewrite"
                else:
                    new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        new_ref_sam_file.write(headers[0] + '\n')
                    chrs = []
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == ref:
                            new_ref_sam_file.write(h + '\n')
                            chrs.append(seq_name)
                    new_ref_sam_file.write(headers[-1] + '\n')
                    ref_files[ref] = new_ref_sam_file
                    need_ref_splitting = True
        deletions = []
        trivial_deletions_fpath = os.path.join(output_dirpath, qconfig.trivial_deletions_fname)
        logger.info(' Looking for trivial deletions (long zero-covered fragments)...')
        need_trivial_deletions = True
        if os.path.exists(trivial_deletions_fpath):
            need_trivial_deletions = False
            logger.info(' Using existing file: ' + trivial_deletions_fpath)

        if need_trivial_deletions or need_ref_splitting:
            # State machine over the sorted SAM: track gaps between mapped reads
            # per reference and collect QuastDeletion candidates.
            with open(sam_sorted_fpath) as sam_file:
                cur_deletion = None
                for line in sam_file:
                    mapping = Mapping.parse(line)
                    if mapping:
                        if mapping.ref == '*':
                            continue
                        # common case: continue current deletion (potential) on the same reference
                        if cur_deletion and cur_deletion.ref == mapping.ref:
                            if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                                # just passed 0-covered fragment
                                if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                    cur_deletion.set_next_bad(mapping)
                                    if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                        cur_deletion.set_next_good(mapping)
                                        if cur_deletion.is_valid():
                                            deletions.append(cur_deletion)
                                        cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                # continue region BEFORE 0-covered fragment
                                elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_prev_good(mapping)
                                else:
                                    cur_deletion.set_prev_bad(mapping)
                            else:  # previous mapping was in region AFTER 0-covered fragment
                                # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                                if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                    if cur_deletion.is_valid():  # add previous fragment's deletion if needed
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_bad(position=cur_deletion.next_bad_end)
                                # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                else:
                                    cur_deletion.set_next_bad_end(mapping)
                        # special case: just started or just switched to the next reference
                        else:
                            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                                cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                            cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)

                        if need_ref_splitting:
                            # forward read line into the per-reference SAM file
                            cur_ref = ref_labels[mapping.ref]
                            if mapping.ref_next.strip() == '=' or cur_ref == ref_labels[mapping.ref_next]:
                                if ref_files[cur_ref] is not None:
                                    ref_files[cur_ref].write(line)
                # flush the deletion still open at EOF
                if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                    cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                    if cur_deletion.is_valid():
                        deletions.append(cur_deletion)
            if need_ref_splitting:
                for ref_handler in ref_files.values():
                    if ref_handler is not None:
                        ref_handler.close()
        if need_trivial_deletions:
            logger.info(' Trivial deletions: %d found' % len(deletions))
            logger.info(' Saving to: ' + trivial_deletions_fpath)
            with open(trivial_deletions_fpath, 'w') as f:
                for deletion in deletions:
                    f.write(str(deletion) + '\n')

        if isfile(config_manta_fpath):
            # NOTE(review): bare except deliberately makes Manta best-effort —
            # on any failure we fall back to trivial deletions alone below.
            try:
                manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path)
                qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except:
                pass
        if os.path.exists(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    # Final status reporting; normalize missing outputs to None.
    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info(' Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info(' No structural variations were found.')
            else:
                logger.main_info(' Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(' Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info(' Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
def save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger):
    """Summarize interspecies translocations per reference for a combined-reference run.

    Caches per-assembly, per-reference misassembly lists in the module-level
    ``intergenomic_misassemblies_by_asm`` dict (used later by save_result),
    writes one ``interspecies_translocations_by_refs_<asm>.info`` file per
    assembly, and optionally draws the intergenomic-misassemblies summary plot.
    """
    istranslocations_by_asm = [result['istranslocations_by_refs'] if result else None for result in results]
    misassemblies_by_asm = [result['misassemblies_by_ref'] if result else None for result in results]
    # unique reference labels, in input order unless sorting was requested
    all_refs = []
    for ref in ref_labels_by_chromosomes.values():
        if ref not in all_refs:
            all_refs.append(ref)
    if not qconfig.use_input_ref_order:
        all_refs.sort()
    misassemblies_by_refs_rows = []
    row = {'metricName': 'References', 'values': all_refs}
    misassemblies_by_refs_rows.append(row)
    if not istranslocations_by_asm:
        return
    for i, fpath in enumerate(contigs_fpaths):
        label = qutils.label_from_fpath(fpath)
        row = {'metricName': label, 'values': []}
        misassemblies_by_refs_rows.append(row)
        istranslocations_by_ref = istranslocations_by_asm[i]
        # cache per-reference misassembly lists in the module-level dict
        intergenomic_misassemblies_by_asm[label] = defaultdict(list)
        for ref in all_refs:
            intergenomic_misassemblies_by_asm[label][ref] = misassemblies_by_asm[i][ref] if misassemblies_by_asm[i] else []
        if istranslocations_by_ref:
            assembly_name = qutils.name_from_fpath(fpath)
            # Build the ref-by-ref translocation matrix (references are numbered
            # in the header row; the legend is appended to the file below).
            all_rows = []
            row = {'metricName': 'References', 'values': [ref_num + 1 for ref_num in range(len(all_refs))]}
            all_rows.append(row)
            for ref in all_refs:
                row = {'metricName': ref, 'values': []}
                for second_ref in all_refs:
                    if ref == second_ref or second_ref not in istranslocations_by_ref:
                        row['values'].append(None)
                    else:
                        row['values'].append(istranslocations_by_ref[ref][second_ref])
                possible_misassemblies = 0
                misassemblies_by_ref = misassemblies_by_asm[i]
                if misassemblies_by_ref:
                    possible_misassemblies = misassemblies_by_ref[ref].count(Misassembly.POSSIBLE_MISASSEMBLIES)
                istranslocations = max(0, sum([r for r in row['values'] if r]))
                misassemblies_by_refs_rows[-1]['values'].append(istranslocations + possible_misassemblies)
                all_rows.append(row)
            misassembly_by_ref_fpath = os.path.join(output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
            with open(misassembly_by_ref_fpath, 'w') as misassembly_by_ref_file:
                misassembly_by_ref_file.write('Number of interspecies translocations by references: \n')
            print_file(all_rows, misassembly_by_ref_fpath, append_to_existing_file=True)

            # append the reference-number legend
            with open(misassembly_by_ref_fpath, 'a') as misassembly_by_ref_file:
                misassembly_by_ref_file.write('References:\n')
                for ref_num, ref in enumerate(all_refs):
                    misassembly_by_ref_file.write(str(ref_num + 1) + ' - ' + ref + '\n')
            logger.info(' Information about interspecies translocations by references for %s is saved to %s' %
                        (assembly_name, misassembly_by_ref_fpath))
    misassemblies = []
    if qconfig.draw_plots:
        from quast_libs import plotter
        # Drop assemblies with no values and transpose rows into per-reference columns.
        aligned_contigs_labels = []
        for row in misassemblies_by_refs_rows[1:]:
            if row['values']:
                aligned_contigs_labels.append(row['metricName'])
            else:
                misassemblies_by_refs_rows.remove(row)
        for i in range(len(all_refs)):
            cur_results = []
            for row in misassemblies_by_refs_rows[1:]:
                if row['values']:
                    cur_results.append(row['values'][i])
            misassemblies.append(cur_results)
        is_translocations_plot_fpath = os.path.join(output_dir, 'intergenomic_misassemblies')
        plotter.draw_meta_summary_plot('', output_dir, aligned_contigs_labels, all_refs, misassemblies,
                                       is_translocations_plot_fpath,
                                       title='Intergenomic misassemblies (found and supposed)', reverse=False,
                                       yaxis_title=None, print_all_refs=True, logger=logger)
def save_result(result, report, fname, ref_fpath):
    """Copy contig-alignment statistics from the *result* dict into *report*.

    Fills the misassembly, mismatch/indel, and unaligned-contig fields of the
    main report; for combined-reference runs also fills one subreport per
    reference. Returns the updated *report*.
    """
    # Unpack the result dict up front (fails fast if a key is absent).
    mis_events = result['region_misassemblies']
    mis_by_ref = result['misassemblies_by_ref']
    struct_variations = result['region_struct_variations']  # fetched for schema parity; not used below
    matched_sv = result['misassemblies_matched_sv']
    mis_contigs = result['misassembled_contigs']
    mis_bases = result['misassembled_bases']
    internal_overlap = result['misassembly_internal_overlap']
    unaligned = result['unaligned']
    partially_unaligned = result['partially_unaligned']
    partially_unaligned_bases = result['partially_unaligned_bases']
    fully_unaligned_bases = result['fully_unaligned_bases']
    ambiguous_contigs = result['ambiguous_contigs']
    ambiguous_extra_bases = result['ambiguous_contigs_extra_bases']
    SNPs = result['SNPs']
    indels_list = result['indels_list']
    total_aligned_bases = result['total_aligned_bases']
    half_unaligned_with_misassembly = result['half_unaligned_with_misassembly']

    fields = reporting.Fields
    count = mis_events.count
    # "Extensive" misassemblies = relocations + inversions + (interspecies) translocations.
    num_extensive = (count(Misassembly.RELOCATION) + count(Misassembly.INVERSION) +
                     count(Misassembly.TRANSLOCATION) + count(Misassembly.INTERSPECTRANSLOCATION))

    report.add_field(fields.MISLOCAL, count(Misassembly.LOCAL))
    report.add_field(fields.MISASSEMBL, num_extensive)
    report.add_field(fields.MISCONTIGS, len(mis_contigs))
    report.add_field(fields.MISCONTIGSBASES, mis_bases)
    report.add_field(fields.MISINTERNALOVERLAP, internal_overlap)
    if qconfig.bed:
        report.add_field(fields.STRUCT_VARIATIONS, matched_sv)
    report.add_field(fields.UNALIGNED, '%d + %d part' % (unaligned, partially_unaligned))
    report.add_field(fields.UNALIGNEDBASES, fully_unaligned_bases + partially_unaligned_bases)
    report.add_field(fields.AMBIGUOUS, ambiguous_contigs)
    report.add_field(fields.AMBIGUOUSEXTRABASES, ambiguous_extra_bases)
    report.add_field(fields.MISMATCHES, SNPs)

    # Indels, split into short vs long by the configured threshold.
    if indels_list is not None:
        short_indels = [i for i in indels_list if i <= qconfig.SHORT_INDEL_THRESHOLD]
        report.add_field(fields.INDELS, len(indels_list))
        report.add_field(fields.INDELSBASES, sum(indels_list))
        report.add_field(fields.MIS_SHORT_INDELS, len(short_indels))
        report.add_field(fields.MIS_LONG_INDELS, len(indels_list) - len(short_indels))
    # Per-100-kbp error rates, only meaningful when something aligned at all.
    if total_aligned_bases:
        report.add_field(fields.SUBSERROR, "%.2f" % (float(SNPs) * 100000.0 / float(total_aligned_bases)))
        report.add_field(fields.INDELSERROR, "%.2f" % (float(report.get_field(fields.INDELS)) * 100000.0
                                                       / float(total_aligned_bases)))

    # Detailed misassemblies report.
    report.add_field(fields.MIS_ALL_EXTENSIVE, num_extensive)
    report.add_field(fields.MIS_RELOCATION, count(Misassembly.RELOCATION))
    report.add_field(fields.MIS_TRANSLOCATION, count(Misassembly.TRANSLOCATION))
    report.add_field(fields.MIS_INVERTION, count(Misassembly.INVERSION))
    report.add_field(fields.MIS_EXTENSIVE_CONTIGS, len(mis_contigs))
    report.add_field(fields.MIS_EXTENSIVE_BASES, mis_bases)
    report.add_field(fields.MIS_LOCAL, count(Misassembly.LOCAL))

    if qconfig.is_combined_ref:
        report.add_field(fields.MIS_ISTRANSLOCATIONS, count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(fields.CONTIGS_WITH_ISTRANSLOCATIONS, count(Misassembly.POTENTIALLY_MIS_CONTIGS))
        report.add_field(fields.POSSIBLE_MISASSEMBLIES, count(Misassembly.POSSIBLE_MISASSEMBLIES))
        # One subreport per reference in the combined set.
        for ref_name in sorted(set(ref_labels_by_chromosomes.values())):
            subreport = reporting.get(fname, ref_name=ref_name)
            ref_count = mis_by_ref[ref_name].count
            subreport.add_field(fields.MIS_ALL_EXTENSIVE,
                                ref_count(Misassembly.RELOCATION) + ref_count(Misassembly.INVERSION) +
                                ref_count(Misassembly.TRANSLOCATION) + ref_count(Misassembly.INTERSPECTRANSLOCATION))
            for field, kind in ((fields.MIS_RELOCATION, Misassembly.RELOCATION),
                                (fields.MIS_TRANSLOCATION, Misassembly.TRANSLOCATION),
                                (fields.MIS_INVERTION, Misassembly.INVERSION),
                                (fields.MIS_ISTRANSLOCATIONS, Misassembly.INTERSPECTRANSLOCATION),
                                (fields.MIS_LOCAL, Misassembly.LOCAL),
                                (fields.POSSIBLE_MISASSEMBLIES, Misassembly.POSSIBLE_MISASSEMBLIES),
                                (fields.CONTIGS_WITH_ISTRANSLOCATIONS, Misassembly.POTENTIALLY_MIS_CONTIGS)):
                subreport.add_field(field, ref_count(kind))
            if fname not in qconfig.dict_of_broken_scaffolds:
                subreport.add_field(fields.MIS_SCAFFOLDS_GAP, ref_count(Misassembly.SCAFFOLD_GAP))
            if qconfig.check_for_fragmented_ref:
                subreport.add_field(fields.MIS_FRAGMENTED, ref_count(Misassembly.FRAGMENTED))
    elif intergenomic_misassemblies_by_asm:
        # Per-reference run after a combined-reference analysis: reuse cached stats.
        label = qutils.label_from_fpath(fname)
        ref_name = qutils.name_from_fpath(ref_fpath)
        cached = intergenomic_misassemblies_by_asm[label][ref_name]
        report.add_field(fields.MIS_ISTRANSLOCATIONS, cached.count(Misassembly.INTERSPECTRANSLOCATION))
        report.add_field(fields.POSSIBLE_MISASSEMBLIES, cached.count(Misassembly.POSSIBLE_MISASSEMBLIES))
        report.add_field(fields.CONTIGS_WITH_ISTRANSLOCATIONS, cached.count(Misassembly.POTENTIALLY_MIS_CONTIGS))

    if fname not in qconfig.dict_of_broken_scaffolds:
        report.add_field(fields.MIS_SCAFFOLDS_GAP, count(Misassembly.SCAFFOLD_GAP))
    if qconfig.check_for_fragmented_ref:
        report.add_field(fields.MIS_FRAGMENTED, count(Misassembly.FRAGMENTED))

    # Unaligned-contigs report.
    report.add_field(fields.UNALIGNED_FULL_CNTGS, unaligned)
    report.add_field(fields.UNALIGNED_FULL_LENGTH, fully_unaligned_bases)
    report.add_field(fields.UNALIGNED_PART_CNTGS, partially_unaligned)
    report.add_field(fields.UNALIGNED_PART_LENGTH, partially_unaligned_bases)
    report.add_field(fields.UNALIGNED_MISASSEMBLED_CTGS, half_unaligned_with_misassembly)
    return report
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Generate the theoretical Upper Bound Assembly for a reference.

    Pipeline (as implemented below): verify/install third-party tools
    (read-processing tools and the Red repeat finder), reuse a previously
    prepared result if one exists, align reads to the reference to obtain
    coverage, extract uniquely covered regions (optionally scaffolded by
    long reads or mate pairs), and write the resulting contigs to FASTA.

    Parameters:
        ref_fpath: reference FASTA used for alignment and sequence extraction.
        original_ref_fpath: user-supplied reference path; a reusable copy of
            the result is searched for (and advertised) next to it.
        output_dirpath: directory for the result FASTA, log and tmp files.

    Returns:
        Path to the generated (or previously prepared) FASTA file, or None
        when generation is impossible (unsupported platform, missing tools)
        or the region computation failed.
    """
    logger.print_timestamp()
    logger.main_info("Generating Upper Bound Assembly...")

    # Bail out early if the required read-processing tools cannot be compiled.
    if not reads_analyzer.compile_reads_analyzer_tools(logger):
        logger.warning(' Sorry, can\'t create Upper Bound Assembly '
                       '(failed to compile necessary third-party read processing tools [bwa, bedtools, minimap2]), skipping...')
        return None

    # 32-bit Linux is not supported by the third-party binaries.
    if qconfig.platform_name == 'linux_32':
        logger.warning(' Sorry, can\'t create Upper Bound Assembly on this platform '
                       '(only linux64 and macOS are supported), skipping...')
        return None

    # Fetch the Red repeat-finding tool; abort if it cannot be obtained.
    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red', red_dirpath, 'red', platform_specific=True, is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(' Sorry, can\'t create Upper Bound Assembly '
                       '(failed to install/download third-party repeat finding tool [Red]), skipping...')
        return None

    # Insert size: use the configured value or fall back to the default.
    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    # The result file name encodes the insert size and, if applicable, the
    # read type used for joining regions (long reads or mate pairs).
    ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    # A previously prepared assembly may live next to the original reference;
    # reuse it (or an existing result_fpath) instead of recomputing.
    original_ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    if long_reads:
        prepared_optimal_assembly_basename = add_suffix(prepared_optimal_assembly_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        prepared_optimal_assembly_basename = add_suffix(prepared_optimal_assembly_basename, mp_polished_suffix)
    ref_prepared_optimal_assembly = os.path.join(os.path.dirname(original_ref_fpath), prepared_optimal_assembly_basename)
    already_done_fpath = check_prepared_optimal_assembly(insert_size, result_fpath, ref_prepared_optimal_assembly)
    if already_done_fpath:
        return already_done_fpath

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath), qconfig.reads_stats_dirname)
    # Align all provided reads against the reference to compute coverage
    # (and the uncovered-regions file) when reads or alignments were given.
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads='all', calculate_coverage=True)

    # align_reference() appears to refine qconfig.optimal_assembly_insert_size;
    # if the value changed, rename the target paths accordingly and re-check
    # for an already-prepared result under the new names.
    if qconfig.optimal_assembly_insert_size != 'auto' and qconfig.optimal_assembly_insert_size != insert_size:
        calculated_insert_size = qconfig.optimal_assembly_insert_size
        result_fpath = result_fpath.replace('is' + str(insert_size), 'is' + str(calculated_insert_size))
        prepared_optimal_assembly_basename = prepared_optimal_assembly_basename.replace(
            'is' + str(insert_size), 'is' + str(calculated_insert_size))
        insert_size = calculated_insert_size
        ref_prepared_optimal_assembly = os.path.join(os.path.dirname(original_ref_fpath), prepared_optimal_assembly_basename)
        already_done_fpath = check_prepared_optimal_assembly(insert_size, result_fpath, ref_prepared_optimal_assembly)
        if already_done_fpath:
            return already_done_fpath

    log_fpath = os.path.join(output_dirpath, 'upper_bound_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    # Always start from a clean temporary directory.
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    # Regions of the reference uniquely covered by reads; repeats (from Red)
    # and uncovered stretches are reported separately.
    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath, use_long_reads=long_reads)
    if unique_covered_regions is None:
        logger.error(' Failed to create Upper Bound Assembly, see log for details: ' + log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        # Long reads / mate pairs can join (scaffold) adjacent unique regions.
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath, bam_fpath, tmp_dir, log_fpath, join_reads)
        # Mate pairs additionally need the uncovered regions and read length
        # to validate joins; long reads do not (hence the empty defaults).
        uncovered_regions = parse_bed(uncovered_fpath) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom], joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom], region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq), ref_coords_to_output,
                                          repeats_regions[chrom], uncovered_regions[chrom])
    else:
        # No joining reads: emit each sufficiently long unique region
        # as its own contig, named <chrom>_<index>.
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append((chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info(' ' + 'Theoretical Upper Bound Assembly is saved to ' + result_fpath)
    logger.notice('(on reusing *this* Upper Bound Assembly in the *future* evaluations on *the same* dataset)\n'
                  '\tThe next time, you can simply provide this file as an additional assembly (you could also rename it to UpperBound.fasta for the clarity). '
                  'In this case, you do not need to specify --upper-bound-assembly and provide files with reads (--pe1/pe2, etc).\n'
                  '\t\tOR\n'
                  '\tYou can copy ' + result_fpath + ' to ' + ref_prepared_optimal_assembly + '. '
                  'The next time you evaluate assemblies with --upper-bound-assembly option and against the same reference (' + original_ref_fpath + ') and '
                  'the same reads (or if you specify the insert size of the paired-end reads explicitly with --est-insert-size ' + str(insert_size) + '), '
                  'QUAST will reuse this Upper Bound Assembly.\n')

    # Keep tmp files only in debug mode.
    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
def save_combined_ref_stats(results, contigs_fpaths, ref_labels_by_chromosomes, output_dir, logger):
    """Aggregate per-reference misassembly statistics across assemblies.

    For each assembly this function: fills the module-level
    intergenomic_misassemblies_by_asm mapping (label -> ref -> misassembly
    list), writes an 'interspecies_translocations_by_refs_<name>.info' table
    into output_dir, and — when qconfig.draw_plots is set — draws a summary
    plot of intergenomic misassemblies over all references.

    Parameters:
        results: per-assembly dicts providing 'istranslocations_by_refs' and
            'misassemblies_by_ref' entries; an entry may be None.
        contigs_fpaths: assembly file paths, parallel to `results`.
        ref_labels_by_chromosomes: chromosome -> reference-label mapping used
            to derive the set of reference names.
        output_dir: directory for the .info files and the plot.
        logger: logger used for progress messages.
    """
    istranslocations_by_asm = [result['istranslocations_by_refs'] if result else None for result in results]
    misassemblies_by_asm = [result['misassemblies_by_ref'] if result else None for result in results]
    # Collect unique reference names, preserving first-seen order; sort them
    # unless the user asked to keep the input order.
    all_refs = []
    for ref in ref_labels_by_chromosomes.values():
        if ref not in all_refs:
            all_refs.append(ref)
    if not qconfig.use_input_ref_order:
        all_refs.sort()
    # First row of the summary table lists the reference names; subsequent
    # rows (one per assembly) hold per-reference misassembly counts.
    misassemblies_by_refs_rows = []
    row = {'metricName': 'References', 'values': all_refs}
    misassemblies_by_refs_rows.append(row)
    if not istranslocations_by_asm:
        return
    for i, fpath in enumerate(contigs_fpaths):
        label = qutils.label_from_fpath(fpath)
        row = {'metricName': label, 'values': []}
        misassemblies_by_refs_rows.append(row)
        istranslocations_by_ref = istranslocations_by_asm[i]
        # NOTE: intergenomic_misassemblies_by_asm is not defined in this
        # function — presumably a module-level cache read elsewhere.
        intergenomic_misassemblies_by_asm[label] = defaultdict(list)
        for ref in all_refs:
            intergenomic_misassemblies_by_asm[label][ref] = misassemblies_by_asm[i][ref] if misassemblies_by_asm[i] else []
        if istranslocations_by_ref:
            assembly_name = qutils.name_from_fpath(fpath)
            # Build a reference-by-reference matrix of interspecies
            # translocation counts (None on the diagonal / missing pairs).
            all_rows = []
            row = {'metricName': 'References', 'values': [ref_num + 1 for ref_num in range(len(all_refs))]}
            all_rows.append(row)
            for ref in all_refs:
                row = {'metricName': ref, 'values': []}
                for second_ref in all_refs:
                    if ref == second_ref or second_ref not in istranslocations_by_ref:
                        row['values'].append(None)
                    else:
                        row['values'].append(istranslocations_by_ref[ref][second_ref])
                possible_misassemblies = 0
                misassemblies_by_ref = misassemblies_by_asm[i]
                if misassemblies_by_ref:
                    possible_misassemblies = misassemblies_by_ref[ref].count(Misassembly.POSSIBLE_MISASSEMBLIES)
                # Sum found translocations for this reference row (skipping
                # None/zero entries) and add the "possible" ones on top.
                istranslocations = max(0, sum([r for r in row['values'] if r]))
                misassemblies_by_refs_rows[-1]['values'].append(istranslocations + possible_misassemblies)
                all_rows.append(row)
            # Write the matrix plus a numbered reference legend to a file.
            misassembly_by_ref_fpath = os.path.join(output_dir, 'interspecies_translocations_by_refs_%s.info' % assembly_name)
            with open(misassembly_by_ref_fpath, 'w') as misassembly_by_ref_file:
                misassembly_by_ref_file.write('Number of interspecies translocations by references: \n')
            print_file(all_rows, misassembly_by_ref_fpath, append_to_existing_file=True)
            with open(misassembly_by_ref_fpath, 'a') as misassembly_by_ref_file:
                misassembly_by_ref_file.write('References:\n')
                for ref_num, ref in enumerate(all_refs):
                    misassembly_by_ref_file.write(str(ref_num + 1) + ' - ' + ref + '\n')
            logger.info(' Information about interspecies translocations by references for %s is saved to %s' %
                        (assembly_name, misassembly_by_ref_fpath))
    misassemblies = []
    if qconfig.draw_plots:
        from quast_libs import plotter
        # Keep only assemblies that have values; note the [1:] slice copies
        # the list, so removing rows from the original while iterating is safe.
        aligned_contigs_labels = []
        for row in misassemblies_by_refs_rows[1:]:
            if row['values']:
                aligned_contigs_labels.append(row['metricName'])
            else:
                misassemblies_by_refs_rows.remove(row)
        # Transpose: one list of per-assembly counts for each reference.
        for i in range(len(all_refs)):
            cur_results = []
            for row in misassemblies_by_refs_rows[1:]:
                if row['values']:
                    cur_results.append(row['values'][i])
            misassemblies.append(cur_results)
        is_translocations_plot_fpath = os.path.join(output_dir, 'intergenomic_misassemblies')
        plotter.draw_meta_summary_plot('', output_dir, aligned_contigs_labels, all_refs,
                                       misassemblies, is_translocations_plot_fpath,
                                       title='Intergenomic misassemblies (found and supposed)',
                                       reverse=False, yaxis_title=None, print_all_refs=True, logger=logger)
def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath):
    """Fill read-alignment and LAP-score fields of each assembly's report.

    Reads per-assembly '<name>.stat' files (parsed by parse_reads_stats) and
    '<name>.lap.out' files from output_dir and copies their values into the
    corresponding reporting fields. When a reference is given, its own
    '<ref_name>.stat' / '<ref_name>.lap.out' values are attached to every
    assembly's report as the REF_* fields.

    Parameters:
        output_dir: directory containing the .stat and .lap.out files.
        contigs_fpaths: paths of the evaluated assemblies.
        ref_fpath: reference path, or a falsy value when no reference is used.
    """
    from quast_libs import reporting

    ref_reads_stats = None
    ref_lap_score = None
    if ref_fpath:
        ref_name = qutils.name_from_fpath(ref_fpath)
        stats_fpath = join(output_dir, ref_name + '.stat')
        if isfile(stats_fpath):
            ref_reads_stats = parse_reads_stats(stats_fpath)
            if int(ref_reads_stats['mapped']) == 0:
                logger.info(' BWA: nothing aligned for reference.')
        # LAP score for the reference: first whitespace-separated token of
        # the first line of the .lap.out file, if the file is non-empty.
        lap_out_fpath = get_safe_fpath(output_dir, ref_name + '.lap.out')
        if is_non_empty_file(lap_out_fpath):
            with open(lap_out_fpath) as f:
                l = f.readline()
                ref_lap_score = float(l.split()[0]) if l else None

    # process all contigs files
    for index, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        stats_fpath = join(output_dir, assembly_name + '.stat')
        # Reference-alignment stats are identical for every assembly report.
        if ref_reads_stats:
            report.add_field(reporting.Fields.REF_MAPPED_READS, ref_reads_stats['mapped'])
            report.add_field(reporting.Fields.REF_MAPPED_READS_PCNT, ref_reads_stats['mapped_pcnt'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS, ref_reads_stats['paired'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS_PCNT, ref_reads_stats['paired_pcnt'])
            report.add_field(reporting.Fields.REF_SINGLETONS, ref_reads_stats['singletons'])
            report.add_field(reporting.Fields.REF_SINGLETONS_PCNT, ref_reads_stats['singletons_pcnt'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS, ref_reads_stats['misjoint'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS_PCNT, ref_reads_stats['misjoint_pcnt'])
            report.add_field(reporting.Fields.REF_DEPTH, ref_reads_stats['depth'])
            # Only report per-threshold coverage if one value was parsed for
            # every configured threshold.
            if ref_reads_stats['coverage_thresholds'] and len(ref_reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
                report.add_field(reporting.Fields.REF_COVERAGE__FOR_THRESHOLDS,
                                 [ref_reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
                report.add_field(reporting.Fields.REF_COVERAGE_1X_THRESHOLD, ref_reads_stats['coverage_thresholds'][0])
        if not isfile(stats_fpath):
            continue
        reads_stats = parse_reads_stats(stats_fpath)
        report.add_field(reporting.Fields.TOTAL_READS, reads_stats['total'])
        report.add_field(reporting.Fields.LEFT_READS, reads_stats['left'])
        report.add_field(reporting.Fields.RIGHT_READS, reads_stats['right'])
        report.add_field(reporting.Fields.MAPPED_READS, reads_stats['mapped'])
        report.add_field(reporting.Fields.MAPPED_READS_PCNT, reads_stats['mapped_pcnt'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS, reads_stats['paired'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS_PCNT, reads_stats['paired_pcnt'])
        if int(reads_stats['mapped']) == 0:
            logger.info(' ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.')
        report.add_field(reporting.Fields.SINGLETONS, reads_stats['singletons'])
        report.add_field(reporting.Fields.SINGLETONS_PCNT, reads_stats['singletons_pcnt'])
        report.add_field(reporting.Fields.MISJOINT_READS, reads_stats['misjoint'])
        report.add_field(reporting.Fields.MISJOINT_READS_PCNT, reads_stats['misjoint_pcnt'])
        report.add_field(reporting.Fields.DEPTH, reads_stats['depth'])
        if reads_stats['coverage_thresholds'] and len(reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
            report.add_field(reporting.Fields.COVERAGE__FOR_THRESHOLDS,
                             [reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
            report.add_field(reporting.Fields.COVERAGE_1X_THRESHOLD, reads_stats['coverage_thresholds'][0])
        # LAP score for this assembly, read the same way as for the reference.
        lap_out_fpath = get_safe_fpath(output_dir, assembly_name + '.lap.out')
        if is_non_empty_file(lap_out_fpath):
            with open(lap_out_fpath) as f:
                l = f.readline()
                lap_score = float(l.split()[0]) if l else None
            report.add_field(reporting.Fields.LAP_SCORE, ('%.3f' % lap_score if lap_score is not None else None))
        report.add_field(reporting.Fields.REF_LAP_SCORE, ('%.3f' % ref_lap_score if ref_lap_score is not None else None))