def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    """Predict genes in every assembly with a GeneMark tool and record the counts.

    The tool is chosen from the flags: MetaGeneMark when ``meta`` is true,
    GeneMarkS when ``prokaryote`` is true, GeneMark-ES otherwise.  One
    ``predict_genes`` job per assembly is run through joblib; each job's
    ``(unique_count, count)`` pair is stored in the per-assembly report, and a
    pair of ``None`` values is logged as a failure.

    :param fasta_fpaths: paths of the assembly FASTA files
    :param gene_lengths: passed through unchanged to ``predict_genes``
    :param out_dirpath: output directory (created if missing); a ``tmp``
        sub-directory is created inside it and removed at the end unless
        ``qconfig.debug`` is set
    :param prokaryote: select GeneMarkS rather than GeneMark-ES
    :param meta: select MetaGeneMark (checked before ``prokaryote``)
    :return: None
    """
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    # NOTE(review): the original layout was lost; the whole prediction is
    # assumed to live in the "tool directory exists" branch, as the warning
    # text ("skipping gene prediction") indicates.
    if not os.path.exists(tool_dirpath):
        logger.warning(' Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
        return

    # The installer is always pointed at the 'genemark' directory, even when
    # GeneMark-ES (which lives in 'genemark-es') was selected above.
    successful = install_genemark(os.path.join(qconfig.LIBS_LOCATION, 'genemark', qconfig.platform_name))
    if not successful:
        return

    if not os.path.isdir(out_dirpath):
        os.mkdir(out_dirpath)
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    # Clamp to at least one job: with no input files the unclamped value is 0,
    # which makes the division below raise ZeroDivisionError and is rejected
    # by joblib as well.
    n_jobs = max(1, min(len(fasta_fpaths), qconfig.max_threads))
    num_threads = max(1, qconfig.max_threads // n_jobs)

    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_genes)(index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath,
                               tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
        for index, fasta_fpath in enumerate(fasta_fpaths))

    # saving results
    for i, fasta_path in enumerate(fasta_fpaths):
        report = reporting.get(fasta_path)
        unique_count, count = results[i]
        if unique_count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
        if count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, count)
        if unique_count is None and count is None:
            # Extra hint only for GeneMark-ES runs on inputs under 2,000,000 bytes.
            if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000:
                hint = ('File may be too small for GeneMark-ES. '
                        'Try to use GeneMarkS instead (remove --eukaryote option).')
            else:
                hint = ''
            logger.error(' ' + qutils.index_to_str(i) + 'Failed predicting genes in ' +
                         qutils.label_from_fpath(fasta_path) + '. ' + hint)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Predict genes in every assembly with GlimmerHMM and record the counts.

    If the ``glimmerhmm`` executable is missing it is first built with
    ``make`` in the bundled ``src`` directory (output goes to ``make.log`` /
    ``make.err`` there).  One ``predict_genes`` job per assembly is then run
    through joblib and each ``(unique, cnt)`` pair is stored in the
    per-assembly report; a pair of ``None`` values is logged as a failure.

    :param contigs_fpaths: paths of the assembly FASTA files
    :param gene_lengths: passed through unchanged to ``predict_genes``
    :param out_dirpath: output directory (created if missing); its ``tmp``
        sub-directory is removed at the end unless ``qconfig.debug`` is set
    :return: None
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tool_src_dirpath = os.path.join(tool_dirpath, 'src')
    tool_exec_fpath = os.path.join(tool_dirpath, 'glimmerhmm')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')

    if not os.path.isfile(tool_exec_fpath):
        # making
        logger.main_info("Compiling GlimmerHMM...")
        # Open the build logs in context managers so the handles are closed
        # as soon as make finishes (they used to be left open).
        with open(os.path.join(tool_src_dirpath, 'make.log'), 'w') as make_out:
            with open(os.path.join(tool_src_dirpath, 'make.err'), 'w') as make_err:
                return_code = qutils.call_subprocess(
                    ['make', '-C', tool_src_dirpath],
                    stdout=make_out,
                    stderr=make_err,
                    indent=' ')

        if return_code != 0 or not os.path.isfile(tool_exec_fpath):
            logger.error(
                "Failed to compile GlimmerHMM (" + tool_src_dirpath +
                ")!\nTry to compile it manually or do not use --gene-finding "
                "option with --eukaryote.\nUse --debug option to see the command lines.")
            return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    # joblib rejects n_jobs == 0, which is what an empty input list produced.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_genes)(index, contigs_fpath, gene_lengths, out_dirpath,
                               tool_dirpath, tmp_dirpath)
        for index, contigs_fpath in enumerate(contigs_fpaths))

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        unique, cnt = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if cnt is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, cnt)
        if unique is None and cnt is None:
            # Format the label into the message first and append the hint
            # afterwards: '%' binds tighter than '+', so the previous
            # expression applied the label to the hint alone and raised
            # TypeError instead of logging.
            if qconfig.debug:
                debug_hint = ''
            else:
                debug_hint = 'Run with the --debug option to see the command line.'
            logger.error(
                'Glimmer failed running Glimmer for %s. ' % qutils.label_from_fpath(contigs_fpath) +
                debug_hint)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Run GlimmerHMM gene prediction on each assembly and fill in the reports.

    Builds the bundled GlimmerHMM with ``make`` when the ``glimmerhmm``
    executable is absent, runs ``predict_genes`` for every assembly in
    parallel via joblib, and writes the returned ``(unique, cnt)`` values into
    the per-assembly report.  Assemblies for which both values are ``None``
    are reported through ``logger.error``.

    :param contigs_fpaths: paths of the assembly FASTA files
    :param gene_lengths: forwarded as-is to ``predict_genes``
    :param out_dirpath: output directory, created on demand together with a
        ``tmp`` sub-directory that is deleted afterwards unless
        ``qconfig.debug`` is set
    :return: None
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tool_src_dirpath = os.path.join(tool_dirpath, 'src')
    tool_exec_fpath = os.path.join(tool_dirpath, 'glimmerhmm')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')

    if not os.path.isfile(tool_exec_fpath):
        # making
        logger.main_info("Compiling GlimmerHMM...")
        make_log_fpath = os.path.join(tool_src_dirpath, 'make.log')
        make_err_fpath = os.path.join(tool_src_dirpath, 'make.err')
        # The log handles are closed deterministically once the build ends;
        # previously they were opened inline and never closed.
        with open(make_log_fpath, 'w') as make_log:
            with open(make_err_fpath, 'w') as make_err:
                return_code = qutils.call_subprocess(
                    ['make', '-C', tool_src_dirpath],
                    stdout=make_log,
                    stderr=make_err,
                    indent=' ')

        if return_code != 0 or not os.path.isfile(tool_exec_fpath):
            logger.error("Failed to compile GlimmerHMM (" + tool_src_dirpath +
                         ")!\nTry to compile it manually or do not use --gene-finding "
                         "option with --eukaryote.\nUse --debug option to see the command lines.")
            return

    for dirpath in (out_dirpath, tmp_dirpath):
        if not os.path.isdir(dirpath):
            os.makedirs(dirpath)

    # At least one worker: an empty input list would otherwise give
    # n_jobs == 0, which joblib refuses.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_genes)(index, contigs_fpath, gene_lengths, out_dirpath,
                               tool_dirpath, tmp_dirpath)
        for index, contigs_fpath in enumerate(contigs_fpaths))

    # saving results
    for contigs_fpath, (unique, cnt) in zip(contigs_fpaths, results):
        report = reporting.get(contigs_fpath)
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if cnt is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, cnt)
        if unique is None and cnt is None:
            # The old expression was parsed as  msg + (hint % label)  because
            # '%' has higher precedence than '+'; the hint has no placeholder,
            # so it raised TypeError.  Interpolate the label explicitly.
            message = 'Glimmer failed running Glimmer for %s. ' % qutils.label_from_fpath(contigs_fpath)
            if not qconfig.debug:
                message += 'Run with the --debug option to see the command line.'
            logger.error(message)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
def do(fasta_fpaths, gene_lengths, out_dirpath, meta):
    """Predict genes in every assembly with GeneMark or MetaGeneMark.

    MetaGeneMark is used when ``meta`` is true, GeneMark otherwise.  One
    ``predict_genes`` job per assembly is run through joblib and the returned
    ``(unique_count, count)`` values are written to the per-assembly report
    (``None`` values are skipped).

    :param fasta_fpaths: paths of the assembly FASTA files
    :param gene_lengths: passed through unchanged to ``predict_genes``
    :param out_dirpath: output directory (created if missing); a ``tmp``
        sub-directory is created inside it and removed at the end unless
        ``qconfig.debug`` is set
    :param meta: select MetaGeneMark instead of GeneMark
    :return: None
    """
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'metagenemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    else:
        tool_name = 'GeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC

    logger.info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    # NOTE(review): the original layout was lost; everything below is assumed
    # to belong to the "tool directory exists" branch, matching the warning
    # text ("skipping gene prediction").
    if not os.path.exists(tool_dirpath):
        logger.warning(' Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
        return

    successful = install_genemark(tool_dirpath)
    if not successful:
        return

    if not os.path.isdir(out_dirpath):
        os.mkdir(out_dirpath)
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    # joblib rejects n_jobs == 0, which is what an empty input list produced.
    n_jobs = max(1, min(len(fasta_fpaths), qconfig.max_threads))
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_genes)(index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath,
                               tmp_dirpath, gmhmm_p_function)
        for index, fasta_fpath in enumerate(fasta_fpaths))

    # saving results
    for i, fasta_path in enumerate(fasta_fpaths):
        report = reporting.get(fasta_path)
        unique_count, count = results[i]
        if unique_count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
        if count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, count)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.info('Done.')
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Compute genome-fraction, duplication and gene/operon statistics.

    Reads the reference chromosomes, loads the optional gene and operon
    annotations, runs ``process_single_file`` for every assembly in parallel
    and writes a summary table to ``<genome_stats_dirpath>/genome_info.txt``.
    Per-assembly values are also stored in the report objects and, depending
    on ``qconfig``, exported to JSON / HTML and drawn as plots.

    Assemblies whose worker returned a false value are counted in
    ``logger._num_nf_errors`` and left out of the table, exports and plots.

    :param ref_fpath: reference genome FASTA
    :param aligned_contigs_fpaths: paths of the aligned assemblies
    :param output_dirpath: directory passed to the HTML saver
    :param json_output_dirpath: directory passed to the JSON saver; nothing is
        exported to JSON when it is empty/None
    :param genes_fpaths: gene annotation files (may be empty)
    :param operons_fpaths: operon annotation files (may be empty)
    :param detailed_contigs_reports_dirpath: directory containing
        ``nucmer_output``
    :param genome_stats_dirpath: directory for ``genome_info.txt`` and plots
        (created if missing)
    :return: None
    """
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')
    from libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    # Chromosome name (first word of the FASTA header) -> length.
    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # for cumulative plots:
    files_genes_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # RESULTS file -- the context manager closes it on every exit path,
    # including exceptions raised while loading features or formatting rows.
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    with open(result_fpath, 'w') as res_file:
        genes_container = FeatureContainer(genes_fpaths, 'gene')
        operons_container = FeatureContainer(operons_fpaths, 'operon')
        for container in [genes_container, operons_container]:
            if not container.fpaths:
                logger.notice('No file with ' + container.kind + 's provided. '
                              'Use the -' + container.kind[0].capitalize() + ' option '
                              'if you want to specify it.', indent=' ')
                continue

            for fpath in container.fpaths:
                container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

            if len(container.region_list) == 0:
                logger.warning('No ' + container.kind + 's were loaded.', indent=' ')
                res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
            else:
                logger.info(' Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
                res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
                container.chr_names_dict = chromosomes_names_dict(
                    container.kind, container.region_list, reference_chromosomes.keys())

        for contigs_fpath in aligned_contigs_fpaths:
            report = reporting.get(contigs_fpath)
            if genes_container.fpaths:
                report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
            if operons_container.fpaths:
                report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

        # process all contig files
        num_nf_errors = logger._num_nf_errors
        # joblib rejects n_jobs == 0, which is what an empty input list produced.
        n_jobs = max(1, min(len(aligned_contigs_fpaths), qconfig.max_threads))
        from joblib import Parallel, delayed
        process_results = Parallel(n_jobs=n_jobs)(
            delayed(process_single_file)(
                contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                reference_chromosomes, genes_container, operons_container)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
        num_nf_errors += len([res for res in process_results if res is None])
        logger._num_nf_errors = num_nf_errors

        # Keep every surviving result paired with the assembly it came from.
        # Filtering the results alone and zipping them against the full path
        # list afterwards shifted results onto the wrong assemblies as soon
        # as one worker failed.
        successful = [(contigs_fpath, res)
                      for contigs_fpath, res in zip(aligned_contigs_fpaths, process_results)
                      if res]
        if not successful:
            logger.main_info('Genome analyzer failed for all the assemblies.')
            return

        processed_fpaths = [contigs_fpath for contigs_fpath, _ in successful]
        ref_lengths = [res[0] for _, res in successful]
        results_genes_operons_tuples = [res[1] for _, res in successful]

        # presumably a module-level dict shared with other code in this file;
        # it is not created in this function -- TODO confirm
        for ref in reference_chromosomes:
            ref_lengths_by_contigs[ref] = [lengths[ref] for lengths in ref_lengths]

        res_file.write('reference chromosomes:\n')
        for chr_name, chr_len in reference_chromosomes.items():
            aligned_len = max(ref_lengths_by_contigs[chr_name])
            res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) +
                           ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
        res_file.write('\n')
        res_file.write('total genome size: ' + str(genome_size) + '\n\n')
        res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
        res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

        # header
        res_file.write('\n\n')
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
        res_file.write('================================================================================================================\n')

        for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(
                processed_fpaths, results_genes_operons_tuples):
            assembly_name = qutils.name_from_fpath(contigs_fpath)

            files_genes_in_contigs[contigs_fpath] = genes_in_contigs
            files_operons_in_contigs[contigs_fpath] = operons_in_contigs
            full_found_genes.append(sum(genes_in_contigs))
            full_found_operons.append(sum(operons_in_contigs))

            covered_bp = results["covered_bp"]
            gaps_count = results["gaps_count"]
            genes_full = results[reporting.Fields.GENES + "_full"]
            genes_part = results[reporting.Fields.GENES + "_partial"]
            operons_full = results[reporting.Fields.OPERONS + "_full"]
            operons_part = results[reporting.Fields.OPERONS + "_partial"]

            report = reporting.get(contigs_fpath)
            genome_fraction = float(covered_bp) * 100 / float(genome_size)
            # Aligned bases (total + overlaps + ambiguous extras - unaligned)
            # divided by the number of covered reference bases.
            duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                                 report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                                 report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                                 report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                                ((genome_fraction / 100.0) * float(genome_size))

            res_file.write('%-25s| %-10s| %-12s| %-10s|'
                           % (assembly_name[:24], '%3.5f%%' % genome_fraction,
                              '%1.5f' % duplication_ratio, gaps_count))

            report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
            report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
            genome_mapped.append(genome_fraction)

            for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                        (reporting.Fields.OPERONS, operons_full, operons_part)]:
                if full is None and part is None:
                    res_file.write(' %-10s| %-10s|' % ('-', '-'))
                else:
                    res_file.write(' %-10s| %-10s|' % (full, part))
                    report.add_field(field, '%s + %s part' % (full, part))
            res_file.write('\n')

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # The savers and plots below receive value collections that only contain
    # the successfully processed assemblies, so they get the matching paths.

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, processed_fpaths, 'genes',
                                                files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, processed_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, processed_fpaths, 'genes',
                                                files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, processed_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), processed_fpaths,
                                       files_genes_in_contigs,
                                       genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(processed_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_genes_histogram', '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), processed_fpaths,
                                       files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(processed_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram', '# complete operons')
        plotter.histogram(processed_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    """Run a GeneMark-family gene finder on each assembly and store the counts.

    Tool selection: ``meta`` -> MetaGeneMark, else ``prokaryote`` -> GeneMarkS,
    else GeneMark-ES.  ``predict_genes`` is executed once per assembly through
    joblib; its ``(unique_count, count)`` result is written to the assembly's
    report, and an assembly for which both values are ``None`` is logged as a
    failure.

    :param fasta_fpaths: paths of the assembly FASTA files
    :param gene_lengths: forwarded as-is to ``predict_genes``
    :param out_dirpath: output directory, created on demand together with a
        ``tmp`` sub-directory that is deleted afterwards unless
        ``qconfig.debug`` is set
    :param prokaryote: choose GeneMarkS over GeneMark-ES
    :param meta: choose MetaGeneMark (takes priority over ``prokaryote``)
    :return: None
    """
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning(
            "GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname,
                                qconfig.platform_name)
    # NOTE(review): the original layout was lost; the prediction is assumed to
    # run only when the tool directory exists, as the warning text
    # ("skipping gene prediction") indicates.
    if not os.path.exists(tool_dirpath):
        logger.warning(
            ' Sorry, can\'t use %s on this platform, skipping gene prediction.'
            % tool_name)
        return

    # The installer always targets the 'genemark' directory, regardless of
    # which tool directory was selected above.
    successful = install_genemark(
        os.path.join(qconfig.LIBS_LOCATION, 'genemark', qconfig.platform_name))
    if not successful:
        return

    if not os.path.isdir(out_dirpath):
        os.mkdir(out_dirpath)
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    # Never let the job count drop to zero: an empty input list used to make
    # the integer division below raise ZeroDivisionError (and joblib does not
    # accept n_jobs == 0 either).
    n_jobs = max(1, min(len(fasta_fpaths), qconfig.max_threads))
    num_threads = max(1, qconfig.max_threads // n_jobs)

    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_genes)(index, fasta_fpath, gene_lengths, out_dirpath,
                               tool_dirpath, tmp_dirpath, gmhmm_p_function,
                               prokaryote, num_threads)
        for index, fasta_fpath in enumerate(fasta_fpaths))

    # saving results
    for i, fasta_path in enumerate(fasta_fpaths):
        report = reporting.get(fasta_path)
        unique_count, count = results[i]
        if unique_count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE,
                             unique_count)
        if count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, count)
        if unique_count is None and count is None:
            # The size hint is shown only for GeneMark-ES on inputs smaller
            # than 2,000,000 bytes.
            too_small = (tool_name == 'GeneMark-ES' and
                         os.path.getsize(fasta_path) < 2000000)
            logger.error(
                ' ' + qutils.index_to_str(i) + 'Failed predicting genes in ' +
                qutils.label_from_fpath(fasta_path) + '. ' +
                ('File may be too small for GeneMark-ES. '
                 'Try to use GeneMarkS instead (remove --eukaryote option).'
                 if too_small else ''))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
def do(ref_fpath, contigs_fpaths, output_dirpath):
    """Run GAGE on every assembly and copy its metrics into the reports.

    ``run_gage`` is executed once per assembly through joblib.  If no job
    returned 0 a warning is logged and the function returns.  Otherwise each
    assembly's ``gage_<name>.stdout`` log is scanned for the metric labels, in
    order, and every value found is stored in the per-assembly report; the
    GAGE report is then saved with ``reporting.save_gage``.

    :param ref_fpath: reference genome FASTA
    :param contigs_fpaths: paths of the assembly FASTA files
    :param output_dirpath: output directory; results go to its ``gage``
        sub-directory, whose ``tmp`` directory is removed at the end unless
        ``qconfig.debug`` is set
    :return: None
    """
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    gage_tool_path = os.path.join(qconfig.LIBS_LOCATION, 'gage', 'getCorrectnessStats.sh')

    logger.print_timestamp()
    logger.info('Running GAGE...')

    # Labels are matched in this order while scanning the log; labels such as
    # 'Total units', 'Min', 'Max' and 'N50' appear twice (plain and corrected
    # statistics), so the position in the list matters.
    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy',
               'SNPs', 'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    # Report field for the label at the same index in ``metrics``.
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
                            reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
                            reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES,
                            reporting.Fields.GAGE_MISSINGASMBLYBASES,
                            reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
                            reporting.Fields.GAGE_DUPREFBASES,
                            reporting.Fields.GAGE_COMPRESSEDREFBASES,
                            reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY,
                            reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS,
                            reporting.Fields.GAGE_LONGINDELS, reporting.Fields.GAGE_INVERSIONS,
                            reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION,
                            reporting.Fields.GAGE_NUMCORCONTIGS,
                            reporting.Fields.GAGE_CORASMBLYSIZE,
                            reporting.Fields.GAGE_MINCORCONTIG, reporting.Fields.GAGE_MAXCORCOTING,
                            reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    # joblib rejects n_jobs == 0, which is what an empty input list produced.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    from joblib import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(
        delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path,
                          ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    # Give up only when not a single assembly was processed successfully.
    if 0 not in return_codes:
        logger.warning('Error occurred while GAGE was processing assemblies.'
                       ' See GAGE error logs for details: %s' %
                       os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for i, contigs_fpath in enumerate(contigs_fpaths):
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stdout')
        # The context manager closes the log even if a malformed line makes
        # the parsing below raise.
        with open(log_out_fpath, 'r') as logfile_out:
            cur_metric_id = 0
            for line in logfile_out:
                metric = metrics[cur_metric_id]
                if metric not in line:
                    continue
                if metric.startswith('N50'):
                    # Split on the full 'N50:' label rather than the first
                    # colon on the line.
                    value = line.split(metric + ':')[1].strip()
                else:
                    value = line.split(':')[1].strip()
                report.add_field(metrics_in_reporting[cur_metric_id], value)
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.info('Done.')
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Compute genome-fraction, duplication and gene/operon statistics.

    Reads the reference chromosomes, loads the optional gene and operon
    annotations, runs ``process_single_file`` for every assembly in parallel
    and writes a summary table to ``<genome_stats_dirpath>/genome_info.txt``.
    Per-assembly values are also stored in the report objects and, depending
    on ``qconfig``, exported to JSON / HTML and drawn as plots.

    :param ref_fpath: reference genome FASTA
    :param aligned_contigs_fpaths: paths of the aligned assemblies
    :param output_dirpath: directory passed to the HTML saver
    :param json_output_dirpath: directory passed to the JSON saver; nothing is
        exported to JSON when it is empty/None
    :param genes_fpaths: gene annotation files (may be empty)
    :param operons_fpaths: operon annotation files (may be empty)
    :param detailed_contigs_reports_dirpath: directory containing
        ``nucmer_output``
    :param genome_stats_dirpath: directory for ``genome_info.txt`` and plots
        (created if missing)
    :return: None
    """
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')

    logger.print_timestamp()
    logger.info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    # Chromosome name (first word of the FASTA header) -> length.
    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # for cumulative plots:
    files_genes_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # RESULTS file -- the context manager closes it on every exit path,
    # including exceptions raised by the parsers or the parallel workers.
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    with open(result_fpath, 'w') as res_file:
        res_file.write('reference chromosomes:\n')
        for chr_name, chr_len in reference_chromosomes.items():
            res_file.write('\t' + chr_name + ' (' + str(chr_len) + ' bp)\n')
        res_file.write('\n')
        res_file.write('total genome size: ' + str(genome_size) + '\n\n')
        res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
        res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

        genes_container = FeatureContainer(genes_fpaths, 'gene')
        operons_container = FeatureContainer(operons_fpaths, 'operon')
        for container in [genes_container, operons_container]:
            if not container.fpaths:
                logger.notice('No file with ' + container.kind + 's provided. '
                              'Use the -' + container.kind[0].capitalize() + ' option '
                              'if you want to specify it.', indent=' ')
                continue

            for fpath in container.fpaths:
                container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

            if len(container.region_list) == 0:
                logger.warning('No ' + container.kind + 's were loaded.', indent=' ')
                res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
            else:
                logger.info(' Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
                res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
                container.chr_names_dict = chromosomes_names_dict(
                    container.kind, container.region_list, reference_chromosomes.keys())

        for contigs_fpath in aligned_contigs_fpaths:
            report = reporting.get(contigs_fpath)
            if genes_container.fpaths:
                report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
            if operons_container.fpaths:
                report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

        # header
        res_file.write('\n\n')
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
        res_file.write('================================================================================================================\n')

        # process all contig files
        # joblib rejects n_jobs == 0, which is what an empty input list produced.
        n_jobs = max(1, min(len(aligned_contigs_fpaths), qconfig.max_threads))
        from joblib import Parallel, delayed
        results_genes_operons_tuples = Parallel(n_jobs=n_jobs)(
            delayed(process_single_file)(
                contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                reference_chromosomes, genes_container, operons_container)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths))

        for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(
                aligned_contigs_fpaths, results_genes_operons_tuples):
            assembly_name = qutils.name_from_fpath(contigs_fpath)

            files_genes_in_contigs[contigs_fpath] = genes_in_contigs
            files_operons_in_contigs[contigs_fpath] = operons_in_contigs
            full_found_genes.append(sum(genes_in_contigs))
            full_found_operons.append(sum(operons_in_contigs))

            covered_bp = results["covered_bp"]
            gaps_count = results["gaps_count"]
            genes_full = results[reporting.Fields.GENES + "_full"]
            genes_part = results[reporting.Fields.GENES + "_partial"]
            operons_full = results[reporting.Fields.OPERONS + "_full"]
            operons_part = results[reporting.Fields.OPERONS + "_partial"]

            report = reporting.get(contigs_fpath)
            genome_fraction = float(covered_bp) * 100 / float(genome_size)
            # Aligned bases (total + overlaps + ambiguous extras - unaligned)
            # divided by the number of covered reference bases.
            duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                                 report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                                 report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                                 report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                                ((genome_fraction / 100.0) * float(genome_size))

            res_file.write('%-25s| %-10s| %-12s| %-10s|'
                           % (assembly_name[:24], '%3.5f%%' % genome_fraction,
                              '%1.5f' % duplication_ratio, gaps_count))

            report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
            report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
            genome_mapped.append(genome_fraction)

            for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                        (reporting.Fields.OPERONS, operons_full, operons_part)]:
                if full is None and part is None:
                    res_file.write(' %-10s| %-10s|' % ('-', '-'))
                else:
                    res_file.write(' %-10s| %-10s|' % (full, part))
                    report.add_field(field, '%s + %s part' % (full, part))
            res_file.write('\n')

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'genes',
                                                files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'genes',
                                                files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths,
                                       files_genes_in_contigs,
                                       genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_genes_histogram', '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths,
                                       files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram', '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %', top_value=100)

    logger.info('Done.')
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Calculate NA/NGA and LA/LGA statistics from the aligned block lengths.

    For every assembly the NA50/NA75/LA50/LA75 values are computed against the
    assembly length and, unless ``qconfig.is_combined_ref`` is set, the
    NGA/LGA counterparts against the reference length.  The values and the
    largest alignment are logged and stored in the per-assembly report; the
    assembly lengths are exported to JSON / HTML when enabled, and the plots
    are produced through ``plotter``.

    :param ref_fpath: reference genome FASTA
    :param aligned_contigs_fpaths: paths of the aligned assemblies
    :param output_dirpath: directory passed to the HTML saver and ``Nx_plot``
    :param json_output_dirpath: directory for JSON export (skipped when falsy)
    :param aligned_lengths_lists: one list of aligned block lengths per
        assembly, in the same order as ``aligned_contigs_fpaths``
    :param aligned_stats_dirpath: directory for the plots (created if missing)
    :return: dict with a ``'header'`` key and one key per assembly name, each
        mapped to an empty list
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    report_dict = {'header': []}
    for fpath in aligned_contigs_fpaths:
        assembly_key = qutils.name_from_fpath(fpath)
        report_dict[assembly_key] = []

    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
    assembly_lengths = [sum(fastaparser.get_lengths_from_fastafile(fpath))
                        for fpath in aligned_contigs_fpaths]

    import N50
    per_assembly = itertools.izip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)
    for i, (contigs_fpath, lens, assembly_len) in enumerate(per_assembly):
        na50 = N50.NG50(lens, assembly_len)
        na75 = N50.NG50(lens, assembly_len, 75)
        la50 = N50.LG50(lens, assembly_len)
        la75 = N50.LG50(lens, assembly_len, 75)
        # Reference-based values are only defined for a non-combined reference.
        if not qconfig.is_combined_ref:
            nga50 = N50.NG50(lens, reference_length)
            nga75 = N50.NG50(lens, reference_length, 75)
            lga50 = N50.LG50(lens, reference_length)
            lga75 = N50.LG50(lens, reference_length, 75)

        # Assemble the log line piece by piece; NGA50 / LGA50 are mentioned
        # only when they were computed and are truthy.
        message = (' ' + qutils.index_to_str(i) + qutils.label_from_fpath(contigs_fpath) +
                   ', Largest alignment = ' + str(max(lens)) +
                   ', NA50 = ' + str(na50))
        if not qconfig.is_combined_ref and nga50:
            message += ', NGA50 = ' + str(nga50)
        message += ', LA50 = ' + str(la50)
        if not qconfig.is_combined_ref and lga50:
            message += ', LGA50 = ' + str(lga50)
        logger.info(message)

        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        for field, value in ((reporting.Fields.NA50, na50),
                             (reporting.Fields.NA75, na75),
                             (reporting.Fields.LA50, la50),
                             (reporting.Fields.LA75, la75)):
            report.add_field(field, value)
        if not qconfig.is_combined_ref:
            for field, value in ((reporting.Fields.NGA50, nga50),
                                 (reporting.Fields.NGA75, nga75),
                                 (reporting.Fields.LGA50, lga50),
                                 (reporting.Fields.LGA75, lga75)):
                report.add_field(field, value)

    # Largest number of aligned blocks among the assemblies.
    num_contigs = max([len(lengths) for lengths in aligned_lengths_lists])

    if json_output_dirpath:
        from libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths,
                                         assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    import plotter
    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        cumulative_plot_fpath = os.path.join(aligned_stats_dirpath, 'cumulative_plot')
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                cumulative_plot_fpath, 'Cumulative length (aligned contigs)')

    # Drawing NAx and NGAx plots...
    # NOTE(review): the original indentation was lost; the Nx_plot calls are
    # assumed to run regardless of qconfig.draw_plots (plotter is imported
    # outside that check and Nx_plot is handed the JSON output directory) --
    # confirm against the original layout.
    reduce_points = num_contigs > qconfig.max_points
    plotter.Nx_plot(output_dirpath, reduce_points, aligned_contigs_fpaths,
                    aligned_lengths_lists, aligned_stats_dirpath + '/NAx_plot', 'NAx',
                    assembly_lengths, json_output_dir=json_output_dirpath)
    if not qconfig.is_combined_ref:
        reference_lengths = [reference_length for i in range(len(aligned_contigs_fpaths))]
        plotter.Nx_plot(output_dirpath, reduce_points, aligned_contigs_fpaths,
                        aligned_lengths_lists, aligned_stats_dirpath + '/NGAx_plot', 'NGAx',
                        reference_lengths, json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
def main(args):
    """Parse command-line arguments and run the QUAST pipeline.

    args -- list of command-line arguments (presumably without the program
            name, since the script path is prepended when the command line
            is logged). Options are parsed with getopt.gnu_getopt using
            qconfig.short_options / qconfig.long_options; the remaining
            arguments are treated as paths to contig files.

    Returns 0 after a complete run and 4 when no assembly is left after
    _correct_contigs. Calls sys.exit(0) when args is empty or help is
    requested, and sys.exit(2) on a getopt error or when no contig files
    are given.
    """
    if ' ' in quast_dirpath:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(quast_dirpath) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    # First pass: options that must take effect before everything else.
    # Iterates over a copy because the options list is modified inside.
    for opt, arg in options[:]:
        if opt in ('-d', '--debug'):
            options.remove((opt, arg))
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        if opt == '--test':
            options.remove((opt, arg))
            options += [('-o', 'quast_test_output'),
                        ('-R', 'test_data/reference.fasta.gz'),  # for compiling MUMmer
                        ('-O', 'test_data/operons.gff'),
                        ('-G', 'test_data/genes.gff'),
                        ('--gene-finding',''), ('--eukaryote','')]  # for compiling GlimmerHMM
            contigs_fpaths += ['test_data/contigs_1.fasta',
                               'test_data/contigs_2.fasta']
            qconfig.test = True

        if opt.startswith('--help'):
            qconfig.usage(opt == "--help-hidden")
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []

    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt in ('-t', "--contig-thresholds"):
            qconfig.contig_thresholds = arg

        elif opt in ('-M', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-T', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--mincluster"):
            qconfig.mincluster = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt in ('-S', "--gene-thresholds"):
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            # Values other than these three are silently ignored.
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt in ('-n', "--strict-NA"):
            qconfig.strict_NA = True

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt in ('-m', '--meta'):
            qconfig.meta = True

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True

        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg),
                         to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath,
                           qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    logger.print_command_line([os.path.realpath(__file__)] + args, wrap_after=None)
    logger.start()

    if existing_alignments:
        logger.info()
        logger.notice("Output directory already exists. Existing Nucmer alignments can be used.")
        qutils.remove_reports(output_dirpath)

    # Thresholds arrive as comma-separated strings; "None" means an empty list.
    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int, qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    # Threading
    if qconfig.max_threads is None:
        try:
            import multiprocessing
            qconfig.max_threads = multiprocessing.cpu_count()
        except Exception:
            # Best effort: fall back to the default, but do not swallow
            # KeyboardInterrupt/SystemExit as a bare except would.
            logger.warning('Failed to determine the number of CPUs')
            qconfig.max_threads = qconfig.DEFAULT_MAX_THREADS

        logger.info()
        logger.notice('Maximum number of threads is set to ' + str(qconfig.max_threads) +
                      ' (use --threads option to set it manually)')

    ########################################################################
    from libs import reporting
    reload(reporting)

    # Start from an empty directory for corrected input files.
    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.info()
        logger.info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.info()
    logger.info('Contigs:')
    contigs_fpaths = _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.",
                     fake_if_nested_run=True)
        return 4

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except Exception:
            # The PDF report is optional: continue without it.
            all_pdf_file = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None

    # NOTE(review): nothing in this function appends to aligned_contigs_fpaths,
    # so the branch below is never entered as written -- confirm whether the
    # contig alignment step is missing here.

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, 'genome_stats'))

    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.info('Drawing large plots...')
        logger.info('This may take a while: press Ctrl-C to skip this step..')
        try:
            # One step per enabled drawing task.
            number_of_steps = sum([int(bool(value))
                                   for value in [detailed_contigs_reports_dirpath, all_pdf_file]])
            if detailed_contigs_reports_dirpath:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.info(' 1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths,
                    os.path.join(detailed_contigs_reports_dirpath, 'contigs_report_%s.stdout'),
                    output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.info(' %d of %d: Creating PDF with all tables and plots...'
                            % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.info('Done')
        except KeyboardInterrupt:
            logger.info('..step skipped!')
            # The PDF may not exist (e.g. PdfPages could not be created);
            # an unconditional os.remove would then raise OSError.
            if os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.info('RESULTS:')
    logger.info(' Text versions of total report are saved to ' + reports_fpaths)
    logger.info(' Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_total_report(output_dirpath, qconfig.min_contig)

    if os.path.isfile(all_pdf_fpath):
        logger.info(' PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.info(' Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
def main(args):
    """Parse command-line arguments and run the QUAST pipeline.

    args -- list of command-line arguments; options are parsed with
            getopt.gnu_getopt using qconfig.short_options /
            qconfig.long_options, and the remaining arguments are treated
            as paths to contig files.

    Returns 0 after a complete run and 4 when no assembly is left after
    _correct_contigs. Calls sys.exit(0) when args is empty or when help or
    version output is requested, and sys.exit(2) on a getopt error or when
    no contig files are given.
    """
    if ' ' in qconfig.QUAST_HOME:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    # First pass: test modes, help and version. Iterates over a copy because
    # the options list is modified inside the loop.
    for opt, arg in options[:]:
        if opt == '--test' or opt == '--test-sv':
            options.remove((opt, arg))
            options += [('-o', 'quast_test_output'),
                        ('-R', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reference.fasta.gz')),  # for compiling MUMmer
                        ('-O', os.path.join(qconfig.QUAST_HOME, 'test_data', 'operons.gff')),
                        ('-G', os.path.join(qconfig.QUAST_HOME, 'test_data', 'genes.gff')),
                        ('--gage', ''),  # for compiling GAGE Java classes
                        ('--gene-finding', ''), ('--eukaryote', ''), ('--glimmer', '')]  # for compiling GlimmerHMM
            if opt == '--test-sv':
                options += [('-1', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reads1.fastq.gz')),
                            ('-2', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reads2.fastq.gz'))]
            contigs_fpaths += [os.path.join(qconfig.QUAST_HOME, 'test_data', 'contigs_1.fasta'),
                               os.path.join(qconfig.QUAST_HOME, 'test_data', 'contigs_2.fasta')]
            qconfig.test = True

        if opt.startswith('--help') or opt == '-h':
            qconfig.usage(opt == "--help-hidden", short=False)
            sys.exit(0)

        elif opt.startswith('--version') or opt == '-v':
            qconfig.print_version()
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False
    qconfig.is_combined_ref = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []
    bed_fpath = None
    reads_fpath_f = ''
    reads_fpath_r = ''

    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-d', '--debug'):
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        elif opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False
            if ' ' in output_dirpath:
                logger.error('QUAST does not support spaces in paths. \n'
                             'You have specified ' + str(output_dirpath) + ' as an output path.\n'
                             'Please, use a different directory.\n',
                             to_stderr=True,
                             exit_with_code=3)

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt == "--contig-thresholds":
            qconfig.contig_thresholds = arg

        elif opt in ('-m', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-t', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--min-cluster"):
            qconfig.min_cluster = int(arg)

        elif opt in ('-i', "--min-alignment"):
            qconfig.min_alignment = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt == "--gene-thresholds":
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt == '--err-fpath':  # for web-quast
            qconfig.save_error = True
            qconfig.error_log_fname = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            # Values other than these three are silently ignored.
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt == "--strict-NA":
            qconfig.strict_NA = True

        elif opt in ('-x', "--extensive-mis-size"):
            if int(arg) <= qconfig.MAX_INDEL_LENGTH:
                logger.error("--extensive-mis-size should be greater than maximum indel length (%d)!"
                             % qconfig.MAX_INDEL_LENGTH, 1, to_stderr=True)
            qconfig.extensive_misassembly_threshold = int(arg)

        elif opt == '--no-snps':
            qconfig.show_snps = False

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt == '--no-check':
            qconfig.no_check = True

        elif opt == '--no-gc':
            qconfig.no_gc = True

        elif opt == '--fast':  # --no-gc, --no-plots, --no-snps
            #qconfig.no_check = True  # too risky to include
            qconfig.no_gc = True
            qconfig.show_snps = False
            qconfig.draw_plots = False
            qconfig.html_report = False

        elif opt == '--plots-format':
            if arg.lower() in qconfig.supported_plot_extensions:
                qconfig.plot_extension = arg.lower()
            else:
                logger.error('Format "%s" is not supported. Please, use one of the supported formats: %s.' %
                             (arg, ', '.join(qconfig.supported_plot_extensions)),
                             to_stderr=True, exit_with_code=2)

        elif opt == '--meta':
            qconfig.meta = True

        elif opt == '--no-check-meta':
            qconfig.no_check = True
            qconfig.no_check_meta = True

        elif opt == '--references-list':
            pass

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True

        elif opt == '--glimmer':
            qconfig.glimmer = True

        elif opt == '--combined-ref':
            qconfig.is_combined_ref = True

        elif opt == '--memory-efficient':
            qconfig.memory_efficient = True

        elif opt == '--silent':
            qconfig.silent = True

        elif opt in ('-1', '--reads1'):
            reads_fpath_f = arg

        elif opt in ('-2', '--reads2'):
            reads_fpath_r = arg

        elif opt == '--bed-file':
            bed_fpath = arg

        elif opt == '--contig-alignment-html':
            qconfig.create_contig_alignment_html = True

        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg),
                         to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath,
                           qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    # Rebuild the command line from the processed options so that the
    # expanded test options are logged too.
    args = [os.path.realpath(__file__)]
    for k, v in options:
        args.extend([k, v])
    args.extend(contigs_fpaths)
    logger.print_command_line(args, wrap_after=None, is_main=True)
    logger.start()

    if existing_alignments:
        logger.main_info()
        logger.notice("Output directory already exists. Existing Nucmer alignments can be used.")
        qutils.remove_reports(output_dirpath)

    # Thresholds arrive as comma-separated strings; "None" means an empty list.
    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int, qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    qconfig.set_max_threads(logger)

    logger.main_info()
    logger.print_params()

    ########################################################################
    from libs import reporting
    reload(reporting)

    if qconfig.is_combined_ref:
        # The corrected-input directory of the parent output directory is
        # used and is not recreated.
        corrected_dirpath = os.path.join(output_dirpath, '..', qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    contigs_fpaths, old_contigs_fpaths = _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    reads_fpaths = []
    if reads_fpath_f:
        reads_fpaths.append(reads_fpath_f)
    if reads_fpath_r:
        reads_fpaths.append(reads_fpath_r)
    if reads_fpaths:
        # The result replaces any BED file given with --bed-file.
        bed_fpath = reads_analyzer.do(ref_fpath, contigs_fpaths, reads_fpaths, None,
                                      os.path.join(output_dirpath, qconfig.variation_dirname),
                                      external_logger=logger)

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.",
                     fake_if_nested_run=True)
        return 4

    qconfig.assemblies_fpaths = contigs_fpaths

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots or qconfig.html_report:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except Exception:
            # The PDF report is optional: continue without it, but do not
            # swallow KeyboardInterrupt/SystemExit as a bare except would.
            all_pdf_file = None

    if json_output_dirpath:
        from libs.html_saver import json_saver
        # JSON output is disabled when json_saver reports a simplejson error.
        if json_saver.simplejson_error:
            json_output_dirpath = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote,
            os.path.join(output_dirpath, 'contigs_reports'),
            old_contigs_fpaths, bed_fpath)
        # Only assemblies with an OK status take part in the aligned stats.
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, 'genome_stats'))

    if qconfig.gene_finding or qconfig.glimmer:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from libs import glimmer
            glimmer.do(contigs_fpaths, qconfig.genes_lengths,
                       os.path.join(output_dirpath, 'predicted_genes'))
        else:
            ########################################################################
            ### GeneMark
            ########################################################################
            from libs import genemark
            genemark.do(contigs_fpaths, qconfig.genes_lengths,
                        os.path.join(output_dirpath, 'predicted_genes'),
                        qconfig.prokaryote, qconfig.meta)
    else:
        logger.main_info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")

    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.main_info('Drawing large plots...')
        logger.main_info('This may take a while: press Ctrl-C to skip this step..')
        try:
            if detailed_contigs_reports_dirpath and qconfig.show_snps:
                contig_report_fpath_pattern = os.path.join(detailed_contigs_reports_dirpath,
                                                           'contigs_report_%s.stdout')
            else:
                contig_report_fpath_pattern = None
            # One step per enabled drawing task.
            number_of_steps = sum([int(bool(value))
                                   for value in [contig_report_fpath_pattern, all_pdf_file]])
            if contig_report_fpath_pattern:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info(' 1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths, contig_report_fpath_pattern,
                    output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.main_info(' %d of %d: Creating PDF with all tables and plots...'
                                 % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.main_info('Done')
        except KeyboardInterrupt:
            logger.main_info('..step skipped!')
            # The PDF may not exist (e.g. PdfPages could not be created);
            # an unconditional os.remove would then raise OSError.
            if os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info(' Text versions of total report are saved to ' + reports_fpaths)
    logger.main_info(' Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig, ref_fpath)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath)

    if os.path.isfile(all_pdf_fpath):
        logger.main_info(' PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.main_info(' Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Calculate NAx/NGAx and LAx/LGAx for every aligned assembly, store them
    in the reports, save the length data for JSON/HTML and draw the plots.

    aligned_lengths_lists holds one list of aligned lengths per entry of
    aligned_contigs_fpaths, in the same order. aligned_stats_dirpath is
    created if it does not exist.

    Returns a dict with a 'header' key and one key per assembly name, each
    mapped to an empty list.
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    report_dict = {'header': []}
    for fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(fpath)] = []

    logger.print_timestamp()
    logger.info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
    assembly_lengths = [sum(fastaparser.get_lengths_from_fastafile(fpath))
                        for fpath in aligned_contigs_fpaths]

    import N50
    per_assembly = itertools.izip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)
    for idx, (fpath, alignment_lens, total_len) in enumerate(per_assembly):
        # "A" metrics are relative to the assembly length, "GA" metrics to
        # the reference length.
        na50 = N50.NG50(alignment_lens, total_len)
        nga50 = N50.NG50(alignment_lens, reference_length)
        na75 = N50.NG50(alignment_lens, total_len, 75)
        nga75 = N50.NG50(alignment_lens, reference_length, 75)
        la50 = N50.LG50(alignment_lens, total_len)
        lga50 = N50.LG50(alignment_lens, reference_length)
        la75 = N50.LG50(alignment_lens, total_len, 75)
        lga75 = N50.LG50(alignment_lens, reference_length, 75)
        largest = max(alignment_lens)

        logger.info(''.join([
            ' ', qutils.index_to_str(idx), qutils.label_from_fpath(fpath),
            ', Largest alignment = ', str(largest),
            ', NA50 = ', str(na50),
            ', NGA50 = ', str(nga50),
            ', LA50 = ', str(la50),
            ', LGA50 = ', str(lga50)]))

        report = reporting.get(fpath)
        fields = reporting.Fields
        for field, value in ((fields.LARGALIGN, largest),
                             (fields.NA50, na50),
                             (fields.NGA50, nga50),
                             (fields.NA75, na75),
                             (fields.NGA75, nga75),
                             (fields.LA50, la50),
                             (fields.LGA50, lga50),
                             (fields.LA75, la75),
                             (fields.LGA75, lga75)):
            report.add_field(field, value)

    # saving to JSON
    if json_output_dirpath:
        from libs.html_saver import json_saver
        json_saver.save_aligned_contigs_lengths(json_output_dirpath, aligned_contigs_fpaths, aligned_lengths_lists)
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_aligned_contigs_lengths(output_dirpath, aligned_contigs_fpaths, aligned_lengths_lists)
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    if qconfig.draw_plots:
        import plotter

        # Cumulative plot (aligned contigs)
        cumulative_plot_fpath = os.path.join(aligned_stats_dirpath, 'cumulative_plot')
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                cumulative_plot_fpath,
                                'Cumulative length (aligned contigs)')

        # NAx is plotted against each assembly's own length, NGAx against
        # the reference length repeated once per assembly.
        reference_lengths = [reference_length] * len(aligned_contigs_fpaths)
        plotter.Nx_plot(aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NAx_plot', 'NAx', assembly_lengths)
        plotter.Nx_plot(aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NGAx_plot', 'NGAx', reference_lengths)

    logger.info('Done.')
    return report_dict
def do(ref_fpath, contigs_fpaths, output_dirpath):
    """Run GAGE on every assembly and add the parsed metrics to the reports.

    ref_fpath       -- path to the reference, passed through to run_gage.
    contigs_fpaths  -- paths of the assemblies to evaluate.
    output_dirpath  -- main output directory; results go to its 'gage'
                       subdirectory, which is created if missing.

    Returns None. If no assembly finished with return code 0, a warning is
    logged and the function returns early without parsing anything (the
    temporary directory is left in place in that case).
    """
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    gage_tool_path = os.path.join(qconfig.LIBS_LOCATION, 'gage', 'getCorrectnessStats.sh')

    logger.print_timestamp()
    logger.main_info('Running GAGE...')

    # Metric names in the order they are looked for in the GAGE stdout log.
    # Some names repeat ('Total units', 'Min', 'Max', 'N50'), so the log is
    # scanned sequentially and each entry matches the next line containing it.
    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy',
               'SNPs', 'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    # Report fields, index-aligned with `metrics`.
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
                            reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
                            reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES,
                            reporting.Fields.GAGE_MISSINGASMBLYBASES,
                            reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
                            reporting.Fields.GAGE_DUPREFBASES,
                            reporting.Fields.GAGE_COMPRESSEDREFBASES, reporting.Fields.GAGE_BADTRIM,
                            reporting.Fields.GAGE_AVGIDY, reporting.Fields.GAGE_SNPS,
                            reporting.Fields.GAGE_SHORTINDELS, reporting.Fields.GAGE_LONGINDELS,
                            reporting.Fields.GAGE_INVERSIONS, reporting.Fields.GAGE_RELOCATION,
                            reporting.Fields.GAGE_TRANSLOCATION, reporting.Fields.GAGE_NUMCORCONTIGS,
                            reporting.Fields.GAGE_CORASMBLYSIZE, reporting.Fields.GAGE_MINCORCONTIG,
                            reporting.Fields.GAGE_MAXCORCOTING, reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(
        delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path,
                          ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    # Give up only when not a single run returned 0.
    if 0 not in return_codes:
        logger.warning('Error occurred while GAGE was processing assemblies.'
                       ' See GAGE error logs for details: %s' %
                       os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for contigs_fpath in contigs_fpaths:
        assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(
            gage_results_dirpath, 'gage_' + assembly_label + '.stdout')
        logfile_out = open(log_out_fpath, 'r')
        # try/finally closes the log even if parsing a line raises.
        try:
            cur_metric_id = 0
            for line in logfile_out:
                metric = metrics[cur_metric_id]
                if metric in line:
                    if metric.startswith('N50'):
                        # For N50 lines keep everything after 'N50:'.
                        value = line.split(metric + ':')[1].strip()
                    else:
                        value = line.split(':')[1].strip()
                    report.add_field(metrics_in_reporting[cur_metric_id], value)
                    cur_metric_id += 1
                    if cur_metric_id == len(metrics):
                        break
        finally:
            logfile_out.close()

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
def main(args):
    """Parse the command line, run the enabled analysis modules, save reports.

    Args:
        args: list of command-line arguments (options and contigs file paths),
            parsed with ``getopt.gnu_getopt``.

    Returns:
        0 after all reports are written, or 4 when no assembly is left after
        contigs correction. Usage problems terminate through ``sys.exit``
        (0 for help/no arguments, 2 for bad options or missing contigs).
    """
    if ' ' in quast_dirpath:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(quast_dirpath) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    # First pass over a copy of the list: handle options that rewrite the
    # option list itself (--debug, --test) or terminate right away (--help*).
    for opt, arg in options[:]:
        if opt in ('-d', '--debug'):
            options.remove((opt, arg))
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        if opt == '--test':
            options.remove((opt, arg))
            options += [('-o', 'quast_test_output'),
                        ('-R', 'test_data/reference.fasta.gz'),  # for compiling MUMmer
                        ('-O', 'test_data/operons.gff'),
                        ('-G', 'test_data/genes.gff'),
                        ('--gene-finding',''), ('--eukaryote','')]  # for compiling GlimmerHMM
            contigs_fpaths += ['test_data/contigs_1.fasta',
                               'test_data/contigs_2.fasta']
            qconfig.test = True

        if opt.startswith('--help'):
            qconfig.usage(opt == "--help-hidden")
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []

    # Yes, this duplicates code, but OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt in ('-t', "--contig-thresholds"):
            qconfig.contig_thresholds = arg

        elif opt in ('-M', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-T', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--mincluster"):
            qconfig.mincluster = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt in ('-S', "--gene-thresholds"):
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            # Values other than these three are ignored without a message.
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt in ('-n', "--strict-NA"):
            qconfig.strict_NA = True

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt in ('-m', '--meta'):
            qconfig.meta = True

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True

        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg),
                         to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath,
                           qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    logger.print_command_line([os.path.realpath(__file__)] + args, wrap_after=None)
    logger.start()

    if existing_alignments:
        logger.info()
        logger.notice("Output directory already exists. Existing Nucmer alignments can be used.")
        qutils.remove_reports(output_dirpath)

    # Thresholds arrive as comma-separated strings; the literal "None" disables them.
    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int, qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    # Threading
    if qconfig.max_threads is None:
        try:
            import multiprocessing
            qconfig.max_threads = multiprocessing.cpu_count()
        except Exception:
            # Best effort: fall back to the default, but let Ctrl-C / SystemExit through.
            logger.warning('Failed to determine the number of CPUs')
            qconfig.max_threads = qconfig.DEFAULT_MAX_THREADS

        logger.info()
        logger.notice('Maximum number of threads is set to ' + str(qconfig.max_threads) +
                      ' (use --threads option to set it manually)')

    ########################################################################
    from libs import reporting
    reload(reporting)

    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.info()
        logger.info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.info()
    logger.info('Contigs:')
    contigs_fpaths = _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.",
                     fake_if_nested_run=True)
        return 4

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except Exception:
            # The PDF report is optional: carry on without it if it cannot be set up.
            all_pdf_file = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote,
            os.path.join(output_dirpath, 'contigs_reports'))
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, 'genome_stats'))

    if qconfig.gene_finding:
        if qconfig.prokaryote or qconfig.meta:
            ########################################################################
            ### GeneMark
            ########################################################################
            from libs import genemark
            genemark.do(contigs_fpaths, qconfig.genes_lengths,
                        os.path.join(output_dirpath, 'predicted_genes'), qconfig.meta)
        else:
            ########################################################################
            ### Glimmer
            ########################################################################
            from libs import glimmer
            glimmer.do(contigs_fpaths, qconfig.genes_lengths,
                       os.path.join(output_dirpath, 'predicted_genes'))
    else:
        logger.info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")

    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.info('Drawing large plots...')
        logger.info('This may take a while: press Ctrl-C to skip this step..')
        try:
            number_of_steps = sum([int(bool(value))
                                   for value in [detailed_contigs_reports_dirpath, all_pdf_file]])
            if detailed_contigs_reports_dirpath:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.info(' 1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths,
                    os.path.join(detailed_contigs_reports_dirpath, 'contigs_report_%s.stdout'),
                    output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.info(' %d of %d: Creating PDF with all tables and plots...'
                            % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.info('Done')
        except KeyboardInterrupt:
            logger.info('..step skipped!')
            # The PDF may not exist (e.g. PdfPages could not be created);
            # skipping the step must not crash on a missing file.
            if os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.info('RESULTS:')
    logger.info(' Text versions of total report are saved to ' + reports_fpaths)
    logger.info(' Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_total_report(output_dirpath, qconfig.min_contig)

    if os.path.isfile(all_pdf_fpath):
        logger.info(' PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.info(' Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
def main(args):
    """Parse the command line, run the enabled analysis modules, save reports.

    Args:
        args: list of command-line arguments (options and contigs file paths),
            parsed with ``getopt.gnu_getopt``.

    Returns:
        0 after all reports are written, or 4 when no assembly is left after
        contigs correction. Usage problems terminate through ``sys.exit``
        (0 for help/version/no arguments, 2 for bad options or missing contigs).
    """
    if ' ' in qconfig.QUAST_HOME:
        logger.error(
            'QUAST does not support spaces in paths. \n'
            'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
            'Please, put QUAST in a different directory, then try again.\n',
            to_stderr=True,
            exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    # First pass over a copy of the list: expand the test presets and handle
    # options that terminate right away (help, version).
    for opt, arg in options[:]:
        if opt == '--test' or opt == '--test-sv':
            options.remove((opt, arg))
            options += [
                ('-o', 'quast_test_output'),
                ('-R', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reference.fasta.gz')),  # for compiling MUMmer
                ('-O', os.path.join(qconfig.QUAST_HOME, 'test_data', 'operons.gff')),
                ('-G', os.path.join(qconfig.QUAST_HOME, 'test_data', 'genes.gff')),
                ('--gage', ''),  # for compiling GAGE Java classes
                ('--gene-finding', ''),
                ('--eukaryote', ''),
                ('--glimmer', '')
            ]  # for compiling GlimmerHMM
            if opt == '--test-sv':
                options += [('-1', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reads1.fastq.gz')),
                            ('-2', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reads2.fastq.gz'))]
            contigs_fpaths += [
                os.path.join(qconfig.QUAST_HOME, 'test_data', 'contigs_1.fasta'),
                os.path.join(qconfig.QUAST_HOME, 'test_data', 'contigs_2.fasta')
            ]
            qconfig.test = True

        if opt.startswith('--help') or opt == '-h':
            qconfig.usage(opt == "--help-hidden", short=False)
            sys.exit(0)

        elif opt.startswith('--version') or opt == '-v':
            qconfig.print_version()
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False
    qconfig.is_combined_ref = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []
    bed_fpath = None
    reads_fpath_f = ''
    reads_fpath_r = ''

    # Yes, this duplicates code, but OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-d', '--debug'):
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        elif opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False
            if ' ' in output_dirpath:
                logger.error('QUAST does not support spaces in paths. \n'
                             'You have specified ' + str(output_dirpath) + ' as an output path.\n'
                             'Please, use a different directory.\n',
                             to_stderr=True,
                             exit_with_code=3)

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt == "--contig-thresholds":
            qconfig.contig_thresholds = arg

        elif opt in ('-m', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-t', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--min-cluster"):
            qconfig.min_cluster = int(arg)

        elif opt in ('-i', "--min-alignment"):
            qconfig.min_alignment = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt == "--gene-thresholds":
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt == '--err-fpath':  # for web-quast
            qconfig.save_error = True
            qconfig.error_log_fname = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            # Values other than these three are ignored without a message.
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt == "--strict-NA":
            qconfig.strict_NA = True

        elif opt in ('-x', "--extensive-mis-size"):
            if int(arg) <= qconfig.MAX_INDEL_LENGTH:
                logger.error(
                    "--extensive-mis-size should be greater than maximum indel length (%d)!"
                    % qconfig.MAX_INDEL_LENGTH, 1, to_stderr=True)
            qconfig.extensive_misassembly_threshold = int(arg)

        elif opt == '--no-snps':
            qconfig.show_snps = False

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt == '--no-check':
            qconfig.no_check = True

        elif opt == '--no-gc':
            qconfig.no_gc = True

        elif opt == '--fast':  # --no-gc, --no-plots, --no-snps
            # qconfig.no_check is deliberately left untouched: too risky to include
            qconfig.no_gc = True
            qconfig.show_snps = False
            qconfig.draw_plots = False
            qconfig.html_report = False

        elif opt == '--plots-format':
            if arg.lower() in qconfig.supported_plot_extensions:
                qconfig.plot_extension = arg.lower()
            else:
                logger.error(
                    'Format "%s" is not supported. Please, use one of the supported formats: %s.'
                    % (arg, ', '.join(qconfig.supported_plot_extensions)),
                    to_stderr=True, exit_with_code=2)

        elif opt == '--meta':
            qconfig.meta = True

        elif opt == '--no-check-meta':
            qconfig.no_check = True
            qconfig.no_check_meta = True

        elif opt == '--references-list':
            pass

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True

        elif opt == '--glimmer':
            qconfig.glimmer = True

        elif opt == '--combined-ref':
            qconfig.is_combined_ref = True

        elif opt == '--memory-efficient':
            qconfig.memory_efficient = True

        elif opt == '--silent':
            qconfig.silent = True

        elif opt in ('-1', '--reads1'):
            reads_fpath_f = arg

        elif opt in ('-2', '--reads2'):
            reads_fpath_r = arg

        elif opt == '--bed-file':
            bed_fpath = arg

        elif opt == '--contig-alignment-html':
            qconfig.create_contig_alignment_html = True

        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg),
                         to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath,
                           qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    # Rebuild the command line from the (possibly expanded) option list so the
    # log shows what is actually being run.
    args = [os.path.realpath(__file__)]
    for k, v in options:
        args.extend([k, v])
    args.extend(contigs_fpaths)
    logger.print_command_line(args, wrap_after=None, is_main=True)
    logger.start()

    if existing_alignments:
        logger.main_info()
        logger.notice(
            "Output directory already exists. Existing Nucmer alignments can be used."
        )
        qutils.remove_reports(output_dirpath)

    # Thresholds arrive as comma-separated strings; the literal "None" disables them.
    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int, qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    qconfig.set_max_threads(logger)

    logger.main_info()
    logger.print_params()

    ########################################################################
    from libs import reporting
    reload(reporting)

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, '..', qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    contigs_fpaths, old_contigs_fpaths = _correct_contigs(
        contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    reads_fpaths = []
    if reads_fpath_f:
        reads_fpaths.append(reads_fpath_f)
    if reads_fpath_r:
        reads_fpaths.append(reads_fpath_r)
    if reads_fpaths:
        # The result replaces any path given with --bed-file.
        bed_fpath = reads_analyzer.do(ref_fpath, contigs_fpaths, reads_fpaths, None,
                                      os.path.join(output_dirpath, qconfig.variation_dirname),
                                      external_logger=logger)

    if not contigs_fpaths:
        logger.error(
            "None of the assembly files contains correct contigs. "
            "Please, provide different files or decrease --min-contig threshold.",
            fake_if_nested_run=True)
        return 4

    qconfig.assemblies_fpaths = contigs_fpaths

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning(
                "GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    # NOTE(review): PdfPages is also opened when only the HTML report is
    # enabled, although it is filled only under draw_plots below - confirm
    # this is intended.
    if qconfig.draw_plots or qconfig.html_report:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except Exception:
            # The PDF report is optional: carry on without it if it cannot be set up.
            all_pdf_file = None

    if json_output_dirpath:
        from libs.html_saver import json_saver
        if json_saver.simplejson_error:
            json_output_dirpath = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote,
            os.path.join(output_dirpath, 'contigs_reports'),
            old_contigs_fpaths, bed_fpath)
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(
            output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(ref_fpath, aligned_contigs_fpaths, output_dirpath,
                         json_output_dirpath, aligned_lengths_lists,
                         os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(ref_fpath, aligned_contigs_fpaths, output_dirpath,
                           json_output_dirpath, genes_fpaths, operons_fpaths,
                           detailed_contigs_reports_dirpath,
                           os.path.join(output_dirpath, 'genome_stats'))

    if qconfig.gene_finding or qconfig.glimmer:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from libs import glimmer
            glimmer.do(contigs_fpaths, qconfig.genes_lengths,
                       os.path.join(output_dirpath, 'predicted_genes'))
        else:
            ########################################################################
            ### GeneMark
            ########################################################################
            from libs import genemark
            genemark.do(contigs_fpaths, qconfig.genes_lengths,
                        os.path.join(output_dirpath, 'predicted_genes'),
                        qconfig.prokaryote, qconfig.meta)
    else:
        logger.main_info("")
        logger.notice(
            "Genes are not predicted by default. Use --gene-finding option to enable it."
        )

    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(
        output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.main_info('Drawing large plots...')
        logger.main_info(
            'This may take a while: press Ctrl-C to skip this step..')
        try:
            # The alignment plot is drawn only when detailed reports exist
            # and SNP output has not been disabled.
            if detailed_contigs_reports_dirpath and qconfig.show_snps:
                contig_report_fpath_pattern = os.path.join(
                    detailed_contigs_reports_dirpath, 'contigs_report_%s.stdout')
            else:
                contig_report_fpath_pattern = None
            number_of_steps = sum([
                int(bool(value))
                for value in [contig_report_fpath_pattern, all_pdf_file]
            ])
            if contig_report_fpath_pattern:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info(
                    ' 1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths, contig_report_fpath_pattern,
                    output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.main_info(
                    ' %d of %d: Creating PDF with all tables and plots...'
                    % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.main_info('Done')
        except KeyboardInterrupt:
            logger.main_info('..step skipped!')
            # The PDF may not exist (e.g. PdfPages could not be created);
            # skipping the step must not crash on a missing file.
            if os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info(' Text versions of total report are saved to ' + reports_fpaths)
    logger.main_info(
        ' Text versions of transposed total report are saved to ' +
        transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig, ref_fpath)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath)

    if os.path.isfile(all_pdf_fpath):
        logger.main_info(' PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.main_info(' Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0