def save_colors_and_ls(fpaths):
    """Fill the module-level ``dict_color_and_ls`` with a (color, line style) per label.

    Nothing happens when ``dict_color_and_ls`` already holds entries.
    Otherwise every path gets the next entry of ``colors`` (cycling) and the
    primary line style, except paths listed in
    ``qconfig.dict_of_broken_scaffolds``: those reuse the color already stored
    for the label of the path they map to and get the secondary line style.

    Args:
        fpaths: iterable of file paths; labels come from
            ``qutils.label_from_fpath``.
    """
    if dict_color_and_ls:
        return

    next_color_idx = 0
    for fpath in fpaths:
        label = qutils.label_from_fpath(fpath)
        # contigs and scaffolds should be equally colored but scaffolds should be dashed
        if fpath and fpath in qconfig.dict_of_broken_scaffolds:
            # The counterpart's entry must already be stored, otherwise this
            # lookup raises KeyError.
            counterpart_label = qutils.label_from_fpath(
                qconfig.dict_of_broken_scaffolds[fpath])
            color = dict_color_and_ls[counterpart_label][0]
            ls = secondary_line_style
        else:
            color = colors[next_color_idx % len(colors)]
            next_color_idx += 1
            ls = primary_line_style
        dict_color_and_ls[label] = (color, ls)
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    """Run a GeneMark-family gene finder on each assembly and record the counts.

    The tool is chosen from the flags: MetaGeneMark when ``meta`` is true,
    GeneMarkS when ``prokaryote`` is true, GeneMark-ES otherwise. The function
    returns early (after logging a warning where applicable) when license
    limitations are active, when the tool directory for the current platform
    does not exist, or when ``install_genemark`` reports failure.

    Args:
        fasta_fpaths: paths of the assembly FASTA files.
        gene_lengths: forwarded unchanged to ``predict_genes``.
        out_dirpath: output directory, created if missing. A ``tmp``
            sub-directory is created inside it and removed at the end unless
            ``qconfig.debug`` is set.
        prokaryote: selects GeneMarkS (only consulted when ``meta`` is false).
        meta: selects MetaGeneMark.

    Returns:
        None. Results are stored through ``reporting.get(path).add_field``.
    """
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(' Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
        return

    successful = install_genemark(os.path.join(qconfig.LIBS_LOCATION, 'genemark', qconfig.platform_name))
    if not successful:
        return

    if not os.path.isdir(out_dirpath):
        os.mkdir(out_dirpath)
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    # Clamp to at least one job: with no input files the plain min() is 0,
    # which made the division below raise ZeroDivisionError.
    n_jobs = max(1, min(len(fasta_fpaths), qconfig.max_threads))
    num_threads = max(1, qconfig.max_threads // n_jobs)
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_genes)(
            index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath,
            tmp_dirpath, gmhmm_p_function, prokaryote, num_threads)
        for index, fasta_fpath in enumerate(fasta_fpaths))

    # saving results
    for i, fasta_path in enumerate(fasta_fpaths):
        report = reporting.get(fasta_path)
        unique_count, count = results[i]
        if unique_count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
        if count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, count)
        if unique_count is None and count is None:
            # The size hint is only added for GeneMark-ES on inputs below 2000000 bytes.
            too_small = tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000
            hint = ('File may be too small for GeneMark-ES. '
                    'Try to use GeneMarkS instead (remove --eukaryote option).'
                    if too_small else '')
            logger.error(' ' + qutils.index_to_str(i) +
                         'Failed predicting genes in ' + qutils.label_from_fpath(fasta_path) + '. ' +
                         hint)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function):
    """Predict genes for one assembly with the given GeneMark runner.

    Args:
        index: position of the assembly, used for log prefixes and forwarded
            to ``gmhmm_p_function``.
        contigs_fpath: path of the assembly FASTA file.
        gene_lengths: thresholds; one count is produced per threshold.
        out_dirpath: directory for the stderr log, the GFF and (when
            ``OUTPUT_FASTA`` is set) the FASTA output.
        tool_dirpath: forwarded to ``gmhmm_p_function``.
        tmp_dirpath: forwarded to ``gmhmm_p_function``.
        gmhmm_p_function: callable invoked as
            ``f(tool_dirpath, contigs_fpath, err_fpath, index, tmp_dirpath)``.

    Returns:
        ``(unique_count, count)``; both are None when the runner returned
        nothing. ``count[k]`` is the number of genes for which
        ``gene[3] - gene[2]`` exceeds ``gene_lengths[k]``; ``unique_count`` is
        the number of distinct ``gene[4]`` values.
    """
    name = qutils.name_from_fpath(contigs_fpath)
    label = qutils.label_from_fpath(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(index) + label)

    err_fpath = os.path.join(out_dirpath, name + '_genemark.stderr')
    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, err_fpath, index, tmp_dirpath)

    if not genes:
        return None, None

    tool_name = "genemark"
    out_prefix = name + '_' + tool_name
    out_gff_fpath = os.path.join(out_dirpath, out_prefix + '_genes.gff')
    add_genes_to_gff(genes, out_gff_fpath)
    if OUTPUT_FASTA:
        add_genes_to_fasta(genes, os.path.join(out_dirpath, out_prefix + '_genes.fasta'))

    count = []
    for threshold in gene_lengths:
        count.append(sum(1 for gene in genes if gene[3] - gene[2] > threshold))
    unique_count = len(set(gene[4] for gene in genes))
    total_count = len(genes)

    logger.info(' ' + qutils.index_to_str(index) + ' Genes = ' + str(unique_count) + ' unique, ' + str(total_count) + ' total')
    logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + out_gff_fpath)

    return unique_count, count
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath):
    """Run ``glimmerHMM`` for one assembly and log a short summary.

    Args:
        index: position of the assembly, used for log prefixes.
        contigs_fpath: path of the assembly FASTA file.
        gene_lengths: thresholds forwarded to ``glimmerHMM``.
        out_dirpath: directory holding the ``<name>_glimmer*`` outputs.
        tool_dirpath: forwarded to ``glimmerHMM``.
        tmp_dirpath: forwarded to ``glimmerHMM``.

    Returns:
        ``(unique, cnt)`` exactly as produced by ``glimmerHMM``.
    """
    name = qutils.name_from_fpath(contigs_fpath)
    label = qutils.label_from_fpath(contigs_fpath)
    logger.info(' ' + qutils.index_to_str(index) + label)

    out_fpath = os.path.join(out_dirpath, name + '_glimmer')
    err_fpath = os.path.join(out_dirpath, name + '_glimmer.stderr')

    glimmer_result = glimmerHMM(
        tool_dirpath, contigs_fpath, out_fpath, gene_lengths, err_fpath, tmp_dirpath, index)
    out_gff_path, unique, total, cnt = glimmer_result

    # The summary is logged only when a GFF path came back.
    if out_gff_path:
        prefix = ' ' + qutils.index_to_str(index)
        logger.info(prefix + ' Genes = ' + str(unique) + ' unique, ' + str(total) + ' total')
        logger.info(prefix + ' Predicted genes (GFF): ' + out_gff_path)

    return unique, cnt
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE shell script for one assembly, capturing its output in log files.

    Args:
        i: position of the assembly, used for log prefixes.
        contigs_fpath: path of the assembly FASTA file.
        gage_results_dirpath: directory receiving ``gage_<name>.stdout`` and
            ``gage_<name>.stderr``.
        gage_tool_path: script executed through ``sh``.
        reference: passed to the script as its first argument.
        tmp_dir: passed to the script as its third argument.

    Returns:
        The return code reported by ``qutils.call_subprocess``.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stderr')
    logger.info(' ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')

    # 'with' closes both logs even if the subprocess call raises.
    with open(log_out_fpath, 'w') as log_out_f, open(log_err_fpath, 'w') as log_err_f:
        return_code = qutils.call_subprocess(
            ['sh', gage_tool_path, reference, contigs_fpath, tmp_dir, str(qconfig.min_contig)],
            stdout=log_out_f,
            stderr=log_err_f,
            indent=' ' + qutils.index_to_str(i),
            only_if_debug=False)

    # A non-zero return code is the failure case (the check used to be inverted).
    if return_code != 0:
        logger.info(' ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info(' ' + qutils.index_to_str(i) + 'Done.')

    return return_code
def _handle_fasta(contigs_fpath, corr_fpath, reporting):
    """Correct one assembly file and record its per-threshold contig statistics.

    Args:
        contigs_fpath: path of the input FASTA file.
        corr_fpath: path passed to ``correct_fasta`` as the output; also used
            for the warning label and as the report key.
        reporting: object providing ``get(fpath)`` and ``Fields``.

    Returns:
        False when the lengths of contigs >= ``qconfig.min_contig`` sum to
        zero or when ``correct_fasta`` returns a false value; True otherwise.
    """
    lengths = fastaparser.get_lengths_from_fastafile(contigs_fpath)

    usable_total = sum(length for length in lengths if length >= qconfig.min_contig)
    if not usable_total:
        logger.warning(
            "Skipping %s because it doesn't contain contigs >= %d bp."
            % (qutils.label_from_fpath(corr_fpath), qconfig.min_contig))
        return False

    # correcting
    corrected = correct_fasta(contigs_fpath, corr_fpath, qconfig.min_contig)
    if not corrected:
        return False

    report = reporting.get(corr_fpath)

    # One entry per configured threshold: number of contigs and their total length.
    contig_counts = []
    total_lengths = []
    for threshold in qconfig.contig_thresholds:
        passing = [length for length in lengths if length >= threshold]
        contig_counts.append(len(passing))
        total_lengths.append(sum(passing))

    report.add_field(reporting.Fields.CONTIGS__FOR_THRESHOLDS, contig_counts)
    report.add_field(reporting.Fields.TOTALLENS__FOR_THRESHOLDS, total_lengths)
    return True
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Execute the GAGE script on a single assembly and report the outcome.

    The script's stdout and stderr go to ``gage_<name>.stdout`` and
    ``gage_<name>.stderr`` inside ``gage_results_dirpath``.

    Args:
        i: position of the assembly, used for log prefixes.
        contigs_fpath: path of the assembly FASTA file.
        gage_results_dirpath: directory for the two log files.
        gage_tool_path: script executed through ``sh``.
        reference: first argument handed to the script.
        tmp_dir: third argument handed to the script.

    Returns:
        The return code reported by ``qutils.call_subprocess``; 'Failed.' is
        logged when it is non-zero, 'Done.' otherwise.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_basename = 'gage_' + assembly_name
    log_out_fpath = os.path.join(gage_results_dirpath, log_basename + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, log_basename + '.stderr')
    logger.info(' ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')

    command = ['sh', gage_tool_path, reference, contigs_fpath, tmp_dir, str(qconfig.min_contig)]
    # Context managers guarantee the log handles are released even when the
    # subprocess call raises; previously they stayed open in that case.
    with open(log_out_fpath, 'w') as log_out_f, open(log_err_fpath, 'w') as log_err_f:
        return_code = qutils.call_subprocess(
            command,
            stdout=log_out_f,
            stderr=log_err_f,
            indent=' ' + qutils.index_to_str(i),
            only_if_debug=False)

    if return_code != 0:
        logger.info(' ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info(' ' + qutils.index_to_str(i) + 'Done.')

    return return_code
def save_features_in_contigs(output_dirpath, contigs_fpaths, feature_name, features_in_contigs, ref_features_num):
    """Hand the per-assembly amounts of one feature type to ``save``.

    Args:
        output_dirpath: prefix of the target path; ``prefix_fn``,
            ``feature_name`` and ``in_contigs_suffix_fn`` are appended to it by
            plain string concatenation.
        contigs_fpaths: assembly paths; their labels are stored under
            ``'filenames'``.
        feature_name: used in the file name and in two of the stored keys.
        features_in_contigs: mapping from assembly path to feature amounts;
            re-keyed by assembly label.
        ref_features_num: stored under ``'ref_<feature_name>_number'``.

    Returns:
        Whatever ``save`` returns.
    """
    # Build a real list instead of map(): on Python 3 map() is a lazy
    # iterator. NOTE(review): save() presumably serializes the payload, which
    # an iterator would break; confirm. On Python 2 the result is identical.
    labels = [qutils.label_from_fpath(contigs_fpath) for contigs_fpath in contigs_fpaths]
    amounts_by_label = dict(
        (qutils.label_from_fpath(contigs_fpath), feature_amounts)
        for (contigs_fpath, feature_amounts) in features_in_contigs.items())

    return save(output_dirpath + prefix_fn + feature_name + in_contigs_suffix_fn, {
        'filenames': labels,
        feature_name + '_in_contigs': amounts_by_label,
        'ref_' + feature_name + '_number': ref_features_num,
    })
def get_color_and_ls(fpath, label=None):
    """Return the (color, line style) tuple stored for an assembly.

    Args:
        fpath: assembly path; only used to derive the label when ``label`` is
            empty or None.
        label: key into ``dict_color_and_ls``; derived from ``fpath`` via
            ``qutils.label_from_fpath`` when not given.

    Returns:
        The tuple stored in ``dict_color_and_ls`` for the label.

    Raises:
        KeyError: if no entry exists for the label.
    """
    # The docstring used to sit below this statement, where Python treats it
    # as a discarded string expression rather than documentation.
    if not label:
        label = qutils.label_from_fpath(fpath)
    return dict_color_and_ls[label]
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Run the ``glimmerhmm`` executable contig by contig and merge the results.

    Each contig of ``fasta_fpath`` is written to its own FASTA file in a fresh
    temporary directory under ``tmp_dir`` and processed separately; the GFF
    files of successful runs are merged into ``out_fpath + '_genes.gff'``.

    Args:
        tool_dir: directory containing ``glimmerhmm`` and ``trained/arabidopsis``.
        fasta_fpath: assembly FASTA file.
        out_fpath: prefix for the merged GFF (and FASTA when ``OUTPUT_FASTA``).
        gene_lengths: thresholds; one counter is kept per threshold.
        err_path: file that receives the tool's stdout and stderr (appended).
        tmp_dir: parent of the per-call temporary directory.
        index: position of the assembly, used for the subprocess log indent.

    Returns:
        ``(out_gff_path, unique_count, total, cnt)``, or four Nones when no
        contig was processed successfully. ``cnt[k]`` counts genes with
        ``end - start > gene_lengths[k]``.
    """
    def run(contig_path, tmp_path):
        # Tool output of every contig is appended to the same stderr log.
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent=' ' + qutils.index_to_str(index) + ' ')

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for ind, seq in read_fasta(fasta_fpath):
        contig_path = os.path.join(base_dir, ind + '.fasta')
        gff_path = os.path.join(base_dir, ind + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # '%' binds tighter than '+': the old expression formatted the hint
        # (which has no placeholder) and raised TypeError instead of logging.
        hint = ('Run with the --debug option'
                ' to see the command line.' if not qconfig.debug else '')
        logger.error(
            'Glimmer failed running Glimmer for %s. ' % qutils.label_from_fpath(fasta_fpath)
            + hint)
        return None, None, None, None

    out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')
    unique, total = set(), 0
    genes = []
    cnt = [0] * len(gene_lengths)
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        gene_seq = contigs[contig][start:end + 1]
        if strand != '+':
            gene_seq = rev_comp(gene_seq)
        unique.add(gene_seq)
        # NOTE(review): every parsed gene is kept for the FASTA output,
        # duplicates included; confirm against the original indentation.
        genes.append((gene_id, gene_seq))
        for idx, gene_length in enumerate(gene_lengths):
            cnt[idx] += end - start > gene_length

    if OUTPUT_FASTA:
        out_fasta_path = out_fpath + '_genes.fasta'
        write_fasta(out_fasta_path, genes)

    if not qconfig.debug:
        shutil.rmtree(base_dir)

    return out_gff_path, len(unique), total, cnt
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Run GlimmerHMM gene prediction on every assembly and record the counts.

    The tool is compiled with ``make`` first when its executable is missing;
    the function logs an error and returns if that fails.

    Args:
        contigs_fpaths: paths of the assembly FASTA files.
        gene_lengths: forwarded unchanged to ``predict_genes``.
        out_dirpath: output directory, created if missing. A ``tmp``
            sub-directory is created inside it and removed at the end unless
            ``qconfig.debug`` is set.

    Returns:
        None. Results are stored through ``reporting.get(path).add_field``.
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tool_src_dirpath = os.path.join(tool_dirpath, 'src')
    tool_exec_fpath = os.path.join(tool_dirpath, 'glimmerhmm')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')

    if not os.path.isfile(tool_exec_fpath):
        # making
        logger.main_info("Compiling GlimmerHMM...")
        # The build logs are closed deterministically instead of leaking handles.
        with open(os.path.join(tool_src_dirpath, 'make.log'), 'w') as make_out, \
                open(os.path.join(tool_src_dirpath, 'make.err'), 'w') as make_err:
            return_code = qutils.call_subprocess(
                ['make', '-C', tool_src_dirpath],
                stdout=make_out,
                stderr=make_err,
                indent=' ')
        if return_code != 0 or not os.path.isfile(tool_exec_fpath):
            logger.error(
                "Failed to compile GlimmerHMM (" + tool_src_dirpath +
                ")!\nTry to compile it manually or do not use --gene-finding "
                "option with --eukaryote.\nUse --debug option to see the command lines.")
            return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    # At least one job, so an empty input list does not request zero workers.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_genes)(index, contigs_fpath, gene_lengths, out_dirpath,
                               tool_dirpath, tmp_dirpath)
        for index, contigs_fpath in enumerate(contigs_fpaths))

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        unique, cnt = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if cnt is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, cnt)
        if unique is None and cnt is None:
            # Format the label into the first part only: '%' binds tighter
            # than '+', so the old expression raised TypeError here.
            hint = ('Run with the --debug option'
                    ' to see the command line.' if not qconfig.debug else '')
            logger.error(
                'Glimmer failed running Glimmer for %s. ' % qutils.label_from_fpath(contigs_fpath)
                + hint)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template):
    """Split one assembly's contigs into per-reference FASTA files.

    Reads the alignments file for the assembly (``alignments_fpath_template``
    formatted with the assembly label), if it exists. Every line whose first
    token is a key of ``contigs_analyzer.ref_labels_by_chromosomes`` names the
    contigs (remaining tokens) belonging to that reference; those contigs are
    appended to ``<label>_to_<ref_name[:40]>.fasta`` in ``corrected_dirpath``.
    Contigs seen while processing those lines but matched to no reference are
    written to ``<label>_not_aligned_anywhere.fasta``.

    Args:
        asm: assembly object providing ``fpath`` and ``label``.
        assemblies_by_ref: mapping from reference name to a list of
            assemblies; extended in place for references already present.
        corrected_dirpath: directory for the generated FASTA files.
        alignments_fpath_template: ``%``-template taking the assembly label.

    Returns:
        ``(assemblies_by_ref, not_aligned_asm)``.
    """
    assembly_label = qutils.label_from_fpath(asm.fpath)
    logger.info(' ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)

    alignments_fpath = alignments_fpath_template % assembly_label
    if os.path.exists(alignments_fpath):
        # 'with' closes the alignments file; it was previously left open.
        with open(alignments_fpath) as alignments_file:
            for line in alignments_file:
                values = line.split()
                # Blank lines used to raise IndexError on values[0].
                if not values:
                    continue
                if values[0] not in contigs_analyzer.ref_labels_by_chromosomes:
                    continue

                ref_name = contigs_analyzer.ref_labels_by_chromosomes[values[0]]
                # Set membership instead of scanning a list for every contig.
                ref_contigs_names = set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath, assembly_label + '_to_' + ref_name[:40] + '.fasta')
                if ref_name not in aligned_contigs_for_each_ref:
                    aligned_contigs_for_each_ref[ref_name] = []

                for (cont_name, seq) in contigs_seq:
                    if cont_name not in contigs:
                        contigs[cont_name] = seq

                    if (cont_name in ref_contigs_names
                            and cont_name not in aligned_contigs_for_each_ref[ref_name]):
                        # Collecting all aligned contigs names in order to further extract not-aligned
                        aligned_contig_names.add(cont_name)
                        aligned_contigs_for_each_ref[ref_name].append(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                # NOTE(review): nesting reconstructed from whitespace-collapsed
                # source; the name is recorded only when the assembly was
                # actually appended. Confirm against the original layout.
                ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                if ref_asm.name not in added_ref_asm:
                    if ref_name in assemblies_by_ref:
                        assemblies_by_ref[ref_name].append(ref_asm)
                        added_ref_asm.append(ref_asm.name)

    # Extraction of not aligned contigs
    all_contigs_names = set(contigs.keys())
    not_aligned_contigs_names = all_contigs_names - aligned_contig_names
    fastaparser.write_fasta(
        not_aligned_fpath,
        [(name, contigs[name]) for name in not_aligned_contigs_names])

    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
def save_features_in_contigs(output_dirpath, contigs_fpaths, feature_name, features_in_contigs, ref_features_num):
    """Store the amounts of one feature type per assembly via ``save``.

    The target path is ``output_dirpath + prefix_fn + feature_name +
    in_contigs_suffix_fn`` (plain string concatenation).

    Args:
        output_dirpath: leading part of the target path.
        contigs_fpaths: assembly paths; their labels go under ``'filenames'``.
        feature_name: part of the file name and of two stored keys.
        features_in_contigs: mapping from assembly path to feature amounts;
            stored re-keyed by assembly label.
        ref_features_num: stored under ``'ref_<feature_name>_number'``.

    Returns:
        The value returned by ``save``.
    """
    target_fpath = output_dirpath + prefix_fn + feature_name + in_contigs_suffix_fn

    # A list comprehension rather than map(): map() returns a lazy iterator
    # on Python 3. NOTE(review): save() presumably serializes this payload;
    # confirm. The value is unchanged on Python 2.
    payload = {
        'filenames': [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths],
        feature_name + '_in_contigs': dict(
            (qutils.label_from_fpath(fpath), amounts)
            for (fpath, amounts) in features_in_contigs.items()),
        'ref_' + feature_name + '_number': ref_features_num,
    }
    return save(target_fpath, payload)
def do(contigs_fpaths, gene_lengths, out_dirpath):
    """Predict genes with GlimmerHMM for all assemblies and fill the reports.

    When the ``glimmerhmm`` executable is missing it is built with ``make``;
    a failed build is logged as an error and ends the function.

    Args:
        contigs_fpaths: paths of the assembly FASTA files.
        gene_lengths: forwarded unchanged to ``predict_genes``.
        out_dirpath: output directory, created if missing; its ``tmp``
            sub-directory is removed at the end unless ``qconfig.debug`` is set.

    Returns:
        None. Counts are stored through ``reporting.get(path).add_field``.
    """
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tool_src_dirpath = os.path.join(tool_dirpath, 'src')
    tool_exec_fpath = os.path.join(tool_dirpath, 'glimmerhmm')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')

    if not os.path.isfile(tool_exec_fpath):
        # making
        logger.main_info("Compiling GlimmerHMM...")
        make_log_fpath = os.path.join(tool_src_dirpath, 'make.log')
        make_err_fpath = os.path.join(tool_src_dirpath, 'make.err')
        # Context managers close the build logs, which used to stay open.
        with open(make_log_fpath, 'w') as make_log, open(make_err_fpath, 'w') as make_err:
            return_code = qutils.call_subprocess(
                ['make', '-C', tool_src_dirpath],
                stdout=make_log,
                stderr=make_err,
                indent=' ')
        if return_code != 0 or not os.path.isfile(tool_exec_fpath):
            logger.error("Failed to compile GlimmerHMM (" + tool_src_dirpath +
                         ")!\nTry to compile it manually or do not use --gene-finding "
                         "option with --eukaryote.\nUse --debug option to see the command lines.")
            return

    for dirpath in (out_dirpath, tmp_dirpath):
        if not os.path.isdir(dirpath):
            os.makedirs(dirpath)

    # Never ask for zero workers, which is what an empty input list produced.
    n_jobs = max(1, min(len(contigs_fpaths), qconfig.max_threads))
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(delayed(predict_genes)(
        index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
        for index, contigs_fpath in enumerate(contigs_fpaths))

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        unique, cnt = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if cnt is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, cnt)
        if unique is None and cnt is None:
            # The label is formatted into the first part before the hint is
            # appended; the old '+ (...) % label' raised TypeError because
            # '%' was applied to the placeholder-free hint.
            message = 'Glimmer failed running Glimmer for %s. ' % qutils.label_from_fpath(contigs_fpath)
            if not qconfig.debug:
                message += 'Run with the --debug option' ' to see the command line.'
            logger.error(message)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
def save_colors(results_dirpath, contigs_fpaths, dict_colors, meta=False):
    """Publish the plot colors of the assemblies to the HTML report.

    With ``meta`` set, the ``{{ colors }}`` placeholder in the report file
    (``report_fname`` inside ``results_dirpath``) is replaced by
    ``standard_colors`` and the file is rewritten. Otherwise each assembly's
    color is translated to its HTML counterpart, saved through
    ``json_saver.save_colors`` and attached with ``append``.

    Args:
        results_dirpath: directory of the report.
        contigs_fpaths: assembly paths (unused when ``meta`` is true).
        dict_colors: mapping from assembly label to a sequence whose first
            item is the color (unused when ``meta`` is true).
        meta: selects the placeholder-substitution mode.
    """
    from libs import plotter

    if not meta:
        line_colors = [
            dict_colors[qutils.label_from_fpath(contigs_fpath)][0]
            for contigs_fpath in contigs_fpaths
        ]
        # An HTML color shares its index with the plotter color it mirrors.
        colors_for_html = [
            html_colors[plotter.colors.index(line_color)]
            for line_color in line_colors
        ]
        json_fpath = json_saver.save_colors(results_dirpath, colors_for_html)
        append(results_dirpath, json_fpath, "colors")
        return

    html_fpath = os.path.join(results_dirpath, report_fname)
    with open(html_fpath) as f_html:
        original_text = f_html.read()
    placeholder = "{{ " + "colors" + " }}"
    updated_text = re.sub(placeholder, "standard_colors", original_text)
    with open(html_fpath, "w") as f_html:
        f_html.write(updated_text)
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template):
    """Distribute the contigs of one assembly over per-reference FASTA files.

    The alignments file (``alignments_fpath_template % label``) is read when
    it exists. A line whose first token is a key of
    ``contigs_analyzer.ref_labels_by_chromosomes`` lists, in its remaining
    tokens, contigs belonging to the mapped reference; each such contig is
    appended once per reference to ``<label>_to_<ref_name[:40]>.fasta``.
    Contigs collected during that pass that matched no reference end up in
    ``<label>_not_aligned_anywhere.fasta``.

    Args:
        asm: assembly object providing ``fpath`` and ``label``.
        assemblies_by_ref: mapping from reference name to a list of
            assemblies; updated in place for reference names it already has.
        corrected_dirpath: directory where the FASTA files are written.
        alignments_fpath_template: ``%``-template taking the assembly label.

    Returns:
        ``(assemblies_by_ref, not_aligned_asm)``.
    """
    assembly_label = qutils.label_from_fpath(asm.fpath)
    logger.info(' ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)

    alignments_fpath = alignments_fpath_template % assembly_label
    if os.path.exists(alignments_fpath):
        # The file handle is now closed deterministically.
        with open(alignments_fpath) as alignments_file:
            for line in alignments_file:
                values = line.split()
                # Guard against blank lines, which raised IndexError before.
                if not values or values[0] not in contigs_analyzer.ref_labels_by_chromosomes:
                    continue

                ref_name = contigs_analyzer.ref_labels_by_chromosomes[values[0]]
                # A set gives constant-time membership tests in the loop below.
                ref_contigs_names = set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath, assembly_label + '_to_' + ref_name[:40] + '.fasta')
                already_written = aligned_contigs_for_each_ref.setdefault(ref_name, [])

                for (cont_name, seq) in contigs_seq:
                    if cont_name not in contigs:
                        contigs[cont_name] = seq

                    if cont_name in ref_contigs_names and cont_name not in already_written:
                        # Collecting all aligned contigs names in order to further extract not-aligned
                        aligned_contig_names.add(cont_name)
                        already_written.append(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                # NOTE(review): nesting reconstructed from whitespace-collapsed
                # source; the name is remembered only when the assembly was
                # appended. Confirm against the original layout.
                ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                if ref_asm.name not in added_ref_asm:
                    if ref_name in assemblies_by_ref:
                        assemblies_by_ref[ref_name].append(ref_asm)
                        added_ref_asm.append(ref_asm.name)

    # Extraction of not aligned contigs
    all_contigs_names = set(contigs.keys())
    not_aligned_contigs_names = all_contigs_names - aligned_contig_names
    fastaparser.write_fasta(
        not_aligned_fpath,
        [(name, contigs[name]) for name in not_aligned_contigs_names])

    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
def _handle_fasta(contigs_fpath, corr_fpath, reporting):
    """Validate and correct an assembly file, then store threshold statistics.

    Args:
        contigs_fpath: path of the input FASTA file.
        corr_fpath: output path given to ``correct_fasta``; also the report
            key and the source of the label in the warning.
        reporting: object providing ``get(fpath)`` and ``Fields``.

    Returns:
        True on success. False when the contigs of at least
        ``qconfig.min_contig`` bp sum to zero length, or when
        ``correct_fasta`` returns a false value.
    """
    lengths = fastaparser.get_lengths_from_fastafile(contigs_fpath)

    long_enough = [length for length in lengths if length >= qconfig.min_contig]
    if not sum(long_enough):
        label = qutils.label_from_fpath(corr_fpath)
        logger.warning("Skipping %s because it doesn't contain contigs >= %d bp."
                       % (label, qconfig.min_contig))
        return False

    # correcting
    if not correct_fasta(contigs_fpath, corr_fpath, qconfig.min_contig):
        return False

    report = reporting.get(corr_fpath)

    def lengths_from(threshold):
        # Lengths of the contigs that reach the given threshold.
        return [length for length in lengths if length >= threshold]

    # Number of contigs per configured threshold.
    report.add_field(
        reporting.Fields.CONTIGS__FOR_THRESHOLDS,
        [len(lengths_from(threshold)) for threshold in qconfig.contig_thresholds])
    # Total length of those contigs per configured threshold.
    report.add_field(
        reporting.Fields.TOTALLENS__FOR_THRESHOLDS,
        [sum(lengths_from(threshold)) for threshold in qconfig.contig_thresholds])
    return True
def save_colors(results_dirpath, contigs_fpaths, dict_colors, meta=False):
    """Write the assembly colors used by the plots into the HTML report.

    Args:
        results_dirpath: directory of the report.
        contigs_fpaths: assembly paths; read only when ``meta`` is false.
        dict_colors: mapping from assembly label to a sequence whose first
            item is the color; read only when ``meta`` is false.
        meta: when true, the ``{{ colors }}`` placeholder of the report file
            (``report_fname`` in ``results_dirpath``) is replaced by
            ``standard_colors`` and nothing else is done. When false, the
            HTML equivalents of the colors are saved with
            ``json_saver.save_colors`` and attached with ``append``.
    """
    from libs import plotter

    if meta:
        html_fpath = os.path.join(results_dirpath, report_fname)
        with open(html_fpath) as f_html:
            report_text = f_html.read()
        report_text = re.sub('{{ ' + 'colors' + ' }}', 'standard_colors', report_text)
        with open(html_fpath, 'w') as f_html:
            f_html.write(report_text)
        return

    stored_entries = []
    for contigs_fpath in contigs_fpaths:
        stored_entries.append(dict_colors[qutils.label_from_fpath(contigs_fpath)])

    # Translate through the shared position of the color in plotter.colors.
    colors_for_html = []
    for entry in stored_entries:
        colors_for_html.append(html_colors[plotter.colors.index(entry[0])])

    json_fpath = json_saver.save_colors(results_dirpath, colors_for_html)
    append(results_dirpath, json_fpath, 'colors')
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Compute basic statistics for every assembly and store them in the reports.

    Per assembly this records N50/L50, N75/L75, contig count, largest contig,
    total length, GC content and N counts; NG50/LG50 and NG75/LG75 are added
    when a reference length is known (from ``ref_fpath`` or
    ``qconfig.estimated_reference_size``). Lengths and GC data are also
    passed to the JSON/HTML savers and, when ``qconfig.draw_plots`` is set,
    to the plotter.

    Args:
        ref_fpath: reference FASTA path, or a false value when absent.
        contigs_fpaths: paths of the assembly FASTA files.
        output_dirpath: directory for the plots; created if missing.
        json_output_dir: JSON output directory; JSON saving is skipped when
            it is a false value.
        results_dir: directory handed to the HTML saver.

    Returns:
        None.
    """
    logger.print_timestamp()
    logger.info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info(' Reference genome:')
        logger.info(' ' + os.path.basename(ref_fpath) + ', Reference length = ' + str(reference_length) +
                    ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info(' Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(index) + assembly_label)
        # One pass over the file collects contig lengths and the 'N' count.
        list_of_length = []
        number_of_Ns = 0
        for (_name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, lists_of_lengths)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, lists_of_lengths)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    import N50
    # Built-in zip replaces itertools.izip: same pairs, no Python 2-only API.
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath)
        list_of_GC_distributions.append(GC_distribution)
        # Computed once; used for both the log line and the report field.
        ns_per_100_kbp = float(number_of_Ns) * 100000.0 / float(total_length)
        logger.info(' ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' + ' %.2f' % ns_per_100_kbp)

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
        report.add_field(reporting.Fields.TOTALLEN, total_length)
        # 'is not None' matches the log line above; a plain truthiness test
        # stored a GC content of exactly 0.0 as None.
        report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
        report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
        report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % ns_per_100_kbp))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.draw_plots:
        import plotter
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                output_dirpath + '/cumulative_plot', 'Cumulative length')

        ########################################################################
        # Drawing GC content plot...
        # Copy first so the reference entry is not appended to the
        # per-assembly list itself (the two names used to alias one list).
        list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
        if ref_fpath:
            list_of_GC_distributions_with_ref.append(reference_GC_distribution)
        plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                output_dirpath + '/GC_content_plot')

        ########################################################################
        # Drawing Nx and NGx plots...
        plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/Nx_plot', 'Nx', [])
        if reference_length:
            plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot', 'NGx',
                            [reference_length for _ in range(len(contigs_fpaths))])

    logger.info('Done.')
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Predict genes with ``glimmerhmm``, one contig at a time.

    Every contig of ``fasta_fpath`` is written to a separate FASTA file in a
    new temporary directory below ``tmp_dir`` and passed to the tool; GFF
    files from runs that returned 0 are merged into
    ``out_fpath + '_genes.gff'``.

    Args:
        tool_dir: directory with the ``glimmerhmm`` executable and the
            ``trained/arabidopsis`` model directory.
        fasta_fpath: assembly FASTA file.
        out_fpath: prefix of the merged GFF and, when ``OUTPUT_FASTA`` is set,
            of the genes FASTA file.
        gene_lengths: thresholds; ``cnt`` has one counter per threshold.
        err_path: log file receiving the tool's stdout and stderr (appended).
        tmp_dir: parent directory of the temporary working directory.
        index: position of the assembly, used for the subprocess log indent.

    Returns:
        ``(out_gff_path, number_of_unique_sequences, total, cnt)`` or
        ``(None, None, None, None)`` when no contig could be processed.
    """
    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    def run(contig_path, tmp_path):
        # Appending keeps the output of all contigs in one log file.
        with open(err_path, 'a') as err_file:
            return_code = qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent=' ' + qutils.index_to_str(index) + ' ')
        return return_code

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for ind, seq in read_fasta(fasta_fpath):
        contig_path = os.path.join(base_dir, ind + '.fasta')
        gff_path = os.path.join(base_dir, ind + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # The label must be formatted into the first part only; with the old
        # '+ (...) % label' the '%' applied to the hint and raised TypeError.
        message = 'Glimmer failed running Glimmer for %s. ' % qutils.label_from_fpath(fasta_fpath)
        if not qconfig.debug:
            message += 'Run with the --debug option' ' to see the command line.'
        logger.error(message)
        return None, None, None, None

    out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')
    unique, total = set(), 0
    genes = []
    cnt = [0] * len(gene_lengths)
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        if strand == '+':
            gene_seq = contigs[contig][start:end + 1]
        else:
            gene_seq = rev_comp(contigs[contig][start:end + 1])
        unique.add(gene_seq)
        # NOTE(review): all genes, including repeated sequences, are kept for
        # the FASTA output; confirm against the original indentation.
        genes.append((gene_id, gene_seq))
        for idx, gene_length in enumerate(gene_lengths):
            cnt[idx] += end - start > gene_length

    if OUTPUT_FASTA:
        out_fasta_path = out_fpath + '_genes.fasta'
        write_fasta(out_fasta_path, genes)

    if not qconfig.debug:
        shutil.rmtree(base_dir)

    return out_gff_path, len(unique), total, cnt
def do(ref_fpath, contigs_fpaths, output_dirpath):
    """Run GAGE on every assembly and copy its metrics into the reports.

    ``run_gage`` is executed in parallel for all assemblies. If none of the
    runs returned 0, a warning is logged and the function returns. Otherwise
    each ``gage_<name>.stdout`` log is scanned for the metric names in order,
    and the value following the colon is stored in the matching report field.

    Args:
        ref_fpath: reference path handed to ``run_gage``.
        contigs_fpaths: paths of the assembly FASTA files.
        output_dirpath: parent of the ``gage`` results directory; also passed
            to ``reporting.save_gage``.

    Returns:
        None.
    """
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(qconfig.LIBS_LOCATION, 'gage', 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.info('Running GAGE...')

    # Names are searched in this order; repeated names ('Total units', 'Min',
    # 'Max', 'N50') are told apart only by their position in the sequence.
    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy',
               'SNPs', 'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    metrics_in_reporting = [
        reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
        reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
        reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
        reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES,
        reporting.Fields.GAGE_MISSINGASMBLYBASES, reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
        reporting.Fields.GAGE_DUPREFBASES, reporting.Fields.GAGE_COMPRESSEDREFBASES,
        reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY,
        reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS,
        reporting.Fields.GAGE_LONGINDELS, reporting.Fields.GAGE_INVERSIONS,
        reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION,
        reporting.Fields.GAGE_NUMCORCONTIGS, reporting.Fields.GAGE_CORASMBLYSIZE,
        reporting.Fields.GAGE_MINCORCONTIG, reporting.Fields.GAGE_MAXCORCOTING,
        reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(
        delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path,
                          ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    # Give up only when not a single run succeeded.
    if 0 not in return_codes:
        logger.warning('Error occurred while GAGE was processing assemblies.'
                       ' See GAGE error logs for details: %s' %
                       os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for i, contigs_fpath in enumerate(contigs_fpaths):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stdout')
        # 'with' closes the log even if parsing a line raises.
        with open(log_out_fpath, 'r') as logfile_out:
            cur_metric_id = 0
            for line in logfile_out:
                metric = metrics[cur_metric_id]
                if metric not in line:
                    continue
                if metric.startswith('N50'):
                    # N50 lines are split on the full 'N50:' marker, not the first colon.
                    value = line.split(metric + ':')[1].strip()
                else:
                    value = line.split(':')[1].strip()
                report.add_field(metrics_in_reporting[cur_metric_id], value)
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.info('Done.')
def js_data_gen(assemblies, contigs_fpaths, chr_names, chromosomes_length, output_dir_path, cov_fpath, ref_fpath,
                genome_size):
    """Write the JS data files and HTML pages of the contig alignment browser.

    For every plotted chromosome (or group of chromosomes) a ``data_<name>.js``
    and a ``_<name>.html`` file are written to the ``alignment_plots_dirname``
    subdirectory of ``output_dir_path``; ``alignment_summary.html`` is written
    to ``output_dir_path`` itself and the static CSS/JS assets are copied next
    to the per-chromosome pages.  ``<name>`` is the first 30 characters of the
    chromosome name.

    Args:
        assemblies: object whose ``assemblies`` attribute is iterable; each
            item exposes ``alignments`` with ``ref_name``, ``name``, ``label``,
            ``similar``, ``misassembled``, ``misassembled_structure``,
            ``unshifted_start`` and ``unshifted_end``.
        contigs_fpaths: paths of the assemblies; one placeholder ('FICTIVE')
            alignment per assembly is created for every chromosome.
        chr_names: names of the reference sequences.
        chromosomes_length: mapping from reference sequence name to length.
        output_dir_path: directory that receives the summary page.
        cov_fpath: optional whitespace-separated coverage file.  Columns 0, 1
            and 2 are read -- presumably sequence name, position and depth
            (TODO confirm against the producer of this file).
        ref_fpath: not used by this function.
        genome_size: total size used to decide whether all sequences are shown
            on a single combined plot.

    Returns:
        None.
    """
    # Seed every chromosome with one placeholder alignment per assembly.
    chr_to_aligned_blocks = dict()
    for chr in chr_names:
        chr_init = []
        for fpath in contigs_fpaths:
            f = Alignment('FICTIVE', 0, 0, 0, 0, False, 0, 0, None)
            f.label = qutils.label_from_fpath(fpath)
            f.unshifted_start = 0
            f.unshifted_end = 0
            chr_init.append(f)
        chr_to_aligned_blocks.setdefault(chr, chr_init)
    for assembly in assemblies.assemblies:
        for align in assembly.alignments:
            chr_to_aligned_blocks[align.ref_name].append(align)

    summary_fname = 'alignment_summary.html'
    summary_path = os.path.join(output_dir_path, summary_fname)
    output_all_files_dir_path = os.path.join(output_dir_path, alignment_plots_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)

    # Decide which plots exist: one per reference label, one combined plot,
    # or one per reference sequence.
    import contigs_analyzer
    if contigs_analyzer.ref_labels_by_chromosomes:
        contig_names_by_refs = contigs_analyzer.ref_labels_by_chromosomes
        chr_full_names = list(set([contig_names_by_refs[contig] for contig in chr_names]))
    elif genome_size < MAX_SIZE_FOR_COMB_PLOT and len(chr_names) >= MIN_CONTIGS_FOR_COMB_PLOT:
        chr_full_names = [NAME_FOR_ONE_PLOT]
    else:
        chr_full_names = chr_names

    if cov_fpath:
        cov_data = dict()
        not_covered = dict()
        cur_len = dict()
        with open(cov_fpath, 'r') as coverage:
            # Map every reference sequence to the plot it belongs to.
            contig_to_chr = {}
            for chr in chr_full_names:
                cov_data.setdefault(chr, [])
                not_covered.setdefault(chr, [])
                cur_len.setdefault(chr, 0)
                if contigs_analyzer.ref_labels_by_chromosomes:
                    contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
                elif chr == NAME_FOR_ONE_PLOT:
                    contigs = chr_names
                else:
                    contigs = [chr]
                for contig in contigs:
                    contig_to_chr[contig] = chr
            for index, line in enumerate(coverage):
                c = list(line.split())
                name = contig_to_chr[qutils.correct_name(c[0])]
                cur_len[name] += int(c[2])
                # Every 100th input line the accumulated value is stored
                # (divided by 100) and the accumulator is reset.
                if index % 100 == 0 and index > 0:
                    cov_data[name].append(cur_len[name] / 100)
                    cur_len[name] = 0
                if c[2] == '0':
                    not_covered[name].append(c[1])

    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    aligned_bases_by_chr = {}
    num_misassemblies = {}
    aligned_assemblies = {}

    # NOTE: several format strings below are filled with **locals(), so the
    # local names they mention (chr, contig, l, alignment, el, ...) must not be
    # renamed without updating those strings.
    for chr in chr_full_names:
        short_chr = chr[:30]
        num_misassemblies[chr] = 0
        aligned_bases_by_chr[chr] = []
        aligned_assemblies[chr] = []
        with open(os.path.join(output_all_files_dir_path, 'data_%s.js' % short_chr), 'w') as result:
            result.write('"use strict";\n')
            if contigs_analyzer.ref_labels_by_chromosomes:
                contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
                result.write('var links_to_chromosomes = {};\n')
                links_to_chromosomes = []
                used_chromosomes = []
            elif chr == NAME_FOR_ONE_PLOT:
                contigs = chr_names
            else:
                contigs = [chr]
            chr_size = sum([chromosomes_length[contig] for contig in contigs])
            chr_sizes[chr] = chr_size
            num_contigs[chr] = len(contigs)
            for contig in contigs:
                aligned_bases_by_chr[chr].extend(aligned_bases[contig])
            data_str = 'var chromosomes_len = {};\n'
            for contig in contigs:
                l = chromosomes_length[contig]
                data_str += 'chromosomes_len["{contig}"] = {l};\n'.format(**locals())
            result.write(data_str)

            # adding assembly data
            data_str = 'var contig_data = {};\n'
            data_str += 'contig_data["{chr}"] = [ '.format(**locals())
            prev_len = 0
            # Leading 0 so that chr_lengths[k] is the length of the sequence
            # preceding contigs[k].
            chr_lengths = [0] + [chromosomes_length[contig] for contig in contigs]
            for num_contig, contig in enumerate(contigs):
                if num_contig > 0:
                    prev_len += chr_lengths[num_contig]
                if len(chr_to_aligned_blocks[contig]) > 0:
                    for alignment in chr_to_aligned_blocks[contig]:
                        if alignment.misassembled:
                            num_misassemblies[chr] += 1
                        # Coordinates shifted by the sequences placed before
                        # this one on the plot.
                        corr_start = prev_len + alignment.unshifted_start
                        corr_end = prev_len + alignment.unshifted_end
                        data_str += ('{{name: "{alignment.name}", corr_start: {corr_start}, corr_end: {corr_end},'
                                     'start: {alignment.unshifted_start}, end: {alignment.unshifted_end}, assembly: "{alignment.label}", similar: "{alignment.similar}", misassembled: "{alignment.misassembled}" ').format(**locals())
                        if alignment.name != 'FICTIVE':
                            if len(aligned_assemblies[chr]) < len(contigs_fpaths) and \
                                    alignment.label not in aligned_assemblies[chr]:
                                aligned_assemblies[chr].append(alignment.label)
                            data_str += ', structure: ['
                            for el in alignment.misassembled_structure:
                                if isinstance(el, list):
                                    if el[5] in contigs:
                                        num_chr = contigs.index(el[5])
                                        corr_len = sum(chr_lengths[:num_chr + 1])
                                    else:
                                        # The block lies on a sequence that is
                                        # not part of this plot.
                                        corr_len = -int(el[1])
                                        if contigs_analyzer.ref_labels_by_chromosomes and \
                                                el[5] not in used_chromosomes:
                                            used_chromosomes.append(el[5])
                                            new_chr = contig_names_by_refs[el[5]]
                                            links_to_chromosomes.append(
                                                'links_to_chromosomes["{el[5]}"] = "{new_chr}";\n'.format(**locals()))
                                    corr_start = corr_len + int(el[0])
                                    corr_end = corr_len + int(el[1])
                                    data_str += '{{type: "A", corr_start: {corr_start}, corr_end: {corr_end}, start: {el[0]}, end: {el[1]}, start_in_contig: {el[2]}, end_in_contig: {el[3]}, IDY: {el[4]}, chr: "{el[5]}"}},'.format(**locals())
                                elif isinstance(el, str):
                                    data_str += '{{type: "M", mstype: "{el}"}},'.format(**locals())
                            # Drop the trailing comma of the last structure
                            # element, if there was any element at all.
                            if data_str[-1] == '[':
                                data_str = data_str + ']},'
                            else:
                                data_str = data_str[:-1] + ']},'
                        else:
                            data_str += '},'
            # Replace the last character (trailing comma, or the space after
            # '[' when nothing was added) with the closing bracket.
            data_str = data_str[:-1] + '];\n\n'
            result.write(data_str)
            if contigs_analyzer.ref_labels_by_chromosomes:
                result.write(''.join(links_to_chromosomes))
            if cov_fpath:
                # adding coverage data
                # The declaration is always emitted so the generated script
                # never refers to an undeclared variable.
                result.write('var coverage_data = {};\n')
                if cov_data[chr]:
                    result.write('coverage_data["{chr}"] = [ '.format(**locals()))
                    # join() produces the comma-separated list without a
                    # trailing comma, whatever the length of the data.
                    result.write(','.join([str(e) for e in cov_data[chr]]))
                    result.write('];\n')

                result.write('var not_covered = {};\n')
                result.write('not_covered["{chr}"] = [ '.format(**locals()))
                result.write(','.join([str(e) for e in not_covered[chr]]))
                result.write('];\n')

        with open(html_saver.get_real_path('_chr_templ.html'), 'r') as template:
            with open(os.path.join(output_all_files_dir_path, '_{short_chr}.html'.format(**locals())), 'w') as result:
                for line in template:
                    if line.find('<script type="text/javascript" src=""></script>') != -1:
                        # Point the empty script placeholder at the data file.
                        result.write('<script type="text/javascript" src="data_{short_chr}.js"></script>\n'.format(**locals()))
                    else:
                        result.write(line)
                        if line.find('<body>') != -1:
                            chr_size = chr_sizes[chr]
                            chr_name = chr.replace('_', ' ')
                            if len(chr_name) > 50:
                                chr_name = chr_name[:50] + '...'
                            title = 'CONTIG ALIGNMENT BROWSER: %s (' % chr_name + \
                                    ('%s fragments, ' % num_contigs[chr] if num_contigs[chr] > 1 else '') + \
                                    '%s bp)' % format_long_numbers(chr_size)
                            result.write('<div class = "block title"><a href="../{summary_fname}"><button class="back_button">↵</button></a>{title}</div>\n'.format(**locals()))
                        if line.find('<script type="text/javascript">') != -1:
                            chromosome = '","'.join(contigs)
                            result.write('var CHROMOSOME = "{chr}";\n'.format(**locals()))
                            result.write('var chrContigs = ["{chromosome}"];\n'.format(**locals()))

    with open(html_saver.get_real_path('alignment_summary_templ.html'), 'r') as template:
        with open(summary_path, 'w') as result:
            num_aligned_assemblies = [len(aligned_assemblies[chr]) for chr in chr_full_names]
            # The per-chromosome "# assemblies" column is shown only when the
            # counts differ between chromosomes.
            is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
            for line in template:
                result.write(line)
                if line.find('<!--- assemblies: ---->') != -1:
                    if not is_unaligned_asm_exists:
                        result.write('<div class="subtitle"># assemblies: %s</div>' % len(contigs_fpaths))
                if line.find('<!--- th_assemblies: ---->') != -1:
                    if is_unaligned_asm_exists:
                        result.write('<th># assemblies</th>')
                if line.find('<!--- references: ---->') != -1:
                    for chr in sorted(chr_full_names):
                        result.write('<tr>')
                        short_chr = chr[:30]
                        chr_link = os.path.join(alignment_plots_dirname, '_{short_chr}.html'.format(**locals()))
                        chr_name = chr.replace('_', ' ')
                        aligned_lengths = [aligned_len for aligned_len in aligned_bases_by_chr[chr]
                                           if aligned_len is not None]
                        # Guard against an empty reference or an empty
                        # assembly list instead of dividing by zero.
                        total_bases = chr_sizes[chr] * len(contigs_fpaths)
                        chr_genome = sum(aligned_lengths) * 100.0 / total_bases if total_bases else 0.0
                        chr_size = chr_sizes[chr]
                        result.write('<td><a href="%s">%s</a></td>' % (chr_link, chr_name))
                        result.write('<td>%s</td>' % num_contigs[chr])
                        result.write('<td>%s</td>' % format_long_numbers(chr_size))
                        if is_unaligned_asm_exists:
                            result.write('<td>%s</td>' % len(aligned_assemblies[chr]))
                        result.write('<td>%.3f</td>' % chr_genome)
                        result.write('<td>%s</td>' % num_misassemblies[chr])
                        result.write('</tr>')

    # Static assets used by the generated pages.
    copyfile(html_saver.get_real_path(os.path.join('static', 'contig_alignment_plot.css')),
             os.path.join(output_all_files_dir_path, 'contig_alignment_plot.css'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'd3.js')),
             os.path.join(output_all_files_dir_path, 'd3.js'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'scripts', 'contig_alignment_plot_script.js')),
             os.path.join(output_all_files_dir_path, 'contig_alignment_plot_script.js'))
def main(args):
    """Parse the command line and run the whole QUAST pipeline.

    Args:
        args: command-line arguments without the program name.

    Returns:
        0 after a complete run, 4 when no assembly file is left after contig
        correction.  Usage, help, version and option errors terminate the
        process through ``sys.exit`` or ``logger.error(..., exit_with_code=N)``
        instead of returning.
    """
    if ' ' in qconfig.QUAST_HOME:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    # First pass: expand --test/--test-sv into real options and handle the
    # options that end the run immediately.  A copy is iterated because
    # `options` is modified inside the loop.
    for opt, arg in options[:]:
        if opt == '--test' or opt == '--test-sv':
            options.remove((opt, arg))
            options += [('-o', 'quast_test_output'),
                        ('-R', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reference.fasta.gz')),  # for compiling MUMmer
                        ('-O', os.path.join(qconfig.QUAST_HOME, 'test_data', 'operons.gff')),
                        ('-G', os.path.join(qconfig.QUAST_HOME, 'test_data', 'genes.gff')),
                        ('--gage', ''),  # for compiling GAGE Java classes
                        ('--gene-finding', ''), ('--eukaryote', ''), ('--glimmer', '')]  # for compiling GlimmerHMM
            if opt == '--test-sv':
                options += [('-1', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reads1.fastq.gz')),
                            ('-2', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reads2.fastq.gz'))]
            contigs_fpaths += [os.path.join(qconfig.QUAST_HOME, 'test_data', 'contigs_1.fasta'),
                               os.path.join(qconfig.QUAST_HOME, 'test_data', 'contigs_2.fasta')]
            qconfig.test = True

        if opt.startswith('--help') or opt == '-h':
            qconfig.usage(opt == "--help-hidden", short=False)
            sys.exit(0)

        elif opt.startswith('--version') or opt == '-v':
            qconfig.print_version()
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False
    qconfig.is_combined_ref = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []
    bed_fpath = None
    reads_fpath_f = ''
    reads_fpath_r = ''

    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-d', '--debug'):
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        elif opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False
            if ' ' in output_dirpath:
                logger.error('QUAST does not support spaces in paths. \n'
                             'You have specified ' + str(output_dirpath) + ' as an output path.\n'
                             'Please, use a different directory.\n',
                             to_stderr=True,
                             exit_with_code=3)

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt == "--contig-thresholds":
            qconfig.contig_thresholds = arg

        elif opt in ('-m', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-t', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--min-cluster"):
            qconfig.min_cluster = int(arg)

        elif opt in ('-i', "--min-alignment"):
            qconfig.min_alignment = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt == "--gene-thresholds":
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt == '--err-fpath':  # for web-quast
            qconfig.save_error = True
            qconfig.error_log_fname = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            # Values other than these three are ignored.
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt == "--strict-NA":
            qconfig.strict_NA = True

        elif opt in ('-x', "--extensive-mis-size"):
            if int(arg) <= qconfig.MAX_INDEL_LENGTH:
                logger.error("--extensive-mis-size should be greater than maximum indel length (%d)!"
                             % qconfig.MAX_INDEL_LENGTH, 1, to_stderr=True)
            qconfig.extensive_misassembly_threshold = int(arg)

        elif opt == '--no-snps':
            qconfig.show_snps = False

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt == '--no-check':
            qconfig.no_check = True

        elif opt == '--no-gc':
            qconfig.no_gc = True

        elif opt == '--fast':  # --no-gc, --no-plots, --no-snps
            # qconfig.no_check is deliberately left untouched here:
            # too risky to include.
            qconfig.no_gc = True
            qconfig.show_snps = False
            qconfig.draw_plots = False
            qconfig.html_report = False

        elif opt == '--plots-format':
            if arg.lower() in qconfig.supported_plot_extensions:
                qconfig.plot_extension = arg.lower()
            else:
                logger.error('Format "%s" is not supported. Please, use one of the supported formats: %s.' %
                             (arg, ', '.join(qconfig.supported_plot_extensions)),
                             to_stderr=True, exit_with_code=2)

        elif opt == '--meta':
            qconfig.meta = True

        elif opt == '--no-check-meta':
            qconfig.no_check = True
            qconfig.no_check_meta = True

        elif opt == '--references-list':
            pass

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True

        elif opt == '--glimmer':
            qconfig.glimmer = True

        elif opt == '--combined-ref':
            qconfig.is_combined_ref = True

        elif opt == '--memory-efficient':
            qconfig.memory_efficient = True

        elif opt == '--silent':
            qconfig.silent = True

        elif opt in ('-1', '--reads1'):
            reads_fpath_f = arg

        elif opt in ('-2', '--reads2'):
            reads_fpath_r = arg

        elif opt == '--bed-file':
            bed_fpath = arg

        elif opt == '--contig-alignment-html':
            qconfig.create_contig_alignment_html = True

        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg),
                         to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath, qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    # Rebuild the effective command line (after the --test expansion) for the log.
    args = [os.path.realpath(__file__)]
    for k, v in options:
        args.extend([k, v])
    args.extend(contigs_fpaths)
    logger.print_command_line(args, wrap_after=None, is_main=True)
    logger.start()

    if existing_alignments:
        logger.main_info()
        logger.notice("Output directory already exists. Existing Nucmer alignments can be used.")
        qutils.remove_reports(output_dirpath)

    # Thresholds arrive as comma-separated strings; store them as real lists.
    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = [int(value) for value in qconfig.contig_thresholds.split(",")]
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = [int(value) for value in qconfig.genes_lengths.split(",")]

    qconfig.set_max_threads(logger)

    logger.main_info()
    logger.print_params()

    ########################################################################
    from libs import reporting
    reload(reporting)

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, '..', qconfig.corrected_dirname)
    else:
        # Start from an empty directory for the corrected input files.
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    contigs_fpaths, old_contigs_fpaths = _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    reads_fpaths = []
    if reads_fpath_f:
        reads_fpaths.append(reads_fpath_f)
    if reads_fpath_r:
        reads_fpaths.append(reads_fpath_r)
    if reads_fpaths:
        # The result replaces any value given with --bed-file.
        bed_fpath = reads_analyzer.do(ref_fpath, contigs_fpaths, reads_fpaths, None,
                                      os.path.join(output_dirpath, qconfig.variation_dirname),
                                      external_logger=logger)

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.",
                     fake_if_nested_run=True)
        return 4

    qconfig.assemblies_fpaths = contigs_fpaths

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots or qconfig.html_report:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        # The PDF report is optional: any failure to set it up just disables
        # it.  Exception (not a bare except) keeps Ctrl-C and SystemExit
        # working.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except Exception:
            all_pdf_file = None

    if json_output_dirpath:
        from libs.html_saver import json_saver
        if json_saver.simplejson_error:
            json_output_dirpath = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote,
            os.path.join(output_dirpath, 'contigs_reports'), old_contigs_fpaths, bed_fpath)
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, 'genome_stats'))

    if qconfig.gene_finding or qconfig.glimmer:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from libs import glimmer
            glimmer.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'))
        else:
            ########################################################################
            ### GeneMark
            ########################################################################
            from libs import genemark
            genemark.do(contigs_fpaths, qconfig.genes_lengths, os.path.join(output_dirpath, 'predicted_genes'),
                        qconfig.prokaryote, qconfig.meta)
    else:
        logger.main_info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")

    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.main_info('Drawing large plots...')
        logger.main_info('This may take a while: press Ctrl-C to skip this step..')
        try:
            if detailed_contigs_reports_dirpath and qconfig.show_snps:
                contig_report_fpath_pattern = os.path.join(detailed_contigs_reports_dirpath,
                                                           'contigs_report_%s.stdout')
            else:
                contig_report_fpath_pattern = None
            # One step per artifact that will actually be produced.
            number_of_steps = sum([int(bool(value)) for value in [contig_report_fpath_pattern, all_pdf_file]])
            if contig_report_fpath_pattern:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info(' 1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.main_info(' %d of %d: Creating PDF with all tables and plots...'
                                 % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.main_info('Done')
        except KeyboardInterrupt:
            logger.main_info('..step skipped!')
            # The PDF may never have been created (e.g. the PDF report was
            # disabled above); only remove it when it is really there.
            if os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info(' Text versions of total report are saved to ' + reports_fpaths)
    logger.main_info(' Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig, ref_fpath)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath)

    if os.path.isfile(all_pdf_fpath):
        logger.main_info(' PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.main_info(' Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
def do(ref_fpath, contigs_fpaths, output_dirpath):
    """Run GAGE for every assembly and store the parsed metrics in the reports.

    One ``run_gage`` job per assembly is started through joblib; the number of
    parallel jobs is limited by ``qconfig.max_threads``.  Afterwards the
    ``gage_<assembly name>.stdout`` log of each assembly is scanned and the
    values found are added to the report returned by ``reporting.get``.

    Args:
        ref_fpath: path to the reference, forwarded to ``run_gage``.
        contigs_fpaths: paths of the assemblies to process.
        output_dirpath: parent directory; results are written to its ``gage``
            subdirectory.

    Returns:
        None.  If no job returned 0, a warning is logged and the function
        returns before any log is parsed.
    """
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(qconfig.LIBS_LOCATION, 'gage', 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.info('Running GAGE...')

    # Labels searched for in the GAGE log, in the order they are expected to
    # appear.  Some labels occur twice ('Total units', 'Min', 'Max', 'N50'),
    # which is why the log is matched sequentially rather than by lookup.
    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs',
               'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    # Report field for the label at the same index in `metrics`.
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
                            reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
                            reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES,
                            reporting.Fields.GAGE_MISSINGASMBLYBASES, reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
                            reporting.Fields.GAGE_DUPREFBASES, reporting.Fields.GAGE_COMPRESSEDREFBASES,
                            reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY,
                            reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS,
                            reporting.Fields.GAGE_LONGINDELS, reporting.Fields.GAGE_INVERSIONS,
                            reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION,
                            reporting.Fields.GAGE_NUMCORCONTIGS, reporting.Fields.GAGE_CORASMBLYSIZE,
                            reporting.Fields.GAGE_MINCORCONTIG, reporting.Fields.GAGE_MAXCORCOTING,
                            reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(
        delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    # Give up only when not a single job returned 0.
    if 0 not in return_codes:
        logger.warning('Error occurred while GAGE was processing assemblies.'
                       ' See GAGE error logs for details: %s' %
                       os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for contigs_fpath in contigs_fpaths:
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stdout')
        cur_metric_id = 0
        # The context manager closes the log even if parsing a line raises.
        with open(log_out_fpath, 'r') as logfile_out:
            for line in logfile_out:
                if metrics[cur_metric_id] in line:
                    if metrics[cur_metric_id].startswith('N50'):
                        # Split on the full 'N50:' label instead of the first
                        # ':' in the line.
                        report.add_field(metrics_in_reporting[cur_metric_id],
                                         line.split(metrics[cur_metric_id] + ':')[1].strip())
                    else:
                        report.add_field(metrics_in_reporting[cur_metric_id],
                                         line.split(':')[1].strip())
                    cur_metric_id += 1
                    if cur_metric_id == len(metrics):
                        break

    reporting.save_gage(output_dirpath)

    # The temporary directory is kept in debug mode.
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.info('Done.')
def main(args):
    """Parse the command line, run the enabled analyses and save the reports.

    Args:
        args: command-line arguments as accepted by ``getopt.gnu_getopt``
            (the script path is prepended separately when the command line
            is logged).

    Returns:
        0 after a complete run, 4 when ``_correct_contigs`` leaves no
        assemblies to evaluate.

    The function calls ``sys.exit(0)`` when no arguments or a help option are
    given and ``sys.exit(2)`` on a getopt error or when no contigs files are
    specified. ``logger.error`` is additionally asked to exit with code 3
    (space in the QUAST path) or 2 (unknown option).
    """
    if ' ' in quast_dirpath:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(quast_dirpath) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    # First pass: options that must take effect before everything else.
    # A copy is iterated because `options` is modified inside the loop.
    for opt, arg in options[:]:
        if opt in ('-d', '--debug'):
            options.remove((opt, arg))
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        if opt == '--test':
            options.remove((opt, arg))
            options += [('-o', 'quast_test_output'),
                        ('-R', 'test_data/reference.fasta.gz'),  # for compiling MUMmer
                        ('-O', 'test_data/operons.gff'),
                        ('-G', 'test_data/genes.gff'),
                        ('--gene-finding', ''), ('--eukaryote', '')]  # for compiling GlimmerHMM
            contigs_fpaths += ['test_data/contigs_1.fasta',
                               'test_data/contigs_2.fasta']
            qconfig.test = True

        if opt.startswith('--help'):
            qconfig.usage(opt == "--help-hidden")
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []

    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt in ('-t', "--contig-thresholds"):
            qconfig.contig_thresholds = arg

        elif opt in ('-M', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-T', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--mincluster"):
            qconfig.mincluster = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt in ('-S', "--gene-thresholds"):
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            # Values other than these three are silently ignored.
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt in ('-n', "--strict-NA"):
            qconfig.strict_NA = True

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt in ('-m', '--meta'):
            qconfig.meta = True

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True

        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg),
                         to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath,
                           qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    logger.print_command_line([os.path.realpath(__file__)] + args, wrap_after=None)
    logger.start()

    if existing_alignments:
        logger.info()
        logger.notice("Output directory already exists. Existing Nucmer alignments can be used.")
        qutils.remove_reports(output_dirpath)

    # Thresholds arrive as comma-separated strings; the literal "None" means
    # "no thresholds".
    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int, qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    # Threading
    if qconfig.max_threads is None:
        try:
            import multiprocessing
            qconfig.max_threads = multiprocessing.cpu_count()
        except Exception:
            # Best effort: fall back to the default, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare except would.
            logger.warning('Failed to determine the number of CPUs')
            qconfig.max_threads = qconfig.DEFAULT_MAX_THREADS

        logger.info()
        logger.notice('Maximum number of threads is set to ' + str(qconfig.max_threads) +
                      ' (use --threads option to set it manually)')

    ########################################################################
    from libs import reporting
    reload(reporting)

    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.info()
        logger.info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.info()
    logger.info('Contigs:')
    contigs_fpaths = _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.",
                     fake_if_nested_run=True)
        return 4

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except Exception:
            # The combined PDF is optional; continue without it.
            all_pdf_file = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote,
            os.path.join(output_dirpath, 'contigs_reports'))
        # Only assemblies with an OK status take part in the aligned statistics.
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if aligned_contigs_fpaths and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, 'genome_stats'))

    if qconfig.gene_finding:
        if qconfig.prokaryote or qconfig.meta:
            ########################################################################
            ### GeneMark
            ########################################################################
            from libs import genemark
            genemark.do(contigs_fpaths, qconfig.genes_lengths,
                        os.path.join(output_dirpath, 'predicted_genes'), qconfig.meta)
        else:
            ########################################################################
            ### Glimmer
            ########################################################################
            from libs import glimmer
            glimmer.do(contigs_fpaths, qconfig.genes_lengths,
                       os.path.join(output_dirpath, 'predicted_genes'))
    else:
        logger.info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")

    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.info('Drawing large plots...')
        logger.info('This may take a while: press Ctrl-C to skip this step..')
        try:
            # A step is counted only if its prerequisite is available.
            number_of_steps = sum(
                1 for step in (detailed_contigs_reports_dirpath, all_pdf_file) if step)
            if detailed_contigs_reports_dirpath:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.info(' 1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths,
                    os.path.join(detailed_contigs_reports_dirpath, 'contigs_report_%s.stdout'),
                    output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.info(' %d of %d: Creating PDF with all tables and plots...' %
                            (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.info('Done')
        except KeyboardInterrupt:
            logger.info('..step skipped!')
            # The PDF may never have been created (e.g. PdfPages was not
            # available), so remove it only if it is actually there.
            if os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.info('RESULTS:')
    logger.info(' Text versions of total report are saved to ' + reports_fpaths)
    logger.info(' Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_total_report(output_dirpath, qconfig.min_contig)

    if os.path.isfile(all_pdf_fpath):
        logger.info(' PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.info(' Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
def get(assembly_fpath):
    """Return the report registered for *assembly_fpath*, registering it if needed.

    The path is appended to ``assembly_fpaths`` the first time it is seen.
    A ``Report`` labelled via ``qutils.label_from_fpath`` is built on every
    call, but it is stored in ``reports`` (and returned) only when no report
    exists for the path yet; otherwise the existing one is returned.
    """
    already_listed = assembly_fpath in assembly_fpaths
    if not already_listed:
        assembly_fpaths.append(assembly_fpath)

    candidate = Report(qutils.label_from_fpath(assembly_fpath))
    return reports.setdefault(assembly_fpath, candidate)
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Compute NAx/NGAx-style statistics for aligned assemblies and plot them.

    For every assembly NA50, NA75, LA50 and LA75 are computed against the
    assembly length; unless ``qconfig.is_combined_ref`` is set, NGA50, NGA75,
    LGA50 and LGA75 are also computed against the reference length. The
    values and the largest alignment are added to the assembly's report.
    Assembly lengths are saved to JSON/HTML when requested, and the
    cumulative and NAx/NGAx plots are drawn when ``qconfig.draw_plots`` is set.

    Args:
        ref_fpath: path to the reference FASTA file.
        aligned_contigs_fpaths: paths to the assemblies that were aligned.
        output_dirpath: main output directory (HTML data and plots).
        json_output_dirpath: directory for JSON output; falsy to skip it.
        aligned_lengths_lists: alignment lengths, one list per assembly, in
            the same order as ``aligned_contigs_fpaths``.
        aligned_stats_dirpath: directory for plots; created if missing.

    Returns:
        A dict with an empty list under 'header' and under each assembly name.
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    report_dict = {'header': []}
    for fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(fpath)] = []

    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
    assembly_lengths = [sum(fastaparser.get_lengths_from_fastafile(fpath))
                        for fpath in aligned_contigs_fpaths]

    import N50
    per_assembly = itertools.izip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)
    for i, (contigs_fpath, lens, assembly_len) in enumerate(per_assembly):
        na50 = N50.NG50(lens, assembly_len)
        na75 = N50.NG50(lens, assembly_len, 75)
        la50 = N50.LG50(lens, assembly_len)
        la75 = N50.LG50(lens, assembly_len, 75)

        # Reference-based values make sense only for a single (not combined) reference.
        with_reference = not qconfig.is_combined_ref
        if with_reference:
            nga50 = N50.NG50(lens, reference_length)
            nga75 = N50.NG50(lens, reference_length, 75)
            lga50 = N50.LG50(lens, reference_length)
            lga75 = N50.LG50(lens, reference_length, 75)

        message = ' ' + qutils.index_to_str(i) + qutils.label_from_fpath(contigs_fpath)
        largest_alignment = max(lens)
        message += ', Largest alignment = ' + str(largest_alignment)
        message += ', NA50 = ' + str(na50)
        # Falsy reference-based values are left out of the log line.
        if with_reference and nga50:
            message += ', NGA50 = ' + str(nga50)
        message += ', LA50 = ' + str(la50)
        if with_reference and lga50:
            message += ', LGA50 = ' + str(lga50)
        logger.info(message)

        report = reporting.get(contigs_fpath)
        fields = reporting.Fields
        report.add_field(fields.LARGALIGN, largest_alignment)
        report.add_field(fields.NA50, na50)
        report.add_field(fields.NA75, na75)
        report.add_field(fields.LA50, la50)
        report.add_field(fields.LA75, la75)
        if with_reference:
            report.add_field(fields.NGA50, nga50)
            report.add_field(fields.NGA75, nga75)
            report.add_field(fields.LGA50, lga50)
            report.add_field(fields.LGA75, lga75)

    # Largest number of alignments among the assemblies; compared with
    # qconfig.max_points when the Nx plots are requested.
    num_contigs = max([len(lengths) for lengths in aligned_lengths_lists])

    if json_output_dirpath:
        from libs.html_saver import json_saver
        json_saver.save_assembly_lengths(
            json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_assembly_lengths(
            output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    import plotter
    if qconfig.draw_plots:
        # Cumulative plot (aligned contigs)
        cumulative_plot_fpath = os.path.join(aligned_stats_dirpath, 'cumulative_plot')
        plotter.cumulative_plot(
            ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
            cumulative_plot_fpath, 'Cumulative length (aligned contigs)')

        # NAx and NGAx plots
        too_many_points = num_contigs > qconfig.max_points
        plotter.Nx_plot(
            output_dirpath, too_many_points, aligned_contigs_fpaths, aligned_lengths_lists,
            aligned_stats_dirpath + '/NAx_plot', 'NAx', assembly_lengths,
            json_output_dir=json_output_dirpath)
        if not qconfig.is_combined_ref:
            reference_lengths = [reference_length] * len(aligned_contigs_fpaths)
            plotter.Nx_plot(
                output_dirpath, too_many_points, aligned_contigs_fpaths, aligned_lengths_lists,
                aligned_stats_dirpath + '/NGAx_plot', 'NGAx', reference_lengths,
                json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
def main(args):
    """Parse the command line, run the enabled analyses and save the reports.

    Args:
        args: command-line arguments as accepted by ``getopt.gnu_getopt``
            (the script path is prepended separately when the command line
            is logged).

    Returns:
        0 after a complete run, 4 when ``_correct_contigs`` leaves no
        assemblies to evaluate.

    The function calls ``sys.exit(0)`` when no arguments or a help option are
    given and ``sys.exit(2)`` on a getopt error or when no contigs files are
    specified. ``logger.error`` is additionally asked to exit with code 3
    (space in the QUAST path) or 2 (unknown option).
    """
    if ' ' in quast_dirpath:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(quast_dirpath) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    # First pass: options that must take effect before everything else.
    # A copy is iterated because `options` is modified inside the loop.
    for opt, arg in options[:]:
        if opt in ('-d', '--debug'):
            options.remove((opt, arg))
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        if opt == '--test':
            options.remove((opt, arg))
            options += [('-o', 'quast_test_output'),
                        ('-R', 'test_data/reference.fasta.gz'),  # for compiling MUMmer
                        ('-O', 'test_data/operons.gff'),
                        ('-G', 'test_data/genes.gff'),
                        ('--gene-finding', ''), ('--eukaryote', '')]  # for compiling GlimmerHMM
            contigs_fpaths += ['test_data/contigs_1.fasta',
                               'test_data/contigs_2.fasta']
            qconfig.test = True

        if opt.startswith('--help'):
            qconfig.usage(opt == "--help-hidden")
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []

    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt in ('-t', "--contig-thresholds"):
            qconfig.contig_thresholds = arg

        elif opt in ('-M', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-T', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--mincluster"):
            qconfig.mincluster = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt in ('-S', "--gene-thresholds"):
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            # Values other than these three are silently ignored.
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt in ('-n', "--strict-NA"):
            qconfig.strict_NA = True

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt in ('-m', '--meta'):
            qconfig.meta = True

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True

        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg),
                         to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath,
                           qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    logger.print_command_line([os.path.realpath(__file__)] + args, wrap_after=None)
    logger.start()

    if existing_alignments:
        logger.info()
        logger.notice("Output directory already exists. Existing Nucmer alignments can be used.")
        qutils.remove_reports(output_dirpath)

    # Thresholds arrive as comma-separated strings; the literal "None" means
    # "no thresholds".
    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int, qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    # Threading
    if qconfig.max_threads is None:
        try:
            import multiprocessing
            qconfig.max_threads = multiprocessing.cpu_count()
        except Exception:
            # Best effort: fall back to the default, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare except would.
            logger.warning('Failed to determine the number of CPUs')
            qconfig.max_threads = qconfig.DEFAULT_MAX_THREADS

        logger.info()
        logger.notice('Maximum number of threads is set to ' + str(qconfig.max_threads) +
                      ' (use --threads option to set it manually)')

    ########################################################################
    from libs import reporting
    reload(reporting)

    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.info()
        logger.info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.info()
    logger.info('Contigs:')
    contigs_fpaths = _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.",
                     fake_if_nested_run=True)
        return 4

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except Exception:
            # The combined PDF is optional; continue without it.
            all_pdf_file = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None

    # NOTE(review): nothing in this function ever fills aligned_contigs_fpaths
    # or aligned_lengths_lists, so the branch below cannot be entered and
    # detailed_contigs_reports_dirpath stays None. It looks like the step that
    # aligns contigs to the reference is missing here - confirm and restore it.

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if aligned_contigs_fpaths and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, 'genome_stats'))

    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.info('Drawing large plots...')
        logger.info('This may take a while: press Ctrl-C to skip this step..')
        try:
            # A step is counted only if its prerequisite is available.
            number_of_steps = sum(
                1 for step in (detailed_contigs_reports_dirpath, all_pdf_file) if step)
            if detailed_contigs_reports_dirpath:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.info(' 1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths,
                    os.path.join(detailed_contigs_reports_dirpath, 'contigs_report_%s.stdout'),
                    output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.info(' %d of %d: Creating PDF with all tables and plots...' %
                            (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.info('Done')
        except KeyboardInterrupt:
            logger.info('..step skipped!')
            # The PDF may never have been created (e.g. PdfPages was not
            # available), so remove it only if it is actually there.
            if os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.info('RESULTS:')
    logger.info(' Text versions of total report are saved to ' + reports_fpaths)
    logger.info(' Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_total_report(output_dirpath, qconfig.min_contig)

    if os.path.isfile(all_pdf_fpath):
        logger.info(' PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.info(' Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Compute NAx/NGAx-style statistics for aligned assemblies and plot them.

    For every assembly NA50/NA75/LA50/LA75 are computed against the assembly
    length and NGA50/NGA75/LGA50/LGA75 against the reference length; the
    values and the largest alignment are added to the assembly's report.
    Aligned contig lengths and assembly lengths are saved to JSON/HTML when
    requested, and the cumulative and NAx/NGAx plots are drawn when
    ``qconfig.draw_plots`` is set.

    Args:
        ref_fpath: path to the reference FASTA file.
        aligned_contigs_fpaths: paths to the assemblies that were aligned.
        output_dirpath: main output directory (HTML data).
        json_output_dirpath: directory for JSON output; falsy to skip it.
        aligned_lengths_lists: alignment lengths, one list per assembly, in
            the same order as ``aligned_contigs_fpaths``.
        aligned_stats_dirpath: directory for plots; created if missing.

    Returns:
        A dict with an empty list under 'header' and under each assembly name.
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    report_dict = {'header': []}
    for fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(fpath)] = []

    logger.print_timestamp()
    logger.info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
    assembly_lengths = [sum(fastaparser.get_lengths_from_fastafile(fpath))
                        for fpath in aligned_contigs_fpaths]

    import N50
    per_assembly = itertools.izip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)
    for i, (contigs_fpath, lens, assembly_len) in enumerate(per_assembly):
        # "A" values are relative to the assembly length, "GA" values to the
        # reference length.
        na50 = N50.NG50(lens, assembly_len)
        nga50 = N50.NG50(lens, reference_length)
        na75 = N50.NG50(lens, assembly_len, 75)
        nga75 = N50.NG50(lens, reference_length, 75)
        la50 = N50.LG50(lens, assembly_len)
        lga50 = N50.LG50(lens, reference_length)
        la75 = N50.LG50(lens, assembly_len, 75)
        lga75 = N50.LG50(lens, reference_length, 75)

        message = ' ' + qutils.index_to_str(i) + qutils.label_from_fpath(contigs_fpath)
        largest_alignment = max(lens)
        message += ', Largest alignment = ' + str(largest_alignment)
        message += ', NA50 = ' + str(na50)
        message += ', NGA50 = ' + str(nga50)
        message += ', LA50 = ' + str(la50)
        message += ', LGA50 = ' + str(lga50)
        logger.info(message)

        report = reporting.get(contigs_fpath)
        fields = reporting.Fields
        report.add_field(fields.LARGALIGN, largest_alignment)
        report.add_field(fields.NA50, na50)
        report.add_field(fields.NGA50, nga50)
        report.add_field(fields.NA75, na75)
        report.add_field(fields.NGA75, nga75)
        report.add_field(fields.LA50, la50)
        report.add_field(fields.LGA50, lga50)
        report.add_field(fields.LA75, la75)
        report.add_field(fields.LGA75, lga75)

    # saving to JSON
    if json_output_dirpath:
        from libs.html_saver import json_saver
        json_saver.save_aligned_contigs_lengths(
            json_output_dirpath, aligned_contigs_fpaths, aligned_lengths_lists)
        json_saver.save_assembly_lengths(
            json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_aligned_contigs_lengths(
            output_dirpath, aligned_contigs_fpaths, aligned_lengths_lists)
        html_saver.save_assembly_lengths(
            output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    if qconfig.draw_plots:
        import plotter

        # Cumulative plot (aligned contigs)
        cumulative_plot_fpath = os.path.join(aligned_stats_dirpath, 'cumulative_plot')
        plotter.cumulative_plot(
            ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
            cumulative_plot_fpath, 'Cumulative length (aligned contigs)')

        # NAx and NGAx plots; NGAx uses the reference length for every assembly.
        plotter.Nx_plot(
            aligned_contigs_fpaths, aligned_lengths_lists,
            aligned_stats_dirpath + '/NAx_plot', 'NAx', assembly_lengths)
        reference_lengths = [reference_length] * len(aligned_contigs_fpaths)
        plotter.Nx_plot(
            aligned_contigs_fpaths, aligned_lengths_lists,
            aligned_stats_dirpath + '/NGAx_plot', 'NGAx', reference_lengths)

    logger.info('Done.')
    return report_dict
def main(args):
    """Run the whole QUAST pipeline for the given command-line arguments.

    Parses the options, prepares the output directory, corrects the
    reference and the contigs, then runs the analysis modules (optional
    GAGE, basic statistics, contig analysis against the reference, aligned
    statistics, genome analysis, optional gene prediction) and finally
    saves the total reports and plots.

    Args:
        args: list of command-line arguments (without the program name).

    Returns:
        0 on success, 4 when none of the assembly files contains usable
        contigs.  Usage/help/version requests and option errors terminate
        the process through ``sys.exit`` or ``logger.error``.
    """
    if ' ' in qconfig.QUAST_HOME:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage()
        sys.exit(0)

    # Re-import the configuration module before applying the options.
    reload(qconfig)

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage()
        sys.exit(2)

    # First pass over a copy of the options: expand the test modes into
    # regular options and handle --help / --version immediately.
    for opt, arg in options[:]:
        if opt == '--test' or opt == '--test-sv':
            options.remove((opt, arg))
            options += [('-o', 'quast_test_output'),
                        ('-R', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reference.fasta.gz')),  # for compiling MUMmer
                        ('-O', os.path.join(qconfig.QUAST_HOME, 'test_data', 'operons.gff')),
                        ('-G', os.path.join(qconfig.QUAST_HOME, 'test_data', 'genes.gff')),
                        ('--gage', ''),  # for compiling GAGE Java classes
                        ('--gene-finding', ''), ('--eukaryote', ''), ('--glimmer', '')]  # for compiling GlimmerHMM
            if opt == '--test-sv':
                options += [('-1', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reads1.fastq.gz')),
                            ('-2', os.path.join(qconfig.QUAST_HOME, 'test_data', 'reads2.fastq.gz'))]
            contigs_fpaths += [os.path.join(qconfig.QUAST_HOME, 'test_data', 'contigs_1.fasta'),
                               os.path.join(qconfig.QUAST_HOME, 'test_data', 'contigs_2.fasta')]
            qconfig.test = True

        if opt.startswith('--help') or opt == '-h':
            qconfig.usage(opt == "--help-hidden", short=False)
            sys.exit(0)

        elif opt.startswith('--version') or opt == '-v':
            qconfig.print_version()
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage()
        sys.exit(2)

    json_output_dirpath = None
    output_dirpath = None

    labels = None
    all_labels_from_dirs = False
    qconfig.is_combined_ref = False

    ref_fpath = ''
    genes_fpaths = []
    operons_fpaths = []
    bed_fpath = None
    reads_fpath_f = ''
    reads_fpath_r = ''

    # Yes, this is a code duplicating. But OptionParser is deprecated since version 2.7.
    for opt, arg in options:
        if opt in ('-d', '--debug'):
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        elif opt in ('-o', "--output-dir"):
            output_dirpath = os.path.abspath(arg)
            qconfig.make_latest_symlink = False
            if ' ' in output_dirpath:
                logger.error('QUAST does not support spaces in paths. \n'
                             'You have specified ' + str(output_dirpath) + ' as an output path.\n'
                             'Please, use a different directory.\n',
                             to_stderr=True,
                             exit_with_code=3)

        elif opt in ('-G', "--genes"):
            genes_fpaths.append(assert_file_exists(arg, 'genes'))

        elif opt in ('-O', "--operons"):
            operons_fpaths.append(assert_file_exists(arg, 'operons'))

        elif opt in ('-R', "--reference"):
            ref_fpath = assert_file_exists(arg, 'reference')

        elif opt == "--contig-thresholds":
            qconfig.contig_thresholds = arg

        elif opt in ('-m', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-t', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-c', "--min-cluster"):
            qconfig.min_cluster = int(arg)

        elif opt in ('-i', "--min-alignment"):
            qconfig.min_alignment = int(arg)

        elif opt == "--est-ref-size":
            qconfig.estimated_reference_size = int(arg)

        elif opt == "--gene-thresholds":
            qconfig.genes_lengths = arg

        elif opt in ('-j', '--save-json'):
            qconfig.save_json = True

        elif opt in ('-J', '--save-json-to'):
            qconfig.save_json = True
            qconfig.make_latest_symlink = False
            json_output_dirpath = arg

        elif opt == '--err-fpath':  # for web-quast
            qconfig.save_error = True
            qconfig.error_log_fname = arg

        elif opt in ('-s', "--scaffolds"):
            qconfig.scaffolds = True

        elif opt == "--gage":
            qconfig.with_gage = True

        elif opt in ('-e', "--eukaryote"):
            qconfig.prokaryote = False

        elif opt in ('-f', "--gene-finding"):
            qconfig.gene_finding = True

        elif opt in ('-a', "--ambiguity-usage"):
            # NOTE(review): any other value is silently ignored here.
            if arg in ["none", "one", "all"]:
                qconfig.ambiguity_usage = arg

        elif opt in ('-u', "--use-all-alignments"):
            qconfig.use_all_alignments = True

        elif opt == "--strict-NA":
            qconfig.strict_NA = True

        elif opt in ('-x', "--extensive-mis-size"):
            if int(arg) <= qconfig.MAX_INDEL_LENGTH:
                logger.error("--extensive-mis-size should be greater than maximum indel length (%d)!"
                             % qconfig.MAX_INDEL_LENGTH, 1, to_stderr=True)
            qconfig.extensive_misassembly_threshold = int(arg)

        elif opt == '--no-snps':
            qconfig.show_snps = False

        elif opt == '--no-plots':
            qconfig.draw_plots = False

        elif opt == '--no-html':
            qconfig.html_report = False

        elif opt == '--no-check':
            qconfig.no_check = True

        elif opt == '--no-gc':
            qconfig.no_gc = True

        elif opt == '--fast':  # --no-gc, --no-plots, --no-snps
            #qconfig.no_check = True  # too risky to include
            qconfig.no_gc = True
            qconfig.show_snps = False
            qconfig.draw_plots = False
            qconfig.html_report = False

        elif opt == '--plots-format':
            if arg.lower() in qconfig.supported_plot_extensions:
                qconfig.plot_extension = arg.lower()
            else:
                logger.error('Format "%s" is not supported. Please, use one of the supported formats: %s.' %
                             (arg, ', '.join(qconfig.supported_plot_extensions)),
                             to_stderr=True, exit_with_code=2)

        elif opt == '--meta':
            qconfig.meta = True

        elif opt == '--no-check-meta':
            qconfig.no_check = True
            qconfig.no_check_meta = True

        elif opt == '--references-list':
            pass

        elif opt in ('-l', '--labels'):
            labels = parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            all_labels_from_dirs = True

        elif opt == '--glimmer':
            qconfig.glimmer = True

        elif opt == '--combined-ref':
            qconfig.is_combined_ref = True

        elif opt == '--memory-efficient':
            qconfig.memory_efficient = True

        elif opt == '--silent':
            qconfig.silent = True

        elif opt in ('-1', '--reads1'):
            reads_fpath_f = arg

        elif opt in ('-2', '--reads2'):
            reads_fpath_r = arg

        elif opt == '--bed-file':
            bed_fpath = arg

        elif opt == '--contig-alignment-html':
            qconfig.create_contig_alignment_html = True

        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg),
                         to_stderr=True, exit_with_code=2)

    for contigs_fpath in contigs_fpaths:
        assert_file_exists(contigs_fpath, 'contigs')

    labels = process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    output_dirpath, json_output_dirpath, existing_alignments = \
        _set_up_output_dir(output_dirpath, json_output_dirpath,
                           qconfig.make_latest_symlink, qconfig.save_json)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    # Rebuild the effective command line (test options already expanded)
    # so that it can be written to the log.
    args = [os.path.realpath(__file__)]
    for k, v in options:
        args.extend([k, v])
    args.extend(contigs_fpaths)
    logger.print_command_line(args, wrap_after=None, is_main=True)
    logger.start()

    if existing_alignments:
        logger.main_info()
        logger.notice("Output directory already exists. Existing Nucmer alignments can be used.")
        qutils.remove_reports(output_dirpath)

    # Thresholds arrive as comma-separated strings; "None" means "no thresholds".
    if qconfig.contig_thresholds == "None":
        qconfig.contig_thresholds = []
    else:
        qconfig.contig_thresholds = map(int, qconfig.contig_thresholds.split(","))
    if qconfig.genes_lengths == "None":
        qconfig.genes_lengths = []
    else:
        qconfig.genes_lengths = map(int, qconfig.genes_lengths.split(","))

    qconfig.set_max_threads(logger)

    logger.main_info()
    logger.print_params()

    ########################################################################
    from libs import reporting
    reload(reporting)

    if qconfig.is_combined_ref:
        corrected_dirpath = os.path.join(output_dirpath, '..', qconfig.corrected_dirname)
    else:
        if os.path.isdir(corrected_dirpath):
            shutil.rmtree(corrected_dirpath)
        os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCE
    if ref_fpath:
        logger.main_info()
        logger.main_info('Reference:')
        ref_fpath = _correct_reference(ref_fpath, corrected_dirpath)
    else:
        ref_fpath = ''

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    contigs_fpaths, old_contigs_fpaths = _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels)
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.NAME, qutils.label_from_fpath(contigs_fpath))

    qconfig.assemblies_num = len(contigs_fpaths)

    reads_fpaths = []
    if reads_fpath_f:
        reads_fpaths.append(reads_fpath_f)
    if reads_fpath_r:
        reads_fpaths.append(reads_fpath_r)
    if reads_fpaths:
        # The result replaces any BED file given with --bed-file.
        bed_fpath = reads_analyzer.do(ref_fpath, contigs_fpaths, reads_fpaths, None,
                                      os.path.join(output_dirpath, qconfig.variation_dirname),
                                      external_logger=logger)

    if not contigs_fpaths:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.",
                     fake_if_nested_run=True)
        return 4

    qconfig.assemblies_fpaths = contigs_fpaths

    if qconfig.with_gage:
        ########################################################################
        ### GAGE
        ########################################################################
        if not ref_fpath:
            logger.warning("GAGE can't be run without a reference and will be skipped.")
        else:
            from libs import gage
            gage.do(ref_fpath, contigs_fpaths, output_dirpath)

    # Where all pdfs will be saved
    all_pdf_fpath = os.path.join(output_dirpath, qconfig.plots_fname)
    all_pdf_file = None

    if qconfig.draw_plots or qconfig.html_report:
        from libs import plotter  # Do not remove this line! It would lead to a warning in matplotlib.
        try:
            from matplotlib.backends.backend_pdf import PdfPages
            all_pdf_file = PdfPages(all_pdf_fpath)
        except Exception:
            # PDF output is optional: any failure to import or set up the
            # matplotlib PDF backend just disables it.  A bare "except:"
            # would also swallow KeyboardInterrupt and SystemExit.
            all_pdf_file = None

    if json_output_dirpath:
        from libs.html_saver import json_saver
        if json_saver.simplejson_error:
            json_output_dirpath = None

    ########################################################################
    ### Stats and plots
    ########################################################################
    from libs import basic_stats
    basic_stats.do(ref_fpath, contigs_fpaths, os.path.join(output_dirpath, 'basic_stats'),
                   json_output_dirpath, output_dirpath)

    aligned_contigs_fpaths = []
    aligned_lengths_lists = []
    contig_alignment_plot_fpath = None
    if ref_fpath:
        ########################################################################
        ### former PLANTAKOLYA, PLANTAGORA
        ########################################################################
        from libs import contigs_analyzer
        nucmer_statuses, aligned_lengths_per_fpath = contigs_analyzer.do(
            ref_fpath, contigs_fpaths, qconfig.prokaryote,
            os.path.join(output_dirpath, 'contigs_reports'), old_contigs_fpaths, bed_fpath)
        for contigs_fpath in contigs_fpaths:
            if nucmer_statuses[contigs_fpath] == contigs_analyzer.NucmerStatus.OK:
                aligned_contigs_fpaths.append(contigs_fpath)
                aligned_lengths_lists.append(aligned_lengths_per_fpath[contigs_fpath])

    # Before continue evaluating, check if nucmer didn't skip all of the contigs files.
    detailed_contigs_reports_dirpath = None
    if len(aligned_contigs_fpaths) and ref_fpath:
        detailed_contigs_reports_dirpath = os.path.join(output_dirpath, 'contigs_reports')

        ########################################################################
        ### NAx and NGAx ("aligned Nx and NGx")
        ########################################################################
        from libs import aligned_stats
        aligned_stats.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            aligned_lengths_lists, os.path.join(output_dirpath, 'aligned_stats'))

        ########################################################################
        ### GENOME_ANALYZER
        ########################################################################
        from libs import genome_analyzer
        genome_analyzer.do(
            ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
            genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath,
            os.path.join(output_dirpath, 'genome_stats'))

    if qconfig.gene_finding or qconfig.glimmer:
        if qconfig.glimmer:
            ########################################################################
            ### Glimmer
            ########################################################################
            from libs import glimmer
            glimmer.do(contigs_fpaths, qconfig.genes_lengths,
                       os.path.join(output_dirpath, 'predicted_genes'))
        else:
            ########################################################################
            ### GeneMark
            ########################################################################
            from libs import genemark
            genemark.do(contigs_fpaths, qconfig.genes_lengths,
                        os.path.join(output_dirpath, 'predicted_genes'),
                        qconfig.prokaryote, qconfig.meta)
    else:
        logger.main_info("")
        logger.notice("Genes are not predicted by default. Use --gene-finding option to enable it.")

    ########################################################################
    reports_fpaths, transposed_reports_fpaths = reporting.save_total(output_dirpath)

    ########################################################################
    ### LARGE DRAWING TASKS
    ########################################################################
    if qconfig.draw_plots:
        logger.print_timestamp()
        logger.main_info('Drawing large plots...')
        logger.main_info('This may take a while: press Ctrl-C to skip this step..')
        try:
            if detailed_contigs_reports_dirpath and qconfig.show_snps:
                contig_report_fpath_pattern = os.path.join(detailed_contigs_reports_dirpath, 'contigs_report_%s.stdout')
            else:
                contig_report_fpath_pattern = None
            # One step per enabled task: alignment plot and/or the PDF report.
            number_of_steps = sum([int(bool(value)) for value in [contig_report_fpath_pattern, all_pdf_file]])
            if contig_report_fpath_pattern:
                ########################################################################
                ### VISUALIZE CONTIG ALIGNMENT
                ########################################################################
                logger.main_info(' 1 of %d: Creating contig alignment plot...' % number_of_steps)
                from libs import contig_alignment_plotter
                contig_alignment_plot_fpath = contig_alignment_plotter.do(
                    contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath, similar=True)

            if all_pdf_file:
                # full report in PDF format: all tables and plots
                logger.main_info(' %d of %d: Creating PDF with all tables and plots...' % (number_of_steps, number_of_steps))
                plotter.fill_all_pdf_file(all_pdf_file)
            logger.main_info('Done')
        except KeyboardInterrupt:
            logger.main_info('..step skipped!')
            # The PDF only exists if the PDF backend was set up; removing a
            # missing file would raise OSError from inside this handler.
            if os.path.isfile(all_pdf_fpath):
                os.remove(all_pdf_fpath)

    ########################################################################
    ### TOTAL REPORT
    ########################################################################
    logger.print_timestamp()
    logger.main_info('RESULTS:')
    logger.main_info(' Text versions of total report are saved to ' + reports_fpaths)
    logger.main_info(' Text versions of transposed total report are saved to ' + transposed_reports_fpaths)

    if json_output_dirpath:
        json_saver.save_total_report(json_output_dirpath, qconfig.min_contig, ref_fpath)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls)
        html_saver.save_total_report(output_dirpath, qconfig.min_contig, ref_fpath)

    if os.path.isfile(all_pdf_fpath):
        logger.main_info(' PDF version (tables and plots) saved to ' + all_pdf_fpath)

    if contig_alignment_plot_fpath:
        logger.main_info(' Contig alignment plot: %s' % contig_alignment_plot_fpath)

    _cleanup(corrected_dirpath)
    logger.finish_up(check_test=qconfig.test)
    return 0
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    """Compute genome coverage, gaps and found genes/operons for one assembly.

    Reads the Nucmer coords file of the assembly, marks the reference
    positions covered by alignments, writes the uncovered gaps to
    ``<assembly>_gaps.txt`` and the genes/operons found in the aligned
    blocks to ``<assembly>_genes.txt`` / ``<assembly>_operons.txt`` (all in
    ``genome_stats_dirpath``).

    Args:
        contigs_fpath: path to the assembly FASTA file.
        index: assembly index, used only for log prefixes.
        nucmer_path_dirpath: directory containing ``<assembly>.coords``
            (and ``.coords.filtered``) files.
        genome_stats_dirpath: directory for the output text files.
        reference_chromosomes: mapping chromosome name -> chromosome length.
        genes_container: object with ``region_list`` and ``chr_names_dict``
            describing genes.
        operons_container: same as ``genes_container``, for operons.

    Returns:
        A tuple ``(results, genes_in_contigs, operons_in_contigs)``.
        ``results`` holds "covered_bp", "gaps_count" and, for each feature
        field, "<field>_full" / "<field>_partial" counts (None when the
        container has no regions).  The two lists hold the number of fully
        found genes/operons per contig, contigs sorted by decreasing length.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    results = dict()

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, assembly_name + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
                     indent=' ')

    coordfile = open(nucmer_fpath, 'r')
    # try/finally guarantees the file is closed even if parsing fails.
    try:
        # Skip the header: everything up to the line of '=' characters.
        for line in coordfile:
            if line.startswith('='):
                break

        # EXAMPLE:
        #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
        #=====================================================================================
        #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2| NODE_0_length_6088
        #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2| NODE_0_length_6088

        # One flag per reference position (1-based, index 0 is unused).
        genome_mapping = {}
        for chr_name, chr_len in reference_chromosomes.items():
            genome_mapping[chr_name] = [0] * (chr_len + 1)

        contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
        contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
        sorted_contigs_names = [name for (name, seq) in contig_tuples]

        genes_in_contigs = [0] * len(sorted_contigs_names)  # for cumulative plots: i-th element is the number of genes in i-th contig
        operons_in_contigs = [0] * len(sorted_contigs_names)
        aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []

        for line in coordfile:
            if line.strip() == '':
                break
            # Split the line once instead of re-splitting it for every field.
            bar_columns = line.split('|')
            ref_coords = bar_columns[0].split()
            contig_coords = bar_columns[1].split()
            s1 = int(ref_coords[0])
            e1 = int(ref_coords[1])
            s2 = int(contig_coords[0])
            e2 = int(contig_coords[1])
            tokens = line.split()
            contig_name = tokens[12].strip()
            chr_name = tokens[11].strip()

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") "
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")

            aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1))
            if s2 == 0 and e2 == 0:  # special case: circular genome, contig starts on the end of a chromosome and ends in the beginning
                for i in range(s1, len(genome_mapping[chr_name])):
                    genome_mapping[chr_name][i] = 1
                for i in range(1, e1 + 1):
                    genome_mapping[chr_name][i] = 1
            else:  #if s1 <= e1:
                for i in range(s1, e1 + 1):
                    genome_mapping[chr_name][i] = 1
    finally:
        coordfile.close()

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, assembly_name + '_gaps.txt')
    gaps_file = open(gaps_fpath, 'w')
    try:
        for chr_name, chr_len in reference_chromosomes.items():
            gaps_file.write('%s\n' % chr_name)
            cur_gap_size = 0
            for i in range(1, chr_len + 1):
                if genome_mapping[chr_name][i] == 1:
                    # A covered position closes the current gap, if any.
                    if cur_gap_size >= qconfig.min_gap_size:
                        gaps_count += 1
                        gaps_file.write('%s %s\n' % (i - cur_gap_size, i - 1))
                    covered_bp += 1
                    cur_gap_size = 0
                else:
                    cur_gap_size += 1
            # A gap may extend to the very end of the chromosome.
            if cur_gap_size >= qconfig.min_gap_size:
                gaps_count += 1
                gaps_file.write('%s %s\n' % (chr_len - cur_gap_size + 1, chr_len))
    finally:
        gaps_file.close()

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
            (genes_container, genes_in_contigs, reporting.Fields.GENES, '_genes.txt'),
            (operons_container, operons_in_contigs, reporting.Fields.OPERONS, '_operons.txt')]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, assembly_name + suffix)
        found_file = open(found_fpath, 'w')
        try:
            found_file.write('%s\t\t%s\t%s' % ('ID or #', 'Start', 'End') + '\n')
            found_file.write('============================' + '\n')

            # 0 - gene is not found,
            # 1 - gene is found,
            # 2 - part of gene is found
            found_list = [0] * len(container.region_list)
            for region_index, region in enumerate(container.region_list):
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if container.chr_names_dict[region.seqname] != cur_block.seqname:
                            continue

                        # computing circular genomes
                        if cur_block.start > cur_block.end:
                            blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start, end=region.end + 1),
                                      AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                continue
                            elif block.start <= region.start and region.end <= block.end:
                                if found_list[region_index] == 2:  # already found as partial gene
                                    total_partial -= 1
                                found_list[region_index] = 1
                                total_full += 1
                                # Use a separate name for the printed ID so
                                # the loop index is not overwritten.
                                region_label = str(region.id)
                                if region_label == 'None':
                                    region_label = '# ' + str(region.number + 1)
                                found_file.write('%s\t\t%d\t%d' % (region_label, region.start, region.end) + '\n')
                                feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig

                                cur_feature_is_found = True
                                break
                            elif (found_list[region_index] == 0 and
                                  min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap):
                                found_list[region_index] = 2
                                total_partial += 1

                        # A fully found feature ends the search for this region.
                        if cur_feature_is_found:
                            break
                    if cur_feature_is_found:
                        break

            results[field + "_full"] = total_full
            results[field + "_partial"] = total_partial
        finally:
            found_file.close()

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    return results, genes_in_contigs, operons_in_contigs
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Compute basic assembly statistics (N50, L50, GC %, N's, ...).

    For every assembly the contig lengths and the number of 'N' characters
    are collected, Nx/Lx (and NGx/LGx when a reference length is known)
    are computed and stored in the per-assembly report.  Lengths and GC
    information are optionally saved as JSON/HTML data and plots.

    Args:
        ref_fpath: path to the reference FASTA file; falsy if absent.
        contigs_fpaths: paths of the assemblies to process.
        output_dirpath: directory for the plots (created if missing).
        json_output_dir: directory for JSON output; falsy to skip it.
        results_dir: directory passed to the HTML saver.
    """
    logger.print_timestamp()
    logger.info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info(' Reference genome:')
        logger.info(' ' + os.path.basename(ref_fpath) +
                    ', Reference length = ' + str(reference_length) +
                    ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info(' Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, lists_of_lengths)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, lists_of_lengths)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath)
        list_of_GC_distributions.append(GC_distribution)

        # The density of N's is undefined for an empty assembly; computing
        # it unconditionally would raise ZeroDivisionError.
        if total_length:
            ns_per_100_kbp = '%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))
        else:
            ns_per_100_kbp = None

        logger.info(' ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' ' + ns_per_100_kbp if ns_per_100_kbp is not None else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        # max() raises ValueError on an empty list.
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
        report.add_field(reporting.Fields.TOTALLEN, total_length)
        # "is not None" keeps a legitimate GC content of 0.0, which a plain
        # truthiness test would report as missing.
        report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
        report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
        report.add_field(reporting.Fields.UNCALLED_PERCENT, ns_per_100_kbp)
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.draw_plots:
        import plotter
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                os.path.join(output_dirpath, 'cumulative_plot'), 'Cumulative length')

        ########################################################################
        # Drawing GC content plot...
        # Work on a copy so that the reference distribution is not appended
        # to the per-assembly list itself.
        list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
        if ref_fpath:
            list_of_GC_distributions_with_ref.append(reference_GC_distribution)
        plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                os.path.join(output_dirpath, 'GC_content_plot'))

        ########################################################################
        # Drawing Nx and NGx plots...
        plotter.Nx_plot(contigs_fpaths, lists_of_lengths,
                        os.path.join(output_dirpath, 'Nx_plot'), 'Nx', [])
        if reference_length:
            plotter.Nx_plot(contigs_fpaths, lists_of_lengths,
                            os.path.join(output_dirpath, 'NGx_plot'), 'NGx',
                            [reference_length] * len(contigs_fpaths))

    logger.info('Done.')
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Compute basic assembly statistics (N50, L50, GC %, N's, ...).

    For every assembly the contig lengths and the number of 'N' characters
    are collected, Nx/Lx (and NGx/LGx when a reference length is known)
    are computed and stored in the per-assembly report.  When the largest
    assembly has at least ``2 * qconfig.max_points`` contigs, the lengths
    saved for the JSON/HTML reports are downsampled by summing groups of
    consecutive (sorted) contigs.

    Args:
        ref_fpath: path to the reference FASTA file; falsy if absent.
        contigs_fpaths: paths of the assemblies to process.
        output_dirpath: directory for the plots (created if missing).
        json_output_dir: directory for JSON output; falsy to skip it.
        results_dir: directory passed to the HTML saver and the plotter.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info(" Reference genome:")
        logger.info(
            " "
            + os.path.basename(ref_fpath)
            + ", Reference length = "
            + str(reference_length)
            + ", Reference GC % = "
            + "%.2f" % reference_GC
        )
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info(" Estimated reference length = " + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver

            html_saver.save_reference_length(results_dir, reference_length)

    logger.info(" Contig files: ")
    lists_of_lengths = []
    numbers_of_Ns = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(" " + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count("N")

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])

    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        # Too many contigs to save individually: merge every `multiplicator`
        # consecutive contigs (largest first) into one point.
        multiplicator = int(num_contigs / qconfig.max_points)
        # Floor division keeps this an integer, as range() below requires.
        max_points = num_contigs // multiplicator
        lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
        corr_lists_of_lengths = [
            [
                sum(list_of_length[((i - 1) * multiplicator) : (i * multiplicator)])
                for i in range(1, max_points)
                if (i * multiplicator) < len(list_of_length)
            ]
            for list_of_length in lists_of_lengths
        ]
        # The last point of every assembly collects all remaining contigs.
        for num_list, corr_lengths in enumerate(corr_lists_of_lengths):
            last_index = len(corr_lengths)
            corr_lengths.append(sum(lists_of_lengths[num_list][last_index * multiplicator :]))
    else:
        corr_lists_of_lengths = lists_of_lengths

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, corr_lists_of_lengths)
        json_saver.save_tick_x(json_output_dir, multiplicator)

    if qconfig.html_report:
        from libs.html_saver import html_saver

        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info(" Calculating N50 and L50...")

    list_of_GC_distributions = []
    largest_contig = 0
    import N50

    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
        zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)
    ):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)

        # The density of N's is undefined for an empty assembly.  It is
        # computed once and reused for both the log line and the report;
        # previously the trailing conditional applied to the WHOLE log
        # message and the report field still divided by zero.
        if total_length != 0:
            ns_per_100_kbp = "%.2f" % (float(number_of_Ns) * 100000.0 / float(total_length))
        else:
            ns_per_100_kbp = None

        logger.info(
            " "
            + qutils.index_to_str(index)
            + qutils.label_from_fpath(contigs_fpath)
            + ", N50 = "
            + str(n50)
            + ", L50 = "
            + str(l50)
            + ", Total length = "
            + str(total_length)
            + ", GC % = "
            + ("%.2f" % total_GC if total_GC is not None else "undefined")
            + ", # N's per 100 kbp = "
            + (" " + ns_per_100_kbp if ns_per_100_kbp is not None else "undefined")
        )

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
        report.add_field(reporting.Fields.TOTALLEN, total_length)
        if not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.GC, ("%.2f" % total_GC if total_GC is not None else None))
        report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
        report.add_field(reporting.Fields.UNCALLED_PERCENT, ns_per_100_kbp)
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, "%.2f" % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math

    # NOTE(review): under Python 2 both divisions are integer divisions, so
    # math.ceil receives an already floored value -- confirm whether true
    # division was intended before changing it.
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report and not qconfig.is_combined_ref:
        from libs.html_saver import html_saver

        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    import plotter

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(
        results_dir,
        num_contigs > qconfig.max_points,
        contigs_fpaths,
        lists_of_lengths,
        output_dirpath + "/Nx_plot",
        "Nx",
        [],
        json_output_dir=json_output_dir,
    )
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(
            results_dir,
            num_contigs > qconfig.max_points,
            contigs_fpaths,
            lists_of_lengths,
            output_dirpath + "/NGx_plot",
            "NGx",
            [reference_length] * len(contigs_fpaths),
            json_output_dir=json_output_dir,
        )

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(
            ref_fpath, contigs_fpaths, lists_of_lengths, output_dirpath + "/cumulative_plot", "Cumulative length"
        )
        if not qconfig.is_combined_ref:
            ########################################################################
            # Drawing GC content plot...
            # Work on a copy so that the reference distribution is not
            # appended to the per-assembly list itself.
            list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
            if ref_fpath:
                list_of_GC_distributions_with_ref.append(reference_GC_distribution)
            plotter.GC_content_plot(
                ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, output_dirpath + "/GC_content_plot"
            )

    logger.main_info("Done.")
def do(fasta_fpaths, gene_lengths, out_dirpath, prokaryote, meta):
    """Run the appropriate GeneMark flavour on every assembly and report gene counts.

    The tool is chosen from the flags: MetaGeneMark when ``meta`` is true,
    GeneMarkS when ``prokaryote`` is true, GeneMark-ES otherwise.  Every FASTA
    file is handed to ``predict_genes`` in a joblib worker; each worker result
    is unpacked as ``(unique_count, count)`` and the values that are not None
    are stored in the report of the corresponding assembly.

    Args:
        fasta_fpaths: paths of the assemblies to process.
        gene_lengths: forwarded unchanged to ``predict_genes``.
        out_dirpath: output directory, created if missing.  A ``tmp``
            sub-directory is created inside it and removed at the end unless
            ``qconfig.debug`` is set.
        prokaryote: select GeneMarkS (only consulted when ``meta`` is false).
        meta: select MetaGeneMark.

    Returns:
        None.  The function returns early (after a warning) when license
        limitations are active, when the tool directory for this platform is
        missing, or when ``install_genemark`` reports failure.
    """
    logger.print_timestamp()
    if LICENSE_LIMITATIONS_MODE:
        logger.warning("GeneMark tool can't be started because of license limitations!")
        return

    if meta:
        tool_name = 'MetaGeneMark'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_metagenomic
    elif prokaryote:
        tool_name = 'GeneMarkS'
        tool_dirname = 'genemark'
        gmhmm_p_function = gmhmm_p_everyGC
    else:
        tool_name = 'GeneMark-ES'
        tool_dirname = 'genemark-es'
        gmhmm_p_function = gm_es

    logger.main_info('Running %s...' % tool_name)

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, tool_dirname, qconfig.platform_name)
    if not os.path.exists(tool_dirpath):
        logger.warning(' Sorry, can\'t use %s on this platform, skipping gene prediction.' % tool_name)
        return

    # Prepare the tool that was actually selected: the directory is derived
    # from tool_dirname, so GeneMark-ES is set up in its own directory rather
    # than in the prokaryotic 'genemark' one.
    if not install_genemark(tool_dirpath):
        return

    if not os.path.isdir(out_dirpath):
        os.mkdir(out_dirpath)
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    # Keep at least one worker: with an empty fasta_fpaths the plain min() is 0,
    # which made the integer division below raise ZeroDivisionError.
    n_jobs = max(1, min(len(fasta_fpaths), qconfig.max_threads))
    # Threads left for each worker once the jobs are spread over max_threads.
    num_threads = max(1, qconfig.max_threads // n_jobs)

    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_genes)(
            index, fasta_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath,
            gmhmm_p_function, prokaryote, num_threads)
        for index, fasta_fpath in enumerate(fasta_fpaths))

    # saving results
    for i, fasta_path in enumerate(fasta_fpaths):
        report = reporting.get(fasta_path)
        unique_count, count = results[i]
        if unique_count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique_count)
        if count is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, count)
        if unique_count is None and count is None:
            # The size hint is only relevant for GeneMark-ES on small inputs.
            hint = ''
            if tool_name == 'GeneMark-ES' and os.path.getsize(fasta_path) < 2000000:
                hint = ('File may be too small for GeneMark-ES. '
                        'Try to use GeneMarkS instead (remove --eukaryote option).')
            logger.error(' ' + qutils.index_to_str(i) + 'Failed predicting genes in ' +
                         qutils.label_from_fpath(fasta_path) + '. ' + hint)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
def js_data_gen(assemblies, contigs_fpaths, chr_names, chromosomes_length, output_dir_path, cov_fpath, ref_fpath, genome_size):
    """Generate the JS data files and HTML pages of the contig alignment browser.

    For every plot (see "plot names" below) the function writes
    ``data_<name>.js`` and ``_<name>.html`` into the alignment plots directory
    under ``output_dir_path``, then writes ``alignment_summary.html`` into
    ``output_dir_path`` and copies the static CSS/JS assets next to the plots.

    Plot names: when ``contigs_analyzer.ref_labels_by_chromosomes`` is set,
    reference sequences are grouped by the label it maps them to; otherwise a
    single combined plot is used when ``genome_size`` is below
    ``MAX_SIZE_FOR_COMB_PLOT`` and there are at least
    ``MIN_CONTIGS_FOR_COMB_PLOT`` sequences; otherwise each sequence gets its
    own plot.

    Args:
        assemblies: object whose ``assemblies`` attribute is iterated; each
            item provides ``alignments`` (objects with ``ref_name``, ``name``,
            ``label``, ``misassembled``, ``unshifted_start`` ...).
        contigs_fpaths: paths of the assemblies; one 'FICTIVE' placeholder
            alignment per assembly is added to every reference sequence.
        chr_names: names of the reference sequences.
        chromosomes_length: mapping from reference sequence name to length.
        output_dir_path: directory that receives the summary and the plots.
        cov_fpath: optional coverage file; every line is split on whitespace
            and columns 0, 1 and 2 are read as sequence name, position and
            depth.
        ref_fpath: not used by this function.
        genome_size: total reference size, used only to pick the plot layout.

    Returns:
        None.
    """
    chr_to_aligned_blocks = dict()
    for ref_seq in chr_names:
        chr_init = []
        for fpath in contigs_fpaths:
            f = Alignment('FICTIVE', 0, 0, 0, 0, False, 0, 0, None)
            f.label = qutils.label_from_fpath(fpath)
            f.unshifted_start = 0
            f.unshifted_end = 0
            chr_init.append(f)
        chr_to_aligned_blocks.setdefault(ref_seq, chr_init)
    for assembly in assemblies.assemblies:
        for align in assembly.alignments:
            chr_to_aligned_blocks[align.ref_name].append(align)

    summary_fname = 'alignment_summary.html'
    summary_path = os.path.join(output_dir_path, summary_fname)
    output_all_files_dir_path = os.path.join(output_dir_path, alignment_plots_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)

    import contigs_analyzer
    contig_names_by_refs = contigs_analyzer.ref_labels_by_chromosomes

    def contigs_of(plot_name):
        # Reference sequences drawn on the plot called plot_name.
        if contigs_analyzer.ref_labels_by_chromosomes:
            return [contig for contig in chr_names if contig_names_by_refs[contig] == plot_name]
        if plot_name == NAME_FOR_ONE_PLOT:
            return chr_names
        return [plot_name]

    if contigs_analyzer.ref_labels_by_chromosomes:
        chr_full_names = list(set([contig_names_by_refs[contig] for contig in chr_names]))
    elif genome_size < MAX_SIZE_FOR_COMB_PLOT and len(chr_names) >= MIN_CONTIGS_FOR_COMB_PLOT:
        chr_full_names = [NAME_FOR_ONE_PLOT]
    else:
        chr_full_names = chr_names

    if cov_fpath:
        cov_data = dict()      # plot name -> averaged depth values
        not_covered = dict()   # plot name -> positions whose depth column is '0'
        cur_len = dict()       # plot name -> depth accumulated since the last flush
        contig_to_chr = {}
        for plot_name in chr_full_names:
            cov_data.setdefault(plot_name, [])
            not_covered.setdefault(plot_name, [])
            cur_len.setdefault(plot_name, 0)
            for contig in contigs_of(plot_name):
                contig_to_chr[contig] = plot_name
        with open(cov_fpath, 'r') as coverage:
            for index, line in enumerate(coverage):
                c = line.split()
                name = contig_to_chr[qutils.correct_name(c[0])]
                cur_len[name] += int(c[2])
                # Every 100th line of the file the accumulated depth of the
                # current plot is stored (divided by 100) and reset.
                if index % 100 == 0 and index > 0:
                    cov_data[name].append(cur_len[name] / 100)
                    cur_len[name] = 0
                if c[2] == '0':
                    not_covered[name].append(c[1])

    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    aligned_bases_by_chr = {}
    num_misassemblies = {}
    aligned_assemblies = {}

    for plot_name in chr_full_names:
        # NOTE(review): file names use only the first 30 characters, so two plot
        # names sharing that prefix would write to the same files.
        short_chr = plot_name[:30]
        num_misassemblies[plot_name] = 0
        aligned_bases_by_chr[plot_name] = []
        aligned_assemblies[plot_name] = []
        contigs = contigs_of(plot_name)

        with open(os.path.join(output_all_files_dir_path, 'data_%s.js' % short_chr), 'w') as result:
            result.write('"use strict";\n')
            if contigs_analyzer.ref_labels_by_chromosomes:
                result.write('var links_to_chromosomes = {};\n')
                links_to_chromosomes = []
                used_chromosomes = []

            chr_size = sum([chromosomes_length[contig] for contig in contigs])
            chr_sizes[plot_name] = chr_size
            num_contigs[plot_name] = len(contigs)
            for contig in contigs:
                aligned_bases_by_chr[plot_name].extend(aligned_bases[contig])

            data_str = 'var chromosomes_len = {};\n'
            for contig in contigs:
                data_str += 'chromosomes_len["{contig}"] = {l};\n'.format(
                    contig=contig, l=chromosomes_length[contig])
            result.write(data_str)

            # adding assembly data
            entries = []  # one JS object literal per alignment
            prev_len = 0
            # Leading 0 so that chr_lengths[k] is the length of the sequence
            # preceding contigs[k], and sum(chr_lengths[:k + 1]) is its offset.
            chr_lengths = [0] + [chromosomes_length[contig] for contig in contigs]
            for num_contig, contig in enumerate(contigs):
                prev_len += chr_lengths[num_contig]
                for alignment in chr_to_aligned_blocks[contig]:
                    if alignment.misassembled:
                        num_misassemblies[plot_name] += 1
                    corr_start = prev_len + alignment.unshifted_start
                    corr_end = prev_len + alignment.unshifted_end
                    entry = ('{{name: "{alignment.name}", corr_start: {corr_start}, corr_end: {corr_end},'
                             'start: {alignment.unshifted_start}, end: {alignment.unshifted_end}, '
                             'assembly: "{alignment.label}", similar: "{alignment.similar}", '
                             'misassembled: "{alignment.misassembled}" ').format(
                                 alignment=alignment, corr_start=corr_start, corr_end=corr_end)
                    if alignment.name == 'FICTIVE':
                        entries.append(entry + '}')
                        continue

                    if len(aligned_assemblies[plot_name]) < len(contigs_fpaths) and \
                            alignment.label not in aligned_assemblies[plot_name]:
                        aligned_assemblies[plot_name].append(alignment.label)

                    structure = []
                    for el in alignment.misassembled_structure:
                        if isinstance(el, list):
                            if el[5] in contigs:
                                corr_len = sum(chr_lengths[:contigs.index(el[5]) + 1])
                            else:
                                # The piece lies on a sequence that is not part of this plot.
                                corr_len = -int(el[1])
                                if contigs_analyzer.ref_labels_by_chromosomes and el[5] not in used_chromosomes:
                                    used_chromosomes.append(el[5])
                                    new_chr = contig_names_by_refs[el[5]]
                                    links_to_chromosomes.append(
                                        'links_to_chromosomes["{el[5]}"] = "{new_chr}";\n'.format(
                                            el=el, new_chr=new_chr))
                            corr_start = corr_len + int(el[0])
                            corr_end = corr_len + int(el[1])
                            structure.append(
                                ('{{type: "A", corr_start: {corr_start}, corr_end: {corr_end}, '
                                 'start: {el[0]}, end: {el[1]}, start_in_contig: {el[2]}, '
                                 'end_in_contig: {el[3]}, IDY: {el[4]}, chr: "{el[5]}"}}').format(
                                     el=el, corr_start=corr_start, corr_end=corr_end))
                        elif isinstance(el, str):
                            structure.append('{{type: "M", mstype: "{el}"}}'.format(el=el))
                    entries.append(entry + ', structure: [' + ','.join(structure) + ']}')

            data_str = 'var contig_data = {};\n'
            data_str += 'contig_data["{chr}"] = [ '.format(chr=plot_name)
            data_str += ''.join(item + ',' for item in entries)
            # Drops the trailing comma (or the space after '[' when there are no entries).
            data_str = data_str[:-1] + '];\n\n'
            result.write(data_str)

            if contigs_analyzer.ref_labels_by_chromosomes:
                result.write(''.join(links_to_chromosomes))

            if cov_fpath:
                # adding coverage data (nothing is written when there are no values)
                if cov_data[plot_name]:
                    result.write('var coverage_data = {};\n')
                    result.write('coverage_data["{chr}"] = [ '.format(chr=plot_name))
                    result.write(','.join('{0}'.format(e) for e in cov_data[plot_name]))
                    result.write('];\n')

                # The previous chunked writer compared each position with the last
                # element of cov_data (wrong list), so it could flush after the final
                # element and leave a dangling comma; joining avoids the guard entirely.
                result.write('var not_covered = {};\n')
                result.write('not_covered["{chr}"] = [ '.format(chr=plot_name))
                result.write(','.join('{0}'.format(e) for e in not_covered[plot_name]))
                result.write('];\n')

        with open(html_saver.get_real_path('_chr_templ.html'), 'r') as template:
            with open(os.path.join(output_all_files_dir_path,
                                   '_{short_chr}.html'.format(short_chr=short_chr)), 'w') as result:
                for line in template:
                    if '<script type="text/javascript" src=""></script>' in line:
                        # The empty script tag of the template is replaced, not copied.
                        result.write('<script type="text/javascript" src="data_{short_chr}.js"></script>\n'.format(
                            short_chr=short_chr))
                        continue
                    result.write(line)
                    if '<body>' in line:
                        chr_size = chr_sizes[plot_name]
                        chr_name = plot_name.replace('_', ' ')
                        if len(chr_name) > 50:
                            chr_name = chr_name[:50] + '...'
                        title = 'CONTIG ALIGNMENT BROWSER: %s (' % chr_name + \
                                ('%s fragments, ' % num_contigs[plot_name] if num_contigs[plot_name] > 1 else '') + \
                                '%s bp)' % format_long_numbers(chr_size)
                        result.write('<div class = "block title"><a href="../{summary_fname}"><button class="back_button">↵</button></a>{title}</div>\n'.format(
                            summary_fname=summary_fname, title=title))
                    if '<script type="text/javascript">' in line:
                        chromosome = '","'.join(contigs)
                        result.write('var CHROMOSOME = "{chr}";\n'.format(chr=plot_name))
                        result.write('var chrContigs = ["{chromosome}"];\n'.format(chromosome=chromosome))

    with open(html_saver.get_real_path('alignment_summary_templ.html'), 'r') as template:
        with open(summary_path, 'w') as result:
            num_aligned_assemblies = [len(aligned_assemblies[plot_name]) for plot_name in chr_full_names]
            # True when the plots do not all have the same number of aligned assemblies;
            # the per-row "# assemblies" column is shown only in that case.
            is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
            for line in template:
                result.write(line)
                if '<!--- assemblies: ---->' in line:
                    if not is_unaligned_asm_exists:
                        result.write('<div class="subtitle"># assemblies: %s</div>' % len(contigs_fpaths))
                if '<!--- th_assemblies: ---->' in line:
                    if is_unaligned_asm_exists:
                        result.write('<th># assemblies</th>')
                if '<!--- references: ---->' in line:
                    for plot_name in sorted(chr_full_names):
                        result.write('<tr>')
                        short_chr = plot_name[:30]
                        chr_link = os.path.join(alignment_plots_dirname,
                                                '_{short_chr}.html'.format(short_chr=short_chr))
                        chr_name = plot_name.replace('_', ' ')
                        aligned_lengths = [aligned_len for aligned_len in aligned_bases_by_chr[plot_name]
                                           if aligned_len is not None]
                        # Guard against a zero-length reference or an empty assembly list,
                        # which previously raised ZeroDivisionError.
                        denominator = chr_sizes[plot_name] * len(contigs_fpaths)
                        if denominator:
                            chr_genome = sum(aligned_lengths) * 100.0 / denominator
                        else:
                            chr_genome = 0.0
                        chr_size = chr_sizes[plot_name]
                        result.write('<td><a href="%s">%s</a></td>' % (chr_link, chr_name))
                        result.write('<td>%s</td>' % num_contigs[plot_name])
                        result.write('<td>%s</td>' % format_long_numbers(chr_size))
                        if is_unaligned_asm_exists:
                            result.write('<td>%s</td>' % len(aligned_assemblies[plot_name]))
                        result.write('<td>%.3f</td>' % chr_genome)
                        result.write('<td>%s</td>' % num_misassemblies[plot_name])
                        result.write('</tr>')

    copyfile(html_saver.get_real_path(os.path.join('static', 'contig_alignment_plot.css')),
             os.path.join(output_all_files_dir_path, 'contig_alignment_plot.css'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'd3.js')),
             os.path.join(output_all_files_dir_path, 'd3.js'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'scripts', 'contig_alignment_plot_script.js')),
             os.path.join(output_all_files_dir_path, 'contig_alignment_plot_script.js'))
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    """Compute genome coverage, gaps and found genes/operons for one assembly.

    The Nucmer coords file of the assembly is read from ``nucmer_path_dirpath``
    (``<name>.coords`` when ``qconfig.use_all_alignments`` is set, otherwise
    ``<name>.coords.filtered``).  Every aligned reference position is marked,
    the uncovered stretches of at least ``qconfig.min_gap_size`` positions are
    written to ``<name>_gaps.txt``, and the genes/operons found in the aligned
    blocks are written to ``<name>_genes.txt`` / ``<name>_operons.txt`` in
    ``genome_stats_dirpath``.

    Args:
        contigs_fpath: path of the assembly FASTA file.
        index: position of the assembly, used only for log prefixes.
        nucmer_path_dirpath: directory holding the coords files.
        genome_stats_dirpath: directory that receives the output text files.
        reference_chromosomes: mapping chromosome name -> length.
        genes_container: object with ``region_list`` and ``chr_names_dict``.
        operons_container: same interface as ``genes_container``.

    Returns:
        A tuple ``(results, genes_in_contigs, operons_in_contigs)``.
        ``results`` holds ``"covered_bp"``, ``"gaps_count"`` and, for each
        feature field, ``field + "_full"`` / ``field + "_partial"`` (both None
        when the container has no regions).  The two lists hold, per contig
        (longest first), the number of fully found genes / operons.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)

    results = dict()

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, assembly_name + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        # NOTE(review): unless logger.error terminates the run, the open() below
        # still raises for the missing file -- confirm the intended behaviour.
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
                     indent=' ')

    # One flag per reference position; index 0 is unused because positions are 1-based.
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.iteritems():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    # for cumulative plots: i-th element is the number of genes in i-th contig
    genes_in_contigs = [0] * len(sorted_contigs_names)
    operons_in_contigs = [0] * len(sorted_contigs_names)
    # for gene finding: contig_name --> list of AlignedBlock
    aligned_blocks_by_contig_name = dict((name, []) for name in sorted_contigs_names)

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    # =====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    with open(nucmer_fpath, 'r') as coordfile:
        # Skip the header: everything up to and including the '=====' separator.
        for line in coordfile:
            if line.startswith('='):
                break

        for line in coordfile:
            if line.strip() == '':
                break
            columns = line.split('|')
            ref_coords = columns[0].split()
            contig_coords = columns[1].split()
            s1 = int(ref_coords[0])
            e1 = int(ref_coords[1])
            s2 = int(contig_coords[0])
            e2 = int(contig_coords[1])
            tokens = line.split()
            contig_name = tokens[12].strip()
            chr_name = tokens[11].strip()

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") "
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")

            aligned_blocks_by_contig_name[contig_name].append(
                AlignedBlock(seqname=chr_name, start=s1, end=e1))
            chr_mapping = genome_mapping[chr_name]
            if s2 == 0 and e2 == 0:
                # special case: circular genome, contig starts on the end of
                # a chromosome and ends in the beginning
                for pos in range(s1, len(chr_mapping)):
                    chr_mapping[pos] = 1
                for pos in range(1, e1 + 1):
                    chr_mapping[pos] = 1
            else:
                for pos in range(s1, e1 + 1):
                    chr_mapping[pos] = 1

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, assembly_name + '_gaps.txt')
    with open(gaps_fpath, 'w') as gaps_file:
        for chr_name, chr_len in reference_chromosomes.iteritems():
            print >> gaps_file, chr_name
            cur_gap_size = 0
            for pos in range(1, chr_len + 1):
                if genome_mapping[chr_name][pos] == 1:
                    if cur_gap_size >= qconfig.min_gap_size:
                        gaps_count += 1
                        print >> gaps_file, pos - cur_gap_size, pos - 1
                    covered_bp += 1
                    cur_gap_size = 0
                else:
                    cur_gap_size += 1
            # A gap that reaches the end of the chromosome is closed here.
            if cur_gap_size >= qconfig.min_gap_size:
                gaps_count += 1
                print >> gaps_file, chr_len - cur_gap_size + 1, chr_len

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
            (genes_container, genes_in_contigs, reporting.Fields.GENES, '_genes.txt'),
            (operons_container, operons_in_contigs, reporting.Fields.OPERONS, '_operons.txt')]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, assembly_name + suffix)
        with open(found_fpath, 'w') as found_file:
            print >> found_file, '%s\t\t%s\t%s' % ('ID or #', 'Start', 'End')
            print >> found_file, '============================'

            # 0 - gene is not found,
            # 1 - gene is found,
            # 2 - part of gene is found
            found_list = [0] * len(container.region_list)
            for i, region in enumerate(container.region_list):
                found_list[i] = 0
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if container.chr_names_dict[region.seqname] != cur_block.seqname:
                            continue

                        # computing circular genomes
                        if cur_block.start > cur_block.end:
                            blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start,
                                                   end=region.end + 1),
                                      AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                continue
                            elif block.start <= region.start and region.end <= block.end:
                                if found_list[i] == 2:  # already found as partial gene
                                    total_partial -= 1
                                found_list[i] = 1
                                total_full += 1
                                # Separate name: the loop index i must stay an integer.
                                region_id = str(region.id)
                                if region_id == 'None':
                                    region_id = '# ' + str(region.number + 1)
                                print >> found_file, '%s\t\t%d\t%d' % (region_id, region.start, region.end)
                                feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig
                                cur_feature_is_found = True
                                break
                            elif found_list[i] == 0 and \
                                    min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                                found_list[i] = 2
                                total_partial += 1

                        if cur_feature_is_found:
                            break
                    if cur_feature_is_found:
                        break

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    return results, genes_in_contigs, operons_in_contigs
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Compute basic statistics of every assembly, save them and draw the plots.

    For each assembly the contig lengths and the number of 'N' characters are
    read, N50/L50, N75/L75 (and NG50/LG50, NG75/LG75 when a reference length
    is known) and GC content are computed and stored in the assembly report.
    Contig lengths and GC distributions are saved for the JSON and HTML
    reports, and the Nx/NGx, cumulative and GC content plots are drawn.

    Args:
        ref_fpath: path of the reference FASTA file, or a false value.  Without
            it ``qconfig.estimated_reference_size`` is used as the reference
            length when set.
        contigs_fpaths: paths of the assemblies.
        output_dirpath: directory for the plots, created if missing.
        json_output_dir: directory for JSON output, or a false value to skip it.
        results_dir: directory passed to the HTML saver and to the Nx plots.

    Returns:
        None.  As a side effect ``qconfig.min_difference`` is set from the
        largest contig length.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info(' Reference genome:')
        logger.info(' ' + os.path.basename(ref_fpath) + ', Reference length = ' + str(reference_length) +
                    ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info(' Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])

    # With many contigs the lengths saved for the reports are merged into groups
    # of `multiplicator` consecutive contigs (taken in decreasing length order).
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
        corr_lists_of_lengths = [
            [sum(lengths[((i - 1) * multiplicator):(i * multiplicator)])
             for i in range(1, max_points) if (i * multiplicator) < len(lengths)]
            for lengths in lists_of_lengths]
        # The last group collects every contig not covered by a full group.
        for lengths, corr_lengths in zip(lists_of_lengths, corr_lists_of_lengths):
            last_index = len(corr_lengths)
            corr_lengths.append(sum(lengths[last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = lists_of_lengths

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, corr_lists_of_lengths)
        json_saver.save_tick_x(json_output_dir, multiplicator)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    largest_contig = 0
    import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)

        # N's per 100 kbp is undefined for an empty assembly; computing it once
        # here keeps both the log line and the report free of a division by zero.
        if total_length != 0:
            ns_per_100_kbp = '%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))
        else:
            ns_per_100_kbp = None

        # The last conditional is parenthesised so that it only replaces the
        # final value instead of the whole message.
        logger.info(' ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' ' + ns_per_100_kbp if ns_per_100_kbp is not None else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
        report.add_field(reporting.Fields.TOTALLEN, total_length)
        if not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
        report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
        # None (as already done for GC) when the value is undefined.
        report.add_field(reporting.Fields.UNCALLED_PERCENT, ns_per_100_kbp)
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report and not qconfig.is_combined_ref:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    import plotter
    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                    output_dirpath + '/Nx_plot', 'Nx', [], json_output_dir=json_output_dir)
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                        output_dirpath + '/NGx_plot', 'NGx',
                        [reference_length for _ in contigs_fpaths], json_output_dir=json_output_dir)

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                output_dirpath + '/cumulative_plot', 'Cumulative length')
        if not qconfig.is_combined_ref:
            ########################################################################
            # Drawing GC content plot...
            # Copy first so that appending the reference does not modify the
            # list of per-assembly distributions.
            list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
            if ref_fpath:
                list_of_GC_distributions_with_ref.append(reference_GC_distribution)
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                    output_dirpath + '/GC_content_plot')

    logger.main_info('Done.')