def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
    """Append one reference sequence to its corrected FASTA and the combined reference.

    If ``total_references`` is greater than 1, the last path already stored in
    ``corrected_ref_fpaths`` is reused; otherwise a new path is obtained from
    ``qutils.unique_corrected_fpath`` and appended to that list.

    Unless ``qconfig.no_check`` is set, an upper-cased copy of ``seq`` has the
    codes M, K, R, Y, W, S, V, B, H, D replaced by N and is then checked for
    any character outside ACGTN; if one is found a warning is logged and
    nothing is written.  Note that the copy is used for validation only: the
    sequence written out is the ``seq`` argument itself.

    Returns ``(corr_seq_name, corr_seq_fpath)``, or ``(None, None)`` when the
    sequence is skipped.
    """
    seq_fname = ref_name + ref_fasta_ext

    if total_references > 1:
        corr_seq_fpath = corrected_ref_fpaths[-1]
    else:
        corr_seq_fpath = qutils.unique_corrected_fpath(
            os.path.join(corrected_dirpath, seq_fname))
        corrected_ref_fpaths.append(corr_seq_fpath)

    name_suffix = '_' + qutils.correct_name(seq_name[:20])
    corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + name_suffix

    if not qconfig.no_check:
        # Every listed code maps to N, so a single character class is
        # enough for the substitution.
        checked = re.sub(r'[MKRYWSVBHD]', 'N', seq.upper())
        if re.search(r'[^ACGTN]', checked) is not None:
            logger.warning('Skipping ' + ref_fpath + ' because it contains non-ACGTN characters.', indent=' ')
            return None, None

    record = [(corr_seq_name, seq)]
    for target_fpath in (corr_seq_fpath, combined_ref_fpath):
        fastaparser.write_fasta(target_fpath, record, 'a')

    contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = \
        qutils.name_from_fpath(corr_seq_fpath)
    chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))
    return corr_seq_name, corr_seq_fpath
def _correct_contigs(contigs_fpaths, corrected_dirpath, min_contig, labels):
    """Correct each contigs file and return an ``Assembly`` per successful file.

    For every input path the label at the same index in ``labels`` is used to
    build the corrected file path inside ``corrected_dirpath``.  A file is
    skipped (with a warning) when the summed length of its contigs that are
    at least ``min_contig`` long is zero, and skipped silently when
    ``quast.correct_fasta`` returns a falsy value.

    Returns the list of ``Assembly`` objects for the files that were kept.
    """
    assemblies = []

    for idx, contigs_fpath in enumerate(contigs_fpaths):
        base_name = os.path.basename(contigs_fpath)
        _, ctg_fasta_ext = qutils.splitext_for_fasta_file(base_name)

        label = labels[idx]
        corr_fpath = qutils.unique_corrected_fpath(
            os.path.join(corrected_dirpath, label + ctg_fasta_ext))
        assembly = Assembly(corr_fpath, label)
        logger.info(' %s ==> %s' % (contigs_fpath, label))

        # Handle fasta
        lengths = fastaparser.get_lengths_from_fastafile(contigs_fpath)
        long_enough_total = sum(length for length in lengths if length >= min_contig)
        if not long_enough_total:
            logger.warning(
                "Skipping %s because it doesn't contain contigs >= %d bp."
                % (base_name, min_contig))
            continue

        # correcting
        if quast.correct_fasta(contigs_fpath, corr_fpath, min_contig):
            assemblies.append(assembly)

    return assemblies
def _correct_contigs(contigs_fpaths, corrected_dirpath, min_contig, labels):
    """Run FASTA correction over all contigs files.

    ``labels[i]`` names the i-th file; the corrected copy is placed in
    ``corrected_dirpath`` under that label plus the original FASTA extension.
    Files whose contigs of length >= ``min_contig`` sum to zero are reported
    and dropped; files for which ``quast.correct_fasta`` returns a falsy
    value are dropped as well.

    Returns a list with one ``Assembly`` for each file that passed.
    """
    kept = []
    for position, source_fpath in enumerate(contigs_fpaths):
        source_fname = os.path.basename(source_fpath)
        _, ctg_fasta_ext = qutils.splitext_for_fasta_file(source_fname)
        label = labels[position]

        target_fpath = os.path.join(corrected_dirpath, label + ctg_fasta_ext)
        corr_fpath = qutils.unique_corrected_fpath(target_fpath)
        candidate = Assembly(corr_fpath, label)
        logger.info(' %s ==> %s' % (source_fpath, label))

        # Handle fasta: require at least some sequence in long-enough contigs.
        lengths = fastaparser.get_lengths_from_fastafile(source_fpath)
        qualifying = [length for length in lengths if length >= min_contig]
        if not sum(qualifying):
            message = "Skipping %s because it doesn't contain contigs >= %d bp." % (
                os.path.basename(source_fpath), min_contig)
            logger.warning(message)
            continue

        # correcting
        corrected_ok = quast.correct_fasta(source_fpath, corr_fpath, min_contig)
        if not corrected_ok:
            continue
        kept.append(candidate)
    return kept
def _correct_reference(ref_fpath, corrected_dirpath):
    """Write a corrected copy of the reference and return its path.

    The corrected file keeps the reference's base name and FASTA extension
    and is placed in ``corrected_dirpath``.  ``correct_fasta`` is called with
    ``qconfig.min_contig`` and ``is_reference=True``.

    Returns the corrected file path, or an empty string when
    ``correct_fasta`` returns a falsy value.
    """
    base_name, fasta_ext = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))
    target_fpath = os.path.join(corrected_dirpath, base_name + fasta_ext)
    corr_fpath = qutils.unique_corrected_fpath(target_fpath)

    corrected_ok = correct_fasta(ref_fpath, corr_fpath, qconfig.min_contig, is_reference=True)
    if not corrected_ok:
        return ''

    logger.main_info(' %s ==> %s' % (ref_fpath, qutils.name_from_fpath(corr_fpath)))
    return corr_fpath
def _correct_reference(ref_fpath, corrected_dirpath):
    """Correct the reference FASTA into ``corrected_dirpath``.

    Builds the destination from the reference's own file name, runs
    ``correct_fasta`` on it (minimum contig length ``qconfig.min_contig``,
    ``is_reference=True``) and logs the mapping on success.

    Returns the path of the corrected file, or ``''`` if ``correct_fasta``
    returned a falsy value.
    """
    ref_fname = os.path.basename(ref_fpath)
    stem, extension = qutils.splitext_for_fasta_file(ref_fname)
    corr_fpath = qutils.unique_corrected_fpath(
        os.path.join(corrected_dirpath, stem + extension))

    if correct_fasta(ref_fpath, corr_fpath, qconfig.min_contig, is_reference=True):
        corrected_label = qutils.name_from_fpath(corr_fpath)
        logger.info(' %s ==> %s' % (ref_fpath, corrected_label))
        result = corr_fpath
    else:
        result = ''
    return result
def _correct_contigs(contigs_fpaths, output_dirpath, labels):
    """Pair every contigs file with its label and plan its corrected location.

    No file is read or written here: the function only builds ``Assembly``
    objects and logs the ``path ==> label`` mapping.

    Returns ``(assemblies, corr_assemblies)`` where ``assemblies`` wrap the
    original paths and ``corr_assemblies`` wrap the paths obtained from
    ``qutils.unique_corrected_fpath`` under
    ``output_dirpath/qconfig.corrected_dirname``.
    """
    assemblies = []
    for idx, original_fpath in enumerate(contigs_fpaths):
        # Indexing (rather than zip) keeps the IndexError when labels is short.
        assemblies.append(Assembly(original_fpath, labels[idx]))

    show_index = len(labels) > 1
    corr_assemblies = []
    for file_counter, (contigs_fpath, label) in enumerate(zip(contigs_fpaths, labels)):
        _, ctg_fasta_ext = qutils.splitext_for_fasta_file(os.path.basename(contigs_fpath))
        planned_fpath = os.path.join(
            output_dirpath, qconfig.corrected_dirname, label + ctg_fasta_ext)
        corr_fpath = qutils.unique_corrected_fpath(planned_fpath)
        corr_assemblies.append(Assembly(corr_fpath, label))

        index_prefix = qutils.index_to_str(file_counter, force=show_index)
        logger.main_info(' ' + index_prefix + '%s ==> %s' % (contigs_fpath, label))

    return assemblies, corr_assemblies
def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
    """Write one reference sequence to its own corrected file and to the combined file.

    The file name is ``ref_name`` plus the extension; when
    ``total_references`` is greater than 1, an underscore and the corrected
    first 20 characters of ``seq_name`` are inserted before the extension.
    The resulting path is appended to ``corrected_ref_fpaths``.

    Returns the sequence name derived from the corrected file path.
    """
    name_suffix = ''
    if total_references > 1:
        name_suffix = '_' + qutils.correct_name(seq_name[:20])
    seq_fname = ref_name + name_suffix + ref_fasta_ext

    corr_seq_fpath = qutils.unique_corrected_fpath(
        os.path.join(corrected_dirpath, seq_fname))
    corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
    corrected_ref_fpaths.append(corr_seq_fpath)

    record = [(corr_seq_name, seq)]
    fastaparser.write_fasta(corr_seq_fpath, record, 'a')
    fastaparser.write_fasta(combined_ref_fpath, record, 'a')
    return corr_seq_name
def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
    """Store a reference sequence under a unique corrected path.

    The sequence is appended (mode ``'a'``) both to its own corrected file
    and to ``combined_ref_fpath``, using the name derived from the corrected
    path.  With more than one reference the file name also carries the
    corrected first 20 characters of ``seq_name``.

    Returns the name under which the sequence was written.
    """
    fname_parts = [ref_name]
    if total_references > 1:
        fname_parts.append('_' + qutils.correct_name(seq_name[:20]))
    fname_parts.append(ref_fasta_ext)
    seq_fname = ''.join(fname_parts)

    destination = os.path.join(corrected_dirpath, seq_fname)
    corr_seq_fpath = qutils.unique_corrected_fpath(destination)
    corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
    corrected_ref_fpaths.append(corr_seq_fpath)

    for out_fpath in (corr_seq_fpath, combined_ref_fpath):
        fastaparser.write_fasta(out_fpath, [(corr_seq_name, seq)], 'a')

    return corr_seq_name
def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references,
                ref_fpath):
    """Validate a reference sequence and append it to the corrected outputs.

    Path selection: with ``total_references`` > 1 the last entry of
    ``corrected_ref_fpaths`` is reused, otherwise a fresh unique path is
    created and recorded there.

    Validation (skipped when ``qconfig.no_check`` is true): the sequence is
    upper-cased, the codes M/K/R/Y/W/S/V/B/H/D are turned into N, and any
    remaining character other than A, C, G, T, N causes the sequence to be
    skipped with a warning.  The validated copy is not what gets written;
    the ``seq`` argument is.

    Side effects on success: the sequence is appended to the corrected file
    and to ``combined_ref_fpath``, and entries are added to
    ``contigs_analyzer.ref_labels_by_chromosomes`` and
    ``chromosomes_by_refs[ref_name]``.

    Returns ``(corr_seq_name, corr_seq_fpath)`` or ``(None, None)`` if skipped.
    """
    seq_fname = ref_name
    seq_fname += ref_fasta_ext

    reuse_last_path = total_references > 1
    if reuse_last_path:
        corr_seq_fpath = corrected_ref_fpaths[-1]
    else:
        corr_seq_fpath = qutils.unique_corrected_fpath(
            os.path.join(corrected_dirpath, seq_fname))
        corrected_ref_fpaths.append(corr_seq_fpath)

    corr_seq_name = '%s_%s' % (qutils.name_from_fpath(corr_seq_fpath),
                               qutils.correct_name(seq_name[:20]))

    if not qconfig.no_check:
        ambiguity_codes = 'MKRYWSVBHD'
        normalized = re.sub('[%s]' % ambiguity_codes, 'N', seq.upper())
        has_invalid_char = re.search(r'[^ACGTN]', normalized) is not None
        if has_invalid_char:
            logger.warning('Skipping ' + ref_fpath + ' because it contains non-ACGTN characters.',
                           indent=' ')
            return None, None

    fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
    fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)], 'a')

    ref_label = qutils.name_from_fpath(corr_seq_fpath)
    contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = ref_label
    chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))
    return corr_seq_name, corr_seq_fpath
def _correct_contigs(contigs_fpaths, output_dirpath, labels):
    """Build ``Assembly`` objects for the original and the corrected contigs paths.

    For each ``(contigs_fpath, label)`` pair a corrected path is requested
    from ``qutils.unique_corrected_fpath`` (inside the
    ``qconfig.corrected_dirname`` sub-directory of ``output_dirpath``, named
    after the label with the input's FASTA extension) and the mapping is
    logged.

    Returns a tuple ``(assemblies, corr_assemblies)``.
    """
    assemblies = []
    n_inputs = len(contigs_fpaths)
    idx = 0
    while idx < n_inputs:
        assemblies.append(Assembly(contigs_fpaths[idx], labels[idx]))
        idx += 1

    corr_assemblies = []
    file_counter = 0
    for contigs_fpath, label in zip(contigs_fpaths, labels):
        contigs_fname = os.path.basename(contigs_fpath)
        ctg_fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)[1]

        corr_fpath = qutils.unique_corrected_fpath(
            os.path.join(output_dirpath, qconfig.corrected_dirname,
                         label + ctg_fasta_ext))
        corr_assemblies.append(Assembly(corr_fpath, label))

        mapping_text = '%s ==> %s' % (contigs_fpath, label)
        logger.main_info(
            ' ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) + mapping_text)
        file_counter += 1

    return assemblies, corr_assemblies
def _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels):
    """Correct all contigs files and, with ``qconfig.scaffolds``, add broken versions.

    For each input file the label at the same index in ``labels`` determines
    the corrected file name inside ``corrected_dirpath``.  When
    ``qconfig.scaffolds`` is set, every sequence is additionally split at runs
    of ``N`` that are at least ``qconfig.Ns_break_threshold`` long; the pieces
    are written to ``<corrected name>_broken<ext>`` and registered under the
    label ``'<label> broken'``.

    Both the broken file and the original file are passed to
    ``_handle_fasta``; only paths for which it returns a truthy value are
    collected.

    Returns the list of accepted corrected file paths.
    """
    ## removing from contigs' names special characters because:
    ## 1) Some embedded tools can fail on some strings with "...", "+", "-", etc
    ## 2) Nucmer fails on names like "contig 1_bla_bla", "contig 2_bla_bla"
    ##    (it interprets as a contig's name only the first word of caption and
    ##    gets ambiguous contigs names)
    corrected_contigs_fpaths = []

    for file_idx, contigs_fpath in enumerate(contigs_fpaths):
        contigs_fname = os.path.basename(contigs_fpath)
        _, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

        label = labels[file_idx]
        corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
        qconfig.assembly_labels_by_fpath[corr_fpath] = label
        logger.info(' %s ==> %s' % (contigs_fpath, label))

        # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
        if qconfig.scaffolds:
            logger.info(" breaking scaffolds into contigs:")
            corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
            broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
            broken_scaffolds_fasta = []
            contigs_counter = 0
            # Counted separately from the scan cursor: the previous code reused
            # one variable for both, so the logged scaffold count was really
            # the last search position within the final sequence.
            scaffolds_counter = 0

            for name, seq in fastaparser.read_fasta(contigs_fpath):
                scaffolds_counter += 1
                name_prefix = name.split()[0] + "_"
                search_pos = 0
                cur_contig_number = 1
                cur_contig_start = 0
                while True:
                    start = seq.find("N", search_pos)
                    if start == -1:
                        break
                    # Extend over the whole run of consecutive N's.
                    end = start + 1
                    while (end != len(seq)) and (seq[end] == 'N'):
                        end += 1

                    # seq[end] is known not to be 'N', so resume one past it.
                    search_pos = end + 1
                    if (end - start) >= qconfig.Ns_break_threshold:
                        broken_scaffolds_fasta.append(
                            (name_prefix + str(cur_contig_number), seq[cur_contig_start:start]))
                        cur_contig_number += 1
                        cur_contig_start = end

                # Trailing piece (the whole sequence when nothing was split).
                broken_scaffolds_fasta.append(
                    (name_prefix + str(cur_contig_number), seq[cur_contig_start:]))
                contigs_counter += cur_contig_number

            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            qconfig.assembly_labels_by_fpath[broken_scaffolds_fpath] = label + ' broken'
            logger.info(" %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffolds_counter,
                         qutils.name_from_fpath(corr_fpath),
                         contigs_counter,
                         qutils.name_from_fpath(broken_scaffolds_fpath)))
            if _handle_fasta(broken_scaffolds_fpath, broken_scaffolds_fpath, reporting):
                corrected_contigs_fpaths.append(broken_scaffolds_fpath)
                qconfig.list_of_broken_scaffolds.append(qutils.name_from_fpath(broken_scaffolds_fpath))

        if _handle_fasta(contigs_fpath, corr_fpath, reporting):
            corrected_contigs_fpaths.append(corr_fpath)

    return corrected_contigs_fpaths
def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath, labels):
    """Plan the corrected path for one contigs file and optionally break its scaffolds.

    ``labels[file_counter]`` is the label of this file.  Messages are
    collected in a list instead of being logged directly, except for the
    "breaking scaffolds" notice, which goes straight to ``logger.info``.

    When ``qconfig.scaffolds`` is set, each sequence is split at runs of ``N``
    at least ``qconfig.Ns_break_threshold`` long.  The split version is
    written to ``<corrected name>_broken<ext>`` only if at least one split
    actually happened; otherwise a warning message is recorded.  An input
    with no sequences counts as "nothing was broken".

    Returns ``(corr_fpaths, broken_scaffolds, logs)`` where ``corr_fpaths`` is
    ``(contigs_fpath, corr_fpath)``, ``broken_scaffolds`` is either ``None``
    or a pair holding the broken file path twice, and ``logs`` is the list of
    collected messages.
    """
    broken_scaffolds = None

    contigs_fname = os.path.basename(contigs_fpath)
    _, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
    logs = []
    logs.append(' ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) + '%s ==> %s' % (contigs_fpath, label))

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        logger.info(' ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) + ' breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []
        contigs_counter = 0
        # Explicit count so that an input without sequences yields 0 scaffolds
        # (deriving it from the last enumerate index reported 1 in that case).
        scaffolds_total = 0

        for name, seq in fastaparser.read_fasta(contigs_fpath):
            scaffolds_total += 1
            name_prefix = name.split()[0] + "_"
            search_pos = 0
            total_contigs_for_the_scaf = 1
            cur_contig_start = 0
            while True:
                start = seq.find("N", search_pos)
                if start == -1:
                    break
                # Extend over the whole run of consecutive N's.
                end = start + 1
                while (end != len(seq)) and (seq[end] == 'N'):
                    end += 1

                # seq[end] is known not to be 'N', so resume one past it.
                search_pos = end + 1
                if (end - start) >= qconfig.Ns_break_threshold:
                    broken_scaffolds_fasta.append(
                        (name_prefix + str(total_contigs_for_the_scaf), seq[cur_contig_start:start]))
                    total_contigs_for_the_scaf += 1
                    cur_contig_start = end

            # Trailing piece (the whole sequence when nothing was split).
            broken_scaffolds_fasta.append(
                (name_prefix + str(total_contigs_for_the_scaf), seq[cur_contig_start:]))
            contigs_counter += total_contigs_for_the_scaf

        # More pieces than scaffolds means at least one scaffold was split.
        if scaffolds_total != contigs_counter:
            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            logs.append(" " + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                        " %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffolds_total,
                         label,
                         contigs_counter,
                         label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append(" " + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                        " WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs
def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath, labels):
    """Prepare one contigs file for correction, breaking scaffolds if requested.

    The file's label is ``labels[file_counter]``; the corrected path is built
    from that label and the input's FASTA extension inside
    ``corrected_dirpath``.  Log lines are accumulated and returned rather
    than emitted (the "breaking scaffolds" notice is the one exception).

    With ``qconfig.scaffolds`` enabled, sequences are cut at every run of
    ``N`` whose length is at least ``qconfig.Ns_break_threshold``.  The broken
    FASTA is written only when the number of resulting pieces differs from
    the number of input sequences; otherwise (including for an input with no
    sequences) a warning line is recorded and no file is written.

    Returns ``(corr_fpaths, broken_scaffolds, logs)``:
    ``corr_fpaths`` is ``(contigs_fpath, corr_fpath)``; ``broken_scaffolds``
    is ``None`` or ``(broken_fpath, broken_fpath)``; ``logs`` is a list of
    message strings.
    """
    broken_scaffolds = None

    contigs_fname = os.path.basename(contigs_fpath)
    _, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(
        os.path.join(corrected_dirpath, label + fasta_ext))
    logs = [
        ' ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
        '%s ==> %s' % (contigs_fpath, label)
    ]

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        logger.info(
            ' ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
            ' breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath,
                                         qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []
        contigs_counter = 0
        # Number of sequences read.  Kept as its own counter because taking
        # "last enumerate index + 1" claims one scaffold for an empty input.
        scaffolds_seen = 0

        for name, seq in fastaparser.read_fasta(contigs_fpath):
            scaffolds_seen += 1
            base_name = name.split()[0]
            seq_len = len(seq)
            pieces_in_scaffold = 1
            piece_start = 0

            gap_start = seq.find('N', 0)
            while gap_start != -1:
                # Find the end of this run of N's.
                gap_end = gap_start + 1
                while (gap_end != seq_len) and (seq[gap_end] == 'N'):
                    gap_end += 1

                if (gap_end - gap_start) >= qconfig.Ns_break_threshold:
                    broken_scaffolds_fasta.append(
                        (base_name + "_" + str(pieces_in_scaffold),
                         seq[piece_start:gap_start]))
                    pieces_in_scaffold += 1
                    piece_start = gap_end

                # seq[gap_end] is not 'N', so continue searching after it.
                gap_start = seq.find('N', gap_end + 1)

            # Remainder after the last break (or the whole sequence).
            broken_scaffolds_fasta.append(
                (base_name + "_" + str(pieces_in_scaffold), seq[piece_start:]))
            contigs_counter += pieces_in_scaffold

        if scaffolds_seen != contigs_counter:
            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            logs.append(
                " " + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                " %d scaffolds (%s) were broken into %d contigs (%s)" %
                (scaffolds_seen, label, contigs_counter, label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append(
                " " + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                " WARNING: nothing was broken, skipping '%s broken' from further analysis"
                % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs