def analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic=None): maxun = 10 epsilon = 0.99 unaligned = 0 partially_unaligned = 0 fully_unaligned_bases = 0 partially_unaligned_bases = 0 ambiguous_contigs = 0 ambiguous_contigs_extra_bases = 0 ambiguous_contigs_len = 0 half_unaligned_with_misassembly = 0 misassembly_internal_overlap = 0 ref_aligns = dict() contigs_aligned_lengths = [] aligned_lengths = [] region_misassemblies = [] misassembled_contigs = dict() misassemblies_in_contigs = [] region_struct_variations = find_all_sv(qconfig.bed) istranslocations_by_ref = dict() misassemblies_by_ref = defaultdict(list) for ref in ref_labels_by_chromosomes.values(): istranslocations_by_ref[ref] = dict((key, 0) for key in ref_labels_by_chromosomes.values()) # for counting SNPs and indels (both original (.all_snps) and corrected from local misassemblies) total_indels_info = IndelsInfo() unaligned_file = open(unaligned_fpath, 'w') unaligned_info_file = open(unaligned_info_fpath, 'w') unaligned_info_file.write('\t'.join(['Contig', 'Total_length', 'Unaligned_length', 'Unaligned_type', 'Unaligned_parts']) + '\n') for contig, seq in fastaparser.read_fasta(contigs_fpath): #Recording contig stats ctg_len = len(seq) ca_output.stdout_f.write('CONTIG: %s (%dbp)\n' % (contig, ctg_len)) contig_type = 'unaligned' misassemblies_in_contigs.append(0) contigs_aligned_lengths.append(0) filtered_aligns = [] if contig in aligns: filtered_aligns = [align for align in aligns[contig] if align.len2 >= qconfig.min_alignment] #Check if this contig aligned to the reference if filtered_aligns: contig_type = 'correct' #Sort aligns by aligned_length * identity - unaligned_length (as we do in BSS) sorted_aligns = sorted(filtered_aligns, key=lambda x: (score_single_align(x), x.len2), reverse=True) top_len = sorted_aligns[0].len2 top_id = sorted_aligns[0].idy top_score = score_single_align(sorted_aligns[0]) top_aligns = [] ca_output.stdout_f.write('Best alignment score: %.1f (LEN: %d, IDY: %.2f), Total number of alignments: %d\n' % (top_score, top_len, top_id, len(sorted_aligns))) #Check that top hit captures most of the contig if top_len > ctg_len * epsilon or ctg_len - top_len < maxun: #Reset top aligns: aligns that share the same value of longest and highest identity top_aligns.append(sorted_aligns[0]) sorted_aligns = sorted_aligns[1:] #Continue grabbing alignments while length and identity are identical #while sorted_aligns and top_len == sorted_aligns[0].len2 and top_id == sorted_aligns[0].idy: while sorted_aligns and (score_single_align(sorted_aligns[0]) >= qconfig.ambiguity_score * top_score): top_aligns.append(sorted_aligns[0]) sorted_aligns = sorted_aligns[1:] #Mark other alignments as insignificant (former ambiguous) if sorted_aligns: ca_output.stdout_f.write('\t\tSkipping these alignments as insignificant (option --ambiguity-score is set to "%s"):\n' % str(qconfig.ambiguity_score)) for align in sorted_aligns: ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n') if len(top_aligns) == 1: #There is only one top align, life is good ca_output.stdout_f.write('\t\tOne align captures most of this contig: %s\n' % str(top_aligns[0])) ca_output.icarus_out_f.write(top_aligns[0].icarus_report_str() + '\n') ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0]) ca_output.coords_filtered_f.write(top_aligns[0].coords_str() + '\n') aligned_lengths.append(top_aligns[0].len2) contigs_aligned_lengths[-1] = top_aligns[0].len2 else: #There is more than one top align ca_output.stdout_f.write('\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]\n' % len(top_aligns)) #Increment count of ambiguously mapped contigs and bases in them ambiguous_contigs += 1 # we count only extra bases, so we shouldn't include bases in the first alignment # if --ambiguity-usage is 'none', the number of extra bases will be negative! ambiguous_contigs_len += ctg_len # Alex: skip all alignments or count them as normal (just different aligns of one repeat). Depend on --allow-ambiguity option if qconfig.ambiguity_usage == "none": ambiguous_contigs_extra_bases -= top_aligns[0].len2 ca_output.stdout_f.write('\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):\n') for align in top_aligns: ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n') elif qconfig.ambiguity_usage == "one": ambiguous_contigs_extra_bases += 0 ca_output.stdout_f.write('\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):\n') ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(top_aligns[0])) ca_output.icarus_out_f.write(top_aligns[0].icarus_report_str() + '\n') ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0]) aligned_lengths.append(top_aligns[0].len2) contigs_aligned_lengths[-1] = top_aligns[0].len2 ca_output.coords_filtered_f.write(top_aligns[0].coords_str() + '\n') top_aligns = top_aligns[1:] for align in top_aligns: ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n') elif qconfig.ambiguity_usage == "all": ambiguous_contigs_extra_bases -= top_aligns[0].len2 ca_output.stdout_f.write('\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):\n') # we count only extra bases, so we shouldn't include bases in the first alignment first_alignment = True contig_type = 'ambiguous' while len(top_aligns): ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(top_aligns[0])) ca_output.icarus_out_f.write(top_aligns[0].icarus_report_str(ambiguity=True) + '\n') ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0]) if first_alignment: first_alignment = False aligned_lengths.append(top_aligns[0].len2) contigs_aligned_lengths[-1] = top_aligns[0].len2 ambiguous_contigs_extra_bases += top_aligns[0].len2 ca_output.coords_filtered_f.write(top_aligns[0].coords_str() + ' ambiguous\n') top_aligns = top_aligns[1:] else: # choose appropriate alignments (to maximize total size of contig alignment and reduce # misassemblies) is_ambiguous, too_much_best_sets, sorted_aligns, best_sets = get_best_aligns_sets( sorted_aligns, ctg_len, ca_output.stdout_f, seq, ref_lens, is_cyclic, region_struct_variations) the_best_set = best_sets[0] used_indexes = list(range(len(sorted_aligns)) if too_much_best_sets else get_used_indexes(best_sets)) if len(used_indexes) < len(sorted_aligns): ca_output.stdout_f.write('\t\t\tSkipping redundant alignments after choosing the best set of alignments\n') for idx in set([idx for idx in range(len(sorted_aligns)) if idx not in used_indexes]): ca_output.stdout_f.write('\t\tSkipping redundant alignment ' + str(sorted_aligns[idx]) + '\n') if is_ambiguous: ca_output.stdout_f.write('\t\tThis contig has several significant sets of alignments. [An ambiguously mapped contig]\n') # similar to regular ambiguous contigs, see above ambiguous_contigs += 1 ambiguous_contigs_len += ctg_len if qconfig.ambiguity_usage == "none": ambiguous_contigs_extra_bases -= (ctg_len - the_best_set.uncovered) ca_output.stdout_f.write('\t\tSkipping all alignments in these sets (option --ambiguity-usage is set to "none"):\n') for idx in used_indexes: ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(sorted_aligns[idx]) + '\n') continue elif qconfig.ambiguity_usage == "one": ambiguous_contigs_extra_bases += 0 ca_output.stdout_f.write('\t\tUsing only the very best set (option --ambiguity-usage is set to "one").\n') if len(the_best_set.indexes) < len(used_indexes): ca_output.stdout_f.write('\t\tSo, skipping alignments from other sets:\n') for idx in used_indexes: if idx not in the_best_set.indexes: ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(sorted_aligns[idx]) + '\n') elif qconfig.ambiguity_usage == "all": ca_output.stdout_f.write('\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):\n') ca_output.stdout_f.write('\t\t\tThe very best set is shown in details below, the rest are:\n') for idx, cur_set in enumerate(best_sets[1:]): ca_output.stdout_f.write('\t\t\t\tGroup #%d. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' % \ (idx + 2, cur_set.score, len(cur_set.indexes), cur_set.uncovered)) if too_much_best_sets: ca_output.stdout_f.write('\t\t\t\tetc...\n') if len(the_best_set.indexes) < len(used_indexes): ambiguous_contigs_extra_bases -= (ctg_len - the_best_set.uncovered) ca_output.stdout_f.write('\t\t\tList of alignments used in the sets above:\n') for idx in used_indexes: align = sorted_aligns[idx] ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(align)) ref_aligns.setdefault(align.ref, []).append(align) ambiguous_contigs_extra_bases += align.len2 ca_output.coords_filtered_f.write(align.coords_str() + " ambiguous\n") if idx not in the_best_set.indexes: ca_output.icarus_out_f.write(align.icarus_report_str(is_best=False) + '\n') ca_output.stdout_f.write('\t\t\tThe best set is below. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' % \ (the_best_set.score, len(the_best_set.indexes), the_best_set.uncovered)) real_aligns = [sorted_aligns[i] for i in the_best_set.indexes] # main processing part if len(real_aligns) == 1: the_only_align = real_aligns[0] #There is only one alignment of this contig to the reference ca_output.coords_filtered_f.write(the_only_align.coords_str() + '\n') aligned_lengths.append(the_only_align.len2) contigs_aligned_lengths[-1] = the_only_align.len2 begin, end = the_only_align.start(), the_only_align.end() unaligned_bases = (begin - 1) + (ctg_len - end) number_unaligned_ns = seq[:begin - 1].count('N') + seq[end:].count('N') aligned_bases_in_contig = ctg_len - unaligned_bases acgt_ctg_len = ctg_len - seq.count('N') is_partially_unaligned = check_partially_unaligned(seq, real_aligns, ctg_len) if is_partially_unaligned: partially_unaligned += 1 partially_unaligned_bases += unaligned_bases - number_unaligned_ns if aligned_bases_in_contig < qconfig.unaligned_mis_threshold * acgt_ctg_len: contig_type = 'correct_unaligned' ca_output.stdout_f.write('\t\tThis contig is partially unaligned. ' '(Aligned %d out of %d non-N bases (%.2f%%))\n' % (aligned_bases_in_contig, acgt_ctg_len, 100.0 * aligned_bases_in_contig / acgt_ctg_len)) save_unaligned_info(real_aligns, contig, ctg_len, unaligned_bases, unaligned_info_file) ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(the_only_align)) ca_output.icarus_out_f.write(the_only_align.icarus_report_str() + '\n') if is_partially_unaligned: if begin - 1: ca_output.stdout_f.write('\t\tUnaligned bases: 1 to %d (%d)\n' % (begin - 1, begin - 1)) if ctg_len - end: ca_output.stdout_f.write('\t\tUnaligned bases: %d to %d (%d)\n' % (end + 1, ctg_len, ctg_len - end)) if qconfig.is_combined_ref: check_for_potential_translocation(seq, ctg_len, real_aligns, region_misassemblies, misassemblies_by_ref, ca_output.stdout_f) ref_aligns.setdefault(the_only_align.ref, []).append(the_only_align) else: #Sort real alignments by position on the contig sorted_aligns = sorted(real_aligns, key=lambda x: (x.end(), x.start())) #There is more than one alignment of this contig to the reference ca_output.stdout_f.write('\t\tThis contig is misassembled.\n') unaligned_bases = the_best_set.uncovered number_unaligned_ns, prev_pos = 0, 0 for align in sorted_aligns: number_unaligned_ns += seq[prev_pos: align.start() - 1].count('N') prev_pos = align.end() number_unaligned_ns += seq[prev_pos:].count('N') aligned_bases_in_contig = ctg_len - unaligned_bases number_ns = seq.count('N') acgt_ctg_len = ctg_len - number_ns is_partially_unaligned = check_partially_unaligned(seq, sorted_aligns, ctg_len) if is_partially_unaligned: partially_unaligned += 1 partially_unaligned_bases += unaligned_bases - number_unaligned_ns ca_output.stdout_f.write('\t\tThis contig is partially unaligned. ' '(Aligned %d out of %d non-N bases (%.2f%%))\n' % (aligned_bases_in_contig, acgt_ctg_len, 100.0 * aligned_bases_in_contig / acgt_ctg_len)) save_unaligned_info(sorted_aligns, contig, ctg_len, unaligned_bases, unaligned_info_file) if aligned_bases_in_contig < qconfig.unaligned_mis_threshold * acgt_ctg_len: ca_output.stdout_f.write('\t\t\tWarning! This contig is more unaligned than misassembled. ' + \ 'Contig length is %d (number of Ns: %d) and total length of all aligns is %d\n' % (ctg_len, number_ns, aligned_bases_in_contig)) contigs_aligned_lengths[-1] = sum(align.len2 for align in sorted_aligns) for align in sorted_aligns: ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(align)) ca_output.icarus_out_f.write(align.icarus_report_str() + '\n') ca_output.icarus_out_f.write('unknown\n') ca_output.coords_filtered_f.write(align.coords_str() + '\n') aligned_lengths.append(align.len2) ref_aligns.setdefault(align.ref, []).append(align) half_unaligned_with_misassembly += 1 ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' % unaligned_bases) contig_type = 'mis_unaligned' ca_output.icarus_out_f.write('\t'.join(['CONTIG', contig, str(ctg_len), contig_type + '\n'])) ca_output.stdout_f.write('\n') continue ### processing misassemblies is_misassembled, current_mio, indels_info, cnt_misassemblies, contig_aligned_length = \ process_misassembled_contig(sorted_aligns, is_cyclic, aligned_lengths, region_misassemblies, ref_lens, ref_aligns, ref_features, seq, misassemblies_by_ref, istranslocations_by_ref, region_struct_variations, ca_output) contigs_aligned_lengths[-1] = contig_aligned_length misassembly_internal_overlap += current_mio total_indels_info += indels_info if is_misassembled: misassembled_contigs[contig] = ctg_len contig_type = 'misassembled' misassemblies_in_contigs[-1] = cnt_misassemblies if is_partially_unaligned: ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' % unaligned_bases) if qconfig.is_combined_ref: check_for_potential_translocation(seq, ctg_len, sorted_aligns, region_misassemblies, misassemblies_by_ref, ca_output.stdout_f) else: #No aligns to this contig ca_output.stdout_f.write('\t\tThis contig is unaligned. (%d bp)\n' % ctg_len) unaligned_file.write(contig + '\n') #Increment unaligned contig count and bases unaligned += 1 number_ns = seq.count('N') fully_unaligned_bases += ctg_len - number_ns ca_output.stdout_f.write('\t\tUnaligned bases: %d (number of Ns: %d)\n' % (ctg_len, number_ns)) save_unaligned_info([], contig, ctg_len, ctg_len, unaligned_info_file) ca_output.icarus_out_f.write('\t'.join(['CONTIG', contig, str(ctg_len), contig_type]) + '\n') ca_output.stdout_f.write('\n') unaligned_file.close() unaligned_info_file.close() misassembled_bases = sum(misassembled_contigs.values()) # special case: --skip-unaligned-mis-contigs is specified if qconfig.unaligned_mis_threshold == 0.0: half_unaligned_with_misassembly = None result = {'region_misassemblies': region_misassemblies, 'region_struct_variations': region_struct_variations.get_count() if region_struct_variations else None, 'misassembled_contigs': misassembled_contigs, 'misassembled_bases': misassembled_bases, 'misassembly_internal_overlap': misassembly_internal_overlap, 'unaligned': unaligned, 'partially_unaligned': partially_unaligned, 'partially_unaligned_bases': partially_unaligned_bases, 'fully_unaligned_bases': fully_unaligned_bases, 'ambiguous_contigs': ambiguous_contigs, 'ambiguous_contigs_extra_bases': ambiguous_contigs_extra_bases, 'ambiguous_contigs_len': ambiguous_contigs_len, 'half_unaligned_with_misassembly': half_unaligned_with_misassembly, 'misassemblies_by_ref': misassemblies_by_ref, 'istranslocations_by_refs': istranslocations_by_ref} return result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, contigs_aligned_lengths
def analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic=None): maxun = 10 epsilon = 0.99 umt = 0.5 # threshold for misassembled contigs with aligned less than $umt * 100% (Unaligned Missassembled Threshold) unaligned = 0 partially_unaligned = 0 fully_unaligned_bases = 0 partially_unaligned_bases = 0 ambiguous_contigs = 0 ambiguous_contigs_extra_bases = 0 ambiguous_contigs_len = 0 half_unaligned_with_misassembly = 0 misassembly_internal_overlap = 0 misassemblies_matched_sv = 0 ref_aligns = dict() contigs_aligned_lengths = [] aligned_lengths = [] region_misassemblies = [] misassembled_contigs = dict() misassemblies_in_contigs = [] region_struct_variations = find_all_sv(qconfig.bed) istranslocations_by_ref = dict() misassemblies_by_ref = defaultdict(list) for ref in ref_labels_by_chromosomes.values(): istranslocations_by_ref[ref] = dict( (key, 0) for key in ref_labels_by_chromosomes.values()) # for counting SNPs and indels (both original (.all_snps) and corrected from Nucmer's local misassemblies) total_indels_info = IndelsInfo() unaligned_file = open(unaligned_fpath, 'w') unaligned_info_file = open(unaligned_info_fpath, 'w') unaligned_info_file.write('\t'.join([ 'Contig', 'Total_length', 'Unaligned_length', 'Unaligned_type', 'Unaligned_parts' ]) + '\n') for contig, seq in fastaparser.read_fasta(contigs_fpath): #Recording contig stats ctg_len = len(seq) ca_output.stdout_f.write('CONTIG: %s (%dbp)\n' % (contig, ctg_len)) contig_type = 'unaligned' misassemblies_in_contigs.append(0) contigs_aligned_lengths.append(0) #Check if this contig aligned to the reference if contig in aligns: for align in aligns[contig]: sub_seq = seq[align.start():align.end()] if 'N' in sub_seq: ns_pos = [ pos for pos in range(align.start(), align.end()) if seq[pos] == 'N' ] contig_type = 'correct' #Pull all aligns for this contig num_aligns = len(aligns[contig]) #Sort aligns by aligned_length * identity - unaligned_length (as we do in BSS) sorted_aligns = sorted(aligns[contig], key=lambda x: (score_single_align(x), x.len2), reverse=True) top_len = sorted_aligns[0].len2 top_id = sorted_aligns[0].idy top_score = score_single_align(sorted_aligns[0]) top_aligns = [] ca_output.stdout_f.write( 'Best alignment score: %.1f (LEN: %d, IDY: %.2f)\n' % (top_score, top_len, top_id)) #Check that top hit captures most of the contig if top_len > ctg_len * epsilon or ctg_len - top_len < maxun: #Reset top aligns: aligns that share the same value of longest and highest identity top_aligns.append(sorted_aligns[0]) sorted_aligns = sorted_aligns[1:] #Continue grabbing alignments while length and identity are identical #while sorted_aligns and top_len == sorted_aligns[0].len2 and top_id == sorted_aligns[0].idy: while sorted_aligns and (score_single_align( sorted_aligns[0]) >= qconfig.ambiguity_score * top_score): top_aligns.append(sorted_aligns[0]) sorted_aligns = sorted_aligns[1:] #Mark other alignments as insignificant (former ambiguous) if sorted_aligns: ca_output.stdout_f.write( '\t\tSkipping these alignments as insignificant (option --ambiguity-score is set to "%s"):\n' % str(qconfig.ambiguity_score)) for align in sorted_aligns: ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n') if len(top_aligns) == 1: #There is only one top align, life is good ca_output.stdout_f.write( '\t\tOne align captures most of this contig: %s\n' % str(top_aligns[0])) ca_output.icarus_out_f.write( top_aligns[0].icarus_report_str() + '\n') ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0]) ca_output.coords_filtered_f.write( str(top_aligns[0]) + '\n') aligned_lengths.append(top_aligns[0].len2) contigs_aligned_lengths[-1] = top_aligns[0].len2 else: #There is more than one top align ca_output.stdout_f.write( '\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]\n' % len(top_aligns)) #Increment count of ambiguously mapped contigs and bases in them ambiguous_contigs += 1 # we count only extra bases, so we shouldn't include bases in the first alignment # if --ambiguity-usage is 'none', the number of extra bases will be negative! ambiguous_contigs_len += ctg_len # Alex: skip all alignments or count them as normal (just different aligns of one repeat). Depend on --allow-ambiguity option if qconfig.ambiguity_usage == "none": ambiguous_contigs_extra_bases -= top_aligns[0].len2 ca_output.stdout_f.write( '\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):\n' ) for align in top_aligns: ca_output.stdout_f.write( '\t\t\tSkipping alignment ' + str(align) + '\n') elif qconfig.ambiguity_usage == "one": ambiguous_contigs_extra_bases += 0 ca_output.stdout_f.write( '\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):\n' ) ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(top_aligns[0])) ca_output.icarus_out_f.write( top_aligns[0].icarus_report_str() + '\n') ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0]) aligned_lengths.append(top_aligns[0].len2) contigs_aligned_lengths[-1] = top_aligns[0].len2 ca_output.coords_filtered_f.write( str(top_aligns[0]) + '\n') top_aligns = top_aligns[1:] for align in top_aligns: ca_output.stdout_f.write( '\t\t\tSkipping alignment ' + str(align) + '\n') elif qconfig.ambiguity_usage == "all": ambiguous_contigs_extra_bases -= top_aligns[0].len2 ca_output.stdout_f.write( '\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):\n' ) # we count only extra bases, so we shouldn't include bases in the first alignment first_alignment = True contig_type = 'ambiguous' while len(top_aligns): ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(top_aligns[0])) ca_output.icarus_out_f.write( top_aligns[0].icarus_report_str( ambiguity=True) + '\n') ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0]) if first_alignment: first_alignment = False aligned_lengths.append(top_aligns[0].len2) contigs_aligned_lengths[-1] = top_aligns[ 0].len2 ambiguous_contigs_extra_bases += top_aligns[0].len2 ca_output.coords_filtered_f.write( str(top_aligns[0]) + ' ambiguous\n') top_aligns = top_aligns[1:] else: # choose appropriate alignments (to maximize total size of contig alignment and reduce # misassemblies) is_ambiguous, too_much_best_sets, sorted_aligns, best_sets = get_best_aligns_sets( sorted_aligns, ctg_len, ca_output.stdout_f, seq, ref_lens, is_cyclic, region_struct_variations) the_best_set = best_sets[0] used_indexes = list( range(len(sorted_aligns)) if too_much_best_sets else get_used_indexes(best_sets)) if len(used_indexes) < len(sorted_aligns): ca_output.stdout_f.write( '\t\t\tSkipping redundant alignments after choosing the best set of alignments\n' ) for idx in set([ idx for idx in range(len(sorted_aligns)) if idx not in used_indexes ]): ca_output.stdout_f.write( '\t\tSkipping redundant alignment ' + str(sorted_aligns[idx]) + '\n') if is_ambiguous: ca_output.stdout_f.write( '\t\tThis contig has several significant sets of alignments. [An ambiguously mapped contig]\n' ) # similar to regular ambiguous contigs, see above ambiguous_contigs += 1 ambiguous_contigs_len += ctg_len if qconfig.ambiguity_usage == "none": ambiguous_contigs_extra_bases -= ( ctg_len - the_best_set.uncovered) ca_output.stdout_f.write( '\t\tSkipping all alignments in these sets (option --ambiguity-usage is set to "none"):\n' ) for idx in used_indexes: ca_output.stdout_f.write( '\t\t\tSkipping alignment ' + str(sorted_aligns[idx]) + '\n') continue elif qconfig.ambiguity_usage == "one": ambiguous_contigs_extra_bases += 0 ca_output.stdout_f.write( '\t\tUsing only the very best set (option --ambiguity-usage is set to "one").\n' ) if len(the_best_set.indexes) < len(used_indexes): ca_output.stdout_f.write( '\t\tSo, skipping alignments from other sets:\n' ) for idx in used_indexes: if idx not in the_best_set.indexes: ca_output.stdout_f.write( '\t\t\tSkipping alignment ' + str(sorted_aligns[idx]) + '\n') elif qconfig.ambiguity_usage == "all": ca_output.stdout_f.write( '\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):\n' ) ca_output.stdout_f.write( '\t\t\tThe very best set is shown in details below, the rest are:\n' ) for idx, cur_set in enumerate(best_sets[1:]): ca_output.stdout_f.write('\t\t\t\tGroup #%d. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' % \ (idx + 2, cur_set.score, len(cur_set.indexes), cur_set.uncovered)) if too_much_best_sets: ca_output.stdout_f.write('\t\t\t\tetc...\n') if len(the_best_set.indexes) < len(used_indexes): ambiguous_contigs_extra_bases -= ( ctg_len - the_best_set.uncovered) ca_output.stdout_f.write( '\t\t\tList of alignments used in the sets above:\n' ) for idx in used_indexes: align = sorted_aligns[idx] ca_output.stdout_f.write( '\t\tAlignment: %s\n' % str(align)) ref_aligns.setdefault(align.ref, []).append(align) ambiguous_contigs_extra_bases += align.len2 ca_output.coords_filtered_f.write( str(align) + " ambiguous\n") if idx not in the_best_set.indexes: ca_output.icarus_out_f.write( align.icarus_report_str( is_best=False) + '\n') ca_output.stdout_f.write('\t\t\tThe best set is below. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' % \ (the_best_set.score, len(the_best_set.indexes), the_best_set.uncovered)) real_aligns = [sorted_aligns[i] for i in the_best_set.indexes] # main processing part if len(real_aligns) == 1: the_only_align = real_aligns[0] #There is only one alignment of this contig to the reference ca_output.coords_filtered_f.write( str(the_only_align) + '\n') aligned_lengths.append(the_only_align.len2) contigs_aligned_lengths[-1] = the_only_align.len2 begin, end = the_only_align.start(), the_only_align.end() unaligned_bases = (begin - 1) + (ctg_len - end) aligned_bases_in_contig = ctg_len - unaligned_bases is_partially_unaligned = check_partially_unaligned( real_aligns, ctg_len) if is_partially_unaligned: partially_unaligned += 1 partially_unaligned_bases += unaligned_bases if aligned_bases_in_contig < umt * ctg_len: contig_type = 'correct_unaligned' ca_output.stdout_f.write( '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)\n' % (aligned_bases_in_contig, ctg_len)) save_unaligned_info(real_aligns, contig, ctg_len, unaligned_bases, unaligned_info_file) ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(the_only_align)) ca_output.icarus_out_f.write( the_only_align.icarus_report_str() + '\n') if is_partially_unaligned: if begin - 1: ca_output.stdout_f.write( '\t\tUnaligned bases: 1 to %d (%d)\n' % (begin - 1, begin - 1)) if ctg_len - end: ca_output.stdout_f.write( '\t\tUnaligned bases: %d to %d (%d)\n' % (end + 1, ctg_len, ctg_len - end)) if qconfig.is_combined_ref and aligned_bases_in_contig >= umt * ctg_len: check_for_potential_translocation( seq, ctg_len, real_aligns, region_misassemblies, misassemblies_by_ref, ca_output.stdout_f) ref_aligns.setdefault(the_only_align.ref, []).append(the_only_align) else: #Sort real alignments by position on the contig sorted_aligns = sorted(real_aligns, key=lambda x: (x.end(), x.start())) #There is more than one alignment of this contig to the reference ca_output.stdout_f.write( '\t\tThis contig is misassembled. %d total aligns.\n' % num_aligns) unaligned_bases = the_best_set.uncovered aligned_bases_in_contig = ctg_len - unaligned_bases is_partially_unaligned = check_partially_unaligned( sorted_aligns, ctg_len) if is_partially_unaligned: partially_unaligned += 1 partially_unaligned_bases += unaligned_bases if aligned_bases_in_contig >= umt * ctg_len: ca_output.stdout_f.write( '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)\n' % (aligned_bases_in_contig, ctg_len)) save_unaligned_info(sorted_aligns, contig, ctg_len, unaligned_bases, unaligned_info_file) if aligned_bases_in_contig < umt * ctg_len: ca_output.stdout_f.write('\t\t\tWarning! This contig is more unaligned than misassembled. ' + \ 'Contig length is %d and total length of all aligns is %d\n' % (ctg_len, aligned_bases_in_contig)) contigs_aligned_lengths[-1] = sum( align.len2 for align in sorted_aligns) for align in sorted_aligns: ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(align)) ca_output.icarus_out_f.write( align.icarus_report_str() + '\n') ca_output.icarus_out_f.write('unknown\n') ca_output.coords_filtered_f.write( str(align) + '\n') aligned_lengths.append(align.len2) ref_aligns.setdefault(align.ref, []).append(align) half_unaligned_with_misassembly += 1 ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' % unaligned_bases) contig_type = 'mis_unaligned' ca_output.icarus_out_f.write('\t'.join([ 'CONTIG', contig, str(ctg_len), contig_type + '\n' ])) ca_output.stdout_f.write('\n') continue ### processing misassemblies is_misassembled, current_mio, indels_info, misassemblies_matched_sv, cnt_misassemblies, contig_aligned_length = \ process_misassembled_contig(sorted_aligns, is_cyclic, aligned_lengths, region_misassemblies, ref_lens, ref_aligns, ref_features, seq, misassemblies_by_ref, istranslocations_by_ref, region_struct_variations, misassemblies_matched_sv, ca_output) contigs_aligned_lengths[-1] = contig_aligned_length misassembly_internal_overlap += current_mio total_indels_info += indels_info if is_misassembled: misassembled_contigs[contig] = ctg_len contig_type = 'misassembled' misassemblies_in_contigs[-1] = cnt_misassemblies if is_partially_unaligned: ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' % unaligned_bases) if qconfig.is_combined_ref: check_for_potential_translocation( seq, ctg_len, sorted_aligns, region_misassemblies, misassemblies_by_ref, ca_output.stdout_f) else: #No aligns to this contig ca_output.stdout_f.write( '\t\tThis contig is unaligned. (%d bp)\n' % ctg_len) unaligned_file.write(contig) #Increment unaligned contig count and bases unaligned += 1 fully_unaligned_bases += ctg_len ca_output.stdout_f.write('\t\tUnaligned bases: %d total: %d\n' % (ctg_len, fully_unaligned_bases)) save_unaligned_info([], contig, ctg_len, ctg_len, unaligned_info_file) ca_output.icarus_out_f.write('\t'.join( ['CONTIG', contig, str(ctg_len), contig_type]) + '\n') ca_output.stdout_f.write('\n') unaligned_file.close() unaligned_info_file.close() misassembled_bases = sum(misassembled_contigs.values()) result = { 'region_misassemblies': region_misassemblies, 'region_struct_variations': region_struct_variations.get_count() if region_struct_variations else None, 'misassemblies_matched_sv': misassemblies_matched_sv, 'misassembled_contigs': misassembled_contigs, 'misassembled_bases': misassembled_bases, 'misassembly_internal_overlap': misassembly_internal_overlap, 'unaligned': unaligned, 'partially_unaligned': partially_unaligned, 'partially_unaligned_bases': partially_unaligned_bases, 'fully_unaligned_bases': fully_unaligned_bases, 'ambiguous_contigs': ambiguous_contigs, 'ambiguous_contigs_extra_bases': ambiguous_contigs_extra_bases, 'ambiguous_contigs_len': ambiguous_contigs_len, 'half_unaligned_with_misassembly': half_unaligned_with_misassembly, 'misassemblies_by_ref': misassemblies_by_ref, 'istranslocations_by_refs': istranslocations_by_ref } return result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, contigs_aligned_lengths
def analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, aligns, ref_features, ref_lens, cyclic=None): maxun = 10 epsilon = 0.99 umt = 0.5 # threshold for misassembled contigs with aligned less than $umt * 100% (Unaligned Missassembled Threshold) unaligned = 0 partially_unaligned = 0 fully_unaligned_bases = 0 partially_unaligned_bases = 0 ambiguous_contigs = 0 ambiguous_contigs_extra_bases = 0 ambiguous_contigs_len = 0 partially_unaligned_with_misassembly = 0 partially_unaligned_with_significant_parts = 0 misassembly_internal_overlap = 0 contigs_with_istranslocations = 0 misassemblies_matched_sv = 0 ref_aligns = dict() aligned_lengths = [] region_misassemblies = [] misassembled_contigs = dict() region_struct_variations = find_all_sv(qconfig.bed) references_misassemblies = {} for ref in ref_labels_by_chromosomes.values(): references_misassemblies[ref] = dict( (key, 0) for key in ref_labels_by_chromosomes.values()) # for counting SNPs and indels (both original (.all_snps) and corrected from Nucmer's local misassemblies) total_indels_info = IndelsInfo() unaligned_file = open(unaligned_fpath, 'w') for contig, seq in fastaparser.read_fasta(contigs_fpath): #Recording contig stats ctg_len = len(seq) print >> ca_output.stdout_f, 'CONTIG: %s (%dbp)' % (contig, ctg_len) contig_type = 'unaligned' #Check if this contig aligned to the reference if contig in aligns: for align in aligns[contig]: #sub_seq = seq[align.start(): align.end()] sub_seq = seq[_start(align):_end(align)] if 'N' in sub_seq: ns_pos = [ pos for pos in xrange(_start(align), _end(align)) if seq[pos] == 'N' ] # ns_pos = [pos for pos in xrange(align.start(), align.end()) if seq[pos] == 'N'] contig_type = 'correct' #Pull all aligns for this contig num_aligns = len(aligns[contig]) #Sort aligns by aligned_length * identity - unaligned_length (as we do in BSS) sorted_aligns = sorted(aligns[contig], key=lambda x: (score_single_align(x), x[5]), reverse=True) top_len = sorted_aligns[0][5] top_id = sorted_aligns[0][6] top_score = score_single_align(sorted_aligns[0]) top_aligns = [] print >> ca_output.stdout_f, 'Top Length: %d Top ID: %.2f (Score: %.1f)' % ( top_len, top_id, top_score) #Check that top hit captures most of the contig if top_len > ctg_len * epsilon or ctg_len - top_len < maxun: #Reset top aligns: aligns that share the same value of longest and highest identity top_aligns.append(sorted_aligns[0]) sorted_aligns = sorted_aligns[1:] #Continue grabbing alignments while length and identity are identical #while sorted_aligns and top_len == sorted_aligns[0][5] and top_id == sorted_aligns[0][6]: while sorted_aligns and (score_single_align( sorted_aligns[0]) >= qconfig.ambiguity_score * top_score): top_aligns.append(sorted_aligns[0]) sorted_aligns = sorted_aligns[1:] #Mark other alignments as insignificant (former ambiguous) if sorted_aligns: print >> ca_output.stdout_f, '\t\tSkipping these alignments as insignificant (option --ambiguity-score is set to "%s"):' % str( qconfig.ambiguity_score) for align in sorted_aligns: print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align if len(top_aligns) == 1: #There is only one top align, life is good print >> ca_output.stdout_f, '\t\tOne align captures most of this contig: %s' % str( top_aligns[0]) # print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str() print >> ca_output.icarus_out_f, icarus_report_str( top_aligns[0]) ref_aligns.setdefault(top_aligns[0][7], []).append(top_aligns[0]) print >> ca_output.coords_filtered_f, str(top_aligns[0]) aligned_lengths.append(top_aligns[0][5]) else: #There is more than one top align print >> ca_output.stdout_f, '\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]' % len( top_aligns) #Increment count of ambiguously mapped contigs and bases in them ambiguous_contigs += 1 # we count only extra bases, so we shouldn't include bases in the first alignment # if --ambiguity-usage is 'none', the number of extra bases will be negative! ambiguous_contigs_len += ctg_len # Alex: skip all alignments or count them as normal (just different aligns of one repeat). Depend on --allow-ambiguity option if qconfig.ambiguity_usage == "none": ambiguous_contigs_extra_bases -= top_aligns[0][5] print >> ca_output.stdout_f, '\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):' for align in top_aligns: print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align elif qconfig.ambiguity_usage == "one": ambiguous_contigs_extra_bases += 0 print >> ca_output.stdout_f, '\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):' print >> ca_output.stdout_f, '\t\t\tAlignment: %s' % str( top_aligns[0]) # print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str() print >> ca_output.icarus_out_f, icarus_report_str( top_aligns[0]) ref_aligns.setdefault(top_aligns[0][7], []).append(top_aligns[0]) aligned_lengths.append(top_aligns[0][5]) print >> ca_output.coords_filtered_f, str( top_aligns[0]) top_aligns = top_aligns[1:] for align in top_aligns: print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align elif qconfig.ambiguity_usage == "all": ambiguous_contigs_extra_bases -= top_aligns[0][5] print >> ca_output.stdout_f, '\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):' # we count only extra bases, so we shouldn't include bases in the first alignment first_alignment = True while len(top_aligns): print >> ca_output.stdout_f, '\t\t\tAlignment: %s' % str( top_aligns[0]) # print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str(ambiguity=True) print >> ca_output.icarus_out_f, icarus_report_str( top_aligns[0], ambiguity=True) ref_aligns.setdefault(top_aligns[0][7], []).append(top_aligns[0]) if first_alignment: first_alignment = False aligned_lengths.append(top_aligns[0][5]) ambiguous_contigs_extra_bases += top_aligns[0][5] print >> ca_output.coords_filtered_f, str( top_aligns[0]), "ambiguous" top_aligns = top_aligns[1:] else: # choose appropriate alignments (to maximize total size of contig alignment and reduce # misassemblies) is_ambiguous, too_much_best_sets, sorted_aligns, best_sets = get_best_aligns_sets( sorted_aligns, ctg_len, ca_output.stdout_f, seq, ref_lens, cyclic, region_struct_variations) the_best_set = best_sets[0] used_indexes = range( len(sorted_aligns) ) if too_much_best_sets else get_used_indexes(best_sets) if len(used_indexes) < len(sorted_aligns): print >> ca_output.stdout_f, '\t\t\tSkipping redundant alignments after choosing the best set of alignments' for idx in set(range(len(sorted_aligns))) - used_indexes: print >> ca_output.stdout_f, '\t\tSkipping redundant alignment', sorted_aligns[ idx] if is_ambiguous: print >> ca_output.stdout_f, '\t\tThis contig has several significant sets of alignments. [An ambiguously mapped contig]' # similar to regular ambiguous contigs, see above ambiguous_contigs += 1 ambiguous_contigs_len += ctg_len if qconfig.ambiguity_usage == "none": ambiguous_contigs_extra_bases -= ( ctg_len - the_best_set.uncovered) print >> ca_output.stdout_f, '\t\tSkipping all alignments in these sets (option --ambiguity-usage is set to "none"):' for idx in used_indexes: print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', sorted_aligns[ idx] continue elif qconfig.ambiguity_usage == "one": ambiguous_contigs_extra_bases += 0 print >> ca_output.stdout_f, '\t\tUsing only the very best set (option --ambiguity-usage is set to "one").' if len(the_best_set.indexes) < len(used_indexes): print >> ca_output.stdout_f, '\t\tSo, skipping alignments from other sets:' for idx in used_indexes: if idx not in the_best_set.indexes: print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', sorted_aligns[ idx] elif qconfig.ambiguity_usage == "all": print >> ca_output.stdout_f, '\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):' print >> ca_output.stdout_f, '\t\t\tThe very best set is shown in details below, the rest are:' for idx, cur_set in enumerate(best_sets[1:]): print >> ca_output.stdout_f, '\t\t\t\tGroup #%d. Score: %.1f, number of alignments: %d, unaligned bases: %d' % \ (idx + 2, cur_set.score, len(cur_set.indexes), cur_set.uncovered) if too_much_best_sets: print >> ca_output.stdout_f, '\t\t\t\tetc...' if len(the_best_set.indexes) < len(used_indexes): ambiguous_contigs_extra_bases -= ( ctg_len - the_best_set.uncovered) print >> ca_output.stdout_f, '\t\t\tList of alignments used in the sets above:' for idx in used_indexes: align = sorted_aligns[idx] print >> ca_output.stdout_f, '\t\tAlignment: %s' % str( align) ref_aligns.setdefault(align[7], []).append(align) ambiguous_contigs_extra_bases += align[5] print >> ca_output.coords_filtered_f, str( align), "ambiguous" if idx not in the_best_set.indexes: print >> ca_output.icarus_out_f, icarus_report_str( align, is_best=False) # print >> ca_output.icarus_out_f, align.icarus_report_str(is_best=False) print >> ca_output.stdout_f, '\t\t\tThe best set is below. Score: %.1f, number of alignments: %d, unaligned bases: %d' % \ (the_best_set.score, len(the_best_set.indexes), the_best_set.uncovered) real_aligns = [sorted_aligns[i] for i in the_best_set.indexes] # main processing part if len(real_aligns) == 1: the_only_align = real_aligns[0] #There is only one alignment of this contig to the reference print >> ca_output.coords_filtered_f, str(the_only_align) aligned_lengths.append(the_only_align[5]) # begin, end = the_only_align.start(), the_only_align.end() begin, end = _start(the_only_align), _end(the_only_align) unaligned_bases = 0 if (begin - 1) or (ctg_len - end): partially_unaligned += 1 unaligned_bases = (begin - 1) + (ctg_len - end) partially_unaligned_bases += unaligned_bases print >> ca_output.stdout_f, '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)' % ( top_len, ctg_len) print >> ca_output.stdout_f, '\t\tAlignment: %s' % str( the_only_align) # print >> ca_output.icarus_out_f, the_only_align.icarus_report_str() print >> ca_output.icarus_out_f, icarus_report_str( the_only_align) if begin - 1: print >> ca_output.stdout_f, '\t\tUnaligned bases: 1 to %d (%d)' % ( begin - 1, begin - 1) if ctg_len - end: print >> ca_output.stdout_f, '\t\tUnaligned bases: %d to %d (%d)' % ( end + 1, ctg_len, ctg_len - end) # check if both parts (aligned and unaligned) have significant length if (unaligned_bases >= qconfig.significant_part_size) and ( ctg_len - unaligned_bases >= qconfig.significant_part_size): print >> ca_output.stdout_f, '\t\tThis contig has both significant aligned and unaligned parts ' \ '(of length >= %d)!' % (qconfig.significant_part_size) partially_unaligned_with_significant_parts += 1 if qconfig.meta: contigs_with_istranslocations += check_for_potential_translocation( seq, ctg_len, real_aligns, ca_output.stdout_f) ref_aligns.setdefault(the_only_align[7], []).append(the_only_align) else: #Sort real alignments by position on the contig sorted_aligns = sorted(real_aligns, key=lambda x: (_end(x), _start(x))) # sorted_aligns = sorted(real_aligns, key=lambda x: (x.end(), x.start())) #There is more than one alignment of this contig to the reference print >> ca_output.stdout_f, '\t\tThis contig is misassembled. %d total aligns.' % num_aligns aligned_bases_in_contig = ctg_len - the_best_set.uncovered if aligned_bases_in_contig < umt * ctg_len: print >> ca_output.stdout_f, '\t\t\tWarning! This contig is more unaligned than misassembled. ' + \ 'Contig length is %d and total length of all aligns is %d' % (ctg_len, aligned_bases_in_contig) for align in sorted_aligns: print >> ca_output.stdout_f, '\t\tAlignment: %s' % str( align) # print >> ca_output.icarus_out_f, align.icarus_report_str() print >> ca_output.icarus_out_f, icarus_report_str( align) print >> ca_output.coords_filtered_f, str(align) aligned_lengths.append(align[5]) ref_aligns.setdefault(align[7], []).append(align) partially_unaligned_with_misassembly += 1 partially_unaligned += 1 partially_unaligned_bases += ctg_len - aligned_bases_in_contig print >> ca_output.stdout_f, '\t\tUnaligned bases: %d' % ( ctg_len - aligned_bases_in_contig) # check if both parts (aligned and unaligned) have significant length if (aligned_bases_in_contig >= qconfig.significant_part_size) and ( ctg_len - aligned_bases_in_contig >= qconfig.significant_part_size): print >> ca_output.stdout_f, '\t\tThis contig has both significant aligned and unaligned parts ' \ '(of length >= %d)!' % (qconfig.significant_part_size) partially_unaligned_with_significant_parts += 1 if qconfig.meta: contigs_with_istranslocations += check_for_potential_translocation( seq, ctg_len, sorted_aligns, ca_output.stdout_f) contig_type = 'misassembled' print >> ca_output.icarus_out_f, '\t'.join( ['CONTIG', contig, str(ctg_len), contig_type]) print >> ca_output.stdout_f continue ### processing misassemblies is_misassembled, current_mio, references_misassemblies, indels_info, misassemblies_matched_sv = \ process_misassembled_contig(sorted_aligns, cyclic, aligned_lengths, region_misassemblies, ref_lens, ref_aligns, ref_features, seq, references_misassemblies, region_struct_variations, misassemblies_matched_sv, ca_output, is_ambiguous) misassembly_internal_overlap += current_mio total_indels_info += indels_info if is_misassembled: misassembled_contigs[contig] = ctg_len contig_type = 'misassembled' if ctg_len - aligned_bases_in_contig >= qconfig.significant_part_size: print >> ca_output.stdout_f, '\t\tThis contig has significant unaligned parts ' \ '(of length >= %d)!' % (qconfig.significant_part_size) if qconfig.meta: contigs_with_istranslocations += check_for_potential_translocation( seq, ctg_len, sorted_aligns, ca_output.stdout_f) else: #No aligns to this contig print >> ca_output.stdout_f, '\t\tThis contig is unaligned. (%d bp)' % ctg_len print >> unaligned_file, contig #Increment unaligned contig count and bases unaligned += 1 fully_unaligned_bases += ctg_len print >> ca_output.stdout_f, '\t\tUnaligned bases: %d total: %d' % ( ctg_len, fully_unaligned_bases) print >> ca_output.icarus_out_f, '\t'.join( ['CONTIG', contig, str(ctg_len), contig_type]) print >> ca_output.stdout_f ca_output.coords_filtered_f.close() unaligned_file.close() misassembled_bases = sum(misassembled_contigs.itervalues()) result = { 'region_misassemblies': region_misassemblies, 'region_struct_variations': region_struct_variations.get_count() if region_struct_variations else None, 'misassemblies_matched_sv': misassemblies_matched_sv, 'misassembled_contigs': misassembled_contigs, 'misassembled_bases': misassembled_bases, 'misassembly_internal_overlap': misassembly_internal_overlap, 'unaligned': unaligned, 'partially_unaligned': partially_unaligned, 'partially_unaligned_bases': partially_unaligned_bases, 'fully_unaligned_bases': fully_unaligned_bases, 'ambiguous_contigs': ambiguous_contigs, 'ambiguous_contigs_extra_bases': ambiguous_contigs_extra_bases, 'ambiguous_contigs_len': ambiguous_contigs_len, 'partially_unaligned_with_misassembly': partially_unaligned_with_misassembly, 'partially_unaligned_with_significant_parts': partially_unaligned_with_significant_parts, 'contigs_with_istranslocations': contigs_with_istranslocations, 'istranslocations_by_refs': references_misassemblies } return result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs