def check_for_potential_translocation(seq, ctg_len, sorted_aligns, log_out_f):
    count_ns = 0
    unaligned_len = 0
    prev_start = 0
    for align in sorted_aligns:
        if _start(align) > prev_start + 1:
            unaligned_part = seq[prev_start + 1: _start(align)]
            unaligned_len += len(unaligned_part)
            count_ns += unaligned_part.count('N')
        prev_start = _end(align)
    if ctg_len > _end(sorted_aligns[-1]) + 1:
        unaligned_part = seq[_end(sorted_aligns[-1]) + 1: ctg_len]
        unaligned_len += len(unaligned_part)
        count_ns += unaligned_part.count('N')

    # if contig consists mostly of Ns, it cannot contain interspecies translocations
    if count_ns / float(unaligned_len) >= 0.95 or unaligned_len - count_ns < qconfig.significant_part_size:
        return 0

    print >> log_out_f, '\t\tIt can contain interspecies translocations.'
    return 1
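# The alignments handled in this module are plain positional records: judging by the indexing used
# throughout the file, align[0]/align[1] are the start/end on the reference, align[2]/align[3] the
# start/end on the contig (swapped for reverse-strand hits), align[4]/align[5] the alignment lengths
# on the reference/contig, align[6] the identity and align[7] the reference name. The _start()/_end()
# helpers used above are defined elsewhere in the codebase; a minimal sketch consistent with that
# layout would be:

def _start(align):
    # leftmost coordinate of the alignment on the contig
    return min(align[2], align[3])


def _end(align):
    # rightmost coordinate of the alignment on the contig
    return max(align[2], align[3])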
# intersect_and_go_next() is a method of the PSA helper class used by get_best_aligns_sets() below
# (see possible_solids there); its constructor appears later in this file and is_solid() is defined
# outside this excerpt
def intersect_and_go_next(self, align, solids, min_unique_len):
    if self.unique_end - _end(align) > min_unique_len:  # if enough len on the right side
        if self.is_solid(min_unique_len):
            solids.append(self.align)
        return True
    self.unique_end = min(self.unique_end, _start(align) - 1)
    return not self.is_solid(min_unique_len)  # if self is not solid anymore we can switch to the next PSA
def get_added_len(set_aligns, cur_align):
    last_align_idx = -2
    last_align = set_aligns[last_align_idx]
    added_right = _end(cur_align) - max(_start(cur_align) - 1, _end(last_align))
    added_left = 0
    while _start(cur_align) < _start(last_align):
        added_left += _start(last_align) - _start(cur_align)
        last_align_idx -= 1
        if -last_align_idx <= len(set_aligns):
            prev_start = _start(last_align)  # in case of overlapping of old and new last_align
            last_align = set_aligns[last_align_idx]
            added_left -= max(0, min(prev_start, _end(last_align)) - _start(cur_align) + 1)
        else:
            break
    return added_right + added_left
def is_misassembly(align1, align2, contig_seq, ref_lens, is_cyclic=False, region_struct_variations=None):
    #Calculate inconsistency between distances on the reference and on the contig
    distance_on_contig = _start(align2) - _end(align1) - 1
    cyclic_ref_lens = ref_lens if is_cyclic else None
    if cyclic_ref_lens is not None and align1[7] == align2[7]:
        distance_on_reference, cyclic_moment = distance_between_alignments(align1, align2, align1[2] < align1[3],
                                                                           align2[2] < align2[3], cyclic_ref_lens[align1[7]])
    else:
        distance_on_reference, cyclic_moment = distance_between_alignments(align1, align2, align1[2] < align1[3],
                                                                           align2[2] < align2[3])

    misassembly_internal_overlap = 0
    if distance_on_contig < 0:
        if distance_on_reference >= 0:
            misassembly_internal_overlap = (-distance_on_contig)
        elif (-distance_on_reference) < (-distance_on_contig):
            misassembly_internal_overlap = (distance_on_reference - distance_on_contig)

    strand1 = (align1[2] <= align1[3])
    strand2 = (align2[2] <= align2[3])
    inconsistency = distance_on_reference - distance_on_contig
    aux_data = {"inconsistency": inconsistency, "distance_on_contig": distance_on_contig,
                "misassembly_internal_overlap": misassembly_internal_overlap, "cyclic_moment": cyclic_moment,
                "is_sv": False, "is_scaffold_gap": False}

    if qconfig.scaffolds and contig_seq and check_is_scaffold_gap(inconsistency, contig_seq, align1, align2):
        aux_data["is_scaffold_gap"] = True
        return False, aux_data
    if region_struct_variations and check_sv(align1, align2, inconsistency, region_struct_variations):
        aux_data['is_sv'] = True
        return False, aux_data
    if align1[7] != align2[7] and is_fragmented_ref_fake_translocation(align1, align2, ref_lens):
        aux_data["inconsistency"] = sum(__get_border_gaps(align1, align2, ref_lens))
        return False, aux_data
    if align1[7] != align2[7] or abs(inconsistency) > qconfig.extensive_misassembly_threshold or strand1 != strand2:
        return True, aux_data
    return False, aux_data  # regular local misassembly
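# A small worked example of the inconsistency computed above (assuming distance_between_alignments(),
# defined elsewhere, returns the analogous gap between the two hits on the reference): if align1 ends
# at reference position 5000 and contig position 1000, and align2 starts at reference position 5500
# and contig position 1010 on the same reference and strand, then
#     distance_on_contig    = 1010 - 1000 - 1 = 9
#     distance_on_reference = 5500 - 5000 - 1 = 499
#     inconsistency         = 499 - 9 = 490
# With QUAST's default extensive-misassembly threshold of 1 kbp (and no matching scaffold gap or SV),
# is_misassembly() returns False here, i.e. at most a regular local misassembly; an inconsistency
# above the threshold, a strand change or a jump to another reference would make it return True.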
def is_gap_filled_ns(contig_seq, align1, align2):
    gap_in_contig = contig_seq[_end(align1): _start(align2) - 1]
    if len(gap_in_contig) < qconfig.Ns_break_threshold:
        return False
    # float() guards against integer division under Python 2 (otherwise the ratio is always 0 or 1)
    return gap_in_contig.count('N') / float(len(gap_in_contig)) > 0.95
def count_ns_and_not_ns_between_aligns(contig_seq, align1, align2):
    gap_in_contig = contig_seq[_end(align1): _start(align2) - 1]
    ns_count = gap_in_contig.count('N')
    return ns_count, len(gap_in_contig) - ns_count
def exclude_internal_overlaps(align1, align2, i=None, ca_output=None):
    # returns the size of the align1[5] decrease (or 0 if not changed); it is important for the cur_aligned_len calculation

    def __shift_start(align, new_start, indent=''):
        if ca_output is not None:
            print >> ca_output.stdout_f, indent + '%s' % short_str(align),
        align = list(align)
        if align[2] < align[3]:
            align[0] += (new_start - align[2])
            align[2] = new_start
            align[5] = align[3] - align[2] + 1
        else:
            align[1] -= (new_start - align[3])
            align[3] = new_start
            align[5] = align[2] - align[3] + 1
        align[4] = align[1] - align[0] + 1
        align = tuple(align)
        if ca_output is not None:
            print >> ca_output.stdout_f, '--> %s' % short_str(align)
        return align  # alignments are immutable records here, so the shifted copy must be returned

    def __shift_end(align, new_end, indent=''):
        if ca_output is not None:
            print >> ca_output.stdout_f, indent + '%s' % short_str(align),
        align = list(align)
        if align[2] < align[3]:
            align[1] -= (align[3] - new_end)
            align[3] = new_end
            align[5] = align[3] - align[2] + 1
        else:
            align[0] += (align[2] - new_end)
            align[2] = new_end
            align[5] = align[2] - align[3] + 1
        align[4] = align[1] - align[0] + 1
        align = tuple(align)
        if ca_output is not None:
            print >> ca_output.stdout_f, '--> %s' % short_str(align)
        return align  # alignments are immutable records here, so the shifted copy must be returned

    if qconfig.ambiguity_usage == 'all':
        return 0

    distance_on_contig = _start(align2) - _end(align1) - 1
    if distance_on_contig >= 0:  # no overlap
        return 0

    prev_len2 = align1[5]
    if ca_output is not None:
        print >> ca_output.stdout_f, '\t\t\tExcluding internal overlap of size %d between Alignment %d and %d: ' \
                                     % (-distance_on_contig, i + 1, i + 2),
    if qconfig.ambiguity_usage == 'one':  # leave only one of the two copies (remove the overlap from the shorter alignment)
        if align1[5] >= align2[5]:
            align2 = __shift_start(align2, _end(align1) + 1)
        else:
            align1 = __shift_end(align1, _start(align2) - 1)
    elif qconfig.ambiguity_usage == 'none':  # remove both copies
        if ca_output is not None:
            print >> ca_output.stdout_f
        new_end = _start(align2) - 1
        align2 = __shift_start(align2, _end(align1) + 1, '\t\t\t ')
        align1 = __shift_end(align1, new_end, '\t\t\t ')
    return prev_len2 - align1[5]
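# _clone() (used by get_best_aligns_sets() below) and short_str() (used for logging above) are
# defined elsewhere in the codebase. Given the positional-record representation of alignments,
# minimal sketches could look like this (the exact formatting of short_str() may differ):

def _clone(align):
    # a shallow copy is enough: the records hold only plain numbers and strings
    return tuple(align)


def short_str(align):
    # compact one-line form for log messages: reference coords | contig coords
    return '%d %d | %d %d' % (align[0], align[1], align[2], align[3])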
def get_best_aligns_sets(sorted_aligns, ctg_len, planta_out_f, seq, ref_lens, is_cyclic=False, region_struct_variations=None):
    critical_number_of_aligns = 200  # use additional optimizations for a large number of alignments
    penalties = dict()
    penalties['extensive'] = max(50, int(round(min(qconfig.extensive_misassembly_threshold / 4.0, ctg_len * 0.05)))) - 1
    penalties['local'] = max(2, int(round(min(qconfig.MAX_INDEL_LENGTH / 2.0, ctg_len * 0.01)))) - 1
    penalties['scaffold'] = 5
    sorted_aligns = sorted(sorted_aligns, key=lambda x: (_end(x), _start(x)))

    # trying to optimise the algorithm if the number of possible alignments is large
    if len(sorted_aligns) > critical_number_of_aligns:
        print >> planta_out_f, '\t\t\tSkipping redundant alignments which can\'t be in the best set of alignments A PRIORI'
        # FIRST STEP: find solid aligns (which are present in the best selection for sure);
        # they should have a unique (not covered by other aligns) region of length > 2 * extensive_penalty
        min_unique_len = 2 * penalties['extensive']
        possible_solids = [PSA(align) for align in sorted_aligns if align[5] > min_unique_len]
        solids = []
        try:
            cur_PSA = possible_solids.pop()
            for align in reversed(sorted_aligns):
                if align != cur_PSA.align and cur_PSA.intersect_and_go_next(align, solids, min_unique_len):
                    next_PSA = possible_solids.pop()
                    while next_PSA.intersect_and_go_next(cur_PSA.align, solids, min_unique_len):
                        next_PSA = possible_solids.pop()
                    while align != next_PSA.align and next_PSA.intersect_and_go_next(align, solids, min_unique_len):
                        next_PSA = possible_solids.pop()
                    cur_PSA = next_PSA
        except IndexError:  # possible_solids is empty
            pass

        # SECOND STEP: remove all aligns which are inside solid ones
        if len(solids):
            solid_regions = []  # intersection of all solid aligns
            cur_region = SolidRegion(solids[0])
            for align in solids[1:]:
                if _end(align) + 1 < cur_region.start:
                    solid_regions.append(cur_region)
                    cur_region = SolidRegion(align)
                else:  # shift start of the current region
                    cur_region.start = _start(align)
            solid_regions.append(cur_region)

            filtered_aligns = solids
            idx = 0
            try:
                cur_region = solid_regions.pop()
                for idx, align in enumerate(sorted_aligns):
                    while not cur_region.include(align):
                        if _start(align) > cur_region.end:
                            cur_region = solid_regions.pop()
                            continue
                        filtered_aligns.append(align)
                        break
                    else:
                        print >> planta_out_f, '\t\tSkipping redundant alignment %s' % (str(align))
            except IndexError:  # solid_regions is empty
                filtered_aligns += sorted_aligns[idx:]

            sorted_aligns = sorted(filtered_aligns, key=lambda x: (_end(x), _start(x)))

    # Stage 1: Dynamic programming for finding the best score
    all_scored_sets = [ScoredSet(0, [], ctg_len)]
    max_score = 0
    for idx, align in enumerate(sorted_aligns):
        local_max_score = 0
        new_scored_set = None
        for scored_set in all_scored_sets:
            cur_set_aligns = [_clone(sorted_aligns[i]) for i in scored_set.indexes] + [_clone(align)]
            score, uncovered = get_score(scored_set.score, cur_set_aligns, ref_lens, is_cyclic,
                                         scored_set.uncovered, seq, region_struct_variations, penalties)
            if score is None:  # incorrect set, i.e. internal overlap excluding resulted in an incorrectly short alignment
                continue
            if score > local_max_score:
                local_max_score = score
                new_scored_set = ScoredSet(score, scored_set.indexes + [idx], uncovered)
        if new_scored_set:
            all_scored_sets.append(new_scored_set)
        if local_max_score > max_score:
            max_score = local_max_score

    # Stage 2: DFS for finding multiple best sets with almost equally good score
    max_allowed_score_drop = max_score - max_score * qconfig.ambiguity_score
    putative_sets = []  # a min-heap of candidate sets: minimal score_drop first
    best_sets = []
    for scored_set in all_scored_sets:
        score_drop = max_score - scored_set.score
        if score_drop <= max_allowed_score_drop:
            heappush(putative_sets, PutativeBestSet([scored_set.indexes[-1]], score_drop, scored_set.uncovered))

    ambiguity_check_is_needed = True
    too_much_best_sets = False
    while len(putative_sets):
        putative_set = heappop(putative_sets)
        # special case: no options to enlarge this set, already at the leftmost point
        if putative_set.indexes[0] == -1:
            best_sets.append(ScoredSet(max_score - putative_set.score_drop, putative_set.indexes[1:], putative_set.uncovered))
            # special case: we added the very best set and we need to decide what to do next (based on --ambiguity-usage)
            if ambiguity_check_is_needed and len(best_sets) == 1:
                if not putative_sets:  # no ambiguity at all, only one good set was there
                    return False, too_much_best_sets, sorted_aligns, best_sets
                elif not qconfig.ambiguity_usage == 'all':
                    # several good sets are present (the contig is ambiguous) but we need only the best one
                    return True, too_much_best_sets, sorted_aligns, best_sets
                ambiguity_check_is_needed = False
            if len(best_sets) >= qconfig.BSS_MAX_SETS_NUMBER:
                too_much_best_sets = (len(putative_sets) > 0)
                break
            continue

        # the main part: trying to enlarge the set to the left (use "earlier" alignments)
        align = sorted_aligns[putative_set.indexes[0]]
        local_max_score = 0
        local_uncovered = putative_set.uncovered
        putative_predecessors = {}
        for scored_set in all_scored_sets:
            # we can enlarge the set with "earlier" alignments only
            if scored_set.indexes and scored_set.indexes[-1] >= putative_set.indexes[0]:
                break
            cur_set_aligns = [_clone(sorted_aligns[i]) for i in scored_set.indexes] + [_clone(align)]
            score, uncovered = get_score(scored_set.score, cur_set_aligns, ref_lens, is_cyclic,
                                         scored_set.uncovered, seq, region_struct_variations, penalties)
            if score is not None:
                putative_predecessors[scored_set] = (score, uncovered)
                if score > local_max_score:
                    local_max_score = score
                    local_uncovered = uncovered
                elif score == local_max_score and uncovered < local_uncovered:
                    local_uncovered = uncovered

        for preceding_set, (score, uncovered) in putative_predecessors.iteritems():
            score_drop = local_max_score - score + putative_set.score_drop
            if score_drop > max_allowed_score_drop:
                continue
            new_index = preceding_set.indexes[-1] if preceding_set.indexes else -1
            new_uncovered = uncovered + (putative_set.uncovered - local_uncovered)
            heappush(putative_sets, PutativeBestSet([new_index] + putative_set.indexes, score_drop, new_uncovered))

    return True, too_much_best_sets, sorted_aligns, best_sets
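# get_best_aligns_sets() above relies on heappush()/heappop() from the standard heapq module and on
# two small record classes, ScoredSet(score, indexes, uncovered) and PutativeBestSet(indexes,
# score_drop, uncovered), whose definitions are not part of this excerpt. Judging by their use
# (PutativeBestSet instances live in a min-heap ordered by score_drop, ScoredSet instances are used
# as dict keys), minimal sketches could look like this:

from heapq import heappush, heappop


class ScoredSet(object):
    # a partial selection of alignments: its score, the indexes of the chosen alignments
    # (positions in sorted_aligns) and the number of contig bases left uncovered
    def __init__(self, score, indexes, uncovered):
        self.score = score
        self.indexes = indexes
        self.uncovered = uncovered


class PutativeBestSet(object):
    # a candidate "almost best" set explored in Stage 2; ordered by score_drop so that
    # heappop() always returns the candidate closest to the maximal score
    def __init__(self, indexes, score_drop, uncovered):
        self.indexes = indexes
        self.score_drop = score_drop
        self.uncovered = uncovered

    def __lt__(self, other):
        return self.score_drop < other.score_drop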
# include() is a method of the SolidRegion helper class instantiated in get_best_aligns_sets() above;
# it checks whether an alignment lies entirely within the region
def include(self, align):
    return self.start <= _start(align) and _end(align) <= self.end
def __init__(self, align):
    self.start = _start(align)
    self.end = _end(align)
# PSA constructor: unique_start/unique_end delimit the part of the alignment not (yet)
# covered by other alignments
def __init__(self, align):
    self.align = align
    self.unique_start = _start(align)
    self.unique_end = _end(align)
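# intersect_and_go_next() above also calls self.is_solid(), which is not part of this excerpt;
# since unique_start/unique_end track the still-uncovered span of the alignment, it presumably
# reduces to a length check along these lines:

def is_solid(self, min_unique_len):
    # the alignment stays "solid" while its uncovered span is longer than the required minimum
    return self.unique_end - self.unique_start > min_unique_len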
def analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, aligns, ref_features, ref_lens, cyclic=None):
    maxun = 10
    epsilon = 0.99
    umt = 0.5  # Unaligned Misassembled Threshold: misassembled contigs with aligned length < umt * 100% are treated as mostly unaligned

    unaligned = 0
    partially_unaligned = 0
    fully_unaligned_bases = 0
    partially_unaligned_bases = 0
    ambiguous_contigs = 0
    ambiguous_contigs_extra_bases = 0
    ambiguous_contigs_len = 0
    partially_unaligned_with_misassembly = 0
    partially_unaligned_with_significant_parts = 0
    misassembly_internal_overlap = 0
    contigs_with_istranslocations = 0
    misassemblies_matched_sv = 0

    ref_aligns = dict()
    aligned_lengths = []
    region_misassemblies = []
    misassembled_contigs = dict()

    region_struct_variations = find_all_sv(qconfig.bed)

    references_misassemblies = {}
    for ref in ref_labels_by_chromosomes.values():
        references_misassemblies[ref] = dict((key, 0) for key in ref_labels_by_chromosomes.values())

    # for counting SNPs and indels (both original (.all_snps) and corrected from Nucmer's local misassemblies)
    total_indels_info = IndelsInfo()

    unaligned_file = open(unaligned_fpath, 'w')
    for contig, seq in fastaparser.read_fasta(contigs_fpath):
        #Recording contig stats
        ctg_len = len(seq)
        print >> ca_output.stdout_f, 'CONTIG: %s (%dbp)' % (contig, ctg_len)
        contig_type = 'unaligned'

        #Check if this contig aligned to the reference
        if contig in aligns:
            for align in aligns[contig]:
                sub_seq = seq[_start(align): _end(align)]
                if 'N' in sub_seq:
                    ns_pos = [pos for pos in xrange(_start(align), _end(align)) if seq[pos] == 'N']
            contig_type = 'correct'

            #Pull all aligns for this contig
            num_aligns = len(aligns[contig])

            #Sort aligns by aligned_length * identity - unaligned_length (as we do in BSS)
            sorted_aligns = sorted(aligns[contig], key=lambda x: (score_single_align(x), x[5]), reverse=True)
            top_len = sorted_aligns[0][5]
            top_id = sorted_aligns[0][6]
            top_score = score_single_align(sorted_aligns[0])
            top_aligns = []
            print >> ca_output.stdout_f, 'Top Length: %d Top ID: %.2f (Score: %.1f)' % (top_len, top_id, top_score)

            #Check that the top hit captures most of the contig
            if top_len > ctg_len * epsilon or ctg_len - top_len < maxun:
                #Reset top aligns: aligns that share the same value of longest and highest identity
                top_aligns.append(sorted_aligns[0])
                sorted_aligns = sorted_aligns[1:]

                #Continue grabbing alignments while their score is within the ambiguity threshold of the top score
                while sorted_aligns and (score_single_align(sorted_aligns[0]) >= qconfig.ambiguity_score * top_score):
                    top_aligns.append(sorted_aligns[0])
                    sorted_aligns = sorted_aligns[1:]

                #Mark other alignments as insignificant (former ambiguous)
                if sorted_aligns:
                    print >> ca_output.stdout_f, '\t\tSkipping these alignments as insignificant (option --ambiguity-score is set to "%s"):' % str(qconfig.ambiguity_score)
                    for align in sorted_aligns:
                        print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align

                if len(top_aligns) == 1:
                    #There is only one top align, life is good
                    print >> ca_output.stdout_f, '\t\tOne align captures most of this contig: %s' % str(top_aligns[0])
                    print >> ca_output.icarus_out_f, icarus_report_str(top_aligns[0])
                    ref_aligns.setdefault(top_aligns[0][7], []).append(top_aligns[0])
                    print >> ca_output.coords_filtered_f, str(top_aligns[0])
                    aligned_lengths.append(top_aligns[0][5])
                else:
                    #There is more than one top align
                    print >> ca_output.stdout_f, '\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]' % len(top_aligns)

                    #Increment count of ambiguously mapped contigs and bases in them
                    ambiguous_contigs += 1
                    # we count only extra bases, so we shouldn't include bases in the first alignment
                    # if --ambiguity-usage is 'none', the number of extra bases will be negative!
                    ambiguous_contigs_len += ctg_len

                    # Alex: skip all alignments or count them as normal (just different aligns of one repeat). Depends on the --ambiguity-usage option
                    if qconfig.ambiguity_usage == "none":
                        ambiguous_contigs_extra_bases -= top_aligns[0][5]
                        print >> ca_output.stdout_f, '\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):'
                        for align in top_aligns:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align
                    elif qconfig.ambiguity_usage == "one":
                        ambiguous_contigs_extra_bases += 0
                        print >> ca_output.stdout_f, '\t\tUsing only the first of these alignments (option --ambiguity-usage is set to "one"):'
                        print >> ca_output.stdout_f, '\t\t\tAlignment: %s' % str(top_aligns[0])
                        print >> ca_output.icarus_out_f, icarus_report_str(top_aligns[0])
                        ref_aligns.setdefault(top_aligns[0][7], []).append(top_aligns[0])
                        aligned_lengths.append(top_aligns[0][5])
                        print >> ca_output.coords_filtered_f, str(top_aligns[0])
                        top_aligns = top_aligns[1:]
                        for align in top_aligns:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align
                    elif qconfig.ambiguity_usage == "all":
                        ambiguous_contigs_extra_bases -= top_aligns[0][5]
                        print >> ca_output.stdout_f, '\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):'
                        # we count only extra bases, so we shouldn't include bases in the first alignment
                        first_alignment = True
                        while len(top_aligns):
                            print >> ca_output.stdout_f, '\t\t\tAlignment: %s' % str(top_aligns[0])
                            print >> ca_output.icarus_out_f, icarus_report_str(top_aligns[0], ambiguity=True)
                            ref_aligns.setdefault(top_aligns[0][7], []).append(top_aligns[0])
                            if first_alignment:
                                first_alignment = False
                                aligned_lengths.append(top_aligns[0][5])
                            ambiguous_contigs_extra_bases += top_aligns[0][5]
                            print >> ca_output.coords_filtered_f, str(top_aligns[0]), "ambiguous"
                            top_aligns = top_aligns[1:]
            else:
                # choose appropriate alignments (to maximize the total size of the contig alignment and reduce misassemblies)
                is_ambiguous, too_much_best_sets, sorted_aligns, best_sets = get_best_aligns_sets(
                    sorted_aligns, ctg_len, ca_output.stdout_f, seq, ref_lens, cyclic, region_struct_variations)
                the_best_set = best_sets[0]
                used_indexes = range(len(sorted_aligns)) if too_much_best_sets else get_used_indexes(best_sets)
                if len(used_indexes) < len(sorted_aligns):
                    print >> ca_output.stdout_f, '\t\t\tSkipping redundant alignments after choosing the best set of alignments'
                    for idx in set(range(len(sorted_aligns))) - used_indexes:
                        print >> ca_output.stdout_f, '\t\tSkipping redundant alignment', sorted_aligns[idx]

                if is_ambiguous:
                    print >> ca_output.stdout_f, '\t\tThis contig has several significant sets of alignments. [An ambiguously mapped contig]'
                    # similar to regular ambiguous contigs, see above
                    ambiguous_contigs += 1
                    ambiguous_contigs_len += ctg_len
                    if qconfig.ambiguity_usage == "none":
                        ambiguous_contigs_extra_bases -= (ctg_len - the_best_set.uncovered)
                        print >> ca_output.stdout_f, '\t\tSkipping all alignments in these sets (option --ambiguity-usage is set to "none"):'
                        for idx in used_indexes:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', sorted_aligns[idx]
                        continue
                    elif qconfig.ambiguity_usage == "one":
                        ambiguous_contigs_extra_bases += 0
                        print >> ca_output.stdout_f, '\t\tUsing only the very best set (option --ambiguity-usage is set to "one").'
                        if len(the_best_set.indexes) < len(used_indexes):
                            print >> ca_output.stdout_f, '\t\tSo, skipping alignments from other sets:'
                            for idx in used_indexes:
                                if idx not in the_best_set.indexes:
                                    print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', sorted_aligns[idx]
                    elif qconfig.ambiguity_usage == "all":
                        print >> ca_output.stdout_f, '\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):'
                        print >> ca_output.stdout_f, '\t\t\tThe very best set is shown in detail below, the rest are:'
                        for idx, cur_set in enumerate(best_sets[1:]):
                            print >> ca_output.stdout_f, '\t\t\t\tGroup #%d. Score: %.1f, number of alignments: %d, unaligned bases: %d' % \
                                                         (idx + 2, cur_set.score, len(cur_set.indexes), cur_set.uncovered)
                        if too_much_best_sets:
                            print >> ca_output.stdout_f, '\t\t\t\tetc...'
                        if len(the_best_set.indexes) < len(used_indexes):
                            ambiguous_contigs_extra_bases -= (ctg_len - the_best_set.uncovered)
                            print >> ca_output.stdout_f, '\t\t\tList of alignments used in the sets above:'
                            for idx in used_indexes:
                                align = sorted_aligns[idx]
                                print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(align)
                                ref_aligns.setdefault(align[7], []).append(align)
                                ambiguous_contigs_extra_bases += align[5]
                                print >> ca_output.coords_filtered_f, str(align), "ambiguous"
                                if idx not in the_best_set.indexes:
                                    print >> ca_output.icarus_out_f, icarus_report_str(align, is_best=False)

                print >> ca_output.stdout_f, '\t\t\tThe best set is below. Score: %.1f, number of alignments: %d, unaligned bases: %d' % \
                                             (the_best_set.score, len(the_best_set.indexes), the_best_set.uncovered)
                real_aligns = [sorted_aligns[i] for i in the_best_set.indexes]

                # main processing part
                if len(real_aligns) == 1:
                    the_only_align = real_aligns[0]

                    #There is only one alignment of this contig to the reference
                    print >> ca_output.coords_filtered_f, str(the_only_align)
                    aligned_lengths.append(the_only_align[5])

                    begin, end = _start(the_only_align), _end(the_only_align)
                    unaligned_bases = 0
                    if (begin - 1) or (ctg_len - end):
                        partially_unaligned += 1
                        unaligned_bases = (begin - 1) + (ctg_len - end)
                        partially_unaligned_bases += unaligned_bases
                        print >> ca_output.stdout_f, '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)' % (top_len, ctg_len)
                    print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(the_only_align)
                    print >> ca_output.icarus_out_f, icarus_report_str(the_only_align)
                    if begin - 1:
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: 1 to %d (%d)' % (begin - 1, begin - 1)
                    if ctg_len - end:
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: %d to %d (%d)' % (end + 1, ctg_len, ctg_len - end)
                    # check if both parts (aligned and unaligned) have significant length
                    if (unaligned_bases >= qconfig.significant_part_size) and (ctg_len - unaligned_bases >= qconfig.significant_part_size):
                        print >> ca_output.stdout_f, '\t\tThis contig has both significant aligned and unaligned parts ' \
                                                     '(of length >= %d)!' % (qconfig.significant_part_size)
                        partially_unaligned_with_significant_parts += 1
                        if qconfig.meta:
                            contigs_with_istranslocations += check_for_potential_translocation(seq, ctg_len, real_aligns,
                                                                                               ca_output.stdout_f)
                    ref_aligns.setdefault(the_only_align[7], []).append(the_only_align)
                else:
                    #Sort real alignments by position on the contig
                    sorted_aligns = sorted(real_aligns, key=lambda x: (_end(x), _start(x)))

                    #There is more than one alignment of this contig to the reference
                    print >> ca_output.stdout_f, '\t\tThis contig is misassembled. %d total aligns.' % num_aligns
                    aligned_bases_in_contig = ctg_len - the_best_set.uncovered

                    if aligned_bases_in_contig < umt * ctg_len:
                        print >> ca_output.stdout_f, '\t\t\tWarning! This contig is more unaligned than misassembled. ' + \
                                                     'Contig length is %d and total length of all aligns is %d' % (ctg_len, aligned_bases_in_contig)
                        for align in sorted_aligns:
                            print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(align)
                            print >> ca_output.icarus_out_f, icarus_report_str(align)
                            print >> ca_output.coords_filtered_f, str(align)
                            aligned_lengths.append(align[5])
                            ref_aligns.setdefault(align[7], []).append(align)

                        partially_unaligned_with_misassembly += 1
                        partially_unaligned += 1
                        partially_unaligned_bases += ctg_len - aligned_bases_in_contig
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: %d' % (ctg_len - aligned_bases_in_contig)
                        # check if both parts (aligned and unaligned) have significant length
                        if (aligned_bases_in_contig >= qconfig.significant_part_size) and \
                                (ctg_len - aligned_bases_in_contig >= qconfig.significant_part_size):
                            print >> ca_output.stdout_f, '\t\tThis contig has both significant aligned and unaligned parts ' \
                                                         '(of length >= %d)!' % (qconfig.significant_part_size)
                            partially_unaligned_with_significant_parts += 1
                            if qconfig.meta:
                                contigs_with_istranslocations += check_for_potential_translocation(seq, ctg_len, sorted_aligns,
                                                                                                   ca_output.stdout_f)
                        contig_type = 'misassembled'
                        print >> ca_output.icarus_out_f, '\t'.join(['CONTIG', contig, str(ctg_len), contig_type])
                        print >> ca_output.stdout_f
                        continue

                    ### processing misassemblies
                    is_misassembled, current_mio, references_misassemblies, indels_info, misassemblies_matched_sv = \
                        process_misassembled_contig(sorted_aligns, cyclic, aligned_lengths, region_misassemblies,
                                                    ref_lens, ref_aligns, ref_features, seq, references_misassemblies,
                                                    region_struct_variations, misassemblies_matched_sv, ca_output, is_ambiguous)
                    misassembly_internal_overlap += current_mio
                    total_indels_info += indels_info
                    if is_misassembled:
                        misassembled_contigs[contig] = ctg_len
                        contig_type = 'misassembled'
                    if ctg_len - aligned_bases_in_contig >= qconfig.significant_part_size:
                        print >> ca_output.stdout_f, '\t\tThis contig has significant unaligned parts ' \
                                                     '(of length >= %d)!' % (qconfig.significant_part_size)
                        if qconfig.meta:
                            contigs_with_istranslocations += check_for_potential_translocation(seq, ctg_len, sorted_aligns,
                                                                                               ca_output.stdout_f)
        else:
            #No aligns to this contig
            print >> ca_output.stdout_f, '\t\tThis contig is unaligned. (%d bp)' % ctg_len
            print >> unaligned_file, contig

            #Increment unaligned contig count and bases
            unaligned += 1
            fully_unaligned_bases += ctg_len
            print >> ca_output.stdout_f, '\t\tUnaligned bases: %d total: %d' % (ctg_len, fully_unaligned_bases)

        print >> ca_output.icarus_out_f, '\t'.join(['CONTIG', contig, str(ctg_len), contig_type])
        print >> ca_output.stdout_f

    ca_output.coords_filtered_f.close()
    unaligned_file.close()
    misassembled_bases = sum(misassembled_contigs.itervalues())

    result = {'region_misassemblies': region_misassemblies,
              'region_struct_variations': region_struct_variations.get_count() if region_struct_variations else None,
              'misassemblies_matched_sv': misassemblies_matched_sv,
              'misassembled_contigs': misassembled_contigs,
              'misassembled_bases': misassembled_bases,
              'misassembly_internal_overlap': misassembly_internal_overlap,
              'unaligned': unaligned,
              'partially_unaligned': partially_unaligned,
              'partially_unaligned_bases': partially_unaligned_bases,
              'fully_unaligned_bases': fully_unaligned_bases,
              'ambiguous_contigs': ambiguous_contigs,
              'ambiguous_contigs_extra_bases': ambiguous_contigs_extra_bases,
              'ambiguous_contigs_len': ambiguous_contigs_len,
              'partially_unaligned_with_misassembly': partially_unaligned_with_misassembly,
              'partially_unaligned_with_significant_parts': partially_unaligned_with_significant_parts,
              'contigs_with_istranslocations': contigs_with_istranslocations,
              'istranslocations_by_refs': references_misassemblies}

    return result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs
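# get_used_indexes(), called in analyze_contigs() above, is defined elsewhere; judging by how its
# result is used (set operations, membership tests on alignment indexes), it presumably collects
# the union of alignment indexes appearing in any of the (almost) best sets:

def get_used_indexes(best_sets):
    return set(idx for cur_set in best_sets for idx in cur_set.indexes)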