def _get_prg(self):
    """
    Build the PRG string by walking `self.all_intervals` in order.

    An interval found in `self.match_intervals` contributes its single
    expanded sequence. Any other interval becomes a variant site: it takes
    the current `self.site` number (and `self.site` is advanced by 2), and
    the variants returned by `self.get_variants` are written between
    `self.delim_char`-wrapped markers. The marker carrying `site_num` opens
    and closes the site; the marker carrying `site_num + 1` separates
    consecutive variants.

    Returns:
        The assembled PRG string.

    Raises:
        AssertionError: if a match interval does not expand to exactly one
            sequence.
    """
    # Collect the pieces and join once at the end, rather than growing a
    # string with repeated `+=`.
    parts = []
    for interval in self.all_intervals:
        if interval in self.match_intervals:
            # all seqs are not necessarily exactly the same: some can have 'N'
            # thus still process all of them, to get the one with no 'N'.
            sub_alignment = self.alignment[:, interval.start:interval.stop + 1]
            seqs = get_expanded_sequences(sub_alignment)
            assert len(seqs) == 1, "Got >1 filtered sequences in match interval"
            parts.append(seqs[0])
        else:
            # Define variant site number and increment for next available.
            # The number is captured before get_variants() runs, so any change
            # that call makes to self.site does not affect this site's markers.
            site_num = self.site
            self.site += 2
            variant_prgs = self.get_variants(interval)

            site_marker = f"{self.delim_char}{site_num}{self.delim_char}"
            allele_marker = f"{self.delim_char}{site_num + 1}{self.delim_char}"

            # Joining leaves the list returned by get_variants() untouched
            # (the previous pop(0) loop emptied it) and avoids the O(n) cost
            # of popping from the front. get_variants() asserts that it
            # returns more than one variant.
            parts.append(site_marker)
            parts.append(allele_marker.join(variant_prgs))
            parts.append(site_marker)
    return "".join(parts)
def enforce_multisequence_nonmatch_intervals(
        cls, match_intervals: Intervals, non_match_intervals: Intervals,
        alignment: MSA) -> None:
    """
    Reclassify non-match intervals that hold fewer than two sequences.

    Each non-match interval is sliced out of `alignment` and expanded with
    `get_expanded_sequences`. If fewer than two sequences come back, the
    interval is removed from `non_match_intervals` and a new
    `IntervalType.Match` interval with the same start/stop is appended to
    `match_intervals`. Both collections are modified in place; nothing is
    returned.

    Example reasons for such a conversion to occur:
        - 'N' in a sequence causes it to be filtered out, leaving a single
          useable sequence
        - '-' in sequences makes them appear different although they are the
          same
    """
    if len(alignment) == 0:
        # For testing convenience
        return

    # Visit indices from the back so that popping an entry does not shift
    # the positions that are still to be visited.
    for position in range(len(non_match_intervals) - 1, -1, -1):
        candidate = non_match_intervals[position]
        columns = alignment[:, candidate.start:candidate.stop + 1]
        if len(get_expanded_sequences(columns)) >= 2:
            continue  # genuinely multi-sequence: stays a non-match interval
        match_intervals.append(
            Interval(IntervalType.Match, candidate.start, candidate.stop))
        non_match_intervals.pop(position)
def get_variants(self, interval) -> Sequences:
    """
    Return the variant sequences/PRGs for a non-match `interval`.

    Three sources are possible:
        - when `self.skip_clustering(...)` is true, the expanded sequences of
          the interval's slice of `self.alignment`;
        - when clustering reports `no_clustering`, the sequences carried by
          the clustering result;
        - otherwise, whatever `self.prg_recur` returns for the clustered ids.

    Raises:
        AssertionError: if fewer than two variants were produced, or if the
            variants contain duplicates.
    """
    should_skip = self.skip_clustering(
        interval,
        self.nesting_level,
        self.max_nesting,
        self.min_match_length,
        self.alignment,
    )
    if should_skip:
        variant_prgs = get_expanded_sequences(
            self.alignment[:, interval.start:interval.stop + 1])
        logging.debug(f"Variant seqs found: {variant_prgs}")
    else:
        clustering_result = kmeans_cluster_seqs_in_interval(
            [interval.start, interval.stop],
            self.alignment,
            self.min_match_length,
        )
        if not clustering_result.no_clustering:
            variant_prgs = self.prg_recur(interval,
                                          clustering_result.clustered_ids)
        else:
            logging.debug(
                "Clustering did not group any sequences together, each seq is a cluster"
            )
            variant_prgs = clustering_result.sequences
            logging.debug(f"Variant seqs found: {variant_prgs}")

    # Sanity checks on the result, whichever path produced it.
    assert len(variant_prgs) > 1, "Only have one variant seq"
    deduplicated = list(remove_duplicates(variant_prgs))
    assert len(variant_prgs) == len(deduplicated), "have repeat variant seqs"
    return variant_prgs
def test_first_sequence_in_is_first_sequence_out(self):
    # The expanded sequences are expected in the same order as the input
    # records, with the gap character removed from "CC-C".
    msa = make_alignment(["TTTT", "AAAA", "CC-C"])
    expanded = get_expanded_sequences(msa)
    self.assertEqual(["TTTT", "AAAA", "CCC"], expanded)
def test_ambiguous_bases_one_seq_with_repeated_base(self):
    # A single record with two consecutive 'R' codes: each 'R' is expected
    # to expand independently to 'A' or 'G', giving four sequences. The
    # comparison is on sets, so output order is not checked here.
    record = SeqRecord(Seq("RRAAT"))
    msa = AlignIO.MultipleSeqAlignment([record])
    expanded = get_expanded_sequences(msa)
    self.assertEqual(set(expanded), {"GAAAT", "AAAAT", "GGAAT", "AGAAT"})