def __init__(self, consensus_string: str, min_match_length: int, alignment: MSA):
    """Partition a consensus string into match and non-match intervals.

    Args:
        consensus_string: Per-column consensus of ``alignment``.
        min_match_length: Stored as ``self.mml``; a consensus shorter than
            this is recorded as one single interval instead of being scanned.
        alignment: The MSA the consensus came from; handed to
            ``_add_interval`` and to the two ``enforce_*`` passes.
    """
    self._match_intervals: Intervals = []
    self._non_match_intervals: Intervals = []
    self.mml = min_match_length

    consensus_length = len(consensus_string)
    if consensus_length >= self.mml:
        # Walk the consensus, growing the current interval while the letter
        # type stays the same and closing it whenever the type flips.
        current = self._new_interval(consensus_string[0], 0)
        for position in range(1, consensus_length):
            symbol = consensus_string[position]
            if is_type(symbol, current.type):
                current.modify_by(0, 1)  # extend the interval by one column
                continue
            # _add_interval may hand back an interval to keep extending;
            # when it returns None, start a fresh one at this position.
            carried_over = self._add_interval(current, alignment)
            current = (
                self._new_interval(symbol, position)
                if carried_over is None
                else carried_over
            )
        self._add_interval(current, alignment, end=True)
    else:
        # A consensus shorter than min_match_length is kept as one interval:
        # it counts as a match despite its length (normally a match that
        # short counts as a non-match), unless it holds a non-match letter.
        has_non_match = any(is_non_match(symbol) for symbol in consensus_string)
        whole_type = IntervalType.NonMatch if has_non_match else IntervalType.Match
        self._append(Interval(whole_type, 0, consensus_length - 1))

    # Post-processing passes over the collected interval lists.
    self.enforce_multisequence_nonmatch_intervals(
        self._match_intervals, self._non_match_intervals, alignment
    )
    self.enforce_alignment_interval_bijection(
        self._match_intervals,
        self._non_match_intervals,
        alignment.get_alignment_length(),
    )
def get_sub_alignment_by_list_id(
    self, id_list: List[str], alignment: MSA, interval=None
):
    """Return the rows of ``alignment`` whose record id is in ``id_list``.

    Args:
        id_list: Ids of the records to keep.
        alignment: Source alignment; kept records stay in its row order.
        interval: Optional indexable pair ``(start, end)`` of column
            positions, both inclusive. When given, the result is also
            restricted to those columns.

    Returns:
        A new ``MSA`` built from the selected records (and columns).
    """
    # Build the lookup once: testing membership against the list would
    # rescan it for every record in the alignment.
    wanted_ids = set(id_list)
    sub_alignment = MSA(
        [record for record in alignment if record.id in wanted_ids]
    )
    if interval is not None:
        # The interval end is inclusive, hence the +1 on the slice stop.
        sub_alignment = sub_alignment[:, interval[0] : interval[1] + 1]
    return sub_alignment
def make_alignment(seqs: List[str], ids: List[str] = None) -> MSA:
    """Build an MSA from equal-length sequence strings.

    Args:
        seqs: The aligned sequences; all must have the same length.
        ids: Optional record ids, paired with ``seqs`` positionally. When
            omitted, records are named ``s0``, ``s1``, ... in input order.

    Returns:
        An ``MSA`` holding one ``SeqRecord`` per sequence.

    Raises:
        AssertionError: If the sequences do not all share one length.
    """
    distinct_lengths = {len(seq) for seq in seqs}
    assert (
        len(distinct_lengths) == 1
    ), "Sequences are not the same length, does not represent an alignment"

    if ids is not None:
        labelled = zip(seqs, ids)
    else:
        labelled = ((seq, f"s{index}") for index, seq in enumerate(seqs))
    return MSA([SeqRecord(Seq(seq), id=label) for seq, label in labelled])
def get_consensus(cls, alignment: MSA):
    """Produce a 'consensus string' from an MSA, one character per column.

    A column yields its base when, ignoring "N", every sequence agrees on
    that single base. It yields ``NONMATCH`` when the sequences disagree,
    when the column contains an IUPAC ambiguous base, when it is made only
    of gaps ("-"), or when every sequence has "N" there. (The original
    note says ambiguous bases are expanded later in the PRG.)

    Args:
        alignment: The MSA to summarise.

    Returns:
        A string with one character per alignment column.
    """
    # Fetch each record's sequence once instead of once per column.
    sequences = [record.seq for record in alignment]
    symbols = []
    for i in range(alignment.get_alignment_length()):
        # "N" is compatible with anything, so it never breaks consensus.
        column = {seq[i] for seq in sequences} - {"N"}
        is_consensus = (
            len(column) == 1  # empty (all "N") or mixed columns fail here
            and column != {"-"}
            and not ambiguous_bases.intersection(column)
        )
        symbols.append(column.pop() if is_consensus else NONMATCH)
    # Join once at the end rather than growing a string column by column.
    return "".join(symbols)