Exemplo n.º 1
0
def generate_rolls(num_rolls):
    """Simulate a run of dice rolls from a casino that swaps between two dice.

    The simulation starts in state 'F' (the fair state, per the original
    comments). After every roll a second random draw decides whether the
    state changes: from 'F' to 'L' when the draw is <= 0.05, and from 'L'
    back to 'F' when the draw is <= 0.1.

    Arguments:

    o num_rolls -- The number of rolls to generate.

    Returns a tuple of two sequences:

    - The generated roll sequence.
    - The state sequence that generated each roll.

    """
    rolls = MutableSeq('', DiceRollAlphabet())
    states = MutableSeq('', DiceTypeAlphabet())

    # always begin in the fair state
    die = 'F'

    for _ in range(num_rolls):
        # record the state first, then draw the roll it emits
        states.append(die)
        rolls.append(_loaded_dice_roll(random.random(), die))

        # a second, independent draw decides whether we switch state
        switch_draw = random.random()
        if die == 'F' and switch_draw <= .05:
            die = 'L'
        elif die == 'L' and switch_draw <= .1:
            die = 'F'

    return rolls.toseq(), states.toseq()
def random_generator(num):
    """Build a random state sequence and a random DNA sequence of equal length.

    All ``num`` state symbols are drawn first (uniformly from '123'), then
    all ``num`` nucleotides (uniformly from 'ACTG'), so the order in which
    the random number generator is consumed is: states, then nucleotides.

    Arguments:

    o num -- The number of symbols to draw for each sequence.

    Returns a tuple (state sequence, DNA sequence), both converted with
    toseq().
    """
    state_path = MutableSeq('', state())
    for symbol in (random.choice('123') for _ in range(num)):
        state_path.append(symbol)

    bases = MutableSeq('', DNA())
    for symbol in (random.choice('ACTG') for _ in range(num)):
        bases.append(symbol)

    return state_path.toseq(), bases.toseq()
Exemplo n.º 3
0
def random_population(genome_alphabet, genome_size, num_organisms,
                      fitness_calculator):
    """Generate a population of individuals with randomly set genomes.

    Arguments:

    o genome_alphabet -- An Alphabet object describing all of the
    possible letters that could potentially be in the genome of an
    organism. Only its ``letters`` attribute is read here.

    o genome_size -- The size of each organisms genome.

    o num_organisms -- The number of organisms we want in the population.

    o fitness_calculator -- A function that will calculate the fitness
    of the organism when given the organisms genome. It is passed
    straight through to Organism.

    Returns a list with num_organisms Organism objects.

    Raises ValueError if the first letter of the alphabet is not a str,
    int or float.
    """
    letters = genome_alphabet.letters

    # a random number generator to get letters for the genome
    letter_rand = random.Random()

    # figure out what type of characters are in the alphabet; the first
    # letter decides the array typecode used to back every genome
    first_letter = letters[0]
    if isinstance(first_letter, str):
        if sys.version_info[0] == 3:
            alphabet_type = "u"  # Use unicode string on Python 3
        else:
            alphabet_type = "c"  # Use byte string on Python 2
    elif isinstance(first_letter, int):
        alphabet_type = "i"
    elif isinstance(first_letter, float):
        alphabet_type = "d"
    else:
        # Wrap in a one-element tuple: if ``letters`` is itself a tuple,
        # "%s" % letters would try to unpack it and raise TypeError
        # instead of the ValueError we want to report.
        raise ValueError(
            "Alphabet type is unsupported: %s" % (letters,))

    all_orgs = []
    for _org in range(num_organisms):
        new_genome = MutableSeq(array.array(alphabet_type), genome_alphabet)

        # generate the genome randomly, one letter at a time
        for _gene in range(genome_size):
            new_genome.append(letter_rand.choice(letters))

        # add the new organism with this genome
        all_orgs.append(Organism(new_genome, fitness_calculator))

    return all_orgs
Exemplo n.º 4
0
def random_population(genome_alphabet, genome_size, num_organisms,
                      fitness_calculator):
    """Generate a population of individuals with randomly set genomes.

    Arguments:

    o genome_alphabet -- An Alphabet object describing all of the
    possible letters that could potentially be in the genome of an
    organism. Only its ``letters`` attribute is read here.

    o genome_size -- The size of each organisms genome.

    o num_organisms -- The number of organisms we want in the population.

    o fitness_calculator -- A function that will calculate the fitness
    of the organism when given the organisms genome. It is passed
    straight through to Organism.

    Returns a list with num_organisms Organism objects.

    Raises ValueError if the first letter of the alphabet is not a str,
    int or float.
    """
    # a random number generator to get letters for the genome
    letter_rand = random.Random()

    # figure out what type of characters are in the alphabet: the type of
    # the first letter selects the typecode of the backing array
    sample = genome_alphabet.letters[0]
    if isinstance(sample, str):
        # unicode string on Python 3, byte string on Python 2
        alphabet_type = "u" if sys.version_info[0] == 3 else "c"
    elif isinstance(sample, int):
        alphabet_type = "i"
    elif isinstance(sample, float):
        alphabet_type = "d"
    else:
        # The explicit one-element tuple keeps the formatting safe when
        # ``letters`` is a tuple; a bare tuple on the right of % would be
        # unpacked and raise TypeError, masking this ValueError.
        raise ValueError("Alphabet type is unsupported: %s" %
                         (genome_alphabet.letters,))

    all_orgs = []
    for _org_num in range(num_organisms):
        new_genome = MutableSeq(array.array(alphabet_type), genome_alphabet)

        # generate the genome randomly
        for _gene_num in range(genome_size):
            new_gene = letter_rand.choice(genome_alphabet.letters)
            new_genome.append(new_gene)

        # add the new organism with this genome
        all_orgs.append(Organism(new_genome, fitness_calculator))

    return all_orgs
Exemplo n.º 5
0
class TestMutableSeq(unittest.TestCase):
    """Tests for MutableSeq: creation, comparison, editing and complements.

    Every test starts from the same 18-letter fixture created in setUp, so
    tests that mutate ``self.mutable_s`` do not affect each other.
    """

    def setUp(self):
        """Create a fresh Seq and MutableSeq holding the same letters."""
        self.s = Seq.Seq("TCAAAAGGATGCATCATG")
        self.mutable_s = MutableSeq("TCAAAAGGATGCATCATG")

    def test_mutableseq_creation(self):
        """Test creating MutableSeqs in multiple ways."""
        mutable_s = MutableSeq("TCAAAAGGATGCATCATG")
        self.assertIsInstance(mutable_s, MutableSeq, "Creating MutableSeq")

        mutable_s = self.s.tomutable()
        self.assertIsInstance(mutable_s, MutableSeq,
                              "Converting Seq to mutable")

        array_seq = MutableSeq(array.array("u", "TCAAAAGGATGCATCATG"))
        self.assertIsInstance(array_seq, MutableSeq,
                              "Creating MutableSeq using array")

    def test_repr(self):
        """Test repr() of a short MutableSeq."""
        self.assertEqual("MutableSeq('TCAAAAGGATGCATCATG')",
                         repr(self.mutable_s))

    def test_truncated_repr(self):
        """Test repr() of a long MutableSeq, which is abbreviated with '...'."""
        seq = "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGA"
        expected = (
            "MutableSeq('TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATG...GGA')"
        )
        self.assertEqual(expected, repr(MutableSeq(seq)))

    def test_equal_comparison(self):
        """Test __eq__ comparison method."""
        self.assertEqual(self.mutable_s, "TCAAAAGGATGCATCATG")

    def test_not_equal_comparison(self):
        """Test __ne__ comparison method."""
        self.assertNotEqual(self.mutable_s, "other thing")

    def test_less_than_comparison(self):
        """Test __lt__ comparison method."""
        self.assertLess(self.mutable_s[:-1], self.mutable_s)

    def test_less_than_comparison_of_incompatible_types(self):
        """Test that < against an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s < 1

    def test_less_than_comparison_without_alphabet(self):
        """Test __lt__ comparison against a plain string."""
        # Strict comparison on purpose: the fixture without its last letter
        # is a proper prefix of the string. assertLessEqual here would only
        # repeat test_less_than_or_equal_comparison_without_alphabet.
        self.assertLess(self.mutable_s[:-1], "TCAAAAGGATGCATCATG")

    def test_less_than_or_equal_comparison(self):
        """Test __le__ comparison method."""
        self.assertLessEqual(self.mutable_s[:-1], self.mutable_s)

    def test_less_than_or_equal_comparison_of_incompatible_types(self):
        """Test that <= against an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s <= 1

    def test_less_than_or_equal_comparison_without_alphabet(self):
        """Test __le__ comparison against a plain string."""
        self.assertLessEqual(self.mutable_s[:-1], "TCAAAAGGATGCATCATG")

    def test_greater_than_comparison(self):
        """Test __gt__ comparison method."""
        self.assertGreater(self.mutable_s, self.mutable_s[:-1])

    def test_greater_than_comparison_of_incompatible_types(self):
        """Test that > against an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s > 1

    def test_greater_than_comparison_without_alphabet(self):
        """Test __gt__ comparison against a plain string."""
        self.assertGreater(self.mutable_s, "TCAAAAGGATGCATCAT")

    def test_greater_than_or_equal_comparison(self):
        """Test __ge__ comparison method."""
        self.assertGreaterEqual(self.mutable_s, self.mutable_s)

    def test_greater_than_or_equal_comparison_of_incompatible_types(self):
        """Test that >= against an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s >= 1

    def test_greater_than_or_equal_comparison_without_alphabet(self):
        """Test __ge__ comparison against a plain string."""
        self.assertGreaterEqual(self.mutable_s, "TCAAAAGGATGCATCATG")

    def test_add_method(self):
        """Test adding wrong type to MutableSeq."""
        with self.assertRaises(TypeError):
            self.mutable_s + 1234

    def test_radd_method(self):
        """Test __radd__ with another MutableSeq on the left-hand side."""
        self.assertEqual(
            "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG",
            self.mutable_s.__radd__(self.mutable_s),
        )

    def test_radd_method_incompatible_alphabets(self):
        """Test __radd__ with an RNA-looking MutableSeq on the left-hand side."""
        self.assertEqual(
            "UCAAAAGGATCAAAAGGATGCATCATG",
            self.mutable_s.__radd__(MutableSeq("UCAAAAGGA")),
        )

    def test_radd_method_using_seq_object(self):
        """Test __radd__ with an immutable Seq on the left-hand side."""
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG",
                         self.mutable_s.__radd__(self.s))

    def test_radd_method_wrong_type(self):
        """Test that __radd__ with an int raises TypeError."""
        with self.assertRaises(TypeError):
            self.mutable_s.__radd__(1234)

    def test_as_string(self):
        """Test str() conversion."""
        self.assertEqual("TCAAAAGGATGCATCATG", str(self.mutable_s))

    def test_length(self):
        """Test len()."""
        self.assertEqual(18, len(self.mutable_s))

    def test_converting_to_immutable(self):
        """Test that toseq() returns a Seq."""
        self.assertIsInstance(self.mutable_s.toseq(), Seq.Seq)

    def test_first_nucleotide(self):
        """Test indexing the first letter."""
        self.assertEqual("T", self.mutable_s[0])

    def test_setting_slices(self):
        """Test reading a slice and assigning slices from str, MutableSeq and array."""
        self.assertEqual(
            MutableSeq("CAAA"),
            self.mutable_s[1:5],
            "Slice mutable seq",
        )

        # Each assignment below works on the result of the previous one.
        self.mutable_s[1:3] = "GAT"
        self.assertEqual(
            MutableSeq("TGATAAAGGATGCATCATG"),
            self.mutable_s,
            "Set slice with string and adding extra nucleotide",
        )

        self.mutable_s[1:3] = self.mutable_s[5:7]
        self.assertEqual(
            MutableSeq("TAATAAAGGATGCATCATG"),
            self.mutable_s,
            "Set slice with MutableSeq",
        )

        self.mutable_s[1:3] = array.array("u", "GAT")
        self.assertEqual(
            MutableSeq("TGATTAAAGGATGCATCATG"),
            self.mutable_s,
            "Set slice with array",
        )

    def test_setting_item(self):
        """Test assigning a single letter by index."""
        self.mutable_s[3] = "G"
        self.assertEqual(MutableSeq("TCAGAAGGATGCATCATG"), self.mutable_s)

    def test_deleting_slice(self):
        """Test deleting a one-letter slice."""
        del self.mutable_s[4:5]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_deleting_item(self):
        """Test deleting a single letter by index."""
        del self.mutable_s[3]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_appending(self):
        """Test append()."""
        self.mutable_s.append("C")
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGC"), self.mutable_s)

    def test_inserting(self):
        """Test insert()."""
        self.mutable_s.insert(4, "G")
        self.assertEqual(MutableSeq("TCAAGAAGGATGCATCATG"), self.mutable_s)

    def test_popping_last_item(self):
        """Test that pop() returns the last letter."""
        self.assertEqual("G", self.mutable_s.pop())

    def test_remove_items(self):
        """Test remove() for a present letter and for a missing one."""
        self.mutable_s.remove("G")
        self.assertEqual(MutableSeq("TCAAAAGATGCATCATG"), self.mutable_s,
                         "Remove first G")

        self.assertRaises(ValueError, self.mutable_s.remove, "Z")

    def test_count(self):
        """Test count() for a single letter and for a two-letter string."""
        self.assertEqual(7, self.mutable_s.count("A"))
        self.assertEqual(2, self.mutable_s.count("AA"))

    def test_index(self):
        """Test index() for a present letter and for a missing string."""
        self.assertEqual(2, self.mutable_s.index("A"))
        self.assertRaises(ValueError, self.mutable_s.index, "8888")

    def test_reverse(self):
        """Test using reverse method."""
        self.mutable_s.reverse()
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT"), self.mutable_s)

    def test_reverse_with_stride(self):
        """Test reverse using -1 stride."""
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT"),
                         self.mutable_s[::-1])

    def test_complement(self):
        """Test in-place complement() of the DNA fixture."""
        self.mutable_s.complement()
        self.assertEqual("AGTTTTCCTACGTAGTAC", str(self.mutable_s))

    def test_complement_rna(self):
        """Test in-place complement() of a mixed-case RNA sequence."""
        seq = Seq.MutableSeq("AUGaaaCUG")
        seq.complement()
        self.assertEqual("UACuuuGAC", str(seq))

    def test_complement_mixed_aphabets(self):
        """Test that complement() rejects a sequence holding both U and T."""
        seq = Seq.MutableSeq("AUGaaaCTG")
        with self.assertRaises(ValueError):
            seq.complement()

    def test_complement_rna_string(self):
        """Test complement() of an RNA string (same data as test_complement_rna)."""
        seq = Seq.MutableSeq("AUGaaaCUG")
        seq.complement()
        self.assertEqual("UACuuuGAC", str(seq))

    def test_complement_dna_string(self):
        """Test in-place complement() of a mixed-case DNA sequence."""
        seq = Seq.MutableSeq("ATGaaaCTG")
        seq.complement()
        self.assertEqual("TACtttGAC", str(seq))

    def test_reverse_complement(self):
        """Test in-place reverse_complement() of the DNA fixture."""
        self.mutable_s.reverse_complement()
        self.assertEqual("CATGATGCATCCTTTTGA", str(self.mutable_s))

    def test_extend_method(self):
        """Test extend() with a plain string."""
        self.mutable_s.extend("GAT")
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGGAT"), self.mutable_s)

    def test_extend_with_mutable_seq(self):
        """Test extend() with another MutableSeq."""
        self.mutable_s.extend(MutableSeq("TTT"))
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGTTT"), self.mutable_s)

    def test_delete_stride_slice(self):
        """Test deleting the slice 4:(6 - 1), i.e. the single letter at index 4."""
        del self.mutable_s[4:6 - 1]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG"), self.mutable_s)

    def test_extract_third_nucleotide(self):
        """Test extracting every third nucleotide (slicing with stride 3)."""
        self.assertEqual(MutableSeq("TAGTAA"), self.mutable_s[0::3])
        self.assertEqual(MutableSeq("CAGGTT"), self.mutable_s[1::3])
        self.assertEqual(MutableSeq("AAACCG"), self.mutable_s[2::3])

    def test_set_wobble_codon_to_n(self):
        """Test setting wobble codon to N (set slice with stride 3)."""
        self.mutable_s[2::3] = "N" * len(self.mutable_s[2::3])
        self.assertEqual(MutableSeq("TCNAANGGNTGNATNATN"), self.mutable_s)
Exemplo n.º 6
0
    def viterbi(self, sequence, state_alphabet):
        """Calculate the most probable state path using the Viterbi algorithm.

        This implements the Viterbi algorithm (see pgs 55-57 in Durbin et
        al for a full explanation -- this is where I took my implementation
        ideas from), to allow decoding of the state path, given a sequence
        of emissions.

        Arguments:

        o sequence -- A Seq object with the emission sequence that we
        want to decode.

        o state_alphabet -- The alphabet of the possible state sequences
        that can be generated.

        Returns a tuple (state path, score), where the state path is the
        traceback converted with toseq() and the score is the largest
        accumulated value over all states at the last position (a log
        probability, given that the inputs are log transformed).
        """
        # calculate logarithms of the initial, transition, and emission probs
        log_initial = self._log_transform(self.initial_prob)
        log_trans = self._log_transform(self.transition_prob)
        log_emission = self._log_transform(self.emission_prob)

        viterbi_probs = {}
        pred_state_seq = {}
        state_letters = state_alphabet.letters

        # --- recursion
        # loop over the training sequence (i = 1 .. L)
        # NOTE: My index numbers are one less than what is given in Durbin
        # et al, since we are indexing the sequence going from 0 to
        # (Length - 1) not 1 to Length, like in Durbin et al.
        for i in range(0, len(sequence)):
            # loop over all of the possible i-th states in the state path
            for cur_state in state_letters:
                # e_{l}(x_{i})
                emission_part = log_emission[(cur_state, sequence[i])]

                if i == 0:
                    # for the first state, use the initial probability rather
                    # than looking back to previous states
                    max_prob = log_initial[cur_state]
                else:
                    # loop over all possible (i-1)-th previous states
                    possible_state_probs = {}
                    for prev_state in self.transitions_to(cur_state):
                        # a_{kl}
                        trans_part = log_trans[(prev_state, cur_state)]

                        # v_{k}(i - 1)
                        viterbi_part = viterbi_probs[(prev_state, i - 1)]
                        cur_prob = viterbi_part + trans_part

                        possible_state_probs[prev_state] = cur_prob

                    # calculate the viterbi probability using the max
                    max_prob = max(possible_state_probs.values())

                # v_{k}(i)
                viterbi_probs[(cur_state, i)] = (emission_part + max_prob)

                if i > 0:
                    # get the most likely prev_state leading to cur_state
                    # (the first one found wins when several tie)
                    for state in possible_state_probs:
                        if possible_state_probs[state] == max_prob:
                            pred_state_seq[(i - 1, cur_state)] = state
                            break

        # --- termination
        # calculate the probability of the state path
        # loop over all states
        all_probs = {}
        for state in state_letters:
            # v_{k}(L)
            all_probs[state] = viterbi_probs[(state, len(sequence) - 1)]

        state_path_prob = max(all_probs.values())

        # find the last pointer we need to trace back from
        # (no break here: the last state reaching the maximum wins)
        last_state = ''
        for state in all_probs:
            if all_probs[state] == state_path_prob:
                last_state = state

        assert last_state != '', "Didn't find the last state to trace from!"

        # --- traceback
        traceback_seq = MutableSeq('', state_alphabet)

        # last_state is the last state in the most probable state sequence.
        # Compute that sequence by walking backwards in time. From the i-th
        # state in the sequence, find the (i-1)-th state as the most
        # probable state preceding the i-th state.
        state = last_state
        traceback_seq.append(state)
        # reversed() works on both a Python 2 list and a Python 3 range
        # object; calling .reverse() on range() fails on Python 3.
        for i in reversed(range(1, len(sequence))):
            state = pred_state_seq[(i - 1, state)]
            traceback_seq.append(state)

        # put the traceback sequence in the proper orientation
        traceback_seq.reverse()

        return traceback_seq.toseq(), state_path_prob
Exemplo n.º 7
0
    def viterbi(self, sequence, state_alphabet):
        """Decode the most probable state path with the Viterbi algorithm.

        The implementation follows Durbin et al (pgs 55-57), but positions
        are numbered from 0 to (Length - 1) instead of 1 to Length.

        Arguments:

        o sequence -- A Seq object with the emission sequence that we
        want to decode.

        o state_alphabet -- The alphabet of the possible state sequences
        that can be generated.

        Returns a tuple (state path, score): the traceback converted with
        toseq(), and the largest accumulated value over all states at the
        last position.
        """
        # log transformed initial, transition and emission probabilities
        log_initial = self._log_transform(self.initial_prob)
        log_trans = self._log_transform(self.transition_prob)
        log_emission = self._log_transform(self.emission_prob)

        # best_score[(state, pos)]      -> v_{state}(pos)
        # back_pointer[(pos - 1, state)] -> best predecessor of state at pos
        best_score = {}
        back_pointer = {}
        states = state_alphabet.letters
        seq_len = len(sequence)

        # --- recursion over every position and every candidate state
        for pos in range(seq_len):
            for cur_state in states:
                # e_{l}(x_{i})
                emission_part = log_emission[(cur_state, sequence[pos])]

                if pos == 0:
                    # nothing to look back to: start from the initial probs
                    best_prev = log_initial[cur_state]
                    best_score[(cur_state, pos)] = emission_part + best_prev
                    continue

                # score of arriving from each allowed previous state:
                # v_{k}(i - 1) + a_{kl}
                candidates = {}
                for prev_state in self.transitions_to(cur_state):
                    trans_part = log_trans[(prev_state, cur_state)]
                    candidates[prev_state] = (
                        best_score[(prev_state, pos - 1)] + trans_part)

                best_prev = max(candidates.values())

                # v_{k}(i)
                best_score[(cur_state, pos)] = emission_part + best_prev

                # remember the first predecessor that reaches the maximum
                for prev_state, score in candidates.items():
                    if score == best_prev:
                        back_pointer[(pos - 1, cur_state)] = prev_state
                        break

        # --- termination: v_{k}(L) for every state, and the best of them
        final_scores = {}
        for state in states:
            final_scores[state] = best_score[(state, seq_len - 1)]

        state_path_prob = max(final_scores.values())

        # the last state reaching the maximum is where the traceback starts
        last_state = ''
        for state, score in final_scores.items():
            if score == state_path_prob:
                last_state = state

        assert last_state != '', "Didn't find the last state to trace from!"

        # --- traceback: follow the pointers from the last position down to
        # the first, collecting the states in reverse order
        traceback_seq = MutableSeq('', state_alphabet)
        state = last_state
        traceback_seq.append(state)
        for pos in range(seq_len - 1, 0, -1):
            state = back_pointer[(pos - 1, state)]
            traceback_seq.append(state)

        # put the traceback sequence in the proper orientation
        traceback_seq.reverse()

        return traceback_seq.toseq(), state_path_prob
Exemplo n.º 8
0
class MuGen(object):
    """Perform mutations and deletions/insertions with the desired probability
    and the desired structure. Gets a Seq object, a mutation or indel dictionary,
    and the probabilities for each item in those dictionaries.
    insertprob and deleteprob are base-specific probabilities of length 4.
    mualphabet is a dictionary specifying the possible mutations for each letter of
    the sequence alphabet.
    muprob gives the mutation probability for each letter of the sequence alphabet."""
    def __init__(self,
                 seq,
                 alphaproperty=None,
                 insertprob=None,
                 deleteprob=None,
                 mualphabet=None,
                 muprob=None,
                 mupos=None,
                 delpos=None,
                 inpos=None,
                 verbose=False):
        """Set up the reference sequence and all mutation/indel settings.

        Arguments:

        seq -- a string, a Seq or a MutableSeq. A string is wrapped in a
        MutableSeq, a Seq is converted with tomutable(), and a MutableSeq
        is deep-copied, so the caller's object is not modified here.

        alphaproperty -- alphabet used only when seq is a string; a plain
        Alphabet() is used when it is None.

        insertprob, deleteprob, muprob -- dictionaries mapping a letter to
        a probability between 0 and 1 (inclusive). Letters of the sequence
        missing from a dictionary get probability 0; an empty or missing
        dictionary gives 0 for every letter.

        mualphabet -- dictionary mapping a single letter to the letters it
        may mutate into. Keys and values are converted with str(). Letters
        without an entry may mutate into any other letter of the alphabet.

        mupos, delpos, inpos -- explicit positions for mutations,
        deletions and insertions; each must lie in range(len(reference)).

        verbose -- must be a bool; when True, warnings and a success
        message are printed.

        Any CustomException raised during validation is printed and the
        process exits with status 2.
        """
        # History of the changes made to the reference; needed for writing
        # the haplotypes in the format of haplotyping software.
        self.occureddel = []
        self.occuredmu = []
        self.occuredins = []
        # inserted alleles, kept so they can be retrieved when needed
        self.inserted_allele = []
        # substituted alleles
        self.alt_allele = []
        try:
            if not isinstance(verbose, bool):
                raise CustomException(
                    "ERROR: verbose must be set to either True or False. "
                    "Default is to False")
            self.verbose = verbose

            self._load_sequence(seq, alphaproperty)

            # validation order (deletion, insertion, mutation) decides
            # which error is reported first
            self.delpos = self._checked_positions(delpos, "Deletion")
            self.inpos = self._checked_positions(inpos, "Insertion")
            self.mupos = self._checked_positions(mupos, "Mutation")

            # may extend self.alphabet, so it must run before the
            # probability tables are filled in
            self._build_mualphabet(mualphabet)

            self.insertprob = self._checked_probabilities(
                insertprob, "insertion")
            self.deleteprob = self._checked_probabilities(
                deleteprob, "deletion")
            self.muprob = self._checked_probabilities(muprob, "mutation")
        except CustomException as instance:
            print(instance)
            sys.exit(2)
        else:
            if self.verbose:
                print(
                    "MuGen object successfully created.\nWARNING: MuGen sequence is case sensitive!"
                )

    def _load_sequence(self, seq, alphaproperty):
        """Store a private mutable copy of seq plus its alphabet and reference string."""
        if isinstance(seq, str):
            if alphaproperty is None:
                if self.verbose:
                    print(
                        "WARNING: No alphabet type is specified for the sequence string!"
                    )
                self.alphaproperty = Alphabet()
            else:
                self.alphaproperty = alphaproperty
            self.seq = MutableSeq(seq, self.alphaproperty)
        elif isinstance(seq, Seq):
            self.alphaproperty = seq.alphabet
            self.seq = seq.tomutable()
        elif isinstance(seq, MutableSeq):
            self.alphaproperty = seq.alphabet
            self.seq = copy.deepcopy(seq)
        else:
            raise CustomException(
                "ERROR: Should provide a Seq or MutableSeq object, \n "
                "or a string sequence!")
        # the alphabet is the set of distinct letters seen in the sequence
        self.alphabet = set(str(self.seq))
        self.ref = str(self.seq)

    def _checked_positions(self, positions, label):
        """Return positions as a list after checking they index the reference."""
        if not positions:
            return []
        if set(positions).issubset(set(range(len(self.ref)))):
            return list(positions)
        raise CustomException(
            "ERROR: %s positions exceed the range of the reference "
            "or are not positive integers!" % label)

    def _build_mualphabet(self, mualphabet):
        """Validate the mutation alphabet and store it as self.mualphabet."""
        if not mualphabet:
            if self.verbose:
                print(
                    "WARNING: You have specified no mutation alphabet! "
                    "Mutations are set to random letters!")
            # Non-specified mutations could happen to any letter
            self.mualphabet = {}
            for key in self.alphabet:
                self.mualphabet[key] = ''.join(self.alphabet - {key})
            return

        # items() instead of iteritems(): the latter does not exist on
        # Python 3 dictionaries
        mualphabet = {str(k): str(v) for k, v in mualphabet.items()}
        for key, value in mualphabet.items():
            if len(key) != 1:
                raise CustomException(
                    "ERROR: the mutation alphabet deals with point mutations! "
                    "Only single letters are allowed as keys!")
            elif key in value:
                raise CustomException(
                    "ERROR: Wrong mutation values specified! A letter could "
                    "just be substituted with a different letter for "
                    "mutation!")

        sources = set(mualphabet.keys())
        targets = set(''.join(mualphabet.values()))
        if sources == self.alphabet and targets <= self.alphabet:
            # fully specified and compatible with the sequence
            self.mualphabet = dict(mualphabet)
        elif sources < self.alphabet and targets < self.alphabet:
            if self.verbose:
                print(
                    "WARNING: Mutation is not specified for some letters! "
                    "Those mutations are set to random letters!")
            # Whatever has been specified for mutation alphabet is kept intact
            self.mualphabet = dict(mualphabet)
            for key in self.alphabet - sources:
                # Non-specified mutations could happen to any letter
                self.mualphabet[key] = ''.join(self.alphabet - {key})
        else:
            if self.verbose:
                print(
                    "WARNING: Mutation alphabet is not compatible with "
                    "sequence alphabet! Both alphabets are updated and\n"
                    "unspecified mutations are set to random letters!")
            # As mutation may introduce novel alleles in the sequence, the
            # alphabet is updated first; the given entries are kept intact
            for key, value in mualphabet.items():
                self.alphabet.add(key)
                self.alphabet |= set(value) - self.alphabet
            new_mualphabet = dict(mualphabet)
            for key in self.alphabet - set(new_mualphabet.keys()):
                # Non-specified mutations could happen to any letter
                new_mualphabet[key] = ''.join(self.alphabet - {key})
            self.mualphabet = new_mualphabet

    def _checked_probabilities(self, probs, label):
        """Return a per-letter probability table, with 0 for unlisted letters."""
        if not probs:
            # nothing given: probability is zero everywhere
            return {key: 0 for key in self.alphabet}

        if set(probs.keys()) != self.alphabet:
            if self.verbose:
                print(
                    "WARNING: Missing/Invalid letter(s) in %s probability!\n"
                    "Probabilities are set to zero for missing letters! "
                    "Invalid letters are ignored!" % label)

        # NOTE(review): entries for letters outside the alphabet are kept
        # in the table, exactly as before, although the warning above says
        # they are ignored.
        checked = {}
        for key, value in probs.items():
            if 0 <= value <= 1:
                checked[key] = value
            else:
                raise CustomException(
                    "ERROR: %s probability must be >=0 and <=1!" %
                    label.capitalize())
        for key in self.alphabet - set(checked.keys()):
            checked[key] = 0
        return checked

    def __repr__(self):
        """Return a multi-line, human-readable dump of the object's state.

        The text lists, in this order: the current haplotype (``self.seq``),
        the reference sequence, the mutation probabilities, the mutation
        alphabet (labelled "Mutations"), the insertion and deletion
        probabilities, and the insertion, deletion and mutation positions.
        It is a diagnostic summary, not an expression that rebuilds the object.
        """
        # The backslash-newline pairs inside the literal are line continuations:
        # they join the three source lines into a single format string.
        return "Haplotype: %s, \n Reference sequence: %s, \n Mutation probabilty: %s, \n Mutations: %s, \n \
Insertion probabilty: %s, \n Deletion Probability: %s, \n \
Insertion positions: %s, \n Deletion positions: %s, \n Mutation positions: %s \n" % (
            self.seq, self.ref, self.muprob, self.mualphabet, self.insertprob,
            self.deleteprob, self.inpos, self.delpos, self.mupos)

    def __str__(self):
        """Return the same text as ``repr(self)``."""
        return repr(self)

    def get_hap(self):  # Access Methods
        """Return the current haplotype, i.e. the object stored in ``self.seq``."""
        return self.seq

    def get_ref(self):
        """Return the reference sequence (``self.ref``)."""
        return self.ref

    def get_insertprob(self):
        """Return the per-letter insertion probability dict.

        The internal dict itself is returned, not a copy.
        """
        return self.insertprob

    def get_deleteprob(self):
        """Return the per-letter deletion probability dict.

        The internal dict itself is returned, not a copy.
        """
        return self.deleteprob

    def get_muprob(self):
        """Return the per-letter mutation (substitution) probability dict.

        The internal dict itself is returned, not a copy.
        """
        return self.muprob

    def get_mualphabet(self):
        """Return the mutation alphabet.

        This is the dict mapping each letter to the string of letters it may
        be substituted with.  The internal dict itself is returned, not a copy.
        """
        return self.mualphabet

    def get_mupos(self):
        """Return the configured mutation positions (``self.mupos``)."""
        return self.mupos

    def get_inpos(self):
        """Return the configured insertion positions (``self.inpos``)."""
        return self.inpos

    def get_delpos(self):
        """Return the configured deletion positions (``self.delpos``)."""
        return self.delpos

    def get_occureddelpos(self):
        """Return the reference positions that were deleted.

        ``self.occureddel`` is rebuilt by each call to probmu(), posmu() or
        hapchanger(), so this reflects the most recent of those calls.
        """
        return self.occureddel

    def get_occuredmupos(self):
        """Return the reference positions where a substitution occurred.

        ``self.occuredmu`` is rebuilt by each call to probmu(), posmu() or
        hapchanger(), so this reflects the most recent of those calls.
        """
        return self.occuredmu

    def get_occuredinspos(self):
        """Return the reference positions after which a letter was inserted.

        ``self.occuredins`` is rebuilt by each call to probmu(), posmu() or
        hapchanger(), so this reflects the most recent of those calls.
        """
        return self.occuredins

    def get_ins_allele(self):
        """Return the inserted alleles recorded by the last change method.

        Each entry is the reference letter followed by the letter inserted
        after it; entries are parallel to the list of insertion positions
        kept in ``self.occuredins``.
        """
        return self.inserted_allele

    def get_mu_allele(self):
        """Return the substituted letters recorded by the last change method.

        Entries are parallel to the list of substitution positions kept in
        ``self.occuredmu``.
        """
        return self.alt_allele

    def set_ref(self, ref):  # Modifier methods
        """Replace the reference sequence with ``str(ref)``.

        The new reference is accepted only if all of its letters belong to
        ``self.alphabet`` and every position currently stored in ``self.mupos``,
        ``self.inpos`` and ``self.delpos`` is a valid index into it.

        A rejected reference is reported on stdout and the object is left
        unchanged; no exception reaches the caller in that case.  Any other
        exception is re-raised after the failure message has been printed.
        """
        try:
            candidate = str(ref)
            if not set(candidate).issubset(self.alphabet):
                raise CustomException(
                    "ERROR: the new reference is not compatible with the current alphabet!"
                )
            # Every index that exists in the candidate reference.
            valid_sites = set(range(len(candidate)))
            if not set(self.mupos).issubset(valid_sites):
                raise CustomException(
                    "ERROR: Mutation positions exceed the range of the new reference!"
                )
            if not set(self.inpos).issubset(valid_sites):
                raise CustomException(
                    "ERROR: Insertion positions exceed the range of the new reference!"
                )
            if not set(self.delpos).issubset(valid_sites):
                raise CustomException(
                    "ERROR: Deletion positions exceed the range of the new reference!"
                )
            self.ref = candidate
        except CustomException as failure:
            print("Failed to update the reference!")
            print(failure)
        except:
            print("Failed to update the reference!")
            raise
        else:
            if self.verbose:
                print("The reference sequence has been updated!")

    def set_pos(
        self,
        inpos=None,
        delpos=None,
        mupos=None,
    ):
        """Replace the stored insertion, deletion and/or mutation positions.

        An argument left as ``None`` keeps the current list.  Any other value
        must be an iterable whose items are all valid indices into
        ``self.ref``; it is stored as a new list.  All supplied arguments are
        validated before anything is stored, so one bad argument leaves all
        three lists unchanged.

        A rejected argument is reported on stdout and no exception reaches the
        caller.  Any other exception is re-raised after the failure message.
        """

        def _inside_reference(positions):
            # True when every given position is an index of the reference.
            return set(positions).issubset(set(range(len(self.ref))))

        try:
            # Validation happens in the order deletion, insertion, mutation.
            if delpos is not None and not _inside_reference(delpos):
                raise CustomException(
                    "ERROR: New deletion positions exceed the range of the reference or are not positive integers!"
                )
            if inpos is not None and not _inside_reference(inpos):
                raise CustomException(
                    "ERROR: New insertion positions exceed the range of the reference or are not positive integers!"
                )
            if mupos is not None and not _inside_reference(mupos):
                raise CustomException(
                    "ERROR: New mutation positions exceed the range of the reference or are not positive integers!"
                )
            # Everything validated: store the new lists.
            if delpos is not None:
                self.delpos = list(delpos)
            if inpos is not None:
                self.inpos = list(inpos)
            if mupos is not None:
                self.mupos = list(mupos)
        except CustomException as failure:
            print("Failed to update indel and mutation positions!")
            print(failure)
        except:
            print("Failed to update indel and mutation positions!")
            raise
        else:
            if self.verbose:
                print("Indel and mutation positions updated!")

    def set_prob(self, insertprob=None, deleteprob=None, muprob=None):
        """Update the insertion, deletion and mutation probabilities.

        Each argument is handled independently:

        * ``None`` (the default) -- the current table is kept.
        * an empty mapping -- the probability becomes 0 for every letter of
          ``self.alphabet``.
        * a non-empty mapping ``{letter: probability}`` -- every value must lie
          in ``[0, 1]``; alphabet letters missing from the mapping get
          probability 0.  A warning is printed in verbose mode when the keys
          do not match the alphabet exactly.

        All three arguments are validated before anything is stored, so a bad
        value leaves the object unchanged.  A value outside ``[0, 1]`` is
        reported on stdout and no exception reaches the caller; any other
        exception is re-raised after the failure message has been printed.
        """

        def _normalise(prob, label):
            # Return the table to store for one argument, or None for "keep".
            if prob is None:
                return None
            if not prob:
                return dict.fromkeys(self.alphabet, 0)
            if set(prob.keys()) != self.alphabet and self.verbose:
                print(
                    "WARNING: Missing/Invalid letter(s) in %s probability!\n"
                    "Probabilities are set to zero for missing letters! "
                    "Invalid letters are ignored!" % label.lower())
            table = dict()
            # The range check runs for every supplied table, including one
            # whose keys match the alphabet exactly (previously unchecked).
            # .items() (not .iteritems()) keeps this working on Python 3.
            for key, value in prob.items():
                if 0 <= value <= 1:
                    table[key] = value
                else:
                    raise CustomException(
                        "ERROR: %s probability must be >=0 and <=1!" % label)
            for key in self.alphabet - set(table.keys()):
                table[key] = 0
            return copy.deepcopy(table)

        try:
            new_insertprob = _normalise(insertprob, "Insertion")
            new_deleteprob = _normalise(deleteprob, "Deletion")
            new_muprob = _normalise(muprob, "Mutation")
            # Nothing is assigned until all three tables have validated.
            if new_deleteprob is not None:
                self.deleteprob = new_deleteprob
            if new_muprob is not None:
                self.muprob = new_muprob
            if new_insertprob is not None:
                self.insertprob = new_insertprob
        except CustomException as instance:
            print(instance)
            print("Failed to update indel and mutation probabilities!")
        except Exception:
            print("Failed to update indel and mutation probabilities!")
            raise
        else:
            if self.verbose:
                print("Indel and mutation probabilities successfully updated!")

    def set_mualphabet(self, mualphabet=None):
        """Replace the mutation alphabet.

        ``mualphabet`` maps a single letter to the letters it may be
        substituted with; keys and values are converted with ``str``.

        * ``None`` or an empty mapping -- every letter of ``self.alphabet`` may
          mutate to any other letter of the alphabet.
        * a mapping using only letters of ``self.alphabet`` -- stored as given;
          letters without an entry may mutate to any other letter.
        * a mapping that uses letters outside ``self.alphabet`` -- those
          letters are first added to ``self.alphabet`` (in place); letters
          without an entry may then mutate to any other letter.

        A key longer than one character, or a letter listed among its own
        substitutions, is reported on stdout and nothing is changed; no
        exception reaches the caller.  Any other exception is re-raised after
        the failure message has been printed.
        """
        try:
            if not mualphabet:
                if self.verbose:
                    print(
                        "WARNING: You have specified no mutation alphabet! Mutations are set to random "
                        "letters!")
                # Unspecified mutations may produce any other letter.
                self.mualphabet = {
                    key: ''.join(self.alphabet - {key})
                    for key in self.alphabet
                }
            else:
                # .items() (not .iteritems()) keeps this working on Python 3.
                mualphabet = {
                    str(k): str(v) for k, v in mualphabet.items()
                }
                for key, value in mualphabet.items():
                    if len(key) != 1:
                        raise CustomException(
                            "ERROR: the mutation alphabet deals with point mutations! Only single letters are"
                            " allowed as keys!")
                    elif key in value:
                        raise CustomException(
                            "ERROR: Wrong mutation values specified! A letter could just be substituted with a"
                            " different letter for mutation!")
                sources = set(mualphabet.keys())
                targets = set(''.join(mualphabet.values()))
                if sources <= self.alphabet and targets <= self.alphabet:
                    # Compatible with the sequence alphabet.  "<=" on the
                    # targets matters: substitutions that together cover the
                    # whole alphabet are still compatible.
                    if sources != self.alphabet and self.verbose:
                        print(
                            "WARNING: Mutation is not specified for some letters! Those mutations are set"
                            " to random letters!")
                else:
                    if self.verbose:
                        print(
                            "WARNING: Mutation alphabet is not compatible with sequence alphabet! Both alphabets are"
                            " updated and\nunspecified mutations are set to random letters!")
                    # Mutation may introduce novel letters, so the sequence
                    # alphabet is extended (in place) before filling the gaps.
                    self.alphabet |= sources | targets
                # Whatever was specified is kept intact ...
                new_mualphabet = dict(mualphabet)
                # ... and every remaining letter may mutate to any other one.
                for key in self.alphabet - sources:
                    new_mualphabet[key] = ''.join(self.alphabet - {key})
                self.mualphabet = new_mualphabet

        except CustomException as instance:
            print(instance)
            print("Mualphabet could not be updated!")
        except Exception:
            print("Mualphabet could not be updated!")
            raise
        else:
            if self.verbose:
                print("Mualphabet successfully updated!")

    def probmu(self):
        """Generate a haplotype by applying random changes to the reference.

        Sites listed in ``self.mupos``, ``self.inpos`` or ``self.delpos`` are
        copied unchanged.  At every other site one error type is drawn at
        random (insertion, deletion or substitution; substitution is twice as
        likely to be drawn) and is applied only when a second random number is
        below the probability configured for that error type and reference
        letter (``self.insertprob``, ``self.deleteprob``, ``self.muprob``).

        Returns:
            None.  The results are stored on the object:

            * ``self.seq`` -- the new haplotype, a ``MutableSeq`` built with
              ``self.alphaproperty``.
            * ``self.occuredmu`` / ``self.alt_allele`` -- substituted sites and
              the letters written there.
            * ``self.occuredins`` / ``self.inserted_allele`` -- sites followed
              by an insertion, and "reference letter + inserted letter".
            * ``self.occureddel`` -- deleted sites.

            All position lists are in ascending order.
        """
        self.occuredmu = []
        self.occureddel = []
        self.occuredins = []
        self.inserted_allele = []
        self.alt_allele = []
        # Built once; the original rebuilt this union for every site.
        protected = set(self.mupos) | set(self.inpos) | set(self.delpos)
        tables = {
            'ins': self.insertprob,
            'del': self.deleteprob,
            'sub': self.muprob
        }
        progeny = []
        for site, base in enumerate(self.ref):
            if site in protected:
                # No change is made at indel/mutation positions.
                progeny.append(base)
                continue
            # An error occurs randomly: insertion, deletion or substitution.
            error = random.choice(['ins', 'del', 'sub', 'sub'])
            # Random number truncated to five decimals; the error survives
            # only if it is below the configured probability.
            rnd = float(int(random.random() * 100000)) / 100000
            # A letter without an entry counts as probability 0, i.e. it is
            # never changed (on Python 3 a missing entry raised TypeError).
            if rnd < tables[error].get(base, 0):
                if error == 'sub':
                    # Substitute the letter with one from the mutation alphabet.
                    letter = random.choice(self.mualphabet.get(base))
                    progeny.append(letter)
                    self.occuredmu.append(site)
                    self.alt_allele.append(letter)
                elif error == 'ins':
                    # Insert a random letter right after the reference letter.
                    progeny.append(base)
                    letter = random.choice(list(self.alphabet))
                    progeny.append(letter)
                    self.occuredins.append(site)
                    self.inserted_allele.append(base + letter)
                else:
                    # Deletion: the letter is simply not copied.
                    self.occureddel.append(site)
            else:
                # No change is induced at the site.
                progeny.append(base)
        self.seq = MutableSeq(''.join(progeny), self.alphaproperty)
        # Keep positions and alleles paired while sorting by position.
        # sorted(zip(...)) works on Python 2 and 3; zip(...).sort() needs a list.
        if self.occuredins:
            pairs = sorted(zip(self.occuredins, self.inserted_allele),
                           key=lambda tup: tup[0])
            self.occuredins = [pair[0] for pair in pairs]
            self.inserted_allele = [pair[1] for pair in pairs]
        if self.occuredmu:
            pairs = sorted(zip(self.occuredmu, self.alt_allele),
                           key=lambda tup: tup[0])
            self.occuredmu = [pair[0] for pair in pairs]
            self.alt_allele = [pair[1] for pair in pairs]
        self.occureddel.sort()
        if self.verbose:
            print(
                "WARNING: If indel/mutation positions are specified, MuGen.probmu() makes no change at those sites. \n "
                "Use MuGen.posmu() or Mugen.hapchanger() to apply changes at those sites!")
            print("Changes made to the haplotype!")

    def posmu(self):
        """Generate a haplotype by applying the configured positional changes.

        Every site in ``self.mupos`` is substituted with a random letter from
        its entry in ``self.mualphabet``, every site in ``self.delpos`` is
        dropped, and every site in ``self.inpos`` is followed by a random
        letter of ``self.alphabet``.  When a site appears in more than one
        list only one change is applied, with priority mutation, then
        deletion, then insertion.  All other sites are copied unchanged.

        Returns:
            None.  The results are stored on the object:

            * ``self.seq`` -- the new haplotype, a ``MutableSeq`` built with
              ``self.alphaproperty``.
            * ``self.occuredmu`` / ``self.alt_allele`` -- substituted sites and
              the letters written there.
            * ``self.occuredins`` / ``self.inserted_allele`` -- sites followed
              by an insertion, and "reference letter + inserted letter".
            * ``self.occureddel`` -- deleted sites.

            All position lists are in ascending order.
        """
        planned = [None] * len(self.ref)
        self.occuredmu = []
        self.occureddel = []
        self.occuredins = []
        self.inserted_allele = []
        self.alt_allele = []
        # Later assignments overwrite earlier ones, which yields the priority
        # mutation > deletion > insertion for overlapping positions.
        for site in self.inpos:
            planned[site] = 'ins'
        for site in self.delpos:
            planned[site] = 'del'
        for site in self.mupos:
            planned[site] = 'sub'
        progeny = []
        for site, error in enumerate(planned):
            base = self.ref[site]
            if error is None:
                progeny.append(base)
            elif error == 'sub':
                # Substitute the letter with one from the mutation alphabet.
                letter = random.choice(self.mualphabet.get(base))
                progeny.append(letter)
                self.occuredmu.append(site)
                self.alt_allele.append(letter)
            elif error == 'ins':
                # Insert a random letter right after the reference letter.
                progeny.append(base)
                letter = random.choice(list(self.alphabet))
                progeny.append(letter)
                self.occuredins.append(site)
                self.inserted_allele.append(base + letter)
            else:
                # Deletion: the letter is simply not copied.
                self.occureddel.append(site)
        self.seq = MutableSeq(''.join(progeny), self.alphaproperty)
        # Keep positions and alleles paired while sorting by position.
        # sorted(zip(...)) works on Python 2 and 3; zip(...).sort() needs a list.
        if self.occuredins:
            pairs = sorted(zip(self.occuredins, self.inserted_allele),
                           key=lambda tup: tup[0])
            self.occuredins = [pair[0] for pair in pairs]
            self.inserted_allele = [pair[1] for pair in pairs]
        if self.occuredmu:
            pairs = sorted(zip(self.occuredmu, self.alt_allele),
                           key=lambda tup: tup[0])
            self.occuredmu = [pair[0] for pair in pairs]
            self.alt_allele = [pair[1] for pair in pairs]
        self.occureddel.sort()
        if self.verbose:
            print(
                "WARNING: if there are overlaps betweeen deletion, insertion and mutation positions, \n "
                "just one of the changes takes place with the following priority: \n "
                "1)Mutation  2)Deletion 3)Insertion. \n")
            print("Changes made to the haplotype!")

    def hapchanger(self):
        """Generate a haplotype using both positional and random changes.

        Sites listed in ``self.mupos``, ``self.inpos`` or ``self.delpos``
        receive the corresponding change unconditionally.  At every other site
        one error type is drawn at random (insertion, deletion or
        substitution; substitution is twice as likely to be drawn) and applied
        only when a second random number is below the probability configured
        for that error type and reference letter.

        Returns:
            None.  The results are stored on the object:

            * ``self.seq`` -- the new haplotype, a ``MutableSeq`` built with
              ``self.alphaproperty``.
            * ``self.occuredmu`` / ``self.alt_allele`` -- substituted sites and
              the letters written there.
            * ``self.occuredins`` / ``self.inserted_allele`` -- sites followed
              by an insertion, and "reference letter + inserted letter".
            * ``self.occureddel`` -- deleted sites.

            All position lists are in ascending order.
        """
        self.occuredmu = []
        self.occureddel = []
        self.occuredins = []
        self.inserted_allele = []
        self.alt_allele = []
        # Built once; the original rebuilt these sets for every site.
        mu_sites = set(self.mupos)
        in_sites = set(self.inpos)
        del_sites = set(self.delpos)
        tables = {
            'ins': self.insertprob,
            'del': self.deleteprob,
            'sub': self.muprob
        }
        progeny = []
        for site, base in enumerate(self.ref):
            # NOTE(review): for overlapping positions the priority here is
            # mutation > insertion > deletion, whereas posmu() applies
            # mutation > deletion > insertion.  Behaviour is kept as it was;
            # confirm which order is intended.
            if site in mu_sites:
                error = 'sub'
            elif site in in_sites:
                error = 'ins'
            elif site in del_sites:
                error = 'del'
            else:
                # No change is specified here: draw one from the probability
                # model.  An error occurs randomly and survives only if a
                # random number (truncated to five decimals) is below the
                # configured probability.
                error = random.choice(['ins', 'del', 'sub', 'sub'])
                rnd = float(int(random.random() * 100000)) / 100000
                # A letter without an entry counts as probability 0 (on
                # Python 3 a missing entry raised TypeError).
                if not rnd < tables[error].get(base, 0):
                    progeny.append(base)
                    continue
            if error == 'sub':
                # Substitute the letter with one from the mutation alphabet.
                letter = random.choice(self.mualphabet.get(base))
                progeny.append(letter)
                self.occuredmu.append(site)
                self.alt_allele.append(letter)
            elif error == 'ins':
                # Insert a random letter right after the reference letter.
                progeny.append(base)
                letter = random.choice(list(self.alphabet))
                progeny.append(letter)
                self.occuredins.append(site)
                self.inserted_allele.append(base + letter)
            else:
                # Deletion: the letter is simply not copied.
                self.occureddel.append(site)
        self.seq = MutableSeq(''.join(progeny), self.alphaproperty)
        # Keep positions and alleles paired while sorting by position.
        # sorted(zip(...)) works on Python 2 and 3; zip(...).sort() needs a list.
        if self.occuredins:
            pairs = sorted(zip(self.occuredins, self.inserted_allele),
                           key=lambda tup: tup[0])
            self.occuredins = [pair[0] for pair in pairs]
            self.inserted_allele = [pair[1] for pair in pairs]
        if self.occuredmu:
            pairs = sorted(zip(self.occuredmu, self.alt_allele),
                           key=lambda tup: tup[0])
            self.occuredmu = [pair[0] for pair in pairs]
            self.alt_allele = [pair[1] for pair in pairs]
        self.occureddel.sort()
        if self.verbose:
            print("Changes made to the haplotype!")
Exemplo n.º 9
0
    def viterbi(self, sequence, state_alphabet):
        """Calculate the most probable state path using the Viterbi algorithm.

        This implements the Viterbi algorithm (see pgs 55-57 in Durbin et
        al for a full explanation -- this is where I took my implementation
        ideas from), to allow decoding of the state path, given a sequence
        of emissions.  All arithmetic is done on log-transformed
        probabilities, so probabilities along a path are combined by addition.

        Arguments:

        o sequence -- A Seq object with the emission sequence that we
        want to decode.

        o state_alphabet -- The alphabet of the possible state sequences
        that can be generated.

        Returns a tuple ``(state_path, score)``: the decoded state path as a
        Seq over ``state_alphabet`` and the log-space score of that path.
        """
        # calculate logarithms of the transition and emission probs
        log_trans = self._log_transform(self.transition_prob)
        log_emission = self._log_transform(self.emission_prob)

        viterbi_probs = {}
        pred_state_seq = {}
        state_letters = state_alphabet.letters
        # --- initialization
        #
        # NOTE: My index numbers are one less than what is given in Durbin
        # et al, since we are indexing the sequence going from 0 to
        # (Length - 1) not 1 to Length, like in Durbin et al.
        #
        # NOTE(review): these start values (1 and 0) are plain probabilities,
        # while everything added to them below is a log value -- confirm
        # whether log-space start values were intended.
        # v_{0}(0) = 1
        viterbi_probs[(state_letters[0], -1)] = 1
        # v_{k}(0) = 0 for k > 0
        for state_letter in state_letters[1:]:
            viterbi_probs[(state_letter, -1)] = 0

        # --- recursion
        # loop over the training sequence (i = 1 .. L)
        for i in range(0, len(sequence)):
            # now loop over all of the letters in the state path
            for main_state in state_letters:
                # e_{l}(x_{i})
                emission_part = log_emission[(main_state, sequence[i])]

                # loop over all possible states
                # NOTE(review): the states come from transitions_from(main_state)
                # but are used as the *source* of a transition into main_state;
                # presumably the predecessors of main_state are wanted -- confirm.
                possible_state_probs = {}
                for cur_state in self.transitions_from(main_state):
                    # a_{kl}
                    trans_part = log_trans[(cur_state, main_state)]

                    # v_{k}(i - 1)
                    viterbi_part = viterbi_probs[(cur_state, i - 1)]
                    cur_prob = viterbi_part + trans_part

                    possible_state_probs[cur_state] = cur_prob

                # finally calculate the viterbi probability using the max
                max_prob = max(possible_state_probs.values())
                viterbi_probs[(main_state, i)] = (emission_part + max_prob)

                # now get the most likely state
                for state in possible_state_probs:
                    if possible_state_probs[state] == max_prob:
                        pred_state_seq[(i - 1, main_state)] = state
                        break

        # --- termination
        # calculate the probability of the state path
        # loop over all letters
        all_probs = {}
        for state in state_letters:
            # v_{k}(L)
            viterbi_part = viterbi_probs[(state, len(sequence) - 1)]
            # a_{k0}
            transition_part = log_trans[(state, state_letters[0])]

            # Both terms are logarithms, so the product of the underlying
            # probabilities is their SUM.  (Multiplying two negative logs
            # gives a positive number and makes max() favour the least
            # likely end state.)
            all_probs[state] = viterbi_part + transition_part

        state_path_prob = max(all_probs.values())

        # find the last pointer we need to trace back from
        last_state = ''
        for state in all_probs:
            if all_probs[state] == state_path_prob:
                last_state = state

        assert last_state != '', "Didn't find the last state to trace from!"

        # --- traceback
        traceback_seq = MutableSeq('', state_alphabet)

        cur_state = last_state
        # reversed(range(...)) works on Python 2 and 3; range(...).reverse()
        # requires range() to return a list.
        for i in reversed(range(0, len(sequence))):
            traceback_seq.append(cur_state)

            cur_state = pred_state_seq[(i - 1, cur_state)]

        # put the traceback sequence in the proper orientation
        traceback_seq.reverse()

        return traceback_seq.toseq(), state_path_prob
Exemplo n.º 10
0
class TestMutableSeq(unittest.TestCase):
    """Exercise MutableSeq creation, comparison, mutation and slicing.

    Every test starts from the same 18-letter DNA string, held both as an
    immutable Seq (self.s) and as a MutableSeq (self.mutable_s).
    """

    def setUp(self):
        """Build a fresh Seq and MutableSeq fixture before each test."""
        self.s = Seq.Seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna)
        self.mutable_s = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna)

    def test_mutableseq_creation(self):
        """Test creating MutableSeqs in multiple ways"""
        # directly from a string
        mutable_s = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna)
        self.assertIsInstance(mutable_s, MutableSeq, "Creating MutableSeq")

        # by converting the immutable fixture
        mutable_s = self.s.tomutable()
        self.assertIsInstance(mutable_s, MutableSeq,
                              "Converting Seq to mutable")

        # from an array of characters
        array_seq = MutableSeq(
            array.array(array_indicator, "TCAAAAGGATGCATCATG"),
            IUPAC.ambiguous_dna)
        self.assertIsInstance(array_seq, MutableSeq,
                              "Creating MutableSeq using array")

    def test_repr(self):
        """Check repr() of the MutableSeq fixture."""
        self.assertEqual(
            "MutableSeq('TCAAAAGGATGCATCATG', IUPACAmbiguousDNA())",
            repr(self.mutable_s))

    def test_truncated_repr(self):
        """Check that repr() of a long MutableSeq is abbreviated with '...'."""
        seq = "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGA"
        expected = "MutableSeq('TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATG...GGA', IUPACAmbiguousDNA())"
        self.assertEqual(expected, repr(MutableSeq(seq, IUPAC.ambiguous_dna)))

    def test_equal_comparison(self):
        """Test __eq__ comparison method"""
        self.assertEqual(self.mutable_s, "TCAAAAGGATGCATCATG")

    def test_equal_comparison_of_incompatible_alphabets(self):
        """Compare a DNA and an RNA MutableSeq with ==, recording warnings."""
        # no assertion here: the test only fails if the comparison raises
        with warnings.catch_warnings(record=True):
            self.mutable_s == MutableSeq('UCAAAAGGA', IUPAC.ambiguous_rna)

    def test_not_equal_comparison(self):
        """Test __ne__ comparison method"""
        self.assertNotEqual(self.mutable_s, "other thing")

    def test_less_than_comparison(self):
        """Test __lt__ comparison method"""
        self.assertTrue(self.mutable_s[:-1] < self.mutable_s)

    def test_less_than_comparison_of_incompatible_alphabets(self):
        """Compare a DNA and an RNA MutableSeq with <, recording warnings."""
        # no assertion here: the test only fails if the comparison raises
        with warnings.catch_warnings(record=True):
            self.mutable_s[:-1] < MutableSeq("UCAAAAGGAUGCAUCAUG",
                                             IUPAC.ambiguous_rna)

    def test_less_than_comparison_without_alphabet(self):
        """Check a truncated copy compares less than a plain string."""
        self.assertTrue(self.mutable_s[:-1] < "TCAAAAGGATGCATCATG")

    def test_less_than_or_equal_comparison(self):
        """Test __le__ comparison method"""
        self.assertTrue(self.mutable_s[:-1] <= self.mutable_s)

    def test_less_than_or_equal_comparison_of_incompatible_alphabets(self):
        """Compare a DNA and an RNA MutableSeq with <=, recording warnings."""
        # no assertion here: the test only fails if the comparison raises
        with warnings.catch_warnings(record=True):
            self.mutable_s[:-1] <= MutableSeq("UCAAAAGGAUGCAUCAUG",
                                              IUPAC.ambiguous_rna)

    def test_less_than_or_equal_comparison_without_alphabet(self):
        """Check a truncated copy compares <= a plain string."""
        self.assertTrue(self.mutable_s[:-1] <= "TCAAAAGGATGCATCATG")

    def test_add_method(self):
        """Test adding wrong type to MutableSeq"""
        with self.assertRaises(TypeError):
            self.mutable_s + 1234

    def test_radd_method(self):
        """Call __radd__ with a MutableSeq and check the concatenation."""
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG",
                         self.mutable_s.__radd__(self.mutable_s))

    def test_radd_method_incompatible_alphabets(self):
        """Check __radd__ raises TypeError for an RNA MutableSeq."""
        with self.assertRaises(TypeError):
            self.mutable_s.__radd__(
                MutableSeq("UCAAAAGGA", IUPAC.ambiguous_rna))

    def test_radd_method_using_seq_object(self):
        """Call __radd__ with an immutable Seq and check the concatenation."""
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG",
                         self.mutable_s.__radd__(self.s))

    def test_radd_method_wrong_type(self):
        """Check __radd__ raises TypeError for an int."""
        with self.assertRaises(TypeError):
            self.mutable_s.__radd__(1234)

    def test_as_string(self):
        """Check str() returns the underlying letters."""
        self.assertEqual("TCAAAAGGATGCATCATG", str(self.mutable_s))

    def test_length(self):
        """Check len() of the fixture."""
        self.assertEqual(18, len(self.mutable_s))

    def test_converting_to_immutable(self):
        """Check toseq() returns a Seq instance."""
        self.assertIsInstance(self.mutable_s.toseq(), Seq.Seq)

    def test_first_nucleotide(self):
        """Check integer indexing returns the first letter."""
        self.assertEqual('T', self.mutable_s[0])

    def test_setting_slices(self):
        """Check slice reads and slice assignment from str, MutableSeq, array.

        The three assignments run against the same fixture, so each expected
        value builds on the result of the previous step.
        """
        self.assertEqual(MutableSeq('CAAA', IUPAC.ambiguous_dna),
                         self.mutable_s[1:5], "Slice mutable seq")

        # two letters replaced by three: the sequence grows by one
        self.mutable_s[1:3] = "GAT"
        self.assertEqual(
            MutableSeq("TGATAAAGGATGCATCATG",
                       IUPAC.ambiguous_dna), self.mutable_s,
            "Set slice with string and adding extra nucleotide")

        # two letters replaced by a two-letter MutableSeq slice
        self.mutable_s[1:3] = self.mutable_s[5:7]
        self.assertEqual(
            MutableSeq("TAATAAAGGATGCATCATG", IUPAC.ambiguous_dna),
            self.mutable_s, "Set slice with MutableSeq")

        # two letters replaced by a three-letter array: grows by one again
        self.mutable_s[1:3] = array.array(array_indicator, "GAT")
        self.assertEqual(
            MutableSeq("TGATTAAAGGATGCATCATG", IUPAC.ambiguous_dna),
            self.mutable_s, "Set slice with array")

    def test_setting_item(self):
        """Check assigning a single letter by index."""
        self.mutable_s[3] = "G"
        self.assertEqual(MutableSeq("TCAGAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_deleting_slice(self):
        """Check deleting a one-letter slice."""
        del self.mutable_s[4:5]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_deleting_item(self):
        """Check deleting a single letter by index."""
        del self.mutable_s[3]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_appending(self):
        """Check append() adds one letter at the end."""
        self.mutable_s.append("C")
        self.assertEqual(
            MutableSeq("TCAAAAGGATGCATCATGC", IUPAC.ambiguous_dna),
            self.mutable_s)

    def test_inserting(self):
        """Check insert() places a letter at the given index."""
        self.mutable_s.insert(4, "G")
        self.assertEqual(
            MutableSeq("TCAAGAAGGATGCATCATG", IUPAC.ambiguous_dna),
            self.mutable_s)

    def test_popping_last_item(self):
        """Check pop() returns the final letter."""
        self.assertEqual("G", self.mutable_s.pop())

    def test_remove_items(self):
        """Check remove() drops the first match and rejects absent letters."""
        self.mutable_s.remove("G")
        self.assertEqual(MutableSeq("TCAAAAGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s, "Remove first G")

        self.assertRaises(ValueError, self.mutable_s.remove, 'Z')

    def test_count(self):
        """Check count() for a single letter and for a two-letter string."""
        self.assertEqual(7, self.mutable_s.count("A"))
        # the only run of A's is "AAAA", so 2 means non-overlapping matches
        self.assertEqual(2, self.mutable_s.count("AA"))

    def test_index(self):
        """Check index() finds the first match and raises when none exists."""
        self.assertEqual(2, self.mutable_s.index("A"))
        self.assertRaises(ValueError, self.mutable_s.index, "8888")

    def test_reverse(self):
        """Test using reverse method"""
        self.mutable_s.reverse()
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_reverse_with_stride(self):
        """Test reverse using -1 stride"""
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT", IUPAC.ambiguous_dna),
                         self.mutable_s[::-1])

    def test_complement(self):
        """Check complement() rewrites the DNA fixture in place."""
        self.mutable_s.complement()
        self.assertEqual(str("AGTTTTCCTACGTAGTAC"), str(self.mutable_s))

    def test_complement_rna(self):
        """Check complement() on mixed-case RNA keeps each letter's case."""
        seq = Seq.MutableSeq("AUGaaaCUG", IUPAC.unambiguous_rna)
        seq.complement()
        self.assertEqual(str("UACuuuGAC"), str(seq))

    def test_complement_mixed_aphabets(self):
        """Check complement() raises ValueError when both U and T occur."""
        seq = Seq.MutableSeq("AUGaaaCTG")
        with self.assertRaises(ValueError):
            seq.complement()

    def test_complement_rna_string(self):
        """Check complement() on RNA letters with no alphabet argument."""
        seq = Seq.MutableSeq("AUGaaaCUG")
        seq.complement()
        self.assertEqual('UACuuuGAC', str(seq))

    def test_complement_dna_string(self):
        """Check complement() on DNA letters with no alphabet argument."""
        seq = Seq.MutableSeq("ATGaaaCTG")
        seq.complement()
        self.assertEqual('TACtttGAC', str(seq))

    def test_reverse_complement(self):
        """Check reverse_complement() rewrites the DNA fixture in place."""
        self.mutable_s.reverse_complement()
        self.assertEqual("CATGATGCATCCTTTTGA", str(self.mutable_s))

    def test_reverse_complement_of_protein(self):
        """Check reverse_complement() raises ValueError for protein."""
        seq = Seq.MutableSeq("ACTGTCGTCT", Alphabet.generic_protein)
        with self.assertRaises(ValueError):
            seq.reverse_complement()

    def test_to_string_method(self):
        """Call tostring(), which is currently deprecated.

        This test will probably need to be removed soon.
        """
        # warnings are recorded rather than shown; nothing is asserted
        with warnings.catch_warnings(record=True):
            self.mutable_s.tostring()

    def test_extend_method(self):
        """Check extend() with a plain string."""
        self.mutable_s.extend("GAT")
        self.assertEqual(
            MutableSeq("TCAAAAGGATGCATCATGGAT", IUPAC.ambiguous_dna),
            self.mutable_s)

    def test_extend_with_mutable_seq(self):
        """Check extend() with another MutableSeq."""
        self.mutable_s.extend(MutableSeq("TTT", IUPAC.ambiguous_dna))
        self.assertEqual(
            MutableSeq("TCAAAAGGATGCATCATGTTT", IUPAC.ambiguous_dna),
            self.mutable_s)

    def test_delete_stride_slice(self):
        """Delete the slice 4:6 - 1 and check the remaining letters."""
        # NOTE(review): "6 - 1" is arithmetic, so this is the plain slice
        # 4:5 with no stride, despite the test name -- confirm the intent.
        del self.mutable_s[4:6 - 1]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_extract_third_nucleotide(self):
        """Test extracting every third nucleotide (slicing with stride 3)"""
        self.assertEqual(MutableSeq("TAGTAA", IUPAC.ambiguous_dna),
                         self.mutable_s[0::3])
        self.assertEqual(MutableSeq("CAGGTT", IUPAC.ambiguous_dna),
                         self.mutable_s[1::3])
        self.assertEqual(MutableSeq("AAACCG", IUPAC.ambiguous_dna),
                         self.mutable_s[2::3])

    def test_set_wobble_codon_to_n(self):
        """Test setting wobble codon to N (set slice with stride 3)"""
        # the replacement must be exactly as long as the strided slice
        self.mutable_s[2::3] = "N" * len(self.mutable_s[2::3])
        self.assertEqual(MutableSeq("TCNAANGGNTGNATNATN", IUPAC.ambiguous_dna),
                         self.mutable_s)
Exemplo n.º 11
0
class TestMutableSeq(unittest.TestCase):
    """Unit tests for MutableSeq built around one shared DNA fixture.

    setUp provides the same 18-letter DNA string as an immutable Seq
    (self.s) and as a MutableSeq (self.mutable_s).
    """

    def setUp(self):
        """Recreate the Seq and MutableSeq fixtures for every test."""
        self.s = Seq.Seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna)
        self.mutable_s = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna)

    def test_mutableseq_creation(self):
        """Test creating MutableSeqs in multiple ways"""
        # from a string
        mutable_s = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna)
        self.assertIsInstance(mutable_s, MutableSeq, "Creating MutableSeq")

        # from an existing Seq
        mutable_s = self.s.tomutable()
        self.assertIsInstance(mutable_s, MutableSeq, "Converting Seq to mutable")

        # from a character array
        array_seq = MutableSeq(array.array(array_indicator, "TCAAAAGGATGCATCATG"),
                               IUPAC.ambiguous_dna)
        self.assertIsInstance(array_seq, MutableSeq, "Creating MutableSeq using array")

    def test_repr(self):
        """Check repr() of the MutableSeq fixture."""
        self.assertEqual("MutableSeq('TCAAAAGGATGCATCATG', IUPACAmbiguousDNA())",
                         repr(self.mutable_s))

    def test_truncated_repr(self):
        """Check that repr() of a long MutableSeq is abbreviated with '...'."""
        seq = "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGA"
        expected = "MutableSeq('TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATG...GGA', IUPACAmbiguousDNA())"
        self.assertEqual(expected, repr(MutableSeq(seq, IUPAC.ambiguous_dna)))

    def test_equal_comparison(self):
        """Test __eq__ comparison method"""
        self.assertEqual(self.mutable_s, "TCAAAAGGATGCATCATG")

    def test_equal_comparison_of_incompatible_alphabets(self):
        """Evaluate == between DNA and RNA MutableSeqs, recording warnings."""
        # nothing is asserted; the test passes as long as == does not raise
        with warnings.catch_warnings(record=True):
            self.mutable_s == MutableSeq('UCAAAAGGA', IUPAC.ambiguous_rna)

    def test_not_equal_comparison(self):
        """Test __ne__ comparison method"""
        self.assertNotEqual(self.mutable_s, "other thing")

    def test_less_than_comparison(self):
        """Test __lt__ comparison method"""
        self.assertTrue(self.mutable_s[:-1] < self.mutable_s)

    def test_less_than_comparison_of_incompatible_alphabets(self):
        """Evaluate < between DNA and RNA MutableSeqs, recording warnings."""
        # nothing is asserted; the test passes as long as < does not raise
        with warnings.catch_warnings(record=True):
            self.mutable_s[:-1] < MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)

    def test_less_than_comparison_without_alphabet(self):
        """Check a truncated copy compares less than a plain string."""
        self.assertTrue(self.mutable_s[:-1] < "TCAAAAGGATGCATCATG")

    def test_less_than_or_equal_comparison(self):
        """Test __le__ comparison method"""
        self.assertTrue(self.mutable_s[:-1] <= self.mutable_s)

    def test_less_than_or_equal_comparison_of_incompatible_alphabets(self):
        """Evaluate <= between DNA and RNA MutableSeqs, recording warnings."""
        # nothing is asserted; the test passes as long as <= does not raise
        with warnings.catch_warnings(record=True):
            self.mutable_s[:-1] <= MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)

    def test_less_than_or_equal_comparison_without_alphabet(self):
        """Check a truncated copy compares <= a plain string."""
        self.assertTrue(self.mutable_s[:-1] <= "TCAAAAGGATGCATCATG")

    def test_add_method(self):
        """Test adding wrong type to MutableSeq"""
        with self.assertRaises(TypeError):
            self.mutable_s + 1234

    def test_radd_method(self):
        """Call __radd__ with a MutableSeq and check the concatenation."""
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG",
                         self.mutable_s.__radd__(self.mutable_s))

    def test_radd_method_incompatible_alphabets(self):
        """Check __radd__ raises TypeError for an RNA MutableSeq."""
        with self.assertRaises(TypeError):
            self.mutable_s.__radd__(MutableSeq("UCAAAAGGA", IUPAC.ambiguous_rna))

    def test_radd_method_using_seq_object(self):
        """Call __radd__ with an immutable Seq and check the concatenation."""
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG",
                         self.mutable_s.__radd__(self.s))

    def test_radd_method_wrong_type(self):
        """Check __radd__ raises TypeError for an int."""
        with self.assertRaises(TypeError):
            self.mutable_s.__radd__(1234)

    def test_as_string(self):
        """Check str() returns the underlying letters."""
        self.assertEqual("TCAAAAGGATGCATCATG", str(self.mutable_s))

    def test_length(self):
        """Check len() of the fixture."""
        self.assertEqual(18, len(self.mutable_s))

    def test_converting_to_immutable(self):
        """Check toseq() returns a Seq instance."""
        self.assertIsInstance(self.mutable_s.toseq(), Seq.Seq)

    def test_first_nucleotide(self):
        """Check integer indexing returns the first letter."""
        self.assertEqual('T', self.mutable_s[0])

    def test_setting_slices(self):
        """Check slice reads and slice assignment from str, MutableSeq, array.

        Each assignment modifies the fixture left by the previous one, so
        the statement order matters for the expected values.
        """
        self.assertEqual(MutableSeq('CAAA', IUPAC.ambiguous_dna),
                         self.mutable_s[1:5], "Slice mutable seq")

        # a three-letter string replaces two letters: length goes up by one
        self.mutable_s[1:3] = "GAT"
        self.assertEqual(MutableSeq("TGATAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s,
                         "Set slice with string and adding extra nucleotide")

        # a two-letter MutableSeq slice replaces two letters
        self.mutable_s[1:3] = self.mutable_s[5:7]
        self.assertEqual(MutableSeq("TAATAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s, "Set slice with MutableSeq")

        # a three-letter array replaces two letters: length goes up again
        self.mutable_s[1:3] = array.array(array_indicator, "GAT")
        self.assertEqual(MutableSeq("TGATTAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s, "Set slice with array")

    def test_setting_item(self):
        """Check assigning a single letter by index."""
        self.mutable_s[3] = "G"
        self.assertEqual(MutableSeq("TCAGAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_deleting_slice(self):
        """Check deleting a one-letter slice."""
        del self.mutable_s[4:5]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_deleting_item(self):
        """Check deleting a single letter by index."""
        del self.mutable_s[3]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_appending(self):
        """Check append() adds one letter at the end."""
        self.mutable_s.append("C")
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGC", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_inserting(self):
        """Check insert() places a letter at the given index."""
        self.mutable_s.insert(4, "G")
        self.assertEqual(MutableSeq("TCAAGAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_popping_last_item(self):
        """Check pop() returns the final letter."""
        self.assertEqual("G", self.mutable_s.pop())

    def test_remove_items(self):
        """Check remove() drops the first match and rejects absent letters."""
        self.mutable_s.remove("G")
        self.assertEqual(MutableSeq("TCAAAAGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s, "Remove first G")

        self.assertRaises(ValueError, self.mutable_s.remove, 'Z')

    def test_count(self):
        """Check count() for a single letter and for a two-letter string."""
        self.assertEqual(7, self.mutable_s.count("A"))
        # "AAAA" is the only run of A's, so 2 means non-overlapping matches
        self.assertEqual(2, self.mutable_s.count("AA"))

    def test_index(self):
        """Check index() finds the first match and raises when none exists."""
        self.assertEqual(2, self.mutable_s.index("A"))
        self.assertRaises(ValueError, self.mutable_s.index, "8888")

    def test_reverse(self):
        """Test using reverse method"""
        self.mutable_s.reverse()
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_reverse_with_stride(self):
        """Test reverse using -1 stride"""
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT", IUPAC.ambiguous_dna),
                         self.mutable_s[::-1])

    def test_complement(self):
        """Check complement() rewrites the DNA fixture in place."""
        self.mutable_s.complement()
        self.assertEqual(str("AGTTTTCCTACGTAGTAC"), str(self.mutable_s))

    def test_complement_rna(self):
        """Check complement() on mixed-case RNA keeps each letter's case."""
        seq = Seq.MutableSeq("AUGaaaCUG", IUPAC.unambiguous_rna)
        seq.complement()
        self.assertEqual(str("UACuuuGAC"), str(seq))

    def test_complement_mixed_aphabets(self):
        """Check complement() raises ValueError when both U and T occur."""
        seq = Seq.MutableSeq("AUGaaaCTG")
        with self.assertRaises(ValueError):
            seq.complement()

    def test_complement_rna_string(self):
        """Check complement() on RNA letters with no alphabet argument."""
        seq = Seq.MutableSeq("AUGaaaCUG")
        seq.complement()
        self.assertEqual('UACuuuGAC', str(seq))

    def test_complement_dna_string(self):
        """Check complement() on DNA letters with no alphabet argument."""
        seq = Seq.MutableSeq("ATGaaaCTG")
        seq.complement()
        self.assertEqual('TACtttGAC', str(seq))

    def test_reverse_complement(self):
        """Check reverse_complement() rewrites the DNA fixture in place."""
        self.mutable_s.reverse_complement()
        self.assertEqual("CATGATGCATCCTTTTGA", str(self.mutable_s))

    def test_reverse_complement_of_protein(self):
        """Check reverse_complement() raises ValueError for protein."""
        seq = Seq.MutableSeq("ACTGTCGTCT", Alphabet.generic_protein)
        with self.assertRaises(ValueError):
            seq.reverse_complement()

    def test_to_string_method(self):
        """Call tostring(), which is currently deprecated.

        This test will probably need to be removed soon.
        """
        # warnings are recorded rather than shown; nothing is asserted
        with warnings.catch_warnings(record=True):
            self.mutable_s.tostring()

    def test_extend_method(self):
        """Check extend() with a plain string."""
        self.mutable_s.extend("GAT")
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGGAT", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_extend_with_mutable_seq(self):
        """Check extend() with another MutableSeq."""
        self.mutable_s.extend(MutableSeq("TTT", IUPAC.ambiguous_dna))
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGTTT", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_delete_stride_slice(self):
        """Delete the slice 4:6 - 1 and check the remaining letters."""
        # NOTE(review): "6 - 1" is arithmetic, so this is the plain slice
        # 4:5 with no stride, despite the test name -- confirm the intent.
        del self.mutable_s[4:6 - 1]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_extract_third_nucleotide(self):
        """Test extracting every third nucleotide (slicing with stride 3)"""
        self.assertEqual(MutableSeq("TAGTAA", IUPAC.ambiguous_dna), self.mutable_s[0::3])
        self.assertEqual(MutableSeq("CAGGTT", IUPAC.ambiguous_dna), self.mutable_s[1::3])
        self.assertEqual(MutableSeq("AAACCG", IUPAC.ambiguous_dna), self.mutable_s[2::3])

    def test_set_wobble_codon_to_n(self):
        """Test setting wobble codon to N (set slice with stride 3)"""
        # the replacement must be exactly as long as the strided slice
        self.mutable_s[2::3] = "N" * len(self.mutable_s[2::3])
        self.assertEqual(MutableSeq("TCNAANGGNTGNATNATN", IUPAC.ambiguous_dna),
                         self.mutable_s)
Exemplo n.º 12
0
 def get_optimal_alignment(self):
     """Rebuild the optimal alignment by walking the traceback pointers.

     The walk starts at the bottom corner cell of the dynamic programming
     matrix and follows get_parent() links until a cell reports no parent.
     One aligned column is collected per step, so the columns come out in
     reverse order and are flipped before returning.

     Returns a pair (aligned seq1, aligned seq2), each produced by
     toseq(), with GAP_CHAR standing in where a sequence has no partner.
     """
     def new_gapped_seq():
         # every call gets its own character array and gapped alphabet
         return MutableSeq(array.array("c"),
                           Alphabet.Gapped(IUPAC.protein, GAP_CHAR))

     aligned_first = new_gapped_seq()
     aligned_second = new_gapped_seq()

     # the bottom corner cell always contributes both of its items
     cell = self.dpmatrix[(len(self.seq1), len(self.seq2))]
     aligned_first.append(cell.seq1item)
     aligned_second.append(cell.seq2item)

     # step onto the corner's parent before starting the main walk
     cell = cell.get_parent()
     parent = cell.get_parent()

     # keep emitting columns while the current cell still has a parent
     while parent:
         if parent.col_pos == cell.col_pos - 1:
             if parent.row_pos == cell.row_pos - 1:
                 # diagonal step: one item from each sequence
                 aligned_first.append(cell.seq1item)
                 aligned_second.append(cell.seq2item)
             elif parent.row_pos == cell.row_pos:
                 # same row, previous column: seq1 item against a gap
                 aligned_first.append(cell.seq1item)
                 aligned_second.append(GAP_CHAR)
         elif parent.col_pos == cell.col_pos:
             if parent.row_pos == cell.row_pos - 1:
                 # same column, previous row: gap against a seq2 item
                 aligned_first.append(GAP_CHAR)
                 aligned_second.append(cell.seq2item)

         # advance one link along the traceback chain
         cell, parent = parent, parent.get_parent()

     # columns were gathered end-to-start, so flip both sequences
     aligned_first.reverse()
     aligned_second.reverse()
     return aligned_first.toseq(), aligned_second.toseq()
Exemplo n.º 13
0
    def viterbi(self, sequence, state_alphabet):
        """Calculate the most probable state path using the Viterbi algorithm.

        This implements the Viterbi algorithm (see pgs 55-57 in Durbin et
        al for a full explanation -- this is where I took my implementation
        ideas from), to allow decoding of the state path, given a sequence
        of emissions. Transition and emission probabilities are converted
        to logarithms first, so scores along a path are combined by
        addition rather than multiplication.

        Arguments:

        o sequence -- A Seq object with the emission sequence that we
        want to decode.

        o state_alphabet -- The alphabet of the possible state sequences
        that can be generated. Its first letter is used as the begin
        state in the initialization and as the end state in the
        termination step.

        Returns a tuple of the decoded state path (a sequence using
        state_alphabet) and the log-space score of that path.
        """
        # calculate logarithms of the transition and emission probs
        log_trans = self._log_transform(self.transition_prob)
        log_emission = self._log_transform(self.emission_prob)

        viterbi_probs = {}
        pred_state_seq = {}
        state_letters = state_alphabet.letters
        seq_len = len(sequence)

        # --- initialization
        #
        # NOTE: My index numbers are one less than what is given in Durbin
        # et al, since we are indexing the sequence going from 0 to
        # (Length - 1) not 1 to Length, like in Durbin et al.
        #
        # NOTE(review): the seeds below are probability-space values (1 and
        # 0) although the recursion adds log probabilities; the log-space
        # equivalents would be 0 and -infinity. Left unchanged because it
        # alters which start states are reachable -- confirm before fixing.
        #
        # v_{0}(0) = 1
        viterbi_probs[(state_letters[0], -1)] = 1
        # v_{k}(0) = 0 for k > 0
        for state_letter in state_letters[1:]:
            viterbi_probs[(state_letter, -1)] = 0

        # --- recursion
        # loop over the training sequence (i = 1 .. L)
        for i in range(seq_len):
            # now loop over all of the letters in the state path
            for main_state in state_letters:
                # e_{l}(x_{i})
                emission_part = log_emission[(main_state, sequence[i])]

                # loop over all possible states
                # NOTE(review): the lookup key is (cur_state, main_state),
                # i.e. cur_state is treated as a predecessor of main_state;
                # confirm transitions_from() returns predecessor states.
                possible_state_probs = {}
                for cur_state in self.transitions_from(main_state):
                    # a_{kl}
                    trans_part = log_trans[(cur_state, main_state)]

                    # v_{k}(i - 1)
                    viterbi_part = viterbi_probs[(cur_state, i - 1)]
                    possible_state_probs[cur_state] = viterbi_part + trans_part

                # finally calculate the viterbi probability using the max
                max_prob = max(possible_state_probs.values())
                viterbi_probs[(main_state, i)] = emission_part + max_prob

                # now get the most likely state; the pointer for position i
                # is stored under key i - 1, matching the traceback below
                for state in possible_state_probs:
                    if possible_state_probs[state] == max_prob:
                        pred_state_seq[(i - 1, main_state)] = state
                        break

        # --- termination
        # calculate the probability of the state path
        # loop over all letters
        all_probs = {}
        for state in state_letters:
            # v_{k}(L)
            viterbi_part = viterbi_probs[(state, seq_len - 1)]
            # a_{k0}
            transition_part = log_trans[(state, state_letters[0])]

            # both terms are logarithms, so v_{k}(L) * a_{k0} becomes a sum
            # (multiplying two negative logs would rank the least likely
            # path highest)
            all_probs[state] = viterbi_part + transition_part

        state_path_prob = max(all_probs.values())

        # find the last pointer we need to trace back from
        # (no break: on ties the last matching state wins, as before)
        last_state = ''
        for state in all_probs:
            if all_probs[state] == state_path_prob:
                last_state = state

        assert last_state != '', "Didn't find the last state to trace from!"

        # --- traceback
        traceback_seq = MutableSeq('', state_alphabet)

        cur_state = last_state
        # reversed() works on a Python 2 list and on a Python 3 range, where
        # range objects have no reverse() method
        for i in reversed(range(seq_len)):
            traceback_seq.append(cur_state)

            cur_state = pred_state_seq[(i - 1, cur_state)]

        # put the traceback sequence in the proper orientation
        traceback_seq.reverse()

        return traceback_seq.toseq(), state_path_prob