Exemplo n.º 1
0
def create_genome_seq(aligned):
    """Rebuild the reference bases underneath an aligned read.

    The read's own bases are used as the starting point; every mismatch
    reported by the read's MD tag is then overwritten with the base the tag
    gives for that position.

    Arguments:

    o aligned -- an aligned read exposing ``seq`` (str or bytes),
    ``opt("MD")`` and ``is_reverse``.

    Returns the resulting MutableSeq.

    Assumes MD_REGEX yields (matched-base-count, mismatch-base) pairs --
    it is defined outside this block, TODO confirm. Deletions ('^...' in the
    MD tag) are not supported and trip an assertion.
    """
    read_bases = aligned.seq
    if type(read_bases) != str:
        read_bases = read_bases.decode('UTF-8')

    reference = MutableSeq(read_bases)

    # see samtools documentation for the MD string format
    cursor = 0
    for run_length, ref_base in re.findall(MD_REGEX, aligned.opt("MD")):
        # skip over the bases that matched the reference
        cursor += int(run_length)

        assert '^' not in ref_base
        # a mismatch entry must differ from what the read shows there
        assert ref_base != reference[cursor]

        reference[cursor] = ref_base
        cursor += 1

    if aligned.is_reverse:
        # NOTE(review): the return value is discarded, so this relies on
        # reverse_complement() working in place -- confirm for the
        # installed Biopython version.
        reference.reverse_complement()

    return reference
Exemplo n.º 2
0
def Gthg01471():
    """Print the protein translation of a coding sequence before and after
    six point substitutions.

    For each substituted site (1-based positions) the current base is printed
    next to a companion letter (presumably the base expected at that site --
    verify), then the site is overwritten. Finally the translations of the
    untouched and the edited sequence are printed.
    """
    ori=Seq("ATGAGCATAAGTTTATCGGTTCCAAAATGGTTATTAACAGTTTTATCAATTTTATCTTTAGTCGTAGCATTTATTTTCGGTACCGTTTCCAATGCATCAGCAACAATTAACTATGGGGAGGAAGTCGCGGCAGTAGCAAATGACTATGTAGGAAGCCCATATAAATATGGAGGTACAACGCCAAAAGGATTTGATGCGAGTGGCTTTACTCAGTATGTGTATAAAAATGCTGCAACCAAATTGGCTATTCCGCGAACGAGTGCCGCACAGTATAAAGTCGGTAAATTTGTTAAACAAAGTGCGTTACAAAGAGGCGATTTAGTGTTTTATGCAACAGGAGCAAAAGGAAAGGTATCCTTTGTGGGAATTTATAATGGAAATGGTACGTTTATTGGTGCCACATCAAAAGGGGTAAAAGTGGTTAAAATGAGTGATAAATATTGGAAAGACCGGTATATAGGGGCTAAGCGAGTCATTAAGTAA", IUPAC.unambiguous_dna)
    mut=MutableSeq("ATGAGCATAAGTTTATCGGTTCCAAAATGGTTATTAACAGTTTTATCAATTTTATCTTTAGTCGTAGCATTTATTTTCGGTACCGTTTCCAATGCATCAGCAACAATTAACTATGGGGAGGAAGTCGCGGCAGTAGCAAATGACTATGTAGGAAGCCCATATAAATATGGAGGTACAACGCCAAAAGGATTTGATGCGAGTGGCTTTACTCAGTATGTGTATAAAAATGCTGCAACCAAATTGGCTATTCCGCGAACGAGTGCCGCACAGTATAAAGTCGGTAAATTTGTTAAACAAAGTGCGTTACAAAGAGGCGATTTAGTGTTTTATGCAACAGGAGCAAAAGGAAAGGTATCCTTTGTGGGAATTTATAATGGAAATGGTACGTTTATTGGTGCCACATCAAAAGGGGTAAAAGTGGTTAAAATGAGTGATAAATATTGGAAAGACCGGTATATAGGGGCTAAGCGAGTCATTAAGTAA", IUPAC.unambiguous_dna)

    companion_letters = "AGTCGA"
    replacement_bases = "GACTAG"
    sites = (259, 277, 282, 295, 299, 306)
    for site, companion, new_base in zip(sites, companion_letters,
                                         replacement_bases):
        index = site - 1  # sites are 1-based
        print(mut[index] + companion)
        mut[index] = new_base

    print(ori.translate())
    print(mut.toseq().translate())
Exemplo n.º 3
0
def generate_rolls(num_rolls):
    """Simulate ``num_rolls`` dice rolls from the two-state casino model.

    The simulation starts in the fair state ('F'). After every roll it may
    switch: from 'F' to 'L' when a random draw is <= .05, and from 'L' back
    to 'F' when the draw is <= .1.

    Returns:

    - The generated roll sequence
    - The state sequence that produced each roll.

    """
    state = 'F'
    rolls = MutableSeq('', DiceRollAlphabet())
    states = MutableSeq('', DiceTypeAlphabet())

    for _ in range(num_rolls):
        # record the state this roll is made in, then make the roll
        states.append(state)
        rolls.append(_loaded_dice_roll(random.random(), state))

        # a second, independent draw decides whether the state flips
        switch_draw = random.random()
        if state == 'F' and switch_draw <= .05:
            state = 'L'
        elif state == 'L' and switch_draw <= .1:
            state = 'F'

    return rolls.toseq(), states.toseq()
Exemplo n.º 4
0
def random_population(genome_alphabet, genome_size, num_organisms,
                      fitness_calculator):
    """Build a list of organisms whose genomes are drawn at random.

    Arguments:

    o genome_alphabet -- An Alphabet object; its ``letters`` are the pool
    the genome positions are drawn from.

    o genome_size -- The number of letters in each organism's genome.

    o num_organisms -- The number of organisms to create.

    o fitness_calculator -- Passed to each Organism together with its
    genome.

    Raises ValueError when the first alphabet letter is neither a str, an
    int nor a float.
    """
    # a dedicated random number generator for picking genome letters
    rng = random.Random()

    # the first letter decides the typecode of the backing array
    first_letter = genome_alphabet.letters[0]
    if isinstance(first_letter, str):
        # unicode array on Python 3, byte-character array on Python 2
        typecode = "u" if sys.version_info[0] == 3 else "c"
    elif isinstance(first_letter, int):
        typecode = "i"
    elif isinstance(first_letter, float):
        typecode = "d"
    else:
        raise ValueError(
            "Alphabet type is unsupported: %s" % genome_alphabet.letters)

    population = []
    for _org in range(num_organisms):
        genome = MutableSeq(array.array(typecode), genome_alphabet)
        for _gene in range(genome_size):
            genome.append(rng.choice(genome_alphabet.letters))
        population.append(Organism(genome, fitness_calculator))

    return population
Exemplo n.º 5
0
def add_to_pileup_dict(sams, aligned_read_set, pileup_dict):
    """Tally one read's bases and qualities against its genomic locations.

    ``aligned_read_set`` holds alignments of the same read (one per entry of
    ``sams``). For every read position the function builds a tuple of
    ``(chromosome, position, reference base)`` triples -- one triple per
    alignment -- and increments
    ``pileup_dict[that_tuple][read_base][quality]``.

    Nothing is recorded (the function returns early) when any alignment is
    unmapped or when any alignment's CIGAR contains an operation code
    between 1 and 6 (insertions, deletions, skips, clipping, padding per the
    SAM specification).

    Arguments:

    o sams -- alignment files, parallel to ``aligned_read_set``; only
    ``getrname`` is used.

    o aligned_read_set -- the alignments of one read.

    o pileup_dict -- nested counter updated in place; presumably nested
    defaultdicts, since missing keys are never initialised here -- verify
    against the caller.
    """
    # sanity check that all the qnames (RNA read IDs) are the same
    for read in aligned_read_set:
        assert read.qname == aligned_read_set[0].qname

    # only sample reads for which every alignment mapped
    if any(read.is_unmapped for read in aligned_read_set):
        return

    for read in aligned_read_set:
        for op, _op_len in read.cigar:
            if 0 < op < 7:
                # do not sample reads where there are insertions or deletions
                return
        assert len(read.seq) == len(aligned_read_set[0].seq)

    first = aligned_read_set[0]
    pos_dicts = [dict(read.aligned_pairs) for read in aligned_read_set]
    genome_seqs = [create_genome_seq(read) for read in aligned_read_set]

    # Read bases and qualities are taken from the first alignment; if it is
    # reversed, flip both back so they are in the read's own orientation.
    qual = bytearray(first.qual, 'utf-8')
    seq = MutableSeq(
        first.seq if isinstance(first.seq, str) else first.seq.decode('UTF-8'))
    if first.is_reverse:
        seq.reverse_complement()
        qual = qual[::-1]

    for genome_seq in genome_seqs:
        assert len(genome_seq) == len(seq)

    read_len = len(seq)
    # chromosome names do not depend on the read position: compute them once.
    # Built-in zip works on Python 2 and 3 (itertools.izip is Python 2 only).
    chroms = [sam.getrname(a.tid) for sam, a in zip(sams, aligned_read_set)]

    for i in range(read_len):
        # reversed alignments index their position map from the other end
        positions = [d[read_len - i - 1] if a.is_reverse else d[i]
                     for d, a in zip(pos_dicts, aligned_read_set)]
        genome_seq_i = [g[i] for g in genome_seqs]

        # one (chrom, pos, reference base) triple per alignment
        genomic_locs = tuple(zip(chroms, positions, genome_seq_i))

        pileup_dict[genomic_locs][seq[i]][qual[i]] += 1
Exemplo n.º 6
0
    def __init__(self, sequence, hmmLength, origSeqLength, evalue, seqStart=None, seqEnd=None, hmmStart=None, hmmEnd=None):
        """Store the hit bookkeeping and initialise the mutable sequence.

        Parameters:
        sequence - handed unchanged to MutableSeq.__init__.
        hmmLength - length of the HMM; converted with int(). A zero-filled
                    ``gaps`` list of this length is created.
        origSeqLength - stored as given.
        evalue - stored as given.
        seqStart, seqEnd - optional, stored as given (presumably the hit's
                           coordinates on the sequence - confirm).
        hmmStart, hmmEnd - optional, stored as given (presumably the hit's
                           coordinates on the HMM - confirm).

        Side effect: increments the HMMPileUp.total_seqs counter.
        """
        hmm_columns = int(hmmLength)
        self.hmmLength = hmm_columns
        # one gap counter per HMM column, all starting at zero
        self.gaps = [0] * hmm_columns
        self.origSeqLength, self.evalue = origSeqLength, evalue
        self.seqStart, self.seqEnd = seqStart, seqEnd
        self.hmmStart, self.hmmEnd = hmmStart, hmmEnd
        HMMPileUp.total_seqs += 1
        MutableSeq.__init__(self, sequence)
Exemplo n.º 7
0
def seq_batch_query():
    """Export the sequences of every organism named in a list file.

    Prompts for the name of a text file with one organism name per line,
    looks the names up in the ``main`` table of ``./data/DB`` (through a
    scratch ``tasklist`` table that is dropped afterwards) and appends each
    hit to ``./out/<Name>.fasta``. Rows whose Strand column equals '-1' are
    written reverse-complemented. Finally the records of
    ``./out/rps12.fasta`` shorter than 4000 bases are written to
    ``./out/rps12short.fasta``.
    """
    list_file = input('list file name:\n')
    with open(list_file, 'r') as In:
        organism_list = In.read().split(sep='\n')

    con = sqlite3.connect('./data/DB')
    try:
        cur = con.cursor()
        cur.execute('CREATE TABLE IF NOT EXISTS tasklist (Name TEXT);')
        cur.executemany('INSERT INTO tasklist (Name) VALUES (?);',
                        ((organism,) for organism in organism_list))
        # The statement has no placeholders, so no bindings are passed.
        # (Passing the last organism name as bindings raised
        # "Incorrect number of bindings" whenever that name was non-empty.)
        cur.execute(
            'SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main WHERE Organism IN (SELECT Name FROM tasklist) ORDER BY Head')
        result = cur.fetchall()
        cur.execute('DROP TABLE tasklist;')
        cur.close()
    finally:
        # release the database even if a statement above fails
        con.close()

    query_result = []
    for row in result:
        title = '|'.join([str(row[0]), row[1], row[2], row[3]])
        filename = row[2]
        sequence = MutableSeq(row[5])
        if row[4] == '-1':
            # Depending on the Biopython version, reverse_complement() either
            # works in place (returning None) or returns the new sequence;
            # handle both so the flipped sequence is what gets written.
            flipped = sequence.reverse_complement()
            if flipped is not None:
                sequence = flipped
        query_result.append([title, filename, sequence])

    for title, filename, sequence in query_result:
        with open(''.join(['./out/', filename, '.fasta']), 'a') as Fileout:
            Fileout.write('>%s\n%s\n' % (title, sequence))

    # rps12 may contain very long fragments; keep only the short records
    rps12short = [item for item in SeqIO.parse('./out/rps12.fasta', 'fasta')
                  if len(item.seq) < 4000]
    SeqIO.write(rps12short, './out/rps12short.fasta', 'fasta')
    print('Done.\n')
Exemplo n.º 8
0
    def viterbi(self, sequence, state_alphabet):
        """Decode the most probable state path with the Viterbi algorithm.

        The implementation follows Durbin et al (pgs 55-57); position
        indices here run from 0 to (Length - 1) rather than 1 to Length as
        in the book. All probabilities are handled after being passed
        through ``self._log_transform``.

        Arguments:

        o sequence -- A Seq object with the emission sequence that we
        want to decode.

        o state_alphabet -- The alphabet of the possible state sequences
        that can be generated.

        Returns a tuple: the decoded state path (via ``toseq()``) and the
        score of that path.
        """
        log_initial = self._log_transform(self.initial_prob)
        log_trans = self._log_transform(self.transition_prob)
        log_emission = self._log_transform(self.emission_prob)

        scores = {}         # (state, position) -> best score ending there
        back_pointers = {}  # (position - 1, state) -> best previous state
        state_letters = state_alphabet.letters
        seq_len = len(sequence)

        # --- recursion over positions 0 .. L-1
        for pos in range(seq_len):
            for cur_state in state_letters:
                # e_{l}(x_{i})
                emission_part = log_emission[(cur_state, sequence[pos])]

                if pos == 0:
                    # the first position has no predecessor: use the
                    # initial probability instead
                    best_prev_score = log_initial[cur_state]
                else:
                    candidates = {}
                    for prev_state in self.transitions_to(cur_state):
                        # a_{kl}
                        trans_part = log_trans[(prev_state, cur_state)]
                        # v_{k}(i - 1)
                        viterbi_part = scores[(prev_state, pos - 1)]
                        candidates[prev_state] = viterbi_part + trans_part

                    best_prev_score = max(candidates.values())

                    # remember the first previous state reaching that score
                    for prev_state, score in candidates.items():
                        if score == best_prev_score:
                            back_pointers[(pos - 1, cur_state)] = prev_state
                            break

                # v_{k}(i)
                scores[(cur_state, pos)] = (emission_part + best_prev_score)

        # --- termination: best score at the final position
        final_scores = {state: scores[(state, seq_len - 1)]
                        for state in state_letters}
        state_path_prob = max(final_scores.values())

        # the last state reaching the best score is where traceback starts
        last_state = ''
        for state, score in final_scores.items():
            if score == state_path_prob:
                last_state = state

        assert last_state != '', "Didn't find the last state to trace from!"

        # --- traceback: walk the back pointers from the end to the start
        traceback_seq = MutableSeq('', state_alphabet)
        state = last_state
        traceback_seq.append(state)
        for pos in range(seq_len - 1, 0, -1):
            state = back_pointers[(pos - 1, state)]
            traceback_seq.append(state)

        # the path was collected end-to-start; flip it
        traceback_seq.reverse()

        return traceback_seq.toseq(), state_path_prob
Exemplo n.º 9
0
 def setUp(self):
     """Create an immutable and a mutable sequence over the same bases."""
     bases = "TCAAAAGGATGCATCATG"
     self.s = Seq.Seq(bases, IUPAC.unambiguous_dna)
     self.mutable_s = MutableSeq(bases, IUPAC.ambiguous_dna)
Exemplo n.º 10
0
def genome_generator():
    """Return a new "1234" test genome over a fresh TestAlphabet."""
    alphabet = TestAlphabet()
    return MutableSeq("1234", alphabet)
Exemplo n.º 11
0
def count_one_fraction(alignment, refname, debug, start_offset, end_trail):
    """
    Count every mutation found in the read/reference pairs of an alignment.

    No filtering for expected/allowed mutations happens here; everything is
    counted and can be filtered later.

    For each (reference, read) pair of the multiple alignment:
        - replace 'N' in the read with '.' and trim the pair with trim_read
        - call the DNA differences, the HGVS-style DNA description and the
          protein differences
        - add them to the count table

    Returns a dict keyed by the protein differences; each value is
    {'dna': counts per DNA difference,
     'dna_hgvs': counts per HGVS string,
     'total': number of reads with this protein difference}.

    If calling the mutations fails, a coloured diff of the offending read is
    printed and the exception is re-raised.
    """
    one_lane_counts = {}

    # Records arrive two at a time: reference first, read second. Both are
    # made mutable so helpers may edit them (e.g. for gap shifts).
    pairs = AlignIO.parse(alignment,
                          "fasta",
                          alphabet=IUPAC.ambiguous_dna,
                          seq_count=2)
    for pair in pairs:
        ref = pair[0].seq.tomutable()
        read = pair[1].seq.tomutable()
        read = MutableSeq(str(read).replace('N', '.'), read.alphabet)
        readname = pair[1].id

        # trim sequencing read to reference
        ref, read = trim_read(ref, read)

        dna_errors, dna_hgvs, prot_errors = None, None, None

        try:
            # a tuple of DNA differences
            dna_errors = find_DNA_diff(read, ref, debug, start_offset,
                                       end_trail)
            # string according to HGVS format (ish)
            dna_hgvs = find_DNA_hgvs(read, ref, refname, debug,
                                     start_offset, end_trail)
            prot_errors = find_protein_diff(read, ref, debug, start_offset,
                                            end_trail)
        except:
            if not dna_errors:
                print(dna_errors)
            print_coloured_diff(readname, read, ref, debug)
            raise

        # create the entry the first time this protein difference is seen
        if prot_errors not in one_lane_counts:
            one_lane_counts[prot_errors] = {
                'dna': defaultdict(int),
                'dna_hgvs': defaultdict(int),
                'total': 0
            }
        entry = one_lane_counts[prot_errors]
        entry['total'] += 1
        entry['dna'][dna_errors] += 1
        entry['dna_hgvs'][dna_hgvs] += 1

    # how many protein differences were seen more often than the threshold
    threshold = 10
    n = sum(1 for counts in one_lane_counts.values()
            if counts['total'] > threshold)

    print(
        'Fount {0} total protein mutations, of which {1} have more than {2} counts'
        .format(len(one_lane_counts), n, threshold))

    return one_lane_counts
Exemplo n.º 12
0
	def posmu(self):
		"""Apply the configured substitutions, deletions and insertions to the
		reference sequence and store the result on this object.

		The change at each reference position is taken from self.inpos
		(insertion after the base), self.delpos (deletion) and self.mupos
		(substitution). When a position appears in more than one list, only one
		change is applied, with priority substitution > deletion > insertion.

		Results are stored on the instance (nothing is returned):
		self.seq             -- the mutated sequence as a MutableSeq
		self.occuredmu       -- sorted sites where a substitution happened
		self.alt_allele      -- the substituted base for each of those sites
		self.occuredins      -- sorted sites after which a base was inserted
		self.inserted_allele -- reference base + inserted base for each of them
		self.occureddel      -- sorted sites that were deleted
		"""
		change = [None] * len(self.ref)
		self.occuredmu = []
		self.occureddel = []
		self.occuredins = []
		self.inserted_allele = []
		self.alt_allele = []

		# Later assignments overwrite earlier ones, which is what gives
		# substitutions priority over deletions, and deletions over insertions.
		for site in self.inpos:
			change[site] = 'ins'
		for site in self.delpos:
			change[site] = 'del'
		for site in self.mupos:
			change[site] = 'sub'

		pieces = []
		for site, error in enumerate(change):
			base = self.ref[site]
			if error is None:
				pieces.append(base)
			elif error == 'sub':
				# substitute the letter with one from the mutation alphabet
				alt = random.choice(self.mualphabet.get(base))
				pieces.append(alt)
				self.occuredmu.append(site)
				self.alt_allele.append(alt)
			elif error == 'ins':
				# keep the letter and insert a random one right after it
				inserted = random.choice(list(self.alphabet))
				pieces.append(base)
				pieces.append(inserted)
				self.occuredins.append(site)
				self.inserted_allele.append(base + inserted)
			else:
				# delete the letter by simply not adding it
				self.occureddel.append(site)
		self.seq = MutableSeq(''.join(pieces), self.alphaproperty)

		# Keep sites and their alleles paired while sorting by site.
		# sorted() is used because zip() returns an iterator without a
		# .sort() method on Python 3.
		if self.occuredins:
			ins_pairs = sorted(zip(self.occuredins, self.inserted_allele),
					   key=lambda tup: tup[0])
			ins_sites, ins_alleles = zip(*ins_pairs)
			self.occuredins = list(ins_sites)
			self.inserted_allele = list(ins_alleles)
		if self.occuredmu:
			mu_pairs = sorted(zip(self.occuredmu, self.alt_allele),
					  key=lambda tup: tup[0])
			mu_sites, mu_alleles = zip(*mu_pairs)
			self.occuredmu = list(mu_sites)
			self.alt_allele = list(mu_alleles)
		self.occureddel.sort()

		if self.verbose:
			print("WARNING: if there are overlaps betweeen deletion, insertion and mutation positions, \n "
			      "just one of the changes takes place with the following priority: \n "
			      "1)Mutation  2)Deletion 3)Insertion. \n")
			print("Changes made to the haplotype!")
Exemplo n.º 13
0
# Emission scores for states 'O' and 'P' (markovBuilder is created before the
# visible part of this script).
markovBuilder.set_emission_score('O', 'C', .33)
markovBuilder.set_emission_score('O', 'G', .33)
markovBuilder.set_emission_score('O', 'S', .33)
markovBuilder.set_emission_score('P', 'A', .67)
markovBuilder.set_emission_score('P', 'T', .33)

# Initialize the Hidden Markov Model
markovModel = markovBuilder.get_markov_model()

# The 3 sequences to be aligned
seq1 = Seq('ATGA', arrayDNA())
seq2 = Seq('A CCA', arrayDNA())
seq3 = Seq('ACAST', arrayDNA())

# State path for each sequence (same length as the matching sequence above)
seq1State = MutableSeq('MNOP', arrayState())
seq2State = MutableSeq('MDIOP', arrayState())
seq3State = MutableSeq('MNIOP', arrayState())

seq = [seq1, seq2, seq3]
states = [seq1State, seq2State, seq3State]

# Train the Hidden Markov Model with the sequences above
# NOTE(review): train() is called once per sequence and trainedhmm is rebound
# each time, so only the result of the last call is kept. If the model should
# be estimated from all three sequences together, they probably need to be
# passed in a single train() call -- confirm against the Trainer API.
trainer = Trainer.KnownStateTrainer(markovModel)
for i in range(len(seq)):
    trainingseq = Trainer.TrainingSequence(seq[i], states[i])
    trainedhmm = trainer.train([trainingseq])

# Another example query
testSeq = Seq('ATSA', arrayDNA())
testState = MutableSeq('MNOP', arrayState())
Exemplo n.º 14
0

# Direct translation (DNA -> Protein)
# The translate() results below are not stored or printed; they are only
# visible when the lines are run in an interactive session.
from Bio.Seq import Seq, translate
from Bio.Alphabet import IUPAC
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
translate(coding_dna)

# we can specify other translation tables by name
translate(coding_dna, table="Vertebrate Mitochondrial")
# or by NCBI number
translate(coding_dna, table=2)

# 3.9 Transcription and Translation

# 3.10 Mutable Seqs
# convert an existing sequence to a mutable one
# NOTE(review): my_seq is not defined in the visible part of this script;
# it presumably comes from an earlier section -- confirm.
mutable_seq = my_seq.tomutable()  
# or directly create a mutable one
from Bio.Seq import MutableSeq
from Bio.Alphabet import IUPAC
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)

# now we can assign to a single position
mutable_seq[5] = "T"

# and convert it back to an immutable seq
new_seq = mutable_seq.toseq()


Exemplo n.º 15
0
def determine_synonymous(nuc_muts_on_branch, parent_diffs_from_ref,
                         reference_gene_locations, reference_gene_codon,
                         reference_sequence_nt, reference_sequence_aa):
    """
    Check every nucleotide mutation that occurred on a branch to determine whether or not it is synonymous.

    For each mutation, all differences from the reference that the parents
    carry within the same codon are applied to the reference codon to give
    the codon prior to this node. The mutation is then applied to that codon
    and the translation is compared with the reference amino acid.

    Arguments:
    nuc_muts_on_branch -- mutation strings such as 'A123T' (1-based position);
        entries starting or ending with '-' are skipped.
    parent_diffs_from_ref -- {1-based position: base} carried by the parents.
    reference_gene_locations -- {0-based position: gene name}.
    reference_gene_codon -- {0-based position: (codon number, position
        within the codon)}.
    reference_sequence_nt -- {gene name: nucleotide sequence}.
    reference_sequence_aa -- {gene name: amino acid sequence}.

    Returns a dictionary of synonymous mutations where the key is a gene and the value is a list of synonymous mutations in this gene.
    Mutations outside every gene are collected under the key 'noncoding'.
    """
    parent_diffs_pos = [int(k) for k in parent_diffs_from_ref]

    # synonymous (and noncoding) mutations to add to the tree
    syn_muts = {}

    for mut in nuc_muts_on_branch:
        # don't care about deletions because they are obviously not synonymous
        if mut[-1] == '-' or mut[0] == '-':
            continue

        genome_index = int(mut[1:-1]) - 1  # 0-based position of the mutation

        if genome_index not in reference_gene_locations:
            syn_muts.setdefault('noncoding', []).append(mut)
            continue

        # find the gene and codon this mutation falls in
        mut_gene = reference_gene_locations[genome_index]
        mut_codon_num = reference_gene_codon[genome_index][0]
        mut_codon_pos = reference_gene_codon[genome_index][1]

        # reference amino acid and reference nucleotides of that codon
        codon_ref_aa = reference_sequence_aa[mut_gene][mut_codon_num]
        codon_ref_nt = reference_sequence_nt[mut_gene][(
            mut_codon_num * 3):(mut_codon_num * 3 + 3)]

        # 0-based genome position of the codon's first base
        codon_genome_start = genome_index - mut_codon_pos

        # Apply EVERY parent difference that falls inside this codon. The
        # codon is built up cumulatively so that two parent differences in
        # the same codon are both kept (restarting from the reference for
        # each difference would retain only the last one).
        codon_mutated = MutableSeq(str(codon_ref_nt))
        for parent_diff in parent_diffs_pos:
            offset = parent_diff - 1 - codon_genome_start
            if 0 <= offset < 3:
                codon_mutated[offset] = parent_diffs_from_ref[parent_diff]

        # if a deletion (or seq error) has happened at a neighboring
        # nucleotide the codon cannot be translated: skip it
        if '-' in codon_mutated:
            continue

        codon_mutated[mut_codon_pos] = mut[-1]
        codon_mutated_translation = Seq(codon_mutated).translate()

        if str(codon_ref_aa) == str(codon_mutated_translation):
            syn_muts.setdefault(mut_gene, []).append(mut)

    return syn_muts
Exemplo n.º 16
0
 def setUp(self):
     """Create the organism under test with the genome "1111"."""
     self.organism = Organism(MutableSeq("1111", TestAlphabet()),
                              test_fitness)
Exemplo n.º 17
0
def MutableSeqFromFile(filename, alphabet):
    """Read a whole file into a lower-cased MutableSeq.

    Leading and trailing whitespace of the file content is stripped before
    the text is lower-cased. Line breaks inside the content are kept as-is.

    Arguments:
    filename -- path of the text file holding the sequence.
    alphabet -- alphabet passed to MutableSeq.
    """
    # the context manager closes the file even if reading fails
    with open(filename) as handle:
        sequence_str = handle.read().strip()
    return MutableSeq(sequence_str.lower(), alphabet)
Exemplo n.º 18
0
    def setUp(self):
        """Create a one-letter organism ("2") and the mutator under test."""
        alphabet = TestAlphabet()
        self.alphabet = alphabet
        self.org = Organism(MutableSeq("2", alphabet), test_fitness)

        self.test_mutator = TestMutator()
Exemplo n.º 19
0
# Translate up to the first stop codon.
# (coding_dna is defined before the visible part of this script.)
print(coding_dna.translate(to_stop=True))

# Look up codon tables by name and inspect the mitochondrial one
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
print(mito_table)
print(mito_table.stop_codons)
print(mito_table.start_codons)
print(mito_table.forward_table["ACG"])

# NOTE(review): my_seq is defined outside the visible part of this script. If
# it is an immutable Seq, this item assignment raises -- presumably shown on
# purpose to motivate MutableSeq below; confirm.
my_seq[1] = "N"

# Get a mutable copy of an existing sequence
mutable_seq = my_seq.tomutable()
# or
from Bio.Seq import MutableSeq
mutable_seq = MutableSeq('GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA')

# In-place edits: overwrite one position, delete one position, remove a letter
mutable_seq[5] = "A"
print(mutable_seq)
del mutable_seq[4]
mutable_seq.remove('A')
print(mutable_seq)
# Convert back to an immutable sequence
new_seq = mutable_seq.toseq()
print(new_seq)

# Sequences of known length but unknown content
from Bio.Seq import UnknownSeq
unk = UnknownSeq(10)
print(unk)
# same, but with an explicit placeholder character
unk = UnknownSeq(10, character="A")
print(unk)
unk_protein = unk.translate()
Exemplo n.º 20
0
def search_mutated_feature(vcf_record, gbk_dico):
    '''
    - Search if mutation is located within a coding sequence
    - determine if mutation is synonymous or not using a MutableSeq record (copy of the original record with mutation)

    Arguments:
    - vcf_record: variant with CHROM, POS (1-based) and ALT attributes
    - gbk_dico: {chromosome name: annotated sequence record}

    Returns a dict with the keys:
    - "mut_location": type of the first non-"source" feature containing the
      position, or "Intergenic"
    - "mut_type": '-' (not in a CDS), 'INDEL', 'F' (ALT is '*'),
      'S' (synonymous) or the value returned by extract_mutation
    - "orf_name": locus tag / mobile element type of that feature, or '-'
    - "gene": gene name of that feature, or '-'
    '''
    from copy import copy
    from Bio.Seq import MutableSeq
    from Bio.Alphabet import generic_dna

    # VCF positions are 1-based, sequence and feature coordinates are 0-based
    pos_index = int(vcf_record.POS) - 1

    # work on a copy whose sequence can be edited without touching gbk_dico
    record_alt = copy(gbk_dico[vcf_record.CHROM])
    record_alt.seq = MutableSeq(str(record_alt.seq), generic_dna)

    results = {
        "mut_location": "Intergenic",
        "mut_type": '-',
        "orf_name": '-',
        "gene": '-'
    }

    for feature in record_alt.features:
        # Feature membership is tested with the 0-based index; testing the
        # 1-based POS would shift every feature boundary by one base.
        if pos_index in feature and feature.type != "source":
            results["mut_location"] = feature.type
            if feature.type == 'mobile_element':
                results["orf_name"] = feature.qualifiers[
                    "mobile_element_type"][0]
            elif feature.type == 'CDS':
                results["orf_name"] = feature.qualifiers["locus_tag"][0]
            else:
                results[
                    "orf_name"] = "Unknown locus for feature: %s" % feature.type
            try:
                results["gene"] = feature.qualifiers["gene"][0]
            except KeyError:
                results["gene"] = '-'
            if feature.type == 'CDS':

                # NOTE(review): only the ALT length is inspected, so a
                # deletion with a one-base ALT is treated like a substitution
                # -- confirm this is intended.
                if len(vcf_record.ALT[0]) > 1:
                    results["mut_type"] = 'INDEL'
                    # keep scanning the remaining features
                    continue
                else:
                    # translation before the mutation is applied
                    aa_seq_ref = str(
                        feature.extract(record_alt.seq).translate())
                    # mutate reference sequence
                    if vcf_record.ALT[0] == '*':
                        # frameshift
                        results["mut_type"] = 'F'
                    else:
                        record_alt.seq[pos_index] = str(vcf_record.ALT[0])

                        # check if synonymous or not
                        aa_seq_alt = str(
                            feature.extract(record_alt.seq).translate())
                        if str(aa_seq_ref) == str(aa_seq_alt):
                            results["mut_type"] = 'S'
                        else:
                            results["mut_type"] = extract_mutation(
                                aa_seq_ref, aa_seq_alt)

            return results
    # if no match, return the default (intergenic) results
    return results
Exemplo n.º 21
0
def seq_query():
    """Interactively query the sequence database and write matches as FASTA.

    Prompts on stdin for a query type and, depending on that type, for an
    organism, a gene and/or a fragment type. Matching rows are read from the
    ``main`` table of the SQLite database at ``./data/DB``. Rows whose strand
    column equals ``'-1'`` are reverse-complemented before being written.

    Output goes either to one FASTA file per record name under ``output/``
    (opened in append mode) when "Organize output" is answered with ``y``,
    or to a single ``<name>.fasta`` file whose name is prompted for.

    Raises:
        ValueError: if the query type is not one of '1' to '5'.
    """
    query_type = input(
        '1.Specific fragment\n'
        '2.Specific Organism\n'
        '3.Specific gene\n'
        '4.All\n'
        '5.All cds\n'
    )
    organize = input('Organize output?(y/n)\n')
    if query_type not in ['1', '2', '3', '4', '5']:
        raise ValueError('wrong input!\n')

    con = sqlite3.connect('./data/DB')
    # try/finally guarantees the connection is released even when a prompt
    # or a query fails half-way through.
    try:
        cur = con.cursor()
        if query_type == '1':
            organism = input('Organism:\n')
            gene = input('Gene:\n')
            frag_type = input('Fragment type(gene, cds, rRNA, tRNA, exon, intron, spacer):\n')
            cur.execute(
                'SELECT Taxon, Organism, Name, Type, Strand, Sequence FROM main WHERE Name LIKE ? AND Type = ? AND Organism=?',
                ('%' + gene + '%', frag_type, organism))
        elif query_type == '2':
            organism = input('Organism:\n')
            frag_type = input('Fragment type(gene, cds, rRNA, tRNA, exon, intron, spacer, whole, fragments):\n')
            if frag_type == 'fragments':
                cur.execute(
                    'SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main WHERE Organism = ?  ORDER BY Head',
                    (organism,))
            else:
                cur.execute(
                    'SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main WHERE Organism LIKE ? AND Type = ? ORDER BY Head',
                    ('%' + organism + '%', frag_type))
        elif query_type == '3':
            gene = input('Gene:\n')
            frag_type = input('Fragment type(gene, cds, rRNA, tRNA, exon, intron, spacer):\n')
            cur.execute(
                'SELECT Taxon, Organism, Name, Type, Strand, Sequence FROM main WHERE Name LIKE ? AND Type = ? ORDER BY Taxon',
                ('%' + gene + '%', frag_type))
        elif query_type == '4':
            cur.execute('SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main ORDER BY Taxon')
        else:
            # query_type == '5'. SQL string literals take single quotes; a
            # double-quoted "cds" is parsed as an identifier first and only
            # falls back to a string on lenient SQLite builds.
            cur.execute(
                "SELECT Taxon, Organism, Name, Type, Strand, Sequence, Head FROM main WHERE type = 'cds' ORDER BY Taxon")
        # Everything needed is materialised here, so the database can be
        # closed before any output file is touched.
        result = cur.fetchall()
        cur.close()
    finally:
        con.close()

    query_result = []
    for row in result:
        title = '|'.join([str(row[0]), row[1], row[2], row[3]])
        sequence = MutableSeq(row[5])
        name = row[2]
        if row[4] == '-1':
            # Depending on the Biopython version, reverse_complement() either
            # edits the MutableSeq in place (returning None) or returns a new
            # object and leaves the original untouched. Handle both so the
            # minus strand is always the one written out.
            flipped = sequence.reverse_complement()
            if flipped is not None:
                sequence = flipped
        query_result.append([title, name, sequence])

    if organize == 'y':
        if not exists('output'):
            makedirs('output')
        for title, name, sequence in query_result:
            # '/' is dropped from the record name so it cannot escape the
            # output directory or create sub-directories.
            file_name = ''.join([
                'output',
                '/',
                name.replace('/', ''),
                '.fasta'
            ])
            with open(file_name, 'a') as output_file:
                output_file.write('>%s\n%s\n' % (title, sequence))
    else:
        output = input('Enter output filename:\n')
        with open('.'.join([output, 'fasta']), 'w') as output_file:
            for title, name, sequence in query_result:
                output_file.write('>%s\n%s\n' % (title, sequence))

    print('Done.\n')
Exemplo n.º 22
0
	def __init__(self, seq, alphaproperty=None, insertprob=None,
			deleteprob=None, mualphabet=None,
			muprob=None, mupos=None, delpos=None, inpos=None,
			verbose=False):
		"""Store the reference sequence and validate the mutation model.

		Args:
			seq: a string, a Seq or a MutableSeq holding the reference.
			alphaproperty: alphabet used when ``seq`` is a string; a bare
				``Alphabet()`` is used when omitted. For Seq/MutableSeq
				input the object's own ``alphabet`` attribute is used.
			insertprob, deleteprob, muprob: dicts mapping a letter to a
				probability in [0, 1]. Letters of the sequence alphabet
				that are missing get probability 0.
			mualphabet: dict mapping a single letter to the letters it may
				mutate into. Unspecified letters may mutate into any other
				letter of the alphabet except 'N'.
			mupos, delpos, inpos: iterables of 0-based positions, each of
				which must be a valid index into the reference.
			verbose: bool; when True, warnings are printed.

		Any CustomException raised during validation is printed and the
		process exits with status 2.
		"""
		def _checked_positions(positions, label):
			# Return ``positions`` as a list once every entry is a valid index of the reference.
			if not positions:
				return []
			if set(positions).issubset(set(range(len(self.ref)))):
				return list(positions)
			raise CustomException(
				"ERROR: %s positions exceed the range of the reference or are not positive integers!" % label)

		def _random_targets(letter):
			# Letters ``letter`` may turn into when nothing was specified: all others except 'N'.
			return ''.join(self.alphabet - {letter, 'N'})

		def _checked_probs(probs, noun):
			# Return a per-letter probability dict covering the whole alphabet, validating user values.
			if not probs:
				return dict((key, 0) for key in self.alphabet)
			if set(probs.keys()) != self.alphabet:
				if self.verbose:
					print("WARNING: Missing/Invalid letter(s) in %s probability!\n"
						"Probabilities are set to zero for missing letters! Invalid letters are ignored!" % noun)
			checked = dict()
			# .items() works on both Python 2 and 3; .iteritems() exists only on Python 2.
			for key, value in probs.items():
				if value >= 0 and value <= 1:
					checked[key] = value
				else:
					raise CustomException(
						"ERROR: %s probability must be >=0 and <=1!" % noun.capitalize())
			for key in self.alphabet - set(checked.keys()):
				checked[key] = 0
			return checked

		try:
			# History of the changes applied to the reference; filled in by
			# the mutating methods.
			self.occureddel = list()
			self.occuredmu = list()
			self.occuredins = list()
			self.inserted_allele = list()  # alleles inserted, kept so they can be recovered later
			self.alt_allele = list()  # alleles substituted in
			if not isinstance(verbose, bool):
				raise CustomException(
					"ERROR: verbose must be set to either True or False. Default is to False")
			self.verbose = verbose

			if isinstance(seq, str):
				if alphaproperty is None:
					if self.verbose:
						print(
							"WARNING: No alphabet type is specified for the sequence string!")
					self.alphaproperty = Alphabet()
				else:
					self.alphaproperty = alphaproperty
				self.seq = MutableSeq(seq, self.alphaproperty)
			elif isinstance(seq, Seq):
				self.alphaproperty = seq.alphabet
				self.seq = seq.tomutable()
			elif isinstance(seq, MutableSeq):
				self.alphaproperty = seq.alphabet
				# Work on a private copy so the caller's object is never mutated.
				self.seq = copy.deepcopy(seq)
			else:
				raise CustomException(
					"ERROR: Should provide a Seq or MutableSeq object, \n or a string sequence!")
			self.alphabet = set(str(self.seq))
			self.ref = str(self.seq)

			self.delpos = _checked_positions(delpos, "Deletion")
			self.inpos = _checked_positions(inpos, "Insertion")
			self.mupos = _checked_positions(mupos, "Mutation")

			if not mualphabet:
				if self.verbose:
					print("WARNING: You have specified no mutation alphabet! Mutations are set to random letters!")
				self.mualphabet = dict()
				for key in self.alphabet:
					self.mualphabet[key] = _random_targets(key)
			else:
				mualphabet = dict([(str(k), str(v)) for k, v in mualphabet.items()])
				for key, value in mualphabet.items():
					if len(key) != 1:
						raise CustomException(
							"ERROR: the mutation alphabet deals with point mutations! Only single letters are allowed as keys!")
					elif key in set(''.join(value)):
						raise CustomException(
							"ERROR: Wrong mutation values specified! A letter could just be substituted with a different letter for mutation!")
				given_keys = set(mualphabet.keys())
				given_targets = set(''.join(mualphabet.values()))
				if given_keys == self.alphabet and given_targets <= self.alphabet:
					self.mualphabet = copy.deepcopy(mualphabet)
				elif given_keys < self.alphabet and given_targets < self.alphabet:
					if self.verbose:
						print("WARNING: Mutation is not specified for some letters! Those mutations are set to random letters!")
					# Whatever was specified is kept; only the gaps are filled.
					self.mualphabet = copy.deepcopy(mualphabet)
					for key in self.alphabet - given_keys:
						self.mualphabet[key] = _random_targets(key)
				else:
					if self.verbose:
						print("WARNING: Mutation alphabet is not compatible with sequence alphabet! Both alphabets are updated and\nunspecified mutations are set to random letters!")
					# Mutations may introduce letters absent from the sequence,
					# so the alphabet is widened before the gaps are filled.
					new_mualphabet = dict()
					for key, value in mualphabet.items():
						self.alphabet.add(key)
						self.alphabet |= (set(''.join(value)) - self.alphabet)
						new_mualphabet.update({key: value})
					for key in self.alphabet - set(new_mualphabet.keys()):
						new_mualphabet[key] = _random_targets(key)
					self.mualphabet = copy.deepcopy(new_mualphabet)

			# These run after the mutation alphabet so that they see the
			# possibly widened self.alphabet.
			self.insertprob = _checked_probs(insertprob, "insertion")
			self.deleteprob = _checked_probs(deleteprob, "deletion")
			self.muprob = _checked_probs(muprob, "mutation")
		except CustomException as instance:
			print(instance)
			sys.exit(2)
		else:
			if self.verbose:
				print(
					"MuGen object successfully created.\nWARNING: MuGen sequence is case sensitive!")
Exemplo n.º 23
0
# Load the recombinant regions: one tab-separated row per region holding the
# sequence id, the 1-based start and the 1-based stop.
with open(args.rc_regions) as infile1:
    RCseqs = csv.reader(infile1, delimiter='\t')
    for row in RCseqs:
        if not row:
            # csv yields an empty list for a blank line; nothing to record.
            continue
        seq_list.append([row[0], row[1], row[2]])

# Read fasta file:
fasta_seqs = list(SeqIO.parse(args.fasta, "fasta"))

# Mask recombinant regions:
masked_seq = []

for i in fasta_seqs:
    seq = MutableSeq(str(i.seq))
    for j in seq_list:
        if i.id != j[0]:  # j[0] is the sequence id in recombinant regions list
            continue
        # 1-based start becomes a 0-based index; the stop is used as the
        # exclusive end of the Python slice. Both are clamped to the sequence
        # so that an out-of-range region cannot make the slice assignment
        # change the sequence length.
        start_mask = max(int(j[1]) - 1, 0)
        end_mask = min(int(j[2]), len(seq))
        len_mask = end_mask - start_mask
        if len_mask > 0:
            seq[start_mask:end_mask] = args.maskchar * len_mask

    masked_seq.append(SeqRecord(Seq(str(seq)), i.id, description=""))

for i in masked_seq:
    print("Number of characters masked in sequence " + i.id + ": " + str(str(i.seq).count(args.maskchar)))

# Write masked sequences to file:
SeqIO.write(masked_seq, args.out, "fasta")
while (break_nb[break_nb_ord[cl_ii]] > 2):
    rcl_file = tempfn + '-rcl.fa'
    f = open(rcl_file, 'w')
    nbr = 0
    for rii in xrange(len(break_pos[1])):
        if (break_pos[0][rii] == break_nb_ord[cl_ii]):
            nbr += 1
            pbr = break_pos[1][rii]
            pbseq = pb_reads[pbr]
            if (read_blocks[pbr][0]['refE'] == break_nb_ord[2]):
                readbp = read_blocks[pbr][0]['readE']
            else:
                readbp = read_blocks[pbr][0]['readS']
            pbseq = pbseq[max(0, readbp - 400):min(len(pbseq), readbp + 400)]
            if (read_blocks[pbr][0]['refStd'] == '-'):
                pbseq = MutableSeq(pbseq, generic_dna)
                pbseq.reverse_complement()
                pbseq = str(pbseq)
            f.write('>' + pbr + '\n')
            f.write(pbseq)
            f.write("\n")
    f.close()
    # Run Clustal
    clo_outfile = tempfn + '-clo-out.fa'
    clo_cmd = ['clustalo', '-i', rcl_file, '-o', clo_outfile, '--force']
    clo_out = subprocess.check_output(clo_cmd)
    # Get consensus
    msa_out = []
    for record in SeqIO.parse(clo_outfile, "fasta"):
        msa_out.append(str(record.seq))
    clo_cons = ""
Exemplo n.º 25
0
else:
    print "huh?  ERROR"

t = Seq.Seq("T", IUPAC.ambiguous_dna)
u = s + t
print str(u.alphabet)

from Bio.Seq import MutableSeq
import array

print
print "Testing MutableSeq"
print "=================="

print "Testing creating MutableSeqs in multiple ways"
string_seq = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna)
array_seq = MutableSeq(array.array(array_indicator, "TCAAAAGGATGCATCATG"),
                       IUPAC.ambiguous_dna)
converted_seq = s.tomutable()

for test_seq in [string_seq]:
    print repr(test_seq)
    print str(test_seq)
    print len(test_seq)
    print repr(test_seq.toseq())

    print test_seq[0]
    print repr(test_seq[1:5])

    test_seq[1:3] = "GAT"
    print "Set slice with string:", repr(test_seq)
Exemplo n.º 26
0
 def setUp(self):
     """Create the organism under test and the schema used for ambiguity matching."""
     test_alphabet = TestAlphabet()
     # Fixed genome shared by the tests of this case.
     self.organism = Organism(MutableSeq("11*22*33*", test_alphabet),
                              test_fitness)

     self.ambig_info = Schema(test_alphabet.alphabet_matches)
Exemplo n.º 27
0
	def probmu(self):
		self.occuredmu = list()
		self.occureddel = list()
		self.occuredins = list()
		self.inserted_allele = list()
		self.alt_allele = list()
		"""Operates on a MuGen object, and returns a Seq object obtained by making random changes
		to the reference sequence of the MuGen object, using the probabilities given to MuGen"""
		self.seq = []
		for __site, __base in enumerate(self.ref):
			if __site in set(self.mupos) | set(self.inpos) | set(
				self.delpos):
				self.seq.append(
					__base)  # No change is made at indel/mutation positions
			else:
				__prob = {'ins': self.insertprob.get(__base),
					  'del': self.deleteprob.get(__base),
					  'sub': self.muprob.get(__base)}
				__error = random.choice(['ins', 'del', 'sub',
							 'sub'])  # An error occurs randomly: insertion or \
				# deletion or substitution
				__rnd = float(int(
					random.random() * 100000)) / 100000  # The probability that this error is \
				# not corrected by replication machinary is determined \
				if __rnd < __prob.get(
					__error):  # by insertprob,deleteprob and muprob
					if __error == 'sub':
						self.seq.append(random.choice(
							self.mualphabet.get(
								__base)))  # Substitute tha letter with one from the mutation alphabet
						self.occuredmu.append(
							__site)  # Update the list of the sites where a mutation has occured
						self.alt_allele.extend([
									       self.seq[
										       -1]])  # Update the list of alternative alleles
					elif __error == 'ins':
						self.seq.append(__base)
						self.seq.append(random.choice(
							list(
								self.alphabet)))  # Insert a random letter right after the letter
						self.occuredins.append(
							__site)  # Update the list of the sites after which an insertion has occured
						self.inserted_allele.extend([
										    __base +
										    self.seq[
											    -1]])  # Update the list of inserted alleles
					else:
						self.occureddel.append(
							__site)  # Delete the letter in the progeny sequence by just not adding it
				else:  # Update the list of the sites which are deleted in the progeny sequence
					self.seq.append(
						__base)  # No change is induced at the site in the progeny sequence
		self.seq = ''.join(self.seq)
		self.seq = MutableSeq(self.seq, self.alphaproperty)
		if (self.occuredins):
			_ins_allele = zip(self.occuredins,
					  self.inserted_allele)
			_ins_allele.sort(key=lambda tup: tup[
				0])  # Sort the occured change positions in ascending order
			self.occuredins, self.inserted_allele = zip(
				*_ins_allele)
			self.occuredins = list(self.occuredins)
			self.inserted_allele = list(self.inserted_allele)
			_ins_allele = None
		else:
			self.inserted_allele = []
			self.occuredins = []
		if (self.occuredmu):
			_alt_allele = zip(self.occuredmu, self.alt_allele)
			_alt_allele.sort(key=lambda tup: tup[0])
			self.occuredmu, self.alt_allele = zip(*_alt_allele)
			self.occuredmu = list(self.occuredmu)
			self.alt_allele = list(self.alt_allele)
			_alt_allele = None
		else:
			self.occuredmu = []
			self.alt_allele = []
		if (self.occureddel):
			self.occureddel.sort()
		else:
			self.occureddel = []
		if self.verbose:
			print("WARNING: If indel/mutation positions are specified, MuGen.probmu() makes no change at those sites. \n \
Use MuGen.posmu() or Mugen.hapchanger() to apply changes at those sites!")
			print("Changes made to the haplotype!")
Exemplo n.º 28
0
def parse_vcf(varfile):
    """Yield ``(position, alternatives)`` for every data row of a VCF file.

    Args:
        varfile: path of a tab-separated VCF file.

    Yields:
        A tuple of the 0-based position (column 2 minus one) and the list of
        alternative alleles obtained by splitting column 5 on commas.

    Header lines (starting with ``#``) and blank lines are skipped.
    """
    # The with-block closes the file once the generator is exhausted or
    # discarded, instead of leaving the handle to the garbage collector.
    with open(varfile) as handle:
        for line in csv.reader(handle, "excel-tab"):
            # csv yields [] for a blank line; indexing it would raise.
            if not line or line[0].startswith("#"):
                continue

            pos = int(line[1]) - 1
            var = line[4].split(',')

            yield pos, var


# The variant file does not change between records, so it is parsed once
# instead of once per FASTA record.
variants = list(parse_vcf(sys.argv[2]))

for seq_record in SeqIO.parse(sys.argv[1], 'fasta'):
    # sys.stderr.write behaves the same on Python 2 and 3; the former
    # "print >> sys.stderr" form only works on Python 2.
    sys.stderr.write("Seq ID = %s, Length = %d\n" %
                     (seq_record.id, len(seq_record)))
    seq = MutableSeq(str(seq_record.seq))

    n = 0
    for pos, var in variants:
        # Only rows with a single alternative allele of one base are applied;
        # multi-allelic rows and longer alleles are skipped.
        if (len(var) > 1) or (len(var[0]) > 1):
            continue
        seq[pos] = var[0]
        n += 1

    SeqIO.write(SeqRecord(Seq(str(seq)), id=seq_record.id), sys.stdout,
                'fasta')

    sys.stderr.write("Total variants = %d\n" % n)
Exemplo n.º 29
0
	def hapchanger(self):
		"""Rebuild ``self.seq`` from the reference with specified and random changes.

		Sites listed in ``mupos`` are substituted, sites in ``inpos`` get a
		random letter inserted right after them and sites in ``delpos`` are
		dropped, checked in that order of precedence. Every other site may
		receive a random insertion, deletion or substitution according to
		``insertprob``, ``deleteprob`` and ``muprob``. The result is stored in
		``self.seq`` as a MutableSeq and the change history (``occuredmu``,
		``occuredins``, ``occureddel``, ``alt_allele``, ``inserted_allele``) is
		refreshed. Nothing is returned.
		"""
		self.occuredmu = list()
		self.occureddel = list()
		self.occuredins = list()
		self.inserted_allele = list()
		self.alt_allele = list()
		# Built once: the position lists do not change while the reference is scanned.
		forced_sub = set(self.mupos)
		forced_ins = set(self.inpos)
		forced_del = set(self.delpos)
		progeny = []
		for site, base in enumerate(self.ref):
			if site in forced_sub:
				# Induce a mutation at the site whose position is given
				progeny.append(random.choice(self.mualphabet.get(base)))
				self.occuredmu.append(site)
				self.alt_allele.append(progeny[-1])
			elif site in forced_ins:
				# Make an insertion right after the site whose position is given
				progeny.append(base)
				progeny.append(random.choice(list(self.alphabet)))
				self.occuredins.append(site)
				self.inserted_allele.append(base + progeny[-1])
			elif site in forced_del:
				# Deletion: the letter is simply not copied to the progeny
				self.occureddel.append(site)
			else:
				# No change is specified here: fall back to the probability model.
				prob = {'ins': self.insertprob.get(base),
					'del': self.deleteprob.get(base),
					'sub': self.muprob.get(base)}
				# 'sub' is listed twice, so a substitution is drawn twice as
				# often as an insertion or a deletion.
				error = random.choice(['ins', 'del', 'sub', 'sub'])
				# Random draw truncated to five decimals; the drawn error is
				# kept only when the draw falls below its probability.
				rnd = float(int(random.random() * 100000)) / 100000
				if rnd < prob.get(error):
					if error == 'sub':
						progeny.append(random.choice(self.mualphabet.get(base)))
						self.occuredmu.append(site)
						self.alt_allele.append(progeny[-1])
					elif error == 'ins':
						progeny.append(base)
						progeny.append(random.choice(list(self.alphabet)))
						self.occuredins.append(site)
						self.inserted_allele.append(base + progeny[-1])
					elif error == 'del':
						self.occureddel.append(site)
				else:
					progeny.append(base)
		self.seq = MutableSeq(''.join(progeny), self.alphaproperty)
		# sorted() accepts the iterator that zip() returns on Python 3, where
		# the previous zip(...).sort() raised AttributeError.
		if self.occuredins:
			ins_pairs = sorted(zip(self.occuredins, self.inserted_allele),
				key=lambda tup: tup[0])
			self.occuredins = [pos for pos, _ in ins_pairs]
			self.inserted_allele = [allele for _, allele in ins_pairs]
		if self.occuredmu:
			alt_pairs = sorted(zip(self.occuredmu, self.alt_allele),
				key=lambda tup: tup[0])
			self.occuredmu = [pos for pos, _ in alt_pairs]
			self.alt_allele = [allele for _, allele in alt_pairs]
		self.occureddel.sort()
		if self.verbose:
			print("Changes made to the haplotype!")
Exemplo n.º 30
0
class StringMethodTests(unittest.TestCase):
    _examples = [
        # These are length 9, a multiple of 3 for translation tests:
        Seq("ACGTGGGGT"),
        Seq("ACGUGGGGU"),
        Seq("GG"),
        Seq("A"),
        UnknownSeq(1),
        UnknownSeq(1, character="n"),
        UnknownSeq(1, character="N"),
        UnknownSeq(12, character="N"),
        UnknownSeq(12, character="X"),
        UnknownSeq(12),
    ]
    for seq in _examples[:]:
        if not isinstance(seq, UnknownSeq):
            _examples.append(MutableSeq(seq))
    _start_end_values = [0, 1, 2, 1000, -1, -2, -999, None]

    def _test_method(self, method_name, start_end=False):
        """Compare a sequence method against the same-named ``str`` method.

        Every pair of examples that both provide ``method_name`` is checked:
        the method of the first example, called with the second one (as a
        string and as the object itself), must give what the plain string
        gives. With ``start_end`` set, the call is repeated with every start
        value and every start/end combination from ``_start_end_values``.
        A ValueError counts as a result, so both sides must raise it together.
        """
        self.assertIsInstance(method_name, str)

        def outcome(target, *args):
            # Result of the call, or the ValueError class when it raises one.
            try:
                return getattr(target, method_name)(*args)
            except ValueError:
                return ValueError

        # Examples lacking the method are skipped, e.g. MutableSeq does not
        # support transcribe.
        candidates = [ex for ex in self._examples if hasattr(ex, method_name)]

        for example1 in candidates:
            str1 = str(example1)

            for example2 in candidates:
                str2 = str(example2)

                self.assertEqual(
                    outcome(example1, str2),
                    outcome(str1, str2),
                    "%r.%s(%r)" % (example1, method_name, str2))
                self.assertEqual(
                    outcome(example1, example2),
                    outcome(str1, str2),
                    "%r.%s(%r)" % (example1, method_name, example2))

                if not start_end:
                    continue

                for start in self._start_end_values:
                    self.assertEqual(
                        outcome(example1, str2, start),
                        outcome(str1, str2, start),
                        "%r.%s(%r, %s)" %
                        (example1, method_name, str2, start))

                    for end in self._start_end_values:
                        self.assertEqual(
                            outcome(example1, str2, start, end),
                            outcome(str1, str2, start, end),
                            "%r.%s(%r, %s, %s)" %
                            (example1, method_name, str2, start, end),
                        )

    def test_str_count(self):
        """Check count agrees with str.count and rejects non-sequence arguments."""
        self._test_method("count", start_end=True)
        self.assertEqual(Seq("AC777GT").count("7"), 3)
        # Integers and None are not valid search terms.
        for bad_term in (7, None):
            with self.assertRaises(TypeError):
                Seq("AC777GT").count(bad_term)

    def test_count_overlap(self):
        """Check count_overlap counts overlapping hits and rejects bad arguments."""
        # count() ignores overlapping hits, count_overlap() includes them.
        self.assertEqual(Seq("AC777GT").count("77"), 1)
        for term, hits in (("77", 2), ("7", 3)):
            self.assertEqual(Seq("AC777GT").count_overlap(term), hits)
        # Integers and None are not valid search terms.
        for bad_term in (7, None):
            with self.assertRaises(TypeError):
                Seq("AC777GT").count_overlap(bad_term)

    def test_str_count_overlap_GG(self):
        """Check count_overlap on every shared example using GG as search term."""
        # One expected count per entry of self._examples, in the same order.
        expected = [
            3, 3, 1, 0,  # Seq() Tests
            0, 0, 0, 0, 0, 0,  # UnknownSeq() Tests
            3, 3, 1, 0,  # MutableSeq() Tests
        ]

        # A unittest assertion rather than a bare assert: it is not stripped
        # under -O, so zip() below can never silently drop a mismatched tail.
        self.assertEqual(len(self._examples), len(expected))

        for seq, exp in zip(self._examples, expected):
            # Using search term GG as a string
            self.assertEqual(seq.count_overlap("GG"), exp)
            self.assertEqual(seq.count_overlap("G" * 5), 0)
            # Using search term GG as a Seq
            self.assertEqual(seq.count_overlap(Seq("GG")), exp)
            self.assertEqual(seq.count_overlap(Seq("G" * 5)), 0)

    def test_count_overlap_start_end_GG(self):
        """Check count_overlap for GG across many start/end combinations."""
        both_types = (Seq, MutableSeq)

        # Seq() and MutableSeq() on a fixed sequence: (start, end, expected)
        ranged_cases = [
            (1, 7, 3),
            (3, None, 3),
            (3, 6, 2),
            (4, 6, 1),
            (4, -1, 2),
            (-5, None, 2),
            (-5, 7, 2),
            (7, -5, 0),
            (-100, None, 3),
            (None, 100, 3),
            (-100, 1000, 3),
        ]
        fixed = "GTAGGGGAG"
        for start, end, exp in ranged_cases:
            for seq_type in both_types:
                self.assertEqual(
                    seq_type(fixed).count_overlap("GG", start, end), exp)

        # Seq() and MutableSeq() on a more heterogeneous sequence:
        # (extra positional arguments, expected)
        mixed = "GGGTGGTAGGG"
        mixed_cases = [
            ((), 5),
            ((2, 8), 1),
            ((-11, 6), 3),
            ((7, 2), 0),
        ]
        for bounds, exp in mixed_cases:
            for seq_type in both_types:
                self.assertEqual(
                    seq_type(mixed).count_overlap("GG", *bounds), exp)
        self.assertEqual(Seq(mixed).count_overlap("GG", -2, -10), 0)

        # UnknownSeq() of length 12: (character, start, end, expected)
        unknown_cases = [
            ("N", 1, 7, 0),
            ("N", 1, 7, 0),
            ("N", -4, None, 0),
            ("N", -4, None, 0),
            ("X", 1, 7, 0),
        ]
        for char, start, end, exp in unknown_cases:
            unknown = UnknownSeq(12, character=char)
            self.assertEqual(unknown.count_overlap("GG", start, end), exp)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0)

        # UnknownSeq() of length 7, including unusual edge cases:
        # (search term, start, end, expected)
        edge_cases = [
            ("G", 100, 105, 0),
            ("G", -1, 4, 0),
            ("G", 4, -1, 0),
            ("G", -8, -2, 0),
            ("G", -2, -8, 0),
            ("G", 8, 2, 0),
            ("G", 2, 8, 0),
            ("GG", 8, 2, 0),
            ("GG", 2, 8, 0),
            ("GG", -5, -1, 0),
            ("GG", 1, 5, 0),
            ("GGG", None, None, 0),
            ("GGGGGGGGG", None, None, 0),
            ("GGG", 1, 2, 0),
        ]
        for substr, start, end, exp in edge_cases:
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(substr, start, end), exp)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("GG", 1), 0)

    def test_str_count_overlap_NN(self):
        """Check count_overlap on every shared example using NN as search term."""
        # One expected count per entry of self._examples, in the same order.
        expected = [
            0, 0, 0, 0,  # Seq() Tests
            0, 0, 0, 11, 0, 0,  # UnknownSeq() Tests
            0, 0, 0, 0,  # MutableSeq() Tests
        ]

        # A unittest assertion rather than a bare assert: it is not stripped
        # under -O, so zip() below can never silently drop a mismatched tail.
        self.assertEqual(len(self._examples), len(expected))

        for seq, exp in zip(self._examples, expected):
            # Using search term NN as a string
            self.assertEqual(seq.count_overlap("NN"), exp)
            self.assertEqual(seq.count_overlap("N" * 13), 0)
            # Using search term NN as a Seq
            self.assertEqual(seq.count_overlap(Seq("NN")), exp)
            self.assertEqual(seq.count_overlap(Seq("N" * 13)), 0)

    def test_count_overlap_start_end_NN(self):
        """Check count_overlap with NN over a range of start and end values."""
        # Seq() and MutableSeq(): (start, end) windows over a sequence holding
        # no N at all, so each window is expected to give zero hits
        testing_seq = "GTAGGGGAG"
        windows = [
            (1, 7),
            (3, None),
            (3, 6),
            (4, 6),
            (4, -1),
            (-5, None),
            (-5, 7),
            (7, -5),
            (-100, None),
            (None, 100),
            (-100, 1000),
        ]
        for start, end in windows:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(testing_seq).count_overlap("NN", start, end), 0)

        # Seq() and MutableSeq() with a more heterogeneous sequence
        mixed = "GGGTGGTAGGG"
        for bounds in [(), (2, 8), (-11, 6), (7, 2)]:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(mixed).count_overlap("NN", *bounds), 0)
        # This last window is only exercised on Seq
        self.assertEqual(Seq(mixed).count_overlap("NN", -10, -2), 0)

        # UnknownSeq() of length 12: (character, start, end, expected count)
        char_start_end_exp = [
            ("N", 1, 7, 5),
            ("N", 1, 7, 5),
            ("N", -4, None, 3),
            ("N", -4, None, 3),
            ("X", 1, 7, 0),
        ]
        for char, start, end, exp in char_start_end_exp:
            unknown = UnknownSeq(12, character=char)
            self.assertEqual(unknown.count_overlap("NN", start, end), exp)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0)

        # UnknownSeq() of length 7 with further cases, including unusual edge
        # cases: (search term, start, end, expected count)
        substr_start_end_exp = [
            ("N", 100, 105, 0),
            ("N", -1, 4, 0),
            ("N", 4, -1, 2),
            ("N", -8, -2, 5),
            ("N", -2, -8, 0),
            ("N", 8, 2, 0),
            ("N", 2, 8, 5),
            ("NN", 8, 2, 0),
            ("NN", 2, 8, 4),
            ("NN", -5, -1, 3),
            ("NN", 1, 5, 3),
            ("NNN", None, None, 5),
            ("NNNNNNNNN", None, None, 0),
            ("NNN", 1, 2, 0),
        ]
        for substr, start, end, exp in substr_start_end_exp:
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(substr, start, end), exp)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("NN", 1), 5)

    def test_str_find(self):
        """Check find matches str.find, then a digit hit and bad arguments."""
        self._test_method("find", start_end=True)
        with_digit = Seq("AC7GT")
        self.assertEqual(with_digit.find("7"), 2)
        # Only string-like search terms are accepted
        for seq, bad_term in ((Seq("AC7GT"), 7), (Seq("ACGT"), None)):
            self.assertRaises(TypeError, seq.find, bad_term)

    def test_str_rfind(self):
        """Check rfind matches str.rfind, then a digit hit and bad arguments."""
        self._test_method("rfind", start_end=True)
        with_digit = Seq("AC7GT")
        self.assertEqual(with_digit.rfind("7"), 2)
        # Only string-like search terms are accepted
        for seq, bad_term in ((Seq("AC7GT"), 7), (Seq("ACGT"), None)):
            self.assertRaises(TypeError, seq.rfind, bad_term)

    def test_str_index(self):
        """Check index matches str.index for Seq and MutableSeq alike."""
        self._test_method("index", start_end=True)
        for seq_type in (Seq, MutableSeq):
            self.assertEqual(seq_type("AC7GT").index("7"), 2)
            # An int or None is not a valid search term
            self.assertRaises(TypeError, seq_type("AC7GT").index, 7)
            self.assertRaises(TypeError, seq_type("ACGT").index, None)

    def test_str_rindex(self):
        """Check rindex matches str.rindex for Seq and MutableSeq alike."""
        self._test_method("rindex", start_end=True)
        for seq_type in (Seq, MutableSeq):
            self.assertEqual(seq_type("AC7GT").rindex("7"), 2)
            # An int or None is not a valid search term
            self.assertRaises(TypeError, seq_type("AC7GT").rindex, 7)
            self.assertRaises(TypeError, seq_type("ACGT").rindex, None)

    def test_str_startswith(self):
        """Check startswith matches str.startswith, tuples included."""
        self._test_method("startswith", start_end=True)
        self.assertTrue("ABCDE".startswith(("ABE", "OBE", "ABC")))
        for seq_type in (Seq, MutableSeq):
            self.assertRaises(TypeError, seq_type("ACGT").startswith, None)

        # Now the tuple form, using two-letter slices taken from each example
        # at every third position
        for example in self._examples:
            offsets = range(0, len(example) - 2, 3)
            subs = tuple(example[pos:pos + 2] for pos in offsets)
            subs_str = tuple(str(sub) for sub in subs)
            text = str(example)

            self.assertEqual(
                text.startswith(subs_str), example.startswith(subs))
            # The sequence method must also accept plain strings
            self.assertEqual(
                text.startswith(subs_str), example.startswith(subs_str))
            self.assertEqual(
                text.startswith(subs_str, 3), example.startswith(subs, 3))
            self.assertEqual(
                text.startswith(subs_str, 2, 6),
                example.startswith(subs, 2, 6),
            )

    def test_str_endswith(self):
        """Check matches the python string endswith method.

        Besides the generic comparison done by ``_test_method``, this checks
        the tuple-of-suffixes form with sub sequences sliced from each
        example, given both as sequence objects and as plain strings, with
        and without start/end arguments.
        """
        self._test_method("endswith", start_end=True)
        self.assertTrue("ABCDE".endswith(("ABE", "OBE", "CDE")))
        self.assertRaises(TypeError, Seq("ACGT").endswith, None)

        # Now check with a tuple of sub sequences
        for example1 in self._examples:
            subs = tuple(example1[start:start + 2]
                         for start in range(0,
                                            len(example1) - 2, 3))
            subs_str = tuple(str(s) for s in subs)

            self.assertEqual(
                str(example1).endswith(subs_str), example1.endswith(subs))
            # Same check with plain strings; this must exercise endswith
            # (it previously called startswith, a copy/paste slip)
            self.assertEqual(
                str(example1).endswith(subs_str),
                example1.endswith(subs_str))  # strings!
            self.assertEqual(
                str(example1).endswith(subs_str, 3),
                example1.endswith(subs, 3))
            self.assertEqual(
                str(example1).endswith(subs_str, 2, 6),
                example1.endswith(subs, 2, 6))

    def test_str_strip(self):
        """Check strip matches str.strip and is only in place on request."""
        self._test_method("strip")
        padded = " ACGT "
        s = Seq(padded)
        m = MutableSeq(padded)
        for seq in (s, m):
            self.assertEqual(seq.strip(), "ACGT")
            self.assertRaises(TypeError, seq.strip, 7)
            # The object itself is left untouched by a plain strip()
            self.assertEqual(seq, padded)
        # With inplace=True the MutableSeq itself is modified
        self.assertEqual(m.strip(inplace=True), "ACGT")
        self.assertEqual(m, "ACGT")

    def test_str_lstrip(self):
        """Check lstrip matches str.lstrip and is only in place on request."""
        self._test_method("lstrip")
        padded = " ACGT "
        s = Seq(padded)
        m = MutableSeq(padded)
        for seq in (s, m):
            self.assertEqual(seq.lstrip(), "ACGT ")
            self.assertRaises(TypeError, seq.lstrip, 7)
            # The object itself is left untouched by a plain lstrip()
            self.assertEqual(seq, padded)
        # With inplace=True the MutableSeq itself is modified
        self.assertEqual(m.lstrip(inplace=True), "ACGT ")
        self.assertEqual(m, "ACGT ")

    def test_str_rstrip(self):
        """Check rstrip matches str.rstrip and is only in place on request."""
        self._test_method("rstrip")
        padded = " ACGT "
        s = Seq(padded)
        m = MutableSeq(padded)
        for seq in (s, m):
            self.assertEqual(seq.rstrip(), " ACGT")
            self.assertRaises(TypeError, seq.rstrip, 7)
            # The object itself is left untouched by a plain rstrip()
            self.assertEqual(seq, padded)
        # With inplace=True the MutableSeq itself is modified
        self.assertEqual(m.rstrip(inplace=True), " ACGT")
        self.assertEqual(m, " ACGT")

    def test_str_split(self):
        """Check split matches str.split for Seq and MutableSeq alike."""
        self._test_method("split")
        text = "AC7GT"
        for seq_type in (Seq, MutableSeq):
            self.assertEqual(seq_type(text).split("7"), text.split("7"))
            # An int is not a valid separator
            self.assertRaises(TypeError, seq_type(text).split, 7)

    def test_str_rsplit(self):
        """Check rsplit matches str.rsplit for Seq and MutableSeq alike."""
        self._test_method("rsplit")
        text = "AC7GT"
        for seq_type in (Seq, MutableSeq):
            self.assertEqual(seq_type(text).rsplit("7"), text.rsplit("7"))
            # An int is not a valid separator
            self.assertRaises(TypeError, seq_type(text).rsplit, 7)

    def test_str_length(self):
        """Check matches the python string __len__ method."""
        for example1 in self._examples:
            str1 = str(example1)
            self.assertEqual(len(example1), len(str1))

    def test_str_upper(self):
        """Check matches the python string upper method."""
        for example1 in self._examples:
            str1 = str(example1)
            self.assertEqual(example1.upper(), str1.upper())

    def test_str_lower(self):
        """Check matches the python string lower method."""
        for example1 in self._examples:
            str1 = str(example1)
            self.assertEqual(example1.lower(), str1.lower())

    def test_str_encode(self):
        """Check matches the python string encode method."""
        for example1 in self._examples:
            str1 = str(example1)
            self.assertEqual(bytes(example1), str1.encode("ascii"))

    def test_str_hash(self):
        """Check hash(obj) equals the hash of its string.

        MutableSeq examples are skipped. BiopythonWarning is silenced while
        the hashes are computed.
        """
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            with warnings.catch_warnings():
                # Silence change in behaviour warning
                warnings.simplefilter("ignore", BiopythonWarning)
                text = str(example1)
                self.assertEqual(
                    hash(text),
                    hash(example1),
                    # Pair each hash with the value it was computed from (the
                    # first slot used to show id(example1), not the string)
                    "Hash mismatch, %r for %r vs %r for %r" %
                    (hash(text), text, hash(example1), example1),
                )

    def test_str_comparison(self):
        for example1 in self._examples:
            for example2 in self._examples:
                with warnings.catch_warnings():
                    self.assertEqual(
                        str(example1) == str(example2),
                        example1 == example2,
                        "Checking %r == %r" % (example1, example2),
                    )
                    self.assertEqual(
                        str(example1) != str(example2),
                        example1 != example2,
                        "Checking %r != %r" % (example1, example2),
                    )
                    self.assertEqual(
                        str(example1) < str(example2),
                        example1 < example2,
                        "Checking %r < %r" % (example1, example2),
                    )
                    self.assertEqual(
                        str(example1) <= str(example2),
                        example1 <= example2,
                        "Checking %r <= %r" % (example1, example2),
                    )
                    self.assertEqual(
                        str(example1) > str(example2),
                        example1 > example2,
                        "Checking %r > %r" % (example1, example2),
                    )
                    self.assertEqual(
                        str(example1) >= str(example2),
                        example1 >= example2,
                        "Checking %r >= %r" % (example1, example2),
                    )

    def test_str_getitem(self):
        """Check slicing and indexing works like a string."""
        for example1 in self._examples:
            str1 = str(example1)
            for i in self._start_end_values:
                if i is not None and abs(i) < len(example1):
                    self.assertEqual(example1[i], str1[i])
                self.assertEqual(example1[:i], str1[:i])
                self.assertEqual(example1[i:], str1[i:])
                for j in self._start_end_values:
                    self.assertEqual(example1[i:j], str1[i:j])
                    for step in range(-3, 4):
                        if step == 0:
                            with self.assertRaises(ValueError) as cm:
                                example1[i:j:step]
                            self.assertEqual(str(cm.exception),
                                             "slice step cannot be zero")
                        else:
                            self.assertEqual(example1[i:j:step],
                                             str1[i:j:step])

    def test_tomutable(self):
        """Check a MutableSeq can be built from each example and equals it."""
        for source in self._examples:
            converted = MutableSeq(source)
            self.assertIsInstance(converted, MutableSeq)
            self.assertEqual(converted, source)

    def test_toseq(self):
        """Check a Seq can be built from each example and equals it."""
        for source in self._examples:
            converted = Seq(source)
            self.assertIsInstance(converted, Seq)
            self.assertEqual(converted, source)

    def test_the_complement(self):
        """Check obj.complement() method.

        The expected value is built with str.translate, using an RNA table
        when the example contains U/u and a DNA table otherwise. MutableSeq
        examples are skipped.
        """
        # Build both translation tables once rather than on every iteration
        rna_mapping = str.maketrans("ACGUacgu", "UGCAugca")
        # Default to DNA, e.g. complement("A") -> "T" not "U"
        dna_mapping = str.maketrans("ACGTacgt", "TGCAtgca")
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            comp = example1.complement()
            str1 = str(example1)
            if "U" in str1 or "u" in str1:
                mapping = rna_mapping
            else:
                mapping = dna_mapping
            self.assertEqual(str1.translate(mapping), comp)

    def test_the_reverse_complement(self):
        """Check obj.reverse_complement() method.

        The expected value is built with str.translate followed by a string
        reversal, using an RNA table when the example contains U/u and a DNA
        table otherwise. MutableSeq examples are skipped.
        """
        # Build both translation tables once rather than on every iteration
        rna_mapping = str.maketrans("ACGUacgu", "UGCAugca")
        # Defaults to DNA, so reverse_complement("A") --> "T" not "U"
        dna_mapping = str.maketrans("ACGTacgt", "TGCAtgca")
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            comp = example1.reverse_complement()
            str1 = str(example1)
            if "U" in str1 or "u" in str1:
                mapping = rna_mapping
            else:
                mapping = dna_mapping
            self.assertEqual(str1.translate(mapping)[::-1], comp)

    def test_the_transcription(self):
        """Check obj.transcribe() method.

        The expected value replaces T/t with U/u in the example's string.
        MutableSeq examples are skipped.
        """
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            tran = example1.transcribe()
            str1 = str(example1)
            # NOTE(review): this length check looks copied from the translate
            # test; it means only examples whose length is a multiple of
            # three get compared. Confirm whether it is still wanted here.
            if len(str1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            self.assertEqual(str1.replace("T", "U").replace("t", "u"), tran)

    def test_the_back_transcription(self):
        """Check obj.back_transcribe() method.

        The expected value replaces U/u with T/t in the example's string.
        MutableSeq examples are skipped.
        """
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            tran = example1.back_transcribe()
            str1 = str(example1)
            self.assertEqual(str1.replace("U", "T").replace("u", "t"), tran)

    def test_the_translate(self):
        """Check obj.translate() method."""
        mapping = ""
        for example1 in self._examples:
            if len(example1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            tran = example1.translate()
            # Try with positional vs named argument:
            self.assertEqual(example1.translate(11),
                             example1.translate(table=11))

            # TODO - check the actual translation, and all the optional args

    def test_the_translation_of_stops(self):
        """Check obj.translate() with stop codons under several tables."""
        misc_stops = "TAATAGTGAAGAAGG"
        # (expected protein, positional args, keyword args) for obj.translate
        method_cases = [
            ("***RR", (), {}),
            ("***RR", (1,), {}),
            ("***RR", ("SGC0",), {}),
            ("**W**", (), {"table": 2}),
            ("**WRR", (), {"table": "Yeast Mitochondrial"}),
            ("**WSS", (), {"table": 5}),
            ("**WSS", (), {"table": 9}),
            ("**CRR", (), {"table": "Euplotid Nuclear"}),
            ("***RR", (), {"table": 11}),
            ("***RR", (), {"table": "11"}),
            ("***RR", (), {"table": "Bacterial"}),
            ("**GRR", (), {"table": 25}),
            ("", (), {"to_stop": True}),
            ("O*ORR", (), {"table": special_table}),
            ("*QWRR", (), {"table": Chilodonella_uncinata_table}),
        ]
        # The same expectations hold for Seq and for MutableSeq
        for seq_type in (Seq, MutableSeq):
            nuc = seq_type(misc_stops)
            for protein, args, kwargs in method_cases:
                self.assertEqual(protein, nuc.translate(*args, **kwargs))

        # These test the Bio.Seq.translate() function - move these?:
        # (nuc is the MutableSeq left over from the loop above)
        function_cases = [
            ("*QWRR", {"table": Chilodonella_uncinata_table}),
            ("O*ORR", {"table": special_table}),
            ("", {"to_stop": True}),
            ("***RR", {"table": "Bacterial"}),
            ("***RR", {"table": "11"}),
            ("***RR", {"table": 11}),
            ("**W**", {"table": 2}),
        ]
        for protein, kwargs in function_cases:
            self.assertEqual(protein, translate(str(nuc), **kwargs))

        # Single codons, ambiguous and mixed case: (codon, expected letter)
        codon_cases = [
            ("TAT", "Y"),
            ("TAR", "*"),
            ("TAN", "X"),
            ("NNN", "X"),
            ("TAt", "Y"),
            ("TaR", "*"),
            ("TaN", "X"),
            ("nnN", "X"),
            ("tat", "Y"),
            ("tar", "*"),
            ("tan", "X"),
            ("nnn", "X"),
        ]
        for codon, letter in codon_cases:
            self.assertEqual(Seq(codon).translate(), letter)

    def test_the_translation_of_invalid_codons(self):
        """Check translating an invalid codon raises TranslationError."""
        for codon in ["TA?", "N-N", "AC_", "Ac_"]:
            msg = "Translating %s should fail" % codon
            # Both sequence classes must reject the codon
            for seq_type in (Seq, MutableSeq):
                nuc = seq_type(codon)
                with self.assertRaises(TranslationError, msg=msg):
                    nuc.translate()

    def test_the_translation_of_ambig_codons(self):
        """Check obj.translate() method with ambiguous codons.

        Every three letter combination of the ambiguity codes (except X) is
        translated and compared with the set of translations obtained from
        each concrete codon it can stand for.
        """
        # Ambiguous protein letters and the residues each must cover
        # TODO - Use the Bio.Data.IUPACData module for the
        # ambiguous protein mappings?
        grouped = {"Z": set("EQ"), "B": set("DN"), "J": set("LI")}
        for ambig_values in [ambiguous_dna_values, ambiguous_rna_values]:
            ambig = set(ambig_values)
            ambig.remove("X")
            for c1 in ambig:
                for c2 in ambig:
                    for c3 in ambig:
                        codon = c1 + c2 + c3
                        # Translations of every concrete codon matching it
                        values = {
                            str(Seq(a + b + c).translate())
                            for a in ambig_values[c1]
                            for b in ambig_values[c2]
                            for c in ambig_values[c3]
                        }
                        t = Seq(codon).translate()
                        letter = str(t)
                        if letter == "X":
                            # X is only right when the outcomes disagree
                            self.assertGreater(
                                len(values),
                                1,
                                "translate('%s') = '%s' not '%s'" %
                                (codon, t, ",".join(values)),
                            )
                        elif letter in grouped:
                            self.assertEqual(values, grouped[letter])
                        else:
                            # A stop or a single amino acid: exactly one
                            # possible outcome
                            self.assertEqual(values, set(t))

    def test_init_typeerror(self):
        """Check Seq __init__ raises TypeError for unsupported data types."""
        unsupported = [("A", "C", "G", "T"), ["A", "C", "G", "T"], 1, 1.0]
        for data in unsupported:
            self.assertRaises(TypeError, Seq, data)

    def test_MutableSeq_init_typeerror(self):
        """Check MutableSeq __init__ raises TypeError for unsupported data."""
        unsupported = [("A", "C", "G", "T"), ["A", "C", "G", "T"], 1, 1.0]
        for data in unsupported:
            self.assertRaises(TypeError, MutableSeq, data)

    def test_join_Seq_TypeError(self):
        """Check Seq.join raises TypeError for unsupported arguments."""
        spacer = Seq("NNNNN")
        # A non-iterable, then an iterable holding a non-accepted type
        for bad_argument in (5, ["ATG", "ATG", 5, "ATG"]):
            self.assertRaises(TypeError, spacer.join, bad_argument)

    def test_join_UnknownSeq_TypeError_iter(self):
        """Check UnknownSeq.join raises TypeError for unsupported arguments."""
        spacer = UnknownSeq(5, character="-")
        # A non-iterable, then an iterable holding a non-accepted type
        for bad_argument in (5, ["ATG", "ATG", 5, "ATG"]):
            self.assertRaises(TypeError, spacer.join, bad_argument)

    def test_join_MutableSeq_TypeError_iter(self):
        """Check MutableSeq.join raises TypeError for unsupported arguments."""
        spacer = MutableSeq("MMMMM")
        # A non-iterable, then an iterable holding a non-accepted type
        for bad_argument in (5, ["ATG", "ATG", 5, "ATG"]):
            self.assertRaises(TypeError, spacer.join, bad_argument)

    def test_join_Seq(self):
        """Check Seq.join gives the same text as str.join with that spacer."""
        self.assertEqual(
            "N" * 15,
            Seq("NNNNN").join([Seq("NNNNN"), Seq("NNNNN")]),
        )

        empty_spacer = Seq("")
        example_strings = ["ATG"] * 4
        example_strings_seqs = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # Plain strings joined with the empty spacer
        self.assertEqual(
            empty_spacer.join(example_strings), "".join(example_strings))

        for spacer in (empty_spacer, Seq("NNNNN"), Seq("GGG")):
            spacer_text = str(spacer)
            # A mixture of strings and Seq objects
            self.assertEqual(
                spacer.join(example_strings_seqs),
                spacer_text.join(example_strings))
            # Now try single sequence arguments, should join the letters
            for target in example_strings + example_strings_seqs:
                self.assertEqual(
                    spacer_text.join(str(target)), str(spacer.join(target)))

    def test_join_UnknownSeq(self):
        """Check UnknownSeq.join gives the same text as str.join would."""
        dashes = UnknownSeq(5, character="-")
        empty_spacer = UnknownSeq(0, character="-")

        # Two UnknownSeq objects, then a Seq followed by an UnknownSeq
        both_unknown = [
            UnknownSeq(5, character="-"),
            UnknownSeq(5, character="-"),
        ]
        self.assertEqual("-" * 15, dashes.join(both_unknown))
        self.assertEqual(
            "N" * 5 + "-" * 10,
            dashes.join([Seq("NNNNN"), UnknownSeq(5, character="-")]),
        )

        example_strings = ["ATG"] * 4
        example_strings_seqs = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # Plain strings joined with the empty spacer
        self.assertEqual(
            empty_spacer.join(example_strings), "".join(example_strings))

        for spacer in (dashes, empty_spacer):
            spacer_text = str(spacer)
            # A mixture of strings and Seq objects
            self.assertEqual(
                spacer.join(example_strings_seqs),
                spacer_text.join(example_strings))
            # Now try single sequence arguments, should join the letters
            for target in example_strings + example_strings_seqs:
                self.assertEqual(
                    spacer_text.join(str(target)), str(spacer.join(target)))

    def test_join_MutableSeq_mixed(self):
        """Check MutableSeq objects can be joined."""
        spacer = MutableSeq("NNNNN")
        # Two MutableSeq objects joined by a MutableSeq spacer
        self.assertEqual(
            "N" * 15,
            spacer.join([MutableSeq("NNNNN"),
                         MutableSeq("NNNNN")]),
        )
        # NOTE(review): join() is called right here, before assertRaises runs,
        # so assertRaises is handed the value join() returned instead of a
        # callable plus its arguments. A TypeError raised by join() itself
        # would escape as a test error rather than being caught. Confirm
        # whether `self.assertRaises(TypeError, spacer.join, [...])` was meant.
        self.assertRaises(
            TypeError,
            spacer.join([Seq("NNNNN"), MutableSeq("NNNNN")]),
        )

    def test_join_Seq_with_file(self):
        """Checks if Seq join correctly concatenates sequence from a file with the spacer.

        The sequences are read from a FASTA file and joined with a five
        letter spacer and with an empty spacer; both results are compared
        with str.join on the same data.
        """
        filename = "Fasta/f003"
        seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")]
        seqlist_as_strings = [str(_) for _ in seqlist]

        spacer = Seq("NNNNN")
        spacer1 = Seq("")
        # seq objects with spacer
        seq_concatenated = spacer.join(seqlist)
        # seq objects with empty spacer
        seq_concatenated1 = spacer1.join(seqlist)

        # Reference results from plain string joins
        ref_data = str(spacer).join(seqlist_as_strings)
        ref_data1 = str(spacer1).join(seqlist_as_strings)

        self.assertEqual(seq_concatenated, ref_data)
        self.assertEqual(seq_concatenated1, ref_data1)
        # The parser yields records, not sequences, so join must refuse them
        with self.assertRaises(TypeError):
            spacer.join(SeqIO.parse(filename, "fasta"))

    def test_join_UnknownSeq_with_file(self):
        """Checks if UnknownSeq join correctly concatenates sequence from a file with the spacer.

        The sequences are read from a FASTA file and joined with an empty
        spacer and with a five letter spacer; both results are compared with
        str.join on the same data.
        """
        filename = "Fasta/f003"
        seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")]
        seqlist_as_strings = [str(_) for _ in seqlist]

        spacer = UnknownSeq(0, character="-")
        spacer1 = UnknownSeq(5, character="-")
        # seq objects with empty spacer
        seq_concatenated = spacer.join(seqlist)
        # seq objects with spacer
        seq_concatenated1 = spacer1.join(seqlist)

        # Reference results from plain string joins
        ref_data = str(spacer).join(seqlist_as_strings)
        ref_data1 = str(spacer1).join(seqlist_as_strings)

        self.assertEqual(seq_concatenated, ref_data)
        self.assertEqual(seq_concatenated1, ref_data1)
        # The parser yields records, not sequences, so join must refuse them
        with self.assertRaises(TypeError):
            spacer.join(SeqIO.parse(filename, "fasta"))

    def test_join_MutableSeq(self):
        """Check MutableSeq.join gives the same text as str.join would."""
        # Only expect it to take Seq objects and/or strings in an iterable!

        empty_spacer = MutableSeq("")
        example_strings = ["ATG"] * 4
        example_strings_seqs = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # Plain strings joined with the empty spacer
        self.assertEqual(
            empty_spacer.join(example_strings), "".join(example_strings))

        # A mixture of strings and Seq objects, for each spacer in turn
        for spacer in (empty_spacer, MutableSeq("NNNNN"), MutableSeq("GGG")):
            self.assertEqual(
                spacer.join(example_strings_seqs),
                str(spacer).join(example_strings))

    def test_join_MutableSeq_with_file(self):
        """Checks if MutableSeq join correctly concatenates sequence from a file with the spacer.

        The sequences are read from a FASTA file and joined with a five
        letter spacer and with an empty spacer; both results are compared
        with str.join on the same data.
        """
        filename = "Fasta/f003"
        seqlist = [record.seq for record in SeqIO.parse(filename, "fasta")]
        seqlist_as_strings = [str(_) for _ in seqlist]

        spacer = MutableSeq("NNNNN")
        spacer1 = MutableSeq("")
        # seq objects with spacer
        seq_concatenated = spacer.join(seqlist)
        # seq objects with empty spacer
        seq_concatenated1 = spacer1.join(seqlist)

        # Reference results from plain string joins
        ref_data = str(spacer).join(seqlist_as_strings)
        ref_data1 = str(spacer1).join(seqlist_as_strings)

        self.assertEqual(seq_concatenated, ref_data)
        self.assertEqual(seq_concatenated1, ref_data1)
        # The parser yields records, not sequences, so join must refuse them
        with self.assertRaises(TypeError):
            spacer.join(SeqIO.parse(filename, "fasta"))

    def test_equality(self):
        """Test equality of sequence objects with str, int and None."""
        # Seq and MutableSeq equal a matching string, but never an int or None
        for seq_type in (Seq, MutableSeq):
            self.assertEqual(seq_type("6"), "6")
            self.assertNotEqual(seq_type("6"), 6)
            self.assertEqual(seq_type(""), "")
            self.assertNotEqual(seq_type(""), None)
            self.assertEqual(seq_type("None"), "None")
            self.assertNotEqual(seq_type("None"), None)

        # UnknownSeq, with a single letter and with no letters
        single = UnknownSeq(1, character="6")
        self.assertEqual(single, "6")
        self.assertNotEqual(UnknownSeq(1, character="6"), 6)
        self.assertEqual(UnknownSeq(0), "")
        self.assertNotEqual(UnknownSeq(0), None)
Exemplo n.º 31
0
 def setUp(self):
     """Create the alphabet, a four letter genome and the test organism."""
     alphabet = TestAlphabet()
     genome = MutableSeq("1234", alphabet)
     self.alphabet = alphabet
     self.genome = genome
     self.organism = Organism.Organism(genome, fitness_calculator)
Exemplo n.º 32
0
 def test_tomutable(self):
     """Check a MutableSeq can be built from each example and equals it."""
     for source in self._examples:
         converted = MutableSeq(source)
         self.assertIsInstance(converted, MutableSeq)
         self.assertEqual(converted, source)
Exemplo n.º 33
0
    str_light_chain_one, str_light_chain_two,
    "ATGCGTATCGATCGCGATACGATTAGGCGGAT"
]


def u_crc32(seq):
    """Return the CRC-32 checksum of seq as an unsigned 32-bit integer."""
    # Python 2's crc32 may hand back a signed int while Python 3 always
    # returns an unsigned one; the docs recommend masking with 0xffffffff
    # so both give the same value.
    checksum = crc32(seq)
    return checksum & 0xffffffff


# For every example sequence, print each checksum/complexity value computed
# from the plain string and assert that Seq and MutableSeq inputs give the
# same value.
for i, seq_str in enumerate(examples):
    print "Example %i, length %i, %s..." % (i + 1, len(seq_str), seq_str[:10])

    # Avoid cross-platform differences when printing floats by formatting
    # them explicitly to two decimal places.
    def simple_LCC(s):
        return "%0.2f" % lcc_simp(s)

    def windowed_LCC(s):
        # lcc_mult is called with a second argument of 20; the values it
        # returns are joined into one comma-separated string.
        return ", ".join(["%0.2f" % v for v in lcc_mult(s, 20)])

    for checksum in [u_crc32, crc64, gcg, seguid, simple_LCC, windowed_LCC]:
        # First using a string:
        value = checksum(seq_str)
        print " %s = %s" % (checksum.__name__, value)
        # Secondly check it works with a Seq object
        assert value == checksum(Seq(seq_str, single_letter_alphabet))
        # Finally check it works with a MutableSeq object
        assert value == checksum(MutableSeq(seq_str, single_letter_alphabet))
Exemplo n.º 34
0
    def create_clusters_from_bowtie(self):
        """
        Read OTU membership from self.otu_txt, then create one cluster entry
        per record of the BowTie file self.input_bowtie.

        The 'offset' field is actually 'abundance'
        The 'ref' field is actually 'cycle' offset
        """
        # Each OTU line is "<otuid> <member> <member> ..."; remember which OTU
        # every member belongs to and start an empty cluster dict per OTU.
        with open(self.otu_txt) as handle:
            for row in handle:
                otuid, members = row.strip().split(None, 1)
                for member in members.split():
                    self.otu_info[member] = otuid
                self.cluster_by_otu[otuid] = {}

        for rec in BowTieReader(self.input_bowtie, False):
            cid = rec['ID']
            otuid = self.otu_info[rec['ID']]

            entry = {'dirty': True, 'cids': [cid]}
            entry['len'] = len(rec['seq'])
            entry['seq'] = MutableSeq(rec['seq'])
            entry['size'] = int(rec['offset'])
            # Quality characters are converted to integers by subtracting 33.
            entry['qual'] = [ord(ch) - 33 for ch in rec['qual']]
            # One cycle number per base, counted up from the 'ref' value.
            entry['cycle'] = range(int(rec['ref']), int(rec['ref']) + len(rec['seq']))

            self.cluster_by_otu[otuid][cid] = entry
Exemplo n.º 35
0
    def test_count_overlap_start_end_NN(self):
        """Check our count_overlap method using NN with variable ends and starts."""
        # Testing Seq() and MutableSeq() with variable start and end arguments
        # Each tuple is (start, end, expected count); the test sequence below
        # contains no "N", so every expected count is 0.
        start_end_exp = [
            (1, 7, 0),
            (3, None, 0),
            (3, 6, 0),
            (4, 6, 0),
            (4, -1, 0),
            (-5, None, 0),
            (-5, 7, 0),
            (7, -5, 0),
            (-100, None, 0),
            (None, 100, 0),
            (-100, 1000, 0),
        ]

        testing_seq = "GTAGGGGAG"

        for start, end, exp in start_end_exp:
            self.assertEqual(
                Seq(testing_seq).count_overlap("NN", start, end), exp)
            self.assertEqual(
                MutableSeq(testing_seq).count_overlap("NN", start, end), exp)

        # Testing Seq() and MutableSeq() with a more heterogeneous sequence
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN"), 0)
        self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("NN"), 0)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", 2, 8), 0)
        self.assertEqual(
            MutableSeq("GGGTGGTAGGG").count_overlap("NN", 2, 8), 0)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", -11, 6), 0)
        self.assertEqual(
            MutableSeq("GGGTGGTAGGG").count_overlap("NN", -11, 6), 0)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", 7, 2), 0)
        self.assertEqual(
            MutableSeq("GGGTGGTAGGG").count_overlap("NN", 7, 2), 0)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("NN", -10, -2), 0)

        # Testing UnknownSeq() with variable start and end arguments
        # Each tuple is (alphabet, character, start, end, expected count) for
        # an UnknownSeq of length 12.
        alphabet_char_start_end_exp = [
            (generic_rna, "N", 1, 7, 5),
            (generic_dna, "N", 1, 7, 5),
            (generic_rna, "N", -4, None, 3),
            (generic_dna, "N", -4, None, 3),
            (generic_protein, "X", 1, 7, 0),
        ]

        for alpha, char, start, end, exp in alphabet_char_start_end_exp:
            self.assertEqual(
                UnknownSeq(12, alpha, char).count_overlap("NN", start, end),
                exp)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0)

        # Testing UnknownSeq() with some more cases including unusual edge cases
        # Each tuple is (substring, start, end, expected count) for an
        # UnknownSeq of seven "N" characters.
        substr_start_end_exp = [
            ("N", 100, 105, 0),
            ("N", -1, 4, 0),
            ("N", 4, -1, 2),
            ("N", -8, -2, 5),
            ("N", -2, -8, 0),
            ("N", 8, 2, 0),
            ("N", 2, 8, 5),
            ("NN", 8, 2, 0),
            ("NN", 2, 8, 4),
            ("NN", -5, -1, 3),
            ("NN", 1, 5, 3),
            ("NNN", None, None, 5),
            ("NNNNNNNNN", None, None, 0),
            ("NNN", 1, 2, 0),
        ]

        for substr, start, end, exp in substr_start_end_exp:
            self.assertEqual(
                UnknownSeq(7, character="N").count_overlap(substr, start, end),
                exp)
        # Same sequence with only a start argument supplied.
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("NN", 1), 5)
Exemplo n.º 36
0
 def get_optimal_alignment(self):
     """Follow the traceback to get the optimal alignment.

     Starts at the dpmatrix cell indexed by (len(seq1), len(seq2)) and walks
     the chain of get_parent() links, collecting one item (or a gap) per
     step for each sequence.

     Returns a tuple of the two aligned sequences, each converted with
     toseq().
     """
     # initialize the two sequences which will return the alignment
     align_seq1 = MutableSeq(array.array("c"), 
       Alphabet.Gapped(IUPAC.protein, GAP_CHAR))
     align_seq2 = MutableSeq(array.array("c"), 
       Alphabet.Gapped(IUPAC.protein, GAP_CHAR))
       
     # take care of the initial case with the bottom corner matrix
     # item
     current_cell = self.dpmatrix[(len(self.seq1), len(self.seq2))]
     align_seq1.append(current_cell.seq1item)
     align_seq2.append(current_cell.seq2item)
     
     # step once up the parent chain before entering the loop, so that
     # current_cell/next_cell always form a (child, parent) pair
     next_cell = current_cell.get_parent()
     current_cell = next_cell
     next_cell = current_cell.get_parent()
     
     # keep adding sequence until we reach (0, 0)
     while next_cell:
         # add the new sequence--three cases:
         # 1. Move up diagonally, add a new seq1 and seq2 to the 
         # aligned sequences
         if ((next_cell.col_pos == current_cell.col_pos - 1) and
           (next_cell.row_pos == current_cell.row_pos - 1)):
             # print "case 1 -> seq1 %s, seq2 %s" % (
             # current_cell.seq1item, current_cell.seq2item)
             align_seq1.append(current_cell.seq1item)
             align_seq2.append(current_cell.seq2item)
         # 2. Move upwards, add a new seq2 and a gap in seq1
         elif ((next_cell.col_pos  == current_cell.col_pos) and
           (next_cell.row_pos == current_cell.row_pos - 1)):
             #print "case 2 -> seq2 %s" % current_cell.seq2item
             align_seq1.append(GAP_CHAR)
             align_seq2.append(current_cell.seq2item)
         # 3. Move to the right, add a new seq1 and a gap in seq2
         elif ((next_cell.col_pos == current_cell.col_pos - 1) and
           (next_cell.row_pos == current_cell.row_pos)):
             #print "case 3 -> seq1 % s" % current_cell.seq1item
             align_seq1.append(current_cell.seq1item)
             align_seq2.append(GAP_CHAR)
         
         # now move on to the next sequence
         current_cell = next_cell
         next_cell = current_cell.get_parent()
     
     # reverse the returned alignments since we are reading them in
     # backwards
     align_seq1.reverse()
     align_seq2.reverse()
     return align_seq1.toseq(), align_seq2.toseq()
Exemplo n.º 37
0
class TestMutableSeq(unittest.TestCase):
    """Tests for MutableSeq construction, comparison, editing and slicing."""

    def setUp(self):
        """Create a Seq and a MutableSeq holding the same 18 letters."""
        self.s = Seq.Seq("TCAAAAGGATGCATCATG", IUPAC.unambiguous_dna)
        self.mutable_s = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna)

    def test_mutableseq_creation(self):
        """Test creating MutableSeqs in multiple ways"""
        mutable_s = MutableSeq("TCAAAAGGATGCATCATG", IUPAC.ambiguous_dna)
        self.assertIsInstance(mutable_s, MutableSeq, "Creating MutableSeq")

        mutable_s = self.s.tomutable()
        self.assertIsInstance(mutable_s, MutableSeq, "Converting Seq to mutable")

        array_seq = MutableSeq(array.array(array_indicator, "TCAAAAGGATGCATCATG"),
                               IUPAC.ambiguous_dna)
        self.assertIsInstance(array_seq, MutableSeq, "Creating MutableSeq using array")

    def test_repr(self):
        """Test repr() of a short MutableSeq"""
        self.assertEqual("MutableSeq('TCAAAAGGATGCATCATG', IUPACAmbiguousDNA())",
                         repr(self.mutable_s))

    def test_truncated_repr(self):
        """Test repr() of a long MutableSeq, which is shortened with '...'"""
        seq = "TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGA"
        expected = "MutableSeq('TCAAAAGGATGCATCATGTCAAAAGGATGCATCATGTCAAAAGGATGCATCATG...GGA', IUPACAmbiguousDNA())"
        self.assertEqual(expected, repr(MutableSeq(seq, IUPAC.ambiguous_dna)))

    def test_equal_comparison(self):
        """Test __eq__ comparison method"""
        self.assertEqual(self.mutable_s, "TCAAAAGGATGCATCATG")

    def test_equal_comparison_of_incompatible_alphabets(self):
        """Test == against an RNA MutableSeq while recording warnings"""
        # Only exercises the comparison; the result itself is not asserted.
        with warnings.catch_warnings(record=True):
            self.mutable_s == MutableSeq('UCAAAAGGA', IUPAC.ambiguous_rna)

    def test_not_equal_comparison(self):
        """Test __ne__ comparison method"""
        self.assertNotEqual(self.mutable_s, "other thing")

    def test_less_than_comparison(self):
        """Test __lt__ comparison method"""
        self.assertTrue(self.mutable_s[:-1] < self.mutable_s)

    def test_less_than_comparison_of_incompatible_alphabets(self):
        """Test < against an RNA MutableSeq while recording warnings"""
        # Only exercises the comparison; the result itself is not asserted.
        with warnings.catch_warnings(record=True):
            self.mutable_s[:-1] < MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)

    def test_less_than_comparison_without_alphabet(self):
        """Test < against a plain string"""
        self.assertTrue(self.mutable_s[:-1] < "TCAAAAGGATGCATCATG")

    def test_less_than_or_equal_comparison(self):
        """Test __le__ comparison method"""
        self.assertTrue(self.mutable_s[:-1] <= self.mutable_s)

    def test_less_than_or_equal_comparison_of_incompatible_alphabets(self):
        """Test <= against an RNA MutableSeq while recording warnings"""
        # Only exercises the comparison; the result itself is not asserted.
        with warnings.catch_warnings(record=True):
            self.mutable_s[:-1] <= MutableSeq("UCAAAAGGAUGCAUCAUG", IUPAC.ambiguous_rna)

    def test_less_than_or_equal_comparison_without_alphabet(self):
        """Test <= against a plain string"""
        self.assertTrue(self.mutable_s[:-1] <= "TCAAAAGGATGCATCATG")

    def test_add_method(self):
        """Test adding wrong type to MutableSeq"""
        with self.assertRaises(TypeError):
            self.mutable_s + 1234

    def test_radd_method(self):
        """Test __radd__ with another MutableSeq"""
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG",
                         self.mutable_s.__radd__(self.mutable_s))

    def test_radd_method_incompatible_alphabets(self):
        """Test __radd__ with an RNA MutableSeq raises TypeError"""
        with self.assertRaises(TypeError):
            self.mutable_s.__radd__(MutableSeq("UCAAAAGGA", IUPAC.ambiguous_rna))

    def test_radd_method_using_seq_object(self):
        """Test __radd__ with a Seq object"""
        self.assertEqual("TCAAAAGGATGCATCATGTCAAAAGGATGCATCATG",
                         self.mutable_s.__radd__(self.s))

    def test_radd_method_wrong_type(self):
        """Test __radd__ with an integer raises TypeError"""
        with self.assertRaises(TypeError):
            self.mutable_s.__radd__(1234)

    def test_as_string(self):
        """Test str() conversion"""
        self.assertEqual("TCAAAAGGATGCATCATG", str(self.mutable_s))

    def test_length(self):
        """Test len()"""
        self.assertEqual(18, len(self.mutable_s))

    def test_converting_to_immutable(self):
        """Test toseq() returns a Seq"""
        self.assertIsInstance(self.mutable_s.toseq(), Seq.Seq)

    def test_first_nucleotide(self):
        """Test indexing the first letter"""
        self.assertEqual('T', self.mutable_s[0])

    def test_setting_slices(self):
        """Test reading a slice and assigning slices from str, MutableSeq and array"""
        self.assertEqual(MutableSeq('CAAA', IUPAC.ambiguous_dna),
                         self.mutable_s[1:5], "Slice mutable seq")

        # Each assignment below works on the result of the previous one.
        self.mutable_s[1:3] = "GAT"
        self.assertEqual(MutableSeq("TGATAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s,
                         "Set slice with string and adding extra nucleotide")

        self.mutable_s[1:3] = self.mutable_s[5:7]
        self.assertEqual(MutableSeq("TAATAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s, "Set slice with MutableSeq")

        self.mutable_s[1:3] = array.array(array_indicator, "GAT")
        self.assertEqual(MutableSeq("TGATTAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s, "Set slice with array")

    def test_setting_item(self):
        """Test assigning a single letter by index"""
        self.mutable_s[3] = "G"
        self.assertEqual(MutableSeq("TCAGAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_deleting_slice(self):
        """Test deleting a one-letter slice"""
        del self.mutable_s[4:5]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_deleting_item(self):
        """Test deleting a single letter by index"""
        del self.mutable_s[3]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_appending(self):
        """Test append()"""
        self.mutable_s.append("C")
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGC", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_inserting(self):
        """Test insert()"""
        self.mutable_s.insert(4, "G")
        self.assertEqual(MutableSeq("TCAAGAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_popping_last_item(self):
        """Test pop() returns the last letter"""
        self.assertEqual("G", self.mutable_s.pop())

    def test_remove_items(self):
        """Test remove() drops the first match and raises ValueError if absent"""
        self.mutable_s.remove("G")
        self.assertEqual(MutableSeq("TCAAAAGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s, "Remove first G")

        self.assertRaises(ValueError, self.mutable_s.remove, 'Z')

    def test_count(self):
        """Test count() for one- and two-letter arguments"""
        self.assertEqual(7, self.mutable_s.count("A"))
        self.assertEqual(2, self.mutable_s.count("AA"))

    def test_index(self):
        """Test index() and its ValueError for a missing value"""
        self.assertEqual(2, self.mutable_s.index("A"))
        self.assertRaises(ValueError, self.mutable_s.index, "8888")

    def test_reverse(self):
        """Test using reverse method"""
        self.mutable_s.reverse()
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_reverse_with_stride(self):
        """Test reverse using -1 stride"""
        self.assertEqual(MutableSeq("GTACTACGTAGGAAAACT", IUPAC.ambiguous_dna),
                         self.mutable_s[::-1])

    def test_complement(self):
        """Test in-place complement() of the DNA sequence"""
        self.mutable_s.complement()
        self.assertEqual(str("AGTTTTCCTACGTAGTAC"), str(self.mutable_s))

    def test_complement_rna(self):
        """Test complement() of a mixed-case RNA sequence"""
        seq = Seq.MutableSeq("AUGaaaCUG", IUPAC.unambiguous_rna)
        seq.complement()
        self.assertEqual(str("UACuuuGAC"), str(seq))

    def test_complement_mixed_aphabets(self):
        """Test complement() raises ValueError when both U and T are present"""
        seq = Seq.MutableSeq("AUGaaaCTG")
        with self.assertRaises(ValueError):
            seq.complement()

    def test_complement_rna_string(self):
        """Test complement() of RNA letters given without an alphabet"""
        seq = Seq.MutableSeq("AUGaaaCUG")
        seq.complement()
        self.assertEqual('UACuuuGAC', str(seq))

    def test_complement_dna_string(self):
        """Test complement() of DNA letters given without an alphabet"""
        seq = Seq.MutableSeq("ATGaaaCTG")
        seq.complement()
        self.assertEqual('TACtttGAC', str(seq))

    def test_reverse_complement(self):
        """Test in-place reverse_complement()"""
        self.mutable_s.reverse_complement()
        self.assertEqual("CATGATGCATCCTTTTGA", str(self.mutable_s))

    def test_reverse_complement_of_protein(self):
        """Test reverse_complement() raises ValueError for a protein alphabet"""
        seq = Seq.MutableSeq("ACTGTCGTCT", Alphabet.generic_protein)
        with self.assertRaises(ValueError):
            seq.reverse_complement()

    def test_to_string_method(self):
        """This method is currently deprecated, probably will need to remove this test soon"""
        with warnings.catch_warnings(record=True):
            self.mutable_s.tostring()

    def test_extend_method(self):
        """Test extend() with a plain string"""
        self.mutable_s.extend("GAT")
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGGAT", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_extend_with_mutable_seq(self):
        """Test extend() with another MutableSeq"""
        self.mutable_s.extend(MutableSeq("TTT", IUPAC.ambiguous_dna))
        self.assertEqual(MutableSeq("TCAAAAGGATGCATCATGTTT", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_delete_stride_slice(self):
        """Test deleting the slice 4:6 - 1"""
        # NOTE(review): "4:6 - 1" evaluates to the plain slice 4:5, not a
        # slice with a stride -- confirm whether "4:6:-1" was intended.
        del self.mutable_s[4:6 - 1]
        self.assertEqual(MutableSeq("TCAAAGGATGCATCATG", IUPAC.ambiguous_dna),
                         self.mutable_s)

    def test_extract_third_nucleotide(self):
        """Test extracting every third nucleotide (slicing with stride 3)"""
        self.assertEqual(MutableSeq("TAGTAA", IUPAC.ambiguous_dna), self.mutable_s[0::3])
        self.assertEqual(MutableSeq("CAGGTT", IUPAC.ambiguous_dna), self.mutable_s[1::3])
        self.assertEqual(MutableSeq("AAACCG", IUPAC.ambiguous_dna), self.mutable_s[2::3])

    def test_set_wobble_codon_to_n(self):
        """Test setting wobble codon to N (set slice with stride 3)"""
        self.mutable_s[2::3] = "N" * len(self.mutable_s[2::3])
        self.assertEqual(MutableSeq("TCNAANGGNTGNATNATN", IUPAC.ambiguous_dna),
                         self.mutable_s)
Exemplo n.º 38
0
 def test_reverse_complement_mutable_seq(self):
     """Reverse-complement a SeqRecord whose sequence is a MutableSeq."""
     record = SeqRecord(MutableSeq("ACTG"))
     flipped = record.reverse_complement()
     self.assertEqual("CAGT", str(flipped.seq))
Exemplo n.º 39
0
##                    checkSynonymity(resultDict[gene])
##
##                except:
##                    pass
                
                positionList = []
                for sub_element, value in resultDict[gene].items():
                    
                    converted = classifydict(value)
                    for nucleotide, valuen in converted.items():
                        if int(valuen[0]) > 0 and float(valuen[2]) < float(args.minqual):
                            count +=1


                            # now check for synonymity,  move this to functions
                            alternativeSeq = MutableSeq(str(element.seq[start:stop]), generic_dna)
##                            print nucleotide
                            alternativeSeq = mutateSequence(alternativeSeq,sub_element,nucleotide,start)
                            alternativeSeq = Seq(str(alternativeSeq), generic_dna)
                            if len(alternativeSeq)%3 != 0:
                                overlap = len(alternativeSeq)%3
                                alternativeSeq = alternativeSeq[:-int(overlap)]
                            
                            altprot = alternativeSeq.translate()
                            altprot = list2dict(altprot[0:len(protein)],0)

                            protposition = int((sub_element-start)/3)
                            try:
                                if protein[protposition] != altprot[protposition]:
                                    positionList.append(sub_element)
                                    synonym = 'NonSynon'
Exemplo n.º 40
0
				leaf_names = [leaf.name for leaf in leaves]
				z = 'ancestral'
			for l in leaf_names:
				seqLIST.append([l, [RCstart, RCstop, z]])

# Build dictionary of recombinant regions
# (each seqLIST item is a [name, [start, stop, label]] pair; the region lists
# are grouped under their name)
for k,v in seqLIST:
	d[k].append(v)

# Mask recombination in sequences
seqALN = []
for record in SeqIO.parse(args.aln, 'fasta'):
	msg('Reading {} ... '.format(record.id))
	seqlen = len(record.seq)
	regions = d.get(record.id, None)
	# Work on a mutable copy so slices can be overwritten in place.
	newrec = MutableSeq(str(record.seq))
	if regions:
		for a in regions:
			# Region start is shifted down by one to form a Python slice
			# together with the unchanged end value.
			start = int(a[0]) - 1
			end = int(a[1])
			lenMASK = end - start
			# Overwrite the region with the masking symbol repeated to the
			# region's length.
			newrec[start:end] = (args.symbol)*lenMASK
	seqALN.append(SeqRecord(Seq(str(newrec)), record.id, description=''))

# Write masked alignment to file
msg('Writing masked alignment to {} ... '.format(args.out))
SeqIO.write(seqALN, args.out, 'fasta')

# Write recombinant regions to file
if args.regions:
	with open(args.regions, 'w') as csvfile:
Exemplo n.º 41
0
                if VERBOSE >= 1:
                    print pname+':', gene, 'not found or with problems: skipping.'
                continue
    
            gene_pos = gene_poss[gene]
            aft_der_gene = np.concatenate([aft_der[:, :, exon_pos[0]: exon_pos[1]]
                                           for exon_pos in gene_pos], axis=2)
            conss_gene = gene_seqs[gene]
            gene_len = len(conss_gene)

            hist += np.histogram(aft_der_gene.ravel(), bins=bins, density=False)[0]

            # Collect counts syn/nonsyn
            nu_syn = []
            nu_nonsyn = []
            cod_anc = MutableSeq('AAA', unambiguous_dna)
            cod_new = MutableSeq('AAA', unambiguous_dna)
            for j in xrange(gene_len // 3):
                for jcod in xrange(3):
                    for ai in xrange(4):
                        cod_anc[:] = conss_gene[3 * j: 3 * (j+1)]
                        # Ancestral allele, skip (we only look at propagation of MINOR alleles)
                        if alpha[ai] == cod_anc[jcod]:
                            continue
    
                        cod_new[:] = conss_gene[3 * j: 3 * (j+1)]
                        cod_new[jcod] = alpha[ai]
    
                        aftmp = aft_der_gene[:, ai, j + jcod]
                        aftmp = aftmp[(aftmp >= bins[0]) & (aftmp <= bins[-1])]
                        if not len(aftmp):
Exemplo n.º 42
0
seq=Seq('ATGGTCTTTCCAGACGCG',IUPAC.unambiguous_dna)
print Seq.transcribe(seq)	# called as a function; above it was called as a method

print seq[:5]	# slicing works as it does for a string
print len(seq)
#seq[0]='C'	# not allowed: Seq objects are not mutable
st=str(seq)		# convert to a plain string
print st

# Editable (mutable) sequence data type
from Bio.Seq import MutableSeq
mut_seq=seq.tomutable()	# convert it to the mutable sequence type
print mut_seq
mut_seq[0]='C'
print mut_seq
mut_seq=MutableSeq('ATGCCG',IUPAC.IUPACUnambiguousDNA())
# has list-like methods: append(), insert(), pop(), remove()
mut_seq[1:3]='TTT'
mut_seq.reverse()
mut_seq.complement()
print mut_seq
mut_seq.reverse_complement()
print mut_seq

# Sequence metadata data type
from Bio.SeqRecord import SeqRecord
seqrec=SeqRecord(seq,id='001', name='My Secuencia')
# 2 main attributes:
#	id: string identifier, optional, recommended
#	seq: Seq object, required
# additional attributes
Exemplo n.º 43
0
    def viterbi(self, sequence, state_alphabet):
        """Calculate the most probable state path using the Viterbi algorithm.

        This implements the Viterbi algorithm (see pgs 55-57 in Durbin et
        al for a full explanation -- this is where I took my implementation
        ideas from), to allow decoding of the state path, given a sequence
        of emissions.

        Arguments:

        o sequence -- A Seq object with the emission sequence that we
        want to decode.

        o state_alphabet -- The alphabet of the possible state sequences
        that can be generated.

        Returns a tuple of the decoded state path (the traceback converted
        with toseq()) and the score of that path, which is a sum of
        log-transformed probabilities.
        """
        # calculate logarithms of the transition and emission probs
        log_trans = self._log_transform(self.transition_prob)
        log_emission = self._log_transform(self.emission_prob)

        viterbi_probs = {}
        pred_state_seq = {}
        state_letters = state_alphabet.letters
        # --- initialization
        #
        # NOTE: My index numbers are one less than what is given in Durbin
        # et al, since we are indexing the sequence going from 0 to
        # (Length - 1) not 1 to Length, like in Durbin et al.
        #
        # NOTE(review): 1 and 0 below look like linear-space probabilities,
        # while every other term is log-transformed -- confirm whether
        # log(1) = 0 and log(0) = -inf were intended. Left unchanged here.
        #
        # v_{0}(0) = 1
        viterbi_probs[(state_letters[0], -1)] = 1
        # v_{k}(0) = 0 for k > 0
        for state_letter in state_letters[1:]:
            viterbi_probs[(state_letter, -1)] = 0

        # --- recursion
        # loop over the training sequence (i = 1 .. L)
        for i in range(0, len(sequence)):
            # now loop over all of the letters in the state path
            for main_state in state_letters:
                # e_{l}(x_{i})
                emission_part = log_emission[(main_state, sequence[i])]

                # loop over all possible states
                possible_state_probs = {}
                for cur_state in self.transitions_from(main_state):
                    # a_{kl}
                    trans_part = log_trans[(cur_state, main_state)]

                    # v_{k}(i - 1)
                    viterbi_part = viterbi_probs[(cur_state, i - 1)]
                    cur_prob = viterbi_part + trans_part

                    possible_state_probs[cur_state] = cur_prob

                # finally calculate the viterbi probability using the max
                max_prob = max(possible_state_probs.values())
                viterbi_probs[(main_state, i)] = (emission_part + max_prob)

                # now get the most likely state
                for state in possible_state_probs:
                    if possible_state_probs[state] == max_prob:
                        pred_state_seq[(i - 1, main_state)] = state
                        break

        # --- termination
        # calculate the probability of the state path
        # loop over all letters
        all_probs = {}
        for state in state_letters:
            # v_{k}(L)
            viterbi_part = viterbi_probs[(state, len(sequence) - 1)]
            # a_{k0}
            transition_part = log_trans[(state, state_letters[0])]

            # Both terms are logarithms, so the product of the underlying
            # probabilities is their sum (multiplying two logs is wrong).
            all_probs[state] = viterbi_part + transition_part

        state_path_prob = max(all_probs.values())

        # find the last pointer we need to trace back from
        last_state = ''
        for state in all_probs:
            if all_probs[state] == state_path_prob:
                last_state = state

        assert last_state != '', "Didn't find the last state to trace from!"

        # --- traceback
        traceback_seq = MutableSeq('', state_alphabet)

        cur_state = last_state
        # reversed(range(...)) works on both Python 2 and Python 3; a
        # Python 3 range object has no reverse() method.
        for i in reversed(range(len(sequence))):
            traceback_seq.append(cur_state)

            cur_state = pred_state_seq[(i - 1, cur_state)]

        # put the traceback sequence in the proper orientation
        traceback_seq.reverse()

        return traceback_seq.toseq(), state_path_prob
Exemplo n.º 44
0
    def viterbi(self, sequence, state_alphabet):
        """Calculate the most probable state path using the Viterbi algorithm.

        This implements the Viterbi algorithm (see pgs 55-57 in Durbin et
        al for a full explanation -- this is where I took my implementation
        ideas from), to allow decoding of the state path, given a sequence
        of emissions.

        Arguments:

        o sequence -- A Seq object with the emission sequence that we
        want to decode.

        o state_alphabet -- The alphabet of the possible state sequences
        that can be generated.

        Returns a tuple of the decoded state path (the traceback converted
        with toseq()) and the score of that path, which is a sum of
        log-transformed probabilities.
        """

        # calculate logarithms of the initial, transition, and emission probs
        log_initial = self._log_transform(self.initial_prob)
        log_trans = self._log_transform(self.transition_prob)
        log_emission = self._log_transform(self.emission_prob)

        viterbi_probs = {}
        pred_state_seq = {}
        state_letters = state_alphabet.letters

        # --- recursion
        # loop over the training sequence (i = 1 .. L)
        # NOTE: My index numbers are one less than what is given in Durbin
        # et al, since we are indexing the sequence going from 0 to
        # (Length - 1) not 1 to Length, like in Durbin et al.
        for i in range(0, len(sequence)):
            # loop over all of the possible i-th states in the state path
            for cur_state in state_letters:
                # e_{l}(x_{i})
                emission_part = log_emission[(cur_state, sequence[i])]

                if i == 0:
                    # for the first state, use the initial probability rather
                    # than looking back to previous states
                    max_prob = log_initial[cur_state]
                else:
                    # loop over all possible (i-1)-th previous states
                    possible_state_probs = {}
                    for prev_state in self.transitions_to(cur_state):
                        # a_{kl}
                        trans_part = log_trans[(prev_state, cur_state)]

                        # v_{k}(i - 1)
                        viterbi_part = viterbi_probs[(prev_state, i - 1)]
                        cur_prob = viterbi_part + trans_part

                        possible_state_probs[prev_state] = cur_prob

                    # calculate the viterbi probability using the max
                    max_prob = max(possible_state_probs.values())

                # v_{k}(i)
                viterbi_probs[(cur_state, i)] = (emission_part + max_prob)

                if i > 0:
                    # get the most likely prev_state leading to cur_state
                    for state in possible_state_probs:
                        if possible_state_probs[state] == max_prob:
                            pred_state_seq[(i - 1, cur_state)] = state
                            break

        # --- termination
        # calculate the probability of the state path
        # loop over all states
        all_probs = {}
        for state in state_letters:
            # v_{k}(L)
            all_probs[state] = viterbi_probs[(state, len(sequence) - 1)]

        state_path_prob = max(all_probs.values())

        # find the last pointer we need to trace back from
        last_state = ''
        for state in all_probs:
            if all_probs[state] == state_path_prob:
                last_state = state

        assert last_state != '', "Didn't find the last state to trace from!"

        # --- traceback
        traceback_seq = MutableSeq('', state_alphabet)

        # last_state is the last state in the most probable state sequence.
        # Compute that sequence by walking backwards in time. From the i-th
        # state in the sequence, find the (i-1)-th state as the most
        # probable state preceding the i-th state.
        state = last_state
        traceback_seq.append(state)
        # reversed(range(...)) works on both Python 2 and Python 3; a
        # Python 3 range object has no reverse() method.
        for i in reversed(range(1, len(sequence))):
            state = pred_state_seq[(i - 1, state)]
            traceback_seq.append(state)

        # put the traceback sequence in the proper orientation
        traceback_seq.reverse()

        return traceback_seq.toseq(), state_path_prob
Exemplo n.º 45
0
#print gene
#YAAX = yaaX.translate(table='Bacterial', cds=True, to_stop=True)
#print YAAX

#playing with codon usage tables
#from Bio.Data import CodonTable
#standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
#mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]
#print standard_table

#mutable seq objects
from Bio.Seq import Seq
from Bio.Seq import MutableSeq
from Bio.Alphabet import IUPAC
#my_seq = Seq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
#mutable_seq = my_seq.tomutable()
#Or just create a mutable seq!
my_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
print my_seq
#my_seq_div = my_seq
#my_seq_div[5:8] = 'tag' #how to do insertions????????  only can replace as many characters as indicated.  wait it works now.  
#why 5:8?
#print my_seq #why does this print as my_seq_div with SNP?  
#print my_seq_div
#my_seq_del = my_seq_div.remove("T")
#print my_seq_del
my_seq_rev = my_seq.reverse() #should be able to do my_seq.reverse_complement() as well
print my_seq_rev #this should be working, but it returning None

fin_seq = my_seq_div.toseq() #converts back to immutable Seq Object
# How to remove all Ts from a sequence using a while loop
from Bio.Seq import MutableSeq  #import MutableSeq (mutable sequence) object
from Bio.Alphabet import IUPAC  #import IUPAC alphabets
# Create a MutableSeq object called mutable_seq
# The code below also works without an alphabet argument
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA",
                         IUPAC.unambiguous_dna)
while "T" in mutable_seq:  #as long as there are Ts in mutable_seq...
    mutable_seq.remove("T")  #remove the next T
print(mutable_seq)  #show me the result
Exemplo n.º 47
0
    "AATCGTGGCTATTACTGGGATGGAGGTCACTGGCGCGACCACGGCTGGTGGAAACAACAT" +
    "TATGAATGGCGAGGCAATCGCTGGCACCTACACGGACCGCCGCCACCGCCGCGCCACCAT" +
    "AAGAAAGCTCCTCATGATCATCACGGCGGTCATGGTCCAGGCAAACATCACCGCTAA",
    generic_dna)

# Translate the gene with the "Bacterial" table, first as-is and then as a
# complete coding sequence (cds=True).
print(gene.translate(table="Bacterial"))
print(gene.translate(table="Bacterial", cds=True))

## View the codon tables
from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_id[2]

print(standard_table)
print(mito_table.start_codons)
print(mito_table.stop_codons)
print(mito_table.forward_table["ACG"])

## Mutable objects
from Bio.Seq import MutableSeq
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
print(mutable_seq)
# Replace a single letter by index.
mutable_seq[5] = "C"
print(mutable_seq)
# remove() and reverse() are called for their in-place effect; the sequence
# is printed again after each call.
mutable_seq.remove("T")
print(mutable_seq)
mutable_seq.reverse()
print(mutable_seq)
# Convert back with toseq().
new_seq = mutable_seq.toseq()
print(new_seq)
Exemplo n.º 48
0
 def __init__(self, seqFile, format="fasta"):
     """Read every record of seqFile and store it with an editable sequence.

     seqFile and format are handed straight to SeqIO.parse.  Each parsed
     record gets its ``seq`` replaced by a MutableSeq built from
     ``seq.tostring()`` and is then appended to this object.
     NOTE(review): the enclosing class is not visible here; it presumably
     behaves like a list, since ``self.append`` is used -- confirm.
     """
     records = SeqIO.parse(seqFile, format)
     for record in records:
         editable = MutableSeq(record.seq.tostring())
         record.seq = editable
         self.append(record)
Exemplo n.º 49
0
class MuGen(object):
    """Apply substitutions, insertions and deletions to a reference sequence.

    A MuGen object keeps a reference string (``ref``) and a haplotype
    (``seq``, a MutableSeq) derived from it.  Changes are made at explicitly
    listed positions (``posmu``), at random according to per-letter
    probabilities (``probmu``), or both (``hapchanger``).

    Constructor arguments:
    seq -- a string, Seq or MutableSeq holding the reference sequence.
    alphaproperty -- alphabet object used when ``seq`` is a plain string
        (a default ``Alphabet()`` is used when omitted).
    insertprob, deleteprob, muprob -- dictionaries mapping a letter of the
        sequence to the probability (between 0 and 1) of an insertion right
        after it, of its deletion, or of its substitution.  Letters that are
        missing get probability 0.
    mualphabet -- dictionary mapping a letter to a string holding the
        letters it may be substituted with.
    mupos, delpos, inpos -- 0-based positions on the reference at which a
        substitution, a deletion or an insertion is forced.
    verbose -- when True, warnings and progress messages are printed.

    The sequence is case sensitive.  A CustomException raised while
    constructing the object is printed and the interpreter exits with
    status 2.
    """

    def __init__(self, seq, alphaproperty=None, insertprob=None,
                 deleteprob=None, mualphabet=None,
                 muprob=None, mupos=None, delpos=None, inpos=None,
                 verbose=False):
        try:
            # History of the changes applied to the reference; needed to
            # write haplotypes in the format of haplotyping software.
            self._reset_history()
            if not isinstance(verbose, bool):
                raise CustomException(
                    "ERROR: verbose must be set to either True or False. "
                    "Default is to False")
            self.verbose = verbose
            if isinstance(seq, str):
                if alphaproperty is None:
                    if self.verbose:
                        print("WARNING: No alphabet type is specified for "
                              "the sequence string!")
                    self.alphaproperty = Alphabet()
                else:
                    self.alphaproperty = alphaproperty
                self.seq = MutableSeq(seq, self.alphaproperty)
            elif isinstance(seq, Seq):
                self.alphaproperty = seq.alphabet
                self.seq = seq.tomutable()
            elif isinstance(seq, MutableSeq):
                self.alphaproperty = seq.alphabet
                self.seq = copy.deepcopy(seq)
            else:
                raise CustomException(
                    "ERROR: Should provide a Seq or MutableSeq object, \n "
                    "or a string sequence!")
            self.alphabet = set(str(self.seq))
            self.ref = str(self.seq)
            # Forced positions are checked in the order deletion, insertion,
            # mutation so the first reported error is predictable.
            for attr, label, positions in (("delpos", "Deletion", delpos),
                                           ("inpos", "Insertion", inpos),
                                           ("mupos", "Mutation", mupos)):
                if not positions:
                    setattr(self, attr, [])
                elif self._positions_fit(positions, len(self.ref)):
                    setattr(self, attr, list(positions))
                else:
                    raise CustomException(
                        "ERROR: %s positions exceed the range of the "
                        "reference or are not positive integers!" % label)
            # The mutation alphabet may extend self.alphabet, so it has to
            # be settled before the probability tables are filled in.
            self.mualphabet = self._build_mualphabet(mualphabet)
            self.insertprob = self._normalized_prob(insertprob, "insertion")
            self.deleteprob = self._normalized_prob(deleteprob, "deletion")
            self.muprob = self._normalized_prob(muprob, "mutation")
        except CustomException as instance:
            print(instance)
            sys.exit(2)
        else:
            if self.verbose:
                print("MuGen object successfully created.\n"
                      "WARNING: MuGen sequence is case sensitive!")

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _reset_history(self):
        """Forget the changes recorded by a previous mutation run."""
        self.occureddel = []  # sites deleted in the haplotype
        self.occuredmu = []  # sites where a substitution occurred
        self.occuredins = []  # sites after which a letter was inserted
        self.inserted_allele = []  # base + inserted letter, per insertion
        self.alt_allele = []  # substituted letter, per substitution

    @staticmethod
    def _positions_fit(positions, length):
        """Return True if every position is an index into ``length`` items."""
        return set(positions).issubset(set(range(length)))

    def _other_letters(self, letter):
        """Return the alphabet minus ``letter`` and 'N', joined as a string."""
        return ''.join(self.alphabet - {letter, 'N'})

    def _normalized_prob(self, prob, label):
        """Return a validated per-letter probability dictionary.

        prob -- dictionary given by the caller; an empty or None value
            yields probability 0 for every letter of the alphabet.
        label -- 'insertion', 'deletion' or 'mutation' (used in messages).

        Letters of the alphabet missing from ``prob`` get probability 0.
        NOTE: keys that are not in the alphabet are kept in the result even
        though the warning text says they are ignored; they are never looked
        up for letters of the reference.
        Raises CustomException if a value lies outside 0..1.
        """
        if not prob:
            return dict((letter, 0) for letter in self.alphabet)
        if set(prob.keys()) != self.alphabet and self.verbose:
            print("WARNING: Missing/Invalid letter(s) in %s probability!\n"
                  "Probabilities are set to zero for missing letters! "
                  "Invalid letters are ignored!" % label)
        checked = {}
        for letter, value in prob.items():
            if not (value >= 0 and value <= 1):
                raise CustomException(
                    "ERROR: %s probability must be >=0 and <=1!"
                    % label.capitalize())
            checked[letter] = value
        for letter in self.alphabet - set(checked.keys()):
            checked[letter] = 0
        return copy.deepcopy(checked)

    def _build_mualphabet(self, mualphabet):
        """Return the complete mutation alphabet derived from ``mualphabet``.

        Letters without a specified mutation may mutate to any other letter
        of the alphabet except 'N'.  When the given dictionary mentions
        letters unknown to the sequence alphabet, ``self.alphabet`` is
        extended in place.
        Raises CustomException for keys longer than one letter or for a
        letter listed as its own substitute.
        """
        if not mualphabet:
            if self.verbose:
                print("WARNING: You have specified no mutation alphabet! "
                      "Mutations are set to random letters!")
            return dict((letter, self._other_letters(letter))
                        for letter in self.alphabet)
        mualphabet = dict((str(k), str(v)) for k, v in mualphabet.items())
        for letter, substitutes in mualphabet.items():
            if len(letter) != 1:
                raise CustomException(
                    "ERROR: the mutation alphabet deals with point "
                    "mutations! Only single letters are allowed as keys!")
            if letter in set(substitutes):
                raise CustomException(
                    "ERROR: Wrong mutation values specified! A letter could "
                    "just be substituted with a different letter for "
                    "mutation!")
        keys = set(mualphabet.keys())
        values = set(''.join(mualphabet.values()))
        if keys == self.alphabet and values <= self.alphabet:
            return copy.deepcopy(mualphabet)
        if keys < self.alphabet and values < self.alphabet:
            if self.verbose:
                print("WARNING: Mutation is not specified for some letters! "
                      "Those mutations are set to random letters!")
        else:
            if self.verbose:
                print("WARNING: Mutation alphabet is not compatible with "
                      "sequence alphabet! Both alphabets are updated and\n"
                      "unspecified mutations are set to random letters!")
            # Mutations may introduce novel letters, so the sequence
            # alphabet is widened before the gaps are filled below.
            for letter, substitutes in mualphabet.items():
                self.alphabet.add(letter)
                self.alphabet |= set(substitutes) - self.alphabet
        # Whatever was specified is kept intact; only gaps are filled.
        complete = copy.deepcopy(mualphabet)
        for letter in self.alphabet - set(complete.keys()):
            complete[letter] = self._other_letters(letter)
        return complete

    def _substitute(self, site, base, progeny):
        """Append a substitute for ``base`` to ``progeny`` and record it."""
        progeny.append(random.choice(self.mualphabet.get(base)))
        self.occuredmu.append(site)
        self.alt_allele.append(progeny[-1])

    def _insert(self, site, base, progeny):
        """Append ``base`` plus a random letter to ``progeny``; record it."""
        progeny.append(base)
        progeny.append(random.choice(list(self.alphabet)))
        self.occuredins.append(site)
        self.inserted_allele.append(base + progeny[-1])

    def _apply(self, change, site, base, progeny):
        """Apply 'sub', 'ins' or (anything else) a deletion at ``site``."""
        if change == 'sub':
            self._substitute(site, base, progeny)
        elif change == 'ins':
            self._insert(site, base, progeny)
        else:
            # A deletion is made by simply not copying the base.
            self.occureddel.append(site)

    def _random_change(self, site, base, progeny):
        """Change ``base`` at random according to the probability tables.

        One error type is drawn first (substitution is twice as likely to be
        drawn as insertion or deletion); it is then applied only if a second
        random draw falls below the probability configured for this base.
        """
        # Letters without an entry (e.g. added to the alphabet after the
        # tables were built) are treated as probability 0.
        prob = {'ins': self.insertprob.get(base, 0),
                'del': self.deleteprob.get(base, 0),
                'sub': self.muprob.get(base, 0)}
        error = random.choice(['ins', 'del', 'sub', 'sub'])
        # The draw is truncated to five decimals, as in the original code.
        rnd = float(int(random.random() * 100000)) / 100000
        if rnd < prob[error]:
            self._apply(error, site, base, progeny)
        else:
            progeny.append(base)

    @staticmethod
    def _sorted_by_site(sites, alleles):
        """Return both lists reordered by ascending site."""
        pairs = sorted(zip(sites, alleles), key=lambda pair: pair[0])
        return [site for site, _ in pairs], [allele for _, allele in pairs]

    def _finish(self, progeny):
        """Store ``progeny`` as the new haplotype and sort the history."""
        self.seq = MutableSeq(''.join(progeny), self.alphaproperty)
        self.occuredins, self.inserted_allele = self._sorted_by_site(
            self.occuredins, self.inserted_allele)
        self.occuredmu, self.alt_allele = self._sorted_by_site(
            self.occuredmu, self.alt_allele)
        self.occureddel.sort()

    # ------------------------------------------------------------------
    # Representation
    # ------------------------------------------------------------------
    def __repr__(self):
        template = (
            "Haplotype: %s, \n"
            " Reference sequence: %s, \n"
            " Mutation probabilty: %s, \n"
            " Mutations: %s, \n"
            " Insertion probabilty: %s, \n"
            " Deletion Probability: %s, \n"
            " Insertion positions: %s, \n"
            " Deletion positions: %s, \n"
            " Mutation positions: %s \n")
        return template % (
            self.seq, self.ref,
            self.muprob, self.mualphabet, self.insertprob, self.deleteprob,
            self.inpos, self.delpos, self.mupos)

    def __str__(self):
        return repr(self)

    # ------------------------------------------------------------------
    # Access methods
    # ------------------------------------------------------------------
    def get_hap(self):
        """Return the current haplotype."""
        return self.seq

    def get_ref(self):
        """Return the reference sequence string."""
        return self.ref

    def get_insertprob(self):
        """Return the per-letter insertion probabilities."""
        return self.insertprob

    def get_deleteprob(self):
        """Return the per-letter deletion probabilities."""
        return self.deleteprob

    def get_muprob(self):
        """Return the per-letter substitution probabilities."""
        return self.muprob

    def get_mualphabet(self):
        """Return the mutation alphabet."""
        return self.mualphabet

    def get_mupos(self):
        """Return the forced substitution positions."""
        return self.mupos

    def get_inpos(self):
        """Return the forced insertion positions."""
        return self.inpos

    def get_delpos(self):
        """Return the forced deletion positions."""
        return self.delpos

    def get_occureddelpos(self):
        """Return the sites deleted by the last mutation run."""
        return self.occureddel

    def get_occuredmupos(self):
        """Return the sites substituted by the last mutation run."""
        return self.occuredmu

    def get_occuredinspos(self):
        """Return the sites followed by an insertion in the last run."""
        return self.occuredins

    def get_ins_allele(self):
        """Return the inserted alleles (base + inserted letter)."""
        return self.inserted_allele

    def get_mu_allele(self):
        """Return the substituted letters of the last mutation run."""
        return self.alt_allele

    # ------------------------------------------------------------------
    # Modifier methods
    # ------------------------------------------------------------------
    def set_ref(self, ref):
        """Replace the reference sequence of the MuGen object.

        The new reference must only use letters of the current alphabet and
        must be long enough for every configured mutation, insertion and
        deletion position; otherwise the failure is printed and the old
        reference is kept.  Useful when the reference is a mutable entity
        that other methods and classes keep changing.
        """
        try:
            new_ref = str(ref)
            if not set(new_ref).issubset(self.alphabet):
                raise CustomException(
                    "ERROR: the new reference is not compatible with the "
                    "current alphabet!")
            for label, positions in (("Mutation", self.mupos),
                                     ("Insertion", self.inpos),
                                     ("Deletion", self.delpos)):
                if not self._positions_fit(positions, len(new_ref)):
                    raise CustomException(
                        "ERROR: %s positions exceed the range of the new "
                        "reference!" % label)
            self.ref = new_ref
        except CustomException as instance:
            print("Failed to update the reference!")
            print(instance)
        except:
            print("Failed to update the reference!")
            raise
        else:
            if self.verbose:
                print("The reference sequence has been updated!")

    def set_pos(self, inpos=None, delpos=None, mupos=None):
        """Change the insertion, deletion and substitution positions.

        An argument left at None keeps the current positions.  All given
        arguments are validated before any of them is stored, so a failure
        (printed, not raised) leaves every position list untouched.  Useful
        when posmu and probmu are called repeatedly.
        """
        try:
            accepted = []
            for attr, label, positions in (("delpos", "deletion", delpos),
                                           ("inpos", "insertion", inpos),
                                           ("mupos", "mutation", mupos)):
                if positions is None:  # default: no change
                    continue
                if not self._positions_fit(positions, len(self.ref)):
                    raise CustomException(
                        "ERROR: New %s positions exceed the range of the "
                        "reference or are not positive integers!" % label)
                accepted.append((attr, positions))
            for attr, positions in accepted:
                setattr(self, attr, list(positions))
        except CustomException as instance:
            print("Failed to update indel and mutation positions!")
            print(instance)
        except:
            print("Failed to update indel and mutation positions!")
            raise
        else:
            if self.verbose:
                print("Indel and mutation positions updated!")

    def set_prob(self, insertprob=None, deleteprob=None, muprob=None):
        """Change the insertion, deletion and mutation probabilities.

        An argument left at None keeps the current table; an empty value
        resets the table to 0 for every letter.  Every given table is
        validated (values within 0..1) before any of them is stored, so a
        failure (printed, not raised) leaves all three tables untouched.
        Useful when posmu and probmu are called repeatedly.
        """
        try:
            updates = []
            for attr, label, prob in (("insertprob", "insertion", insertprob),
                                      ("deleteprob", "deletion", deleteprob),
                                      ("muprob", "mutation", muprob)):
                if prob is not None:  # default: no change
                    updates.append(
                        (attr, self._normalized_prob(prob, label)))
            for attr, table in updates:
                setattr(self, attr, table)
        except CustomException as instance:
            print(instance)
            print("Failed to update indel and mutation probabilities!")
        except:
            print("Failed to update indel and mutation probabilities!")
            raise
        else:
            if self.verbose:
                print("Indel and mutation probabilities successfully updated!")

    def set_mualphabet(self, mualphabet=None):
        """Change the mutation alphabet of the MuGen object.

        An empty or None value lets every letter mutate to any other letter
        of the alphabet (except 'N').  Useful when posmu and probmu are
        called repeatedly.
        """
        try:
            self.mualphabet = self._build_mualphabet(mualphabet)
        except CustomException as instance:
            print(instance)
            print("Mualphabet could not be updated!")
        except:
            print("Mualphabet could not be updated!")
            raise
        else:
            if self.verbose:
                print("Mualphabet successfully updated!")

    # ------------------------------------------------------------------
    # Mutation methods
    # ------------------------------------------------------------------
    def probmu(self):
        """Rebuild the haplotype with random changes only.

        Every site of the reference that is NOT listed in mupos, inpos or
        delpos may change according to the probability tables; listed sites
        are copied unchanged.  The result is stored in ``self.seq`` (see
        get_hap) and the history lists are refreshed; nothing is returned.
        """
        self._reset_history()
        forced = set(self.mupos) | set(self.inpos) | set(self.delpos)
        progeny = []
        for site, base in enumerate(self.ref):
            if site in forced:
                progeny.append(base)  # no change at indel/mutation positions
            else:
                self._random_change(site, base, progeny)
        self._finish(progeny)
        if self.verbose:
            print("WARNING: If indel/mutation positions are specified, "
                  "MuGen.probmu() makes no change at those sites. \n "
                  "Use MuGen.posmu() or Mugen.hapchanger() to apply changes "
                  "at those sites!")
            print("Changes made to the haplotype!")

    def posmu(self):
        """Rebuild the haplotype with the position-specified changes only.

        A site listed in several position lists receives a single change,
        with priority mutation, then deletion, then insertion.  All other
        sites are copied unchanged.  The result is stored in ``self.seq``
        (see get_hap) and the history lists are refreshed; nothing is
        returned.
        """
        self._reset_history()
        # Later assignments overwrite earlier ones, which yields the
        # priority mutation > deletion > insertion.
        planned = [None] * len(self.ref)
        for site in self.inpos:
            planned[site] = 'ins'
        for site in self.delpos:
            planned[site] = 'del'
        for site in self.mupos:
            planned[site] = 'sub'
        progeny = []
        for site, (base, change) in enumerate(zip(self.ref, planned)):
            if change is None:
                progeny.append(base)
            else:
                self._apply(change, site, base, progeny)
        self._finish(progeny)
        if self.verbose:
            print("WARNING: if there are overlaps betweeen deletion, "
                  "insertion and mutation positions, \n "
                  "just one of the changes takes place with the following "
                  "priority: \n "
                  "1)Mutation  2)Deletion 3)Insertion. \n")
            print("Changes made to the haplotype!")

    def hapchanger(self):
        """Rebuild the haplotype with both specified and random changes.

        Sites listed in mupos, inpos or delpos receive the specified change;
        every other site may change at random as in probmu.  The result is
        stored in ``self.seq`` (see get_hap) and the history lists are
        refreshed; nothing is returned.
        NOTE: for overlapping position lists the priority here is mutation,
        then insertion, then deletion, which differs from posmu.
        """
        self._reset_history()
        mutated = set(self.mupos)
        inserted = set(self.inpos)
        deleted = set(self.delpos)
        progeny = []
        for site, base in enumerate(self.ref):
            if site in mutated:
                self._apply('sub', site, base, progeny)
            elif site in inserted:
                self._apply('ins', site, base, progeny)
            elif site in deleted:
                self._apply('del', site, base, progeny)
            else:
                # No change is specified here: follow the probability model.
                self._random_change(site, base, progeny)
        self._finish(progeny)
        if self.verbose:
            print("Changes made to the haplotype!")
Exemplo n.º 50
0
# NOTE(review): CodonTable, Seq and IUPAC are presumably imported in the part
# of this example that was cut off above -- confirm.
# Every print below is a single-argument call, so the output is the same
# under the Python 2 print statement and the Python 3 print function.
print(CodonTable.unambiguous_dna_by_id[2].start_codons)
print(CodonTable.unambiguous_dna_by_id[1].forward_table['ACG'])  # which amino acid this codon maps to

# Comparing sequences
seq1 = Seq('ACGT', IUPAC.unambiguous_dna)
seq2 = Seq('ACGT', IUPAC.unambiguous_dna)
seq3 = Seq('ACGT', IUPAC.protein)
# Per the original note, seq1 == seq2 looks for the same object, so the
# comparisons are spelled out explicitly.
print(id(seq1) == id(seq2))  # identity: are both names the same object?
print(str(seq1) == str(seq2))  # compare the letters by converting to strings
print(str(seq1) == str(seq3))  # same letters, although one is DNA and one protein

# MutableSeq
from Bio.Seq import MutableSeq
mutseq = seq1.tomutable()  # convert to MutableSeq
print("%s %s" % (mutseq, type(mutseq)))
mutSeq = MutableSeq('CGTTTAAGCTGC', IUPAC.unambiguous_dna)
print("%s %s" % (mutSeq, type(mutSeq)))
mutseq[1] = 'T'  # impossible on a plain Seq
print(mutseq)
seq1 = mutseq.toseq()  # convert back to Seq
mutSeq.remove('A')  # remove the first A
mutSeq[2:-5] = 'TTTT'
mutSeq.reverse()  # reverse() and reverse_complement() change the object itself
print(mutSeq)
# A MutableSeq can't be a dictionary key; Seq and string can.

# UnknownSeq
# Subclass of Seq for when the length is known but the characters are not
# (saves memory).
from Bio.Seq import UnknownSeq
unk = UnknownSeq(25)
print("%s %s %s" % (unk, len(unk), type(unk)))