if re.search(rna_string,k): rna_seq = seqdict.get(k).upper() elif re.search(gen_string,k): gen_seq = seqdict.get(k).upper() else: ref_seq = seqdict.get(k).upper() # Need to find beginning and end of aligned region i = 0 j = 0 # Need to keep track of gen index gen_index = 0 try: # Compare genomic and RNA sequences to find local regions of good # similarity, this is taken as the start and end of aligned region while not sequence.compare_seqs((strings.gulp(rna_seq, i, size)), (strings.gulp(gen_seq, i, size)), num_equal): if gen_seq[i] != '-': # If there is actually an amino acid in the genomic sequence # we need to move the index ahead gen_index += 1 i += 1 while not sequence.compare_seqs((strings.gulp(rna_seq[::-1], j, size)), (strings.gulp(gen_seq[::-1], j, size)), num_equal): j += 1 # If we get an index error then we cannot find start and end of both sequences except(IndexError): print "Could not discern aligned part of sequences" # Exit cleanly sys.exit(0)
def test_gulp_long_string(self):
    # gulp(text, 0, 7) on a longer string must hand back the leading
    # seven characters.
    text = "This is a very long string"
    self.assertEqual(strings.gulp(text, 0, 7), "This is")
# Pattern used to pick the genomic sequence out of seqdict by its key.
gen_string = str(args.genomic)

# Sequences must be in upper-case
# NOTE(review): rna_seq / gen_seq are only bound when some key matches the
# corresponding pattern; if none does, the scan below fails with a NameError
# that the IndexError handler does not catch -- confirm the input guarantees
# both matches.
for k in seqdict.keys():
    if re.search(rna_string, k):
        rna_seq = seqdict.get(k).upper()
    elif re.search(gen_string, k):
        gen_seq = seqdict.get(k).upper()

# Need to find beginning and end of aligned region
i = 0  # offset scanned in from the front of both sequences
j = 0  # offset scanned in from the back of both sequences
try:
    # Compare genomic and RNA sequences to find local regions of good
    # similarity, this is taken as the start and end of aligned region
    # NOTE(review): neither loop indexes the sequences directly, so the
    # IndexError handled below has to come from strings.gulp or
    # sequence.compare_seqs. If neither raises once the window runs past the
    # end, a pair with no acceptable window never terminates -- confirm.
    while not sequence.compare_seqs(
            strings.gulp(rna_seq, i, size),
            strings.gulp(gen_seq, i, size),
            num_equal):
        i += 1

    # Same scan from the other end: reverse each sequence once (the reversal
    # does not depend on j) and walk forward through the reversed copies.
    rna_seq_rev = rna_seq[::-1]
    gen_seq_rev = gen_seq[::-1]
    while not sequence.compare_seqs(
            strings.gulp(rna_seq_rev, j, size),
            strings.gulp(gen_seq_rev, j, size),
            num_equal):
        j += 1
except IndexError:
    # If we get an index error then we cannot find start and end of both
    # sequences
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Could not discern aligned part of sequences")
    # Exit cleanly
    sys.exit(0)

# Once we know the start and end, simply chop off everything else
def test_gulp_index_largeer_than_string(self):
    # Asking for 20 characters from an 11-character string must return the
    # string unchanged rather than fail.
    text = "Some string"
    result = strings.gulp(text, 0, 20)
    self.assertEqual(result, "Some string")
def test_gulp_nothingggg(self):
    # Degenerate case: empty input with zero for both numeric arguments
    # must give back an empty string.
    self.assertEqual(strings.gulp("", 0, 0), "")
def test_gulp_long_string_0_length(self):
    # gulp(text, 0, 0) on a non-empty string must produce an empty string.
    text = "This is a very long string"
    gulped = strings.gulp(text, 0, 0)
    self.assertEqual(gulped, "")
def test_gulp_long_string_reverse_indices(self):
    # With the numeric arguments given as (7, 0) the expected result is an
    # empty string.
    text = "This is a very long string"
    self.assertEqual(strings.gulp(text, 7, 0), "")
elif re.search(gen_string,k): gen_seq = seqdict.get(k).upper() # We directly compare aligned sequences, but class implementation uses # unaligned sequences (i.e. no gap characters '-') san_rna_seq = strings.sanitize(rna_seq) san_gen_seq = strings.sanitize(gen_seq) seq_pair = classes.SeqPair(san_rna_seq,san_gen_seq,name) # Find beginning and end of aligned region i = 0 j = 0 try: # Compare genomic and RNA sequences to find local regions of good # similarity, this is taken as the start and end of aligned region while not sequence.compare_seqs((strings.gulp(rna_seq, i, size)), (strings.gulp(gen_seq, i, size)), num_equal): # If we find residues in either sequence, we need to increment # certain class values accordingly if gen_seq[i] != '-': seq_pair.incr_all() if rna_seq[i] != '-': seq_pair.incr_mrna() i += 1 while not sequence.compare_seqs((strings.gulp(rna_seq[::-1], j, size)), (strings.gulp(gen_seq[::-1], j, size)), num_equal): j += 1 # If we get an index error then we cannot find start and end of both sequences except(IndexError): print "Could not discern aligned part of sequences for gene " + str(name) # Exit cleanly
mnuc = seq_pair.lookup_mnuc() gcod = seq_pair.lookup_gcodon() mcod = seq_pair.lookup_mcodon() gaa = seq_pair.lookup_gaa() maa = seq_pair.lookup_maa() scr = (matrices.Blosum62(gaa, maa).sub_score()) non_syn = sequence.check_nonsynonymous_edit(cpos, gcod, mnuc) # We can identify whether the residue is present in a region of local # 'T' concentration, i.e. polyT if args.polyt: # Test whether the base is in a region of 4 or more sequential 'T's is_polyt = "N" # Only look at first seven bases at the start if i <= 4: polyt_test_seq = strings.gulp(new_gen_seq, 0, 7) # Only look at last seven bases at the end elif i >= len(new_gen_seq) - 4: polyt_test_seq = strings.gulp(new_gen_seq, len(new_gen_seq)-7, 7) # In the middle take 3 bases on either side (seven total) else: polyt_test_seq = strings.gulp(new_gen_seq, i-3, 7) # Determine whether the region fits the definition of "polyT" if sequence.polyT(polyt_test_seq): is_polyt = "Y" # Test whether the base is present in region of X % 'T' is_polyt_percent = "N" percent_polyt_seqs = [] for y in range(10): # i.e. for 10 base window
# If we want to calculate protein sequence similarity over the same # stretch, we have to accept some inherent complexity if args.protein: # Note we are calculating similarity, not identity # This is largely because of the difference between comparing # four nucleotides versus 20 amino acids gen_similarity_list = [] for i, (rg,rr) in enumerate(zip(new_gen_seq, new_ref_seq)): similarity_sum = 0.0 # It is vital to know the current codon position rpos = ref_pair.index_rposition() gpos = ref_pair.index_gposition() # Get the reference sequence for the window rnuc_seq = strings.gulp(new_ref_seq, i, int(window_size)) # Using the right RF, translate the sequence raa_seq = sequence.translate(rnuc_seq,rpos) # Repeat this for the genomic sequence gnuc_seq = strings.gulp(new_gen_seq, i, int(window_size)) gaa_seq = sequence.translate(gnuc_seq,gpos) if (len(rnuc_seq) == int(window_size) and len(gnuc_seq) == int(window_size)): # Sanity check! # If the lengths aren't equal, align them if len(raa_seq) != len(gaa_seq): raa_seq,gaa_seq = sequence_alignment.affine_align(raa_seq,gaa_seq) # Whether we align or not, continue on... # Determine how similar raa_seq and gaa_seq are for raa,gaa in zip(raa_seq,gaa_seq): # Gaps are neutral
def test_gulp_nothingggg(self):
    # Gulping from an empty string with both numeric arguments at zero is
    # expected to yield an empty string.
    empty = ""
    gulped = strings.gulp(empty, 0, 0)
    self.assertEqual(gulped, "")
ref_pair2 = classes.RefPair(san_ref_seq,san_gen_seq,name) # To use only synonymous edits, we need to have the corresponding # amino acids for both the genomic and RNA sequences # This is also the case for comparing RNA to reference if args.synonymous: san_gen_seq = strings.sanitize(gen_seq) san_rna_seq = strings.sanitize(rna_seq) seq_pair = classes.SeqPair(san_rna_seq,san_gen_seq,name) # Find beginning and end of aligned region i = 0 j = 0 try: # Compare genomic and RNA sequences to find local regions of good # similarity, this is taken as the start and end of aligned region while not sequence.compare_seqs((strings.gulp(rna_seq, i, size)), (strings.gulp(gen_seq, i, size)), num_equal): if gen_seq[i] != '-': # If we are using classes, need to update them as we go if args.protein: ref_pair.incr_all_gen() if args.both: ref_pair2.incr_all_gen() if args.synonymous: seq_pair.incr_all() if ref_seq[i] != '-': if args.protein: ref_pair.incr_all_ref() if args.both: ref_pair2.incr_all_ref() if rna_seq[i] != '-':