def put_it_all_together(sequence):
    """Locate the best donor/acceptor motif near each change point and clean the sequence.

    The input is normalised with ``prepare``. For every entry in the
    chunk-location list a window of the sequence is handed to ``find_motif``,
    the candidates are reduced to one with ``get_winner``, and the winners are
    collected per motif type. The collected donor and acceptor winners are
    finally passed to ``make_clean_seq`` together with the prepared sequence.

    NOTE: the list returned by ``find_change_points`` is currently discarded
    and replaced by hard-coded positions (see the stub below), so the result
    is only meaningful for the sequence those positions were derived from.

    sequence -- raw sequence text, passed to ``prepare``
    Returns whatever ``make_clean_seq`` returns.
    """
    seq = prepare(sequence)

    # First, break the sequence into the rough chunks where transitions happen.
    chunk_locations_list = find_change_points(seq)

    # STUB: overwrite the list with the values find_change_points is expected
    # to return once it works correctly. Each position is the start of the
    # 25bp chunk that differs from its following neighbour chunk (rather than
    # the splice sites themselves, so that find_motif still has to do its job).
    # Actual splice site start positions in this seq are 85, 276, 358, 549, 630.
    chunk_locations_list = [
        [75, 'donor', 0, 0, 0, 0],
        [250, 'acceptor', 0, 0, 0, 0],
        [325, 'donor', 0, 0, 0, 0],
        [525, 'acceptor', 0, 0, 0, 0],
        [600, 'donor', 0, 0, 0, 0],
    ]

    # One list of confirmed winners per motif type we know how to search for.
    confirmed = {'donor': [], 'acceptor': []}

    for chunk in chunk_locations_list:
        position, motif_type = chunk[0], chunk[1]
        # position - 1 converts the 1-based position into a 0-based cursor;
        # the slice spans up to 101 characters starting at that cursor.
        window = seq[position - 1:position + 100]
        if motif_type in confirmed:
            candidates = find_motif(window, motif_type, position)
            confirmed[motif_type].append(get_winner(candidates))

    return make_clean_seq(seq, confirmed['donor'], confirmed['acceptor'])
def find_change_points(sequence):
    """Find chunks whose base composition differs sharply from the next chunk.

    The sequence is normalised with ``prepare`` and cut into consecutive
    25-base chunks. The A/C/G/T frequency of each chunk is compared with the
    chunk that follows it; when any monitored frequency changes by more than
    the threshold (0.23) the leading chunk is reported.

    sequence -- raw sequence text, passed to ``prepare``

    Returns a list of ``[position, motif_type, A_delta, C_delta, G_delta,
    T_delta]`` entries, where ``position`` is the 1-based start of the leading
    chunk, ``motif_type`` is ``'donor'`` or ``'acceptor'`` and each delta is
    (leading chunk frequency - following chunk frequency). When both the donor
    and the acceptor condition hold for a pair, ``'acceptor'`` is reported.
    """
    seq = prepare(sequence)

    change_threshold = .23    # threshold for significance
    chunk_size_to_check = 25  # chose small size in case of short exons

    # First, grab chunks of bases and save the base frequencies of each chunk.
    # NOTE: a trailing chunk shorter than chunk_size_to_check is still divided
    # by the full chunk size, so its frequencies are understated.
    segments_list = []
    for cursor in range(0, len(seq), chunk_size_to_check):
        this_chunk = seq[cursor:cursor + chunk_size_to_check]
        frequencies = [this_chunk.count(base) / float(chunk_size_to_check)
                       for base in 'ACGT']
        # +1 to store a position rather than a cursor
        segments_list.append([cursor + 1] + frequencies)

    # Next, see which chunks show a big change relative to their successor.
    change_points_list = []
    for first_list, second_list in zip(segments_list, segments_list[1:]):
        A_delta = first_list[1] - second_list[1]
        C_delta = first_list[2] - second_list[2]
        G_delta = first_list[3] - second_list[3]
        T_delta = first_list[4] - second_list[4]

        motif_type = None

        # Before-vs-after donor sites, C&G go down and T goes up.
        # The T test used to multiply by -.01, which can never exceed the
        # threshold; a rise in T is a negative delta, so negate it instead.
        if (C_delta > change_threshold
                or G_delta > change_threshold
                or -T_delta > change_threshold):
            motif_type = 'donor'

        # Before-vs-after acceptor sites, C&G go up and T goes down.
        # Checked second on purpose: it overrides 'donor' when both match.
        if (T_delta > change_threshold
                or -C_delta > change_threshold
                or -G_delta > change_threshold):
            motif_type = 'acceptor'

        if motif_type is not None:
            change_points_list.append(
                [first_list[0], motif_type, A_delta, C_delta, G_delta, T_delta])

    return change_points_list
def process_all_reading_frames(text, min_ORF_length):
    """Search all six reading frames of a sequence for ORFs and print them.

    The text is cleaned with ``prepare`` and its reverse complement is taken
    with ``reverseComplement``. ``find_ORFs`` is run on each strand with
    offsets 0, 1 and 2, and every result is handed to ``print_ORFs`` under the
    labels '+1'..'+3' (sense strand) and '-1'..'-3' (antisense strand).

    text           -- raw sequence text
    min_ORF_length -- forwarded unchanged to ``print_ORFs``
    Returns None.
    """
    seq = prepare(text)                 # clean up our sequence
    antisense = reverseComplement(seq)  # get the reverse complement

    # Run every search first; the offset starts the cursor in the right place
    # for each frame. Printing happens only after all six searches are done.
    searches = []
    for sign, strand in (('+', seq), ('-', antisense)):
        for offset in (0, 1, 2):
            label = sign + str(offset + 1)
            searches.append((label, find_ORFs(strand, offset), strand))

    # Print the results, in the same order the frames were searched.
    for label, frame_ORFs, strand in searches:
        print_ORFs(label, frame_ORFs, min_ORF_length, strand)
def process_all_reading_frames(text, min_ORF_length):
    """Report the ORFs found in each of the six reading frames of ``text``.

    (Named findAllOrf in the sample.)

    The sequence is cleaned with ``prepare`` and reverse-complemented with
    ``reverseComplement``. Each strand is scanned by ``find_ORFs`` at offsets
    0, 1 and 2; the six results are then passed to ``print_ORFs`` labelled
    '+1', '+2', '+3' for the sense strand and '-1', '-2', '-3' for the
    antisense strand.

    text           -- raw sequence text
    min_ORF_length -- forwarded unchanged to ``print_ORFs``
    Returns None.
    """
    seq = prepare(text)                 # clean up our sequence
    antisense = reverseComplement(seq)  # get the reverse complement

    # (label, strand, offset) for every reading frame; the offset starts the
    # cursor in the right place for that frame.
    frames = [
        ('+1', seq, 0),
        ('+2', seq, 1),
        ('+3', seq, 2),
        ('-1', antisense, 0),
        ('-2', antisense, 1),
        ('-3', antisense, 2),
    ]

    # All searches are completed before any result is printed.
    found = [find_ORFs(strand, offset) for _, strand, offset in frames]

    for frame, frame_ORFs in zip(frames, found):
        print_ORFs(frame[0], frame_ORFs, min_ORF_length, frame[1])
this_snippet = seq[cursor:cursor+length_to_try] reverse = reverseComplement(this_snippet) if this_snippet == reverse: #if we get a palindrome, save it and move on to checking the next length this_start = cursor + 1 best_so_far = [this_snippet, this_start] length_to_try +=2 cursor = 0 else: #if not a palindrome, keep checking snippets of this length cursor += 1 found_longest=True #If we make it here, no palindromes for that length so we're done return best_so_far if ((len(sys.argv) < 2) or (len(sys.argv) > 3)): print "Usage: python", sys.argv[0], "<filename>" else: fileName = sys.argv[1] if (len(sys.argv) > 2): # This should be an integer try: limit = int(sys.argv[2]) # Convert string to integer except ValueError: # try-except catches errors print sys.argv[2] exit() text = readFastaFile(fileName) #remove 'cs58FileUtil" because it was giving a not defined error text = prepare(text) print find_longest_palindrome (text) profile.run("find_longest_palindrome(text)")
return biggest_repeat if ((len(sys.argv) < 2) or (len(sys.argv) > 3)): print "Usage: python", sys.argv[0], "<filename>" else: fileName = sys.argv[1] if (len(sys.argv) > 2): # This should be an integer try: limit = int(sys.argv[2]) # Convert string to integer except ValueError: # try-except catches errors print "\n\tExpecting an integer to define min ORF length, found", print sys.argv[2] exit() cleanedArray = [] # instead of just one seq we'll have mutliple, need function to return an array of strings seqArray = readMultiFastaFile(fileName) #print "We got back: ", seqArray for seq in seqArray: seq = prepare(seq) cleanedArray.append(seq) # not sure how to save back to seqArray, so saving into cleanedArray print "We'll be comparing: ", cleanedArray print "The biggest repeat in this sequence is: ", find_biggest_repeat( cleanedArray)
# Ad-hoc check of find_motif / get_winner on one test sequence.
# Lower-case bases look like flanking intron and upper-case like the exon
# itself -- TODO confirm against the sequence source.

# CDK10, exon 3
# NOTE: the literal contains one embedded space near its end; it is kept as-is
# (presumably prepare() strips it -- verify).
seq3 = 'aacggggacccctgtggctcagggagagcctcccgttcagcgctagggagcccacgaggggcatcgagatgatgtcatcaccaatgtgtttccattccagATCGGGCCCGGGACACCCAGACAGATGAGATTGTCGCACTGAAGAAGGTGCGGATGGACAAGGAGAAGGATGgtgagcaggaaattggggtgttgggacctcgcactgggaggagcagaaggatgtgagttacctgaagtttcctcagagcgactgcac ggtgcttgtagc'

# Other test sequences, currently disabled:
# CDK10, exon 4
#seq4 = 'gcacctgctatcaggtgttcgtgaagcccaagagtggctggggttggggcttccccgccatcactggggtggggctcgctgaggccacctccctccccagGCATCCCCATCAGCAGCTTGCGGGAGATCACGCTGCTGCTCCGCCTGCGTCATCCGAACATCGTGGAGCTGAAGGAGGTGGTTGTGGGGAACCACCTGGAGAGgtacgtggtctcctggtctgcacattgggccctagggagcatgtgtcttgggctagaggtgttgcacagagcgaggactgagtgtcactgggcatgaggt'
# ALDH10, exon 1:
#seq5='ATGGAGCTCGAAGTCCGGCGGGTCCGACAGGCGTTCCTGTCCGGCCGGTCGCGACCTCTGCGGTTTCGGCTGCAGCAGCTGGAGGCCCTGCGGAGGATGGTGCAGGAGCGCGAGAAGGATATCCTGACGGCCATCGCCGCCGACCTGTGCAAGgtacgcacgcgtgcggcggggtgtggggaaactggcccccgccgcgcacttgtggactggagcttcggctgggttttgtttttgcttttacatttcggat'
# ALDH10, exon 2:
#seq6= 'ttgttgtcactacaggtgtacctggtgtgagtgttctgacattcagggccaagtgtatcatacttactctgcaagattaactgtgattctcttataacagAGTGAATTCAATGTGTACAGTCAGGAAGTCATTACTGTCCTTGGGGAAATTGATTTTATGCTTGAGAATCTTCCTGAATGGGTTACTGCTAAACCAGTTAAGAAGAACGTGCTCACCATGCTGGATGAGGCCTATATTCAGCCACAGCCTCTGGGAGTGGTGCTGATAATCGGAGCTTGGAATTACCCCTTCGTTCTCACCATTCAGCCACTGATAGGAGCCATCGCTGCAGgtctggtgccaccttatgtctatatacctttttagggaggcttattttctcatattaattggaaattaaggatagtggctaattaaatacatttacttgg'
# ALDH10, exon 3:
#seq7 = 'taattgggagtacctagcctgttcttcccactgaacatcattttggtagctattaaagttaaatattagatgatactgttctactttttactttatttagGAAATGCTGTGATTATAAAGCCTTCTGAACTGAGTGAAAATACAGCCAAGATCTTGGCAAAGCTTCTCCCTCAGTATTTAGACCAGgtaagaatttcttgactcatctccaacatatgtgtttactgtggaaaacacacattttattttcttgctattgcatgttattgctggccggggacccaat'

seq = prepare(seq3)

# Search the whole prepared sequence for both motif types (third argument 0).
donor_result = find_motif(seq, 'donor', 0)
acceptor_result = find_motif(seq, 'acceptor', 0)

# Single-argument prints are parenthesised; under the Python 2 print statement
# this emits exactly the same text as the unparenthesised form.
print('Testing CDK10, exon 3: ')
print(seq3)

# Acceptor hits are listed first, then donor hits, one result per line.
for heading, hits in (('\nAcceptor motifs in this seq:', acceptor_result),
                      ('\nDonor motifs in this seq:', donor_result)):
    print(heading)
    for result in hits:
        print(result)

# Reduce each candidate list to its single best hit.
winning_donor = get_winner(donor_result)
winning_acceptor = get_winner(acceptor_result)