예제 #1
0
def put_it_all_together(sequence):
    """Find the best donor/acceptor motif near each change point and clean the sequence.

    The input is normalised with ``prepare``; for every rough change-point
    position a window of the prepared sequence is scanned with
    ``find_motif`` and the hit chosen by ``get_winner`` is collected in a
    per-type list.  Both lists are then handed to ``make_clean_seq``.

    Args:
        sequence: raw sequence text (passed through ``prepare`` before use).

    Returns:
        The value returned by ``make_clean_seq`` for the prepared sequence
        and the collected donor / acceptor winners.
    """
    prepared = prepare(sequence)
    donors_found = []
    acceptors_found = []

    # Rough locations where the base composition changes.
    regions = find_change_points(prepared)

    # The detector output is deliberately overridden with the values it is
    # expected to return once it works correctly.  Each entry starts with the
    # position of the 25bp chunk that differs from its following neighbour;
    # the splice sites themselves are NOT given so find_motif has to locate
    # them.  Actual splice site start positions in this seq are
    # 85, 276, 358, 549, 630.
    regions = [
        [75, 'donor', 0, 0, 0, 0],
        [250, 'acceptor', 0, 0, 0, 0],
        [325, 'donor', 0, 0, 0, 0],
        [525, 'acceptor', 0, 0, 0, 0],
        [600, 'donor', 0, 0, 0, 0],
    ]

    for region in regions:
        start, kind = region[0], region[1]
        # Positions are 1-based, so step back one to get the slice index;
        # the slice ends 100 past the position (at most 101 bases).
        window = prepared[start - 1:start + 100]

        if kind == 'donor':
            donors_found.append(get_winner(find_motif(window, 'donor', start)))
        elif kind == 'acceptor':
            acceptors_found.append(get_winner(find_motif(window, 'acceptor', start)))

    return make_clean_seq(prepared, donors_found, acceptors_found)
def put_it_all_together(sequence):
    """Pick one winning splice motif per change point, then build the cleaned sequence.

    Args:
        sequence: raw sequence text; it is run through ``prepare`` first.

    Returns:
        The result of ``make_clean_seq`` called with the prepared sequence,
        the list of winning donors and the list of winning acceptors.
    """
    seq = prepare(sequence)

    # One bucket of get_winner() results per motif type.
    winners = {'donor': [], 'acceptor': []}

    # First, break the sequence up into the rough chunks where transitions happen.
    chunk_locations_list = find_change_points(seq)

    # Stub: replace the detector output with the values it should return once
    # it is working correctly.  These are the starts of the 25bp chunks that
    # differ from their following neighbour chunk (not the splice sites
    # themselves, so that find_motif still has to do its job).
    # Actual splice site start positions in this seq are 85, 276, 358, 549, 630.
    chunk_locations_list = [
        [75, 'donor', 0, 0, 0, 0],
        [250, 'acceptor', 0, 0, 0, 0],
        [325, 'donor', 0, 0, 0, 0],
        [525, 'acceptor', 0, 0, 0, 0],
        [600, 'donor', 0, 0, 0, 0],
    ]

    for chunk in chunk_locations_list:
        position = chunk[0]
        motif_type = chunk[1]
        # Subtract 1 to turn the 1-based position into a slice index; the
        # search window runs up to 100 past the position.
        search_window = seq[position - 1:position + 100]

        if motif_type in winners:
            candidates = find_motif(search_window, motif_type, position)
            winners[motif_type].append(get_winner(candidates))

    return make_clean_seq(seq, winners['donor'], winners['acceptor'])
def find_change_points(sequence):
    """Find chunk boundaries where the base composition shifts sharply.

    The prepared sequence is cut into consecutive 25-base chunks and the
    A/C/G/T frequency of each chunk is recorded.  Each chunk is then
    compared with the chunk that follows it; when any monitored frequency
    moves by more than the threshold (0.23) the chunk is reported.

    Args:
        sequence: raw sequence text; it is run through ``prepare`` first.

    Returns:
        A list of ``[position, motif_type, A_delta, C_delta, G_delta,
        T_delta]`` entries, where ``position`` is the 1-based start of the
        earlier chunk, ``motif_type`` is ``'donor'`` or ``'acceptor'`` and
        each delta is (earlier chunk frequency - later chunk frequency).
        The list is empty when no change is large enough.
    """
    seq = prepare(sequence)
    change_threshold = .23  # threshold for significance
    chunk_size = 25  # small size in case of short exons

    # Pass 1: base frequencies per chunk, stored as [position, A, C, G, T].
    # NOTE: a shorter trailing chunk is still divided by the full chunk size,
    # as before, so its frequencies are scaled down.
    segments = []
    for cursor in range(0, len(seq), chunk_size):
        chunk = seq[cursor:cursor + chunk_size]
        freqs = [chunk.count(base) / float(chunk_size) for base in 'ACGT']
        segments.append([cursor + 1] + freqs)  # +1: position rather than cursor

    # Pass 2: compare each chunk with its successor.
    change_points = []
    for before, after in zip(segments, segments[1:]):
        a_delta = before[1] - after[1]
        c_delta = before[2] - after[2]
        g_delta = before[3] - after[3]
        t_delta = before[4] - after[4]

        motif_type = None
        # Before-vs-after donor sites, C&G go down and T goes up.
        # (The T test previously scaled by -.01 instead of -1.0, so a rise
        # in T could never cross the threshold.)
        if (c_delta > change_threshold or g_delta > change_threshold
                or -t_delta > change_threshold):
            motif_type = 'donor'
        # Before-vs-after acceptor sites, C&G go up and T goes down.
        # Checked second on purpose: when both patterns match, acceptor wins,
        # which is the precedence the original code had.
        if (t_delta > change_threshold or -c_delta > change_threshold
                or -g_delta > change_threshold):
            motif_type = 'acceptor'

        if motif_type is not None:
            change_points.append(
                [before[0], motif_type, a_delta, c_delta, g_delta, t_delta])

    return change_points
예제 #4
0
def find_change_points(sequence):
    """Report 25-base chunks whose composition differs sharply from the next chunk.

    Args:
        sequence: raw sequence text; it is run through ``prepare`` first.

    Returns:
        A list with one ``[position, motif_type, A_delta, C_delta, G_delta,
        T_delta]`` entry per significant change.  ``position`` is the
        1-based start of the earlier chunk, ``motif_type`` is ``'donor'``
        or ``'acceptor'``, and every delta is the earlier chunk's frequency
        minus the later chunk's frequency.
    """
    seq = prepare(sequence)
    change_threshold = .23      # threshold for significance
    chunk_size_to_check = 25.0  # chose small size in case of short exons
    step = int(chunk_size_to_check)

    # First, grab chunks of bases and save the base frequencies of each one.
    # The divisor is always the nominal chunk size, also for a short last chunk.
    segments_list = []
    for cursor in range(0, len(seq), step):
        this_chunk = seq[cursor:cursor + step]
        segments_list.append([
            cursor + 1,  # +1 for position (rather than cursor)
            this_chunk.count('A') / chunk_size_to_check,
            this_chunk.count('C') / chunk_size_to_check,
            this_chunk.count('G') / chunk_size_to_check,
            this_chunk.count('T') / chunk_size_to_check,
        ])

    # Next, see which chunks changed a lot compared with their successor.
    change_points_list = []
    for first_list, second_list in zip(segments_list, segments_list[1:]):
        A_delta = first_list[1] - second_list[1]
        C_delta = first_list[2] - second_list[2]
        G_delta = first_list[3] - second_list[3]
        T_delta = first_list[4] - second_list[4]

        # Before-vs-after donor sites, C&G go down and T goes up.
        # Fix: the T term used to be multiplied by -.01 (a typo for -1.0),
        # which made it impossible for a T increase to reach the threshold.
        looks_like_donor = (C_delta > change_threshold
                            or G_delta > change_threshold
                            or -T_delta > change_threshold)
        # Before-vs-after acceptor sites, C&G go up and T goes down.
        looks_like_acceptor = (T_delta > change_threshold
                               or -C_delta > change_threshold
                               or -G_delta > change_threshold)

        # Acceptor takes precedence when both match, as in the original
        # code where the acceptor check ran last and overwrote the type.
        if looks_like_acceptor:
            motif_type = 'acceptor'
        elif looks_like_donor:
            motif_type = 'donor'
        else:
            continue

        change_points_list.append(
            [first_list[0], motif_type, A_delta, C_delta, G_delta, T_delta])

    return change_points_list
예제 #5
0
def process_all_reading_frames(text, min_ORF_length):
    """Search all six reading frames of a sequence and print what was found.

    The text is cleaned with ``prepare`` and its reverse complement is
    taken.  ``find_ORFs`` is run at offsets 0, 1 and 2 on both strands,
    and each result is passed to ``print_ORFs`` under the labels
    ``+1..+3`` (given strand) and ``-1..-3`` (reverse complement).

    Args:
        text: raw sequence text.
        min_ORF_length: forwarded unchanged to ``print_ORFs``.

    Returns:
        None; output is produced by ``print_ORFs``.
    """
    sense = prepare(text)  # clean up our sequence
    antisense = reverseComplement(sense)

    # The offset starts the cursor in the right place for each frame.
    # All searches run before any printing, given strand first.
    offsets = (0, 1, 2)
    sense_hits = [find_ORFs(sense, offset) for offset in offsets]
    antisense_hits = [find_ORFs(antisense, offset) for offset in offsets]

    for label, hits in zip(('+1', '+2', '+3'), sense_hits):
        print_ORFs(label, hits, min_ORF_length, sense)

    for label, hits in zip(('-1', '-2', '-3'), antisense_hits):
        print_ORFs(label, hits, min_ORF_length, antisense)
예제 #6
0
def process_all_reading_frames(text, min_ORF_length):
    """Run ``find_ORFs`` on six reading frames and hand each result to ``print_ORFs``.

    (This function is named findAllOrf in the sample.)

    Args:
        text: raw sequence text; cleaned with ``prepare`` before use.
        min_ORF_length: passed through to ``print_ORFs`` for every frame.

    Returns:
        None.
    """
    seq = prepare(text)
    antisense = reverseComplement(seq)

    # (label, strand, offset) for every frame, in reporting order.
    frames = (
        ('+1', seq, 0),
        ('+2', seq, 1),
        ('+3', seq, 2),
        ('-1', antisense, 0),
        ('-2', antisense, 1),
        ('-3', antisense, 2),
    )

    # Collect every frame's result first ...
    found = []
    for _label, strand, offset in frames:
        found.append(find_ORFs(strand, offset))

    # ... and only then print them, one frame at a time.
    for (label, strand, _offset), orfs in zip(frames, found):
        print_ORFs(label, orfs, min_ORF_length, strand)
            this_snippet = seq[cursor:cursor+length_to_try]
            reverse = reverseComplement(this_snippet)
            if this_snippet == reverse:  #if we get a palindrome, save it and move on to checking the next length
                this_start = cursor + 1
                best_so_far = [this_snippet, this_start]
                length_to_try +=2
                cursor = 0
            else:         #if not a palindrome, keep checking snippets of this length
                cursor += 1    
        found_longest=True  #If we make it here, no palindromes for that length so we're done

    return best_so_far 

                
if ((len(sys.argv) < 2) or (len(sys.argv) > 3)):
    print "Usage: python", sys.argv[0], "<filename>"
else:
    fileName = sys.argv[1]
    if (len(sys.argv) > 2):             # This should be an integer
        try:
            limit = int(sys.argv[2])    # Convert string to integer
        except ValueError:              # try-except catches errors
            print sys.argv[2]
            exit()       
            
    
    text = readFastaFile(fileName) #remove 'cs58FileUtil" because it was giving a not defined error
    text = prepare(text)
    print find_longest_palindrome (text)
    
profile.run("find_longest_palindrome(text)")
예제 #8
0
    return biggest_repeat

if ((len(sys.argv) < 2) or (len(sys.argv) > 3)):
    print "Usage: python", sys.argv[0], "<filename>"
else:
    fileName = sys.argv[1]
    if (len(sys.argv) > 2):  # This should be an integer
        try:
            limit = int(sys.argv[2])  # Convert string to integer
        except ValueError:  # try-except catches errors
            print "\n\tExpecting an integer to define min ORF length, found",
            print sys.argv[2]
            exit()

    cleanedArray = []

    # instead of just one seq we'll have mutliple, need function to return an array of strings
    seqArray = readMultiFastaFile(fileName)
    #print "We got back: ", seqArray

    for seq in seqArray:
        seq = prepare(seq)
        cleanedArray.append(seq)
        # not sure how to save back to seqArray, so saving into cleanedArray

    print "We'll be comparing: ", cleanedArray

    print "The biggest repeat in this sequence is: ", find_biggest_repeat(
        cleanedArray)
# Manual check of find_motif / get_winner on one hard-coded test sequence.
# Only seq3 is active; the other sequences are kept commented out as
# alternative inputs.
# NOTE(review): the mixed case presumably marks intron (lower) vs exon (upper)
# bases -- confirm against the data source.

# CDK10, exon 3
# NOTE(review): this literal contains a single space near its end; presumably
# prepare() removes it -- confirm.
seq3 = 'aacggggacccctgtggctcagggagagcctcccgttcagcgctagggagcccacgaggggcatcgagatgatgtcatcaccaatgtgtttccattccagATCGGGCCCGGGACACCCAGACAGATGAGATTGTCGCACTGAAGAAGGTGCGGATGGACAAGGAGAAGGATGgtgagcaggaaattggggtgttgggacctcgcactgggaggagcagaaggatgtgagttacctgaagtttcctcagagcgactgcac ggtgcttgtagc'

# CDK10, exon 4
#seq4 = 'gcacctgctatcaggtgttcgtgaagcccaagagtggctggggttggggcttccccgccatcactggggtggggctcgctgaggccacctccctccccagGCATCCCCATCAGCAGCTTGCGGGAGATCACGCTGCTGCTCCGCCTGCGTCATCCGAACATCGTGGAGCTGAAGGAGGTGGTTGTGGGGAACCACCTGGAGAGgtacgtggtctcctggtctgcacattgggccctagggagcatgtgtcttgggctagaggtgttgcacagagcgaggactgagtgtcactgggcatgaggt'

# ALDH10, exon 1: 
#seq5='ATGGAGCTCGAAGTCCGGCGGGTCCGACAGGCGTTCCTGTCCGGCCGGTCGCGACCTCTGCGGTTTCGGCTGCAGCAGCTGGAGGCCCTGCGGAGGATGGTGCAGGAGCGCGAGAAGGATATCCTGACGGCCATCGCCGCCGACCTGTGCAAGgtacgcacgcgtgcggcggggtgtggggaaactggcccccgccgcgcacttgtggactggagcttcggctgggttttgtttttgcttttacatttcggat'

# ALDH10, exon 2: 
#seq6= 'ttgttgtcactacaggtgtacctggtgtgagtgttctgacattcagggccaagtgtatcatacttactctgcaagattaactgtgattctcttataacagAGTGAATTCAATGTGTACAGTCAGGAAGTCATTACTGTCCTTGGGGAAATTGATTTTATGCTTGAGAATCTTCCTGAATGGGTTACTGCTAAACCAGTTAAGAAGAACGTGCTCACCATGCTGGATGAGGCCTATATTCAGCCACAGCCTCTGGGAGTGGTGCTGATAATCGGAGCTTGGAATTACCCCTTCGTTCTCACCATTCAGCCACTGATAGGAGCCATCGCTGCAGgtctggtgccaccttatgtctatatacctttttagggaggcttattttctcatattaattggaaattaaggatagtggctaattaaatacatttacttgg'

# ALDH10, exon 3: 
#seq7 = 'taattgggagtacctagcctgttcttcccactgaacatcattttggtagctattaaagttaaatattagatgatactgttctactttttactttatttagGAAATGCTGTGATTATAAAGCCTTCTGAACTGAGTGAAAATACAGCCAAGATCTTGGCAAAGCTTCTCCCTCAGTATTTAGACCAGgtaagaatttcttgactcatctccaacatatgtgtttactgtggaaaacacacattttattttcttgctattgcatgttattgctggccggggacccaat'

# Clean the active test sequence, then search the whole of it for both motif
# types.  The third argument is 0 here because the full sequence is searched.
# NOTE(review): it looks like a position offset (other callers pass the chunk
# start) -- confirm against find_motif.
seq= prepare(seq3) 
donor_result = find_motif (seq, 'donor', 0)
acceptor_result =  find_motif (seq, 'acceptor', 0)

# Show the raw (unprepared) input followed by every hit, one per line.
print 'Testing CDK10, exon 3: '
print seq3
print '\nAcceptor motifs in this seq:'
for result in acceptor_result:
    print result
    
print '\nDonor motifs in this seq:' 
for result in donor_result:
    print result

# Reduce each candidate list to a single best hit via get_winner.
winning_donor = get_winner(donor_result) 
winning_acceptor = get_winner(acceptor_result)