def real_dna():
    """Run gene_finder on the sequence stored in ./data/X73525.fa.

    The cutoff handed to gene_finder is whatever longest_ORF_noncoding
    returns for the loaded sequence with 500 as its second argument.

    returns: the value returned by gene_finder
    """
    sequence = load_seq("./data/X73525.fa")
    cutoff = longest_ORF_noncoding(sequence, 500)
    return gene_finder(sequence, cutoff)
def gene_finder(dna):
    """ Returns the amino acid sequences that are likely coded by the specified dna

        dna: a DNA sequence
        returns: a list of amino acid sequences. The list is empty when the
                 longest ORF found in dna is not longer than the threshold
                 obtained from longest_ORF_noncoding(dna, 1500).
    """
    amino_acids = []
    threshold = longest_ORF_noncoding(dna, 1500)
    longest_length = len(longest_ORF(dna))
    if longest_length > threshold:
        # NOTE(review): this translates the whole input strand, not the
        # individual ORFs that exceed the threshold - confirm that is intended.
        amino_acids.append(coding_strand_to_AA(dna))
    # The list was built but never handed back to the caller before.
    return amino_acids


dna = load_seq("./data/X73525.fa")  # obtaining genes
print(gene_finder(dna))  # showing the list of amino acids
def gene_finder(dna):
    """ Returns the amino acid sequences coded by the ORFs of dna that are at
        least as long as the noncoding threshold.

        dna: a DNA sequence. When None, the sequence is loaded from
             /home/tolu/GeneFinder/data/X73525.fa instead.
        returns: a list with one amino acid sequence per ORF (on either
                 strand) whose length is >= longest_ORF_noncoding(dna, 1500).
    """
    if dna is None:
        # Fall back to the bundled file only when no sequence was supplied;
        # a sequence passed by the caller must not be overwritten.
        from load import load_seq
        dna = load_seq("/home/tolu/GeneFinder/data/X73525.fa")

    threshold = longest_ORF_noncoding(dna, 1500)
    return [coding_strand_to_AA(orf)
            for orf in find_all_ORFs_both_strands(dna)
            if len(orf) >= threshold]
returns: a list of all amino acid sequences coded by the sequence dna.
    >>> gene_finder("ATGCGAATGTAGCATCAAA")
    ['MRM', 'MLHSH']
    """
    # NOTE(review): the def line and the start of this docstring lie outside
    # the visible text; the statements below are the body of that function.
    print 'Finding threshold...'
    # The threshold is the len() of whatever longest_ORF_noncoding returns.
    threshold = len(longest_ORF_noncoding(dna, 1500))
    print threshold
    all_ORFs = find_all_ORFs_both_strands(dna)
    returns = []
    i=0
    print 'Entering while loop... '
    # Translate every ORF whose length reaches the threshold.
    while i < len(all_ORFs):
        if len(all_ORFs[i]) >= threshold:
            returns.append(coding_strand_to_AA(all_ORFs[i]))
            print 'Added sequence'
        i+=1
    print str(len(returns)) + 'sequences added: \n'
    # NOTE(review): the collected list is printed; no return statement is
    # visible here although the doctest above shows a returned list - confirm.
    print returns

if __name__ == "__main__":
    import doctest
    #doctest.testmod()
    #doctest.run_docstring_examples(coding_strand_to_AA, globals(), verbose=True)
    # Run the finder on the sequence loaded from ./data/X73525.fa.
    gene_finder(load_seq("./data/X73525.fa"))
return proteins def gene_finder(dna): """ Returns the amino acid sequences that are likely coded by the specified dna dna: a DNA sequence returns: a list of all amino acid sequences coded by the sequence dna. """ len(dna) num_trials = 1500 allorfs = find_all_ORFs_both_strands(dna) longest = len(longest_ORF(dna)) amirandom = longest_ORF_noncoding(dna, num_trials) finallist = ORFsinOrder(dna , amirandom) translated = [] for i in range(0, len(finallist)-1): #This loop translates all of the resulting potential genes into their final format. translated.append(coding_strand_to_AA(finallist[i])) return translated if __name__ == "__main__": import doctest doctest.testmod() mygenes = gene_finder(load_seq("./data/X73525.fa")) #mygenes is the final variable that contains all the potential genes above a threshold.
# Sets i to weep dna, j to sweep codons and sees where it sees where i:i+3 is in Codons # and returns the AA sequence for that codon. return ''.join([ aa[j] for i in range(0,len(dna),3) for j in range(len(codons)) if dna[i:i+3] in codons[j]]) def gene_finder(dna): """ Returns the amino acid sequences that are likely coded by the specified dna dna: a DNA sequence returns: a list of all amino acid sequences coded by the sequence dna. """ # Finds the ORF length for threshold ORF_length = longest_ORF_noncoding( dna ,threshold) # End ORFs Real_ORFs = [] # Finds all the ORFs and sets i to them for i in find_all_ORFs_both_strands(dna): if len(i) > ORF_length: Real_ORFs.append(coding_strand_to_AA(i)) return len(Real_ORFs),Real_ORFs if __name__ == "__main__": import doctest doctest.testmod() print gene_finder( load_seq("./data/X73525.fa"), 1500)
from gene_finder import *
from load import load_seq

salmonella_dna = load_seq("./data/X73525.fa")


def two_lists_contain_same_elements(list1, list2):
    """ Tells whether two lists hold the same elements, ignoring order.

        Multiplicity counts: every element must occur the same number of
        times in both lists, so [1, 1] and [1, 2] are different.

        list1, list2: the lists to compare (elements only need to support ==)
        returns: True when the lists are equal as multisets, False otherwise
    """
    if len(list1) != len(list2):
        return False
    # Cross each element of list1 off a copy of list2 so a repeated element
    # cannot be matched against the same counterpart twice.
    unmatched = list(list2)
    for item in list1:
        try:
            unmatched.remove(item)
        except ValueError:
            return False
    return True


def coding_strand_to_AA_unit_tests():
    """ Unit tests for the coding_strand_to_AA function """
    # DNA input strands
    dna_input1 = "ACTGCCCC"
    dna_input2 = "AGCTGAGGGTGTTTTGGA"
    dna_input3 = "CAGGCTTGCGGCTTCTTAA"
    # Expected output amino acid strands
    e_output1 = "TA"
    e_output2 = "S|GCFG"
    e_output3 = "QACGFL"
    # Actual output amino acid strands
    a_output1 = coding_strand_to_AA(dna_input1)
def main():
    """Run gene_finder on the sequence loaded from ./data/X73525.fa."""
    dna = load_seq("./data/X73525.fa")
    # The threshold handed to gene_finder is the result of
    # longest_ORF_noncoding called with 1500 as its second argument.
    threshold = longest_ORF_noncoding(dna, 1500)
    # NOTE(review): candidate_genes is not used in the visible part of this
    # function - the body may continue beyond the text shown here.
    candidate_genes = gene_finder(dna, threshold)
def gene_finder(dna):
    """ Returns the amino acid sequences coded by every ORF of dna that is
        strictly longer than a computed threshold.

        The threshold is the value of longest_ORF_noncoding(dna, 1500).

        dna: a DNA sequence
        returns: a list with the coding_strand_to_AA translation of each ORF
                 from find_all_ORFs_both_strands(dna) longer than the threshold.
    """
    cutoff = longest_ORF_noncoding(dna, 1500)
    proteins = []
    for candidate in find_all_ORFs_both_strands(dna):
        if len(candidate) > cutoff:
            proteins.append(coding_strand_to_AA(candidate))
    return proteins


if __name__ == "__main__":
    import doctest
    doctest.testmod()
    dna = load_seq("./data/X73525.fa")
    print(gene_finder(dna))
""" acids = [] for i in range(0, len(dna) - 2, 3): acids.append(aa_table[dna[i:i + 3]]) return ''.join(acids) def gene_finder(dna): """ Returns the amino acid sequences coded by all genes that have an ORF larger than the specified threshold. dna: a DNA sequence threshold: the minimum length of the ORF for it to be considered a valid gene. returns: a list of all amino acid sequences whose ORFs meet the minimum length specified. """ acids = [] threshold = longest_ORF_noncoding(dna, 1500) for i in find_all_ORFs_both_strands(dna): if len(i) >= threshold: acids.append(coding_strand_to_AA(i)) return acids if __name__ == "__main__": import doctest doctest.testmod() print(gene_finder(load_seq('./data/X73525.fa')))
def run_gene_finder():
    """Load ./data/X73525.fa and return gene_finder's result for it."""
    return gene_finder(load_seq('./data/X73525.fa'))
def get_threshold():
    """Return longest_ORF_noncoding's result for ./data/X73525.fa.

    The sequence is loaded from disk and 1500 is passed as the second
    argument. (The previous docstring recorded 789 as a result - presumably
    one observed run; not verified here.)
    """
    sequence = load_seq('./data/X73525.fa')
    trials = 1500
    return longest_ORF_noncoding(sequence, trials)
def gene_finder_salmonella():
    """Run gene_finder on the sequence stored in ./data/X73525.fa.

    The threshold passed to gene_finder is the len() of the value returned
    by longest_ORF_noncoding(dna, 1500).

    returns: the value returned by gene_finder
    """
    from load import load_seq
    sequence = load_seq("./data/X73525.fa")
    longest_noncoding = longest_ORF_noncoding(sequence, 1500)
    return gene_finder(sequence, len(longest_noncoding))
def get_reverse_complement(dna):
    """ Computes the reverse complementary sequence of DNA for the specified DNA
        sequence

        dna: a DNA sequence represented as a string
        returns: the reverse complementary DNA sequence represented as a string
    >>> get_reverse_complement("ATGCCCGCTTT")
    'AAAGCGGGCAT'
    >>> get_reverse_complement("CCGCGTTCA")
    'TGAACGCGG'
    >>> get_reverse_complement("ATCG")
    'CGAT'
    """
    # Join only the complemented bases: the result must not carry any seed
    # character (a leading blank would break every comparison against it).
    return ''.join(get_complement(base) for base in reversed(dna))


def divide_to_codons(dna):
    """ Takes a DNA sequence and outputs a list of string triplets (codons)
        that make up the sequence. The last element might be an incomplete
        codon with fewer than three letters.

        dna: a DNA sequence represented as a string
        returns: a list of consecutive slices of dna, each up to 3 characters
    >>> divide_to_codons("ATGTGAA")
    ['ATG', 'TGA', 'A']
    >>> divide_to_codons("ATGTGA")
    ['ATG', 'TGA']
    >>> divide_to_codons("ATGTGAAA")
    ['ATG', 'TGA', 'AA']
    """
    return [dna[start:start + 3] for start in range(0, len(dna), 3)]


def rest_of_ORF(dna):
    """ Takes a DNA sequence that is assumed to begin with a start codon and
        returns the sequence up to but not including the first in frame stop
        codon. If there is no in frame stop codon, returns the whole string.

        dna: a DNA sequence
        returns: the open reading frame represented as a string
    >>> rest_of_ORF("ATGTGAA")
    'ATG'
    >>> rest_of_ORF("ATGAGATAGG")
    'ATGAGA'
    >>> rest_of_ORF("ATG")
    'ATG'
    >>> rest_of_ORF("AT")
    'AT'
    >>> rest_of_ORF("ATGASDASDWASDWADASDSAD")
    'ATGASDASDWASDWADASDSAD'
    >>> rest_of_ORF("ATGTGTTAAATGAAAAAATAGAA")
    'ATGTGT'
    """
    stop_codons = ('TAG', 'TAA', 'TGA')
    # The first codon is the (assumed) start codon, so scanning starts at the
    # second one; this also guarantees a non-empty result for non-empty input.
    for start in range(3, len(dna), 3):
        if dna[start:start + 3] in stop_codons:
            return dna[:start]
    return dna


def find_all_ORFs_oneframe(dna):
    """ Finds all non-nested open reading frames in the given DNA sequence and
        returns them as a list. This function only finds ORFs that are in the
        default frame of the sequence (i.e. they start on indices that are
        multiples of 3). By non-nested we mean that if an ORF occurs entirely
        within another ORF, it is not included in the returned list of ORFs.

        dna: a DNA sequence
        returns: a list of non-nested ORFs
    >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")
    ['ATGCATGAATGTAGA', 'ATGTGCCC']
    >>> find_all_ORFs_oneframe("ATGTGAA")
    ['ATG']
    >>> find_all_ORFs_oneframe('ASDASDAWSDSD')
    []
    >>> find_all_ORFs_oneframe('TATATGCATGAATGTAGATAGATGTGCTAAATAATAATGTTTTAAATT')
    ['ATGCATGAATGTAGA', 'ATGTGC', 'ATGTTT']
    """
    index = 0
    orf_list = []
    while index < len(dna):
        if dna[index:index + 3] == 'ATG':
            orf = rest_of_ORF(dna[index:])
            orf_list.append(orf)
            # Jump past the ORF so start codons nested inside it are skipped.
            index += len(orf)
        else:
            index += 3
    return orf_list


def find_all_ORFs(dna):
    """ Finds all non-nested open reading frames in the given DNA sequence in
        all 3 possible frames and returns them as a list. By non-nested we
        mean that if an ORF occurs entirely within another ORF and they are
        both in the same frame, it is not included in the returned list.

        dna: a DNA sequence
        returns: a list of non-nested ORFs (frame 0 first, then 1, then 2)

        The single example exercises all three reading frames.
    >>> find_all_ORFs("ATGCATGAATGTAG")
    ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
    """
    orf_list = []
    for offset in range(3):
        orf_list.extend(find_all_ORFs_oneframe(dna[offset:]))
    return orf_list


def find_all_ORFs_both_strands(dna):
    """ Finds all non-nested open reading frames in the given DNA sequence on
        both strands.

        dna: a DNA sequence
        returns: a list of non-nested ORFs (given strand first, then the
                 reverse complement)
    >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
    ['ATGCGAATG', 'ATGCTACATTCGCAT']
    """
    reverse_complement = get_reverse_complement(dna)
    return find_all_ORFs(dna) + find_all_ORFs(reverse_complement)


def longest_ORF(dna):
    """ Finds the longest ORF on both strands of the specified DNA and returns
        it as a string. Returns the empty string when dna contains no ORF
        (which routinely happens for short or shuffled sequences).

        dna: a DNA sequence
        returns: the longest ORF, the first one found in case of a tie
    >>> longest_ORF("ATGCGAATGTAGCATCAAA")
    'ATGCTACATTCGCAT'
    """
    orfs = find_all_ORFs_both_strands(dna)
    if not orfs:
        return ''
    return max(orfs, key=len)


def longest_ORF_noncoding(dna, num_trials):
    """ Computes the maximum length of the longest ORF over num_trials shuffles
        of the specified DNA sequence

        dna: a DNA sequence
        num_trials: the number of random shuffles
        returns: the maximum length longest ORF (0 when num_trials is 0 or no
                 shuffle contains an ORF)
    """
    longest = 0
    for _ in range(num_trials):
        shuffled_dna = shuffle_string(dna)
        longest = max(longest, len(longest_ORF(shuffled_dna)))
    return longest


def coding_strand_to_AA(dna):
    """ Computes the Protein encoded by a sequence of DNA. This function does
        not check for start and stop codons (it assumes that the input DNA
        sequence represents a protein coding region).

        dna: a DNA sequence represented as a string
        returns: a string containing the sequence of amino acids encoded by
                 the input DNA fragment; a trailing incomplete codon is ignored
    >>> coding_strand_to_AA("ATGCGA")
    'MR'
    >>> coding_strand_to_AA("ATGCCCGCTTT")
    'MPA'
    >>> coding_strand_to_AA("TTTATCATGTTAGTTA")
    'FIMLV'
    """
    return ''.join(aa_table[codon]
                   for codon in divide_to_codons(dna)
                   if len(codon) == 3)


def gene_finder(dna):
    """ Returns the amino acid sequences that are likely coded by the specified dna

        dna: a DNA sequence
        returns: a list of the amino acid sequences of all ORFs (on both
                 strands) that are strictly longer than the longest ORF found
                 in 1500 random shuffles of dna.
    """
    threshold = longest_ORF_noncoding(dna, 1500)
    return [coding_strand_to_AA(orf)
            for orf in find_all_ORFs_both_strands(dna)
            if len(orf) > threshold]


if __name__ == "__main__":
    import doctest
    doctest.testmod(verbose = True)
    doctest.run_docstring_examples(coding_strand_to_AA, globals(), verbose = True)
    dna_seq = load_seq('data/X73525.fa')
    print(gene_finder(dna_seq))
s = 0 while s <= len(dna) - 3: m.append(aa_table[dna[s:s + 3]]) s = s + 3 k = '' k = k.join(m) return k def gene_finder(dna): """ Returns the amino acid sequences that are likely coded by the specified dna dna: a DNA sequence returns: a list of all amino acid sequences coded by the sequence dna. """ ac = [] threshold = longest_ORF_noncoding(dna, 1500) dna_long = find_all_ORFs_both_strands(dna) for i in dna_long: if len(i) > threshold: ac.append(coding_strand_to_AA(i)) return ac if __name__ == "__main__": import doctest print(gene_finder(dna=load_seq("./data/X73525.fa"))) doctest.testmod(verbose=True)
>>> coding_strand_to_AA("TTTATCATGTTAGTTA") 'FIMLV' """ codons = divide_to_codons(dna) amino_acid = '' for codon in codons: if len(codon) == 3: amino_acid += aa_table[codon] return amino_acid def gene_finder(dna): """ Returns the amino acid sequences that are likely coded by the specified dna dna: a DNA sequence returns: a list of all amino acid sequences coded by the sequence dna. """ threshold = longest_ORF_noncoding(dna, 1500) all_orfs = find_all_ORFs_both_strands(dna) amino_acids = [] for orf in all_orfs: if len(orf) > threshold: amino_acids.append(coding_strand_to_AA(orf)) return amino_acids if __name__ == "__main__": import doctest doctest.testmod() #doctest.run_docstring_examples(coding_strand_to_AA, globals()) dna_seq = load_seq('data/X73525.fa') print gene_finder(dna_seq)
def gene_finder(dna):
    """ Returns the translated ORFs of dna that reach a computed threshold,
        longest translation first.

        The threshold is len(longest_ORF_noncoding(dna, 1500)); an ORF is
        kept when its length is greater than or equal to it.

        dna: a DNA sequence
        returns: a list of amino acid sequences sorted by decreasing length.

        No doctest is given: the result depends on longest_ORF_noncoding,
        which the original author describes as random between calls.
    """
    max_len = len(longest_ORF_noncoding(dna, 1500))
    proteins = []
    for orf in find_all_ORFs_both_strands(dna):
        if len(orf) >= max_len:
            proteins.append(coding_strand_to_AA(orf))
    # Longest translation first; ties keep their discovery order.
    proteins.sort(key=len, reverse=True)
    return proteins


if __name__ == "__main__":
    import doctest
    doctest.testmod()
    result = gene_finder(load_seq("./data/X73525.fa"))
    print("Here is the result:")
    for translated_protein in result:
        print(translated_protein)
for k in range(0,len(dna_three)): threes = dna_three[k] if len(threes)==3: amino_acid_1 = aa_table[threes] amino_acid += amino_acid.join(amino_acid_1) k = k + 1 return amino_acid def gene_finder(dna): """ Returns the amino acid sequences coded by all genes that have an ORF larger than the specified threshold. dna: a DNA sequence returns: a list of all amino acid sequences coded by the sequence dna. """ # TODO: implement this threshold = longest_ORF_noncoding(dna, 1500) both_strand_orfs_unthreshold = find_all_ORFs_both_strands(dna) both_strand_orfs_threshold = [] for orfs in both_strand_orfs_unthreshold: if len(orfs) > threshold: both_strand_orfs_threshold.append(orfs) final_amino_conversion = map(coding_strand_to_AA, both_strand_orfs_threshold) return final_amino_conversion dna = load_seq('./data/X73525.fa') print gene_finder(dna)
def salmonella_gene_finder():
    """Print, one per line, each item gene_finder returns for ./data/X73525.fa."""
    for protein in gene_finder(load_seq("./data/X73525.fa")):
        print(protein)
for i in range(0, len(dna), 3): codon = dna[i:i+3] if len(codon) == 3: aa_sequence += aa_table[codon] return aa_sequence def gene_finder(dna): """ Returns the amino acid sequences that are likely coded by the specified dna dna: a DNA sequence returns: a list of all amino acid sequences coded by the sequence dna. """ aa_list = [] threshold = longest_ORF_noncoding(dna, 1500) print threshold dna_list = find_all_ORFs_both_strands(dna) for i in range(0, len(dna_list)): if len(dna_list[i]) > threshold: aa_list.append(coding_strand_to_AA(dna_list[i])) return aa_list if __name__ == "__main__": import doctest #doctest.testmod() #doctest.run_docstring_examples(coding_strand_to_AA, globals()) from load import load_seq dna = load_seq("./data/X73525.fa") print gene_finder(dna)
shuffle(DNA) compare = [] for index in range(len(longest_frame)): #cycles through num_trials number of indexes comparing to find longest ORF if len(longest_frame[index]) > len(compare): compare = longest_frame[index] else: compare = compare return len(compare) #print longest_ORF_noncoding('ATGCGAATGTAGCATCAAA', 30) def gene_finder(dna, threshold): """ Returns the amino acid sequences coded by all genes that have an ORF larger than the specified threshold. dna: a DNA sequence threshold: the minimum length of the ORF for it to be considered a valid gene. returns: a list of all amino acid sequences whose ORFs meet the minimum length specified. """ amino_acids = [] all_frames = find_all_ORFs_both_strands(dna) for index in range(len(all_frames)): #if the length of ORF is above the threshold, adds it to list if len(all_frames[index]) > threshold: amino_acids.append(coding_strand_to_AA(all_frames[index])) #finds amino acids coresponding to codons from above list return amino_acids if __name__ == '__main__': dna = load_seq('./data/X73525.fa') gene_finder(dna,600) print(gene_finder(dna, 600))