def evalPrimerPairMT(fprimer, rprimer, ret_mt=False):
    """Check whether a primer pair meets the melting-temperature (MT) criteria.

    The optimal melting temperature of the primers is 60-64°C, with an ideal
    temperature of 62°C, which is based on typical cycling and reaction
    conditions and the optimum temperature for PCR enzyme function. Ideally,
    the melting temperatures of the 2 primers should not differ by more than
    2°C in order for both primers to bind simultaneously and efficiently
    amplify the product.  NOTE: the filter below is slightly more lenient and
    accepts a difference of up to 3°C.

    PCR parameters used are from IDT: Oligo 0.2 uM, Na 50 mM, Mg 3 mM,
    dNTPs 0.8 mM.

    The pass/fail decision is based on the GC-content estimate (``Tm_GC``);
    the nearest-neighbor estimate (``Tm_NN``) is only printed for reference.

    :param fprimer: forward primer sequence
    :param rprimer: reverse primer sequence
    :param ret_mt: when left as ``False`` a passing pair returns ``True``;
        otherwise a passing pair returns the two ``Tm_GC`` values
    :return: ``False`` if the pair fails the filter; otherwise ``True`` or
        the tuple ``(fprimer_MT, rprimer_MT)`` depending on ``ret_mt``
    """
    fprimer_MT = MeltingTemp.Tm_GC(fprimer, Na=50, Mg=3, dNTPs=0.8)
    rprimer_MT = MeltingTemp.Tm_GC(rprimer, Na=50, Mg=3, dNTPs=0.8)
    fprimer_MT_NN = MeltingTemp.Tm_NN(fprimer, Na=50, Mg=3, dNTPs=0.8)
    # Fixed: this value used to be computed from fprimer, so the report
    # showed the forward primer's nearest-neighbor Tm for both primers.
    rprimer_MT_NN = MeltingTemp.Tm_NN(rprimer, Na=50, Mg=3, dNTPs=0.8)

    print(
        f"forw primer: {fprimer}\nforw primer MT: {fprimer_MT} {fprimer_MT_NN} \n"
        f"rev primer: {rprimer}\nrev primer MT : {rprimer_MT} {rprimer_MT_NN} \n"
    )

    # Filter for primers that meet the MT standards: both Tm_GC values in
    # [60, 64] and no more than 3 degrees apart.
    meets_standards = (
        math.fabs(fprimer_MT - rprimer_MT) <= 3
        and max(fprimer_MT, rprimer_MT) <= 64
        and min(fprimer_MT, rprimer_MT) >= 60
    )
    if not meets_standards:
        print("MT for the primer pairs did not meet standards\n")
        return False

    print("MT of primer pair passed.\n")
    # Equality test kept on purpose so that any value other than False
    # (e.g. None) still returns the temperatures, exactly as before.
    if ret_mt == False:  # noqa: E712
        return True
    return fprimer_MT, rprimer_MT
def printNSeq(num_seq=20,seq_len=20,GC_low_cutoff=40,GC_high_cutoff=60):
    """Generate random sequences within a GC window and pretty-print them.

    Sequences of length ``seq_len`` are drawn from ``randSeqGen`` until
    ``num_seq`` of them have a ``GCpercent`` value between ``GC_low_cutoff``
    and ``GC_high_cutoff`` (both inclusive).  Each accepted sequence is
    reported as ``(sequence, GC percent, Tm_GC rounded to 3 decimals)``.

    Nothing is returned; the collected list is written with ``pprint``.
    No duplicate check is performed on the accepted sequences.
    """
    accepted = []
    while len(accepted) < num_seq:
        candidate = randSeqGen(seq_len)
        gc_pct = GCpercent(candidate)
        # Reject anything outside the requested GC window and draw again.
        if not (GC_low_cutoff <= gc_pct <= GC_high_cutoff):
            continue
        tm_gc = round(MT.Tm_GC(candidate, Na=50, Mg=3, dNTPs=0.8), 3)
        accepted.append((str(candidate), gc_pct, tm_gc))
    pprint(accepted)
def evalSeqMT(seq, ret_mt=False):
    """Test whether the Tm_GC of ``seq`` lies strictly between 59 and 65.

    The melting temperature is computed with Na=50, Mg=3, dNTPs=0.8 and
    rounded to two decimals before the comparison.

    :param seq: sequence to evaluate
    :param ret_mt: when left as ``False`` a passing sequence yields ``True``;
        otherwise the rounded melting temperature is returned instead
    :return: ``False`` when the temperature is out of range, else ``True``
        or the rounded temperature depending on ``ret_mt``
    """
    seq_MT = round(MT.Tm_GC(seq, Na=50, Mg=3, dNTPs=0.8), 2)

    # Out of the open interval (59.0, 65.0): the sequence is rejected.
    if not (59.0 < seq_MT < 65.0):
        return False

    return True if ret_mt == False else seq_MT  # noqa: E712
def calc_tm_value(self):
    """Compute the mean Tm of the forward/reverse primers with three methods.

    Reads ``self.primer_fw`` and ``self.primer_rv``, stores their upper-cased
    ``Seq`` objects, and sets:

    * ``tm_value_Wallace`` / ``tm_value_GC`` / ``tm_value_NN`` - the mean of
      the two primers' Tm for each method,
    * ``tm_table_column`` / ``tm_list`` - header and rows (Tm rounded to one
      decimal) of the summary table,
    * ``tm_table`` - the same data as a pandas ``DataFrame``.
    """
    self.primer_fw_seq = Seq(self.primer_fw.upper())
    self.primer_rv_seq = Seq(self.primer_rv.upper())
    primer_pair = (self.primer_fw_seq, self.primer_rv_seq)

    # Wallace rule
    self.tm_value_Wallace = np.mean(
        [mt.Tm_Wallace(primer) for primer in primer_pair])
    # GC-content method
    self.tm_value_GC = np.mean(
        [mt.Tm_GC(primer, Na=50, valueset=7) for primer in primer_pair])
    # Nearest-neighbor method
    self.tm_value_NN = np.mean(
        [mt.Tm_NN(primer, Na=50, nn_table=mt.DNA_NN1) for primer in primer_pair])

    # Table labels are user-facing text and are kept exactly as they were.
    self.tm_table_column = ["計算手法", "Tm値 (°C)"]
    labelled_means = (
        ("Wallace法", self.tm_value_Wallace),
        ("GC法", self.tm_value_GC),
        ("最近接塩基法", self.tm_value_NN),
    )
    self.tm_list = [[label, round(mean_tm, 1)] for label, mean_tm in labelled_means]
    self.tm_table = pd.DataFrame(self.tm_list, columns=self.tm_table_column)
if __name__ == '__main__':
    # Command-line entry point: read sequences from a FASTA/FASTQ file
    # (optionally gzip-compressed) or from standard input and print one
    # tab-separated line per record with its Wallace, GC and
    # nearest-neighbor melting temperatures.
    args = parse_args()
    file, seq_format, fh = args.infile, args.format, None

    if file:
        if not seq_format:
            # No explicit format: infer it from the file name suffix.
            # The dot before "gz" is escaped so only a real ".gz" matches.
            found = re.search(r'(?i)(fasta|fa|fastq|fq)(\.gz)?$', file)
            if not found:
                print(
                    "invalid file name suffix.\nfile name should like this: infile.[fasta|fa|fastq|fq][.gz]",
                    file=sys.stderr)
                sys.exit(1)
            # The pattern is case-insensitive, so normalise before mapping
            # the short aliases; SeqIO only accepts lower-case format names.
            seq_format = found.group(1).lower()
            seq_format = {'fa': 'fasta', 'fq': 'fastq'}.get(seq_format, seq_format)
        fh = gzip.open(file, 'rt') if file.endswith('.gz') else open(file, 'r')
    else:
        fh = sys.stdin
        if not seq_format:
            # There is no file name to infer the format from.
            print("a sequence format must be given when reading from standard input",
                  file=sys.stderr)
            sys.exit(1)

    # The context manager closes the handle even if parsing fails part-way.
    with fh:
        sys.stdout.write('{}\t{}\t{}\t{}\n'.format('seq_id', 'Tm_Wallace', 'Tm_GC', 'Tm_NN'))
        for record in SeqIO.parse(fh, seq_format):
            sys.stdout.write('{}\t{:0.2f}\t{:0.2f}\t{:0.2f}\n'.format(
                record.id,
                mt.Tm_Wallace(record.seq),
                mt.Tm_GC(record.seq),
                mt.Tm_NN(record.seq)))
#!/usr/bin/env python
# Append melting temperatures to a tab-separated table.
#
# Usage: script.py <table.tsv>
# The 4th column of every row holds the sequence.  The header row is the one
# whose 4th column is literally "cdna"; it gets the three new column names.
# Every other row gets Tm_NN, Tm_GC and Tm_Wallace (two decimals each).
# Output rows are written comma-separated to standard output.
import sys
from Bio.SeqUtils import molecular_weight
from Bio.SeqUtils import MeltingTemp as mt

# Diagnostics go to stderr so they do not pollute the table on stdout.
print("python: " + sys.version, file=sys.stderr)
print(sys.argv[1], file=sys.stderr)

with open(sys.argv[1]) as table:
    for raw_line in table:
        fields = raw_line.rstrip('\n').split("\t")
        seq = fields[3]
        if seq != 'cdna':
            # The weight is not written out; the call is kept because it
            # may still raise for sequences it cannot handle.
            mw = molecular_weight(seq, 'DNA', False)
            for tm_method in (mt.Tm_NN, mt.Tm_GC, mt.Tm_Wallace):
                fields.append('%0.2f' % tm_method(seq))
        else:
            fields.extend(["tm_nn", "tm_gc", "tm_wallace"])
        print(",".join(fields))
# st.warning("Please enter a FASTA Sequence !") # st.stop() #INPUT USING UPLOAD st.set_option('deprecation.showfileUploaderEncoding', False) seqfile = st.file_uploader("Upload DNA fasta file", type=["fasta", "fa"]) #BASIC STATS CALCULATION if seqfile is not None: dnarecord = SeqIO.read(seqfile, "fasta") dnaID = dnarecord.id dnadescript = dnarecord.description dnaseq = dnarecord.seq length = len(dnaseq) gccont = GC(dnaseq) melttemp = MeltingTemp.Tm_GC(dnaseq) dnafreq = Counter(dnaseq) #SETTING RADIO BUTTONS FOR OPTIONS ID|DESCRIPTION|SEQUENCE details = st.radio("Sequence Details", ("ID", "Description", "Sequence")) if details == "Description": st.write(dnadescript) elif details == "Sequence": st.write(dnaseq) elif details == "ID": st.write(dnaID) #SETTING RADIO BUTTON FOR OPTIONS LENGTH|FREQUENCY TABLE|GC CONTENT|MELTING TEMPERATURE|PLOT NUCLEOTIDE FREQUENCY stats = st.radio("Sequence Statistics", ("Length", "Frequency Table", "GC-Content",
# Fragment of a per-sequence report writer.  `sequence`,
# `amount_of_nucleotides`, `exit_file` and the four *_values lists are
# defined before this fragment (not visible here).
# Per-base counts for C, G and T.
exit_file.writelines('C: {}\n'.format(sequence.count("C")))
exit_file.writelines('G: {}\n'.format(sequence.count("G")))
exit_file.writelines('T: {}\n'.format(sequence.count("T")))
# Calculating the percentage of GC
amount_of_GC = sequence.count("C") + sequence.count("G")
percentage_of_GC = (amount_of_GC / amount_of_nucleotides) * 100
exit_file.writelines('Amount of GC: {}\n'.format(amount_of_GC))
exit_file.writelines('% of GC: {}%\n'.format('%0.2f' % percentage_of_GC))
exit_file.writelines('\nMelting Temperature Values\n')
# Calculating Melting Temperature (GC-content and nearest-neighbor methods)
exit_file.writelines('Tm_GC: {}\n'.format('%0.2f' % mt.Tm_GC(sequence)))
exit_file.writelines('Tm_NN: {}\n'.format('%0.2f' % mt.Tm_NN(sequence)))
# From University of Arizona Formula:
# Tm = 64.9 + 0.41 * (%GC) - 500 / length
tm = 64.9 + 0.41 * percentage_of_GC - (500 / amount_of_nucleotides)
exit_file.writelines('Arizona\'s: {}\n'.format('%0.2f' % tm))
exit_file.writelines('\n')
# Getting information for each graphic
# NOTE(review): Tm_GC and Tm_NN are computed a second time here rather than
# reusing the values written to the report above.
TMarizona_values.append(tm)
TMGC_values.append(mt.Tm_GC(sequence))
TMNN_values.append(mt.Tm_NN(sequence))
GC_values.append(percentage_of_GC)
exit_file.close()
seqList = [line for line in clean2 if re.match(r'^[AGCT]+$', line)] sequence = "".join(i for i in seqList[:bases]) def gcContent(sequence): count = 0 for i in sequence: if i == 'G' or i == 'C': count += 1 else: count = count return round((count / bases) * 100, 1) gc = gcContent(sequence) tm = mt.Tm_GC(sequence, Na=50) moleWeight = round(mw(Seq(sequence, generic_dna)), 2) dilWeight = float(clean2[clean2.index("ug/OD260:") + 10:clean2.index("ug/OD260:") + 14]) dilution = dilWeight * 10 primerDict = { "Primer Data": { "Sequence": sequence, "Bases": bases, "TM (50mM NaCl)": tm, "% GC content": gc, "Molecular weight": moleWeight, "ug/0D260": dilWeight, "Dilution volume (uL)": dilution }, "Shipment Info": {
def complete_tasks(full_seq, des, unique_key):
    """Render the Streamlit analysis widgets for one DNA sequence.

    Sections, in display order: description/sequence details, nucleotide
    frequency plot, sequence properties (GC content, GC-based melting
    temperature, GC skew, complement / reverse complement) and protein
    synthesis (transcription, translation, amino-acid frequency plot).

    :param full_seq: the sequence to analyse; it must provide
        ``complement``, ``reverse_complement``, ``transcribe`` and
        ``translate``
    :param des: description text shown when "Description" is selected
    :param unique_key: key passed to every widget created here
    :return: None; all output is written to the Streamlit page
    """
    file_details = st.radio("Details", ("Description", "Sequence"), key=unique_key)

    # Show description and sequence in DNA Analysis section
    if file_details == "Description":
        st.write(des)
    elif file_details == "Sequence":
        st.write(full_seq)

    # Nucleotide occurrences plot and colour selector for the bars
    st.subheader("Plot Nucleotide Frequency")
    full_seq_freq = OrderedDict(Counter(full_seq))
    bar_colours = [
        st.beta_color_picker("Pick Colour for Bar {}".format(bar_no), key=unique_key)
        for bar_no in range(1, 5)
    ]

    if st.button("Plot Frequency", key=unique_key):
        barlist = plt.bar(full_seq_freq.keys(), full_seq_freq.values())
        # zip() stops at the shorter input, so a sequence with fewer than
        # four distinct symbols no longer raises IndexError; bars beyond
        # the fourth keep matplotlib's default colour, as before.
        for bar, colour in zip(barlist, bar_colours):
            bar.set_color(colour)
        st.pyplot()

    st.subheader("Properties")

    # GC Content, GC Melting temp, GC_skew, Complement and reverse complement
    gc_count = GC(full_seq)
    st.write("GC Content: {}".format(gc_count))

    # Named tm_gc (not "mt") to avoid shadowing the usual MeltingTemp alias.
    tm_gc = MeltingTemp.Tm_GC(full_seq, strict=False)
    st.write("Melting Temperature based on GC Content: {}".format(tm_gc))

    gc_skew_bases = st.number_input("Enter number of bases", key=unique_key)
    try:
        gc_skew = GC_skew(full_seq, int(gc_skew_bases))
        st.write("GC Skew for {} bases: {}".format(gc_skew_bases, gc_skew))
    except ValueError:
        st.write("Enter a Valid Number for bases")

    # The elif means the second checkbox is only created while the first
    # one is unchecked.
    if st.checkbox("Complement", key=unique_key):
        st.write(full_seq.complement())
    elif st.checkbox("Reverse Complement", key=unique_key):
        st.write(full_seq.reverse_complement())

    # Protein Synthesis
    st.subheader("Protein Synthesis")
    p1 = full_seq.translate()

    if st.checkbox("Transcription: DNA to mRNA", key=unique_key):
        st.write(full_seq.transcribe())
    elif st.checkbox("Translation: DNA to 1 letter Amino Acid Sequence", key=unique_key):
        st.write(p1)
    elif st.checkbox("Translation: DNA to 3 letter Amino Acid Sequence", key=unique_key):
        # Stop codons ("*") are removed before converting to 3-letter codes.
        full_aa_name = str(p1).replace("*", "")
        st.write(seq3(full_aa_name))
    elif st.checkbox("Plot Amino Acid Frequency", key=unique_key):
        aa_freq = OrderedDict(Counter(str(p1)))
        bar_colour = st.beta_color_picker("Pick Colour for all Bars", key=unique_key)
        plt.bar(aa_freq.keys(), aa_freq.values(), color=bar_colour)
        st.pyplot()
        st.write("Asterisk (*) - Denotes Stop Codons.")
def melting_temp(sequence):
    """Return the GC-based melting temperature of ``sequence``.

    ``Tm_GC`` is called with ``strict=False``.  If the calculation raises
    ``ZeroDivisionError`` (presumably for an empty sequence - verify against
    callers), ``None`` is returned instead of propagating the error.
    """
    try:
        tm_gc = MeltingTemp.Tm_GC(sequence, strict=False)
    except ZeroDivisionError:
        return None
    else:
        return tm_gc
def temperatures(dic):
    """Return ``(Tm_Wallace, Tm_GC, Tm_NN)`` for ``dic``, rounded to 2 decimals.

    Despite its name, ``dic`` is passed straight to the melting-temperature
    functions as the sequence.  All three are called with ``strict=False``.
    """
    tm_methods = (mt.Tm_Wallace, mt.Tm_GC, mt.Tm_NN)
    return tuple(round(tm_method(dic, strict=False), 2) for tm_method in tm_methods)
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
from Bio.SeqUtils import MeltingTemp as mt
import numpy as np

# Tile every record with 80-base oligos whose start positions are 200 bases
# apart and print one tab-separated line per oligo:
#   <id[0:29]>_M_<start>_<end>    oligo sequence    percent GC    mean Tm
# where mean Tm is the average of the nearest-neighbor and GC estimates.
OLIGO_LENGTH = 80
OLIGO_STEP = 200

for myseq in SeqIO.parse(
        "./hantavirus_M_1200_3700_segments_seq_1st_filter_oligos.fasta",
        "fasta"):
    # Same windows as the former `while j < len(myseq)` loop: a window is
    # used only when its end index is strictly below the record length.
    # NOTE(review): a window ending exactly at the last base is therefore
    # skipped - confirm whether that is intended.
    for i in range(0, len(myseq) - OLIGO_LENGTH, OLIGO_STEP):
        j = i + OLIGO_LENGTH
        oligo = myseq.seq[i:j]
        a = [mt.Tm_NN(oligo, strict=False), mt.Tm_GC(oligo, strict=False)]
        Tm_mean = (sum(a) / 2)
        # Single parenthesised argument: prints identically under Python 2
        # and is valid syntax under Python 3 (the old print statement was
        # not).
        print('%s_M_%i_%i\t%s\t%0.2f\t%0.2f' % (
            myseq.id[0:29], i, j, oligo, GC(oligo), Tm_mean))
def main(target_file, reference_database):
    """Enumerate candidate 25-nt probes per target and screen them with bowtie2.

    For every record in ``target_file`` (FASTA):

    1. derive the gene name from the record name and require it to be
       present in ``reference_database``,
    2. slide a 25-base window over the sequence, reverse-complement it and
       keep windows that pass the GC, homopolymer and junction filters,
    3. write the survivors to ``./part1/<gene>_AllProbes.fasta``,
    4. align them with bowtie2 against ``reference_database`` (output in
       ``only_bowtie.sam`` in the current directory),
    5. write every alignment with at most 6 mismatches to
       ``./part1/<gene>.sub_sam`` as
       ``probe<TAB>hit gene<TAB>hit name<TAB>mismatches``.

    :param target_file: path to the FASTA file with the target sequences
    :param reference_database: path used both as the reference FASTA file
        and as the bowtie2 index name (``-x``)
    :raises FileNotFoundError: if either path does not exist
    :raises ValueError: if a target's gene name is not in the reference
    """
    start_time = time.time()

    # make sure cmdline arg filepaths exist
    if not os.path.exists(target_file):
        raise FileNotFoundError("infile not found: {}".format(target_file))
    if not os.path.exists(reference_database):
        raise FileNotFoundError(
            "reference file not found: {}".format(reference_database))

    targets = SeqIO.parse(target_file, 'fasta')
    reference_file = SeqIO.parse('{}'.format(reference_database), 'fasta')

    # Create a folder named 'part1' within the current working directory, this
    # will be where all probes are saved and the sub_sam files.
    working_dir = os.getcwd()
    os.makedirs(os.path.dirname('{}/part1/'.format(working_dir)), exist_ok=True)

    # Load the gene names from the reference database (6th '|'-separated
    # field of each record name).  Targets are validated against this set,
    # which catches genes missing from the reference (e.g., eGFP) and names
    # that are not an exact match (e.g., Cd8 vs. CD8).
    database_gene_list = {seq.name.split('|')[5] for seq in reference_file}

    # Accepted base pairs at positions 6-7 of the probe, written as
    # sub_seq[6] + sub_seq[7].
    allowed_junctions = frozenset(
        ('CT', 'TA', 'TT', 'GA', 'AA', 'AT', 'CA', 'GT', 'TC'))

    for target in targets:
        # The gene name is the record name without its last '-'-separated
        # field.  For names with zero, one or two dashes this gives exactly
        # what the previous three-way branch produced; names with more
        # dashes used to leave target_name unset (or stale from the
        # previous record) and now follow the same rule.
        target_name = str(target.name).rsplit('-', 1)[0]

        print('Now predicting probes for {}'.format(target_name))

        # A target that is missing from the reference aborts the run.
        # Useful for trouble shooting gene names that are not an exact match
        # (e.g., Cd8 vs. CD8).
        if target_name not in database_gene_list:
            raise ValueError('Sequence {} from infile is not found in the '
                             'reference database. This should not happen '
                             'if using the output from Script 0.'
                             ''.format(target_name))

        seq = target.seq
        sub_seq_list = []

        # iterate over all 25 bp windows, avoid G/C homotetramers, ensure
        # proper junction use, and avoid excessively low or high GC use
        for n in range(0, len(seq) - 24):
            sub_seq = str(seq[n:n + 25].reverse_complement()).upper()

            gc_percent = GC(sub_seq)
            if not 50 < gc_percent < 90:
                continue
            if 'GGGG' in sub_seq or 'CCCC' in sub_seq:
                continue
            if sub_seq[6:8] not in allowed_junctions:
                continue

            probe_name = '{}_{}-{}'.format(target_name, n, n + 25)
            sub_seq_list.append({
                'Name': probe_name,
                'Sequence': sub_seq,
                'Tm': mt.Tm_GC(sub_seq, Na=300)
            })

        # Write every candidate as ">name_Tm" / sequence.  The file is
        # closed before bowtie2 reads it.
        pre_triage_probe_dict = {}
        with open('./part1/{}_AllProbes.fasta'.format(target_name),
                  'w') as temp_probe_list:
            for hit in sub_seq_list:
                probe_id = '{}_{}'.format(hit['Name'], int(hit['Tm']))
                temp_probe_list.write('>{}\n{}\n'.format(
                    probe_id, hit['Sequence']))
                pre_triage_probe_dict[probe_id] = hit['Sequence']
        print(len(pre_triage_probe_dict))

        # Drop the SAM file of the previous target, if any.
        try:
            os.remove('only_bowtie.sam')
        except OSError:
            pass

        buildcmd = [
            'bowtie2', '--reorder', '--no-sq', '--nofw', '-p',
            '{}'.format(THREADS), '-D', '20', '-R', '3', '-N', '1', '-L', '9',
            '-i', 'L,0,0.80', '--gbar', '13', '-k', '50000', '-x',
            '{}'.format(reference_database), '-f',
            './part1/{}_AllProbes.fasta'.format(target_name), '-S',
            'only_bowtie.sam', '--score-min', 'C,-42,0'
        ]
        #'--rdg', '5,10',
        #'--rfg', '5,10']
        subprocess.call(buildcmd)

        # parse bowtie2 output
        with open('./part1/{}.sub_sam'.format(target_name),
                  'w') as pruned_bowtie_results, \
                open('only_bowtie.sam', 'r') as bowtie_output:
            for result in bowtie_output:
                if result.startswith('@'):
                    continue  # SAM header line
                fields = result.split('\t')
                probe_name = fields[0]
                hit_name = fields[2]
                if hit_name == '*':
                    # Unaligned record: there is no reference name to
                    # split, which previously raised IndexError.
                    continue
                hit_gene_name = hit_name.split('|')[5]
                for detail in fields:
                    # search for edit difference between target and probe
                    if 'NM:i:' in detail:
                        mismatches = int(detail.replace('NM:i:', ''))
                        if mismatches <= 6:
                            pruned_bowtie_results.write(
                                '{}\t{}\t{}\t{}\n'.format(
                                    probe_name, hit_gene_name, hit_name,
                                    mismatches))

    targets.close()
    reference_file.close()
    print("--- {} seconds ---".format(time.time() - start_time))
subdat = [record.id] if is_it_an_orf(str(record.seq)): orf_nt = str(record.seq) orf_aa = str(record.seq.translate()).replace("*", "") if trim: orf_aa = orf_aa[1:] orf_nt = orf_nt[3:] length = len(orf_nt.upper()) mw = Analyze(orf_aa).molecular_weight() pI = Analyze(orf_aa).isoelectric_point() aroma = Analyze(orf_aa).aromaticity() hydrophobe = Analyze(orf_aa).gravy() instability = Analyze(orf_aa).instability_index() cai = CAI.cai_for_gene(orf_nt.upper()) mp = mt.Tm_GC(orf_nt) A = orf_nt.upper().count("A") T = orf_nt.upper().count("T") C = orf_nt.upper().count("C") G = orf_nt.upper().count("G") CpG = orf_nt.upper().count("CG") + orf_nt.upper().count( "GC") # a forward GpC is a reverse CpG stop = stopz[orf_nt.upper()[-3:]] subdat.extend([ length, mw, mp, pI, aroma, hydrophobe, instability, cai, A, T, C, G, CpG, stop ]) nuWreck = record.translate() nuWreck.id = record.id