示例#1
0
def evalPrimerPairMT(fprimer, rprimer, ret_mt=False):
    """Check whether a primer pair meets the melting-temperature criteria.

    Melting temperatures are computed with ``MeltingTemp.Tm_GC`` (used for
    the pass/fail decision) and ``MeltingTemp.Tm_NN`` (printed for
    information only), both with Na=50, Mg=3 and dNTPs=0.8.  The original
    author cites these as IDT PCR parameters (oligo 0.2 uM, Na 50 mM,
    Mg 3 mM, dNTPs 0.8 mM) and an optimal primer Tm of 60-64 °C.

    A pair passes when, based on the GC-method values:
      * the two temperatures differ by at most 3 (the original note
        recommends 2 °C, but the code has always enforced 3),
      * the higher temperature is at most 64, and
      * the lower temperature is at least 60.

    :param fprimer: forward primer sequence.
    :param rprimer: reverse primer sequence.
    :param ret_mt: when true, return the two GC-method temperatures on
        success instead of ``True``.
    :return: ``False`` when the pair fails; otherwise ``True`` or, if
        ``ret_mt`` is set, the tuple ``(fprimer_MT, rprimer_MT)``.
    """
    fprimer_MT = MeltingTemp.Tm_GC(fprimer, Na=50, Mg=3, dNTPs=0.8)
    rprimer_MT = MeltingTemp.Tm_GC(rprimer, Na=50, Mg=3, dNTPs=0.8)

    fprimer_MT_NN = MeltingTemp.Tm_NN(fprimer, Na=50, Mg=3, dNTPs=0.8)
    # Fixed: this used to be computed from the forward primer, so the
    # reverse primer's nearest-neighbour value in the report was wrong.
    rprimer_MT_NN = MeltingTemp.Tm_NN(rprimer, Na=50, Mg=3, dNTPs=0.8)

    print(
        f"forw primer: {fprimer}\nforw primer MT: {fprimer_MT} {fprimer_MT_NN} \n"
        f"rev  primer: {rprimer}\nrev primer MT : {rprimer_MT} {rprimer_MT_NN} \n"
    )

    # Filter for primers that meet the melting-temperature standards.
    pair_passes = (
        math.fabs(fprimer_MT - rprimer_MT) <= 3
        and max(fprimer_MT, rprimer_MT) <= 64
        and min(fprimer_MT, rprimer_MT) >= 60
    )
    if not pair_passes:
        print("MT for the primer pairs did not meet standards\n")
        return False

    print("MT of primer pair passed.\n")
    if ret_mt:
        return fprimer_MT, rprimer_MT
    return True
def printNSeq(num_seq=20,seq_len=20,GC_low_cutoff=40,GC_high_cutoff=60):
    """Generate and pretty-print random sequences within a GC-content window.

    Sequences come from ``randSeqGen(seq_len)``; each is kept only when
    ``GCpercent`` of it lies in ``[GC_low_cutoff, GC_high_cutoff]``.
    Generation continues until ``num_seq`` sequences have been kept.  Each
    kept entry is a tuple ``(sequence string, GC value, Tm_GC rounded to 3)``.

    Note: despite the local naming history, no de-duplication is performed.
    """
    accepted = []
    while len(accepted) < num_seq:
        candidate = randSeqGen(seq_len)
        gc_value = GCpercent(candidate)
        if not (GC_low_cutoff <= gc_value <= GC_high_cutoff):
            # Outside the allowed GC window: draw another sequence.
            continue
        label = str(candidate)
        tm_gc = round(MT.Tm_GC(candidate, Na=50, Mg=3, dNTPs=0.8), 3)
        accepted.append((label, gc_value, tm_gc))
    pprint(accepted)

    def evalSeqMT(seq, ret_mt=False):
        """Check that a sequence's GC-method Tm lies strictly between 59 and 65.

        Returns ``False`` when out of range; otherwise ``True`` or, when
        ``ret_mt`` is not equal to ``False``, the Tm rounded to 2 decimals.
        NOTE(review): this helper is defined after the printing step and is
        neither called nor returned here -- confirm whether it was meant to
        be a module-level function.
        """
        tm_gc = round(MT.Tm_GC(seq, Na=50, Mg=3, dNTPs=0.8), 2)

        if not (59.0 < tm_gc < 65.0):
            return False
        return True if ret_mt == False else tm_gc
示例#4
0
 def calc_tm_value(self):
     """Compute the mean Tm of the forward/reverse primers by three methods.

     Reads ``self.primer_fw`` and ``self.primer_rv``, upper-cases them and
     stores them as ``Seq`` objects.  For each method (Wallace, GC,
     nearest-neighbour) the mean of the two primers' values is stored on
     ``self``, and a two-column summary table rounded to one decimal is
     built in ``self.tm_table``.
     """
     self.primer_fw_seq = Seq(self.primer_fw.upper())
     self.primer_rv_seq = Seq(self.primer_rv.upper())
     primer_pair = (self.primer_fw_seq, self.primer_rv_seq)

     # Wallace rule.
     self.tm_value_Wallace = np.mean(
         [mt.Tm_Wallace(primer) for primer in primer_pair])
     # GC-content method.
     self.tm_value_GC = np.mean(
         [mt.Tm_GC(primer, Na=50, valueset=7) for primer in primer_pair])
     # Nearest-neighbour method.
     self.tm_value_NN = np.mean(
         [mt.Tm_NN(primer, Na=50, nn_table=mt.DNA_NN1)
          for primer in primer_pair])

     # Table headers and row labels are user-facing runtime strings.
     self.tm_table_column = ["計算手法", "Tm値 (°C)"]
     self.tm_list = [
         ["Wallace法", round(self.tm_value_Wallace, 1)],
         ["GC法", round(self.tm_value_GC, 1)],
         ["最近接塩基法", round(self.tm_value_NN, 1)],
     ]
     self.tm_table = pd.DataFrame(self.tm_list,
                                  columns=self.tm_table_column)
示例#5
0
if __name__ == '__main__':
    # Command-line entry point: read sequences from a (possibly gzipped)
    # FASTA/FASTQ file or from stdin and print a tab-separated table of
    # Tm_Wallace, Tm_GC and Tm_NN for every record.
    args = parse_args()

    file, seq_format, fh = args.infile, args.format, None
    if file:
        if not seq_format:
            # Infer the format from the file-name suffix.  The match is
            # case-insensitive, so the captured suffix is lower-cased before
            # use; the dot before "gz" is escaped so only a literal ".gz"
            # counts.
            found = re.search(r'(?i)(fasta|fa|fastq|fq)(\.gz)?$', file)
            if not found:
                print(
                    "invalid file name suffix.\nfile name should like this: infile.[fasta|fa|fastq|fq][.gz]",
                    file=sys.stderr)
                sys.exit(1)
            seq_format = found.group(1).lower()
            # Expand the short suffixes to the full format names.
            seq_format = {'fa': 'fasta', 'fq': 'fastq'}.get(seq_format,
                                                            seq_format)

        # Match the (case-insensitive) suffix detection above when deciding
        # whether the input is gzip-compressed.
        if file.lower().endswith('.gz'):
            fh = gzip.open(file, 'rt')
        else:
            fh = open(file, 'r')
    else:
        fh = sys.stdin
        seq_format = args.format

    try:
        sys.stdout.write('{}\t{}\t{}\t{}\n'.format('seq_id', 'Tm_Wallace',
                                                   'Tm_GC', 'Tm_NN'))
        for seq in SeqIO.parse(fh, seq_format):
            sys.stdout.write('{}\t{:0.2f}\t{:0.2f}\t{:0.2f}\n'.format(
                seq.id, mt.Tm_Wallace(seq.seq), mt.Tm_GC(seq.seq),
                mt.Tm_NN(seq.seq)))
    finally:
        # Release the input handle even if parsing or a Tm calculation fails.
        fh.close()
#!/usr/bin/env python

import sys
from Bio.SeqUtils import molecular_weight
from Bio.SeqUtils import MeltingTemp as mt

# Report the interpreter version and the input path on stderr.
print("python: " + sys.version, end="\n", file=sys.stderr)
print(sys.argv[1], end="\n", file=sys.stderr)

# Read a tab-separated file whose 4th column holds a DNA sequence and emit
# each row as comma-separated text with three melting temperatures appended.
with open(sys.argv[1]) as file:
    for line in file:
        row = line.rstrip('\n').split("\t")
        seq = row[3]

        if seq != 'cdna':
            # Data row.  The molecular weight is computed but not written
            # out; the call is retained as in the original.
            mw = molecular_weight(seq, 'DNA', False)
            row += [
                '%0.2f' % tm_func(seq)
                for tm_func in (mt.Tm_NN, mt.Tm_GC, mt.Tm_Wallace)
            ]
        else:
            # Header row: the sequence column literally reads 'cdna'.
            row += ["tm_nn", "tm_gc", "tm_wallace"]

        print(",".join(row))
    #     st.warning("Please enter a FASTA Sequence !")
    #     st.stop()

    #INPUT USING UPLOAD
    st.set_option('deprecation.showfileUploaderEncoding', False)
    seqfile = st.file_uploader("Upload DNA fasta file", type=["fasta", "fa"])

    #BASIC STATS CALCULATION
    if seqfile is not None:
        dnarecord = SeqIO.read(seqfile, "fasta")
        dnaID = dnarecord.id
        dnadescript = dnarecord.description
        dnaseq = dnarecord.seq
        length = len(dnaseq)
        gccont = GC(dnaseq)
        melttemp = MeltingTemp.Tm_GC(dnaseq)
        dnafreq = Counter(dnaseq)

        #SETTING RADIO BUTTONS FOR OPTIONS ID|DESCRIPTION|SEQUENCE
        details = st.radio("Sequence Details",
                           ("ID", "Description", "Sequence"))
        if details == "Description":
            st.write(dnadescript)
        elif details == "Sequence":
            st.write(dnaseq)
        elif details == "ID":
            st.write(dnaID)

        #SETTING RADIO BUTTON FOR OPTIONS LENGTH|FREQUENCY TABLE|GC CONTENT|MELTING TEMPERATURE|PLOT NUCLEOTIDE FREQUENCY
        stats = st.radio("Sequence Statistics",
                         ("Length", "Frequency Table", "GC-Content",
示例#8
0
            exit_file.writelines('C: {}\n'.format(sequence.count("C")))
            exit_file.writelines('G: {}\n'.format(sequence.count("G")))
            exit_file.writelines('T: {}\n'.format(sequence.count("T")))

            # Calculating the percentage of GC
            amount_of_GC = sequence.count("C") + sequence.count("G")
            percentage_of_GC = (amount_of_GC / amount_of_nucleotides) * 100

            exit_file.writelines('Amount of GC: {}\n'.format(amount_of_GC))
            exit_file.writelines('% of GC: {}%\n'.format('%0.2f' %
                                                         percentage_of_GC))
            exit_file.writelines('\nMelting Temperature Values\n')

            # Calculating Melting Temperature
            exit_file.writelines('Tm_GC: {}\n'.format('%0.2f' %
                                                      mt.Tm_GC(sequence)))
            exit_file.writelines('Tm_NN: {}\n'.format('%0.2f' %
                                                      mt.Tm_NN(sequence)))

            # From University of Arizona Formula
            tm = 64.9 + 0.41 * percentage_of_GC - (500 / amount_of_nucleotides)
            exit_file.writelines('Arizona\'s: {}\n'.format('%0.2f' % tm))
            exit_file.writelines('\n')

            # Getting information for each graphic
            TMarizona_values.append(tm)
            TMGC_values.append(mt.Tm_GC(sequence))
            TMNN_values.append(mt.Tm_NN(sequence))
            GC_values.append(percentage_of_GC)

exit_file.close()
# Keep only the entries of clean2 that consist solely of A/G/C/T characters.
seqList = [line for line in clean2 if re.match(r'^[AGCT]+$', line)]
# Concatenate the first `bases` matching entries into one sequence string
# (`bases` is defined outside this excerpt).
sequence = "".join(i for i in seqList[:bases])


def gcContent(sequence):
    """Return the G/C share of ``sequence`` as a percentage, to 1 decimal.

    The denominator is the module-level ``bases`` value, not
    ``len(sequence)``.
    """
    gc_total = sum(1 for base in sequence if base in ('G', 'C'))
    return round((gc_total / bases) * 100, 1)


# GC percentage of the assembled sequence.
gc = gcContent(sequence)
# GC-method melting temperature with Na=50
# (presumably `mt` is Bio.SeqUtils.MeltingTemp -- confirm against the imports).
tm = mt.Tm_GC(sequence, Na=50)
# Result of mw() for the sequence, rounded to 2 decimals.
moleWeight = round(mw(Seq(sequence, generic_dna)), 2)
# Parse the 4-character slice of clean2 that starts 10 positions after the
# beginning of the "ug/OD260:" label as a float.
dilWeight = float(clean2[clean2.index("ug/OD260:") +
                         10:clean2.index("ug/OD260:") + 14])
# Ten times the parsed value; reported below as the dilution volume.
dilution = dilWeight * 10
primerDict = {
    "Primer Data": {
        "Sequence": sequence,
        "Bases": bases,
        "TM (50mM NaCl)": tm,
        "% GC content": gc,
        "Molecular weight": moleWeight,
        "ug/0D260": dilWeight,
        "Dilution volume (uL)": dilution
    },
    "Shipment Info": {
示例#10
0
def complete_tasks(full_seq, des, unique_key):
    """Render the Streamlit DNA-analysis widgets for one sequence.

    Shows, in order: a description/sequence selector, a nucleotide-frequency
    bar plot with per-bar colour pickers, GC content, GC-based melting
    temperature, GC skew for a user-chosen window, complement / reverse
    complement, and transcription / translation / amino-acid frequency
    options.

    :param full_seq: the sequence object; it must provide ``complement()``,
        ``reverse_complement()``, ``translate()`` and ``transcribe()``.
    :param des: description text shown when "Description" is selected.
    :param unique_key: key passed to every widget created here.
    """
    file_details = st.radio("Details", ("Description", "Sequence"),
                            key=unique_key)

    # Show description or sequence in the DNA Analysis section.
    if file_details == "Description":
        st.write(des)
    elif file_details == "Sequence":
        st.write(full_seq)

    # Nucleotide occurrences plot and colour selectors for the bars.
    st.subheader("Plot Nucleotide Frequency")
    full_seq_freq = OrderedDict(Counter(full_seq))

    bar1_colour = st.beta_color_picker("Pick Colour for Bar 1", key=unique_key)
    bar2_colour = st.beta_color_picker("Pick Colour for Bar 2", key=unique_key)
    bar3_colour = st.beta_color_picker("Pick Colour for Bar 3", key=unique_key)
    bar4_colour = st.beta_color_picker("Pick Colour for Bar 4", key=unique_key)

    if st.button("Plot Frequency", key=unique_key):
        barlist = plt.bar(full_seq_freq.keys(), full_seq_freq.values())
        # Colour at most the first four bars.  zip() stops at the shorter
        # input, so a sequence with fewer than four distinct symbols no
        # longer raises IndexError.
        chosen_colours = (bar1_colour, bar2_colour, bar3_colour, bar4_colour)
        for bar, colour in zip(barlist, chosen_colours):
            bar.set_color(colour)
        st.pyplot()

    st.subheader("Properties")

    # GC content, GC melting temperature, GC skew, complement and reverse
    # complement.
    gc_count = GC(full_seq)
    st.write("GC Content: {}".format(gc_count))

    mt = MeltingTemp.Tm_GC(full_seq, strict=False)
    st.write("Melting Temperature based on GC Content: {}".format(mt))

    gc_skew_bases = st.number_input("Enter number of bases", key=unique_key)
    try:
        gc_skew = GC_skew(full_seq, int(gc_skew_bases))
        st.write("GC Skew for {} bases: {}".format(gc_skew_bases, gc_skew))
    except ValueError:
        st.write("Enter a Valid Number for bases")

    # The second checkbox is only created when the first one is unchecked.
    if st.checkbox("Complement", key=unique_key):
        st.write(full_seq.complement())

    elif st.checkbox("Reverse Complement", key=unique_key):
        st.write(full_seq.reverse_complement())

    # Protein synthesis.
    st.subheader("Protein Synthesis")
    p1 = full_seq.translate()
    # As above, each later checkbox is only created while all earlier ones
    # in this chain are unchecked.
    if st.checkbox("Transcription: DNA to mRNA", key=unique_key):
        st.write(full_seq.transcribe())

    elif st.checkbox("Translation: DNA to 1 letter Amino Acid Sequence",
                     key=unique_key):
        st.write(p1)

    elif st.checkbox("Translation: DNA to 3 letter Amino Acid Sequence",
                     key=unique_key):
        # Drop stop-codon markers before converting to three-letter codes.
        full_aa_name = str(p1).replace("*", "")
        st.write(seq3(full_aa_name))

    elif st.checkbox("Plot Amino Acid Frequency", key=unique_key):
        aa_freq = OrderedDict(Counter(str(p1)))
        bar_colour = st.beta_color_picker("Pick Colour for all Bars",
                                          key=unique_key)
        plt.bar(aa_freq.keys(), aa_freq.values(), color=bar_colour)
        st.pyplot()
        st.write("Asterisk (*) - Denotes Stop Codons.")
def melting_temp(sequence):
    """Return the GC-method melting temperature of ``sequence``.

    Uses ``MeltingTemp.Tm_GC`` with ``strict=False``.  Returns ``None``
    when the calculation raises ``ZeroDivisionError``.
    """
    try:
        tm_gc = MeltingTemp.Tm_GC(sequence, strict=False)
    except ZeroDivisionError:
        tm_gc = None
    return tm_gc
示例#12
0
def temperatures(dic):
    """Return ``(Tm_Wallace, Tm_GC, Tm_NN)`` for ``dic``, each rounded to 2.

    All three estimates are computed with ``strict=False``.
    """
    wallace = mt.Tm_Wallace(dic, strict=False)
    gc_based = mt.Tm_GC(dic, strict=False)
    nearest_neighbour = mt.Tm_NN(dic, strict=False)
    return round(wallace, 2), round(gc_based, 2), round(nearest_neighbour, 2)
示例#13
0
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
from Bio.SeqUtils import MeltingTemp as mt
import numpy as np

# For every record in the FASTA file, walk 80-base windows whose starts are
# 200 bases apart and print one tab-separated line per window:
# name, window sequence, GC percentage, and the mean of Tm_NN and Tm_GC.
for myseq in SeqIO.parse(
        "./hantavirus_M_1200_3700_segments_seq_1st_filter_oligos.fasta",
        "fasta"):

    i = 0
    j = 80
    # NOTE(review): the strict '<' skips a window that ends exactly at the
    # last base of the record -- confirm whether '<=' was intended.
    while j < len(myseq):
        oligo = myseq.seq[i:j]
        a = [
            mt.Tm_NN(oligo, strict=False),
            mt.Tm_GC(oligo, strict=False)
        ]
        Tm_mean = (sum(a) / 2)

        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('%s_M_%i_%i\t%s\t%0.2f\t%0.2f' % (
            myseq.id[0:29], i, j, oligo, GC(oligo), Tm_mean))
        i += 200
        j = i + 80
示例#14
0
def main(target_file, reference_database):
    """Predict 25-mer probes for every target and screen them with bowtie2.

    For each record in ``target_file`` (FASTA):
      1. every 25-base window is reverse-complemented and kept when its GC
         percentage is strictly between 50 and 90, it contains no ``GGGG``
         or ``CCCC`` run, and the bases at positions 6-7 form one of the
         accepted dinucleotides;
      2. the kept probes are written to ``./part1/<target>_AllProbes.fasta``;
      3. bowtie2 aligns them against ``reference_database`` into
         ``only_bowtie.sam``;
      4. alignments with at most 6 mismatches (``NM:i:`` tag) are written to
         ``./part1/<target>.sub_sam`` as
         ``probe name, hit gene name, hit name, mismatches``.

    The bowtie2 thread count comes from the module-level ``THREADS`` value.

    :param target_file: path to the FASTA file of target sequences.
    :param reference_database: path used both as the reference FASTA (to
        collect gene names) and as the bowtie2 ``-x`` argument.
    :raises FileNotFoundError: if either input path does not exist.
    :raises ValueError: if a target's name is not among the reference gene
        names.
    """
    start_time = time.time()

    # Make sure the command-line argument file paths exist.
    if not os.path.exists(target_file):
        raise FileNotFoundError("infile not found: {}"
                                "".format(target_file))
    if not os.path.exists(reference_database):
        raise FileNotFoundError("reference file not found: {}"
                                "".format(reference_database))

    targets = SeqIO.parse(target_file, 'fasta')
    reference_file = SeqIO.parse('{}'.format(reference_database), 'fasta')

    # Create a folder named 'part1' within the current working directory;
    # all probes and the sub_sam files are saved there.
    working_dir = os.getcwd()
    os.makedirs(os.path.dirname('{}/part1/'.format(working_dir)),
                exist_ok=True)

    # Gene names are the 6th '|'-separated field of each reference record
    # name.  Collecting them lets a mismatch in naming (e.g., Cd8 vs. CD8)
    # be reported before any work is done for that target.
    database_gene_list = {
        record.name.split('|')[5] for record in reference_file
    }

    # Accepted dinucleotides at probe positions 6-7 (0-based).
    allowed_junctions = frozenset(
        ('CT', 'TA', 'TT', 'GA', 'AA', 'AT', 'CA', 'GT', 'TC'))

    for target in targets:
        # Drop the last '-'-delimited suffix, if any.  This reproduces the
        # original handling of names with 0, 1 or 2 dashes and also covers
        # names with more dashes, for which target_name used to be left
        # unset (or stale from the previous target).
        target_name = str(target.name).rsplit('-', 1)[0]
        print('Now predicting probes for {}'.format(target_name))
        # A target missing from the reference is a hard error.
        if target_name not in database_gene_list:
            raise ValueError('Sequence {} from infile is not found in the '
                             'reference database. This should not happen '
                             'if using the output from Script 0.'
                             ''.format(target_name))

        seq = target.seq
        sub_seq_list = []

        # Iterate over all 25 bp windows, avoid homotetramers, ensure proper
        # junction use, and avoid excessively low or high GC use.
        for n in range(0, len(seq) - 24):
            sub_seq = str(seq[n:n + 25].reverse_complement()).upper()

            gc_percent = GC(sub_seq)
            if not 50 < gc_percent < 90:
                continue
            if 'GGGG' in sub_seq or 'CCCC' in sub_seq:
                continue
            if sub_seq[6:8] not in allowed_junctions:
                continue

            sub_seq_list.append({
                'Name': '{}_{}-{}'.format(target_name, n, n + 25),
                'Sequence': sub_seq,
                'Tm': mt.Tm_GC(sub_seq, Na=300)
            })

        # Write every candidate probe as FASTA; the header is the probe name
        # followed by its Tm truncated to an integer.
        pre_triage_probe_dict = {}
        with open('./part1/{}_AllProbes.fasta'.format(target_name),
                  'w') as temp_probe_list:
            for hit in sub_seq_list:
                probe_id = '{}_{}'.format(hit['Name'], int(hit['Tm']))
                temp_probe_list.write('>{}\n{}\n'.format(
                    probe_id, hit['Sequence']))
                pre_triage_probe_dict[probe_id] = hit['Sequence']
        print(len(pre_triage_probe_dict))

        # Remove any SAM output left over from a previous target.
        try:
            os.remove('only_bowtie.sam')
        except OSError:
            pass

        buildcmd = [
            'bowtie2', '--reorder', '--no-sq', '--nofw', '-p',
            '{}'.format(THREADS), '-D', '20', '-R', '3', '-N', '1', '-L', '9',
            '-i', 'L,0,0.80', '--gbar', '13', '-k', '50000', '-x',
            '{}'.format(reference_database), '-f',
            './part1/{}_AllProbes.fasta'.format(target_name), '-S',
            'only_bowtie.sam', '--score-min', 'C,-42,0'
        ]

        # The sub_sam file is opened before bowtie2 runs, as before, so it
        # is created even when the alignment step produces nothing.
        with open('./part1/{}.sub_sam'.format(target_name),
                  'w') as pruned_bowtie_results:
            subprocess.call(buildcmd)

            # Parse the bowtie2 output.
            with open('only_bowtie.sam', 'r') as bowtie_output:
                for result in bowtie_output:
                    if result[0] == '@':
                        continue  # SAM header line
                    fields = result.split('\t')

                    # Edit distance between target and probe.  Reset for
                    # every record so that a record without an NM tag can
                    # never reuse the previous record's value.
                    mismatches = None
                    for detail in fields:
                        if 'NM:i:' in detail:
                            mismatches = int(detail.replace('NM:i:', ''))
                    if mismatches is None or mismatches > 6:
                        continue

                    probe_name = fields[0]
                    hit_name = fields[2]
                    hit_gene_name = hit_name.split('|')[5]
                    pruned_bowtie_results.write('{}\t{}\t{}\t{}\n'
                                                ''.format(
                                                    probe_name, hit_gene_name,
                                                    hit_name, mismatches))

    targets.close()
    reference_file.close()

    print("--- {} seconds ---".format(time.time() - start_time))
    subdat = [record.id]
    if is_it_an_orf(str(record.seq)):
        orf_nt = str(record.seq)
        orf_aa = str(record.seq.translate()).replace("*", "")
        if trim:
            orf_aa = orf_aa[1:]
            orf_nt = orf_nt[3:]

        length = len(orf_nt.upper())
        mw = Analyze(orf_aa).molecular_weight()
        pI = Analyze(orf_aa).isoelectric_point()
        aroma = Analyze(orf_aa).aromaticity()
        hydrophobe = Analyze(orf_aa).gravy()
        instability = Analyze(orf_aa).instability_index()
        cai = CAI.cai_for_gene(orf_nt.upper())
        mp = mt.Tm_GC(orf_nt)
        A = orf_nt.upper().count("A")
        T = orf_nt.upper().count("T")
        C = orf_nt.upper().count("C")
        G = orf_nt.upper().count("G")
        CpG = orf_nt.upper().count("CG") + orf_nt.upper().count(
            "GC")  # a forward GpC is a reverse CpG

        stop = stopz[orf_nt.upper()[-3:]]

        subdat.extend([
            length, mw, mp, pI, aroma, hydrophobe, instability, cai, A, T, C,
            G, CpG, stop
        ])
        nuWreck = record.translate()
        nuWreck.id = record.id