Exemplo n.º 1
0
def evalPrimerPairMT(fprimer, rprimer, ret_mt=False):
    """Check whether a primer pair meets the melting-temperature criteria.

    The optimal melting temperature of the primers is 60–64°C, with
    an ideal temperature of 62°C, which is based on typical cycling and
    reaction conditions and the optimum temperature for PCR enzyme function.
    Ideally the two primers should have similar melting temperatures so that
    both bind simultaneously and efficiently amplify the product.
    PCR parameters used are from IDT: Oligo 0.2 uM Na 50 mM, Mg 3 mM,
    dNTPs 0.8 mM.

    The pass/fail decision uses the ``Tm_GC`` values only: both must lie in
    the closed interval [60, 64] and differ by at most 3. The ``Tm_NN``
    values are computed for the printed report and do not affect the result.

    :param fprimer: forward primer sequence.
    :param rprimer: reverse primer sequence.
    :param ret_mt: when true, a passing pair returns the two ``Tm_GC`` values
        instead of ``True``.
    :return: ``False`` when the pair fails; otherwise ``True`` or the tuple
        ``(fprimer_MT, rprimer_MT)`` depending on ``ret_mt``.
    """
    conditions = dict(Na=50, Mg=3, dNTPs=0.8)

    fprimer_MT = MeltingTemp.Tm_GC(fprimer, **conditions)
    rprimer_MT = MeltingTemp.Tm_GC(rprimer, **conditions)

    fprimer_MT_NN = MeltingTemp.Tm_NN(fprimer, **conditions)
    # The reverse primer's NN value must be computed from the reverse primer
    # (the previous code passed the forward primer here).
    rprimer_MT_NN = MeltingTemp.Tm_NN(rprimer, **conditions)

    print(
        f"forw primer: {fprimer}\nforw primer MT: {fprimer_MT} {fprimer_MT_NN} \n"
        f"rev  primer: {rprimer}\nrev primer MT : {rprimer_MT} {rprimer_MT_NN} \n"
    )

    # Filter for primers that meet the MT standards.
    within_tolerance = math.fabs(fprimer_MT - rprimer_MT) <= 3
    within_window = (max(fprimer_MT, rprimer_MT) <= 64
                     and min(fprimer_MT, rprimer_MT) >= 60)
    if not (within_tolerance and within_window):
        print("MT for the primer pairs did not meet standards\n")
        return False

    print("MT of primer pair passed.\n")
    if not ret_mt:
        return True
    return fprimer_MT, rprimer_MT
Exemplo n.º 2
0
def Tm_feature(data, pam_audit=True, learn_options=None):
    """Build a five-column melting-temperature feature table from ``data["30mer"]``.

    Column 0 holds ``Tm.Tm_staluc`` of the whole sequence; columns 1-4 hold
    it for the four sub-sequences selected by the ``(start, stop)`` pairs in
    ``learn_options["Tm segments"]`` (or the built-in defaults).

    :param data: table with a ``"30mer"`` column of sequences.
    :param pam_audit: when true, rows whose positions 25-26 are not ``"GG"``
        are skipped; their five features keep the initial value 1.0.
    :param learn_options: optional mapping; only ``"Tm segments"`` is read.
    :return: DataFrame indexed like ``data`` with the five Tm columns.
    """
    if learn_options is None or "Tm segments" not in learn_options:
        segments = [(19, 24), (11, 19), (6, 11), (4, 24)]
    else:
        segments = learn_options["Tm segments"]
    sequence = data["30mer"].values
    featarray = np.ones((sequence.shape[0], 5))
    rna = True
    for i, seq in enumerate(sequence):
        if pam_audit and seq[25:27] != "GG":
            # Rows failing the PAM audit are skipped. An unreachable ``raise``
            # used to follow this ``continue``; it never ran and was removed.
            continue

        featarray[i, 0] = Tm.Tm_staluc(seq, rna=rna)  # whole 30mer
        # Columns 1..4 follow the order of ``segments``; with the defaults:
        # 5 nt proximal to the NGG PAM, 8-mer, 5-mer, 20-nt spacer.
        for col in range(1, 5):
            seg = segments[col - 1]
            featarray[i, col] = Tm.Tm_staluc(seq[seg[0]:seg[1]], rna=rna)
    feat = pd.DataFrame(featarray,
                        index=data.index,
                        columns=[
                            "Tm global_30mer%s" % rna,
                            "5mer_end_%s" % rna,
                            "8mer_middle_%s" % rna,
                            "5mer_start_%s" % rna,
                            "Tm global_spacer_%s" % rna,
                        ])
    return feat
Exemplo n.º 3
0
def get_thermo(dict, guide_sequence, context_sequence):
    """Store nearest-neighbour Tm values for the context and guide segments.

    Four entries are written into ``dict`` (which is also returned): the Tm
    of the full context sequence, of the last five guide bases, of guide
    bases 2-6 and of the stretch between index 7 and the last five bases.
    """
    # Use Biopython to get thermo info. from context and guides
    melt = MeltingTemp.Tm_NN
    dict['Tm, context'] = melt(context_sequence)
    guide_segments = (
        ('Tm, 5mer-15', guide_sequence[-5:]),
        ('Tm, 5mer-3', guide_sequence[2:7]),
        ('Tm, middle', guide_sequence[7:-5]),
    )
    for key, segment in guide_segments:
        dict[key] = melt(segment)
    return dict
Exemplo n.º 4
0
def Tm_feature(data, feature_options=None):
    """Build a four-column melting-temperature feature table from ``data``.

    Column 0 holds ``Tm.Tm_staluc`` of the whole sequence; columns 1-3 hold
    it for the sub-sequences selected by the first three ``(start, stop)``
    pairs in ``feature_options['Tm segments']`` (or the built-in defaults).

    :param data: pandas object whose ``.values`` are the sequences and whose
        ``.index`` labels the returned rows.
    :param feature_options: optional mapping; only ``'Tm segments'`` is read.
    :return: DataFrame indexed like ``data`` with the four Tm columns.
    """
    if feature_options is None or 'Tm segments' not in feature_options:
        segments = [(15, 21), (4, 13), (0, 4)]
    else:
        segments = feature_options['Tm segments']

    sequence = data.values
    featarray = np.ones((sequence.shape[0], 4))

    # Defined before the loop: the column names below need it even when
    # ``data`` is empty and the loop body never runs.
    rna = False

    for i, seq in enumerate(sequence):
        featarray[i, 0] = Tm.Tm_staluc(seq, rna=rna)  # whole-sequence Tm
        # With the default segments: 5 nt proximal to the NGG PAM, the
        # middle stretch, then the 4-mer at the start.
        for col in range(1, 4):
            seg = segments[col - 1]
            featarray[i, col] = Tm.Tm_staluc(seq[seg[0]:seg[1]], rna=rna)

    feat = pandas.DataFrame(featarray,
                            index=data.index,
                            columns=[
                                "Tm global_%s" % rna,
                                "5mer_end_%s" % rna,
                                "8mer_middle_%s" % rna,
                                "4mer_start_%s" % rna,
                            ])
    return feat
Exemplo n.º 5
0
def Tm_feature(data):
    '''
    assuming '30mer' is a key
    get melting temperature features from:
        0-the 30-mer ("global Tm")
        1-the Tm (melting temperature) of the DNA:RNA hybrid from positions 16 - 20 of the sgRNA, i.e. the 5nts immediately proximal of the NGG PAM
        2-the Tm of the DNA:RNA hybrid from position 8 - 15 (i.e. 8 nt)
        3-the Tm of the DNA:RNA hybrid from position 3 - 7  (i.e. 5 nt)

    Raises Exception when positions 25-26 of a sequence are not "GG".
    Returns a DataFrame indexed like ``data`` with the four Tm columns.
    '''
    sequence = data['30mer'].values
    featarray = np.ones((sequence.shape[0], 4))

    # Defined before the loop: the column names below need it even when
    # ``data`` is empty and the loop body never runs.
    rna = False

    for i, seq in enumerate(sequence):
        if seq[25:27] != "GG":
            raise Exception("expected GG but found %s" % seq[25:27])
        featarray[i, 0] = Tm.Tm_staluc(seq, rna=rna)  # 30mer Tm
        # 5nts immediately proximal of the NGG PAM
        featarray[i, 1] = Tm.Tm_staluc(seq[20:25], rna=rna)
        featarray[i, 2] = Tm.Tm_staluc(seq[12:20], rna=rna)  # 8-mer
        featarray[i, 3] = Tm.Tm_staluc(seq[7:12], rna=rna)  # 5-mer

    feat = pandas.DataFrame(featarray,
                            index=data.index,
                            columns=[
                                "Tm global_%s" % rna,
                                "5mer_end_%s" % rna,
                                "8mer_middle_%s" % rna,
                                "5mer_start_%s" % rna,
                            ])

    return feat
Exemplo n.º 6
0
def nested(seq):
    """Return the first primer candidate in ``seq`` that passes every filter.

    Candidates are prefixes of ``seq`` of length 18 to 25; once all lengths
    at the current start have been rejected, the first base of ``seq`` is
    dropped and the scan restarts at length 18. A candidate is accepted when
    all of the following hold, checked in this order:

    * ``GC(candidate)`` is strictly between 40 and 60,
    * ``GCend(candidate)`` is non-zero,
    * ``mt.Tm_Wallace(candidate)`` lies in [56, 60],
    * the alignment of the candidate against its own reversal shows none of
      the patterns ``****``, ``***-**``, ``**-***`` and
      ``self_dimers(candidate)`` is not 1,
    * ``hairpin(candidate)`` is non-zero.

    :param seq: sequence to scan.
    :return: the accepted candidate, or ``None`` when ``seq`` is exhausted
        without finding one (previously the loop never terminated).
    """
    shortest, longest = 18, 25
    length = shortest
    while True:
        if length == longest + 1:
            # Every length at this start position failed: slide by one base.
            length = shortest
            seq = seq[1:]
        if not seq:
            # Nothing left to scan; stop instead of looping forever.
            return None

        candidate = seq[:length]
        # Every rejection path below moves on to the next length.
        length += 1

        gc_percent = GC(candidate)
        if gc_percent >= 60 or gc_percent <= 40:
            continue
        if GCend(candidate) == 0:
            continue
        wallace_tm = mt.Tm_Wallace(candidate)
        if wallace_tm > 60 or wallace_tm < 56:
            continue

        reversed_candidate = candidate[::-1]
        alignment = SWalig(SWvlmtrx(candidate, reversed_candidate),
                           candidate, reversed_candidate)
        match_track = alignment[2]
        if (match_track.find("****") >= 0
                or self_dimers(candidate) == 1
                or match_track.find("***-**") >= 0
                or match_track.find("**-***") >= 0):
            continue

        if hairpin(candidate) == 0:
            continue
        return candidate
 def BedprobeTm(self, seq7):
     """Return the formamide-corrected Tm of ``seq7`` as a two-decimal string.

     The nearest-neighbour Tm is computed with the instance's salt and
     strand concentrations, rounded to two decimals, then passed through
     ``mt.chem_correction`` with the instance's formamide setting.
     Used for .bed output.
     """
     raw_tm = mt.Tm_NN(seq7, Na=self.sal, dnac1=self.conc1, dnac2=self.conc2)
     # Round-trip through text so the correction starts from the 2-decimal value.
     rounded_tm = float(format(raw_tm, '0.2f'))
     corrected_tm = mt.chem_correction(rounded_tm, fmd=self.form)
     return format(corrected_tm, '0.2f')
Exemplo n.º 8
0
def probeTm(seq1, saltConc, formConc):
    """Return the corrected melting temperature of ``seq1`` as a string.

    The nearest-neighbour Tm at sodium concentration ``saltConc`` is rounded
    to two decimals, adjusted by ``mt.chem_correction`` for the formamide
    value ``formConc``, and formatted with two decimals.
    """
    raw_tm = mt.Tm_NN(seq1, Na=saltConc)
    # Round-trip through text so the correction starts from the 2-decimal value.
    rounded_tm = float(format(raw_tm, '0.2f'))
    corrected_tm = mt.chem_correction(rounded_tm, fmd=formConc)
    return format(corrected_tm, '0.2f')
Exemplo n.º 9
0
def get_primer(seq, direction, name):
    """Grow a primer from one end of ``seq`` until its Tm exceeds ``PRIMER_TM``.

    Starting at 15 bases, the primer is extended one base at a time while
    ``mt.Tm_staluc`` of the candidate is at or below the module-level
    ``PRIMER_TM`` and the length has not passed 65.

    :param seq: template sequence.
    :param direction: ``"fwd"`` takes the primer from the start of ``seq``;
        ``"rev"`` takes it from the end and returns the lower-cased reverse
        complement.
    :param name: label embedded in the primer name.
    :return: list ``[primer_name, primer_seq, primer_tm, primer_length]``.
    :raises ValueError: if ``direction`` is neither ``"fwd"`` nor ``"rev"``.

    Side effect: increments the module-level counter ``PRIMER_NUM``.
    """
    # Tm_NN: Calculation based on nearest neighbor thermodynamics. Several
    # tables for DNA/DNA, DNA/RNA and RNA/RNA hybridizations are included.
    # Correction for mismatches, dangling ends, salt concentration and other
    # additives are available.
    # Tm_staluc is the 'old' NN calculation and is kept for compatibility.
    # It is, however, recommended to use Tm_NN instead, since Tm_staluc may be
    # deprecated in the future. Also, Tm_NN has many more options. Using
    # Tm_staluc and Tm_NN with default parameters gives (essentially) the
    # same results.
    global PRIMER_NUM

    if direction not in ("fwd", "rev"):
        # Previously this fell through to an UnboundLocalError further down.
        raise ValueError(
            "direction must be 'fwd' or 'rev', got {!r}".format(direction))

    length = 15  # minimum primer length
    if direction == "fwd":
        while mt.Tm_staluc(seq[0:length]) <= PRIMER_TM and length <= 65:
            length += 1
        primer_seq = seq[0:length]
    else:
        while mt.Tm_staluc(seq[-length:]) <= PRIMER_TM and length <= 65:
            length += 1
        primer_seq = revcomplement(seq[-length:]).lower()
    primer_tm = mt.Tm_staluc(primer_seq)

    primer_seq = str(primer_seq)
    primer_name = "{}_{}_{}".format(PRIMER_NUM, name, direction)
    primer = [primer_name, primer_seq, primer_tm, length]
    PRIMER_NUM += 1
    return primer
def grow_overlap(startpoint, seq):
    """
    Returns the sequence divided in three parts: 5', overlap and 3'.
    Overlap grows from the middle outwards, regardless of the oligo length limit, that
    is checked afterwards, and if it's not good enough codons get swapped randomly until
    the overlap has the proper length and Tm. The returned strings are the three parts of
    sequence: five prime unique section, the overlap that will be later added to both,
    and the three-prime unique section.

    Reads the module-level names ``minmelt`` and ``args`` (``MgmM``,
    ``dNTPsmM``, ``maxoverlaplen``). Raises AssertionError if the three
    parts do not reassemble into ``seq``.
    """
    # Minimum length of the overlap must be len = Tm/4, which assumes that it's 100% GC.
    # Floor division keeps the values usable as slice indices on Python 3,
    # where ``/`` would produce floats.
    min_len = int(minmelt // 4)
    half = min_len // 2

    def _melt(fragment):
        return MeltingTemp.Tm_NN(fragment, Na=50, K=0, Tris=0,
                                 Mg=args.MgmM, dNTPs=args.dNTPsmM)

    overlap = seq[startpoint - half:startpoint] + seq[startpoint:startpoint + half]
    counter = 0
    tm = _melt(overlap)
    gc = GC(overlap)
    # GC% must be between 40 and 60, Tm should be above the minimum, and the
    # overlap half-width must stay below args.maxoverlaplen.
    while (half + counter < args.maxoverlaplen) and (
            (tm < minmelt) or (gc < 40) or (gc > 60)):
        counter += 1
        # Extend by one base on each side.
        overlap = (seq[startpoint - half - counter] + overlap
                   + seq[startpoint + half + counter - 1])
        tm = _melt(overlap)
        gc = GC(overlap)
    firsthalf = seq[:(startpoint - half - counter)]
    secondhalf = seq[(startpoint + half + counter):len(seq)]
    assert len(firsthalf + overlap + secondhalf) == len(seq)
    assert firsthalf + overlap + secondhalf == seq
    # The translation check against the protein sequence is done later.
    return firsthalf, overlap, secondhalf
Exemplo n.º 11
0
def get_thermo(dict, guide_sequence, context_sequence):
    """Store nearest-neighbour Tm values for the context and guide thirds.

    Writes the Tm of the full context sequence and of the first, middle and
    last third of the guide into ``dict`` and returns it. The last third
    also absorbs any remainder when the guide length is not divisible by 3.
    """
    # Use Biopython to get thermo info. from context and guides
    melt = MeltingTemp.Tm_NN
    dict['Tm, context'] = melt(context_sequence)
    third = len(guide_sequence) // 3
    guide_parts = (
        ('Tm, start', guide_sequence[0:third]),
        ('Tm, mid', guide_sequence[third:2 * third]),
        ('Tm, end', guide_sequence[2 * third:]),
    )
    for key, part in guide_parts:
        dict[key] = melt(part)
    return dict
Exemplo n.º 12
0
 def __probeTm(self):
     """
     Return the corrected melting temperature of ``self.seq`` as a string.

     The nearest-neighbour Tm at sodium concentration ``self.sal`` is
     adjusted by ``mt.chem_correction`` using ``int(self.form)`` as the
     formamide value and formatted with two decimals.
     """
     uncorrected = float(mt.Tm_NN(self.seq, Na=self.sal))
     formamide = int(self.form)
     corrected = mt.chem_correction(uncorrected, fmd=formamide)
     return format(corrected, '0.2f')
Exemplo n.º 13
0
def possible_reverse_seq(seq_d, n, seq_length, mt_min, mt_max, na, tris, mg,
                         dntps, saltcor):
    """Return the window of ``seq_d`` at ``n`` if it qualifies as a primer.

    The window ``seq_d[n:n + seq_length]`` qualifies when it has the full
    requested length, starts with ``C`` or ``G``, and its nearest-neighbour
    Tm lies strictly between ``mt_min`` and ``mt_max``.

    :param seq_d: source sequence.
    :param n: start index of the window.
    :param seq_length: required window length.
    :param mt_min: exclusive lower Tm bound.
    :param mt_max: exclusive upper Tm bound.
    :param na, tris, mg, dntps, saltcor: forwarded to ``mt.Tm_NN`` as
        ``Na``, ``Tris``, ``Mg``, ``dNTPs`` and ``saltcorr``.
    :return: the window as ``str``, or ``None`` when it does not qualify.
    """
    window = seq_d[n:n + seq_length]
    # Check the length first: a window past the end of the sequence is empty
    # and indexing its first base would raise IndexError.
    if len(window) != seq_length or not window:
        return None
    if window[0] not in ('C', 'G'):
        return None

    # Computed once; the bounds check needs the same value twice.
    melting_temp = mt.Tm_NN(window, Na=na, Tris=tris, Mg=mg, dNTPs=dntps,
                            saltcorr=saltcor)
    if mt_min < melting_temp < mt_max:
        return str(window)
    return None
Exemplo n.º 14
0
def seqProbes(mb_seq, mb_size, mb_sscount, probe):
    """Split ``mb_seq`` into probe-sized chunks and collect per-probe data.

    Returns the tuple ``(jl, perl, sumsl, basesl, Tml, Tmp, basesp)``:
    probe numbers starting at 1, the integer percentage of 'A' plus 'G' in
    each chunk of ``mb_bases``, the per-chunk sum of ``mb_sscount`` divided
    by ``probe * mb_so``, the reverse-complement probes, their integer Tm
    values, the Tm values of the parallel probes, and the parallel probes.

    NOTE(review): reads ``mb_bases`` and ``mb_so``, which are not parameters
    and are presumably module-level names - confirm. An earlier comment
    described ``perl`` as the percent of Gs and Cs, but the code counts 'A'
    and 'G' - confirm which is intended.
    """
    def _rna_tm(oligo):
        # Integer-truncated nearest-neighbour Tm with the RNA_NN1 table.
        return int(mt.Tm_NN(oligo,
                            dnac1=50000,
                            dnac2=50000,
                            Na=100,
                            nn_table=mt.RNA_NN1,
                            saltcorr=1))

    chunks = list(itersplit_into_x_chunks(mb_seq, mb_size, probe))
    basesl = [reverse_complement(chunk) for chunk in chunks]
    basesp = [parallel_probe(chunk) for chunk in chunks]

    Tml = [_rna_tm(oligo) for oligo in basesl]
    # list of lists of each base for each probe
    result_basesa = list(itersplit_into_x_chunks(mb_bases, mb_size, probe))

    Tmp = [_rna_tm(oligo) for oligo in basesp]
    # The result of this second split is not used; the call is kept so the
    # sequence of helper calls is unchanged.
    list(itersplit_into_x_chunks(mb_bases, mb_size, probe))

    # probe number as jl, percentage per probe as perl
    jl = list(range(1, len(result_basesa) + 1))
    perl = [
        int((bases.count('A') + bases.count('G')) / probe * 100)
        for bases in result_basesa
    ]

    size2 = len(mb_sscount)
    sscount_chunks = list(itersplit_into_x_chunks(mb_sscount, size2, probe))
    sumsl = [
        sum(list(map(int, counts))) / (probe * mb_so)
        for counts in sscount_chunks
    ]
    # put together all data as indicated in header
    return (jl, perl, sumsl, basesl, Tml, Tmp, basesp)
Exemplo n.º 15
0
def Temper(sequence):
    """Return ``Tm.Tm_staluc`` values for a sequence and three of its windows.

    Keys: ``'T20'`` for the whole sequence, ``'T7'`` for bases 0-6,
    ``'T8'`` for bases 7-14 and ``'T5'`` for bases 15-19.
    """
    windows = (
        ('T20', sequence),
        ('T7', sequence[:7]),
        ('T8', sequence[7:15]),
        ('T5', sequence[15:20]),
    )
    return {label: Tm.Tm_staluc(fragment) for label, fragment in windows}
Exemplo n.º 16
0
 def find_left_primer(self, seq, optimal_tm=54):
     """Return the prefix of ``seq`` whose Tm is closest to ``optimal_tm``.

     Prefixes of length 10 to 59 are scored with ``mt.Tm_NN``; when a
     length exceeds ``len(seq)`` the whole sequence is scored as the final
     candidate. Ties are resolved in favour of the shorter prefix.
     """
     candidates = []
     for size in range(10, 60):
         past_end = size > len(seq)
         prefix = seq if past_end else seq[:size]
         deviation = abs(optimal_tm - mt.Tm_NN(Seq(prefix)))
         candidates.append([prefix, deviation])
         if past_end:
             break
     # Stable sort: among equal deviations the earliest candidate stays first.
     candidates.sort(key=lambda entry: entry[1])
     return candidates[0][0]
Exemplo n.º 17
0
def featurize(data, exp_nm, seq_col):
  """Build the feature matrix and feature names for every row of ``data``.

  For each row a 31-base window (positions -9 to 21 relative to the index
  returned by ``_data.pos_to_idx_safe``) is taken from ``row[seq_col]`` and
  encoded as: one-hot encoding, dinucleotide encoding, counts of A/C/G/T and
  G+C, and four nearest-neighbour melting temperatures (full window, last 5
  bases, and the two 8-base stretches before them).

  Returns ``(X, names)`` where ``X`` is a numpy array with one row per input
  row and ``names`` lists the ``'x_'``-prefixed feature names.
  Raises AssertionError when a window is not exactly 31 bases long.
  """
  # Imported once here instead of on every loop iteration.
  from Bio.SeqUtils import MeltingTemp as mt

  X_all = []
  start_pos, end_pos = -9, 21   # go up to N in NGG

  for _, row in data.iterrows():
    x_input = row[seq_col]
    zero_idx = _data.pos_to_idx_safe(0, exp_nm, row['Name (unique)'])
    seq = x_input[zero_idx + start_pos : zero_idx + end_pos + 1]
    assert len(seq) == 31

    curr_x = []

    # One hot encoding
    curr_x += one_hot_encode(seq)

    # Dinucleotides
    curr_x += dinucleotide_encode(seq)

    # Sum nucleotides
    curr_x += [
      seq.count('A'),
      seq.count('C'),
      seq.count('G'),
      seq.count('T'),
      seq.count('G') + seq.count('C'),
    ]

    # Melting temp
    curr_x += [
      mt.Tm_NN(seq),
      mt.Tm_NN(seq[-5:]),
      mt.Tm_NN(seq[-13:-5]),
      mt.Tm_NN(seq[-21:-13]),
    ]

    # Store
    X_all.append(np.array(curr_x))

  ohe_nms = get_one_hot_encoder_nms(start_pos, end_pos)
  dint_nms = get_dinucleotide_nms(start_pos, end_pos)
  sum_nms = ['Num. A', 'Num. C', 'Num. G', 'Num. T', 'Num. GC']
  mt_nms = ['Tm full', 'Tm -5', 'Tm -13 to -5', 'Tm -21 to -13']
  param_nms = ['x_%s' % (ft_nm) for ft_nm in ohe_nms + dint_nms + sum_nms + mt_nms]

  return (np.array(X_all), param_nms)
Exemplo n.º 18
0
def select_primers(circRNA_seq):
    """Print candidate primer pairs across the back-splice junction.

    For each ``i`` from 7 to 13 the junction is formed from the last
    ``20 - i`` bases of the sequence followed by its first ``i`` bases; its
    reverse complement is the reverse primer. The forward primer is the
    20-base slice starting at index ``i - 150``. The function prints ``i``,
    both primers with their Wallace Tm, the junction, and the absolute Tm
    difference. Nothing is returned.
    """
    for i in range(7, 14):
        print(i)
        fwd_start = i - 150
        forward_primer = Seq(circRNA_seq[fwd_start:fwd_start + 20])

        # Tail of the sequence joined to its head spans the junction.
        Rev_Comp = Seq(circRNA_seq[(i - 20):] + circRNA_seq[0:i])
        Rev_Primer = Rev_Comp.reverse_complement()

        forward_tm = mt.Tm_Wallace(forward_primer)
        reverse_tm = mt.Tm_Wallace(Rev_Primer)

        print(forward_primer, forward_tm)
        print(Rev_Primer, reverse_tm)
        print(Rev_Comp)
        print(abs(forward_tm - reverse_tm))
Exemplo n.º 19
0
    def probeTmOpt_var(self, seq1):
        """
        Predict the Tm of ``seq1`` with the external vargibbs program.

        The command line is assembled from instance settings and run through
        the shell; the prediction is then read from the second
        space-separated field on the second line of ``<prefix>.ver``, where
        ``<prefix>`` is ``str(self.inputFile)`` up to its first ``.``. The
        value is finally passed through ``mt.chem_correction`` with
        ``self.form``.

        :param self: instance providing ``vargibbs``, ``par``, ``ct``,
            ``sal``, ``saltscheme``, ``form`` and ``inputFile``.
        :param seq1: probe sequence; ``T`` is replaced by ``U`` for the
            ``-seq`` argument.
        :return: the corrected Tm, or ``False`` when the result file is
            missing or cannot be parsed (stderr of the run is logged).
        """

        vargibbs_res = str(self.inputFile).split('.')[0]

        vargibbs_run = [
            self.vargibbs, f'-o={vargibbs_res}', f'-par={self.par}',
            '-calc=prediction', '-v=0', '-seqsalt={}'.format(1000),
            '\"-seq=r({})\"'.format(seq1.replace('T', 'U')),
            '-cseq={}'.format(complement(seq1)), f'-ct={self.ct}',
            f'-targetsalt={self.sal}', f'-saltscheme={self.saltscheme}'
        ]

        # NOTE(review): the command is a shell string built from instance
        # attributes and the sequence; make sure none of them can carry
        # untrusted text.
        run_func = ' '.join(vargibbs_run)
        proc = subprocess.Popen(run_func,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                shell=True)
        _, err = proc.communicate()

        try:
            # ``with`` closes the result file even if parsing fails.
            with open(vargibbs_res + '.ver') as ver_file:
                tm_val = float(
                    ver_file.read().split('\n')[1].split(' ')[1])
        except (OSError, IndexError, ValueError):
            # Missing file, too few lines/fields, or a non-numeric field.
            LOG.warning(err)
            return False

        tm_val = mt.chem_correction(tm_val, fmd=self.form)
        return tm_val
Exemplo n.º 20
0
def repeat_finder(seq):
    '''Finds largest repeat in a sequence and returns the position of the repeat.

    The upper-cased sequence is compared against itself at every shift of
    one or more bases; the longest run of consecutive matching bases is the
    repeat. A repeat is reported when it is longer than 20 bases, or when
    its Wallace Tm (truncated to an integer) is above 55.

    Returns ``(location, length)`` for a reported repeat, where ``location``
    is the index of the repeat's first occurrence, otherwise ``False``.
    '''
    string = seq.upper()
    bases = list(string)
    shifted = collections.deque(string[1:])
    longest_match = []
    while shifted:
        # Each shift starts with a fresh run so matches from different
        # shifts are never glued together.
        match = []
        for i, item in enumerate(shifted):
            if bases[i] == item:
                match.append(item)
            else:
                if len(longest_match) < len(match):
                    longest_match = match
                match = []
        # A run that reaches the end of the pass has to be recorded too.
        if len(longest_match) < len(match):
            longest_match = match
        shifted.popleft()

    repeat_sequence = ''.join(longest_match)
    if not repeat_sequence:
        return False

    length = len(repeat_sequence)
    # Repeats of exactly 20 bases used to fall between the ">20" and "<20"
    # branches and were never reported; they now go through the Tm check.
    if length > 20 or int(mt.Tm_Wallace(Seq(repeat_sequence))) > 55:
        location = string.find(repeat_sequence)
        logger.debug("Found long {}bp repeat at location {}".format(
            length, location))
        return location, length
    return False
Exemplo n.º 21
0
def main():
    """Print the nearest-neighbour Tm of every sequence in ``seqarr``.

    All Tm_NN settings are read from names defined outside this function
    and passed positionally in the order shown. Each result is printed with
    two decimals.
    """
    for raw_sequence in seqarr:
        melting_temp = mt.Tm_NN(
            Seq(raw_sequence), check, strict, c_seq, shift, nn_table,
            tmm_table, imm_table, de_table, dnac1, dnac2, selfcomp, Na, K,
            Tris, Mg, dNTPs, saltcorr)
        print(format(melting_temp, '0.2f'))
Exemplo n.º 22
0
 def tm_func(primers):
     """Return the mean nearest-neighbour Tm over ``primers``.

     Each primer is scored with ``MeltingTemp.Tm_NN`` using the PCR salt
     parameters held in ``tm_params``. Several sequences arise when a primer
     contains ambiguous bases; their average Tm is returned.
     """
     melting_temps = []
     for primer in primers:
         melting_temps.append(MeltingTemp.Tm_NN(primer, **tm_params))
     return np.mean(melting_temps)
def get_max_domain_melt(dna_structure, staple_indices, scaffold_rotation, scaffold_id, print_staples):
    """Return, for every staple, the highest melting temperature of its domains.

    Each domain is a list of base indices; they are mapped onto the physical
    scaffold sequence (``N`` bases removed, indices shifted by
    ``scaffold_rotation`` and wrapped around the scaffold length). Domains
    longer than one base are scored with ``MeltingTemp.Tm_NN``; single-base
    and empty domains count as 0.

    :param dna_structure: structure whose ``strands[scaffold_id]`` is the scaffold.
    :param staple_indices: per staple, a list of domains of base indices.
    :param scaffold_rotation: offset added to every base index.
    :param scaffold_id: index of the scaffold strand.
    :param print_staples: currently unused.
    :return: list with one maximum Tm per staple; a staple without any
        domain yields 0 (previously ``max()`` raised ValueError).
    """
    # physical scaffold sequence
    scaffold_sequence = get_sequence(dna_structure.strands[scaffold_id]).replace('N', '')
    # physical scaffold length
    scaffold_length = len(scaffold_sequence)

    staple_domain_melt = []
    for strand in staple_indices:
        cur_domain_melt = []
        for domain in strand:
            # bases of this domain, looked up at their physical scaffold index
            cur_domain = [
                scaffold_sequence[(baseindex + scaffold_rotation) % scaffold_length]
                for baseindex in domain
            ]
            if len(cur_domain) > 1:
                # compute melting temperature of domain; reverse sequence of
                # cur_domain, since it is on the scaffold and the indices
                # follow staples
                cur_domain_melt.append(
                    MeltingTemp.Tm_NN(Seq(''.join(cur_domain[::-1]))))
            else:
                cur_domain_melt.append(0.)

        # default covers a staple that has no domains at all
        staple_domain_melt.append(max(cur_domain_melt, default=0.))
    return staple_domain_melt
Exemplo n.º 24
0
def gene_feature(Y, X, learn_options):
    '''
    Things like the sequence of the gene, the DNA Tm of the gene, etc.

    For every distinct name in ``Y['Target gene']`` the gene sequence is
    fetched with ``util.get_gene_sequence`` and its length, GC content,
    nearest-neighbour melting temperature and DNA molecular weight are
    written to all rows of that gene.

    ``X`` and ``learn_options`` are accepted but not used.
    Returns a DataFrame indexed like ``Y['Target gene']`` with the columns
    'gene length', 'gene GC content', 'gene temperature' and
    'gene molecular weight'.
    '''

    gene_names = Y['Target gene']
    n_rows = gene_names.shape[0]

    gene_length = np.zeros((n_rows, 1))
    gc_content = np.zeros((n_rows, 1))
    temperature = np.zeros((n_rows, 1))
    molecular_weight = np.zeros((n_rows, 1))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        rows = gene_names.values == gene  # mask of the rows for this gene
        gene_length[rows] = len(seq)
        gc_content[rows] = SeqUtil.GC(seq)
        # Tm_NN takes no ``rna`` keyword (that belonged to Tm_staluc); its
        # default nearest-neighbour table is already a DNA/DNA table.
        temperature[rows] = Tm.Tm_NN(seq)
        molecular_weight[rows] = SeqUtil.molecular_weight(seq, 'DNA')

    features = np.concatenate(
        (gene_length, gc_content, temperature, molecular_weight), axis=1)
    df = pandas.DataFrame(data=features,
                          index=gene_names.index,
                          columns=[
                              'gene length', 'gene GC content',
                              'gene temperature', 'gene molecular weight'
                          ])
    return df
Exemplo n.º 25
0
def main(argv):
    """Extract target-region sequences from a reference FASTA.

    Reads tab-separated ``chrom, start, end`` lines from the BED file. A
    region of at most 120 bases is fetched whole, handed to ``faout`` and
    written to ``./test.fa`` together with its repeat, GC, N-rate and Tm
    statistics. A larger region is tiled with 119-base windows advancing
    one base at a time, each handed to ``faout``.

    :param argv: accepted for the entry-point signature; the command line is
        parsed by ``argparse`` from ``sys.argv``.

    ``./test.fa`` is opened only after the arguments and input files have
    been checked, and both it and the BED file are closed on exit.
    """
    fafile = '/Users/yeweijian/Downloads/data/hg19.fa'
    bedfile = '/Users/yeweijian/Downloads/data/test.bed'

    parser = argparse.ArgumentParser(description='python Rundesign.py ')
    parser.add_argument('--FA',
                        type=str,
                        default=fafile,
                        help='the reference fasta file')
    parser.add_argument('--BED',
                        type=str,
                        default=bedfile,
                        help='the target region file')
    args = parser.parse_args()

    fafile = args.FA
    bedfile = args.BED

    file_exists(fafile)
    file_exists(bedfile)

    # Open the reference FASTA file.
    fh = pysam.Fastafile(fafile)

    # Read the region file and extract the sequence of every region.
    with open("./test.fa", 'w+') as fout, open(bedfile) as regions:
        for line in regions:
            chrom, start, end = line.rstrip().split('\t')
            start = int(start)
            end = int(end)
            regionsize = end - start

            # Region is small enough: take the whole region as one sequence.
            if regionsize <= 120:
                seq = Seq(fh.fetch(reference=chrom, start=start, end=end),
                          IUPAC.unambiguous_dna)
                faout(seq, chrom, start, end)

                print(
                    '>{}:{}-{} Repeat:{:.3f} GC:{:.3f} Nrate:{:.3f} Tm:{}'.format(
                        chrom, start, end, repeatstat(seq), GC(seq),
                        nstat(seq), mt.Tm_NN(seq)),
                    file=fout)
                print(seq, file=fout)
                continue

            for p1 in range(start, end):
                # The window end follows the window start. It used to be
                # anchored to ``start``, so the window shrank with every
                # step and the ``break`` below could never trigger.
                p2 = p1 + 119
                if p2 >= end:
                    break
                seq = Seq(fh.fetch(reference=chrom, start=p1, end=p2),
                          IUPAC.unambiguous_dna)
                faout(seq, chrom, p1, p2)
Exemplo n.º 26
0
def gene_feature(Y):
    """
    Per-gene features: sequence length, GC content, DNA Tm and molecular weight.

    For every distinct name in ``Y["Target gene"]`` the gene sequence is
    fetched with ``util.get_gene_sequence`` and the four values are written
    to all rows of that gene. Returns a DataFrame indexed like
    ``Y["Target gene"]``.
    """

    gene_names = Y["Target gene"]
    n_rows = gene_names.shape[0]

    gene_length, gc_content, temperature, molecular_weight = (
        np.zeros((n_rows, 1)) for _ in range(4))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        rows = gene_names.values == gene  # mask of the rows for this gene
        gene_length[rows] = len(seq)
        gc_content[rows] = SeqUtil.GC(seq)
        temperature[rows] = Tm.Tm_staluc(seq, rna=False)
        molecular_weight[rows] = SeqUtil.molecular_weight(seq, "DNA")

    column_names = [
        "gene length",
        "gene GC content",
        "gene temperature",
        "gene molecular weight",
    ]
    everything = np.concatenate(
        (gene_length, gc_content, temperature, molecular_weight), axis=1)
    return pd.DataFrame(data=everything,
                        index=gene_names.index,
                        columns=column_names)
Exemplo n.º 27
0
def Tm_feature(data, pam_audit=True, learn_options=None):
    '''
    assuming '30mer' is a key
    get melting temperature features from:
        0-the 30-mer ("global Tm")
        1-the Tm (melting temperature) of the DNA:RNA hybrid from positions 16 - 20 of the sgRNA, i.e. the 5nts immediately proximal of the NGG PAM
        2-the Tm of the DNA:RNA hybrid from position 8 - 15 (i.e. 8 nt)
        3-the Tm of the DNA:RNA hybrid from position 3 - 7  (i.e. 5 nt)

    The three sub-sequences are selected by the (start, stop) pairs in
    learn_options['Tm segments'] when given, otherwise by built-in defaults.
    With pam_audit true, raises Exception when positions 25-26 of a
    sequence are not "GG".
    Returns a DataFrame indexed like ``data`` with the four Tm columns.
    '''

    if learn_options is None or 'Tm segments' not in learn_options:
        segments = [(19, 24), (11, 19), (6, 11)]
    else:
        segments = learn_options['Tm segments']

    sequence = data['30mer'].values
    featarray = np.ones((sequence.shape[0], 4))

    # Defined before the loop: the column names below need it even when
    # ``data`` is empty and the loop body never runs.
    rna = False

    for i, seq in enumerate(sequence):
        if pam_audit and seq[25:27] != "GG":
            raise Exception("expected GG but found %s" % seq[25:27])
        featarray[i, 0] = Tm.Tm_staluc(seq, rna=rna)  # 30mer Tm
        # With the default segments: 5 nt proximal to the NGG PAM, the
        # 8-mer, then the 5-mer.
        for col in range(1, 4):
            seg = segments[col - 1]
            featarray[i, col] = Tm.Tm_staluc(seq[seg[0]:seg[1]], rna=rna)

    feat = pandas.DataFrame(featarray,
                            index=data.index,
                            columns=[
                                "Tm global_%s" % rna,
                                "5mer_end_%s" % rna,
                                "8mer_middle_%s" % rna,
                                "5mer_start_%s" % rna,
                            ])

    return feat
Exemplo n.º 28
0
def PenaltyMeltingTemperature(Primer: str):
    """Return the melting-temperature penalty for ``Primer``.

    The nearest-neighbour Tm of the primer is passed to ``ValueInBounds``
    together with the constants 55 and 3, and that result is returned.
    """
    return ValueInBounds(mt.Tm_NN(Primer), 55, 3)
Exemplo n.º 29
0
def melting_tmp_T7(input_df):
	"""Return an (n, 1) float array of Tm values for a 7-base window.

	For each entry of ``input_df`` (accessed positionally with ``.iat``)
	the bases at indices 6 to 12 are scored with ``mt.Tm_staluc``.
	"""
	n_rows = len(input_df)
	mtmp = zeros((n_rows, 1), dtype=float)
	for row in range(n_rows):
		# 7-base window taken from the stored protospacer sequence
		window = input_df.iat[row][6:13]
		mtmp[row, 0] = mt.Tm_staluc(window)
	return mtmp
Exemplo n.º 30
0
 def _create_staple_max_melt_T(self) -> Dict[Strand, float]:
     """Map each staple to the highest melting temperature among its domains.

     Domains whose sequence contains "N" are ignored. Domains with more
     than 14 bases are scored with the nearest-neighbour model (Na=0,
     Mg=17.5); shorter ones with the Wallace rule. A staple without any
     scored domain does not appear in the result.
     """
     melt_by_staple: Dict[Strand, List[float]] = {}
     for staple in self.staples:
         for domain in staple.domain_list:
             if "N" in domain.sequence:
                 continue
             oligo = Seq(domain.sequence)
             if len(domain.base_list) > 14:
                 melt_t = MeltingTemp.Tm_NN(oligo, Na=0, Mg=17.5)
             else:
                 melt_t = MeltingTemp.Tm_Wallace(oligo)
             melt_by_staple.setdefault(staple, []).append(melt_t)
     return {staple: max(temps) for staple, temps in melt_by_staple.items()}