Пример #1
0
def ss_motif(args):
    """Search structures for single-strand fragments resembling a reference.

    The reference structure (``args.reference``) is turned into a flattened
    matrix via ``model.get_gmat(args.cutoff)``.  For every file in
    ``args.files`` the candidate residue index sets returned by
    ``tools.get_idx`` are converted the same way and compared with the
    reference.  Each candidate whose distance (Euclidean distance divided by
    ``sqrt(reference length)``) is below ``args.treshold`` is written to the
    file ``args.name`` as one line::

        <distance> <file> <idx> - <sequence ids joined by "_">

    where ``idx`` is 0 on the first pass and afterwards the value returned by
    ``cur_pdb.read()`` (presumably the frame/model index -- confirm against
    ``reader.Pdb``).

    Parameters
    ----------
    args : namespace-like object
        Attributes read here: ``files``, ``reference``, ``res_mode``,
        ``cutoff``, ``seq``, ``name``, ``xtc``, ``bulges``, ``treshold``.
        All attributes are also dumped as a ``#``-prefixed header in the
        output file.

    Raises
    ------
    AssertionError
        If ``args.seq`` is given and its length differs from the reference
        sequence length.
    """
    files = args.files
    print("# Finding 3D Single Strand Motifs...")
    # NOTE(review): a former check restricting args.bulges to < 3 is
    # disabled, so the value is handed to tools.get_idx unchecked.
    ref_pdb = reader.Pdb(args.reference, res_mode=args.res_mode)
    ref_len = len(ref_pdb.model.sequence)
    ref_mat = ref_pdb.model.get_gmat(args.cutoff).reshape(-1)
    if args.seq is None:
        # no sequence constraint: one "N" per reference residue
        query = "N" * ref_len
    else:
        assert len(args.seq) == ref_len, "# FATAL: query structure and sequence length mismatch!"
        query = args.seq

    # The result is not used below; the call is kept in case it validates
    # the query string -- TODO confirm and drop if it has no such effect.
    tools.get_pattern(query)

    norm = np.sqrt(ref_len)
    settings = vars(args)

    # "with" guarantees the output file is closed even if a structure
    # fails to load or a distance computation raises.
    with open(args.name, "w") as fh:
        fh.write("# This is a baRNAba run.\n")
        for key in sorted(settings):
            fh.write("# " + str(key) + " " + str(settings[key]) + "\n")

        for fname in files:
            cur_pdb = reader.Pdb(fname, res_mode=args.res_mode)
            cur_pdb.set_xtc(args.xtc)

            # structure shorter than the reference: nothing can match
            if len(cur_pdb.model.sequence) < ref_len:
                continue

            # candidate index sets matching the query
            indeces = tools.get_idx(cur_pdb.model.sequence, query, args.bulges)
            if len(indeces) == 0:
                # cdist() rejects an empty candidate set with ValueError;
                # a file without candidates simply yields no hits.
                continue

            idx = 0
            while idx >= 0:
                # cur_pdb.model is re-read every pass because read() may
                # update it.
                gmats = [cur_pdb.model.get_gmat(args.cutoff, index).reshape(-1)
                         for index in indeces]
                dists = distance.cdist([ref_mat], gmats) / norm

                # column indices of candidates below the threshold
                below_t = (dists < args.treshold).nonzero()[1]
                for ss in below_t:
                    seq = "_".join([cur_pdb.model.sequence_id[p] for p in indeces[ss]])
                    fh.write("%8.5f %s %i - %s \n" % (dists[0, ss], fname, idx, seq))

                # advance; a negative value ends the loop
                idx = cur_pdb.read()
Пример #2
0
def _ds_close_candidates(model, ref_mat, candidates, cutoff, threshold, n_res):
    """Keep the candidates whose flattened gmat is close to ``ref_mat``.

    Returns ``(kept, dists)``: the candidate index sets whose distance
    (Euclidean distance divided by ``sqrt(n_res)``) is below ``threshold``,
    in their original order, and the matching distances.  An empty
    candidate collection returns two empty lists instead of reaching
    ``cdist``, which raises ValueError on empty input.
    """
    # len() rather than truthiness: candidates may be a list or an array
    if len(candidates) == 0:
        return [], []
    gmats = [model.get_gmat(cutoff, index).reshape(-1) for index in candidates]
    dists = distance.cdist([ref_mat], gmats) / np.sqrt(n_res)
    hits = (dists < threshold).nonzero()[1]
    return [candidates[j] for j in hits], [dists[0, j] for j in hits]


def ds_motif(args):
    """Search structures for double-strand fragments resembling a reference.

    The reference (``args.reference``) is treated as two strands: its first
    ``args.l1`` residues and the following ``args.l2`` residues.  For each
    file in ``args.files`` and each pass of the read loop:

    1. candidates for each strand (from ``tools.get_idx``) are kept when
       their gmat distance to the corresponding reference strand is below
       ``args.treshold``;
    2. pairs of surviving candidates are kept when the distance between
       their ``get_lcs_com`` centres is below 1.5 times the reference
       value (used to prune the search);
    3. each surviving pair is concatenated and compared with the full
       reference; pairs below ``args.treshold`` are written to
       ``args.name`` as::

           <distance> <file> <idx> - <sequence ids joined by "_">

    ``idx`` is 0 on the first pass and afterwards the value returned by
    ``cur_pdb.read()`` (presumably the frame/model index -- confirm).

    Parameters
    ----------
    args : namespace-like object
        Attributes read here: ``files``, ``reference``, ``res_mode``,
        ``l1``, ``l2``, ``seq``, ``name``, ``cutoff``, ``xtc``, ``bulges``,
        ``treshold``.  ``seq``, when given, holds the two strand sequences
        separated by ``%``.  All attributes are also dumped as a
        ``#``-prefixed header in the output file.

    Raises
    ------
    AssertionError
        If ``args.seq`` is given but lacks the ``%`` separator, or a strand
        sequence length differs from ``args.l1`` / ``args.l2``.
    """
    files = args.files
    print("# Finding 3D Double Stranded Motifs...")

    ref_pdb = reader.Pdb(args.reference, res_mode=args.res_mode)

    ref_len1 = args.l1
    ref_len2 = args.l2
    ref_len = ref_len1 + ref_len2
    # NOTE(review): ref_len is not checked against the actual length of the
    # reference structure; the full-reference matrix below assumes they agree.
    if args.seq is None:
        # no sequence constraint: one "N" per residue of each strand
        query1 = "N" * ref_len1
        query2 = "N" * ref_len2
    else:
        parts = args.seq.split("%")
        # a missing separator used to surface as a bare IndexError
        assert len(parts) >= 2, "# FATAL: double strand query must contain two sequences separated by %"
        query1, query2 = parts[0], parts[1]
        assert len(query1) == ref_len1, "# FATAL: query structure and sequence length mismatch!"
        assert len(query2) == ref_len2, "# FATAL: query structure and sequence length mismatch!"

    settings = vars(args)

    # "with" guarantees the output file is closed even if a later step raises.
    with open(args.name, 'w') as fh:
        fh.write("# This is a baRNAba run.\n")
        for key in sorted(settings):
            fh.write("# " + str(key) + " " + str(settings[key]) + "\n")

        ref_mat_tot = ref_pdb.model.get_gmat(args.cutoff).reshape(-1)

        indeces1 = np.arange(0, ref_len1)
        ref_mat1 = ref_pdb.model.get_gmat(args.cutoff, indeces1).reshape(-1)

        indeces2 = np.arange(ref_len1, ref_len)
        ref_mat2 = ref_pdb.model.get_gmat(args.cutoff, indeces2).reshape(-1)

        # calculate center of mass distances
        # this will be used to prune the search!
        ref_com1 = ref_pdb.model.get_lcs_com(indeces1)
        ref_com2 = ref_pdb.model.get_lcs_com(indeces2)
        dd = np.sqrt(np.sum((ref_com2 - ref_com1) ** 2))
        max_com_dist = 1.5 * dd

        for fname in files:
            cur_pdb = reader.Pdb(fname, res_mode=args.res_mode)
            cur_pdb.set_xtc(args.xtc)

            # structure shorter than the reference: nothing can match
            if len(cur_pdb.model.sequence) < ref_len:
                continue

            all_idx1 = tools.get_idx(cur_pdb.model.sequence, query1, args.bulges)
            all_idx2 = tools.get_idx(cur_pdb.model.sequence, query2, args.bulges)

            idx = 0
            while idx >= 0:
                # re-read every pass because read() may update the model
                model = cur_pdb.model

                # get indeces and coms of first half
                idx1, _ = _ds_close_candidates(
                    model, ref_mat1, all_idx1, args.cutoff, args.treshold, ref_len1)
                com1 = [model.get_lcs_com(ix) for ix in idx1]

                # get indeces and coms of second half
                idx2, _ = _ds_close_candidates(
                    model, ref_mat2, all_idx2, args.cutoff, args.treshold, ref_len2)
                com2 = [model.get_lcs_com(ix) for ix in idx2]

                # cdist() raises on empty input, so only pair the strands
                # when both have at least one surviving candidate.
                if com1 and com2:
                    # calculate all distances between center of mass
                    dmat = distance.cdist(com1, com2)
                    rows, cols = (dmat < max_com_dist).nonzero()

                    # get combo indeces (row-major order of the pair matrix)
                    idx_combo = [idx1[a] + idx2[b] for a, b in zip(rows, cols)]

                    # final comparison against the full reference; the
                    # helper also copes with an empty idx_combo
                    matches, dists = _ds_close_candidates(
                        model, ref_mat_tot, idx_combo, args.cutoff, args.treshold, ref_len)
                    for dist, combo in zip(dists, matches):
                        seq = "_".join([model.sequence_id[p] for p in combo])
                        fh.write('%8.5f %s %i - %s \n' % (dist, fname, idx, seq))

                # advance; a negative value ends the loop
                idx = cur_pdb.read()
Пример #3
0
def snippet(args):
    """Extract every fragment matching a sequence query into its own PDB file.

    For each file in ``args.files`` the index sets returned by
    ``tools.get_idx`` for the query ``args.seq`` (no bulges) are written,
    one file per fragment, to the current directory as::

        <input basename without ".pdb">_<sequence>_<first id>_<last id>.pdb

    A log file ``args.name`` is created containing a ``#``-prefixed dump of
    all attributes of ``args``.  Only single-strand queries are supported;
    double-strand extraction is not implemented.

    Parameters
    ----------
    args : namespace-like object
        Attributes read here: ``files``, ``seq``, ``name``, ``res_mode``.

    Returns
    -------
    int
        1 if ``args.seq`` contains a symbol not listed in
        ``reader.Names.known_abbrev`` (nothing is written in that case),
        0 otherwise.

    Raises
    ------
    AssertionError
        If ``args.seq`` contains the strand separator ``%``.
    """
    files = args.files
    # check query sequence
    for item in args.seq:
        if item not in reader.Names.known_abbrev:
            print("# FATAL Error. Symbol  %s  not known. Use ACGU NYR" % item)
            return 1

    query = args.seq.split("%")
    assert len(query) < 2, "# Fatal error: max 1 strand"

    min_len = sum(len(el) for el in query)
    settings = vars(args)

    # "with" guarantees the log file is closed even if a later step raises.
    with open(args.name, 'w') as fh:
        print("# SPLIT...")
        fh.write("# This is a baRNAba run.\n")
        for key in sorted(settings):
            fh.write("# " + str(key) + " " + str(settings[key]) + "\n")

        for fname in files:
            try:
                cur_pdb = reader.Pdb(fname, res_mode=args.res_mode, permissive=True)
            except Exception:
                # Unreadable structures are skipped on purpose; Exception
                # (not a bare except) lets Ctrl-C and SystemExit through.
                print("# SKIPPING %s" % fname)
                continue

            # structure shorter than the query: nothing can match
            if len(cur_pdb.model.sequence) < min_len:
                continue

            # single strand
            indeces = tools.get_idx(cur_pdb.model.sequence, query[0], bulges=0)

            # check chain consistency
            # NOTE(review): the return value is ignored; presumably the
            # helper filters `indeces` in place -- confirm against tools.
            tools.chain_consistency(indeces, cur_pdb.model.sequence_id)

            idx = 0
            new_pdb_r = fname.split("/")[-1].split(".pdb")[0] + "_"
            while idx >= 0:
                for index in indeces:
                    seq_out = "".join([cur_pdb.model.sequence[res] for res in index])
                    f_res = cur_pdb.model.sequence_id[index[0]]
                    l_res = cur_pdb.model.sequence_id[index[-1]]

                    # NOTE(review): the name does not include idx, so each
                    # later pass overwrites the files of the previous one.
                    new_pdb = new_pdb_r + seq_out + "_" + f_res + "_" + l_res + ".pdb"
                    with open(new_pdb, 'w') as fh_pdb:
                        fh_pdb.write(cur_pdb.model.string_pdb(index, noP=True))

                # advance; a negative value ends the loop
                idx = cur_pdb.read()

    return 0