def ss_motif(args):
    """Search the input structures for single-stranded fragments matching the
    reference motif: fragments whose flattened G-matrix lies within
    args.treshold (normalized by the square root of the motif length) of the
    reference are written to the output file."""

    # sanity checks
    files = args.files
    print "# Finding 3D Single Strand Motifs..."
    # assert args.bulges < 3, "# FATAL: cannot do bulges > 2"

    ref_pdb = reader.Pdb(args.reference, res_mode=args.res_mode)
    ref_len = len(ref_pdb.model.sequence)
    ref_mat = (ref_pdb.model.get_gmat(args.cutoff)).reshape(-1)

    # if no sequence is given, match any sequence of the same length
    if args.seq is None:
        query = "N" * ref_len
    else:
        assert len(args.seq) == ref_len, "# FATAL: query structure and sequence length mismatch!"
        query = args.seq
    pattern = tools.get_pattern(query)

    # OK...
    fh = open(args.name, "w")
    fh.write("# This is a baRNAba run.\n")
    for k in sorted(args.__dict__):
        s = "# " + str(k) + " " + str(args.__dict__[k]) + "\n"
        fh.write(s)

    for i in xrange(0, len(files)):
        cur_pdb = reader.Pdb(files[i], res_mode=args.res_mode)
        cur_pdb.set_xtc(args.xtc)
        cur_len = len(cur_pdb.model.sequence)
        if cur_len < ref_len:
            continue

        # return indeces matching query
        indeces = tools.get_idx(cur_pdb.model.sequence, query, args.bulges)

        # loop over frames; cur_pdb.read() returns a negative value when no frames are left
        idx = 0
        while idx >= 0:
            gmats = [(cur_pdb.model.get_gmat(args.cutoff, index)).reshape(-1) for index in indeces]
            dists = distance.cdist([ref_mat], gmats) / np.sqrt(ref_len)
            below_t = (dists < args.treshold).nonzero()[1]
            for ss in below_t:
                seq = "_".join([cur_pdb.model.sequence_id[p] for p in indeces[ss]])
                string = "%8.5f %s %i - %s \n" % (dists[0, ss], files[i], idx, seq)
                fh.write(string)
            idx = cur_pdb.read()

    fh.close()
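
# Illustrative sketch, not part of the original baRNAba code: the scoring step in
# ss_motif() reduces to a single scipy cdist call between the flattened reference
# G-matrix and the flattened G-matrices of all candidate fragments, normalized by
# the square root of the motif length. The helper below reproduces that step on
# plain arrays; the function name and its inputs are hypothetical.
def _score_fragments_sketch(ref_mat, gmats, ref_len, treshold):
    import numpy as np
    from scipy.spatial import distance
    # one row of distances: entry j is the normalized distance of candidate j
    dists = distance.cdist([ref_mat], gmats) / np.sqrt(ref_len)
    # indices of the candidates falling below the cutoff, as used above
    return dists, (dists < treshold).nonzero()[1]
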
def ds_motif(args):
    """Search the input structures for double-stranded motifs. Candidate
    fragments for the two strands (lengths args.l1 and args.l2) are found
    separately, pairs are pruned by the distance between their centers of
    mass, and each surviving pair is scored as a whole against the reference
    G-matrix."""

    # sanity checks
    files = args.files
    print "# Finding 3D Double Stranded Motifs..."

    ref_pdb = reader.Pdb(args.reference, res_mode=args.res_mode)
    ref_len1 = args.l1
    ref_len2 = args.l2
    ref_len = ref_len1 + ref_len2

    # if no sequence is given, match any sequence of the same lengths
    if args.seq is None:
        query1 = "N" * ref_len1
        query2 = "N" * ref_len2
    else:
        query1 = (args.seq).split("%")[0]
        query2 = (args.seq).split("%")[1]
        assert len(query1) == ref_len1, "# FATAL: query structure and sequence length mismatch!"
        assert len(query2) == ref_len2, "# FATAL: query structure and sequence length mismatch!"

    # OK...
    fh = open(args.name, 'w')
    fh.write("# This is a baRNAba run.\n")
    for k in sorted(args.__dict__):
        s = "# " + str(k) + " " + str(args.__dict__[k]) + "\n"
        fh.write(s)

    ref_mat_tot = ref_pdb.model.get_gmat(args.cutoff).reshape(-1)
    indeces1 = np.arange(0, ref_len1)
    ref_mat1 = ref_pdb.model.get_gmat(args.cutoff, indeces1).reshape(-1)
    indeces2 = np.arange(ref_len1, ref_len)
    ref_mat2 = ref_pdb.model.get_gmat(args.cutoff, indeces2).reshape(-1)

    # calculate center of mass distances
    # this will be used to prune the search!
    ref_com1 = ref_pdb.model.get_lcs_com(indeces1)
    ref_com2 = ref_pdb.model.get_lcs_com(indeces2)
    diff_com = (ref_com2 - ref_com1)**2
    dd = np.sqrt(np.sum(diff_com))

    for i in xrange(0, len(files)):
        cur_pdb = reader.Pdb(files[i], res_mode=args.res_mode)
        cur_pdb.set_xtc(args.xtc)
        cur_len = len(cur_pdb.model.sequence)
        if cur_len < ref_len:
            continue
        all_idx1 = tools.get_idx(cur_pdb.model.sequence, query1, args.bulges)
        all_idx2 = tools.get_idx(cur_pdb.model.sequence, query2, args.bulges)

        # loop over frames; cur_pdb.read() returns a negative value when no frames are left
        idx = 0
        while idx >= 0:

            # get indeces and coms of first half
            gmats1 = [(cur_pdb.model.get_gmat(args.cutoff, index)).reshape(-1) for index in all_idx1]
            dists1 = distance.cdist([ref_mat1], gmats1) / np.sqrt(ref_len1)
            below_t1 = (dists1 < args.treshold).nonzero()[1]
            idx1 = [all_idx1[j] for j in below_t1]
            com1 = [cur_pdb.model.get_lcs_com(all_idx1[j]) for j in below_t1]

            # get indeces of second half
            gmats2 = [(cur_pdb.model.get_gmat(args.cutoff, index)).reshape(-1) for index in all_idx2]
            dists2 = distance.cdist([ref_mat2], gmats2) / np.sqrt(ref_len2)
            below_t2 = (dists2 < args.treshold).nonzero()[1]
            idx2 = [all_idx2[j] for j in below_t2]
            com2 = [cur_pdb.model.get_lcs_com(all_idx2[j]) for j in below_t2]

            # calculate all distances between centers of mass and keep only
            # pairs closer than 1.5 times the reference distance
            dmat = distance.cdist(com1, com2)
            c_idx = (dmat < 1.5 * dd).nonzero()

            # get combo indeces
            dmine = [dmat[c_idx[0][ii], c_idx[1][ii]] for ii in range(len(c_idx[0]))]
            idx_combo = [idx1[c_idx[0][ii]] + idx2[c_idx[1][ii]] for ii in range(len(c_idx[0]))]

            # score the combined fragments against the full reference motif
            gmatsf = [(cur_pdb.model.get_gmat(args.cutoff, index)).reshape(-1) for index in idx_combo]
            distsf = distance.cdist([ref_mat_tot], gmatsf) / np.sqrt(ref_len)
            below_tf = (distsf < args.treshold).nonzero()[1]
            for ss in below_tf:
                seq = "_".join([cur_pdb.model.sequence_id[p] for p in idx_combo[ss]])
                string = '%8.5f %s %i - %s \n' % (distsf[0, ss], files[i], idx, seq)
                fh.write(string)
            idx = cur_pdb.read()

    fh.close()
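
# Illustrative sketch, not part of the original baRNAba code: ds_motif() prunes
# strand pairs by comparing the distance between their centers of mass with the
# corresponding distance dd in the reference, keeping only pairs closer than
# 1.5*dd before the expensive full-motif comparison. The helper below shows that
# pruning in isolation; the name and its inputs (lists of 3D center-of-mass
# vectors) are hypothetical.
def _prune_by_com_sketch(com1, com2, dd):
    from scipy.spatial import distance
    # all pairwise distances between first-strand and second-strand candidates
    dmat = distance.cdist(com1, com2)
    # (i, j) index arrays of pairs whose separation is compatible with the reference
    return (dmat < 1.5 * dd).nonzero()
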
def snippet(args):
    """Extract every fragment matching the query sequence from the input PDB
    files and write each match to its own PDB file; the run parameters are
    logged to args.name."""

    files = args.files

    # check query sequence
    for item in args.seq:
        if item not in reader.Names.known_abbrev:
            print "# FATAL Error. Symbol ", item, " not known. Use ACGU NYR"
            return 1
    query = args.seq.split("%")
    assert len(args.seq.split("%")) < 2, "# Fatal error: max 1 strand"

    fh = open(args.name, 'w')
    print "# SPLIT..."
    fh.write("# This is a baRNAba run.\n")
    for k in sorted(args.__dict__):
        s = "# " + str(k) + " " + str(args.__dict__[k]) + "\n"
        fh.write(s)

    ll = [len(el) for el in query]

    for i in xrange(0, len(files)):
        try:
            cur_pdb = reader.Pdb(files[i], res_mode=args.res_mode, permissive=True)
        except:
            print "# SKIPPING", files[i]
            continue
        cur_len = len(cur_pdb.model.sequence)
        if cur_len < sum(ll):
            continue

        # single strand
        indeces = tools.get_idx(cur_pdb.model.sequence, query[0], bulges=0)
        # check chain consistency - remove
        tools.chain_consistency(indeces, cur_pdb.model.sequence_id)

        # two-strand queries are rejected by the assert above, so this branch is not reached yet
        if len(query) == 2:
            indeces2 = tools.get_idx(cur_pdb.model.sequence, query[1], bulges=0)
            tools.chain_consistency(indeces2, cur_pdb.model.sequence_id)
            # to be done....

        # loop over frames; write one PDB file per matching fragment
        idx = 0
        new_pdb_r = files[i].split("/")[-1].split(".pdb")[0] + "_"
        while idx >= 0:
            for index in indeces:
                seq_out = "".join([cur_pdb.model.sequence[res] for res in index])
                f_res = cur_pdb.model.sequence_id[index[0]]
                l_res = cur_pdb.model.sequence_id[index[-1]]
                new_pdb = new_pdb_r + seq_out + "_" + f_res + "_" + l_res + ".pdb"
                fh_pdb = open(new_pdb, 'w')
                fh_pdb.write(cur_pdb.model.string_pdb(index, noP=True))
                fh_pdb.close()
            idx = cur_pdb.read()

    fh.close()
    return 0
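
# Illustrative sketch, not part of the original baRNAba code: the three commands
# above all take a single argparse-style namespace. A minimal driver for
# ss_motif() could look like the following; every attribute value (file names,
# cutoff, threshold, res_mode) is a placeholder, and in the real tool the
# namespace is produced by the command-line parser.
def _run_ss_motif_sketch():
    import argparse
    args = argparse.Namespace(
        files=["target.pdb"],    # structures/trajectories to scan (placeholder)
        reference="motif.pdb",   # reference motif structure (placeholder)
        res_mode="R",            # residue mode passed to reader.Pdb (placeholder)
        cutoff=2.4,              # G-matrix cutoff (placeholder)
        seq=None,                # None -> match any sequence of the motif length
        name="ss_motif.out",     # output file for the run log and hits
        bulges=0,                # number of allowed bulged residues
        xtc=None,                # optional trajectory file
        treshold=0.7,            # distance cutoff (spelled as in the code above)
    )
    return ss_motif(args)
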