def selectProteinAtomsCrds(self, zrange=[0, 1.0]):
    """Get the coordinates of protein atoms whose z coordinate lies within zrange.

    Parameters
    ----------
    zrange : iterable, length = 2
        The lower and upper z boundaries.

    Returns
    -------
    selected_crds : np.ndarray, shape = [N, 3]
        The xyz coordinates of the selected atoms, where N is the
        number of atoms selected.
    """
    pdbio = parsePDB()
    atominfor = pdbio.atomInformation(self.pdb)

    # atom indices (strings); keep only protein atoms
    ndx = atominfor.keys()
    selected_ndx = [x for x in ndx if atominfor[x][1] == "Protein"]

    cpdb = coordinatesPDB()
    crds = np.asarray(cpdb.getAtomCrdByNdx(self.pdb, selected_ndx))

    # keep only atoms whose z coordinate falls inside zrange
    selected_crds = [x for x in crds
                     if (x[2] > zrange[0]) and (x[2] < zrange[1])]

    return np.asarray(selected_crds)
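# A minimal usage sketch for selectProteinAtomsCrds (hedged: `analyzer` is a
# placeholder name for an instance of the class this method belongs to, which
# is not shown in this excerpt; the z boundaries are example values in the
# same unit as the pdb coordinates, i.e. angstrom):
#
#   crds = analyzer.selectProteinAtomsCrds(zrange=[10.0, 25.0])
#   print(crds.shape)    # (N, 3) xyz coordinates of protein atoms with 10 < z < 25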
def parsePDB(self):
    # parse the input pdb file and return the atom indices and their elements
    pio = pdbIO.parsePDB(inPDB=self.inpdb)
    infor = pio.atomInformation(self.inpdb)

    atomndx = list(infor.keys())
    elements = [infor[x][7] for x in atomndx]

    return atomndx, elements
def domainCOM(self, domainf, ref, pdbchain='A',
              output="com_domains.dat", atomNames=["CA"]):
    """Calculate the geometry center (approximate center of mass) of each domain.

    :param domainf: str, domain information data file
    :param ref: str, reference pdb file
    :param pdbchain: str, chain id
    :param output: str, output file name
    :param atomNames: list, atom names used for the center calculation
    :return: list of lists, centers of the domains, dimension N*3
    """
    dom = parsePDB()
    domains = dom.readDomainRes(domainf)
    dnames = [x[0] for x in domains]

    pdbc = coordinatesPDB()

    coms = []
    for i in range(len(domains)):
        # get the atom indices of the residues in a domain
        atomindex = index.gen_atom_index(ref, [pdbchain, ], domains[i][1:],
                                         atomNames, "original")
        atomindex = [str(x) for x in atomindex]

        # get the coordinates of the selected atoms
        crds = pdbc.getAtomCrdByNdx(ref, atomindex)

        # calculate the geometry center (not the mass-weighted center)
        com = np.mean(np.asarray(crds), axis=0)
        coms.append(list(com))

    # np.savetxt expects a string header, not a list
    np.savetxt(output, np.asarray(coms), fmt="%8.3f", delimiter=" ",
               header=" ".join(dnames), comments="#")

    return coms
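# A minimal usage sketch for domainCOM (hedged: `cp` is a placeholder for an
# instance of the owning class, and the file names are examples). It computes
# the CA-based geometry center of each domain defined in the domain file and
# writes the centers to the output file:
#
#   coms = cp.domainCOM(domainf="domains.dat", ref="reference.pdb",
#                       pdbchain="A", output="com_domains.dat", atomNames=["CA"])
#   # coms[0] -> [x, y, z], the geometry center of the first domain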
def elementCount(self, ligand):
    """Count the number of atoms of each element type in a ligand pdb file.

    Elements not present in the van der Waals parameter table are
    counted as dummy atoms ("DU").
    """
    from dockml import pdbIO

    elements = self.getVdWParams().keys()
    atominfor = pdbIO.parsePDB(ligand).atomInformation(ligand)

    elem_count = dict(zip(elements, np.zeros(len(elements))))

    for atom in atominfor.keys():
        if atominfor[atom][7] not in elements:
            elem_count["DU"] += 1
        else:
            elem_count[atominfor[atom][7]] += 1

    return elem_count
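# A minimal usage sketch for elementCount (hedged: `bf` is a placeholder for an
# instance of the owning class, presumably the BindingFeature class used
# elsewhere in this module, and "ligand.pdb" is an example file name):
#
#   counts = bf.elementCount("ligand.pdb")
#   # e.g. {"C": 12.0, "N": 2.0, "O": 3.0, ..., "DU": 0.0}; elements missing
#   # from the van der Waals parameter table are pooled under "DU"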
def resInDomains(self, domainf, residues):
    """Given a list of residues and a domain information file,
    calculate the ratio of residues falling in each domain.

    :param domainf: str, domain information data file
    :param residues: list, a list of residues, e.g. from community analysis
    :return: tuple, (ratio_indomain, ratio_outof), each a list of
             (domain_name, ratio) pairs sorted by ratio in descending order
    """
    pdb = pdbIO.parsePDB()
    dinfor = pdb.readDomainRes(domainf)

    # e.g. {domain_name: [1, 3, 5]}
    domains = collections.defaultdict(list)
    d_count = {}
    for d in dinfor:
        domains[d[0]] = range(d[1], d[2] + 1)
        d_count[d[0]] = 0

    # count how many residues (of the input list) fall in each domain
    for d in d_count.keys():
        d_count[d] = len(set(domains[d]).intersection(set(residues)))

    # ratio_outof: the fraction of the input residue list found in each domain;
    # summed over all domains it should be 1. E.g. for a list of 100 residues
    # with 25 of them in domain HNH, the ratio for HNH is 25%.
    ratio_outof = {}

    # ratio_indomain: for a specific domain, the fraction of its residues that
    # appear in the input list. E.g. if domain HNH has 250 residues and 25 of
    # them are in the list, the ratio for HNH is 10%.
    ratio_indomain = {}

    for d in d_count.keys():
        ratio_outof[d] = d_count[d] / float(len(residues))
        ratio_indomain[d] = d_count[d] / float(len(domains[d]))

    ratio_outof = sorted(ratio_outof.items(), key=lambda x: x[1], reverse=True)
    ratio_indomain = sorted(ratio_indomain.items(), key=lambda x: x[1], reverse=True)

    return ratio_indomain, ratio_outof
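# A minimal usage sketch for resInDomains (hedged: `na` is a placeholder for an
# instance of the owning class; the residue list and file name are examples):
#
#   community_residues = [812, 815, 833, 840, 901]
#   in_domain, out_of = na.resInDomains("domains.dat", community_residues)
#   # in_domain: fraction of each domain's residues present in the list
#   # out_of:    fraction of the list falling in each domain (sums to 1)
#   # both are lists of (domain_name, ratio) pairs sorted by ratio, descending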
def domainWiseEigVec(self, domainf, vectors, scalefactor=1.0,
                     output='aver-vectors.dat'):
    """Average the vectors over the atoms in each domain.

    :param domainf: str, domain information data file
    :param vectors: list of lists, dimension N*3
    :param scalefactor: float, scaling factor applied to the averaged vectors
    :param output: str, output file name
    :return: tuple, (domain_average_vectors, domain_names)
    """
    dom = parsePDB()
    domains = dom.readDomainRes(domainf)

    # domain names
    d_name = [x[0] for x in domains]

    minResIndx = min(sum([x[1:] for x in domains], []))

    aver_vec = []
    for i in range(len(domains)):
        # shift residue numbers by minResIndx because the vector index starts from 0
        resindexlist = []
        for k in range((len(domains[i]) - 1) // 2):
            resindexlist += list(
                np.asarray(range(domains[i][k * 2 + 1],
                                 domains[i][k * 2 + 2])) - minResIndx)

        v = self.averageVectors(vectors, resindexlist) * scalefactor
        aver_vec.append(v)

    np.savetxt(output, np.asarray(aver_vec), delimiter=' ', fmt='%12.5f',
               header=" ".join(d_name), comments="#")

    return aver_vec, d_name
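# A minimal usage sketch for domainWiseEigVec (hedged: `na` is a placeholder
# instance name and `modes` stands for an N*3 array of per-residue eigenvector
# components, e.g. from a PCA or normal-mode calculation not shown here):
#
#   aver_vec, names = na.domainWiseEigVec("domains.dat", modes,
#                                         scalefactor=10.0,
#                                         output="aver-vectors.dat")
#   # aver_vec[i] is the averaged (and scaled) vector of the i-th domain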
def __init__(self, ligandPDB, receptorPDB, gridsize=1.0):
    """
    :param ligandPDB: str, ligand pdb file name
    :param receptorPDB: str, receptor pdb file name
    :param gridsize: float, grid size, unit angstrom
    """
    self.gridsize = gridsize
    self.ligpdb = ligandPDB
    self.recpdb = receptorPDB

    bf = BindingFeature()
    self.vdwparm = bf.getVdWParams()
    self.eleparm = bf.getElementParams()

    self.atominfor = pio.parsePDB().atomInformation(self.ligpdb)

    # atom indices of the ligand; fall back to an empty list if the
    # file cannot be read or is malformed
    try:
        with open(self.ligpdb) as lines:
            self.ligndx = [x.split()[1] for x in lines if "ATOM" in x]
    except (IOError, IndexError):
        self.ligndx = []
def main():
    mtxh = MatrixHandle()
    args, unknown = arguments()

    if args.opt in ["merge", "pair-t-test", "ind-t-test"]:
        if args.ds in ['xyz', 'XYZ', '3d']:
            data1 = mtxh.loadxyz(args.dat[0], args.dtype, args.xyzcol,
                                 xyshift=args.xyshift)
            data2 = mtxh.loadxyz(args.dat[1], args.dtype, args.xyzcol,
                                 xyshift=args.xyshift)
        elif args.ds in ['matrix', 'mtx']:
            data1 = mtxh.reshapeMtx(args.dat[0], args.dtype, xyshift=args.xyshift)
            data2 = mtxh.reshapeMtx(args.dat[1], args.dtype, xyshift=args.xyshift)
        else:
            print("Error: Data-shape is not specified. ")
            data1, data2 = np.array([]), np.array([])

        if args.opt == "merge":
            merged = mtxh.merge_matrix(data1, data2)
            np.savetxt(args.out, merged, fmt="%.3f", delimiter=" ")
            print("Merge matrix file completed!")

        elif args.opt == "pair-t-test":
            t, p = stats.ttest_rel(data1[:, 2], data2[:, 2])
            print("T statistics: %6.3f " % t)
            print("P value : %8.6f" % p)

        elif args.opt == "ind-t-test":
            t, p = stats.ttest_ind(data1[:, 2], data2[:, 2])
            print("T statistics: %6.3f " % t)
            print("P value : %8.6f" % p)

            # linear fit between the two data columns plus a quick scatter plot
            fit = np.polyfit(data1[:, 2], data2[:, 2], deg=1)
            x, y = data1[:, 2], data2[:, 2]
            plt.plot(x, fit[0] * x + fit[1], color='red', lw=2.5)
            plt.scatter(x, y, c='b')
            plt.xlabel("X")
            plt.ylabel("Y")
            plt.show()

    elif args.opt == "transform":
        if args.ds in ['xyz', 'XYZ', '3d']:
            data = mtxh.loadxyz(args.dat[0], args.dtype, args.xyzcol,
                                xyshift=args.xyshift)
            odata = mtxh.xyz2matrix(data)
            np.savetxt(args.out, odata, fmt="%.5f", delimiter=" ")
        else:
            data = mtxh.reshapeMtx(args.dat[0], args.dtype, xyshift=args.xyshift)
            np.savetxt(args.out, data, fmt="%.5f", delimiter=" ")
        print("Transform matrix file type completed!")

    elif args.opt == "extract":
        if args.ds in ['xyz', 'XYZ', '3d']:
            data = mtxh.loadxyz(args.dat[0], args.dtype, args.xyzcol,
                                xyshift=args.xyshift)
        elif args.ds in ['matrix', 'mtx']:
            data = mtxh.reshapeMtx(args.dat[0], args.dtype, xyshift=args.xyshift)
        else:
            sys.exit(0)

        data = data.astype(np.float64)
        d = mtxh.extractDomainData(data, args.xyrange[:2], args.xyrange[2:])
        np.savetxt(args.out, d, fmt="%.5f")
        print("Extract matrix file completed!")

    elif args.opt == "average":
        aver_data = np.array([])
        for i in range(len(args.dat)):
            if args.ds in ['xyz', 'XYZ', '3d']:
                data = mtxh.loadxyz(args.dat[i], args.dtype, args.xyzcol,
                                    xyshift=args.xyshift)
            elif args.ds in ['matrix', 'mtx']:
                data = mtxh.reshapeMtx(args.dat[i], args.dtype,
                                       xyshift=args.xyshift)
            else:
                sys.exit(0)
            print(data.shape)

            # accumulate the data column over all input files
            if i == 0:
                aver_data = data
            else:
                aver_data[:, 2] += data[:, 2]

        aver_data[:, 2] = aver_data[:, 2] / float(len(args.dat))
        np.savetxt(args.out, aver_data, fmt="%.5f")
        print("Average matrix files completed!")

    elif args.opt == "domain-aver":
        if args.ds in ['xyz', 'XYZ', '3d']:
            data = mtxh.loadxyz(args.dat[0], args.dtype, args.xyzcol,
                                xyshift=args.xyshift)
        elif args.ds in ['matrix', 'mtx']:
            data = mtxh.reshapeMtx(args.dat[0], args.dtype, xyshift=args.xyshift)
        else:
            sys.exit(0)

        data = data.astype(np.float64)
        data[:, 0] = data[:, 0] + args.start_res
        data[:, 1] = data[:, 1] + args.start_res

        # domain residue ranges, either from a domain file or from the drange option
        drange = []
        if os.path.exists(args.domain):
            pdb = pdbIO.parsePDB()
            domains = pdb.readDomainRes(args.domain)
            drange = [x[1:] for x in domains]
        else:
            drange = [float(x) for x in args.drange]

        tofile = open(args.out, 'w')
        for i in range(len(drange)):
            tofile.write("# %d %s \n" %
                         (i, " ".join([str(x) for x in drange[i]])))
        print(drange)

        for i in range(len(drange)):
            for j in range(len(drange)):
                if args.dzero and i == j:
                    tofile.write("%3d %3d 0.0 \n" % (i, j))
                else:
                    # average the matrix elements over all residue-range pairs
                    # of domain i and domain j
                    d = []
                    for r1 in range(int(len(drange[i]) / 2)):
                        for r2 in range(int(len(drange[j]) / 2)):
                            ccc = mtxh.extractDomainData(
                                data,
                                xrange=drange[i][2 * r1: 2 * r1 + 2],
                                yrange=drange[j][2 * r2: 2 * r2 + 2])[:, 2]
                            d += list(ccc)
                    tofile.write("%3d %3d %12.3f \n" % (i, j, np.mean(d)))

        tofile.close()
        print("Domain-wise matrix averaging completed!")

    elif args.opt == 'neib0':
        if args.ds in ['xyz', 'XYZ', '3d']:
            data = mtxh.loadxyz(args.dat[0], args.dtype, args.xyzcol,
                                xyshift=args.xyshift)
        elif args.ds in ['matrix', 'mtx']:
            data = mtxh.reshapeMtx(args.dat[0], args.dtype, xyshift=args.xyshift)
        else:
            sys.exit(0)

        data = data.astype(np.float64)
        print(data.shape)

        newd = mtxh.neiborhood2zero(data, neiborsize=args.neibsize,
                                    outtype='mtx', zscale=args.zscale)
        np.savetxt(args.out, newd, fmt='%.3f')
def cmap_nbyn(trajs, ref, rc, lc, v=True, cutoff=0.35,
              allchains=" ABCDEFGH", atomtype=["sidechain", "sidechain"]):
    """Generate sidechain-based contact maps.

    Parameters
    ----------
    trajs : list of mt.Trajectory objects, shape = N
        The input chunks of trajectories, where N is the number of chunks.
    ref : str
        The reference pdb file name.
    rc : list, shape = 3
        The chain identifier and residue range for the x-axis,
        [chain, first_residue, last_residue].
    lc : list, shape = 3
        The chain identifier and residue range for the y-axis,
        [chain, first_residue, last_residue].
    v : bool, default = True
        Whether to print detailed information during the calculation.
    cutoff : float, default = 0.35
        The distance cutoff, in unit nanometer.
    allchains : str, default = ' ABCDEFGH'
        All available chain identifiers in the reference pdb file.
    atomtype : list of str, default = ['sidechain', 'sidechain']
        The atom selections used for the two residue groups.

    Returns
    -------
    contact_map : np.ndarray
        The per-frame contact map, concatenated over all trajectory chunks.
    """
    pdb = pdbIO.parsePDB(inPDB=ref)
    all_resids = pdb.getNdxForRes(ref, chains=allchains)
    print(all_resids)

    # residue indices for the x-axis chain and residue range
    resids_a = []
    for i, item in enumerate(all_resids):
        if item[2] in rc[0] and \
                int(item[1]) in np.arange(int(rc[1]), int(rc[2]) + 1):
            resids_a.append(i)
    print(resids_a)

    # residue indices for the y-axis chain and residue range
    resids_b = []
    for i, item in enumerate(all_resids):
        if item[2] in lc[0] and \
                int(item[1]) in np.arange(int(lc[1]), int(lc[2]) + 1):
            resids_b.append(i)
    print(resids_b)

    contact_map = np.array([])
    for i, traj in enumerate(trajs):
        # calculate the contact map for this trajectory chunk
        verbose(v, "Generate cmap for chunk %5d ......" % i)
        contmap = CmapNbyN(traj, resids_a=resids_a, resids_b=resids_b,
                           cutoff=cutoff)
        contmap.cmap_nbyn(atomtype=atomtype)

        if i == 0:
            contact_map = contmap.cmap_
        else:
            contact_map = np.concatenate((contact_map, contmap.cmap_), axis=0)

    return contact_map
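# A minimal usage sketch for cmap_nbyn, assuming mdtraj is imported as `mt`
# (as the docstring suggests) and that the trajectory and topology file names
# below are placeholders. The trajectory is read in chunks so large files
# do not have to fit in memory.
import mdtraj as mt

ref_pdb = "reference.pdb"                                    # placeholder topology file
chunks = mt.iterload("traj.xtc", top=ref_pdb, chunk=1000)    # placeholder trajectory

cmap = cmap_nbyn(list(chunks), ref_pdb,
                 rc=["A", 1, 120],             # x-axis: chain A, residues 1-120
                 lc=["B", 1, 120],             # y-axis: chain B, residues 1-120
                 cutoff=0.35, allchains=" AB",
                 atomtype=["sidechain", "sidechain"])
print(cmap.shape)                              # one row of contacts per frame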
def drawTimeSeries2D(self, cmapmatrix, refpdb=[],
                     fsize=14,
                     xlabel="", ylabel="",
                     cmaptype="Greys",
                     xlim=[], ylim=[],
                     yticks_loc=[], yticks_labels=[],
                     yticks_showchainid=False,
                     xticks_loc=[], xticks_labels=[],
                     colorbar_label="",
                     colorbar_show=False,
                     savefig="",
                     ):
    """Plot the ligand-protein interactions as a time series (x-axis).

    :param cmapmatrix: str, the data file containing the time-series contact matrix
    :param refpdb: list, [pdbfilename, chain id, residue sequence shift-by]
    :param fsize: int, font size of the axis labels
    :param xlabel: str, x-axis label
    :param ylabel: str, y-axis label
    :param cmaptype: str, matplotlib colormap name
    :param xlim: list, x-axis range
    :param ylim: list, y-axis range
    :param yticks_loc: list, y-tick locations
    :param yticks_labels: list, y-tick labels
    :param yticks_showchainid: bool, whether to show the chain id of protein residues
    :param xticks_loc: list, x-tick locations
    :param xticks_labels: list, x-tick labels
    :param colorbar_label: str, label of the colorbar
    :param colorbar_show: bool, whether to show the colorbar
    :param savefig: str, output figure file name; if empty, the figure is not saved
    :return:
    """
    # load the contact-map file and sort the frames by time
    cmapdata = np.loadtxt(cmapmatrix, delimiter=",")
    cm_sorted = sorted(list(cmapdata), key=lambda x: x[0], reverse=False)

    # get the key protein residues involved in protein-ligand binding
    key_res = []
    true_res = np.sum(np.asarray(cm_sorted)[:, 1:], axis=0) > 0
    for res in range(true_res.shape[0]):
        if true_res[res]:
            key_res.append(res)
    print("KEY RES ", key_res)

    # get the full residue-name index list
    res_labels = []
    if len(refpdb) == 3:
        ppdb = pdbIO.parsePDB("")
        fullreslist = ppdb.getNdxForRes(refpdb[0], [refpdb[1]])

        shortresmap = ppdb.longRes2ShortRes()
        fullreslist = [x for x in fullreslist if x[2] in refpdb[1]]

        for resk in key_res:
            if fullreslist[resk][0] in shortresmap.keys():
                resseq = str(resk + refpdb[2])
                resname = shortresmap[fullreslist[resk][0]]
                chainid = fullreslist[resk][2]

                if yticks_showchainid:
                    res_id = resname + resseq + chainid
                else:
                    res_id = resname + resseq
                res_labels.append(res_id)

    # only keep the contact map of the important residues
    keyres_cmap = np.asarray(cm_sorted)[:, 1:][:, list(key_res)]

    # get the lengths of the x and y axes
    shapex = len(key_res)
    shapey = cmapdata.shape[0] + 1
    print("Protein residue numbers: %d" % shapex)
    print("Time point numbers: %d" % shapey)

    z = np.transpose(keyres_cmap[:, 1:]).T

    plt.pcolormesh(z.T, cmap=plt.get_cmap(cmaptype))
    if colorbar_show:
        plt.colorbar(label=colorbar_label)

    plt.xlabel(xlabel, fontsize=fsize)
    plt.ylabel(ylabel, fontsize=fsize)

    if len(yticks_loc) and len(yticks_labels):
        plt.yticks(yticks_loc, yticks_labels)
    else:
        if len(refpdb) == 3:
            plt.yticks(np.array(range(shapex)) + 0.5, res_labels)

    if len(xlim):
        plt.xlim(xlim)
    if len(ylim):
        plt.ylim(ylim)
    else:
        plt.ylim([0, shapex + 0.5])

    if len(xticks_labels) and len(xticks_loc):
        plt.xticks(xticks_loc, xticks_labels)

    if len(savefig):
        plt.savefig(savefig, dpi=2000)

    plt.show()

    return 1
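# A minimal usage sketch for drawTimeSeries2D (hedged: `plot` is a placeholder
# for an instance of the owning class; the file names and the residue shift
# are examples). The cmap file is the comma-separated time-series matrix with
# the time/frame index in the first column:
#
#   plot.drawTimeSeries2D("cmap_timeseries.csv",
#                         refpdb=["reference.pdb", "A", 0],
#                         xlabel="Time (frames)", ylabel="Residues",
#                         cmaptype="Greys", colorbar_show=True,
#                         colorbar_label="Contacts",
#                         savefig="cmap_timeseries.png")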