def writeSPDB(self):
    """Write one line per atom to ``<self.pdbfile>.spdb``.

    Each line holds the atom's x, y and z coordinates, its residue sequence
    number and the residue code returned by ``AAmap.getAAmap`` for the
    atom's residue name, separated by single spaces.
    """
    aamap = AAmap()
    # The context manager closes the file even when formatting an atom
    # raises (e.g. a non-numeric coordinate).
    with open(self.pdbfile + '.spdb', 'w') as fo:
        for a in self.atoms:
            fo.write('%f %f %f %d %s\n' %
                     (a.x, a.y, a.z, a.resSeq, aamap.getAAmap(a.resName)))
def getSeq(self):
    """Build the residue sequence of ``self.atoms`` and fill ``self.resDict``.

    Atoms are scanned in order and every change of ``resSeq`` between
    consecutive atoms starts a new residue.  For each new residue the code
    returned by ``AAmap.getAAmap`` is appended to the sequence, and
    ``self.resDict['<chainID><resSeq>']`` is set to
    ``(index in the sequence, character at that index)``.

    Returns:
        The concatenated sequence string.
    """
    aamap = AAmap()
    seq = ''
    # None can never equal a residue number.  The previous sentinel (-1)
    # silently dropped the first residue of structures that start at -1.
    last_resSeq = None
    seqPos = 0
    for a in self.atoms:
        if a.resSeq == last_resSeq:
            continue
        seq = seq + aamap.getAAmap(a.resName)
        last_resSeq = a.resSeq
        key = '%s%s' % (a.chainID, a.resSeq)
        # assumes getAAmap returns exactly one character per residue, so
        # seq[seqPos] is the code just appended -- TODO confirm
        self.resDict[key] = (seqPos, seq[seqPos])
        seqPos += 1
    return seq
def filterClusters(self):
    """Build a neighbour cluster around every atom and keep the compact ones.

    For each atom ``i`` a ``cluster`` is created and seeded with the atom
    itself.  Every atom ``j`` whose entry ``self.pairwiseDict['i-j']`` is at
    most ``self.cutoff`` and whose index distance ``abs(i - j)`` is at least
    ``self.scutoff`` is added as a neighbour, together with the value of
    ``self.calculateThetaPhi`` for the pair.  Clusters with fewer than
    ``self.nbcutoff`` neighbours are skipped.  A remaining cluster is
    printed as one comma-separated line and appended to ``self.clusters``
    when ``self.clusterMeanDist`` returns less than 5.8.

    ``self.pairwise()`` is called first when ``self.pairwiseDict`` is empty.
    """
    if not self.pairwiseDict:
        self.pairwise()

    amap = AAmap()
    for i, centerAtom in enumerate(self.atoms):
        c = cluster(self.pdb, self.top, self.pfam, '', '', self.seqheader,
                    '', '', self.center, self.cutoff, self.scutoff,
                    self.flag, 1.0, self.desc)
        c.addNeighbor(amap, centerAtom, i)  # put itself in first

        nbnum = 0
        for j, otherAtom in enumerate(self.atoms):
            key = "%d-%d" % (i, j)
            if (self.pairwiseDict[key] <= self.cutoff
                    and abs(i - j) >= self.scutoff):
                c.addNeighbor(amap, otherAtom, j)
                nbnum += 1
                c.thetaPhi.append(
                    self.calculateThetaPhi(centerAtom, otherAtom))

        if nbnum < self.nbcutoff:
            continue

        c.pdbidx = c.pdbidx.lstrip()  # will change meanDist
        c.pdbResSeq = c.pdbResSeq.lstrip()
        meanDist = self.clusterMeanDist(c)
        # NOTE(review): 5.8 is a hard-coded threshold; presumably in the same
        # unit as the pairwise distances -- confirm.
        if meanDist < 5.8:
            # Format inside the call: "print('fmt') % args" is only valid as
            # a Python 2 print statement and fails on Python 3.
            print('%s,%0.2f,%s,%s,%s,%s' % (
                self.pdb, meanDist, ''.join(sorted(c.str)),
                ''.join(sorted(c.typeStr)), c.pdbResSeq,
                self.getSphericalStr(c)))
            self.clusters.append(c)
def writeFASTA(self):
    """Print the residue sequence as a FASTA record and save it to ``<self.pdb>.fa``.

    A residue is counted each time ``resSeq`` changes between consecutive
    atoms of ``self.atoms``; its code comes from ``AAmap.getAAmap``.  The
    record is the header ``>{self.pdb}/1-{number of residues}`` followed by
    the sequence on a single line.
    """
    aamap = AAmap()
    codes = []
    # None never equals a residue number, so a structure whose first residue
    # is numbered -1 keeps that residue (the old sentinel -1 dropped it).
    last_resSeq = None
    for a in self.atoms:
        if last_resSeq != a.resSeq:
            codes.append(aamap.getAAmap(a.resName))
            last_resSeq = a.resSeq

    seq = ''.join(codes) + '\n'
    header = '>%s/1-%d\n' % (self.pdb, len(codes))
    print(header + seq)
    # Context manager: the file is closed even if the write fails.
    with open(self.pdb + '.fa', 'w') as fp:
        fp.write(header + seq)
def writeFASTA(self):
    """Emit the sequence of ``self.atoms`` in FASTA format.

    The record (header line ``>{self.pdb}/1-{residue count}`` plus one
    sequence line) is printed and also written to ``<self.pdb>.fa``.  A new
    residue starts whenever ``resSeq`` differs from that of the previous
    atom; residue codes are obtained from ``AAmap.getAAmap``.
    """
    fafile = self.pdb + '.fa'
    aamap = AAmap()
    residues = []
    count = 0
    # Use None rather than -1 as the "no residue seen yet" marker: -1 is a
    # legal residue number and would make the first residue disappear.
    last_resSeq = None
    for a in self.atoms:
        if a.resSeq == last_resSeq:
            continue
        residues.append(aamap.getAAmap(a.resName))
        last_resSeq = a.resSeq
        count += 1

    record = '>%s/1-%d\n' % (self.pdb, count) + ''.join(residues) + '\n'
    print(record)
    # "with" guarantees the handle is released even when write() raises.
    with open(fafile, 'w') as fp:
        fp.write(record)
def __init__(self, nafile):
    """Parse the per-residue records of the file ``nafile``.

    Args:
        nafile: path of the file to read; its first four characters are
            stored as ``self.pdb``.  (Presumably a NACCESS ``.rsa`` file --
            confirm.)

    Attributes filled:
        rsaDict:  ``'<residue code><chain><resi>'`` -> ``rsa`` object, one
                  entry per line starting with ``RES``.
        resiDict: ``'<residue code><self.accessible(key)>'`` -> string of
                  space-terminated ``'<chain><resi> '`` items.
        rasDict:  ``'TOTAL'`` -> whitespace-split fields of the line that
                  starts with ``TOTAL`` (empty if there is none).
    """
    self.pdb = nafile[0:4]
    self.rsaDict = {}
    self.resiDict = defaultdict(str)
    self.alphabet = ['B', 'E']
    # The TOTAL branch below used to assign into this attribute without it
    # ever being created.
    # NOTE(review): the name looks like a typo of ``rsaDict``; it is kept so
    # that rsaDict continues to hold rsa objects only -- confirm intent.
    self.rasDict = {}

    aamap = AAmap()
    # Read everything up front so the handle is closed before parsing.
    with open(nafile) as fin:
        lines = [line.strip() for line in fin]

    for naline in lines:
        if naline.startswith('RES'):
            r = rsa(naline)
            resn = aamap.getAAmap(r.resn)
            key = '%s%s%s' % (resn, r.chain, r.resi)
            # must be stored before self.accessible(key) is called
            self.rsaDict[key] = r
            varkey = '%s%s' % (resn, self.accessible(key))
            self.resiDict[varkey] = '%s%s%s ' % (
                self.resiDict[varkey], r.chain, r.resi)
        elif naline.startswith('TOTAL'):
            # Previously unreachable: the test compared the first three
            # characters of the line with the five-character 'TOTAL'.
            self.rasDict['TOTAL'] = naline.split()
def resn2bfactor():
    """Write a copy of a PDB file whose B-factors encode the residue type.

    The input path is taken from ``sys.argv[2]`` and loaded with
    ``protein``.  For every atom, ``tempFactor`` is replaced by the integer
    score of its residue code (as returned by ``AAmap.getAAmap``) and the
    atom is written with ``writeAtom()`` to the input path with its last
    four characters replaced by ``_rb.pdb``.

    Prints a usage note and returns when fewer than three command-line
    arguments are present.

    Raises:
        KeyError: if a residue code has no entry in the score table.
    """
    if len(sys.argv) < 3:
        print('resn2bfactor(): replace b factor values with residue type.')
        print('resn2bfactor(): used for pymol spectrum b')
        return

    scoreValue = {
        'X': 0, '-': 0, '.': 0, 'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5,
        'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13,
        'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20,
        'B': 3,
    }

    aamap = AAmap()
    pdbfile = sys.argv[2]
    p = protein(pdbfile)
    outfile = '%s_rb.pdb' % pdbfile[:-4]
    # The context manager closes the output even if a residue code is
    # missing from scoreValue and the lookup raises KeyError.
    with open(outfile, 'w') as fout:
        for a in p.atoms:
            code = aamap.getAAmap(a.resName)
            newBFactor = scoreValue[code]
            print('new b-factor: [%s : %s] -> %d' %
                  (a.resName, code, newBFactor))
            a.tempFactor = newBFactor
            fout.write(a.writeAtom())
    print('Output file: %s' % outfile)
def getSeq(self):
    """Derive the sequence and per-residue groupings from ``self.atoms``.

    Walks the atoms in order; each change of ``resSeq`` opens a new
    residue.  For every new residue the code from ``AAmap.getAAmap`` is
    appended to the sequence, ``self.resDict['<chainID><resSeq>']`` is set
    to ``(index in sequence, character at that index)`` and a
    ``(chainID, code, resSeq)`` tuple is recorded.

    Returns:
        A tuple ``(sequence, residue_keys, atoms_by_residue)`` where
        ``atoms_by_residue`` is a list of atom lists, one per residue, in
        the order encountered.
    """
    aamap = AAmap()
    sequence = ''
    residueKeys = []
    atomsByResidue = []
    currentAtoms = []
    nextIndex = 0
    # Sentinel well outside the residue numbers seen in practice; -1 cannot
    # be used because 1a8v starts at residue -1.
    previousResSeq = -9999

    for atm in self.atoms:
        if atm.resSeq != previousResSeq:
            sequence += aamap.getAAmap(atm.resName)
            previousResSeq = atm.resSeq
            key = '%s%d' % (atm.chainID, atm.resSeq)
            self.resDict[key] = (nextIndex, sequence[nextIndex])
            nextIndex += 1
            residueKeys.append(
                (atm.chainID, aamap.getAAmap(atm.resName), atm.resSeq))
            # A residue boundary closes the group collected so far.
            if currentAtoms:
                atomsByResidue.append(currentAtoms)
                currentAtoms = []
        currentAtoms.append(atm)

    # Only a resSeq change flushes a group above, so the final residue is
    # still pending once the loop ends.
    if currentAtoms:
        atomsByResidue.append(currentAtoms)

    return sequence, residueKeys, atomsByResidue
def main():
    """Cluster the atoms of a structure hierarchically and report contacts.

    Command line: ``python proc_dendrogram.py preffix cutoff``.  ``preffix``
    is handed to ``protein`` to load the atoms and is also the stem of every
    output file; ``cutoff`` is parsed as a float.

    Files written:
        ``<preffix>.resimap``   atom index, chain + residue number, residue code
        ``<preffix>.pdist``     Euclidean distance of every atom pair i < j
        ``<preffix>.hcluster``  one line per row of the complete-linkage matrix
        ``<preffix>.hcg``       leaves of every cluster whose final verdict is True

    Prints the usage line and returns when arguments are missing.
    """
    if len(sys.argv) < 3:
        print('python proc_dendrogram.py preffix cutoff')
        # A bare ``exit`` is a no-op expression; leave the function instead
        # of falling through to sys.argv[1].
        return

    preffix = sys.argv[1]
    cutoff = float(sys.argv[2])

    # load tip pdb file
    pr = protein(preffix)
    aamap = AAmap()
    n = len(pr.atoms)

    resimap = {}
    px = []
    print('writing %s.resimap ...' % (preffix))
    with open(preffix + '.resimap', 'w') as fr:
        for count, a in enumerate(pr.atoms):
            px.append((a.x, a.y, a.z))
            resi = '%s%d' % (a.chainID, a.resSeq)
            resn = aamap.getAAmap(a.resName)
            resimap[count] = (resi, resn)
            fr.write('%d %s %s\n' % (count, resi, resn))
    x = np.array(px)

    # calculate pairwised distance
    pdist = {}
    print('writing %s.pdist ...' % (preffix))
    with open(preffix + '.pdist', 'w') as fo:
        for i in range(len(x)):
            for j in range(i + 1, len(x)):
                dist = np.linalg.norm(x[i] - x[j])
                pdist['%d-%d' % (i, j)] = dist
                fo.write('%d-%d : %f\n' % (i, j, dist))

    # for hc extraction
    hcdict = {}
    hclist = []
    existdict = {}

    linkage_matrix = linkage(x, "complete")

    print('writing %s.hcluster ...' % (preffix))
    with open(preffix + '.hcluster', 'w') as fo1:
        for i, row in enumerate(linkage_matrix):
            # Row i describes the cluster with id n + i.
            hcline = '%d %d %d %f %d' % (n + i, row[0], row[1], row[2], row[3])
            fo1.write(hcline + '\n')
            h = hc(hcline, n)
            hcdict[h.clusterID] = h
            hclist.append(h)

    # resolve leaves for each cluster
    print('resolving leaves ...')
    for h in hclist:
        h.getChildren(hcdict)

    print('iterating clusters for largest proximity contact ...')
    # add single leaf in: every atom is its own cluster with verdict True
    for i in range(n):
        leaf = hc('%d %d %d 0.0 1' % (i, i, i), n)
        leaf.leaves = [i]
        hcdict[i] = leaf
        existdict[i] = True

    # existdict maps cluster id -> verdict.  A cluster that is merged into
    # an accepted parent is reset to False so only the largest accepted
    # cluster is reported.  Verdicts are compared with True/False
    # explicitly because the return type of checkProximity and
    # checkProximity2 is not visible here.
    for h in hclist:
        if h.dist <= cutoff:
            # A child without a verdict is evaluated on its own first.
            for child in (h.c1, h.c2):
                if child not in existdict:
                    existdict[child] = checkProximity(
                        hcdict[child], pdist, cutoff)
            if existdict[h.c1] == True and existdict[h.c2] == True:
                merged = checkProximity2(
                    hcdict[h.c1], hcdict[h.c2], pdist, cutoff)
                # Always record the parent verdict, including the case where
                # neither child had been checked beforehand.
                existdict[h.clusterID] = merged
                if merged == True:
                    # combine both and drop the sub clusters
                    existdict[h.c1] = False
                    existdict[h.c2] = False
            elif existdict[h.c1] == False or existdict[h.c2] == False:
                existdict[h.clusterID] = False
        elif h.dist > cutoff:
            existdict[h.clusterID] = False
            for child in (h.c1, h.c2):
                if child not in existdict:
                    existdict[child] = checkProximity(
                        hcdict[child], pdist, cutoff)

    # print out the result
    print('writing result into %s.hcg' % preffix)
    count = 0
    with open(preffix + '.hcg', 'w') as fout:
        for hid in existdict:
            if existdict[hid] == True:
                fout.write('%s,%s\n' %
                           (preffix, hcdict[hid].writeLeaves(resimap)))
                count += len(hcdict[hid].leaves)
    print('%d leaves in total\n' % count)
def writeSPDB(self):
    """Dump ``self.atoms`` to ``<self.pdbfile>.spdb``, one atom per line.

    Line format: ``x y z resSeq code`` where ``code`` is what
    ``AAmap.getAAmap`` returns for the atom's residue name.
    """
    aamap = AAmap()
    # Opened with "with" so the handle is released even if an atom cannot
    # be formatted and the loop aborts.
    with open(self.pdbfile + '.spdb', 'w') as fo:
        fo.writelines(
            '%f %f %f %d %s\n' % (a.x, a.y, a.z, a.resSeq,
                                  aamap.getAAmap(a.resName))
            for a in self.atoms)
def __init__(self, pdbname, chain='all', top='', pfam='', center='CA',
             cutoff=5, scutoff=1, flag=0, desc='', nbcutoff=4):
    """Load the ATOM records of the first model of a PDB file.

    Args:
        pdbname: path of the PDB file.  Stored as ``self.pdbfile``; the path
            minus its last four characters becomes ``self.pdb`` and
            ``self.seqheader``.
        chain: ``'all'`` keeps every chain; any other value keeps only
            lines whose character at index 21 equals it.
        top, pfam, center, flag, desc: stored on the instance unchanged.
        cutoff, scutoff, nbcutoff: stored on the instance unchanged
            (presumably distance / sequence-separation / neighbour-count
            thresholds used by other methods -- confirm).

    After loading, ``self.getSeq()`` fills ``self.seq``, ``self.resArray``,
    ``self.resAtoms`` and ``self.resDict``, from which ``self.seqDict`` is
    derived.
    """
    self.atoms = []
    # dictionary for pairwise distance
    self.pairwiseDict = {}
    self.clusters = []

    self.pdbfile = pdbname
    self.pdb = pdbname[:-4]
    self.chain = chain
    self.top = top
    self.pfam = pfam
    self.center = center
    self.cutoff = cutoff
    self.scutoff = scutoff
    self.seqheader = self.pdb
    self.flag = flag
    self.desc = desc
    self.nbcutoff = nbcutoff

    self.ca = []

    with open(pdbname, 'r') as fin:
        lines = fin.readlines()

    lastname = ''
    lastres = ''
    aamap = AAmap()
    for line in lines:
        # load only one model
        if 'END' in line[0:6]:
            break
        # skip every line whose residue-name field is not a known residue
        if line[17:20].strip() not in aamap.AAA2A:
            continue
        if self.chain != 'all' and self.chain != line[21]:
            continue
        # The first six characters hold the record name, padded with blanks.
        # Strip the padding before comparing: a six-character slice can
        # never equal a shorter literal.
        if line[0:6].strip() != 'ATOM':
            continue
        at = atom(line)
        if at.name == lastname and at.resSeq == lastres:
            # Same atom name and residue number as the previous record:
            # treated as an alternative location and ignored.
            continue
        self.atoms.append(at)
        if at.name.strip() == 'CA':
            self.ca.append(at)
        lastname = at.name
        lastres = at.resSeq

    # map for Chain+Resi : (index in sequence, ResName)
    # 'B529': (132, 'V')
    self.resDict = {}  # assigned in self.getSeq() function
    # resAtoms, a list of lists, each (element) list contains atoms of residues
    # resArray, gives a list of keys eg. (A,Q,70), (A,I,71), (A,V,72)
    # The sequence is not taken from self.ca because some residues have no
    # CA atom (e.g. the last residue of 1e6i.aln.pdb).
    self.seq, self.resArray, self.resAtoms = self.getSeq()

    # map for sequence index: Chain+Resi(ResName)
    # 132 : 'B529(V)'
    self.seqDict = {-1: '.'}
    for r in self.resDict:
        entry = self.resDict[r]
        self.seqDict[entry[0]] = '%s(%s)' % (r, entry[1])