示例#1
0
def msai2resi():
    """Write the MSA-position -> PDB-residue mapping for a target sequence.

    Command line (read from ``sys.argv``)::

        python utils_msa.py msai2resi <msafile> <target> <pdbfile>

    One ``msai resi resn`` record per line is written to
    ``<msafile>.<pdbfile>.map``.  No file is written when the residue map is
    empty or when two residues map to the same MSA position.
    """
    # Script name + command name + three operands = five entries; the
    # previous "< 4" guard let sys.argv[4] raise IndexError.
    if len(sys.argv) < 5:
        print('msai2resi: output the mapping between msa position index and pdb residue number')
        print('example:python utils_msa.py msai2resi PF07714_full.fa BTK_HUMAN 1k2p.pdb\n')
        print('output: PF07714_full.fa.1k2p.pdb.map')
        return

    msafile = sys.argv[2]
    target = sys.argv[3]
    pdbfile = sys.argv[4]
    outfile = msafile + '.' + pdbfile + '.map'

    print('msafile: %s\ntarget header: %s\npdbfile: %s\noutput file: %s' % (msafile, target, pdbfile, outfile))
    m = msa(msafile)
    p = protein(pdbfile)
    rtmap = m.getResiTargetMap(p, target)
    if len(rtmap) < 1:
        print('error occoured in generating rtmap')
        return

    # Invert rtmap (resi -> (msai, resn)) into trmap (msai -> (resi, resn)),
    # e.g. 3128: (B641, 'R').  Validate everything before touching the
    # output file so a duplicate never leaves a half-written map behind.
    trmap = {}
    records = []
    for k in rtmap:
        msai, resn = rtmap[k]
        if msai in trmap:
            print('error. duplicate key [%d] in rtmap' % msai)
            return
        trmap[msai] = (k, resn)
        records.append((msai, k, resn))

    with open(outfile, 'w') as fout:
        for msai, k, resn in records:
            # Residue id and residue name are written with %s: the examples
            # in this file show them as strings ('B641', 'R'), which '%d'
            # rejects.  Each record gets its own line.
            fout.write('%d %s %s\n' % (msai, k, resn))
def main():
    """Run spectral clustering on the structure file named in ``sys.argv[1]``.

    Prints a usage line and returns when no file argument is given.
    """
    argv = sys.argv
    if len(argv) >= 2:
        # Structure is built with the 'v4' option and center='TIP'.
        structure = protein(argv[1], 'v4', center='TIP')
        structure.spectralClustering(6)
        return
    print("Usage: proc_spectralFilter.py pdb(tip)file >> tip_clusters.txt")
def main():
    """Extract the residue ranges listed in a domain description file.

    Every non-blank line of the file named by ``sys.argv[1]`` has the form
    ``tip_filename,start,end``.  For each such line the atoms of
    ``a<tip_filename>.tip`` whose ``resSeq`` lies in ``[start, end]``
    (inclusive) are written to ``<tip_filename>.domain``.

    Raises:
        IndexError / ValueError: if a non-blank line does not contain a
            name followed by two integers.
    """
    if len(sys.argv) < 2:
        print("Usage python proc_extractDomain.py domain_desc_file")
        return

    with open(sys.argv[1], 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line used to crash on strArr[1].
                continue
            strArr = line.split(',')

            tip_filename = strArr[0]
            start = int(strArr[1])
            end = int(strArr[2])

            domain_filename = tip_filename + '.domain'
            print(domain_filename)
            p = protein('a' + tip_filename + '.tip', 'alpha', center='TIP')

            # 'with' guarantees the output is flushed and closed even if an
            # atom fails to serialise.
            with open(domain_filename, 'w') as fo:
                for a in p.atoms:
                    if start <= a.resSeq <= end:
                        fo.write(a.writeAtom())
def main():
    """Run cluster filtering on the structure file named in ``sys.argv[1]``.

    Prints a usage line and returns when no file argument is given.
    """
    argv = sys.argv
    if len(argv) >= 2:
        # Structure is built with the 'v3' option and center='TIP'.
        structure = protein(argv[1], 'v3', center='TIP')
        structure.filterClusters()
        return
    print("Usage: proc_SinglePDBFilter.py pdb(tip)file >> tip_clusters.txt")
示例#5
0
def writencg():
    """Write one contact-group line per atom of a pdb file to ``<pdbfile>.ncg``.

    Command line::

        python utils_ncg.py writencg <pdbfile> <size>

    An ``ncg`` object is created for every atom first; each one is then
    grown against the full atom list and its ``outStr()`` is written.
    """
    if len(sys.argv) < 4:
        for usage in (
            'writencg(): write non parametric contact group matrix for a (coarse-grained) pdb with size cutoff',
            'writencg(): python utils_ncg.py writencg 1t3r.pdb 3',
        ):
            print(usage)
        return

    pdbfile = sys.argv[2]
    size = int(sys.argv[3])
    outfile = pdbfile + '.ncg'

    print('writencg(): pdbfile: %s' % pdbfile)
    print('writencg(): ncg size: %d' % size)
    print('writencg(): output: %s' % outfile)

    structure = protein(pdbfile)
    # All groups are constructed before any of them is grown.
    groups = [ncg(atom, size) for atom in structure.atoms]

    fout = open(outfile, 'w')
    for group in groups:
        group.grow(structure.atoms)
        fout.write(group.outStr() + '\n')
    fout.close()
示例#6
0
def main():
    """Print every contact group of exactly ``n_mer`` residues that passes the filter.

    Command line::

        proc_N-mer.py tip_file cluster_file n_mer dist_cutoff

    Each line of ``cluster_file`` is parsed into a ``cgroup``:

    * groups smaller than ``n_mer`` are skipped;
    * groups of exactly ``n_mer`` residues are printed if
      ``cgResiGroupFilter`` accepts them;
    * larger groups are expanded into all ``n_mer``-sized residue
      combinations, each of which is filtered and printed on its own.
    """
    # Four operands are read below (argv[1]..argv[4]); the previous "< 3"
    # guard let argv[3] / argv[4] raise IndexError.
    if len(sys.argv) < 5:
        print("Usage: proc_N-mer.py tip_file cluster_file n_mer dist_cutoff >> n_mer_outfile")
        return
    infile = sys.argv[1]
    infile2 = sys.argv[2]
    n_mer = int(sys.argv[3])
    cutoff = float(sys.argv[4])

    p = protein(infile, center='TIP')
    p.initCGResiMap()

    # The 'with' block closes the file; no explicit close() is needed.
    with open(infile2) as fp:
        for line in fp:
            cg = cgroup(line.strip())
            size = cg.getSize()
            if size < n_mer:
                continue
            if size == n_mer:
                # Explicit "== True" kept: the return type of
                # cgResiGroupFilter is not visible from here.
                if p.cgResiGroupFilter(cg, cutoff) == True:
                    print(cg.getString())
                continue

            # Larger group: test every n_mer-sized subset of its residues.
            for idx in itertools.combinations(range(size), n_mer):
                sub_cg = cgroup()
                sub_cg.pdb = cg.pdb
                sub_cg.chain = cg.chain
                for i in idx:
                    sub_cg.AAgroup = sub_cg.AAgroup + cg.AAgroup[i]
                    sub_cg.resi.append(cg.resi[i])
                if p.cgResiGroupFilter(sub_cg, cutoff) == True:
                    print(sub_cg.getString())
示例#7
0
def ncg2sdiicol():
    """Write the MSA columns of significant contact groups to a ``.sdiicol`` file.

    Command line::

        python utils_msa.py ncg2sdiicol <pdbfile> <ncgfile> <msafile> <sdiifile> <target> <orders>

    ``orders`` is a comma-separated list of group sizes.  For each size
    ``i`` the first ``i`` MSA indices of every contact group are sorted and
    joined with '-' to form a key; groups whose key is present in the
    loaded sdii dictionary contribute their column indices.  The collected
    indices are written space-separated to
    ``<pdbfile[0:4]>_<msafile[0:7]>.sdiicol``.
    """
    # Six operands are read (argv[2]..argv[7]); the previous "< 7" guard let
    # argv[7] raise IndexError.
    if len(sys.argv) < 8:
        print('ncg2sdiicol: write selected MSA column into .sdiicol file')
        print('python utils_msa.py ncg2sdiicol 1aps_A_1_97.rpdb.tip 1aps_A_1_97.rpdb.tip.ncg PF00708_full.txt.rseq PF00708_full.txt.all_2_sdiii ACYP2_HORSE 2')
        return

    pdbfile = sys.argv[2]   # pdb name
    ncgfile = sys.argv[3]   # contact group file
    msafile = sys.argv[4]   # msa (full or reduced)
    sdiifile = sys.argv[5]  # sdii
    target = sys.argv[6]    # target name
    orderlist = [int(i) for i in sys.argv[7].split(',')]
    outfile = pdbfile[0:4] + '_' + msafile[0:7] + '.sdiicol'

    print('pdbfile :%s' % pdbfile)
    print('ncgfile :%s' % ncgfile)
    print('msafile :%s' % msafile)
    print('sdiifile :%s' % sdiifile)
    print('uniprot name :%s' % target)
    print('ncg order list : [%s]' % repr(orderlist))
    print('outfile: %s' % outfile)

    # MSA as a character matrix; only its shape is reported here.
    m = msa(msafile)
    msaMatrix = np.array([list(s[1]) for s in m.msaArray])
    print('msa matrix: ' + repr(msaMatrix.shape))

    # resi -> (msa index, resn), e.g. 'A9': (14, 'V')
    p = protein(pdbfile)
    rtmap = m.getResiTargetMap(p, target)

    sdiidict = loadsdii(sdiifile)            # key: 39-140-210, value: 0.0788...
    msaGroupArray = ncg2msa(ncgfile, rtmap)  # unsorted [[86, 83, 198], ...]

    colset = set()
    for i in orderlist:
        for g in msaGroupArray:
            # Sorted copy of the first i members forms the lookup key.
            rg = sorted(g[0:i])
            sdiikey = '-'.join([str(r) for r in rg])
            if sdiikey not in sdiidict:
                continue
            print((sdiikey, sdiidict[sdiikey]))
            colset.update(rg)

    print('ncg2sdiicol():writing %s: %s' % (outfile, repr(colset)))
    with open(outfile, 'w') as fout:
        fout.write(' '.join([str(c) for c in colset]))
示例#8
0
def getresset():
    """Print the ``seq`` attribute of a protein built from two arguments.

    Command line::

        python utils_sdii.py getresset <pdbfile> <chain>

    ``sys.argv[2]`` and ``sys.argv[3]`` are passed straight to ``protein``.
    """
    # Both argv[2] and argv[3] are read; the previous "< 2" guard could
    # never prevent the IndexError.  The usage text also named an argument
    # ('result_sdii') that the function does not take.
    if len(sys.argv) < 4:
        print('getresset(): python utils_sdii.py getresset pdbfile chain')
        return

    pdbfile = sys.argv[2]
    chain = sys.argv[3]
    p = protein(pdbfile, chain)
    print(p.seq)
示例#9
0
def main():
    """Write a ``.fa`` file for every entry listed in ``pdblist.txt``.

    For each stripped line ``name`` the structure ``name.tip`` is loaded
    with the 'fasta' option and ``writeSeq('name.fa')`` is called on it.
    """
    listing = open('pdblist.txt', 'r')
    names = [entry.strip() for entry in listing.readlines()]
    listing.close()

    for name in names:
        # The '.tip' file is the input here, not the '.pdb' one.
        structure = protein(name + '.tip', 'fasta')
        structure.writeSeq(name + '.fa')
示例#10
0
def main():
    """Write a ``ca_<name>.pdb`` file for every entry listed in ``pdblist.txt``.

    For each stripped line ``name`` the file ``name.pdb`` is announced on
    stdout, loaded with the 'CA_A' option, and passed through
    ``writeChainACA``.
    """
    listing = open('pdblist.txt', 'r')
    names = [entry.strip() for entry in listing.readlines()]
    listing.close()

    for name in names:
        source = name + '.pdb'
        print(source)
        structure = protein(source, 'CA_A')
        structure.writeChainACA('ca_' + name + '.pdb')
示例#11
0
def sdii2resi():
    """Rewrite an sdii file with MSA indices replaced by PDB residues.

    Command line::

        python utils_msa.py sdii2resi <msafile> <target> <pdbfile> <sdiifile>

    Each non-blank sdii line is split on single spaces; field 2 is a
    '-'-joined list of MSA indices and field 3 is the sdii value, e.g.
    ``52 [pid:20029] 926-3089-3128 0.001106226720675``.  Every index is
    replaced by ``repr((resi, resn))`` and the result is written to
    ``<sdiifile>_resi``.

    Raises:
        KeyError: if an MSA index in the sdii file has no mapped residue.
    """
    # Four operands are read (argv[2]..argv[5]); the previous "< 5" guard
    # let argv[5] raise IndexError.  The usage label used to say
    # 'resi2target' (copied from a sibling command).
    if len(sys.argv) < 6:
        print('sdii2resi: convert msa position indices in an sdii file into pdb residues')
        print('example:python utils_msa.py sdii2resi PF07714_full.fa.r50 BTK_HUMAN 1k2p.pdb PF07714_full.fa.r50.3128_3_sdii\n')
        print('output: PF07714_full.fa.r50.3128_3_sdii_resi')
        return

    msafile = sys.argv[2]
    target = sys.argv[3]
    pdbfile = sys.argv[4]
    sdiifile = sys.argv[5]

    print('msafile: %s\ntarget header: %s\npdbfile: %s\nsdii file: %s' % (msafile, target, pdbfile, sdiifile))
    m = msa(msafile)
    p = protein(pdbfile)
    rtmap = m.getResiTargetMap(p, target)
    if len(rtmap) < 1:
        print('error occoured in generating rtmap')
        return

    # Invert rtmap (resi -> (msai, resn)) into trmap (msai -> (resi, resn)),
    # e.g. 3128: (B641, 'R').  A plain dict inversion would silently drop
    # duplicates, so they are detected explicitly.
    trmap = {}
    for k in rtmap:
        msai, resn = rtmap[k]
        if msai in trmap:
            print('error. duplicate key [%d] in rtmap' % msai)
            return
        trmap[msai] = (k, resn)

    # Blank lines carry no record and would fail the field indexing below.
    with open(sdiifile) as f:
        sdiilines = [line for line in f if line.strip()]

    outfile = sdiifile + '_resi'
    total = len(sdiilines)
    with open(outfile, 'w') as fout:
        for count, line in enumerate(sdiilines, 1):
            print('%d/%d processed ...' % (count, total))
            strArr = line.strip().split(' ')
            msailist = strArr[2].split('-')
            sdiivalue = strArr[3]
            residues = '-'.join([repr(trmap[int(i)]) for i in msailist])
            fout.write('%s %s\n' % (residues, sdiivalue))
    print('done.\noutput file: [%s]' % outfile)
示例#12
0
def writeseq():
    """Write the ``seq`` attribute of a pdb file to ``<pdbfile>.seq``.

    Command line::

        python utils_protein.py writeseq <pdbfile>
    """
    if len(sys.argv) < 3:
        for usage in (
            'writeseq(): write pdb sequence',
            'writeseq(): python utils_protein.py writeseq 1t3r.pdb',
            'writeseq(): output: 1t3r.pdb.seq',
        ):
            print(usage)
        return

    pdbfile = sys.argv[2]
    outfile = pdbfile + '.seq'
    print('writeseq(): pdbfile: %s' % pdbfile)
    print('writeseq(): outfile: %s' % outfile)

    structure = protein(pdbfile)
    # The structure is loaded before the output file is created.
    fout = open(outfile, 'w')
    fout.write(structure.seq + '\n')
    fout.close()
示例#13
0
def searchpdbseq():
    """Report whether a PDB sequence can be located in an MSA.

    Command line::

        python utils_msa.py searchpdbseq <msafile> <pdbfile>

    Calls ``searchTargetPDB`` on the MSA and prints a message when it
    returns 0.
    """
    # argv[2] and argv[3] are read; the previous "< 2" guard could never
    # prevent the IndexError.
    if len(sys.argv) < 4:
        print('searchpdbseq: locate pdb sequence in MSA')
        print('example: python utils_msa.py searchpdbseq PF07714_full.fa 1T49_A.pdb\n')
        return

    msafile = sys.argv[2]
    target = sys.argv[3]

    print('msa file: %s' % msafile)
    print('pdb target: %s' % target)

    m = msa(msafile)
    p = protein(target)

    if m.searchTargetPDB(p) == 0:
        print('cannot locate pdb sequence in MSA')
示例#14
0
def main():
    """Write a ``.tip`` file for every structure listed in a pdb list file.

    The list file is ``sys.argv[1]``.  Each stripped line is used as the
    structure file name as-is (no '.pdb' suffix is appended), echoed to
    stdout, loaded, and passed to ``writeChainATips`` with the
    'AAtips.def' definition file.
    """
    if len(sys.argv) < 2:
        print('Usage: proc_getTip.py pdblist')
        return

    listing = open(sys.argv[1], 'r')
    names = [entry.strip() for entry in listing.readlines()]
    listing.close()

    for name in names:
        print(name)
        structure = protein(name)
        structure.writeChainATips('AAtips.def', name + '.tip')
示例#15
0
def resi2target():
    """Look up the MSA position of one PDB residue for a target sequence.

    Command line::

        python utils_msa.py resi2target <msafile> <target> <pdbfile> <resi>

    Returns:
        ``(resi, rtmap[resi][0], rtmap[resi][1])``, or ``None`` when the
        usage text is shown or the residue map is empty.

    Raises:
        KeyError: if ``resi`` is missing from ``p.resDict`` or from the
            residue map.
    """
    # Four operands are read (argv[2]..argv[5]); the previous "< 5" guard
    # let argv[5] raise IndexError.
    if len(sys.argv) < 6:
        print('resi2target: given a residue number output the corresponding position in target msa')
        print('example:python utils_msa.py resi2target PF07714_full.fa.r50 BTK_HUMAN 1k2p.pdb B641\n')
        return

    msafile = sys.argv[2]
    target = sys.argv[3]
    pdbfile = sys.argv[4]
    tvar = sys.argv[5]

    print('msafile: %s\ntarget header: %s\npdbfile: %s\ntarget variable: %s' % (msafile, target, pdbfile, tvar))
    m = msa(msafile)
    p = protein(pdbfile)
    print(p.resDict[tvar])
    rtmap = m.getResiTargetMap(p, target)
    if len(rtmap) < 1:
        return
    mapped = rtmap[tvar]
    print('map %s: %s' % (tvar, repr(mapped)))
    return (tvar, mapped[0], mapped[1])
示例#16
0
def resi2msai():
    """Look up the MSA index of one PDB residue for a target sequence.

    Command line::

        python utils_msa.py resi2msai <msafile> <target> <pdbfile> <resi>

    Returns:
        ``(resi, rtmap[resi][0], rtmap[resi][1])``, or ``None`` when the
        usage text is shown or the residue map is empty.

    Raises:
        KeyError: if ``resi`` is missing from ``p.resDict`` or from the
            residue map.
    """
    # Four operands are read (argv[2]..argv[5]); the previous "< 5" guard
    # let argv[5] raise IndexError.  The usage label used to say
    # 'resi2target' (copied from the sibling command).
    if len(sys.argv) < 6:
        print('resi2msai: given a residue number output the corresponding position in target msa')
        print('python utils_msa.py resi2msai PF00014_full.txt BPT1_BOVIN 5pti_pf.pdb A6')
        return

    msafile = sys.argv[2]
    target = sys.argv[3]
    pdbfile = sys.argv[4]
    tvar = sys.argv[5]

    print('msafile: %s\ntarget header: %s\npdbfile: %s\ntarget variable: %s' % (msafile, target, pdbfile, tvar))
    m = msa(msafile)
    p = protein(pdbfile)
    print(p.resDict[tvar])
    rtmap = m.getResiTargetMap(p, target)
    if len(rtmap) < 1:
        return
    mapped = rtmap[tvar]
    print('map %s: %s' % (tvar, repr(mapped)))
    return (tvar, mapped[0], mapped[1])
示例#17
0
def pdbcut():
    """Write the atoms of a residue range (optionally one chain) to a ``.rpdb`` file.

    Command line::

        python utils_protein.py pdbcut <pdbfile> <chain|all> <begin>-<end>

    Atoms whose ``resSeq`` lies in ``[begin, end]`` are kept.  Unless the
    chain argument is exactly 'all', the atom's ``chainID`` must also match
    it case-insensitively.  Output goes to
    ``<pdbfile[0:4]>_<chain>_<begin>-<end>.rpdb``.
    """
    if len(sys.argv) < 5:
        for usage in (
            'pdbcut(): write pdb by residue segment',
            'pdbcut(): python utils_protein.py pdbcut 1t3r.pdb A 5-15',
            'pdbcut(): python utils_protein.py pdbcut 1t3r.pdb all 5-15',
        ):
            print(usage)
        return

    pdbfile = sys.argv[2]
    chain = sys.argv[3]
    bounds = sys.argv[4].split('-')
    rBegin = int(bounds[0])
    rEnd = int(bounds[1])

    pdbname = pdbfile[0:4]
    outfile = '%s_%s_%d-%d.rpdb' % (pdbname, chain, rBegin, rEnd)

    print('pdbcut():pdbfile: %s' % pdbfile)
    print('pdbcut():pdb: %s' % pdbname)
    print('pdbcut():chain: %s' % chain)
    print('pdbcut():residue range: %d - %d' % (rBegin, rEnd))

    structure = protein(pdbfile)
    any_chain = chain == 'all'
    wanted_chain = chain.lower()
    selected = [
        atom for atom in structure.atoms
        if rBegin <= atom.resSeq <= rEnd
        and (any_chain or atom.chainID.lower() == wanted_chain)
    ]

    # The output file is created before the summary lines are printed.
    fout = open(outfile, 'w')
    print('pdbcut():output: %s' % outfile)
    print('pdbcut():%d atoms written.' % len(selected))
    for atom in selected:
        fout.write(atom.writeAtom())
    fout.close()
示例#18
0
def resn2bfactor():
    """Replace every atom's temperature factor with a per-residue-type score.

    Command line: the pdb file is ``sys.argv[2]``.  Each atom's ``resName``
    is mapped through ``AAmap.getAAmap`` and then through a fixed score
    table; the score is stored in ``tempFactor`` and the atom is written to
    ``<pdbfile minus its last 4 characters>_rb.pdb``.
    """
    if len(sys.argv) < 3:
        print('resn2bfactor(): replace b factor values with residue type.')
        print('resn2bfactor(): used for pymol spectrum b')
        return

    # Score per mapped residue code; 'B' shares the score of 'D'.
    scoreValue = {
        'X': 0, '-': 0, '.': 0,
        'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5,
        'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10,
        'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15,
        'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20,
        'B': 3,
    }
    aamap = AAmap()

    pdbfile = sys.argv[2]
    structure = protein(pdbfile)
    outfile = pdbfile[:-4] + '_rb.pdb'
    fout = open(outfile, 'w')
    for atom in structure.atoms:
        newBFactor = scoreValue[aamap.getAAmap(atom.resName)]
        print('new b-factor: [%s : %s] -> %d' % (atom.resName, aamap.getAAmap(atom.resName), newBFactor))
        atom.tempFactor = newBFactor
        fout.write(atom.writeAtom())
    fout.close()
    print('Output file: %s' % outfile)
示例#19
0
def main():
    """Extract proximity contact groups from a complete-linkage dendrogram.

    Command line::

        python proc_dendrogram.py preffix cutoff

    ``preffix`` is the structure file handed to ``protein`` and also the
    stem of every output file:

    * ``<preffix>.resimap``  - atom index, chain+residue number, mapped residue name
    * ``<preffix>.pdist``    - Euclidean distance for every atom pair ``i-j`` (i < j)
    * ``<preffix>.hcluster`` - one line per row of the linkage matrix
    * ``<preffix>.hcg``      - ``preffix,<leaves>`` for every cluster still
      flagged True after the merge pass

    ``cutoff`` is the distance threshold passed to the proximity checks and
    compared against each cluster's merge distance.
    """
    if len(sys.argv) < 3:
        print('python proc_dendrogram.py preffix cutoff')
        # The original had a bare ``exit`` here: the name was evaluated but
        # never called, so execution fell through to an IndexError below.
        return

    preffix = sys.argv[1]
    cutoff = float(sys.argv[2])
    # load tip pdb file
    pr = protein(preffix)
    aamap = AAmap()
    n = len(pr.atoms)

    resimap = {}
    print('writing %s.resimap ...' % (preffix))
    px = []
    with open(preffix + '.resimap', 'w') as fr:
        for count, a in enumerate(pr.atoms):
            px.append((a.x, a.y, a.z))
            resimap[count] = ('%s%d' % (a.chainID, a.resSeq), aamap.getAAmap(a.resName))
            fr.write('%d %s%d %s\n' % (count, a.chainID, a.resSeq, aamap.getAAmap(a.resName)))

    x = np.array(px)

    # calculate pairwise distances, keyed 'i-j' with i < j
    pdist = {}
    print('writing %s.pdist ...' % (preffix))
    with open(preffix + '.pdist', 'w') as fo:
        for i in xrange(0, len(x)):
            for j in xrange(i + 1, len(x)):
                dist = np.linalg.norm(x[i] - x[j])
                pdist['%d-%d' % (i, j)] = dist
                fo.write('%d-%d : %f\n' % (i, j, dist))

    # for hc extraction
    hcdict = {}     # cluster id -> hc object
    hclist = []     # internal clusters in linkage-matrix order
    existdict = {}  # cluster id -> True while it is kept as a contact group

    linkage_matrix = linkage(x, "complete")
    print('writing %s.hcluster ...' % (preffix))
    m = linkage_matrix
    with open(preffix + '.hcluster', 'w') as fo1:
        for i in xrange(0, len(m)):
            # Row i of the linkage matrix describes cluster id n+i.
            hcline = '%d %d %d %f %d' % (n + i, m[i, 0], m[i, 1], m[i, 2], m[i, 3])
            fo1.write(hcline + '\n')
            h = hc(hcline, n)
            hcdict[h.clusterID] = h
            hclist.append(h)

    # resolve leaves for each cluster
    print('resolving leaves ...')
    for h in hclist:
        h.getChildren(hcdict)

    print('iterating clusters for largest proximity contact ...')
    # Register every single atom as its own leaf cluster.
    for i in xrange(0, n):
        leafstr = '%d %d %d 0.0 1' % (i, i, i)
        h = hc(leafstr, n)
        h.leaves = [i]
        hcdict[i] = h

    # add single leaf in
    for i in xrange(0, n):
        existdict[i] = True

    for h in hclist:
        if h.dist <= cutoff:
            if h.c1 in existdict and h.c2 in existdict:  # both been checked before
                if existdict[h.c1] == True and existdict[h.c2] == True:
                    ret = checkProximity2(hcdict[h.c1], hcdict[h.c2], pdist, cutoff)
                    existdict[h.clusterID] = ret
                    if ret == True:  # combine both and drop the sub clusters
                        existdict[h.c1] = False
                        existdict[h.c2] = False
                elif existdict[h.c1] == False or existdict[h.c2] == False:
                    existdict[h.clusterID] = False

            elif h.c1 in existdict and h.c2 not in existdict:
                if existdict[h.c1] == False:  # c1 is not a contact; get h
                    existdict[h.clusterID] = False
                    existdict[h.c2] = checkProximity(hcdict[h.c2], pdist, cutoff)  # get c2
                elif existdict[h.c1] == True:  # c1 is a contact; get c2 then h = c1 and c2
                    ret = checkProximity(hcdict[h.c2], pdist, cutoff)  # get c2
                    existdict[h.c2] = ret
                    if ret == False:
                        existdict[h.clusterID] = False
                    elif ret == True:  # h.c2 is a contact
                        ret1 = checkProximity2(hcdict[h.c1], hcdict[h.c2], pdist, cutoff)
                        existdict[h.clusterID] = ret1
                        if ret1 == True:
                            existdict[h.c1] = False
                            existdict[h.c2] = False

            elif h.c1 not in existdict and h.c2 in existdict:
                if existdict[h.c2] == False:  # c2 is not a contact; get h
                    existdict[h.clusterID] = False
                    existdict[h.c1] = checkProximity(hcdict[h.c1], pdist, cutoff)  # get c1
                elif existdict[h.c2] == True:  # c2 is a contact; get c1 then h = c1 and c2
                    ret = checkProximity(hcdict[h.c1], pdist, cutoff)  # get c1
                    existdict[h.c1] = ret
                    if ret == False:
                        existdict[h.clusterID] = False
                    elif ret == True:  # h.c1 is a contact
                        ret1 = checkProximity2(hcdict[h.c1], hcdict[h.c2], pdist, cutoff)
                        existdict[h.clusterID] = ret1
                        if ret1 == True:
                            existdict[h.c1] = False
                            existdict[h.c2] = False

            elif h.c1 not in existdict and h.c2 not in existdict:
                r1 = checkProximity(hcdict[h.c1], pdist, cutoff)
                existdict[h.c1] = r1
                r2 = checkProximity(hcdict[h.c2], pdist, cutoff)
                existdict[h.c2] = r2
                if r1 == False or r2 == False:
                    existdict[h.clusterID] = False
                elif r1 == True and r2 == True:
                    # NOTE(review): unlike the other branches this one never
                    # records existdict[h.clusterID]; left unchanged - confirm
                    # whether that is intended.
                    ret = checkProximity2(hcdict[h.c1], hcdict[h.c2], pdist, cutoff)
                    if ret == True:
                        existdict[h.c1] = False
                        existdict[h.c2] = False

        elif h.dist > cutoff:
            existdict[h.clusterID] = False
            if h.c1 not in existdict:
                existdict[h.c1] = checkProximity(hcdict[h.c1], pdist, cutoff)
            if h.c2 not in existdict:
                existdict[h.c2] = checkProximity(hcdict[h.c2], pdist, cutoff)

    # write out every cluster that is still flagged as a contact group
    print('writing result into %s.hcg' % preffix)
    count = 0
    # 'with' closes the result file; it used to be left open.
    with open(preffix + '.hcg', 'w') as fout:
        for hid in existdict:
            if existdict[hid] == True:
                fout.write('%s,%s\n' % (preffix, hcdict[hid].writeLeaves(resimap)))
                count += len(hcdict[hid].leaves)
    print('%d leaves in total\n' % count)
示例#20
0
def ncg2blossum():
    """Build a BLOSUM-style substitution matrix from contact-group MSA columns.

    Command line::

        python utils_msa.py ncg2blossum <pdbfile> <ncgfile> <msafile> <sdiifile> <target> <order>

    Steps:

    1. Collect the MSA columns of every contact group whose first ``order``
       members (sorted, '-'-joined) form a key present in the sdii file.
    2. Accumulate residue-pair counts over those columns (``calcColSM``).
    3. Normalise the counts to pair frequencies ``q``, derive the marginal
       ``p_i = q_ii + sum_{j != i} q_ij / 2``, the expected frequencies
       ``e`` and the scores ``s_ij = round(2 * log2(q_ij / e_ij))``.
    4. Save the scores with ``saveBlosum`` to ``<msafile[0:7]>.sm``.
    """
    # Six operands are read (argv[2]..argv[7]); the previous "< 7" guard let
    # argv[7] raise IndexError.
    if len(sys.argv) < 8:
        print('ncg2blossum: construct new substitution matrix from contact group')
        print('example:python utils_msa.py ncg2blossum 5pti_pf.pdb 5pti_pf.tip.ncg PF00014_full.txt.rseq PF00014_full.txt.sdii BPT1_BOVIN order')
        print('output: a substitution matrix file (same format as BLOSSUM62)')
        return

    pdbfile = sys.argv[2]   # pdb name
    ncgfile = sys.argv[3]   # contact group file
    msafile = sys.argv[4]   # msa (full or reduced)
    sdiifile = sys.argv[5]  # sdii
    target = sys.argv[6]    # target name
    order = int(sys.argv[7])
    outfile = msafile[0:7] + ".sm"  # new substitution matrix

    # msa in character-matrix form
    m = msa(msafile)
    msaMatrix = np.array([list(s[1]) for s in m.msaArray])
    print('msa matrix: ' + repr(msaMatrix.shape))

    # resi -> msai map
    p = protein(pdbfile)
    rtmap = m.getResiTargetMap(p, target)

    sdiidict = loadsdii(sdiifile)            # key: 39-140-210, value: 0.0788...
    msaGroupArray = ncg2msa(ncgfile, rtmap)  # [[210, 215], [106, 211], ...]

    # non-overlapping column indices of the significant contact groups
    colset = set()
    for g in msaGroupArray:
        rg = sorted(g[0:order])  # sorted copy forms the lookup key
        sdiikey = '-'.join([str(r) for r in rg])
        if sdiikey not in sdiidict:
            continue
        print((sdiikey, sdiidict[sdiikey]))
        colset.update(rg)

    # init substitution matrix: one entry per unordered pair, key 'AB' with
    # A <= B in sorted alphabet order
    EBlist = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'B', 'Z', 'X', '*']
    AAlist = sorted(['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'])
    sm = {}
    for i in xrange(0, len(AAlist)):
        for j in xrange(i, len(AAlist)):
            sm['%s%s' % (AAlist[i], AAlist[j])] = 0
    print(AAlist)
    print('Alphabet: %d' % len(AAlist))
    print('AA: %d' % len(sm))

    # accumulate pair counts over all contact-group columns
    print('')
    w = 0  # number of columns
    for col in colset:
        w += 1
        calcColSM(sm, msaMatrix, col)

    n = msaMatrix.shape[0]
    T = w * n * (n - 1) // 2  # normalization term; n*(n-1) is always even
    print('w: %d' % w)  # number of columns (contact group)
    print('n: %d' % n)  # number of sequence
    print('T: %d' % T)
    if T == 0:
        # No selected column or fewer than two sequences: nothing to
        # normalise, and dividing below would raise ZeroDivisionError.
        print('ncg2blossum: no residue pairs to count, nothing written')
        return

    # convert cij to qij: normalise the pair counts so they sum to 1
    for c in sm:
        sm[c] = 1.0 * sm[c] / T

    # expected probability of residue i in an (i, j) pair:
    #   pi = qii + sum( qij/2 )_{i!=j}
    # Pairs are stored once under the sorted key, so partners both before
    # and after i must be visited (the old loop only covered j > i).
    pi = {}
    for i in xrange(0, len(AAlist)):
        A = AAlist[i]
        sum_qij = 0
        for j in xrange(0, len(AAlist)):
            if j == i:
                continue
            B = AAlist[j]
            key = A + B if i < j else B + A
            sum_qij += sm[key] / 2
        pi[A] = sm[A + A] + sum_qij

    print(repr(pi))
    print('')

    # expected frequency for each pair
    eij = {}
    for i in xrange(0, len(AAlist)):
        A = AAlist[i]
        for j in xrange(i + 1, len(AAlist)):
            B = AAlist[j]
            eij[A + B] = 2 * pi[A] * pi[B]
        eij[A + A] = pi[A] * pi[A]

    print(len(eij))
    print(repr(eij))
    print('')

    # log odds ratio sij = round(2*log2(qij/eij)); unseen pairs score 0
    sij = {}
    for i in xrange(0, len(AAlist)):
        A = AAlist[i]
        for j in xrange(i, len(AAlist)):
            B = AAlist[j]
            if eij[A + B] == 0.0 or sm[A + B] == 0.0:
                sij[A + B] = 0
            else:
                sij[A + B] = int(round(2 * math.log((sm[A + B] / eij[A + B]), 2)))

    print(repr(sij))
    print(len(sij))
    print('')

    saveBlosum(EBlist, sij, outfile)
示例#21
0
def main():
    """Load the hard-coded ``1t3r.pdb`` structure, print it and write its tip file.

    The tip file is named ``<structure.pdb>.tip`` and is produced by
    ``writeChainATips`` with the 'AAtips.def' definition file.
    """
    structure = protein('1t3r.pdb')
    structure.printPDB()
    tip_file = structure.pdb + '.tip'
    structure.writeChainATips('AAtips.def', tip_file)