# Example #1
# 0
def CalcPotentialByEmpSI(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Derive distance potentials against a reference distribution loaded from a pickle file.

    For every response in predDistMatrix whose label name is in
    config.allAtomPairNames and whose label type is discrete, the potential is
    -log(predicted prob / reference prob), shifted so that the bin containing
    largestDistance is zero; that bin and every bin after it are then set to 0.

    Args:
        predDistMatrix: dict mapping response -> predicted probability array,
            indexed here as [i, j, bin].
        userRef: path of the pickled reference data; refData[response][0] is
            used as the reference probability.
        largestDistance: distance whose bin anchors the potential at zero.
        sequence, minPotential, maxPotential: accepted but not used here.

    Returns:
        dict mapping response -> potential array.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted reference files.
    with open(userRef, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProb = refData[response][0]
        potential = -np.log(predProb / refProb)

        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(largestDistance, cutoff)

        # re-add the bin axis so the anchor value is subtracted from every bin of the same (i, j) pair
        lastCol = potential[:, :, lastDistBin][:, :, np.newaxis]
        potential = potential - lastCol
        potential[:, :, lastDistBin:] = 0

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials
def CalcDistOriPotential(predData, labelNames=['CaCa', 'CbCb', 'NO'] + ['Ca1Cb1Cb2Ca2','N1Ca1Cb1Cb2','Ca1Cb1Cb2'], distPotType='DFIRE', param4Potential=1.61, largestDistance=18, useWeight4Dist=True, useRef4Ori=True, useWeight4Ori=True, minPotential=-30, maxPotential=30):
    """Compute distance and orientation potentials for the requested label names.

    Args:
        predData: tuple (predProbMatrix, labelWeight, labelDistribution); each
            element is indexed by response.
        labelNames: label names to keep; responses with other label names are ignored.
        distPotType: 'DFIRE' or 'DOPE' (case-insensitive).
        param4Potential: passed as alpha to CalcPotentialByDFIRE or as rgScale
            to CalcPotentialByDOPE.
        largestDistance: forwarded to the distance-potential routine.
        useWeight4Dist: forwarded as useWeight to the distance-potential routine.
        useRef4Ori, useWeight4Ori: forwarded to CalcOrientationPotential.
        minPotential, maxPotential: forwarded to the potential routines.

    Returns:
        (pairPotential, cutoffs, validProb, distPotential, oriPotential), where
        pairPotential/validProb merge the distance and orientation results and
        cutoffs maps every response in pairPotential to config.GetCutoffs(response).
    """
    # normalise once so the branch below agrees with the case-insensitive check
    distPotType = distPotType.upper()
    assert distPotType in ['DFIRE', 'DOPE']

    predProbMatrix, labelWeight, labelDistribution = predData

    validDistribution = dict()
    validLabelWeight = dict()
    validLabelDistribution = dict()

    existingLabelNames = set()
    for response, pred in predProbMatrix.items():
        labelName, _, _ = config.ParseResponse(response)
        if labelName not in labelNames:
            continue
        existingLabelNames.add(labelName)
        validDistribution[response] = pred
        validLabelWeight[response] = labelWeight[response]
        validLabelDistribution[response] = labelDistribution[response]

    missingLabelNames = list(set(labelNames) - existingLabelNames)
    if len(missingLabelNames) > 0:
        print('WARNING: the predicted probability file does not have information for the following label names:  %s' % (missingLabelNames,))

    pairPotential = dict()
    validProb = dict()

    if distPotType == 'DOPE':
        # the original passed an undefined name `rc` here; the cutoff distance is largestDistance
        distPotential, distValidProb = CalcPotentialByDOPE(validDistribution, largestDistance=largestDistance, rgScale=param4Potential, useWeight=useWeight4Dist, minPotential=minPotential, maxPotential=maxPotential)
    else:
        distPotential, distValidProb = CalcPotentialByDFIRE(validDistribution, alpha=param4Potential, largestDistance=largestDistance, useWeight=useWeight4Dist, minPotential=minPotential, maxPotential=maxPotential)
    pairPotential.update(distPotential)
    validProb.update(distValidProb)

    oriPotential, oriValidProb = CalcOrientationPotential(validDistribution, useRef=useRef4Ori, useWeight=useWeight4Ori, labelWeight=validLabelWeight, labelDistribution=validLabelDistribution, minPotential=minPotential, maxPotential=maxPotential)
    pairPotential.update(oriPotential)
    validProb.update(oriValidProb)

    cutoffs = dict()
    for response in pairPotential.keys():
        cutoffs[response] = config.GetCutoffs(response)

    return pairPotential, cutoffs, validProb, distPotential, oriPotential
# Example #3
# 0
def CalcPotentialByEmpSD(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Derive distance potentials against a size-dependent reference loaded from a pickle file.

    refData[response][1] is iterated as (size, freq, ref) triples. The
    reference probability is the average of the `ref` entries whose size is
    within a factor of 1.3 of the protein length (length < 400) or is at least
    350 (length >= 400). The potential is -log(pred / ref), shifted so the bin
    containing largestDistance is zero; that bin and all later bins are set to 0.

    Args:
        predDistMatrix: dict mapping response -> predicted probability array,
            indexed here as [i, j, bin].
        userRef: path of the pickled reference data.
        largestDistance: distance whose bin anchors the potential at zero.
        sequence, minPotential, maxPotential: accepted but not used here.

    Returns:
        dict mapping response -> potential array.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted reference files.
    with open(userRef, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProbList = refData[response][1]

        length = predProb.shape[0]
        if length < 400:
            refProbs = [ref for sz, freq, ref in refProbList if length / 1.3 <= sz <= 1.3 * length]
        else:
            refProbs = [ref for sz, freq, ref in refProbList if sz >= 350]

        print('#refProbMatrix:  %s  for proteins with length=  %s' % (len(refProbs), length))

        # NOTE(review): if no reference matches the length, refProbs is empty and the average is not meaningful
        refProb = np.average(refProbs, axis=0)
        potential = -np.log(predProb / refProb)

        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(largestDistance, cutoff)

        # re-add the bin axis so the anchor value is subtracted from every bin of the same (i, j) pair
        lastCol = potential[:, :, lastDistBin][:, :, np.newaxis]
        potential = potential - lastCol
        potential[:, :, lastDistBin:] = 0

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials
def CalcPotentialBySimuRW(predDistMatrix, refFile, largestDistance=20, sequence=None, useWeight=False, minPotential=-30., maxPotential=30.):
    """Derive distance potentials against a per-sequence-separation reference loaded from a pickle file.

    Only responses with label name 'CbCb' and a subType ending in '34C' are
    supported; any other discrete atom-pair response terminates the program.

    Args:
        predDistMatrix: dict mapping response -> predicted probability array,
            indexed here as [i, j, bin].
        refFile: path of the pickled reference data; row k of refData[response]
            is used for sequence separation k+1.
        largestDistance: upper bound for the distance at which the potential
            is anchored to zero.
        sequence, minPotential: accepted but not used here.
        useWeight: if true and subType ends with 'Plus', scale the potential by
            1 - (probability of the last label).
        maxPotential: value assigned to bins beyond the maximum feasible
            distance for a residue pair.

    Returns:
        dict mapping response -> potential array.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted reference files.
    with open(refFile, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response in predDistMatrix.keys():
        # subType is needed below, so it must not be discarded when unpacking
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        predProb = predDistMatrix[response]

        ## the first row of refProb corresponds to offset=1
        refProb = refData[response]
        if labelName != 'CbCb':
            print('distance label name not supported yet:  %s' % (labelName,))
            exit(1)

        if not subType.endswith('34C'):
            print('distance label type not supported yet:  %s' % (subType,))
            exit(1)

        cutoff = config.GetCutoffs(response)

        length = predProb.shape[0]
        numLabels = predProb.shape[2]
        assert numLabels == refProb.shape[1]

        ## maxAllowedDist[offset] is the maximum physically feasible distance between two Cb atoms when their sequence separation is equal to offset
        maxAllowedDist = [(offset * 3.8 + 3.06) for offset in range(length)]
        eps = 0.00001
        shortRangeOverrides = {0: 0, 2: 10.5 - eps, 3: 13.0 - eps, 4: 15.5 - eps, 5: 17.5 - eps, 6: 19.5 - eps}
        # guard the index so that proteins shorter than 7 residues do not raise IndexError
        for offset, dist in shortRangeOverrides.items():
            if offset < length:
                maxAllowedDist[offset] = dist

        potential = np.zeros_like(predProb)
        refProbLen = refProb.shape[0]

        # rc_index does not depend on (i, j); compute it only once, the first time it is needed
        rc_index = None

        for i in range(0, length):
            for j in range(i + 2, length):
                offset = j - i
                ## find the distance bin into which the maxAllowedDist falls
                lastDistBin = DistanceUtils.LabelsOfOneDistance(maxAllowedDist[offset], cutoff)
                if lastDistBin < (numLabels - 1):
                    ## bins up to lastDistBin get -log(pred/ref); bins beyond it are infeasible and get maxPotential
                    pred = predProb[i, j, :lastDistBin + 1]
                    ref = refProb[offset - 1][:lastDistBin + 1]

                    potential[i, j, :lastDistBin + 1] = -np.log(pred / ref)
                    potential[i, j, lastDistBin + 1:] = maxPotential
                else:
                    if rc_index is None:
                        ## determine the last distance bin
                        rc = min(cutoff[-1], largestDistance) - 0.001
                        if rc < 10.0:
                            print('ERROR: the largest distance cutoff for SimuRW is too small:  %s' % (rc,))
                            exit(1)
                        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

                    # separations beyond the reference table reuse its last row
                    potential[i, j] = -np.log(predProb[i, j] / refProb[min(offset, refProbLen) - 1])
                    potential[i, j] -= potential[i, j, rc_index]
                    potential[i, j, rc_index + 1:] = 0

                ## only valid for symmetric atom pairs
                potential[j, i] = potential[i, j]

        # NOTE(review): a subType cannot end with both '34C' and 'Plus', so this branch looks unreachable given the check above
        if useWeight and subType.endswith('Plus'):
            potential *= (1 - predProb[:, :, -1])[:, :, np.newaxis]

        CheckPotentialValues(potential)

        potentials[response] = potential

    return potentials
def CalcPotentialByDOPE(predDistMatrix, largestDistance=20, rgScale=1., useWeight=False, minPotential=-30., maxPotential=30.):
    """Convert predicted distance probabilities into potentials using a DOPE-style reference.

    For each discrete atom-pair response, potential = reference potential
    minus observed potential for the bins before rc_index; all other bins stay 0.

    Args:
        predDistMatrix: dict mapping response -> predicted probability array,
            indexed here as [i, j, bin].
        largestDistance: upper bound for the cutoff distance rc.
        rgScale: multiplier applied to the estimated radius of gyration.
        useWeight: if true and subType ends with 'Plus', multiply the potential
            by 1 - (probability of the last label).
        minPotential, maxPotential: accepted but not used here.

    Returns:
        (potentials, validProbs): two dicts keyed by response, both stored as float32.
    """
    potentials = dict()
    validProbs = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        cutoff = config.GetCutoffs(response)

        ## determine the last distance bin
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('ERROR: the largest distance cutoff for DOPE is too small:  %s' % (rc,))
            exit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        # explicit arrays: the element-wise arithmetic below must not depend on the element type of cutoff
        binwidths = np.array([d2 - d1 for d1, d2 in zip(cutoff[:-1], cutoff[1:])])
        bincenters = np.array([(d2 + d1) / 2. for d1, d2 in zip(cutoff[:-1], cutoff[1:])])

        ## a is the radius of reference sphere and rg is the estimated radius of gyration
        length = predDistMatrix[response].shape[0]
        rg = 0.395 * length ** (3. / 5) + 7.257
        a = np.sqrt(5. / 3) * rg * rgScale

        ## calculate n(r,a) described in the DOPE paper. Ignore the constant factor and the denominator since they are same for all distance bins
        nra = np.square(bincenters * (bincenters - 2 * a)) * (bincenters + 4 * a)

        ## for bins at least 1 wide, replace the bin-centre value by the average of n(r,a) sampled every 0.5 inside the bin
        for i in range(len(binwidths)):
            if binwidths[i] >= 1:
                points = np.arange(cutoff[i] + 0.5 / 2, cutoff[i + 1], 0.5)
                nra[i] = np.average(np.square(points * (points - 2 * a)) * (points + 4 * a))

        ## calculate reference potential defined as log (nra(r)/nra(rc)) + log(\delta r/ \delta rc)
        ## \delta(r) is equal to binwidths
        refPot = np.log(nra / nra[rc_index] * binwidths / binwidths[rc_index])

        ## calculate the observed potential defined as log( p(r) /p(rc) ) where p(r) is the predicted distance probability
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index: rc_index + 1]
        obsPot = np.log(predProb / predProbRC)

        ## calculate the final potential, which is the difference between reference and observed potential
        potential = np.zeros_like(predDistMatrix[response])
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        if subType.endswith('Plus'):
            validProb = 1 - predProb[:, :, -1]
        else:
            validProb = np.ones((predProb.shape[0], predProb.shape[1]), dtype=np.float32)

        ## if useWeight and the prob of disorder exists, adjust potential by prob of not being in disorder status
        if useWeight and subType.endswith('Plus'):
            potential *= validProb[:, :, np.newaxis]

        ## remove the potential for the last distance bin, which corresponds to disorder status
        if subType.endswith('Plus'):
            potential = potential[:, :, :-1]

        CheckPotentialValues(m=potential)

        potentials[response] = potential.astype(np.float32)
        validProbs[response] = validProb.astype(np.float32)

    return potentials, validProbs
def CalcPotentialByDFIRE(predDistMatrix, alpha=1.61, largestDistance=18, useWeight=False, minPotential=-30, maxPotential=30):
    """Convert predicted distance probabilities into potentials using a DFIRE-style reference.

    For each discrete atom-pair response, potential = reference potential
    minus observed potential for the bins before rc_index; all other bins stay 0.

    Args:
        predDistMatrix: dict mapping response -> predicted probability array,
            indexed here as [i, j, bin].
        alpha: exponent of the distance-dependent reference term.
        largestDistance: upper bound for the cutoff distance rc.
        useWeight: if true and subType ends with 'Plus', multiply the potential
            by 1 - (probability of the last label).
        minPotential, maxPotential: accepted but not used here.

    Returns:
        (potentials, validProbs): two dicts keyed by response, both stored as float32.
    """
    potentials = dict()

    ## validProbs saves the prob of one atom/residue pair likely have valid coordinates
    validProbs = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            print('WARNING: the distance label is not discrete:  %s' % (response,))
            continue

        cutoff = config.GetCutoffs(response)

        ## determine the last distance bin
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('ERROR: the largest distance cutoff for DFIRE is too small:  %s' % (rc,))
            exit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        # explicit arrays: the element-wise arithmetic below must not depend on the element type of cutoff
        binwidths = np.array([d2 - d1 for d1, d2 in zip(cutoff[:-1], cutoff[1:])])
        bincenters = np.array([(d2 + d1) / 2. for d1, d2 in zip(cutoff[:-1], cutoff[1:])])

        ## calculate reference potential defined as alpha*log (r/rc) + log(\delta r/ \delta rc)
        ## \delta(r) is binwidths and r is the bincenters
        refPot = alpha * np.log(bincenters / bincenters[rc_index]) + np.log(binwidths / binwidths[rc_index])

        ## for bins at least 1 wide, replace the bin-centre value by the average of (r/rc)^alpha sampled every 0.5 inside the bin
        for i in range(len(binwidths)):
            if binwidths[i] >= 1:
                points = np.arange(cutoff[i] + 0.5 / 2, cutoff[i + 1], 0.5)
                avg = np.average(np.power(points / bincenters[rc_index], alpha))
                refPot[i] = np.log(avg) + np.log(binwidths[i] / binwidths[rc_index])

        ## calculate the observed potential defined as log( p(r) /p(rc) ) where p(r) is the predicted distance probability
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index: rc_index + 1]
        obsPot = np.log(predProb / predProbRC)

        ## calculate the final potential, which is the difference between reference potential and observed potential
        potential = np.zeros_like(predDistMatrix[response])
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        if subType.endswith('Plus'):
            validProb = 1 - predProb[:, :, -1]
        else:
            validProb = np.ones((predProb.shape[0], predProb.shape[1]), dtype=np.float32)

        ## if useWeight=True and the prob of being disorder exists, adjust potential by the prob of not being in disorder status
        if useWeight and subType.endswith('Plus'):
            potential *= validProb[:, :, np.newaxis]

        ## remove the potential for the last distance bin, which corresponds to disorder status
        if subType.endswith('Plus'):
            potential = potential[:, :, :-1]

        CheckPotentialValues(m=potential)

        potentials[response] = potential.astype(np.float32)
        validProbs[response] = validProb.astype(np.float32)

    return potentials, validProbs
# Example #7
# 0
def CalcPotentialByDFIRE(predDistMatrix, alpha=1.61, largestDistance=15, minPotential=-20, maxPotential=20):
    """Convert predicted distance probabilities into potentials using a DFIRE-style reference.

    For each discrete atom-pair response, potential = reference potential
    minus observed potential for the bins before rc_index; all other bins stay 0.

    Args:
        predDistMatrix: dict mapping response -> predicted probability array,
            indexed here as [i, j, bin].
        alpha: exponent of the distance-dependent reference term.
        largestDistance: upper bound for the cutoff distance rc.
        minPotential, maxPotential: accepted but not used here.

    Returns:
        dict mapping response -> potential array.
    """
    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            print('WARNING: unsupported response for DFIRE potential:  %s' % (response,))
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        cutoff = config.GetCutoffs(response)

        ## determine the last distance bin
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('ERROR: the largest distance cutoff for DFIRE is too small:  %s' % (rc,))
            exit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        # explicit arrays: the element-wise arithmetic below must not depend on the element type of cutoff
        binwidths = np.array([d2 - d1 for d1, d2 in zip(cutoff[:-1], cutoff[1:])])
        bincenters = np.array([(d2 + d1) / 2. for d1, d2 in zip(cutoff[:-1], cutoff[1:])])

        ## calculate reference potential defined as alpha*log (r/rc) + log(\delta r/ \delta rc)
        ## \delta(r) is binwidths and r is the bincenters
        refPot = alpha * np.log(bincenters / bincenters[rc_index]) + np.log(binwidths / binwidths[rc_index])

        ## for bins at least 1 wide, replace the bin-centre value by the average of (r/rc)^alpha sampled every 0.5 inside the bin
        ## (an assignment is a statement, so this has to be a loop rather than a list comprehension)
        for i in range(len(binwidths)):
            if binwidths[i] >= 1:
                points = np.arange(cutoff[i] + 0.5 / 2, cutoff[i + 1], 0.5)
                avg = np.average(np.power(points / bincenters[rc_index], alpha))
                refPot[i] = np.log(avg) + np.log(binwidths[i] / binwidths[rc_index])

        ## calculate the observed potential defined as log( p(r) /p(rc) ) where p(r) is the predicted distance probability
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index: rc_index + 1]
        obsPot = np.log(predProb / predProbRC)

        ## calculate the final potential, which is the difference between reference potential and observed potential
        potential = np.zeros_like(predDistMatrix[response])
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        CheckPotentialValues(m=potential)

        potentials[response] = potential

    return potentials

def CalcPotentialByDOPE(predDistMatrix, largestDistance=20, rgScale=1., minPotential=-20., maxPotential=20.):
    """Convert predicted distance probabilities into potentials using a DOPE-style reference.

    For each discrete atom-pair response, potential = reference potential
    minus observed potential for the bins before rc_index; all other bins stay 0.

    Args:
        predDistMatrix: dict mapping response -> predicted probability array,
            indexed here as [i, j, bin].
        largestDistance: upper bound for the cutoff distance rc.
        rgScale: multiplier applied to the estimated radius of gyration.
        minPotential, maxPotential: accepted but not used here.

    Returns:
        dict mapping response -> potential array.
    """
    potentials = dict()
    for response in predDistMatrix.keys():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            print('WARNING: unsupported response for DOPE potential:  %s' % (response,))
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        cutoff = config.GetCutoffs(response)

        ## determine the last distance bin
        rc = min(cutoff[-1], largestDistance) - 0.001
        if rc < 10.0:
            print('ERROR: the largest distance cutoff for DOPE is too small:  %s' % (rc,))
            exit(1)
        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

        # explicit arrays: the element-wise arithmetic below must not depend on the element type of cutoff
        binwidths = np.array([d2 - d1 for d1, d2 in zip(cutoff[:-1], cutoff[1:])])
        bincenters = np.array([(d2 + d1) / 2. for d1, d2 in zip(cutoff[:-1], cutoff[1:])])

        ## a is the radius of reference sphere and rg is the estimated radius of gyration
        length = predDistMatrix[response].shape[0]
        rg = 0.395 * length ** (3. / 5) + 7.257
        a = np.sqrt(5. / 3) * rg * rgScale

        ## calculate n(r,a) described in the DOPE paper. Ignore the constant factor and the denominator since they are same for all distance bins
        nra = np.square(bincenters * (bincenters - 2 * a)) * (bincenters + 4 * a)

        ## for bins at least 1 wide, replace the bin-centre value by the average of n(r,a) sampled every 0.5 inside the bin
        ## (an assignment is a statement, so this has to be a loop rather than a list comprehension)
        for i in range(len(binwidths)):
            if binwidths[i] >= 1:
                points = np.arange(cutoff[i] + 0.5 / 2, cutoff[i + 1], 0.5)
                nra[i] = np.average(np.square(points * (points - 2 * a)) * (points + 4 * a))

        ## calculate reference potential defined as log (nra(r)/nra(rc)) + log(\delta r/ \delta rc)
        ## \delta(r) is equal to binwidths
        refPot = np.log(nra / nra[rc_index] * binwidths / binwidths[rc_index])

        ## calculate the observed potential defined as log( p(r) /p(rc) ) where p(r) is the predicted distance probability
        predProb = predDistMatrix[response]
        predProbRC = predProb[:, :, rc_index: rc_index + 1]
        obsPot = np.log(predProb / predProbRC)

        ## calculate the final potential, which is the difference between reference and observed potential
        potential = np.zeros_like(predDistMatrix[response])
        potential[:, :, :rc_index] = refPot[:rc_index] - obsPot[:, :, :rc_index]

        CheckPotentialValues(m=potential)

        potentials[response] = potential

    return potentials
		

def CalcPotentialBySimuRW(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Derive distance potentials against a per-sequence-separation reference loaded from a pickle file.

    Only responses with label name 'CbCb' and a subType ending in '34C' are
    supported; any other discrete atom-pair response terminates the program.

    Args:
        predDistMatrix: dict mapping response -> predicted probability array,
            indexed here as [i, j, bin].
        userRef: path of the pickled reference data; row k of refData[response]
            is used for sequence separation k+1.
        largestDistance: upper bound for the distance at which the potential
            is anchored to zero.
        sequence, minPotential: accepted but not used here.
        maxPotential: value assigned to bins beyond the maximum feasible
            distance for a residue pair.

    Returns:
        dict mapping response -> potential array.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted reference files.
    with open(userRef, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response in predDistMatrix.keys():
        # subType is needed below, so it must not be discarded when unpacking
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            print('WARNING: unsupported response for SimuRW potential:  %s' % (response,))
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        predProb = predDistMatrix[response]

        ## the first row of refProb corresponds to offset=1
        refProb = refData[response]
        if labelName != 'CbCb':
            print('distance label name not supported yet:  %s' % (labelName,))
            exit(1)

        if not subType.endswith('34C'):
            print('distance label type not supported yet:  %s' % (subType,))
            exit(1)

        cutoff = config.GetCutoffs(response)

        length = predProb.shape[0]
        numLabels = predProb.shape[2]
        assert numLabels == refProb.shape[1]

        ## maxAllowedDist[offset] is the maximum physically feasible distance between two Cb atoms when their sequence separation is equal to offset
        maxAllowedDist = [(offset * 3.8 + 3.06) for offset in range(length)]
        eps = 0.00001
        shortRangeOverrides = {0: 0, 2: 10.5 - eps, 3: 13.0 - eps, 4: 15.5 - eps, 5: 17.5 - eps, 6: 19.5 - eps}
        # guard the index so that proteins shorter than 7 residues do not raise IndexError
        for offset, dist in shortRangeOverrides.items():
            if offset < length:
                maxAllowedDist[offset] = dist

        potential = np.zeros_like(predProb)
        refProbLen = refProb.shape[0]

        # rc_index does not depend on (i, j); compute it only once, the first time it is needed
        rc_index = None

        for i in range(0, length):
            for j in range(i + 2, length):
                offset = j - i
                ## find the distance bin into which the maxAllowedDist falls
                lastDistBin = DistanceUtils.LabelsOfOneDistance(maxAllowedDist[offset], cutoff)
                if lastDistBin < (numLabels - 1):
                    ## bins up to lastDistBin get -log(pred/ref); bins beyond it are infeasible and get maxPotential
                    pred = predProb[i, j, :lastDistBin + 1]
                    ref = refProb[offset - 1][:lastDistBin + 1]

                    potential[i, j, :lastDistBin + 1] = -np.log(pred / ref)
                    potential[i, j, lastDistBin + 1:] = maxPotential
                else:
                    if rc_index is None:
                        ## determine the last distance bin
                        rc = min(cutoff[-1], largestDistance) - 0.001
                        if rc < 10.0:
                            print('ERROR: the largest distance cutoff for SimuRW is too small:  %s' % (rc,))
                            exit(1)
                        rc_index = DistanceUtils.LabelsOfOneDistance(rc, cutoff)

                    # separations beyond the reference table reuse its last row
                    potential[i, j] = -np.log(predProb[i, j] / refProb[min(offset, refProbLen) - 1])
                    potential[i, j] -= potential[i, j, rc_index]
                    potential[i, j, rc_index + 1:] = 0

                ## only valid for symmetric atom pairs
                potential[j, i] = potential[i, j]

        CheckPotentialValues(potential)

        potentials[response] = potential

    return potentials

def CalcPotentialByEmpSD(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Derive distance potentials against a size-dependent reference loaded from a pickle file.

    refData[response][1] is iterated as (size, freq, ref) triples. The
    reference probability is the average of the `ref` entries whose size is
    within a factor of 1.3 of the protein length (length < 400) or is at least
    350 (length >= 400). The potential is -log(pred / ref), shifted so the bin
    containing largestDistance is zero; that bin and all later bins are set to 0.

    Args:
        predDistMatrix: dict mapping response -> predicted probability array,
            indexed here as [i, j, bin].
        userRef: path of the pickled reference data.
        largestDistance: distance whose bin anchors the potential at zero.
        sequence, minPotential, maxPotential: accepted but not used here.

    Returns:
        dict mapping response -> potential array.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted reference files.
    with open(userRef, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProbList = refData[response][1]

        length = predProb.shape[0]
        if length < 400:
            refProbs = [ref for sz, freq, ref in refProbList if length / 1.3 <= sz <= 1.3 * length]
        else:
            refProbs = [ref for sz, freq, ref in refProbList if sz >= 350]

        print('#refProbMatrix:  %s  for proteins with length=  %s' % (len(refProbs), length))

        # NOTE(review): if no reference matches the length, refProbs is empty and the average is not meaningful
        refProb = np.average(refProbs, axis=0)
        potential = -np.log(predProb / refProb)

        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(largestDistance, cutoff)

        # re-add the bin axis so the anchor value is subtracted from every bin of the same (i, j) pair
        lastCol = potential[:, :, lastDistBin][:, :, np.newaxis]
        potential = potential - lastCol
        potential[:, :, lastDistBin:] = 0

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials

def CalcPotentialByEmpSI(predDistMatrix, userRef, largestDistance=20, sequence=None, minPotential=-20., maxPotential=20.):
    """Derive distance potentials against a reference distribution loaded from a pickle file.

    For every response in predDistMatrix whose label name is in
    config.allAtomPairNames and whose label type is discrete, the potential is
    -log(predicted prob / reference prob), shifted so that the bin containing
    largestDistance is zero; that bin and every bin after it are then set to 0.

    Args:
        predDistMatrix: dict mapping response -> predicted probability array,
            indexed here as [i, j, bin].
        userRef: path of the pickled reference data; refData[response][0] is
            used as the reference probability.
        largestDistance: distance whose bin anchors the potential at zero.
        sequence, minPotential, maxPotential: accepted but not used here.

    Returns:
        dict mapping response -> potential array.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted reference files.
    with open(userRef, 'rb') as f:
        refData = cPickle.load(f)

    potentials = dict()
    for response, predProb in predDistMatrix.items():
        labelName, labelType, _ = config.ParseResponse(response)
        if labelName not in config.allAtomPairNames:
            continue
        if not config.IsDiscreteLabel(labelType):
            continue

        refProb = refData[response][0]
        potential = -np.log(predProb / refProb)

        cutoff = config.GetCutoffs(response)
        lastDistBin = DistanceUtils.LabelsOfOneDistance(largestDistance, cutoff)

        # re-add the bin axis so the anchor value is subtracted from every bin of the same (i, j) pair
        lastCol = potential[:, :, lastDistBin][:, :, np.newaxis]
        potential = potential - lastCol
        potential[:, :, lastDistBin:] = 0

        CheckPotentialValues(potential)
        potentials[response] = potential

    return potentials

	

# Reference states for which main() requires a reference file, stored upper-cased
allRefTypesWithFiles = list(map(str.upper, ('SimuRW', 'EmpSI', 'EmpSD')))

# Every accepted reference state: the two formula-based ones followed by the file-based ones
allRefTypes = ['DFIRE', 'DOPE']
allRefTypes.extend(allRefTypesWithFiles)

def main(argv):
    """Command-line driver: turn a predicted distance-probability file into a potential file.

    Options parsed from argv:
        -i/--input         input file (required, must exist)
        -a/--atomPairType  label names, parsed by config.ParseLabelNames (default ['CbCb'])
        -r/--refState      reference state as NAME[+rc[+param]]; NAME must be in allRefTypes
        -f/--refFile       reference file, required when NAME is in allRefTypesWithFiles
        -l/--minPotential, -u/--maxPotential  forwarded to the potential routines
        -s/--minSeqSep     minimum sequence separation for the text output (>= 1)
        -t/--textFormat    write a text file instead of a pickle
        -n/--nonZero       accepted, but has no effect in this function

    Output is written to the current directory: a pickle holding
    (name, sequence, potentials, distCutoffs), or a text file with one
    'AtomPair ...' line per residue pair. The process exits with status 1 on
    invalid arguments.
    """
    inputFile = None
    labelNames = ['CbCb']
    potentialFileSuffix = 'pkl'
    minPotential = -30.0
    maxPotential = 30.0
    minSeqSep = 3
    minSeqSepStr = '3'

    ## the largest dist cutoff
    rc = 18

    alpha4DFIRE = 1.61
    rgScale4DOPE = 1.

    ## reference
    reference = 'DFIRE'

    ## refFile
    refFile = None

    # NOTE(review): the long options "textFormat=" and "nonZero=" demand an argument while -t/-n do not
    try:
        opts, args = getopt.getopt(argv, "i:a:r:l:u:s:f:tn", ["input=", "atomPairType=", "refState=", "minPotential=", "maxPotential=", "minSeqSep=", "refFile=", "textFormat=", "nonZero="])
    except getopt.GetoptError:
        Usage()
        exit(1)
    print('%s %s' % (opts, args))

    if len(opts) < 1:
        Usage()
        exit(1)

    for opt, arg in opts:
        if opt in ("-i", "--input"):
            inputFile = arg

        elif opt in ("-a", "--atomPairType"):
            labelNames = config.ParseLabelNames(arg)

        elif opt in ("-r", "--refState"):
            fields = arg.split('+')
            reference = fields[0].upper()
            if reference not in allRefTypes:
                print('allowed reference types:  %s' % (allRefTypes,))
                exit(1)

            if len(fields) > 1:
                if reference == 'DFIRE':
                    rc = np.float32(fields[1])
                    if len(fields) > 2:
                        alpha4DFIRE = np.float32(fields[2])

                elif reference == 'DOPE':
                    rc = np.float32(fields[1])
                    if len(fields) > 2:
                        rgScale4DOPE = np.float32(fields[2])
                elif reference == 'SimuRW'.upper():
                    rc = np.float32(fields[1])
                else:
                    print('WARNING: unsupported reference format:  %s' % (arg,))

        elif opt in ("-f", "--refFile"):
            refFile = arg
            if not os.path.isfile(refFile):
                print('the provided file for reference state is not valid:  %s' % (refFile,))
                exit(1)

        elif opt in ("-l", "--minPotential"):
            minPotential = np.float32(arg)
        elif opt in ("-u", "--maxPotential"):
            maxPotential = np.float32(arg)

        elif opt in ("-s", "--minSeqSep"):
            minSeqSep = np.int32(arg)
            minSeqSepStr = arg
            if minSeqSep < 1:
                print('ERROR: minSeqSep shall be at least 1')
                exit(1)

        elif opt in ("-t", "--textFormat"):
            potentialFileSuffix = '.txt'

        elif opt in ("-n", "--nonZero"):
            # accepted for compatibility; the flag it used to set was never read
            pass

        else:
            Usage()
            exit(1)

    if inputFile is None:
        print('Please provide an input file')
        exit(1)
    if not os.path.isfile(inputFile):
        print('The input file does not exist:  %s' % (inputFile,))
        exit(1)

    if reference in allRefTypesWithFiles and refFile is None:
        print('The file for user-sepcified reference state is empty')
        exit(1)

    targetName = os.path.basename(inputFile).split('.')[0]

    content = DistanceUtils.LoadRawDistProbFile(inputFile)
    assert len(content) >= 6

    name, sequence, predictedDistProb, predictedContactProb, labelWeight, labelDistribution = content[:6]
    assert labelWeight is not None, "labelWeight shall not be empty"

    filenames = [targetName, 'distPotential']
    if reference == 'DFIRE':
        potential = CalcPotentialByDFIRE(predictedDistProb, alpha=alpha4DFIRE, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), str(alpha4DFIRE), potentialFileSuffix])
    elif reference == 'DOPE':
        potential = CalcPotentialByDOPE(predictedDistProb, largestDistance=rc, rgScale=rgScale4DOPE, minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), str(rgScale4DOPE), potentialFileSuffix])
    elif reference == 'SimuRW'.upper():
        potential = CalcPotentialBySimuRW(predictedDistProb, refFile, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), potentialFileSuffix])
    else:
        print('ERROR: unimplemented reference state:  %s' % (reference,))
        exit(1)

    potentialFileName = '.'.join(filenames)

    # build the lookup set once instead of once per response
    wantedLabelNames = set(labelNames)

    ## save to PKL file
    if potentialFileName.endswith('.pkl'):
        potential_new = dict()
        distCutoffs = dict()
        for response, pot in potential.items():
            labelName = config.Response2LabelName(response)
            if labelName not in wantedLabelNames:
                continue

            potential_new[response] = pot
            distCutoffs[response] = config.GetCutoffs(response)

        with open(potentialFileName, 'wb') as fh:
            cPickle.dump((name, sequence, potential_new, distCutoffs), fh, protocol=cPickle.HIGHEST_PROTOCOL)
        return

    ## save to text file
    potentialFileName = targetName + '.distPotential.s' + minSeqSepStr + potentialFileSuffix
    with open(potentialFileName, 'w') as fh:
        fh.write('#TARGET\t' + targetName + '\n')
        fh.write('#SEQ\t' + sequence + '\n')
        fh.write('#DistanceBinBoundaries\t' + "Please check config.py" + '\n')

        for response, pot in potential.items():
            labelName, labelType, subType = config.ParseResponse(response)
            if labelName not in wantedLabelNames:
                continue

            size = pot.shape
            for i in range(size[0]):
                rawPotStrs = []

                # only pairs at least minSeqSep apart in sequence are written
                for j in range(i + minSeqSep, size[1]):
                    atom1, atom2 = config.SelectAtomPair(sequence, i, j, labelName)
                    y = pot[i, j]

                    rawPotStr = ' '.join(['AtomPair', atom1.upper(), str(i + 1), atom2.upper(), str(j + 1), subType] + ["{:.4f}".format(e) for e in y])
                    rawPotStrs.append(rawPotStr)

                if len(rawPotStrs) > 0:
                    fh.write('\n'.join(rawPotStrs) + '\n')


# Script entry point: hand the command-line arguments (without the program name) to main()
if __name__ == "__main__":
    main(sys.argv[1:])
# Example #8
# 0
def main(argv):
    """Command-line driver: turn a predicted distance-probability file into a potential file.

    Options parsed from argv:
        -i/--input         input file (required, must exist)
        -a/--atomPairType  label names, parsed by config.ParseLabelNames (default ['CbCb'])
        -r/--refState      reference state as NAME[+rc[+param]]; NAME must be in allRefTypes
        -f/--refFile       reference file, required when NAME is in allRefTypesWithFiles
        -l/--minPotential, -u/--maxPotential  forwarded to the potential routines
        -s/--minSeqSep     minimum sequence separation for the text output (>= 1)
        -t/--textFormat    write a text file instead of a pickle
        -n/--nonZero       accepted, but has no effect in this function

    Output is written to the current directory: a pickle holding
    (name, sequence, potentials, distCutoffs), or a text file with one
    'AtomPair ...' line per residue pair. The process exits with status 1 on
    invalid arguments.
    """
    inputFile = None
    labelNames = ['CbCb']
    potentialFileSuffix = 'pkl'
    minPotential = -30.0
    maxPotential = 30.0
    minSeqSep = 3
    minSeqSepStr = '3'

    ## the largest dist cutoff
    rc = 18

    alpha4DFIRE = 1.61
    rgScale4DOPE = 1.

    ## reference
    reference = 'DFIRE'

    ## refFile
    refFile = None

    # NOTE(review): the long options "textFormat=" and "nonZero=" demand an argument while -t/-n do not
    try:
        opts, args = getopt.getopt(argv, "i:a:r:l:u:s:f:tn", ["input=", "atomPairType=", "refState=", "minPotential=", "maxPotential=", "minSeqSep=", "refFile=", "textFormat=", "nonZero="])
    except getopt.GetoptError:
        Usage()
        exit(1)
    print('%s %s' % (opts, args))

    if len(opts) < 1:
        Usage()
        exit(1)

    for opt, arg in opts:
        if opt in ("-i", "--input"):
            inputFile = arg

        elif opt in ("-a", "--atomPairType"):
            labelNames = config.ParseLabelNames(arg)

        elif opt in ("-r", "--refState"):
            fields = arg.split('+')
            reference = fields[0].upper()
            if reference not in allRefTypes:
                print('allowed reference types:  %s' % (allRefTypes,))
                exit(1)

            if len(fields) > 1:
                if reference == 'DFIRE':
                    rc = np.float32(fields[1])
                    if len(fields) > 2:
                        alpha4DFIRE = np.float32(fields[2])

                elif reference == 'DOPE':
                    rc = np.float32(fields[1])
                    if len(fields) > 2:
                        rgScale4DOPE = np.float32(fields[2])
                elif reference == 'SimuRW'.upper():
                    rc = np.float32(fields[1])
                else:
                    print('WARNING: unsupported reference format:  %s' % (arg,))

        elif opt in ("-f", "--refFile"):
            refFile = arg
            if not os.path.isfile(refFile):
                print('the provided file for reference state is not valid:  %s' % (refFile,))
                exit(1)

        elif opt in ("-l", "--minPotential"):
            minPotential = np.float32(arg)
        elif opt in ("-u", "--maxPotential"):
            maxPotential = np.float32(arg)

        elif opt in ("-s", "--minSeqSep"):
            minSeqSep = np.int32(arg)
            minSeqSepStr = arg
            if minSeqSep < 1:
                print('ERROR: minSeqSep shall be at least 1')
                exit(1)

        elif opt in ("-t", "--textFormat"):
            potentialFileSuffix = '.txt'

        elif opt in ("-n", "--nonZero"):
            # accepted for compatibility; the flag it used to set was never read
            pass

        else:
            Usage()
            exit(1)

    if inputFile is None:
        print('Please provide an input file')
        exit(1)
    if not os.path.isfile(inputFile):
        print('The input file does not exist:  %s' % (inputFile,))
        exit(1)

    if reference in allRefTypesWithFiles and refFile is None:
        print('The file for user-sepcified reference state is empty')
        exit(1)

    targetName = os.path.basename(inputFile).split('.')[0]

    content = DistanceUtils.LoadRawDistProbFile(inputFile)
    assert len(content) >= 6

    name, sequence, predictedDistProb, predictedContactProb, labelWeight, labelDistribution = content[:6]
    assert labelWeight is not None, "labelWeight shall not be empty"

    filenames = [targetName, 'distPotential']
    if reference == 'DFIRE':
        potential = CalcPotentialByDFIRE(predictedDistProb, alpha=alpha4DFIRE, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), str(alpha4DFIRE), potentialFileSuffix])
    elif reference == 'DOPE':
        potential = CalcPotentialByDOPE(predictedDistProb, largestDistance=rc, rgScale=rgScale4DOPE, minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), str(rgScale4DOPE), potentialFileSuffix])
    elif reference == 'SimuRW'.upper():
        potential = CalcPotentialBySimuRW(predictedDistProb, refFile, largestDistance=rc, minPotential=minPotential, maxPotential=maxPotential)
        filenames.extend([reference, str(rc), potentialFileSuffix])
    else:
        print('ERROR: unimplemented reference state:  %s' % (reference,))
        exit(1)

    potentialFileName = '.'.join(filenames)

    # build the lookup set once instead of once per response
    wantedLabelNames = set(labelNames)

    ## save to PKL file
    if potentialFileName.endswith('.pkl'):
        potential_new = dict()
        distCutoffs = dict()
        for response, pot in potential.items():
            labelName = config.Response2LabelName(response)
            if labelName not in wantedLabelNames:
                continue

            potential_new[response] = pot
            distCutoffs[response] = config.GetCutoffs(response)

        with open(potentialFileName, 'wb') as fh:
            cPickle.dump((name, sequence, potential_new, distCutoffs), fh, protocol=cPickle.HIGHEST_PROTOCOL)
        return

    ## save to text file
    potentialFileName = targetName + '.distPotential.s' + minSeqSepStr + potentialFileSuffix
    with open(potentialFileName, 'w') as fh:
        fh.write('#TARGET\t' + targetName + '\n')
        fh.write('#SEQ\t' + sequence + '\n')
        fh.write('#DistanceBinBoundaries\t' + "Please check config.py" + '\n')

        for response, pot in potential.items():
            labelName, labelType, subType = config.ParseResponse(response)
            if labelName not in wantedLabelNames:
                continue

            size = pot.shape
            for i in range(size[0]):
                rawPotStrs = []

                # only pairs at least minSeqSep apart in sequence are written
                for j in range(i + minSeqSep, size[1]):
                    atom1, atom2 = config.SelectAtomPair(sequence, i, j, labelName)
                    y = pot[i, j]

                    rawPotStr = ' '.join(['AtomPair', atom1.upper(), str(i + 1), atom2.upper(), str(j + 1), subType] + ["{:.4f}".format(e) for e in y])
                    rawPotStrs.append(rawPotStr)

                if len(rawPotStrs) > 0:
                    fh.write('\n'.join(rawPotStrs) + '\n')