def EvaluateAccuracy(pred_prob, truth, pad_len): pred_in_correct_shape = T.cast(pred_prob[pad_len:, pad_len:], dtype=theano.config.floatX) truth_in_correct_shape = truth[pad_len:, pad_len:] labelName, labelType, subType = ParseResponse(currentResponse) symmetric = config.IsSymmetricLabel(labelName) if labelName in config.allOrientationNames: if not config.IsDiscreteLabel(labelType): print 'ERROR: unsupported label type for orientation matrix prediction: ', currentResponse exit(1) numLabels = GetResponseProbDims(currentResponse) if subType.endswith('Plus') or subType.endswith('Minus'): largestValidLabel = numLabels - 2 else: largestValidLabel = numLabels - 1 return TopAccuracyOrientation( pred=pred_in_correct_shape, truth=truth_in_correct_shape, largestValidLabel=largestValidLabel, symmetric=symmetric) if labelType.startswith('LogNormal'): return TopAccuracyLogNormal(pred=pred_in_correct_shape, truth=truth_in_correct_shape, symmetric=symmetric) elif labelType.startswith('Normal'): return TopAccuracyNormal(pred=pred_in_correct_shape, truth=truth_in_correct_shape, symmetric=symmetric) elif labelType.startswith('Discrete'): #subType = labelType[len('Discrete'): ] if subType.startswith('2C'): return TopAccuracy2C(pred=pred_in_correct_shape, truth=truth_in_correct_shape, symmetric=symmetric) else: return TopAccuracyMultiC(pred=pred_in_correct_shape, truth=truth_in_correct_shape, subType=subType, symmetric=symmetric) else: print 'ERROR: unsupported label type in EvaluateAccuracy: ', labelType exit(1)
def DeriveOriContactMatrix(predOriMatrix, response): labelName, labelType, subType = config.ParseResponse(response) symmetric = config.IsSymmetricLabel(labelName) if labelName not in config.allOrientationNames: print 'ERROR: unsupported orientation label name in', response exit(1) if not config.IsDiscreteLabel(labelType): print 'ERROR: unsupported orientation label type in', response exit(1) numLabels = config.GetResponseProbDims(response) if subType.endswith('Plus') or subType.endswith('Minus'): largestValidLabel = numLabels - 2 else: largestValidLabel = numLabels - 1 contactMatrix = np.sum(predOriMatrix[:, :, :largestValidLabel], axis=2) return contactMatrix
def AddLabel2OneBatch(names, batch, modelSpecs, sharedLabelPool, sharedLabelWeightPool, floatType=theano.config.floatX): numSeqs = len(names) for name in names: if (not sharedLabelPool.has_key(name)) or ( not sharedLabelWeightPool.has_key(name)): print 'the label or label weight matrix does not exist for protein ', name exit(1) seqLens = [sharedLabelWeightPool[name].shape[0] for name in names] ## get the boundingbox for this batch if not config.TrainByRefLoss(modelSpecs): box = batch[-1] else: box = batch[-2] top, left, bottom, right = box assert bottom - top == right - left boxsize = bottom - top if boxsize < max(seqLens) and numSeqs > 1: ## make sure that there is only one protein in this batch print 'ERROR: when one batch has a large protein, it can only have one protein' exit(1) ## we crop pairwise labels at this step to save memory and computational time maxMatrixSize = min(boxsize, max(seqLens)) ## Y shall be a list of 2D or 3D matrices, each for one response Y = [] for response in modelSpecs['responses']: labelName, labelType, _ = ParseResponse(response) dataType = np.int16 if not config.IsDiscreteLabel(labelType): dataType = floatType rValDims = GetResponseValueDims(response) if rValDims == 1: y = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize), dtype=dataType) Y.append(y) else: y = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize, rValDims), dtype=dataType) Y.append(y) ## when Y is empty, weight is useless. So When Y is empty, weight shall also be empty weightMatrix = [] if bool(Y) and config.UseSampleWeight(modelSpecs): weightMatrix = [ np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize), dtype=floatType) ] * len(modelSpecs['responses']) for j, name, seqLen in zip(range(len(names)), names, seqLens): ## we align all matrices in the bottom/right corner ## posInX and posInY are the starting position of one protein in the final output tensor ## here X and Y refer to x-axis and y-axis posInX = -min(boxsize, seqLen) posInY = -min(boxsize, seqLen) for y, response in zip(Y, modelSpecs['responses']): if boxsize < seqLen: tmp = sharedLabelPool[name][response][top:bottom, left:right] else: tmp = sharedLabelPool[name][response] if len(y.shape) == 3: y[j, posInX:, posInY:] = tmp else: y[j, posInX:, posInY:, ] = tmp labelWeightMatrix = sharedLabelWeightPool[name] for w, response in zip(weightMatrix, modelSpecs['responses']): if boxsize < seqLen: w[j, posInX:, posInY:] = labelWeightMatrix[response][top:bottom, left:right] else: w[j, posInX:, posInY:] = labelWeightMatrix[response] ## the input batch contains bounding box tail = 1 ## check to see if the input batch contains one flag for RefState if config.TrainByRefLoss(modelSpecs): tail += 1 newbatch = batch[:-tail] newbatch.extend(Y) newbatch.extend(weightMatrix) newbatch.extend(batch[-tail:]) return newbatch
def AssembleOneBatch(data, modelSpecs, forRefState=False, bounds=None, floatType=theano.config.floatX, bUseSharedMemory=False): if not data: print 'WARNING: the list of data is empty' return None numSeqs = len(data) seqLens = [d['seqLen'] for d in data] names = [d['name'] for d in data] ## use maxSeqLen and minSeqLen for sequential features ## we do not crop sequential features at this step since the theano deep model will do so after 1D convolution operation maxSeqLen = max(seqLens) minSeqLen = min(seqLens) #print 'maxSeqLen= ', maxSeqLen, 'minSeqLen= ', minSeqLen numSeqFeatures = FeatureUtils.DetermineNumSeqFeatures( data[0]['seqFeatures']) X1d = np.zeros(shape=(numSeqs, maxSeqLen, numSeqFeatures), dtype=floatType) numMatrixFeatures = FeatureUtils.DetermineNumMatrixFeatures( data[0]['matrixFeatures']) + FeatureUtils.DetermineNumMatrixFeatures( data[0]['matrixFeatures_nomean']) ## we use maxMatrixSize and minMatrixSize for pairwise features ## we crop pairwise features at this step to save memory and computational time minMatrixSize, maxMatrixSize = CalcMinMaxMatrixSize(bounds, seqLens) if bUseSharedMemory: shmX2d = SharedNDArray( (numSeqs, maxMatrixSize, maxMatrixSize, numMatrixFeatures), dtype=floatType, name='/RaptorX-' + str(os.getppid()) + '-X2d-' + randomString(6)) X2d = shmX2d.array X2d[:] = 0 else: X2d = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize, numMatrixFeatures), dtype=floatType) X1dem = None if data[0].has_key('embedFeatures'): numEmbedFeatures = data[0]['embedFeatures'].shape[1] X1dem = np.zeros(shape=(numSeqs, maxSeqLen, numEmbedFeatures), dtype=floatType) ## Y shall be a list of 2D or 3D matrices, each for one response Y = [] if data[0].has_key('atomLabelMatrix'): for response in modelSpecs['responses']: labelName, labelType, _ = ParseResponse(response) dataType = np.int16 if not config.IsDiscreteLabel(labelType): dataType = floatType rValDims = GetResponseValueDims(response) if rValDims == 1: y = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize), dtype=dataType) Y.append(y) else: y = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize, rValDims), dtype=dataType) Y.append(y) ## when Y is empty, weight is useless. So When Y is None, weight shall also be None weightMatrix = [] if bool(Y) and config.UseSampleWeight(modelSpecs): weightMatrix = [ np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize), dtype=floatType) ] * len(modelSpecs['responses']) ## for mask. we do not used shared ndarray for them since they are small M1d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen), dtype=np.int8) M2d = np.zeros(shape=(numSeqs, maxMatrixSize - minMatrixSize, maxMatrixSize), dtype=np.int8) if bounds is not None: boxes = bounds else: boxes = [None] * len(data) for j, d, box in zip(range(len(data)), data, boxes): seqLen = d['seqLen'] ## posInSeq, posInX and posInY are the starting position of one protein in the final output tensor posInSeq = -seqLen ## here X and Y refer to x-axis and y-axis if box is not None: top, left, bottom, right = box posInX = -(bottom - top) posInY = -(right - left) else: posInX = -seqLen posInY = -seqLen if forRefState: ## this code needs reexamination, it may not be correct when d['seqFeatures']/d['matrixFeatures'] is represented as a list of arrays instead of a single array X1d[j, posInSeq:, :] = np.array( [modelSpecs['seqFeatures_expected']] * seqLen).reshape( (seqLen, -1)) tmp = [modelSpecs['matrixFeatures_expected']] * (seqLen * seqLen) tmp2 = np.array(tmp).reshape((seqLen, seqLen, -1)) tmp3 = np.concatenate((tmp2, d['matrixFeatures_nomean']), axis=2) if box is not None: X2d[j, posInX:, posInY:, :] = tmp3[top:bottom, left:right, ] else: X2d[j, posInX:, posInY:, :] = tmp3 else: if isinstance(d['seqFeatures'], np.ndarray): X1d[j, posInSeq:, :] = d['seqFeatures'] else: startPos = 0 for f in d['seqFeatures']: if len(f.shape) == 1: X1d[j, posInSeq:, startPos:startPos + 1] = f[:, np.newaxis] startPos += 1 elif len(f.shape) == 2: X1d[j, posInSeq:, startPos:startPos + f.shape[1]] = f startPos = startPos + f.shape[1] else: print 'wrong shape in sequential feature: ', f.shape exit(1) # add 2D features in matrixFeatures to holder staring from the start position # holder is a 3D array and start is the starting position in the 3rd dimension def Add2DFeatures(matrixFeatures, holder, start): if isinstance(matrixFeatures, np.ndarray): features = [matrixFeatures] else: features = matrixFeatures startPos = start #for f in matrixFeatures: for f in features: if len(f.shape) == 2: endPos = startPos + 1 if box is None: holder[:, :, startPos:endPos] = f[:, :, np.newaxis] else: holder[:, :, startPos:endPos] = f[top:bottom, left:right, np.newaxis] elif len(f.shape) == 3: endPos = startPos + f.shape[2] if box is None: holder[:, :, startPos:endPos] = f else: holder[:, :, startPos:endPos] = f[top:bottom, left:right, :] else: print 'wrong shape in matrixFeatures: ', f.shape exit(1) startPos = endPos return endPos end = Add2DFeatures(d['matrixFeatures'], X2d[j, posInX:, posInY:, :], 0) Add2DFeatures(d['matrixFeatures_nomean'], X2d[j, posInX:, posInY:, :], end) M1d[j, posInSeq:].fill(1) M2d[j, posInX:, posInY:].fill(1) if X1dem is not None: ## embed feature is always represented as a single array, so the code shall be correct if forRefState: X1dem[j, posInSeq:, :] = np.array( [modelSpecs['embedFeatures_expected']] * seqLen).reshape( (seqLen, -1)) else: X1dem[j, posInSeq:, :] = d['embedFeatures'] for y, response in zip(Y, modelSpecs['responses']): if box is not None: tmp = d['atomLabelMatrix'][response][top:bottom, left:right] else: tmp = d['atomLabelMatrix'][response] if len(y.shape) == 3: y[j, posInX:, posInY:] = tmp else: y[j, posInX:, posInY:, ] = tmp if bool(weightMatrix): if d.has_key('labelWeightMatrix'): labelWeightMatrix = d['labelWeightMatrix'] else: labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix( d['atomLabelMatrix'], modelSpecs, floatType=floatType) for w, response in zip(weightMatrix, modelSpecs['responses']): if box is not None: w[j, posInX:, posInY:] = labelWeightMatrix[response][top:bottom, left:right] else: w[j, posInX:, posInY:] = labelWeightMatrix[response] if bUseSharedMemory: onebatch = [X1d, shmX2d, M1d, M2d] else: onebatch = [X1d, X2d, M1d, M2d] if X1dem is not None: onebatch.append(X1dem) onebatch.extend(Y) onebatch.extend(weightMatrix) return onebatch, names
elif bPrintOtherAtomPairs: contactFileName = filename + '.' + apt + '.CM.txt' contactCASPFileName = filename + '.' + apt + '.CASP.rr' else: continue contactFile = os.path.join(savefolder, contactFileName) np.savetxt(contactFile, m, fmt='%1.6f', delimiter=' ') contactCASPFile = os.path.join(savefolder, contactCASPFileName) if contactOnly: ContactUtils.SaveContactMatrixInCASPFormat(targetName, sequence, m, contactCASPFile, distMatrix=None, probScaleFactor=1) continue responses = FindStringsStartWith(distProbMatrix.keys(), apt) if len(responses) != 1: ## right now for one apt, only one response is allowed print 'ERROR: incorrect distance information for', apt, 'in', predFile exit(1) response = responses[0] labelName, labelType, subType = config.ParseResponse(response) if not config.IsDiscreteLabel(labelType): print 'ERROR: right now only discrete distance matrix is supported' exit(1) ## convert distance matrix to what's needed by CASP distMatrix = DistanceUtils.MergeDistanceBinsBySum(distProbMatrix[response], config.distCutoffs[subType], config.distCutoffs['10C']) ContactUtils.SaveContactMatrixInCASPFormat(targetName, sequence, m, contactCASPFile, distMatrix=distMatrix, probScaleFactor=1)
def main(argv): newName = None savefolder = os.getcwd() try: opts, args = getopt.getopt(argv, "s:n:", ["savefolder=", "name="]) #print opts, args except getopt.GetoptError: Usage() exit(1) if len(args) < 2: Usage() exit(1) baseMatrixFile = args[0] subMatrixFiles = args[1:] for opt, arg in opts: if opt in ("-s", "--savefolder"): savefolder = arg elif opt in ("-n", "--name"): newName = arg else: Usage() exit(1) baseMatrix = DistanceUtils.LoadRawDistProbFile(baseMatrixFile) sequence = baseMatrix[1] targetName = baseMatrix[0] ## baseMatrix and subMatrix are a tuple of 6 items subMatrices = [] for subMatrixFile in subMatrixFiles: subMatrix = DistanceUtils.LoadRawDistProbFile(subMatrixFile) ## make sure that both matrix files are of the same type, although they may not equal if baseMatrix[4] is None: assert (subMatrix[4] is None) if baseMatrix[4] is not None: assert (subMatrix[4] is not None) subMatrices.append(subMatrix) ## new distance and contact matrices with response as the keys newDistMatrices = {} counterMatrices = {} ## initialize for response, m in baseMatrix[2].iteritems(): newDistMatrices[response] = deepcopy(m) counterMatrices[response] = np.ones(m.shape[:2], dtype=np.int32) ## add submatrices onto newDistMatrices for subMatrix, smfile in zip(subMatrices, subMatrixFiles): print 'Adding submatrix in ', smfile subSequence = subMatrix[1] ## try to find its position in the original sequence by assumming that this domain has only one seq segment index = sequence.find(subSequence) if index >= 0: for response, m in subMatrix[2].iteritems(): if not newDistMatrices.has_key(response): print 'WARNING: the original matrix does not have response', response, ' in subMatrixFile:', smfile continue AddSubMatrix(newDistMatrices[response], m, index) AddSubMatrix(counterMatrices[response], np.ones(m.shape[:2], dtype=np.int32), index) else: ## try to find its positions in the original sequence by assuming that this domain has two seq segments res = FindIndexBySegments(sequence, subSequence) if res is None: print 'ERROR: cannot map domain sequence to the whole chain sequence!' print ' domain Seq= ', subSequence print ' chain Seq= ', sequence exit(1) for response, m in subMatrix[2].iteritems(): if not newDistMatrices.has_key(response): print 'WARNING: the original matrix does not have response', response, ' in subMatrixFile:', smfile continue AddSubMatrixBySegments(newDistMatrices[response], m, starts=res[0], sizes=res[1]) AddSubMatrix(counterMatrices[response], np.ones(m.shape[:2], dtype=np.int32), starts=res[0], sizes=res[1]) ## final processing for response, m in newDistMatrices.iteritems(): newDistMatrices[response] = np.divide( newDistMatrices[response], counterMatrices[response][:, :, np.newaxis]) ## convert distMatrix to contactMatrix newContMatrices = {} for response, m in newDistMatrices.iteritems(): ## derive contact matrix from distance matrix labelName, labelType, subType = config.ParseResponse(response) if not config.IsDiscreteLabel(labelType): print 'ERROR: unsupported labelType by ReplaceSubDistMatrix.py: ', labelType exit(1) if labelName in config.allAtomPairNames: labelOf8 = DistanceUtils.LabelsOfOneDistance( config.ContactDefinition, config.distCutoffs[subType]) newContMatrices[labelName] = ContactUtils.Distance2Contact( m, labelOf8) elif labelName in config.allOrientationNames: newContMatrices[ labelName] = OrientationUtils.DeriveOriContactMatrix( m, response) else: print 'ERROR: unsupported labelName in replaceSubDistMatrix(): ', labelName exit(1) content4save = (targetName, sequence, newDistMatrices, newContMatrices, baseMatrix[4], baseMatrix[5]) ## save the new result if newName is None: fileName = os.path.basename(baseMatrixFile).split('.')[0] + '-mixed' else: fileName = newName savefile = os.path.join(savefolder, fileName + '.predictedDistMatrix.pkl') with open(savefile, 'wb') as fh: cPickle.dump(content4save, fh, protocol=cPickle.HIGHEST_PROTOCOL)
def CalcLabelWeight(modelSpecs): print 'Calculating label weight ...' numRanges = RangeNWeight.GetNumRanges(modelSpecs) RangeNWeight.SetWeight4Range(modelSpecs) #print 'weight for range: ', modelSpecs['weight4range'] RangeNWeight.SetWeight43C2C(modelSpecs) #print 'LRbias= ', modelSpecs['LRbias'] #print 'weight43C= ', modelSpecs['weight4Discrete3C'] allRefProbs = modelSpecs['labelDistributions'] ##for discrete labels, we calculate their weights by inferring from the weight intialized to 3 bins: 0-8, 8-15 and >15 or -1, which makes inference easier modelSpecs['weight4labels'] = dict() for response in modelSpecs['responses']: labelName, labelType, subType = config.ParseResponse(response) numLabels = GetResponseProbDims(response) if config.IsContinuousLabel(labelType): ## just need to assign range weight for continuous response modelSpecs['weight4labels'][response] = modelSpecs[ 'weight4continuous'] continue if not config.IsDiscreteLabel(labelType): print 'ERROR: unsupported response in CalcLabelWeight: ', response exit(1) if labelName in config.allOrientationNames or config.NoWeight4Label( modelSpecs): modelSpecs['weight4labels'][response] = np.multiply( np.ones((numRanges, numLabels), dtype=np.float32), modelSpecs['weight4range']) elif labelName in ['HB', 'Beta']: ## if the response is for HB and Beta-Pairing if subType.startswith('2C'): modelSpecs['weight4labels'][response] = modelSpecs['weight4' + response] else: print 'ERROR: unsupported label subtype in CalcLabelWeight: ', response exit(1) elif labelName in config.allAtomPairNames: ## calculate label weight for atom pairs Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO if subType.startswith('2C'): print 'ERROR: 2C is not supported for contact/distance prediction any more' exit(1) elif subType.startswith('3C'): ## if 3C is used for the response modelSpecs['weight4labels'][response] = modelSpecs[ 'weight4Discrete3C'] else: modelSpecs['weight4labels'][ response] = DistanceUtils.CalcLabelWeight( modelSpecs['weight4Discrete3C'], allRefProbs[response], config.distCutoffs[subType]) else: print 'ERROR: unsupported label name in CalcLabelWeight: ', response exit(1) ## set the weight of the label for the invalid entry (distance or orientation) to 0 if subType.endswith('Minus'): modelSpecs['weight4labels'][response][:, -1] = 0 """ ## for log for response in modelSpecs['responses']: print 'weight4labels for response: ', response print modelSpecs['weight4labels'][response] """ return modelSpecs['weight4labels']