def CalcLabelDistribution(data, modelSpecs):
    """Compute the reference label distribution of every response.

    For a continuous response (label type starting with 'LogNormal' or
    'Normal') the distribution is a float32 column of ones with one row per
    sequence-separation range.  For every other response the label matrices
    found at d['atomLabelMatrix'][response] (for each d in data) are cast to
    int32 and handed to OrientationUtils.CalcLabelProb when the label name is
    an orientation name, or to DistanceUtils.CalcLabelProb otherwise.

    When modelSpecs['UseBoundingBox4RefProbs'] is True, a sub-matrix is first
    cut out of every label matrix with SampleBoundingBox (limited by
    modelSpecs['maxbatchSize']) to account for the real training scenario.

    Args:
        data: re-iterable collection of records, each providing
            d['atomLabelMatrix'][response].
        modelSpecs: model specification; 'responses' is read, and
            'labelDistributions' is written.

    Returns:
        dict mapping each response to its label distribution.  The same dict
        is stored in modelSpecs['labelDistributions'].
    """
    ## the number of ranges does not depend on the response, so fetch it once.
    ## (the continuous branch used to read a name that was never assigned
    ## inside this function.)
    numRanges = RangeNWeight.GetNumRanges(modelSpecs)

    useBoundingBox = ('UseBoundingBox4RefProbs' in modelSpecs) and (
        modelSpecs['UseBoundingBox4RefProbs'] is True)

    allRefProbs = dict()
    for response in modelSpecs['responses']:
        labelName, labelType, subType = config.ParseResponse(response)

        if labelType.startswith('LogNormal') or labelType.startswith('Normal'):
            ## continuous labels have no discrete distribution
            allRefProbs[response] = np.ones((numRanges, 1), dtype=np.float32)
            continue

        ## collect the discrete label matrices of this response
        labelMatrices = [d['atomLabelMatrix'][response] for d in data]

        if useBoundingBox:
            ## sample a sub label matrix using BoundingBox to account for the
            ## real training scenario
            sampledMatrices = []
            for lMatrix in labelMatrices:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                sampledMatrices.append(
                    lMatrix[bounds[0]:bounds[2],
                            bounds[1]:bounds[3]].astype(np.int32))
            labelMatrices = sampledMatrices
        else:
            labelMatrices = [m.astype(np.int32) for m in labelMatrices]

        if labelName in config.allOrientationNames:
            calcLabelProb = OrientationUtils.CalcLabelProb
        else:
            calcLabelProb = DistanceUtils.CalcLabelProb

        allRefProbs[response] = calcLabelProb(
            data=labelMatrices,
            numLabels=GetResponseProbDims(response),
            numRanges=numRanges)

    modelSpecs['labelDistributions'] = allRefProbs
    return allRefProbs
def CalcLabelWeightMatrix(LabelMatrix=None,
                          modelSpecs=None,
                          floatType=theano.config.floatX):
    """Build one weight matrix per response from the label matrices.

    Every residue pair (i, j) is assigned to a sequence-separation range by
    |i - j|; its weight is the weight of its label within that range, taken
    from modelSpecs['weight4labels'][response].

    Args:
        LabelMatrix: dict mapping response -> 2D label matrix, or None.
        modelSpecs: model specification; reads 'responses' and
            'weight4labels'.
        floatType: dtype of the returned matrices.

    Returns:
        None when LabelMatrix is None, otherwise a dict mapping each response
        to its weight matrix.  The process exits with status 1 when a
        response has no entry in modelSpecs['weight4labels'].
    """
    if LabelMatrix is None:
        return None

    rangeBounds = RangeNWeight.GetRangeBoundaries(
        numRanges=RangeNWeight.GetNumRanges(modelSpecs))

    ## sequence separation |i - j| of every entry
    firstShape = list(LabelMatrix.values())[0].shape
    rowIdx, colIdx = np.mgrid[0:firstShape[0], 0:firstShape[1]]
    seqSeparation = abs(rowIdx - colIdx)

    ## one 0/1 mask per range: the first range has only a lower bound, every
    ## later range is capped from above by the bound of its predecessor
    masks = []
    for idx, lower in enumerate(rangeBounds):
        inRange = seqSeparation >= lower
        if idx > 0:
            inRange = inRange * (seqSeparation < rangeBounds[idx - 1])
        masks.append(inRange.astype(np.int16))

    for response in modelSpecs['responses']:
        if response not in modelSpecs['weight4labels']:
            print('%s %s' % (
                'ERROR: Cannot find the weight factor for response ',
                response))
            exit(1)

    ##the below procedure is not very effective. We shall improve it later.
    labelWeightMatrices = dict()
    for response in modelSpecs['responses']:
        labelName, labelType, subType = config.ParseResponse(response)

        ## wMatrix has one row per range (ER, LR, MR, SR and NR) and one
        ## column per label
        wMatrix = (modelSpecs['weight4labels'][response]).astype(np.float32)
        assert wMatrix.shape[0] == len(rangeBounds)

        if config.IsContinuousLabel(labelType):
            low, high = config.GetLabelMinMaxValues(labelName)
            labels = LabelMatrix[response]
            isOrientation = labelName in config.allOrientationNames

            ## a real-valued label has a single weight per range, so every
            ## entry looks up index 0 of the weight row.
            ## (flagged upstream as possibly incorrect; kept as is)
            zeroIndex = np.zeros_like(labels, dtype=np.int16)

            perRangeWeights = []
            for w in wMatrix:
                tmp = w[zeroIndex]
                if isOrientation:
                    ## an invalid orientation entry lies outside [low, high]
                    np.putmask(tmp, labels > high, 0)
                ## an invalid entry is indicated by a value below low
                ## (e.g., -1 for a distance); give it weight 0
                np.putmask(tmp, labels < low, 0)
                perRangeWeights.append(tmp)
        else:
            labels = LabelMatrix[response]
            perRangeWeights = [w[labels] for w in wMatrix]

        labelWeightMatrices[response] = sum(
            m * w for m, w in zip(masks, perRangeWeights)).astype(floatType)

    return labelWeightMatrices
def CollectTemplateMatrixFeatures(d, modelSpecs):
    """Turn the template matrices of one record into pairwise features.

    For every response in modelSpecs['responses']:
      * atom-pair label: the template distance matrix is converted into a
        strength matrix 2.0 / max(dist, 2.0), with 0 for negative distances;
      * orientation label: the template orientation matrix is discretized
        and one-hot encoded;
      * anything else: a warning is printed and no feature is added.

    Args:
        d: record providing 'name', 'tplDistMatrix' (with at least 'CbCb')
            and, for orientation responses, 'tplOriMatrix'.
        modelSpecs: model specification; reads 'responses' and
            'distThreshold4Orientation'.

    Returns:
        list of feature matrices in the order of the supported responses.
        The process exits with status 1 when required template data is
        missing.
    """
    if 'tplDistMatrix' not in d:
        print('%s %s %s' % (
            'ERROR: the data for ', d['name'],
            ' has no tplDistMatrix, which is needed since you want to use template information'
        ))
        exit(1)

    if 'CbCb' not in d['tplDistMatrix']:
        print(
            'ERROR: tplDistMatrix shall have distance matrices for atom pairs CbCb'
        )
        exit(1)
    CbCbMatrix = d['tplDistMatrix']['CbCb']

    templateMatrixFeatures = []
    for response in modelSpecs['responses']:
        labelName, labelType, subType = config.ParseResponse(response)

        if labelName in config.allAtomPairNames:
            ## template distance matrix -> strength in (0, 1]
            tplDistMatrix = d['tplDistMatrix'][labelName]
            strengthMatrix = 2.0 / np.maximum(tplDistMatrix, 2.0)

            ## an invalid entry (no valid coordinates or insertion in the
            ## sequence) gets strength 0
            strengthMatrix[tplDistMatrix < 0] = 0
            templateMatrixFeatures.append(strengthMatrix)

        elif labelName in config.allOrientationNames:
            ## template orientation matrix
            if 'tplOriMatrix' not in d:
                print('%s %s %s' % (
                    'ERROR: the data for ', d['name'],
                    ' has no tplOriMatrix, which is needed since you want to use template orientation information'
                ))
                exit(1)
            oriMatrix = d['tplOriMatrix'][labelName]

            ## discretize and convert to one-hot encoded matrix
            oriBins = config.GetCutoffs(response)
            invalidEntrySeparated = subType.endswith(('Plus', 'Minus'))
            discretized = OrientationUtils.DiscretizeOrientationMatrix(
                oriMatrix,
                bins=oriBins,
                distThreshold4Orientation=modelSpecs[
                    'distThreshold4Orientation'],
                distMatrix=CbCbMatrix,
                invalidEntrySeparated=invalidEntrySeparated,
                asResponse=False)
            labelMatrix, _, _ = discretized

            templateMatrixFeatures.append(
                LabelUtils.GetOneHotLabelMatrix(
                    labelMatrix,
                    numLabels=config.GetResponseProbDims(response)))

        else:
            print('%s %s' % (
                'WARNING: template information for response not implemented: ',
                response))

    return templateMatrixFeatures
def Score(atomDistMatrix,
          potential,
          labelNames,
          outputDetails=False,
          minSeqSep=6,
          maxCstDist=None):
    """Score distance matrices against a distance-based potential.

    For every response in potential whose label name is listed in labelNames
    and present in atomDistMatrix, the distance matrix is discretized with
    config.distCutoffs[subType] and the potential value of the resulting
    label is looked up for each residue pair.  Pairs (i, j) with
    j - i >= minSeqSep contribute to the score.

    Two defects of the earlier version are fixed here: the per-response
    score is now added to the total (it used to overwrite it, so only the
    last response counted), and the detailed output uses the label name
    (it used to reference an undefined name).

    Args:
        atomDistMatrix: dict mapping label name -> 2D distance matrix.
        potential: dict mapping response -> 3D potential indexed by
            (i, j, label).
        labelNames: label names to be scored.
        outputDetails: when True, print one line per scored residue pair.
        minSeqSep: smallest j - i of a scored pair.
        maxCstDist: when not None, pairs whose label is larger than the
            label of this distance score 0.

    Returns:
        The sum of the scores over all scored responses (0.0 when none is
        scored).  The process exits with status 1 on a non-discrete
        response.
    """
    wantedLabels = set(labelNames)

    totalScore = 0.0
    for response, pot in potential.items():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in wantedLabels:
            continue

        if labelName not in atomDistMatrix:
            print('%s %s' % (
                'WARNING: the atomDistMatrix does not have distance information for atom pair:',
                labelName))
            continue

        if not labelType.startswith('Discrete'):
            print('%s %s' % ('unsupported labelType: ', labelType))
            exit(1)

        distm = atomDistMatrix[labelName]
        assert distm.shape == (
            pot.shape[0], pot.shape[1]
        ), "the size of the distance-based statitical potential not compatible with the distance matrix"

        ## discretize the distance matrix, an invalid entry -1 will have the
        ## largest label number
        labelMatrix, _, _ = DistanceUtils.DiscretizeDistMatrix(
            distm, config.distCutoffs[subType], invalidDistanceSeparated=False)

        ## pick, for every (i, j), the potential of its label
        size = pot.shape
        m = np.mgrid[0:size[0], 0:size[1]]
        scoreMatrix = pot[m[0], m[1], labelMatrix]

        if maxCstDist is not None:
            label4maxDist = DistanceUtils.LabelsOfOneDistance(
                maxCstDist, config.distCutoffs[subType])
            np.putmask(scoreMatrix, labelMatrix > label4maxDist, 0)

        totalScore += np.sum(np.triu(scoreMatrix, minSeqSep))

        if outputDetails:
            ## note that if the potential matrix is not symmetric, we have to
            ## do something more here
            indices = np.triu_indices(size[0], k=minSeqSep, m=size[1])
            scores = scoreMatrix[indices]
            labels = labelMatrix[indices]
            for i, j, s, label in zip(indices[0], indices[1], scores, labels):
                outinfo = [
                    str(i + 1),
                    str(j + 1), labelName,
                    str(label), "{:.4f}".format(s)
                ] + ["{:.3f}".format(v) for v in pot[i, j]]
                print(' '.join(outinfo))

    return totalScore
def DeriveOriContactMatrix(predOriMatrix, response):
    """Derive a contact-like matrix from a predicted orientation matrix.

    The result is, for every residue pair, the sum of the predicted
    probabilities of the labels below largestValidLabel, where
    largestValidLabel is numLabels - 2 for a sub type ending in 'Plus' or
    'Minus' and numLabels - 1 otherwise.

    Args:
        predOriMatrix: array indexed by (i, j, label).
        response: response string describing a discrete orientation label.

    Returns:
        2D matrix obtained by summing predOriMatrix over the selected labels.
        The process exits with status 1 when the response is not a discrete
        orientation response.
    """
    labelName, labelType, subType = config.ParseResponse(response)

    if labelName not in config.allOrientationNames:
        print('%s %s' % ('ERROR: unsupported orientation label name in',
                         response))
        exit(1)

    if not config.IsDiscreteLabel(labelType):
        print('%s %s' % ('ERROR: unsupported orientation label type in',
                         response))
        exit(1)

    numLabels = config.GetResponseProbDims(response)
    if subType.endswith(('Plus', 'Minus')):
        largestValidLabel = numLabels - 2
    else:
        largestValidLabel = numLabels - 1

    return np.sum(predOriMatrix[:, :, :largestValidLabel], axis=2)
def main(argv):
    """Replace parts of a whole-chain prediction by domain predictions.

    Usage: [-s savefolder] [-n name] baseMatrixFile subMatrixFile ...

    Every sub-matrix file is mapped onto the chain sequence of the base file,
    first as one contiguous segment and, failing that, through
    FindIndexBySegments.  The matching region of each base matrix is replaced
    by the sub-matrix, contact matrices are re-derived and the result is
    pickled to <savefolder>/<name>.predictedDistMatrix.pkl, where <name>
    defaults to the base file name up to its first '.' plus '-mixed'.

    A sub-matrix file that lacks a response is now skipped for that response
    after the warning; indexing it anyway used to raise a KeyError.

    Args:
        argv: command line arguments without the program name.
    """
    newName = None
    savefolder = os.getcwd()

    try:
        opts, args = getopt.getopt(argv, "s:n:", ["savefolder=", "name="])
    except getopt.GetoptError:
        Usage()
        exit(1)

    if len(args) < 2:
        Usage()
        exit(1)

    baseMatrixFile = args[0]
    subMatrixFiles = args[1:]

    for opt, arg in opts:
        if opt in ("-s", "--savefolder"):
            savefolder = arg
        elif opt in ("-n", "--name"):
            newName = arg
        else:
            Usage()
            exit(1)

    ## baseMatrix and subMatrix are a tuple of 6 items
    baseMatrix = DistanceUtils.LoadRawDistProbFile(baseMatrixFile)
    targetName = baseMatrix[0]
    sequence = baseMatrix[1]

    subMatrices = []
    for subMatrixFile in subMatrixFiles:
        subMatrix = DistanceUtils.LoadRawDistProbFile(subMatrixFile)

        ## make sure that both matrix files are of the same type, although
        ## they may not equal
        assert (baseMatrix[4] is None) == (subMatrix[4] is None)
        subMatrices.append(subMatrix)

    ## new distance and contact matrices with response as the keys
    newDistMatrices = {}
    newContMatrices = {}

    for response, m in baseMatrix[2].items():
        ## replace the distance matrix
        tmpResult = m
        for subMatrix, smfile in zip(subMatrices, subMatrixFiles):
            if response not in subMatrix[2]:
                print('%s %s %s %s' % ('WARNING: there is no response',
                                       response, ' in subMatrixFile: ',
                                       smfile))
                ## nothing to replace from this file
                continue

            subSequence = subMatrix[1]

            ## try by assumming that this domain has only one seq segment
            index = sequence.find(subSequence)
            if index >= 0:
                tmpResult = ReplaceSubMatrix(tmpResult,
                                             subMatrix[2][response], index)
                continue

            ## try by assuming that this domain has two seq segments
            res = FindIndexBySegments(sequence, subSequence)
            if res is None:
                print(
                    'ERROR: cannot map domain sequence to the whole chain sequence!'
                )
                print('%s %s' % (' domain Seq= ', subSequence))
                print('%s %s' % (' chain Seq= ', sequence))
                exit(1)
            tmpResult = ReplaceSubMatrixBySegments(tmpResult,
                                                   subMatrix[2][response],
                                                   starts=res[0],
                                                   sizes=res[1])

        newDistMatrices[response] = tmpResult

        ## derive contact matrix from distance matrix
        labelName, labelType, subType = config.ParseResponse(response)
        if not labelType.startswith('Discrete'):
            print('%s %s' % (
                'ERROR: unsupported labelType by ReplaceSubDistMatrix.py: ',
                labelType))
            exit(1)

        if labelName in config.allAtomPairNames:
            labelOf8 = DistanceUtils.LabelsOfOneDistance(
                config.ContactDefinition, config.distCutoffs[subType])
            newContMatrices[labelName] = ContactUtils.Distance2Contact(
                newDistMatrices[response], labelOf8)
        elif labelName in config.allOrientationNames:
            newContMatrices[
                labelName] = OrientationUtils.DeriveOriContactMatrix(
                    newDistMatrices[response], response)
        else:
            print('%s %s' % (
                'ERROR: unsupported labelName in replaceSubDistMatrix(): ',
                labelName))
            exit(1)

    if newName is None:
        fileName = os.path.basename(baseMatrixFile).split('.')[0] + '-mixed'
    else:
        fileName = newName

    ## save the new result
    content4save = (targetName, sequence, newDistMatrices, newContMatrices,
                    baseMatrix[4], baseMatrix[5])
    savefile = os.path.join(savefolder, fileName + '.predictedDistMatrix.pkl')
    with open(savefile, 'wb') as fh:
        cPickle.dump(content4save, fh, protocol=cPickle.HIGHEST_PROTOCOL)
elif bPrintOtherAtomPairs: contactFileName = filename + '.' + apt + '.CM.txt' contactCASPFileName = filename + '.' + apt + '.CASP.rr' else: continue contactFile = os.path.join(savefolder, contactFileName) np.savetxt(contactFile, m, fmt='%1.6f', delimiter=' ') contactCASPFile = os.path.join(savefolder, contactCASPFileName) if contactOnly: ContactUtils.SaveContactMatrixInCASPFormat(targetName, sequence, m, contactCASPFile, distMatrix=None, probScaleFactor=1) continue responses = FindStringsStartWith(distProbMatrix.keys(), apt) if len(responses) != 1: ## right now for one apt, only one response is allowed print 'ERROR: incorrect distance information for', apt, 'in', predFile exit(1) response = responses[0] labelName, labelType, subType = config.ParseResponse(response) if not config.IsDiscreteLabel(labelType): print 'ERROR: right now only discrete distance matrix is supported' exit(1) ## convert distance matrix to what's needed by CASP distMatrix = DistanceUtils.MergeDistanceBinsBySum(distProbMatrix[response], config.distCutoffs[subType], config.distCutoffs['10C']) ContactUtils.SaveContactMatrixInCASPFormat(targetName, sequence, m, contactCASPFile, distMatrix=distMatrix, probScaleFactor=1)
def main(argv):
    """Reduce a predicted distance probability matrix to fewer bins.

    Usage: [-r response] [-s savefolder] predFile

    The responses of predFile whose label name equals the label name of the
    target response (default 'CbCb_Discrete14C') are converted to the target
    discretization: probabilities and label distributions are merged by sum,
    label weights by average.  The result is pickled into savefolder under
    the input file name with '4' + <target sub type> inserted before the
    extension.

    Args:
        argv: command line arguments without the program name.
    """
    targetResponse = 'CbCb_Discrete14C'
    savefolder = os.getcwd()

    if len(argv) < 1:
        Usage()
        exit(1)

    try:
        opts, args = getopt.getopt(argv, "r:s:", ["response=", "savefolder="])
    except getopt.GetoptError:
        Usage()
        exit(1)

    if len(args) != 1:
        Usage()
        exit(1)

    for opt, arg in opts:
        if opt in ("-r", "--response"):
            targetResponse = arg
        elif opt in ("-s", "--savefolder"):
            savefolder = arg
            if not os.path.isdir(savefolder):
                os.mkdir(savefolder)
        else:
            Usage()
            exit(1)

    predFile = args[0]
    if not os.path.isfile(predFile):
        print('%s %s' % (
            'ERROR: the predicted distance prob matrix file does not exist: ',
            predFile))
        exit(1)

    parsedTarget = config.ParseResponse(targetResponse)
    targetLabelName, targetLabelType, targetSubType = parsedTarget

    if targetSubType.endswith(('Plus', 'Minus')):
        print(
            'ERROR: currently the target distance type cannot end with Plus or Minus'
        )
        exit(1)

    if targetSubType not in config.distCutoffs:
        print('%s %s %s' % ('ERROR: the key ', targetSubType,
                            ' is not defined in config.distCutoffs'))
        exit(1)
    dstDistCutoff = config.distCutoffs[targetSubType]

    ## NOTE: unpickling can execute arbitrary code; only feed trusted files
    with open(predFile, 'rb') as fh:
        pred = cPickle.load(fh)
    (target, sequence, distProbMatrix, contProbMatrix, labelWeight,
     labelDistribution) = pred[:6]

    newDistProbMatrix = dict()
    newLabelWeight = dict()
    newLabelDistribution = dict()

    for response in distProbMatrix:
        labelName, labelType, subType = config.ParseResponse(response)

        ## only atom-pair responses with the target label name are converted
        if (labelName not in config.allAtomPairNames
                or labelName != targetLabelName):
            continue

        if subType not in config.distCutoffs:
            print('%s %s' % (
                'ERROR: the dist prob matrix to be reduced has a discretization scheme undefined in config.distCutoffs: ',
                subType))
            exit(1)
        srcDistCutoff = config.distCutoffs[subType]

        ## convert distProbMatrix
        newDistProbMatrix[targetResponse] = \
            DistanceUtils.MergeDistanceBinsBySum(
                distProbMatrix[response], srcDistCutoff, dstDistCutoff)

        ## convert labelDistribution
        newLabelDistribution[targetResponse] = \
            DistanceUtils.MergeDistanceBinsBySum(
                labelDistribution[response], srcDistCutoff, dstDistCutoff)

        ## convert labelWeight
        newLabelWeight[targetResponse] = \
            DistanceUtils.MergeDistanceBinsByAverage(
                labelWeight[response], srcDistCutoff, dstDistCutoff,
                labelDistribution[response])

    ## <first field>.<middle fields>4<target sub type>.<last field>
    nameFields = os.path.basename(predFile).split('.')
    middle = '.'.join(nameFields[1:-1]) + '4' + targetSubType
    savefile = os.path.join(
        savefolder, '.'.join([nameFields[0], middle, nameFields[-1]]))

    content4save = (target, sequence, newDistProbMatrix, contProbMatrix,
                    newLabelWeight, newLabelDistribution)
    with open(savefile, 'wb') as fh:
        cPickle.dump(content4save, fh, protocol=cPickle.HIGHEST_PROTOCOL)
def TopAccuracyByRange(self, zList):
    """Build the symbolic top-accuracy of every response, split by range.

    For each (response, predicted probability, ground truth, top ratio)
    taken from self.responses, self.output_probList, zList and
    self.modelSpecs['topRatios'], EvaluateAccuracy is applied per protein
    through theano.scan and the per-protein results are averaged.

    Each per-protein result has four rows, in the order LR, MR, MLR, SR
    (sequence separation >= 24, 12..23, >= 12 and 6..11), and three columns:
    accuracy among the top predictions, number of top predictions, and
    number of entries with truth label 0 in that range.

    Returns the stack of the averaged results, one entry per response.
    """
    ## these two names are read by the nested functions below and are rebound
    ## in the loop at the bottom right before every theano.scan call
    currentResponse = None
    topRatio = 0.5

    ## in this function, we assume that pred is a tensor3 of floatX and truth is a matrix
    ## pred has shape (dataLen, dataLen, 2) and truth has shape (dataLen, dataLen)
    ## we also assume that label 0 is positive and label 1 is negative
    ## the result is not 100% accurate for non-symmetric response, e.g., hydrogen-bonding matrix
    def TopAccuracy2C(pred=None, truth=None, symmetric=False):
        ## upper-triangle selectors by sequence separation
        M1s = T.ones_like(truth, dtype=np.int8)
        LRsel = T.triu(M1s, 24)
        MLRsel = T.triu(M1s, 12)
        SMLRsel = T.triu(M1s, 6)
        MRsel = MLRsel - LRsel
        SRsel = SMLRsel - MLRsel

        dataLen = truth.shape[0]

        ## predicted value of label 0, averaged with its transpose when the
        ## response is symmetric
        pred0 = pred[:, :, 0]

        if symmetric:
            avg_pred = (pred0 + pred0.dimshuffle(1, 0)) / 2.0
        else:
            avg_pred = pred0

        #pred_truth = T.concatenate( (avg_pred, truth.dimshuffle(0, 1, 'x') ), axis=2)
        ## channel 0 holds the prediction, the last channel the truth label
        pred_truth = T.stack([avg_pred, T.cast(truth, 'int32')], axis=2)

        accuracyList = []
        for Rsel in [LRsel, MRsel, MLRsel, SRsel]:
            selected_pred_truth = pred_truth[Rsel.nonzero()]

            ## sort by the predicted value for label 0 from the largest to the smallest
            selected_pred_truth_sorted = selected_pred_truth[(
                selected_pred_truth[:, 0]).argsort()[::-1]]

            #print 'topRatio =', topRatio
            ## take the top dataLen * topRatio entries, capped by the number
            ## of selected entries
            numTops = T.minimum(T.iround(dataLen * topRatio),
                                selected_pred_truth_sorted.shape[0])

            selected_sorted_truth = T.cast(
                selected_pred_truth_sorted[:, -1], 'int32')

            ## index 0 of both counts is the number of entries with label 0
            numTruths = T.bincount(selected_sorted_truth, minlength=2)
            numCorrects = T.bincount(selected_sorted_truth[0:numTops],
                                     minlength=2)
            #numTops = T.minimum(numTops, numTruths[0])

            ## 0.001 keeps the ratio defined when numTops is 0
            accuracyList.append(
                T.stack([
                    numCorrects[0] * 1. / (numTops + 0.001), numTops,
                    numTruths[0]
                ],
                        axis=0))

        return T.stacklists(accuracyList)

    ## real-valued prediction: truth becomes 1 where it is >= config.ContactDefinition
    ## and the prediction is negated, so that a smaller predicted value ranks higher
    def TopAccuracyNormal(pred=None, truth=None, symmetric=True):
        truth_new = T.ge(truth, config.ContactDefinition)
        if pred.ndim == 2:
            pred_new = -pred.dimshuffle(0, 1, 'x')
        else:
            pred_new = -pred

        return TopAccuracy2C(pred=pred_new,
                             truth=truth_new,
                             symmetric=symmetric)

    ## same as TopAccuracyNormal, but truth is compared with the log of config.ContactDefinition
    def TopAccuracyLogNormal(pred=None, truth=None, symmetric=True):
        truth_new = T.ge(truth, T.log(config.ContactDefinition))
        if pred.ndim == 2:
            pred_new = -pred.dimshuffle(0, 1, 'x')
        else:
            pred_new = -pred

        return TopAccuracy2C(pred=pred_new,
                             truth=truth_new,
                             symmetric=symmetric)

    ## in this function, we assume that pred is tensor3 of float and truth is a matrix of int8 or int
    ## pred has shape (dataLen, dataLen, numLabels), having the predicted probability of each label
    ## truth has shape (dataLen, dataLen)
    def TopAccuracyMultiC(pred=None, truth=None, subType=None, symmetric=True):
        ## convert pred and truth to 2C
        distBins = config.distCutoffs[subType]
        label8 = DistanceUtils.LabelsOfOneDistance(
            config.ContactDefinition, distBins)
        ## NOTE(review): label15 is computed but not used below
        label15 = DistanceUtils.LabelsOfOneDistance(
            config.InteractionLimit, distBins)

        truth1 = T.cast(T.ge(truth, label8), 'int32')
        truth_new = truth1

        ## probability mass below label8 versus the rest
        pred1 = T.sum(pred[:, :, :label8], axis=2, keepdims=True)
        pred2 = T.sum(pred[:, :, label8:], axis=2, keepdims=True)
        pred_new = T.concatenate((pred1, pred2), axis=2)

        return TopAccuracy2C(pred=pred_new,
                             truth=truth_new,
                             symmetric=symmetric)

    ## this function calculates the top accuracy of predicted orientation angles.
    ## Here the accuracy is defined as the percentage of residue pairs that is correctly to be predicted to have a valid orientation angle
    ## this is not a good definition, but this is what we can do here
    ## pred has shape (dataLen, dataLen, numLabels), having the predicted probability of each label
    ## truth has shape (dataLen, dataLen)
    def TopAccuracyOrientation(pred=None,
                               truth=None,
                               largestValidLabel=36,
                               symmetric=True):
        ## probability mass below largestValidLabel versus the rest
        pred1 = T.sum(pred[:, :, :largestValidLabel], axis=2, keepdims=True)
        pred2 = T.sum(pred[:, :, largestValidLabel:], axis=2, keepdims=True)
        pred_new = T.concatenate((pred1, pred2), axis=2)
        truth_new = T.cast(T.ge(truth, largestValidLabel), 'int32')

        return TopAccuracy2C(pred=pred_new,
                             truth=truth_new,
                             symmetric=symmetric)

    ## dispatch on the label of currentResponse (read from the enclosing scope);
    ## the leading pad_len rows and columns are dropped from pred_prob and truth first
    ##def EvaluateAccuracy(pred_prob, truth, pad_len, response):
    def EvaluateAccuracy(pred_prob, truth, pad_len):
        pred_in_correct_shape = T.cast(pred_prob[pad_len:, pad_len:],
                                       dtype=theano.config.floatX)
        truth_in_correct_shape = truth[pad_len:, pad_len:]

        labelName, labelType, subType = ParseResponse(currentResponse)
        symmetric = config.IsSymmetricLabel(labelName)

        if labelName in config.allOrientationNames:
            if not config.IsDiscreteLabel(labelType):
                print 'ERROR: unsupported label type for orientation matrix prediction: ', currentResponse
                exit(1)

            numLabels = GetResponseProbDims(currentResponse)
            if subType.endswith('Plus') or subType.endswith('Minus'):
                largestValidLabel = numLabels - 2
            else:
                largestValidLabel = numLabels - 1

            return TopAccuracyOrientation(
                pred=pred_in_correct_shape,
                truth=truth_in_correct_shape,
                largestValidLabel=largestValidLabel,
                symmetric=symmetric)

        if labelType.startswith('LogNormal'):
            return TopAccuracyLogNormal(pred=pred_in_correct_shape,
                                        truth=truth_in_correct_shape,
                                        symmetric=symmetric)

        elif labelType.startswith('Normal'):
            return TopAccuracyNormal(pred=pred_in_correct_shape,
                                     truth=truth_in_correct_shape,
                                     symmetric=symmetric)

        elif labelType.startswith('Discrete'):
            #subType = labelType[len('Discrete'): ]
            if subType.startswith('2C'):
                return TopAccuracy2C(pred=pred_in_correct_shape,
                                     truth=truth_in_correct_shape,
                                     symmetric=symmetric)
            else:
                return TopAccuracyMultiC(pred=pred_in_correct_shape,
                                         truth=truth_in_correct_shape,
                                         subType=subType,
                                         symmetric=symmetric)
        else:
            print 'ERROR: unsupported label type in EvaluateAccuracy: ', labelType
            exit(1)

    accuracyList = []
    for res, out_prob, z, ratio in zip(self.responses, self.output_probList,
                                       zList, self.modelSpecs['topRatios']):
        ## NOTE(review): the three names unpacked here are not used in this loop
        labelName, labelType, subType = config.ParseResponse(res)

        ## currently TopAccuracy only works when the dimension of each z is 3
        assert z.ndim == 3

        ## per-protein padding length: width of mask_1d minus the sum of its
        ## row; zero for every protein when there is no mask
        if self.mask_1d is not None:
            paddingLens = self.mask_1d.shape[1] - T.sum(self.mask_1d, axis=1)
        else:
            paddingLens = T.zeros_like(z[:, 0, 0], dtype=np.int32)

        ## must be set before scan, since EvaluateAccuracy and TopAccuracy2C
        ## read them from this scope
        currentResponse = res
        topRatio = ratio

        ##here we use scan to calculate accuracy for each protein
        result, updates = theano.scan(fn=EvaluateAccuracy,
                                      outputs_info=None,
                                      sequences=[out_prob, z, paddingLens])
        accuracy = T.mean(result, axis=0)
        accuracyList.append(accuracy)

    return T.stacklists(accuracyList)
def main(argv):
    """Average a whole-chain prediction with overlapping domain predictions.

    Usage: [-s savefolder] [-n name] baseMatrixFile subMatrixFile ...

    Every sub-matrix is added onto a copy of the matching base matrix at the
    position of its sequence in the chain (one contiguous segment, or the
    segments found by FindIndexBySegments).  A counter matrix records how
    many predictions cover each residue pair and the sum is divided by it.
    Contact matrices are then re-derived and the result is pickled to
    <savefolder>/<name>.predictedDistMatrix.pkl, where <name> defaults to the
    base file name up to its first '.' plus '-mixed'.

    The counter of a multi-segment domain is now updated with
    AddSubMatrixBySegments; it used to call AddSubMatrix with the
    starts/sizes keywords of the segment variant.

    Args:
        argv: command line arguments without the program name.
    """
    newName = None
    savefolder = os.getcwd()

    try:
        opts, args = getopt.getopt(argv, "s:n:", ["savefolder=", "name="])
    except getopt.GetoptError:
        Usage()
        exit(1)

    if len(args) < 2:
        Usage()
        exit(1)

    baseMatrixFile = args[0]
    subMatrixFiles = args[1:]

    for opt, arg in opts:
        if opt in ("-s", "--savefolder"):
            savefolder = arg
        elif opt in ("-n", "--name"):
            newName = arg
        else:
            Usage()
            exit(1)

    ## baseMatrix and subMatrix are a tuple of 6 items
    baseMatrix = DistanceUtils.LoadRawDistProbFile(baseMatrixFile)
    targetName = baseMatrix[0]
    sequence = baseMatrix[1]

    subMatrices = []
    for subMatrixFile in subMatrixFiles:
        subMatrix = DistanceUtils.LoadRawDistProbFile(subMatrixFile)

        ## make sure that both matrix files are of the same type, although
        ## they may not equal
        assert (baseMatrix[4] is None) == (subMatrix[4] is None)
        subMatrices.append(subMatrix)

    ## running sums and coverage counters with response as the keys
    newDistMatrices = {}
    counterMatrices = {}
    for response, m in baseMatrix[2].items():
        newDistMatrices[response] = deepcopy(m)
        counterMatrices[response] = np.ones(m.shape[:2], dtype=np.int32)

    ## add submatrices onto newDistMatrices
    for subMatrix, smfile in zip(subMatrices, subMatrixFiles):
        print('%s %s' % ('Adding submatrix in ', smfile))
        subSequence = subMatrix[1]

        ## try to find its position in the original sequence by assumming
        ## that this domain has only one seq segment
        index = sequence.find(subSequence)
        segments = None
        if index < 0:
            ## otherwise assume that this domain has two seq segments
            segments = FindIndexBySegments(sequence, subSequence)
            if segments is None:
                print(
                    'ERROR: cannot map domain sequence to the whole chain sequence!'
                )
                print('%s %s' % (' domain Seq= ', subSequence))
                print('%s %s' % (' chain Seq= ', sequence))
                exit(1)

        for response, m in subMatrix[2].items():
            if response not in newDistMatrices:
                print('%s %s %s %s' % (
                    'WARNING: the original matrix does not have response',
                    response, ' in subMatrixFile:', smfile))
                continue

            coverage = np.ones(m.shape[:2], dtype=np.int32)
            if segments is None:
                AddSubMatrix(newDistMatrices[response], m, index)
                AddSubMatrix(counterMatrices[response], coverage, index)
            else:
                AddSubMatrixBySegments(newDistMatrices[response],
                                       m,
                                       starts=segments[0],
                                       sizes=segments[1])
                AddSubMatrixBySegments(counterMatrices[response],
                                       coverage,
                                       starts=segments[0],
                                       sizes=segments[1])

    ## final processing: turn the sums into averages
    for response in newDistMatrices:
        newDistMatrices[response] = np.divide(
            newDistMatrices[response],
            counterMatrices[response][:, :, np.newaxis])

    ## convert distMatrix to contactMatrix
    newContMatrices = {}
    for response, m in newDistMatrices.items():
        labelName, labelType, subType = config.ParseResponse(response)
        if not config.IsDiscreteLabel(labelType):
            print('%s %s' % (
                'ERROR: unsupported labelType by ReplaceSubDistMatrix.py: ',
                labelType))
            exit(1)

        if labelName in config.allAtomPairNames:
            labelOf8 = DistanceUtils.LabelsOfOneDistance(
                config.ContactDefinition, config.distCutoffs[subType])
            newContMatrices[labelName] = ContactUtils.Distance2Contact(
                m, labelOf8)
        elif labelName in config.allOrientationNames:
            newContMatrices[
                labelName] = OrientationUtils.DeriveOriContactMatrix(
                    m, response)
        else:
            print('%s %s' % (
                'ERROR: unsupported labelName in replaceSubDistMatrix(): ',
                labelName))
            exit(1)

    content4save = (targetName, sequence, newDistMatrices, newContMatrices,
                    baseMatrix[4], baseMatrix[5])

    ## save the new result
    if newName is None:
        fileName = os.path.basename(baseMatrixFile).split('.')[0] + '-mixed'
    else:
        fileName = newName

    savefile = os.path.join(savefolder, fileName + '.predictedDistMatrix.pkl')
    with open(savefile, 'wb') as fh:
        cPickle.dump(content4save, fh, protocol=cPickle.HIGHEST_PROTOCOL)
def CalcLabelWeight(modelSpecs):
    """Compute the label weight of every response.

    RangeNWeight.SetWeight4Range and RangeNWeight.SetWeight43C2C are called
    on modelSpecs first.  The weight of a response is then chosen as:
      * continuous label: modelSpecs['weight4continuous'];
      * orientation label, or config.NoWeight4Label(modelSpecs) is true:
        ones of shape (numRanges, numLabels) multiplied by
        modelSpecs['weight4range'];
      * 'HB' / 'Beta' with a 2C sub type: modelSpecs['weight4' + response];
      * atom pair with a 3C sub type: modelSpecs['weight4Discrete3C'];
      * any other atom pair: DistanceUtils.CalcLabelWeight applied to the 3C
        weight, the reference distribution of the response and its distance
        cutoffs.
    For a sub type ending in 'Minus' the weight of the last label (the
    invalid entry) is set to 0.  This is done on a copy, so the arrays held
    elsewhere in modelSpecs (e.g. 'weight4Discrete3C', which is also the
    input for other responses) are no longer modified in place.

    Args:
        modelSpecs: model specification; reads 'responses' and
            'labelDistributions', writes 'weight4labels'.

    Returns:
        modelSpecs['weight4labels'], a dict mapping response -> weight.  The
        process exits with status 1 on an unsupported response.
    """
    print('Calculating label weight ...')

    numRanges = RangeNWeight.GetNumRanges(modelSpecs)
    RangeNWeight.SetWeight4Range(modelSpecs)
    RangeNWeight.SetWeight43C2C(modelSpecs)

    allRefProbs = modelSpecs['labelDistributions']

    ##for discrete labels, we calculate their weights by inferring from the
    ##weight intialized to 3 bins: 0-8, 8-15 and >15 or -1, which makes
    ##inference easier
    modelSpecs['weight4labels'] = dict()
    for response in modelSpecs['responses']:
        labelName, labelType, subType = config.ParseResponse(response)
        numLabels = GetResponseProbDims(response)

        if config.IsContinuousLabel(labelType):
            ## just need to assign range weight for continuous response
            modelSpecs['weight4labels'][response] = modelSpecs[
                'weight4continuous']
            continue

        if not config.IsDiscreteLabel(labelType):
            print('%s %s' % (
                'ERROR: unsupported response in CalcLabelWeight: ', response))
            exit(1)

        if labelName in config.allOrientationNames or config.NoWeight4Label(
                modelSpecs):
            weights = np.multiply(
                np.ones((numRanges, numLabels), dtype=np.float32),
                modelSpecs['weight4range'])

        elif labelName in ['HB', 'Beta']:
            ## if the response is for HB and Beta-Pairing
            if subType.startswith('2C'):
                weights = modelSpecs['weight4' + response]
            else:
                print('%s %s' % (
                    'ERROR: unsupported label subtype in CalcLabelWeight: ',
                    response))
                exit(1)

        elif labelName in config.allAtomPairNames:
            ## calculate label weight for atom pairs Cb-Cb, Ca-Ca, Cg-Cg,
            ## CaCg, and NO
            if subType.startswith('2C'):
                print(
                    'ERROR: 2C is not supported for contact/distance prediction any more'
                )
                exit(1)
            elif subType.startswith('3C'):
                ## if 3C is used for the response
                weights = modelSpecs['weight4Discrete3C']
            else:
                weights = DistanceUtils.CalcLabelWeight(
                    modelSpecs['weight4Discrete3C'], allRefProbs[response],
                    config.distCutoffs[subType])
        else:
            print('%s %s' % (
                'ERROR: unsupported label name in CalcLabelWeight: ',
                response))
            exit(1)

        ## set the weight of the label for the invalid entry (distance or
        ## orientation) to 0
        if subType.endswith('Minus'):
            ## copy first: weights may be an array owned by modelSpecs
            weights = np.array(weights)
            weights[:, -1] = 0

        modelSpecs['weight4labels'][response] = weights

    return modelSpecs['weight4labels']