def TrainByOneBatch(batch, train, modelSpecs, forRefState=False):
    """Load one minibatch, crop it to a sampled bounding box and run one training step.

    batch: a list of protein locations; the real data is loaded from them here.
    train: the training function; it is called with the assembled inputs unpacked.
    modelSpecs: the model specification; modelSpecs['maxbatchSize'] is read here.
    forRefState: forwarded to the batch assembler; when the model is trained by
        reference loss it also selects the trailing flag (-1 if True, otherwise 1).

    Returns the tuple (train_loss, train_errors, param_L2) produced by train().
    """
    ## the batch only holds locations, so fetch the actual content first
    realData = DataProcessor.LoadRealData(batch, modelSpecs)

    ## make sure that the data has the same input dimension as the model specification
    FeatureUtils.CheckModelNDataConsistency(modelSpecs, realData)

    assembled, _ = DataProcessor.AssembleOneBatch(realData, modelSpecs, forRefState=forRefState)
    x1d, x2d, x1dmask, x2dmask = assembled[0:4]

    ## crop a large protein to deal with limited GPU memory. For sequential and embedding
    ## features, the theano model itself will crop based upon the bounding box
    bounds = SampleBoundingBox((x2d.shape[1], x2d.shape[2]), modelSpecs['maxbatchSize'])
    top, left, bottom, right = bounds[0], bounds[1], bounds[2], bounds[3]

    ## x1d and x1dmask go through uncropped. The row range of the 2D mask runs to the end
    ## of the mask's own second axis instead of to the bottom of the box.
    trainArgs = [
        x1d,
        x2d[:, top:bottom, left:right, :],
        x1dmask,
        x2dmask[:, top:x2dmask.shape[1], left:right],
    ]

    ## when embedding is used, it sits right after the four basic inputs and is not cropped
    if config.EmbeddingUsed(modelSpecs):
        trainArgs.append(assembled[4])
        firstRemaining = 5
    else:
        firstRemaining = 4

    ## crop the ground truth and weight matrices
    for matrix in assembled[firstRemaining:]:
        if len(matrix.shape) == 3:
            cropped = matrix[:, top:bottom, left:right]
        else:
            cropped = matrix[:, top:bottom, left:right, :]
        trainArgs.append(cropped)

    ## add the bounding box to the input list
    trainArgs.append(bounds)

    if config.TrainByRefLoss(modelSpecs):
        trainArgs.append(np.int32(-1) if forRefState else np.int32(1))

    train_loss, train_errors, param_L2 = train(*trainArgs)
    return train_loss, train_errors, param_L2
def PrepareInput4Prediction(data, modelSpecs, floatType=np.float32, UseSharedMemory=False, forRefState=False):
    """Assemble one minibatch for prediction and append its full-size bounding box.

    data: a non-empty sequence of dicts, each providing 'seqLen'.
    modelSpecs, floatType, forRefState: forwarded to DataProcessor.AssembleOneBatch.
    UseSharedMemory: forwarded to the assembler as bUseSharedMemory.

    Returns the assembled batch (a list) with an int32 box [0, 0, maxSeqLen, maxSeqLen]
    appended. Prints an error and exits with status 1 when data is empty.
    """
    if not data:
        print('ERROR: the input data for PrepareInput4Prediction is empty')
        exit(1)

    onebatch, _ = DataProcessor.AssembleOneBatch(
        data,
        modelSpecs,
        forRefState=forRefState,
        floatType=floatType,
        bUseSharedMemory=UseSharedMemory)

    ## no cropping for prediction: the box covers the longest protein in the batch
    longest = max(d['seqLen'] for d in data)
    onebatch.append(np.array([0, 0, longest, longest]).astype(np.int32))

    return onebatch
def PrepareInput4Validate(data, modelSpecs, floatType=np.float32, forRefState=False, UseSharedMemory=False):
    """Assemble one minibatch for validation, cropping an over-long protein if needed.

    data: a non-empty sequence of dicts, each providing 'seqLen'.
    modelSpecs, floatType, forRefState: forwarded to DataProcessor.AssembleOneBatch.
    UseSharedMemory: forwarded to the assembler as bUseSharedMemory; it also lifts the
        length limit from 800 to the maximum int32 value.

    Returns the assembled batch (a list) with an int32 bounding box appended.
    Prints an error and exits with status 1 when data is empty, or when a protein longer
    than the limit shares the minibatch with other proteins.
    """
    if not data:
        print('ERROR: the input data for PrepareInput4Validate is empty')
        exit(1)

    ## when shared memory is used, there is no explicit limit on the size of an ndarray;
    ## when the real content of a large matrix is passed through Queue, its size shall be <2GB
    maxAllowedLen = np.iinfo(np.int32).max if UseSharedMemory else 800

    maxSeqLen = max(d['seqLen'] for d in data)
    fitsWithoutCrop = maxSeqLen <= maxAllowedLen

    if maxSeqLen > maxAllowedLen and len(data) > 1:
        ## one pre-formatted argument prints the same text as the multi-item print statement
        print('%s %s %s' % ('ERROR: when one validation protein has length > ', maxAllowedLen, ', it shall form a minibatch by itself'))
        exit(1)

    ## determine the bounding boxes. For an over-long protein cut off the top-left submatrix
    ## along the diagonal line so that the top accuracy function in the deep model works correctly
    if fitsWithoutCrop:
        bounds = None
    else:
        bounds = [
            [0, 0, maxAllowedLen, maxAllowedLen] if d['seqLen'] > maxAllowedLen else None
            for d in data
        ]

    onebatch, _ = DataProcessor.AssembleOneBatch(
        data,
        modelSpecs,
        forRefState=forRefState,
        bounds=bounds,
        floatType=floatType,
        bUseSharedMemory=UseSharedMemory)

    if fitsWithoutCrop:
        box = np.array([0, 0, maxSeqLen, maxSeqLen]).astype(np.int32)
    else:
        ## in this case, len(bounds) == 1 and len(data) == 1
        assert len(bounds) == 1
        assert bounds[0] is not None
        box = np.array(bounds[0]).astype(np.int32)

    onebatch.append(box)
    return onebatch
def PrepareInput4Train(data, modelSpecs, floatType=np.float32, forRefState=False, UseSharedMemory=False):
    """Assemble one minibatch for training, cropping a large protein to a sampled box.

    data: a non-empty sequence of dicts, each providing 'seqLen' ('name' is read only
        for the error message).
    modelSpecs: the model specification; modelSpecs['maxbatchSize'] bounds the crop.
    floatType, forRefState: forwarded to DataProcessor.AssembleOneBatch.
    UseSharedMemory: forwarded to the assembler as bUseSharedMemory.

    Returns the assembled batch (a list) with an int32 bounding box appended and, when
    the model is trained by reference loss, a trailing np.int32 flag (-1 if forRefState,
    otherwise 1).

    Prints an error and exits with status 1 when data is empty, or when a protein longer
    than floor(sqrt(maxbatchSize)) shares the minibatch with other proteins.
    """
    if not data:
        print('ERROR: the input data for PrepareInput4Train is empty')
        exit(1)

    allowedLen = int(math.floor(math.sqrt(modelSpecs['maxbatchSize'])))
    maxSeqLen = max(d['seqLen'] for d in data)

    ## reject an invalid minibatch before any box is sampled or any batch is assembled
    if maxSeqLen > allowedLen and len(data) > 1:
        ## one pre-formatted argument prints the same text as the multi-item print statement
        print('%s %s' % ('ERROR: one minibatch has more than one large proteins: ', [d['name'] for d in data]))
        exit(1)

    ## one entry per protein: None means no cropping, otherwise a sampled bounding box
    bounds = []
    for d in data:
        if d['seqLen'] < allowedLen:
            bounds.append(None)
        else:
            bounds.append(SampleBoundingBox((d['seqLen'], d['seqLen']), modelSpecs['maxbatchSize']))

    onebatch, _ = DataProcessor.AssembleOneBatch(
        data,
        modelSpecs,
        forRefState=forRefState,
        bounds=bounds,
        floatType=floatType,
        bUseSharedMemory=UseSharedMemory)

    ## determine the bounding box handed to the model
    ## NOTE(review): a protein with seqLen == allowedLen gets a sampled box above but the
    ## full box here; presumably SampleBoundingBox returns the full box then -- confirm
    if maxSeqLen <= allowedLen:
        box = np.array([0, 0, maxSeqLen, maxSeqLen]).astype(np.int32)
    else:
        ## in this case, len(data) == 1 and len(bounds) == 1
        assert bounds[0] is not None
        box = np.array(bounds[0]).astype(np.int32)
    onebatch.append(box)

    if config.TrainByRefLoss(modelSpecs):
        onebatch.append(np.int32(-1) if forRefState else np.int32(1))

    return onebatch
def PredictMatrixLabels(models, predictors, names, inputFolders, aliFolders=None, tplFolder=None, aliFile=None, tplFile=None, saveFolder=None): if not isinstance(names, (list, tuple)): targetName = names else: targetName = None ##allresults is a nested dictionary, i.e., allresults[proteinName][response] = sum of predicted_prob_matrices ##We predict one prob_matrix by each model for each protein and each response and then average them per protein and response to get the final results ##two different models may share common responses allsequences = dict() allresults = dict() ## the results predicted from the real input numModels = dict( ) ## count the number of models that may predict each response for model, predictor in zip(models, predictors): #predict, inputVariables = BuildPredictor(model) predict, inputVariables = predictor ## load data for each model separately since each model may have a different specification if targetName is None: rawData = LoadProteinData4OneModel(model, names, inputFolders, aliFolders, tplFolder) elif aliFile is not None and tplFile is not None: rawData = LoadOneAlignment4OneModel(model, targetName, inputFolders, aliFile, tplFile) else: rawData = LoadOneProteinData4OneModel(model, targetName, inputFolders, aliFolders, tplFolder) predData = DataProcessor.ExtractFeaturesNLabels( rawData, modelSpecs=model, forTrainValidation=False, returnMode='list') ##make sure the input has the same number of features as the model FeatureUtils.CheckModelNDataConsistency(model, predData) ## check sequence consistency for d in predData: name = d['name'] if not allresults.has_key(name): allresults[name] = dict() numModels[name] = dict() if not allsequences.has_key(name): allsequences[name] = d['sequence'] elif allsequences[name] != d['sequence']: print 'ERROR: inconsistent primary sequence for the same protein in the protein feature files' exit(1) predSeqData = DataProcessor.SplitData2Batches(data=predData, numDataPoints=624, modelSpecs=model) print 
'#predData: ', len(predData), '#batches: ', len(predSeqData) ##for onebatch, names4onebatch in zip(predSeqData, names): for minibatch in predSeqData: onebatch, names4onebatch = DataProcessor.AssembleOneBatch( minibatch, model) input = onebatch[:len(inputVariables)] result = predict(*input) ##result is a 4-d tensor. The last dimension is the concatenation of the predicted prob parameters for all responses in this model assert result.shape[3] == sum([ GetResponseProbDims(response) for response in model['responses'] ]) ## calculate the start and end positions of each response in the last dimension of result dims = [ GetResponseProbDims(response) for response in model['responses'] ] endPositions = np.cumsum(dims) startPositions = endPositions - dims x1d, x2d, x1dmask, x2dmask = input[0:4] seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1) maxSeqLen = x1d.shape[1] for response, start, end in zip(model['responses'], startPositions, endPositions): ## batchres is a batch of result, its ndim=4 ## the 1st dimension of batchres is batchSize, the 2nd and 3rd dimensions are distance/orientation matrix sizes and the 4th is for the predicted probability parameters batchres = result[:, :, :, start:end] ## remove masked positions revised_batchres = [ probMatrix[maxSeqLen - seqLen:, maxSeqLen - seqLen:, :] for probMatrix, seqLen in zip(batchres, seqLens) ] for res4one, name in zip(revised_batchres, names4onebatch): if not allresults[name].has_key(response): allresults[name][response] = res4one numModels[name][response] = np.int32(1) else: ## here we save sum to reduce memory consumption, which could be huge when many deep models are used to predict a large set of proteins allresults[name][response] += res4one numModels[name][response] += np.int32(1) ## calculate the final result, which is the average of predictd prob matrices by all models for the same protein and the same response finalresults = dict() for name, results in allresults.iteritems(): if not 
finalresults.has_key(name): finalresults[name] = dict() ## finalresults has 3 dimensions. for response in results.keys(): finalresults[name][response] = (allresults[name][response] / numModels[name][response]).astype( np.float32) ##make the predicted distance prob matrices symmetric for some reponses. This also slightly improves accuracy. labelName = Response2LabelName(response) if config.IsSymmetricLabel(labelName): finalresults[name][response] = ( finalresults[name][response] + np.transpose(finalresults[name][response], (1, 0, 2))) / 2. ## convert predicted distance probability matrix into contact matrix predictedContactMatrices = DeriveContactMatrix(finalresults) ## collect the average label distributions and weight matrix finalLabelWeights, finalLabelDistributions = CollectLabelWeightNDistribution( models) ##write all the results here ## for each protein, we have a output file saving a tuple (name, sequence, predicted distance matrix, predicted contact matrix, labelWeight, labelDistribution) for name, results in finalresults.iteritems(): savefilename = name + '.predictedDistMatrix.pkl' if saveFolder is not None: savefilename = os.path.join(saveFolder, savefilename) if targetName is not None: originalName = targetName else: for n in names: if name.startswith(n): originalName = n break with open(savefilename, 'wb') as fh: #cPickle.dump( (name, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions), fh, protocol=cPickle.HIGHEST_PROTOCOL) cPickle.dump((originalName, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions), fh, protocol=cPickle.HIGHEST_PROTOCOL) return (predictedContactMatrices, allsequences) """