def ValidateAllData(validData, validate, modelSpecs, forRefState=False):
    """Run `validate` on every batch of validData and return averaged results.

    validData: iterable of batches; each batch is converted with
        ToFloatX(ToNonSharedArray(batch)) and then unpacked into `validate`.
    validate: callable whose result is indexed as (loss, error[, accuracy]).
    modelSpecs: model specification; 'responses' is read only when
        config.UseSampleWeight(modelSpecs) is true.
    forRefState: accepted but not used by this function.

    Returns (avgLoss, avgErr) or, when at least one batch reported an
    accuracy, (avgLoss, avgErr, avgAcc). All averages are taken over batches
    (axis 0).
    """
    weighted = config.UseSampleWeight(modelSpecs)

    batchLosses, batchErrs, batchAccs, proteinCounts = [], [], [], []
    lossWeights = [] if weighted else None
    errWeights = [] if weighted else None

    for batch in validData:
        fullInput = ToFloatX(ToNonSharedArray(batch))
        ## everything except the last element of the converted batch
        trimmed = fullInput[:-1]

        result = validate(*fullInput)
        batchLosses.append(result[0])
        batchErrs.append(result[1])
        if len(result) > 2:
            batchAccs.append(result[2])

        ## the number of proteins in one batch
        proteinCounts.append(trimmed[0].shape[0])

        if not weighted:
            continue

        ## the weight tensors are the last len(responses) entries of the trimmed batch
        responses = modelSpecs['responses']
        weightTensors = trimmed[-len(responses):]
        sumsAndDims = [(np.sum(w), GetResponseValueDims(res))
                       for res, w in zip(responses, weightTensors)]
        lossWeights.append([wSum for wSum, _ in sumsAndDims])
        ## each response contributes its weight sum once per value dimension
        errWeights.append([wSum for wSum, dims in sumsAndDims for _ in range(dims)])

    ## The loss and err is normalized by the weight of each minibatch. This is equivalent to minimize loss and err per residue pair
    ## The top accuracy is not normalized by the weight of a minibatch, i.e., we want to maximize per-protein accuracy.
    avgLoss = np.average(batchLosses, axis=0, weights=lossWeights)
    avgErr = np.average(batchErrs, axis=0, weights=errWeights)
    if batchAccs and proteinCounts:
        return avgLoss, avgErr, np.average(batchAccs, axis=0, weights=proteinCounts)
    return avgLoss, avgErr
def TrainDataLoader(sharedQ, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
    """Sample training proteins forever, build minibatches and put them on sharedQ.

    sharedQ: queue-like object; each finished batch is handed to sharedQ.put().
    trainMetaData: passed to DataProcessor.SampleProteinInfo() once per pass.
    modelSpecs: model specification; 'minibatchSize' is read for batching.
    assembleData: when true, each batch is passed through PrepareInput4Train
        (with floatType=np.float16) before being queued.
    UseSharedMemory: forwarded to PrepareInput4Train.

    This function contains no exit condition; it loops until the process is
    stopped from outside.
    """
    ## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment
    ## but it can only have one set of label matrices, so labels (and label weights) are cached in RAM by protein name
    labelCache = {}
    weightCache = {}

    while True:
        sampled = DataProcessor.SampleProteinInfo(trainMetaData)
        ## NOTE: numOriginals is not used further in this function
        numOriginals = len(sampled)

        batches = DataProcessor.SplitData2Batches(
            sampled,
            numDataPoints=modelSpecs['minibatchSize'],
            modelSpecs=modelSpecs)
        random.shuffle(batches)

        for batch in batches:
            data = []
            for protein in batch:
                name = protein['name']

                if name not in labelCache:
                    ## first time this protein is seen: load features together with labels
                    d = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list')
                    assert 'atomLabelMatrix' in d
                    labelCache[name] = d['atomLabelMatrix']
                else:
                    ## label is already in the pool, so only the features are loaded
                    d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
                    d['atomLabelMatrix'] = labelCache[name]

                if config.UseSampleWeight(modelSpecs):
                    if name not in weightCache:
                        weightCache[name] = LabelUtils.CalcLabelWeightMatrix(
                            LabelMatrix=d['atomLabelMatrix'],
                            modelSpecs=modelSpecs,
                            floatType=np.float16)
                    d['labelWeightMatrix'] = weightCache[name]

                data.append(d)

            FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
            if assembleData:
                data = PrepareInput4Train(data, modelSpecs, floatType=np.float16,
                                          UseSharedMemory=UseSharedMemory)

            sharedQ.put(data)
def AddLabel2OneBatch(names, batch, modelSpecs, sharedLabelPool, sharedLabelWeightPool, floatType=theano.config.floatX): numSeqs = len(names) for name in names: if (not sharedLabelPool.has_key(name)) or ( not sharedLabelWeightPool.has_key(name)): print 'the label or label weight matrix does not exist for protein ', name exit(1) seqLens = [sharedLabelWeightPool[name].shape[0] for name in names] ## get the boundingbox for this batch if not config.TrainByRefLoss(modelSpecs): box = batch[-1] else: box = batch[-2] top, left, bottom, right = box assert bottom - top == right - left boxsize = bottom - top if boxsize < max(seqLens) and numSeqs > 1: ## make sure that there is only one protein in this batch print 'ERROR: when one batch has a large protein, it can only have one protein' exit(1) ## we crop pairwise labels at this step to save memory and computational time maxMatrixSize = min(boxsize, max(seqLens)) ## Y shall be a list of 2D or 3D matrices, each for one response Y = [] for response in modelSpecs['responses']: labelName, labelType, _ = ParseResponse(response) dataType = np.int16 if not config.IsDiscreteLabel(labelType): dataType = floatType rValDims = GetResponseValueDims(response) if rValDims == 1: y = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize), dtype=dataType) Y.append(y) else: y = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize, rValDims), dtype=dataType) Y.append(y) ## when Y is empty, weight is useless. 
So When Y is empty, weight shall also be empty weightMatrix = [] if bool(Y) and config.UseSampleWeight(modelSpecs): weightMatrix = [ np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize), dtype=floatType) ] * len(modelSpecs['responses']) for j, name, seqLen in zip(range(len(names)), names, seqLens): ## we align all matrices in the bottom/right corner ## posInX and posInY are the starting position of one protein in the final output tensor ## here X and Y refer to x-axis and y-axis posInX = -min(boxsize, seqLen) posInY = -min(boxsize, seqLen) for y, response in zip(Y, modelSpecs['responses']): if boxsize < seqLen: tmp = sharedLabelPool[name][response][top:bottom, left:right] else: tmp = sharedLabelPool[name][response] if len(y.shape) == 3: y[j, posInX:, posInY:] = tmp else: y[j, posInX:, posInY:, ] = tmp labelWeightMatrix = sharedLabelWeightPool[name] for w, response in zip(weightMatrix, modelSpecs['responses']): if boxsize < seqLen: w[j, posInX:, posInY:] = labelWeightMatrix[response][top:bottom, left:right] else: w[j, posInX:, posInY:] = labelWeightMatrix[response] ## the input batch contains bounding box tail = 1 ## check to see if the input batch contains one flag for RefState if config.TrainByRefLoss(modelSpecs): tail += 1 newbatch = batch[:-tail] newbatch.extend(Y) newbatch.extend(weightMatrix) newbatch.extend(batch[-tail:]) return newbatch
def AssembleOneBatch(data, modelSpecs, forRefState=False, bounds=None, floatType=theano.config.floatX, bUseSharedMemory=False): if not data: print 'WARNING: the list of data is empty' return None numSeqs = len(data) seqLens = [d['seqLen'] for d in data] names = [d['name'] for d in data] ## use maxSeqLen and minSeqLen for sequential features ## we do not crop sequential features at this step since the theano deep model will do so after 1D convolution operation maxSeqLen = max(seqLens) minSeqLen = min(seqLens) #print 'maxSeqLen= ', maxSeqLen, 'minSeqLen= ', minSeqLen numSeqFeatures = FeatureUtils.DetermineNumSeqFeatures( data[0]['seqFeatures']) X1d = np.zeros(shape=(numSeqs, maxSeqLen, numSeqFeatures), dtype=floatType) numMatrixFeatures = FeatureUtils.DetermineNumMatrixFeatures( data[0]['matrixFeatures']) + FeatureUtils.DetermineNumMatrixFeatures( data[0]['matrixFeatures_nomean']) ## we use maxMatrixSize and minMatrixSize for pairwise features ## we crop pairwise features at this step to save memory and computational time minMatrixSize, maxMatrixSize = CalcMinMaxMatrixSize(bounds, seqLens) if bUseSharedMemory: shmX2d = SharedNDArray( (numSeqs, maxMatrixSize, maxMatrixSize, numMatrixFeatures), dtype=floatType, name='/RaptorX-' + str(os.getppid()) + '-X2d-' + randomString(6)) X2d = shmX2d.array X2d[:] = 0 else: X2d = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize, numMatrixFeatures), dtype=floatType) X1dem = None if data[0].has_key('embedFeatures'): numEmbedFeatures = data[0]['embedFeatures'].shape[1] X1dem = np.zeros(shape=(numSeqs, maxSeqLen, numEmbedFeatures), dtype=floatType) ## Y shall be a list of 2D or 3D matrices, each for one response Y = [] if data[0].has_key('atomLabelMatrix'): for response in modelSpecs['responses']: labelName, labelType, _ = ParseResponse(response) dataType = np.int16 if not config.IsDiscreteLabel(labelType): dataType = floatType rValDims = GetResponseValueDims(response) if rValDims == 1: y = np.zeros(shape=(numSeqs, maxMatrixSize, 
maxMatrixSize), dtype=dataType) Y.append(y) else: y = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize, rValDims), dtype=dataType) Y.append(y) ## when Y is empty, weight is useless. So When Y is None, weight shall also be None weightMatrix = [] if bool(Y) and config.UseSampleWeight(modelSpecs): weightMatrix = [ np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize), dtype=floatType) ] * len(modelSpecs['responses']) ## for mask. we do not used shared ndarray for them since they are small M1d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen), dtype=np.int8) M2d = np.zeros(shape=(numSeqs, maxMatrixSize - minMatrixSize, maxMatrixSize), dtype=np.int8) if bounds is not None: boxes = bounds else: boxes = [None] * len(data) for j, d, box in zip(range(len(data)), data, boxes): seqLen = d['seqLen'] ## posInSeq, posInX and posInY are the starting position of one protein in the final output tensor posInSeq = -seqLen ## here X and Y refer to x-axis and y-axis if box is not None: top, left, bottom, right = box posInX = -(bottom - top) posInY = -(right - left) else: posInX = -seqLen posInY = -seqLen if forRefState: ## this code needs reexamination, it may not be correct when d['seqFeatures']/d['matrixFeatures'] is represented as a list of arrays instead of a single array X1d[j, posInSeq:, :] = np.array( [modelSpecs['seqFeatures_expected']] * seqLen).reshape( (seqLen, -1)) tmp = [modelSpecs['matrixFeatures_expected']] * (seqLen * seqLen) tmp2 = np.array(tmp).reshape((seqLen, seqLen, -1)) tmp3 = np.concatenate((tmp2, d['matrixFeatures_nomean']), axis=2) if box is not None: X2d[j, posInX:, posInY:, :] = tmp3[top:bottom, left:right, ] else: X2d[j, posInX:, posInY:, :] = tmp3 else: if isinstance(d['seqFeatures'], np.ndarray): X1d[j, posInSeq:, :] = d['seqFeatures'] else: startPos = 0 for f in d['seqFeatures']: if len(f.shape) == 1: X1d[j, posInSeq:, startPos:startPos + 1] = f[:, np.newaxis] startPos += 1 elif len(f.shape) == 2: X1d[j, posInSeq:, startPos:startPos + f.shape[1]] = 
f startPos = startPos + f.shape[1] else: print 'wrong shape in sequential feature: ', f.shape exit(1) # add 2D features in matrixFeatures to holder staring from the start position # holder is a 3D array and start is the starting position in the 3rd dimension def Add2DFeatures(matrixFeatures, holder, start): if isinstance(matrixFeatures, np.ndarray): features = [matrixFeatures] else: features = matrixFeatures startPos = start #for f in matrixFeatures: for f in features: if len(f.shape) == 2: endPos = startPos + 1 if box is None: holder[:, :, startPos:endPos] = f[:, :, np.newaxis] else: holder[:, :, startPos:endPos] = f[top:bottom, left:right, np.newaxis] elif len(f.shape) == 3: endPos = startPos + f.shape[2] if box is None: holder[:, :, startPos:endPos] = f else: holder[:, :, startPos:endPos] = f[top:bottom, left:right, :] else: print 'wrong shape in matrixFeatures: ', f.shape exit(1) startPos = endPos return endPos end = Add2DFeatures(d['matrixFeatures'], X2d[j, posInX:, posInY:, :], 0) Add2DFeatures(d['matrixFeatures_nomean'], X2d[j, posInX:, posInY:, :], end) M1d[j, posInSeq:].fill(1) M2d[j, posInX:, posInY:].fill(1) if X1dem is not None: ## embed feature is always represented as a single array, so the code shall be correct if forRefState: X1dem[j, posInSeq:, :] = np.array( [modelSpecs['embedFeatures_expected']] * seqLen).reshape( (seqLen, -1)) else: X1dem[j, posInSeq:, :] = d['embedFeatures'] for y, response in zip(Y, modelSpecs['responses']): if box is not None: tmp = d['atomLabelMatrix'][response][top:bottom, left:right] else: tmp = d['atomLabelMatrix'][response] if len(y.shape) == 3: y[j, posInX:, posInY:] = tmp else: y[j, posInX:, posInY:, ] = tmp if bool(weightMatrix): if d.has_key('labelWeightMatrix'): labelWeightMatrix = d['labelWeightMatrix'] else: labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix( d['atomLabelMatrix'], modelSpecs, floatType=floatType) for w, response in zip(weightMatrix, modelSpecs['responses']): if box is not None: w[j, posInX:, 
posInY:] = labelWeightMatrix[response][top:bottom, left:right] else: w[j, posInX:, posInY:] = labelWeightMatrix[response] if bUseSharedMemory: onebatch = [X1d, shmX2d, M1d, M2d] else: onebatch = [X1d, X2d, M1d, M2d] if X1dem is not None: onebatch.append(X1dem) onebatch.extend(Y) onebatch.extend(weightMatrix) return onebatch, names
def BuildModel(modelSpecs, forTrain=True):
    """Create the symbolic inputs and the ResNet4DistMatrix predictor.

    modelSpecs: model specification; 'responses' is read when forTrain is true.
    forTrain: when true, a bounding-box variable, label variables and (if sample
        weights are in use) weight variables are created as well.

    Returns, for training:
        (distancePredictor, x, y, xmask, ymask, xem, labelList, weightList, box, trainByRefLoss)
    and for prediction:
        (distancePredictor, x, y, xmask, ymask, xem)
    xem is None unless config.EmbeddingUsed(modelSpecs) is true; trainByRefLoss
    is None unless config.TrainByRefLoss(modelSpecs) is true.
    """
    rng = np.random.RandomState()

    ## x is for sequential features and y for matrix (or pairwise) features
    x, y = T.tensor3('x'), T.tensor4('y')

    ## mask for x and y, respectively
    xmask, ymask = T.bmatrix('xmask'), T.btensor3('ymask')

    xem = T.tensor3('xem') if config.EmbeddingUsed(modelSpecs) else None

    ## bounding box for crop of a big protein distance matrix. This box allows crop at any position.
    box = T.ivector('boundingbox') if forTrain else None

    ## trainByRefLoss can be either 1 or -1. When this variable exists, we train the model using both reference loss and the loss of real data
    trainByRefLoss = None
    if forTrain and config.TrainByRefLoss(modelSpecs):
        trainByRefLoss = T.iscalar('trainByRefLoss')

    distancePredictor = ResNet4DistMatrix(
        rng,
        seqInput=x,
        matrixInput=y,
        mask_seq=xmask,
        mask_matrix=ymask,
        embedInput=xem,
        boundingbox=box,
        modelSpecs=modelSpecs)

    ## for prediction, no label or weight variable is needed
    if not forTrain:
        return distancePredictor, x, y, xmask, ymask, xem

    ## labelList is a list of label tensors, each having shape (batchSize, seqLen, seqLen) or (batchSize, seqLen, seqLen, valueDims[response] )
    labelList = []
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        isVector = GetResponseValueDims(response) > 1

        ## if one response is a vector, then we use a 4-d tensor
        if labelType.startswith('Discrete'):
            ## wtensor is for 16bit integer
            makeTensor = T.wtensor4 if isVector else T.wtensor3
        else:
            makeTensor = T.tensor4 if isVector else T.tensor3
        labelList.append(makeTensor('Tlabel4' + response))

    ## weightList is a list of label weight tensors, each having shape (batchSize, seqLen, seqLen)
    weightList = []
    if labelList and config.UseSampleWeight(modelSpecs):
        weightList = [T.tensor3('Tweight4' + response) for response in modelSpecs['responses']]

    return distancePredictor, x, y, xmask, ymask, xem, labelList, weightList, box, trainByRefLoss
def TrainDataLoader2(sharedQ, stopTrainDataLoader, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
    """Load training minibatches onto sharedQ until asked to stop.

    sharedQ: queue-like object; batches go to sharedQ.put() and sharedQ.close()
        is called before returning.
    stopTrainDataLoader: event-like object; loading stops once is_set() is true.
    trainMetaData: passed to DataProcessor.SampleProteinInfo() once per pass.
    modelSpecs: model specification; 'minibatchSize' is read for batching.
    assembleData: when true, each batch is passed through PrepareInput4Train.
    UseSharedMemory: forwarded to PrepareInput4Train.

    Loading also stops when os.getppid() returns 1 (presumably meaning the
    parent process has exited -- confirm against the launcher).
    """
    _, _, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)
    if bUseCCMraw or bUseFullMI or bUseFullCov:
        ## when full coevolution matrices are used, we shall use float16 to save memory
        floatType = np.float16
    else:
        floatType = theano.config.floatX

    def stopRequested():
        return stopTrainDataLoader.is_set() or os.getppid() == 1

    ## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment
    ## but it can only have one set of label matrices, so labels (and label weights) are cached in RAM by protein name
    labelCache = {}
    weightCache = {}

    while not stopRequested():
        sampled = DataProcessor.SampleProteinInfo(trainMetaData)
        ## NOTE: numOriginals is not used further in this function
        numOriginals = len(sampled)

        batches = DataProcessor.SplitData2Batches(
            sampled,
            numDataPoints=modelSpecs['minibatchSize'],
            modelSpecs=modelSpecs)
        random.shuffle(batches)

        for batch in batches:
            ## leave the pass early; the enclosing loop re-checks and terminates
            if stopRequested():
                break

            data = []
            for protein in batch:
                name = protein['name']

                if name not in labelCache:
                    ## first time this protein is seen: load features together with labels
                    d = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list')
                    assert 'atomLabelMatrix' in d
                    labelCache[name] = d['atomLabelMatrix']
                else:
                    ## label is already in the pool, so only the features are loaded
                    d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
                    d['atomLabelMatrix'] = labelCache[name]

                if config.UseSampleWeight(modelSpecs):
                    if name not in weightCache:
                        weightCache[name] = LabelUtils.CalcLabelWeightMatrix(
                            LabelMatrix=d['atomLabelMatrix'],
                            modelSpecs=modelSpecs,
                            floatType=np.float16)
                    d['labelWeightMatrix'] = weightCache[name]

                data.append(d)

            FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
            if assembleData:
                data = PrepareInput4Train(data, modelSpecs, floatType=floatType,
                                          UseSharedMemory=UseSharedMemory)

            sharedQ.put(data)

    print('TrainDataLoader has finished loading data')
    sharedQ.close()