def TrainByOneBatch(batch, train, modelSpecs, forRefState=False):
    """Load one minibatch of real protein data and run a single training step.

    batch is a list of protein locations, so the real data is loaded here.
    Returns the (loss, errors, param_L2) triple produced by the compiled
    theano training function.
    """
    minibatch = DataProcessor.LoadRealData(batch, modelSpecs)

    ## verify the loaded data has the same input dimension as the model specification
    FeatureUtils.CheckModelNDataConsistency(modelSpecs, minibatch)

    onebatch, names4onebatch = DataProcessor.AssembleOneBatch(
        minibatch, modelSpecs, forRefState=forRefState)
    x1d, x2d, x1dmask, x2dmask = onebatch[0:4]

    ## crop a large protein to cope with limited GPU memory; sequential and
    ## embedding features are cropped inside the theano model itself, based
    ## upon the bounding box
    box = SampleBoundingBox((x2d.shape[1], x2d.shape[2]),
                            modelSpecs['maxbatchSize'])
    top, left, bottom, right = box[0], box[1], box[2], box[3]

    ## x1d and x1dmask are passed through uncropped (the model crops them)
    croppedX2d = x2d[:, top:bottom, left:right, :]
    croppedX2dMask = x2dmask[:, top:x2dmask.shape[1], left:right]

    modelInputs = [x1d, croppedX2d, x1dmask, croppedX2dMask]

    ## if embedding is used, it sits at index 4 of the assembled batch and is
    ## also cropped by the model itself
    if config.EmbeddingUsed(modelSpecs):
        modelInputs.append(onebatch[4])
        groundTruths = onebatch[5:]
    else:
        groundTruths = onebatch[4:]

    ## crop the ground-truth and weight matrices to the same bounding box
    for mat in groundTruths:
        if len(mat.shape) == 3:
            modelInputs.append(mat[:, top:bottom, left:right])
        else:
            modelInputs.append(mat[:, top:bottom, left:right, :])

    ## the bounding box itself is an input to the model
    modelInputs.append(box)

    ## when training by reference loss, flag whether this step is for the
    ## reference state (-1) or real data (1)
    if config.TrainByRefLoss(modelSpecs):
        modelInputs.append(np.int32(-1) if forRefState else np.int32(1))

    train_loss, train_errors, param_L2 = train(*modelInputs)
    return train_loss, train_errors, param_L2
def BuildModel(modelSpecs, forTrain=True):
    """Create the distance predictor network together with its theano input
    variables.

    Returns (distancePredictor, x, y, xmask, ymask, xem, labelList, weightList).
    For prediction (forTrain=False), labelList and weightList are empty.
    """
    rng = np.random.RandomState()

    ## x holds sequential features, y holds matrix (pairwise) features
    x = T.tensor3('x')
    y = T.tensor4('y')

    ## the corresponding masks for x and y
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')

    ## the embedding input is only created when the model uses one
    xem = None
    predictorKwargs = dict(seqInput=x, matrixInput=y, mask_seq=xmask,
                           mask_matrix=ymask, modelSpecs=modelSpecs)
    if config.EmbeddingUsed(modelSpecs):
        xem = T.tensor3('xem')
        predictorKwargs['embedInput'] = xem
    distancePredictor = ResNet4DistMatrix(rng, **predictorKwargs)

    ## labelList holds one label tensor per response, each shaped
    ## (batchSize, seqLen, seqLen) or (batchSize, seqLen, seqLen, valueDims[response])
    labelList = []
    if forTrain:
        ## the model is being built for training, so label variables are needed
        for response in modelSpecs['responses']:
            labelType = Response2LabelType(response)
            rValDims = config.responseValueDims[labelType]
            if labelType.startswith('Discrete'):
                ## wtensor is for 16-bit integers; a 4-d tensor is used when
                ## one response is a vector
                maker = T.wtensor4 if rValDims > 1 else T.wtensor3
            else:
                maker = T.tensor4 if rValDims > 1 else T.tensor3
            labelList.append(maker('Tlabel4' + response))

    ## weightList holds one label-weight tensor per response, each shaped
    ## (batchSize, seqLen, seqLen)
    weightList = []
    if len(labelList) > 0 and modelSpecs['UseSampleWeight']:
        weightList = [T.tensor3('Tweight4' + response)
                      for response in modelSpecs['responses']]

    ## for prediction, both labelList and weightList are empty
    return distancePredictor, x, y, xmask, ymask, xem, labelList, weightList
def ExtractFeaturesNLabels(data, modelSpecs, forTrainValidation=True, returnMode='array'):
    """Collect input features and (optionally) labels for every protein in data.

    Each protein contributes sequential and pairwise features as input and,
    when forTrainValidation is True, a distance matrix as label.
    Returns a list of per-protein feature dicts.
    """
    proteinFeatures = []
    counter = 0
    for d in data:
        oneprotein = dict()
        oneprotein['name'] = d['name']

        ## convert the primary sequence to a one-hot encoding
        oneHotEncoding = SeqOneHotEncoding(d['sequence'])

        ## prepare features for embedding. Currently we may embed a pair of
        ## residues or a pair of residue+secondary structure
        if config.EmbeddingUsed(modelSpecs):
            ## FIX: dict.has_key() was removed in Python 3; use the 'in'
            ## operator, consistent with LoadDistanceFeatures in this file
            if 'Seq+SS' in modelSpecs['seq2matrixMode']:
                embedFeature = RowWiseOuterProduct(oneHotEncoding, d['SS3'])
            else:
                embedFeature = oneHotEncoding
            oneprotein['embedFeatures'] = embedFeature

        seqFeature = CollectSequentialFeatures(d, modelSpecs, oneHotEncoding,
                                               returnMode=returnMode)
        matrixFeature, matrixFeature_nomean = CollectMatrixFeatures(
            d, modelSpecs, returnMode=returnMode)

        oneprotein['sequence'] = d['sequence']
        oneprotein['seqLen'] = len(d['sequence'])
        oneprotein['seqFeatures'] = seqFeature
        oneprotein['matrixFeatures'] = matrixFeature
        oneprotein['matrixFeatures_nomean'] = matrixFeature_nomean

        if forTrainValidation:
            oneprotein['atomLabelMatrix'] = LabelUtils.CollectLabels(d, modelSpecs)

        ## at this point, finish collecting features and labels for one protein
        proteinFeatures.append(oneprotein)
        counter += 1
        ## NOTE(review): progress is reported at counter 100, 600, 1100, ...,
        ## while LoadDistanceFeatures uses (counter % 500 == 1); left as-is
        if (counter % 500 == 100):
            ## FIX: Python 2 print statement -> print() function, consistent
            ## with the rest of the file
            print('assembled features and labels for ', counter, ' proteins.')

    return proteinFeatures
def LoadDistanceFeatures(files=None, modelSpecs=None, forTrainValidation=True):
    """Load raw pickled protein data and assemble features and labels.

    Each protein contributes sequential and pairwise features as input and a
    distance matrix as label (the latter required when forTrainValidation is
    True). Returns a list of per-protein feature dicts.

    Exits the process (exit(-1)) on any missing file, feature, or label.
    """
    if files is None or len(files) == 0:
        print('the feature file is empty')
        exit(-1)

    ## load and concatenate the protein lists from all feature files
    fhs = [open(file, 'rb') for file in files]
    data = sum([cPickle.load(fh, encoding='latin1') for fh in fhs], [])
    [fh.close() for fh in fhs]

    proteinFeatures = []
    counter = 0

    for d in data:
        oneprotein = dict()
        oneprotein['name'] = d['name']

        ## convert the primary sequence to a one-hot encoding
        oneHotEncoding = config.SeqOneHotEncoding(d['sequence'])

        ## prepare features for embedding. Currently we may embed a pair of
        ## residues or a pair of residue+secondary structure
        if config.EmbeddingUsed(modelSpecs):
            if 'Seq+SS' in modelSpecs['seq2matrixMode']:
                embedFeature = RowWiseOuterProduct(oneHotEncoding, d['SS3'])
            else:
                embedFeature = oneHotEncoding
            oneprotein['embedFeatures'] = embedFeature

        ## collecting sequential features...
        seqMatrices = [oneHotEncoding]

        ## 3-state secondary structure shall always be placed before the other features, why?
        if 'UseSS' in modelSpecs and (modelSpecs['UseSS'] is True):
            seqMatrices.append(d['SS3'])

        if 'UseACC' in modelSpecs and (modelSpecs['UseACC'] is True):
            seqMatrices.append(d['ACC'])

        if 'UsePSSM' in modelSpecs and (modelSpecs['UsePSSM'] is True):
            seqMatrices.append(d['PSSM'])

        if 'UseDisorder' in modelSpecs and modelSpecs['UseDisorder'] is True:
            seqMatrices.append(d['DISO'])

        ## membrane protein specific features
        useMPSpecificFeatures = 'UseMPSpecificFeatures' in modelSpecs and (
            modelSpecs['UseMPSpecificFeatures'] is True)
        if useMPSpecificFeatures:
            if 'MemAcc' in d:
                seqMatrices.append(d['MemAcc'])
            else:
                print('The data does not have a feature called MemAcc')
                exit(-1)
            if 'MemTopo' in d:
                seqMatrices.append(d['MemTopo'])
            else:
                print('The data does not have a feature called MemTopo')
                exit(-1)

        ## Add sequence-template similarity score here. This is used to predict
        ## distance matrix from a sequence-template alignment; mainly used for
        ## homology modeling
        if 'UseTemplate' in modelSpecs and modelSpecs['UseTemplate']:
            #print 'Using template similarity score...'
            if 'tplSimScore' not in d:
                print(
                    'the data has no key tplSimScore, which is needed since you specify to use template information'
                )
                exit(-1)
            if d['tplSimScore'].shape[1] != 11:
                print(
                    'The number of features for query-template similarity shall be equal to 11'
                )
                exit(-1)
            seqMatrices.append(d['tplSimScore'])

        seqFeature = np.concatenate(seqMatrices, axis=1).astype(np.float32)

        ## collecting pairwise features...
        pairfeatures = []

        ## one specific location feature, i.e., posFeature[i, j]=min(1, abs(i-j)/30.0)
        posFeature = LocationFeature(d)
        pairfeatures.append(posFeature)

        cbrtFeature = CubeRootFeature(d)
        pairfeatures.append(cbrtFeature)

        if 'UseCCM' in modelSpecs and (modelSpecs['UseCCM'] is True):
            if 'ccmpredZ' not in d:
                print('Something must be wrong. The data for protein ',
                      d['name'],
                      ' does not have the normalized ccmpred feature!')
                exit(-1)
            pairfeatures.append(d['ccmpredZ'])

        if modelSpecs['UsePSICOV'] is True:
            pairfeatures.append(d['psicovZ'])

        if 'UseOtherPairs' in modelSpecs and (modelSpecs['UseOtherPairs'] is True):
            pairfeatures.append(d['OtherPairs'])

        ## add template-related distance matrix. This code needs modification later
        ## somewhere we shall also write code to add template-related sequential
        ## features such as secondary structure?
        if 'UseTemplate' in modelSpecs and modelSpecs['UseTemplate']:
            #print 'Using template distance matrix...'
            if 'tplDistMatrix' not in d:
                print(
                    'the data for ', d['name'],
                    ' has no tplDistMatrix, which is needed since you specify to use template information'
                )
                exit(-1)

            ## Check to make sure that we use exactly the same set of inter-atom
            ## distance information from templates. Currently we do not use HB
            ## and Beta information from template
            apts = d['tplDistMatrix'].keys()
            assert (set(apts) == set(config.allAtomPairTypes))
            ##assert ( set(apts) == set(config.allAtomPairTypes) or set(apts)==set(config.allLabelNames) )

            tmpPairFeatures = dict()
            for apt, tplDistMatrix in d['tplDistMatrix'].items():
                ## use one flagMatrix to indicate which entries are invalid (due
                ## to gaps or disorder) since they shall be same regardless of
                ## atom pair type
                if apt == 'CaCa':
                    flagMatrix = np.zeros_like(tplDistMatrix)
                    np.putmask(flagMatrix, tplDistMatrix < 0, 1)
                    pairfeatures.append(flagMatrix)

                ## clip distances below 3.5 to 3.5, then map invalid (negative)
                ## entries to 50 so they end up with a weak strength 3.5/50
                strengthMatrix = np.copy(tplDistMatrix)
                np.putmask(strengthMatrix, tplDistMatrix < 3.5, 3.5)
                np.putmask(strengthMatrix, tplDistMatrix < -0.01, 50)
                strengthMatrix = 3.5 / strengthMatrix

                if config.InTPLMemorySaveMode(modelSpecs):
                    tmpPairFeatures[apt] = [strengthMatrix]
                else:
                    tmpPairFeatures[apt] = [
                        strengthMatrix,
                        np.square(strengthMatrix)
                    ]

            ## here we add the tmpPairFeatures to pairfeatures in a fixed order.
            ## This can avoid errors introduced by different ordering of keys in
            ## a python dict() structure; python of different versions may have
            ## different ordering of keys in dict()?
            pairfeatures.extend(tmpPairFeatures['CbCb'])
            pairfeatures.extend(tmpPairFeatures['CgCg'])
            pairfeatures.extend(tmpPairFeatures['CaCg'])
            pairfeatures.extend(tmpPairFeatures['CaCa'])
            pairfeatures.extend(tmpPairFeatures['NO'])

        if config.InTPLMemorySaveMode(modelSpecs):
            matrixFeature = np.dstack(tuple(pairfeatures)).astype(np.float32)
        else:
            matrixFeature = np.dstack(tuple(pairfeatures))
        #print 'matrixFeature.shape: ', matrixFeature.shape

        oneprotein['sequence'] = d['sequence']
        oneprotein['seqLen'] = seqFeature.shape[0]
        oneprotein['seqFeatures'] = seqFeature
        oneprotein['matrixFeatures'] = matrixFeature

        ## collecting labels...
        if 'atomDistMatrix' in d:
            atomDistMatrix = d['atomDistMatrix']
            oneprotein['atomLabelMatrix'] = dict()

            for response in modelSpecs['responses']:
                responseName = Response2LabelName(response)
                labelType = Response2LabelType(response)

                if responseName not in atomDistMatrix:
                    print('In the raw feature data, ', d['name'],
                          ' does not have matrix for ', responseName)
                    exit(-1)

                ## atomDistMatrix is the raw data, so it does not have
                ## information about labelType
                distm = atomDistMatrix[responseName]

                if labelType.startswith('Discrete'):
                    subType = labelType[len('Discrete'):]

                    ## no need to discretize for HB and Beta-Pairing since they
                    ## are binary matrices
                    if responseName.startswith(
                            'HB') or responseName.startswith('Beta'):
                        oneprotein['atomLabelMatrix'][response] = distm
                    else:
                        labelMatrix, _, _ = DistanceUtils.DiscretizeDistMatrix(
                            distm, config.distCutoffs[subType],
                            subType.endswith('Plus'))
                        oneprotein['atomLabelMatrix'][response] = labelMatrix
                elif labelType.startswith('LogNormal'):
                    labelMatrix = DistanceUtils.LogDistMatrix(distm)
                    oneprotein['atomLabelMatrix'][response] = labelMatrix
                elif labelType.startswith('Normal'):
                    oneprotein['atomLabelMatrix'][response] = distm
                else:
                    ## FIX: was 'res', an undefined name that would raise
                    ## NameError when this branch is reached
                    print('unsupported response: ', response)
                    exit(-1)
        elif forTrainValidation:
            print(
                'atomic distance matrix is needed for the training and validation data'
            )
            exit(-1)

        ## at this point, finish collecting features and labels for one protein
        proteinFeatures.append(oneprotein)
        counter += 1
        if (counter % 500 == 1):
            print('assembled features and labels for ', counter, ' proteins.')

    return proteinFeatures
def BuildModel(modelSpecs, forTrain=True):
    """Create the distance predictor network and its theano input variables.

    When forTrain is True, returns (distancePredictor, x, y, xmask, ymask,
    xem, labelList, weightList, box, trainByRefLoss); otherwise only
    (distancePredictor, x, y, xmask, ymask, xem).
    """
    rng = np.random.RandomState()

    ## x holds sequential features, y holds matrix (pairwise) features
    x = T.tensor3('x')
    y = T.tensor4('y')

    ## masks for x and y, respectively
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')

    ## the embedding input is only created when the model uses one
    xem = T.tensor3('xem') if config.EmbeddingUsed(modelSpecs) else None

    ## bounding box for the crop of a big protein distance matrix; this box
    ## allows a crop at any position
    box = T.ivector('boundingbox') if forTrain else None

    ## trainByRefLoss can be either 1 or -1. When this variable exists, the
    ## model is trained using both reference loss and the loss of real data
    trainByRefLoss = None
    if forTrain and config.TrainByRefLoss(modelSpecs):
        trainByRefLoss = T.iscalar('trainByRefLoss')

    distancePredictor = ResNet4DistMatrix(rng,
                                          seqInput=x,
                                          matrixInput=y,
                                          mask_seq=xmask,
                                          mask_matrix=ymask,
                                          embedInput=xem,
                                          boundingbox=box,
                                          modelSpecs=modelSpecs)

    ## labelList holds one label tensor per response, each shaped
    ## (batchSize, seqLen, seqLen) or (batchSize, seqLen, seqLen, valueDims[response])
    labelList = []
    if forTrain:
        ## the model is being built for training, so label variables are needed
        for response in modelSpecs['responses']:
            labelType = Response2LabelType(response)
            rValDims = GetResponseValueDims(response)
            if labelType.startswith('Discrete'):
                ## wtensor is for 16-bit integers; a 4-d tensor is used when
                ## one response is a vector
                maker = T.wtensor4 if rValDims > 1 else T.wtensor3
            else:
                maker = T.tensor4 if rValDims > 1 else T.tensor3
            labelList.append(maker('Tlabel4' + response))

    ## weightList holds one label-weight tensor per response, each shaped
    ## (batchSize, seqLen, seqLen)
    weightList = []
    if len(labelList) > 0 and config.UseSampleWeight(modelSpecs):
        weightList = [T.tensor3('Tweight4' + response)
                      for response in modelSpecs['responses']]

    ## for prediction, both labelList and weightList are empty
    if forTrain:
        return distancePredictor, x, y, xmask, ymask, xem, labelList, weightList, box, trainByRefLoss
    else:
        return distancePredictor, x, y, xmask, ymask, xem