def TrainByOneBatch(batch, train, modelSpecs, forRefState=False):
    """Run one training step on a single minibatch.

    Args:
        batch: a list of protein locations; the real feature data is loaded here.
        train: the compiled training function, called with the assembled inputs.
        modelSpecs: model configuration dict (reads 'maxbatchSize' here).
        forRefState: when True, assemble the batch for the reference state.

    Returns:
        (train_loss, train_errors, param_L2) exactly as produced by train().
    """
    ## batch is a list of protein locations, so we need to load the real data here
    minibatch = DataProcessor.LoadRealData(batch, modelSpecs)

    ## make sure that the data has the same input dimension as the model specification
    FeatureUtils.CheckModelNDataConsistency(modelSpecs, minibatch)

    onebatch, names4onebatch = DataProcessor.AssembleOneBatch(
        minibatch, modelSpecs, forRefState=forRefState)
    x1d, x2d, x1dmask, x2dmask = onebatch[0:4]

    ## crop a large protein to deal with limited GPU memory. For sequential and
    ## embedding features, the theano model itself will crop based upon the bounding box
    bounds = SampleBoundingBox((x2d.shape[1], x2d.shape[2]),
                               modelSpecs['maxbatchSize'])

    ## sequential features and their mask are passed uncropped (the model crops them);
    ## only the pairwise feature tensor and pairwise mask are cropped here
    x1d_new = x1d
    x2d_new = x2d[:, bounds[0]:bounds[2], bounds[1]:bounds[3], :]
    x1dmask_new = x1dmask
    ## NOTE(review): the row upper bound is x2dmask.shape[1], not bounds[2] as for x2d;
    ## presumably the pairwise mask has a different row layout — confirm before changing
    x2dmask_new = x2dmask[:, bounds[0]:x2dmask.shape[1], bounds[1]:bounds[3]]

    ## renamed from 'input' to avoid shadowing the builtin
    inputs = [x1d_new, x2d_new, x1dmask_new, x2dmask_new]

    ## if embedding is used, onebatch[4] holds the embedding feature
    ## (passed uncropped; the model crops it based upon the bounding box)
    if config.EmbeddingUsed(modelSpecs):
        inputs.append(onebatch[4])
        remainings = onebatch[5:]
    else:
        remainings = onebatch[4:]

    ## crop the ground truth and weight matrices to the same bounding box
    for x2d0 in remainings:
        if len(x2d0.shape) == 3:
            inputs.append(x2d0[:, bounds[0]:bounds[2], bounds[1]:bounds[3]])
        else:
            inputs.append(x2d0[:, bounds[0]:bounds[2], bounds[1]:bounds[3], :])

    ## add bounding box to the input list
    inputs.append(bounds)

    ## when training by reference loss, append a flag: -1 for the reference state,
    ## 1 for real data, so the model knows which loss to accumulate
    if config.TrainByRefLoss(modelSpecs):
        inputs.append(np.int32(-1) if forRefState else np.int32(1))

    train_loss, train_errors, param_L2 = train(*inputs)

    return train_loss, train_errors, param_L2
def BuildModel(modelSpecs, forTrain=True):
    """Declare the theano input variables and construct the distance predictor.

    Returns (distancePredictor, x, y, xmask, ymask, xem, labelList, weightList);
    for prediction both labelList and weightList are empty lists.
    """
    rng = np.random.RandomState()

    ## x carries sequential features, y carries matrix (pairwise) features
    x = T.tensor3('x')
    y = T.tensor4('y')

    ## masks for x and y, respectively
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')

    ## the embedding input exists only when embedding is enabled
    xem = None
    predictorArgs = dict(seqInput=x, matrixInput=y, mask_seq=xmask,
                         mask_matrix=ymask, modelSpecs=modelSpecs)
    if config.EmbeddingUsed(modelSpecs):
        xem = T.tensor3('xem')
        predictorArgs['embedInput'] = xem
    distancePredictor = ResNet4DistMatrix(rng, **predictorArgs)

    ## one label tensor per response, shaped (batchSize, seqLen, seqLen) or
    ## (batchSize, seqLen, seqLen, valueDims[response]); wtensor is 16-bit integer,
    ## used for discrete labels
    labelList = []
    if forTrain:
        ## labels are only defined when the model is used for training
        for response in modelSpecs['responses']:
            labelType = Response2LabelType(response)
            rValDims = config.responseValueDims[labelType]
            if labelType.startswith('Discrete'):
                makeTensor = T.wtensor4 if rValDims > 1 else T.wtensor3
            else:
                makeTensor = T.tensor4 if rValDims > 1 else T.tensor3
            labelList.append(makeTensor('Tlabel4' + response))

    ## one weight tensor of shape (batchSize, seqLen, seqLen) per response
    weightList = []
    if labelList and modelSpecs['UseSampleWeight']:
        weightList = [T.tensor3('Tweight4' + response)
                      for response in modelSpecs['responses']]

    ## for prediction, both labelList and weightList are empty
    return distancePredictor, x, y, xmask, ymask, xem, labelList, weightList
def ExtractFeaturesNLabels(data,
                           modelSpecs,
                           forTrainValidation=True,
                           returnMode='array'):
    """Assemble input features (and, for train/validation, labels) per protein.

    Args:
        data: iterable of dicts, one per protein; each must provide 'name' and
            'sequence' plus whatever features modelSpecs requires (e.g. 'SS3').
        modelSpecs: model configuration dict.
        forTrainValidation: when True, also collect the label matrices.
        returnMode: passed through to the feature collectors.

    Returns:
        a list of dicts with keys 'name', 'sequence', 'seqLen', 'seqFeatures',
        'matrixFeatures', 'matrixFeatures_nomean', optionally 'embedFeatures'
        and 'atomLabelMatrix'.
    """
    ## each protein has sequential and pairwise features as input and distance matrix as label
    proteinFeatures = []
    counter = 0

    for d in data:
        oneprotein = dict()
        oneprotein['name'] = d['name']

        ## convert the primary sequence to a one-hot encoding
        oneHotEncoding = SeqOneHotEncoding(d['sequence'])

        ## prepare features for embedding. Currently we may embed a pair of residues
        ## or a pair of residue+secondary structure
        if config.EmbeddingUsed(modelSpecs):
            ## 'in' replaces the Python2-only dict.has_key(), matching the rest of the file
            if 'Seq+SS' in modelSpecs['seq2matrixMode']:
                embedFeature = RowWiseOuterProduct(oneHotEncoding, d['SS3'])
            else:
                embedFeature = oneHotEncoding
            oneprotein['embedFeatures'] = embedFeature

        seqFeature = CollectSequentialFeatures(d,
                                               modelSpecs,
                                               oneHotEncoding,
                                               returnMode=returnMode)
        matrixFeature, matrixFeature_nomean = CollectMatrixFeatures(
            d, modelSpecs, returnMode=returnMode)

        oneprotein['sequence'] = d['sequence']
        oneprotein['seqLen'] = len(d['sequence'])
        oneprotein['seqFeatures'] = seqFeature
        oneprotein['matrixFeatures'] = matrixFeature
        oneprotein['matrixFeatures_nomean'] = matrixFeature_nomean

        if forTrainValidation:
            oneprotein['atomLabelMatrix'] = LabelUtils.CollectLabels(
                d, modelSpecs)

        ## at this point, finish collecting features and labels for one protein
        proteinFeatures.append(oneprotein)

        counter += 1
        ## NOTE(review): fires at counter = 100, 600, 1100, ...; the sibling loader
        ## uses counter % 500 == 1 — confirm which cadence is intended
        if counter % 500 == 100:
            ## print() function replaces the Python2-only print statement
            print('assembled features and labels for ', counter, ' proteins.')

    return proteinFeatures
## Example #4 (scraping-artifact separator; original text: "Пример #4" / "0" — kept as a comment so the file stays parseable)
def LoadDistanceFeatures(files=None, modelSpecs=None, forTrainValidation=True):
    """Load pickled protein feature files and assemble features and labels.

    Args:
        files: list of pickle file paths; each file holds a list of protein dicts.
        modelSpecs: model configuration dict; its Use* flags select the features.
        forTrainValidation: when True, each protein must provide 'atomDistMatrix'.

    Returns:
        a list of dicts, one per protein, with keys 'name', 'sequence', 'seqLen',
        'seqFeatures', 'matrixFeatures', optionally 'embedFeatures' and
        'atomLabelMatrix'.
    """
    if files is None or len(files) == 0:
        print('the feature file is empty')
        exit(-1)

    fhs = [open(file, 'rb') for file in files]
    data = sum([cPickle.load(fh, encoding='latin1') for fh in fhs], [])
    ## close handles with a plain loop instead of a side-effect list comprehension
    for fh in fhs:
        fh.close()

    ## each protein has sequential and  pairwise features as input and distance matrix as label
    proteinFeatures = []
    counter = 0

    for d in data:
        oneprotein = dict()
        oneprotein['name'] = d['name']

        ## convert the primary sequence to a one-hot encoding
        oneHotEncoding = config.SeqOneHotEncoding(d['sequence'])

        ## prepare features for embedding. Currently we may embed a pair of residues or a pair of residue+secondary structure
        if config.EmbeddingUsed(modelSpecs):
            if 'Seq+SS' in modelSpecs['seq2matrixMode']:
                embedFeature = RowWiseOuterProduct(oneHotEncoding, d['SS3'])
            else:
                embedFeature = oneHotEncoding
            oneprotein['embedFeatures'] = embedFeature

        ##collecting sequential features...
        seqMatrices = [oneHotEncoding]

        ## 3-state secondary structure shall always be placed before the other features, why?
        if 'UseSS' in modelSpecs and (modelSpecs['UseSS'] is True):
            seqMatrices.append(d['SS3'])

        if 'UseACC' in modelSpecs and (modelSpecs['UseACC'] is True):
            seqMatrices.append(d['ACC'])

        if 'UsePSSM' in modelSpecs and (modelSpecs['UsePSSM'] is True):
            seqMatrices.append(d['PSSM'])

        if 'UseDisorder' in modelSpecs and modelSpecs['UseDisorder'] is True:
            seqMatrices.append(d['DISO'])

        ##membrane protein specific features
        useMPSpecificFeatures = 'UseMPSpecificFeatures' in modelSpecs and (
            modelSpecs['UseMPSpecificFeatures'] is True)
        if useMPSpecificFeatures:
            if 'MemAcc' in d:
                seqMatrices.append(d['MemAcc'])
            else:
                print('The data does not have a feature called MemAcc')
                exit(-1)
            if 'MemTopo' in d:
                seqMatrices.append(d['MemTopo'])
            else:
                print('The data does not have a feature called MemTopo')
                exit(-1)

        ## Add sequence-template similarity score here. This is used to predict distance matrix from a sequence-template alignment.
        ## this is mainly used for homology modeling
        if 'UseTemplate' in modelSpecs and modelSpecs['UseTemplate']:
            if 'tplSimScore' not in d:
                print(
                    'the data has no key tplSimScore, which is needed since you specify to use template information'
                )
                exit(-1)
            if d['tplSimScore'].shape[1] != 11:
                print(
                    'The number of features for query-template similarity shall be equal to 11'
                )
                exit(-1)
            seqMatrices.append(d['tplSimScore'])
        seqFeature = np.concatenate(seqMatrices, axis=1).astype(np.float32)

        ##collecting pairwise features...
        pairfeatures = []
        ##add one specific location feature here, i.e., posFeature[i, j]=min(1, abs(i-j)/30.0 )
        posFeature = LocationFeature(d)
        pairfeatures.append(posFeature)

        cbrtFeature = CubeRootFeature(d)
        pairfeatures.append(cbrtFeature)

        if 'UseCCM' in modelSpecs and (modelSpecs['UseCCM'] is True):
            if 'ccmpredZ' not in d:
                print('Something must be wrong. The data for protein ',
                      d['name'],
                      ' does not have the normalized ccmpred feature!')
                exit(-1)
            pairfeatures.append(d['ccmpredZ'])

        ## guarded with 'in' for consistency with the other feature switches above
        if 'UsePSICOV' in modelSpecs and (modelSpecs['UsePSICOV'] is True):
            pairfeatures.append(d['psicovZ'])

        if 'UseOtherPairs' in modelSpecs and (modelSpecs['UseOtherPairs'] is
                                              True):
            pairfeatures.append(d['OtherPairs'])

        ##add template-related distance matrix. This code needs modification later
        ## somewhere we shall also write code to add template-related sequential features such as secondary structure?
        if 'UseTemplate' in modelSpecs and modelSpecs['UseTemplate']:
            if 'tplDistMatrix' not in d:
                print(
                    'the data for ', d['name'],
                    ' has no tplDistMatrix, which is needed since you specify to use template information'
                )
                exit(-1)

            ## Check to make sure that we use exactly the same set of inter-atom distance information from templates
            ## currently we do not use HB and Beta information from template
            apts = d['tplDistMatrix'].keys()
            assert (set(apts) == set(config.allAtomPairTypes))

            tmpPairFeatures = dict()
            for apt, tplDistMatrix in d['tplDistMatrix'].items():
                ##use one flagMatrix to indicate which entries are invalid (due to gaps or disorder) since they shall be same regardless of atom pair type
                if apt == 'CaCa':
                    flagMatrix = np.zeros_like(tplDistMatrix)
                    np.putmask(flagMatrix, tplDistMatrix < 0, 1)
                    pairfeatures.append(flagMatrix)

                ## order matters: clamp all small distances to 3.5 first, then
                ## overwrite invalid (negative) entries with 50 so that after the
                ## division below they map to a weak strength of 3.5/50
                strengthMatrix = np.copy(tplDistMatrix)
                np.putmask(strengthMatrix, tplDistMatrix < 3.5, 3.5)
                np.putmask(strengthMatrix, tplDistMatrix < -0.01, 50)
                strengthMatrix = 3.5 / strengthMatrix

                if config.InTPLMemorySaveMode(modelSpecs):
                    tmpPairFeatures[apt] = [strengthMatrix]
                else:
                    tmpPairFeatures[apt] = [
                        strengthMatrix,
                        np.square(strengthMatrix)
                    ]

            ## here we add the tmpPairFeatures to pairfeatures in a fixed order. This can avoid errors introduced by different ordering of keys in a python dict() structure
            ## python of different versions may have different ordering of keys in dict() ?
            pairfeatures.extend(tmpPairFeatures['CbCb'])
            pairfeatures.extend(tmpPairFeatures['CgCg'])
            pairfeatures.extend(tmpPairFeatures['CaCg'])
            pairfeatures.extend(tmpPairFeatures['CaCa'])
            pairfeatures.extend(tmpPairFeatures['NO'])

        if config.InTPLMemorySaveMode(modelSpecs):
            matrixFeature = np.dstack(tuple(pairfeatures)).astype(np.float32)
        else:
            matrixFeature = np.dstack(tuple(pairfeatures))

        oneprotein['sequence'] = d['sequence']
        oneprotein['seqLen'] = seqFeature.shape[0]
        oneprotein['seqFeatures'] = seqFeature
        oneprotein['matrixFeatures'] = matrixFeature

        ##collecting labels...
        if 'atomDistMatrix' in d:
            atomDistMatrix = d['atomDistMatrix']
            oneprotein['atomLabelMatrix'] = dict()

            for response in modelSpecs['responses']:
                responseName = Response2LabelName(response)
                labelType = Response2LabelType(response)
                if responseName not in atomDistMatrix:
                    print('In the raw feature data, ', d['name'],
                          ' does not have matrix for ', responseName)
                    exit(-1)

                ## atomDistMatrix is the raw data, so it does not have information about labelType
                distm = atomDistMatrix[responseName]

                if labelType.startswith('Discrete'):
                    subType = labelType[len('Discrete'):]

                    ## no need to discretize for HB and Beta-Pairing since they are binary matrices
                    if responseName.startswith(
                            'HB') or responseName.startswith('Beta'):
                        oneprotein['atomLabelMatrix'][response] = distm
                    else:
                        labelMatrix, _, _ = DistanceUtils.DiscretizeDistMatrix(
                            distm, config.distCutoffs[subType],
                            subType.endswith('Plus'))
                        oneprotein['atomLabelMatrix'][response] = labelMatrix

                elif labelType.startswith('LogNormal'):
                    labelMatrix = DistanceUtils.LogDistMatrix(distm)
                    oneprotein['atomLabelMatrix'][response] = labelMatrix

                elif labelType.startswith('Normal'):
                    oneprotein['atomLabelMatrix'][response] = distm
                else:
                    ## bug fix: this branch referenced the undefined name 'res',
                    ## which would raise NameError instead of printing the response
                    print('unsupported response: ', response)
                    exit(-1)

        elif forTrainValidation:
            print(
                'atomic distance matrix is needed for the training and validation data'
            )
            exit(-1)

        ##at this point, finish collecting features and labels for one protein
        proteinFeatures.append(oneprotein)

        counter += 1
        if counter % 500 == 1:
            print('assembled features and labels for ', counter, ' proteins.')

    return proteinFeatures
def BuildModel(modelSpecs, forTrain=True):
    """Declare theano input variables and construct the distance predictor.

    For training, returns (distancePredictor, x, y, xmask, ymask, xem,
    labelList, weightList, box, trainByRefLoss); for prediction, returns
    only (distancePredictor, x, y, xmask, ymask, xem).
    """
    rng = np.random.RandomState()

    ## x carries sequential features, y carries matrix (pairwise) features
    x = T.tensor3('x')
    y = T.tensor4('y')

    ## masks for x and y, respectively
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')

    ## embedding input, defined only when embedding is enabled
    xem = T.tensor3('xem') if config.EmbeddingUsed(modelSpecs) else None

    ## bounding box for cropping a big protein distance matrix at any position
    box = T.ivector('boundingbox') if forTrain else None

    ## trainByRefLoss can be either 1 or -1; when this variable exists, the model
    ## is trained using both the reference loss and the loss on real data
    trainByRefLoss = None
    if forTrain and config.TrainByRefLoss(modelSpecs):
        trainByRefLoss = T.iscalar('trainByRefLoss')

    distancePredictor = ResNet4DistMatrix(rng,
                                          seqInput=x,
                                          matrixInput=y,
                                          mask_seq=xmask,
                                          mask_matrix=ymask,
                                          embedInput=xem,
                                          boundingbox=box,
                                          modelSpecs=modelSpecs)

    ## one label tensor per response, shaped (batchSize, seqLen, seqLen) or
    ## (batchSize, seqLen, seqLen, valueDims[response]); wtensor is a 16-bit
    ## integer tensor, used for discrete labels
    labelList = []
    if forTrain:
        ## labels are only defined when the model is used for training
        for response in modelSpecs['responses']:
            labelType = Response2LabelType(response)
            rValDims = GetResponseValueDims(response)
            if labelType.startswith('Discrete'):
                makeTensor = T.wtensor4 if rValDims > 1 else T.wtensor3
            else:
                makeTensor = T.tensor4 if rValDims > 1 else T.tensor3
            labelList.append(makeTensor('Tlabel4' + response))

    ## one weight tensor of shape (batchSize, seqLen, seqLen) per response
    weightList = []
    if labelList and config.UseSampleWeight(modelSpecs):
        weightList = [
            T.tensor3('Tweight4' + response)
            for response in modelSpecs['responses']
        ]

    ## for prediction, both labelList and weightList are empty
    if forTrain:
        return distancePredictor, x, y, xmask, ymask, xem, labelList, weightList, box, trainByRefLoss
    return distancePredictor, x, y, xmask, ymask, xem