def CalcLabelDistribution(data, modelSpecs):
    """Calculate the reference label distribution of every response.

    For a continuous response (label type starting with 'Normal' or
    'LogNormal') the distribution is a column of ones with one row per
    sequence-separation range. For a discrete response the distribution is
    estimated from the label matrices of all proteins in data by
    OrientationUtils.CalcLabelProb (orientation labels) or
    DistanceUtils.CalcLabelProb (everything else).

    Args:
        data: a list of dicts; d['atomLabelMatrix'][response] is the label
            matrix of one protein for one response.
        modelSpecs: the model specification dict. The keys read here are
            'responses', 'maxbatchSize' and, optionally,
            'UseBoundingBox4RefProbs'.

    Returns:
        A dict mapping each response to its reference distribution. The same
        dict is also stored in modelSpecs['labelDistributions'].
    """
    ## the original code referred to an undefined name numRanges in the continuous branch;
    ## derive it from modelSpecs in the same way as the discrete branch does
    numRanges = RangeNWeight.GetNumRanges(modelSpecs)

    ## collect all discrete label matrices
    allLabelMatrices = dict()
    for response in modelSpecs['responses']:
        labelType = Response2LabelType(response)
        if labelType.startswith(('LogNormal', 'Normal')):
            continue
        allLabelMatrices[response] = [
            d['atomLabelMatrix'][response] for d in data
        ]

    ## only the literal value True turns on the bounding box sampling
    useBoundingBox = modelSpecs.get('UseBoundingBox4RefProbs') is True

    ## calculate the discrete label distribution
    allRefProbs = dict()
    for response in modelSpecs['responses']:
        labelName, labelType, subType = config.ParseResponse(response)
        if labelType.startswith(('LogNormal', 'Normal')):
            allRefProbs[response] = np.ones((numRanges, 1), dtype=np.float32)
            continue

        if useBoundingBox:
            ## here we sample a sub label matrix using BoundingBox to account for the real training scenario
            labelMatrices = []
            for lMatrix in allLabelMatrices[response]:
                bounds = SampleBoundingBox(
                    (lMatrix.shape[0], lMatrix.shape[1]),
                    modelSpecs['maxbatchSize'])
                subMatrix = lMatrix[bounds[0]:bounds[2], bounds[1]:bounds[3]]
                labelMatrices.append(subMatrix.astype(np.int32))
        else:
            labelMatrices = [
                m.astype(np.int32) for m in allLabelMatrices[response]
            ]

        ## orientation and distance labels use different estimators with the same signature
        if labelName in config.allOrientationNames:
            calcLabelProb = OrientationUtils.CalcLabelProb
        else:
            calcLabelProb = DistanceUtils.CalcLabelProb

        allRefProbs[response] = calcLabelProb(
            data=labelMatrices,
            numLabels=GetResponseProbDims(response),
            numRanges=numRanges)

    modelSpecs['labelDistributions'] = allRefProbs
    return allRefProbs
def CalcLabelWeightMatrix(LabelMatrix=None,
                          modelSpecs=None,
                          floatType=theano.config.floatX):
    """Build one weight matrix per response from its label matrix.

    Every residue pair (i, j) falls into one sequence-separation range
    according to abs(i - j) and the boundaries returned by
    RangeNWeight.GetRangeBoundaries. The weight of the pair is looked up in
    modelSpecs['weight4labels'][response], whose rows correspond to the
    ranges. For a discrete response the column is selected by the label of
    the pair; for a continuous response each range has a single weight and
    entries outside the valid label interval get weight 0.

    Args:
        LabelMatrix: a dict mapping response to label matrix, or None.
        modelSpecs: the model specification dict; 'responses' and
            'weight4labels' are read here.
        floatType: dtype of the returned weight matrices.

    Returns:
        None when LabelMatrix is None, otherwise a dict mapping each response
        to its weight matrix.
    """
    if LabelMatrix is None:
        return None

    rangeBounds = RangeNWeight.GetRangeBoundaries(
        numRanges=RangeNWeight.GetNumRanges(modelSpecs))

    ## sequence separation abs(i-j) of every residue pair
    refShape = LabelMatrix.values()[0].shape
    rowIdx, colIdx = np.mgrid[0:refShape[0], 0:refShape[1]]
    seqSep = abs(rowIdx - colIdx)

    ## one 0/1 mask per range: the first range has no upper bound, every later range
    ## is bounded from above by the lower bound of the preceding range
    rangeMasks = []
    upperBound = None
    for lowerBound in rangeBounds:
        inRange = (seqSep >= lowerBound)
        if upperBound is not None:
            inRange = inRange * (seqSep < upperBound)
        rangeMasks.append(inRange.astype(np.int16))
        upperBound = lowerBound

    ## make sure that every response has a weight factor before doing any real work
    weight4labels = modelSpecs['weight4labels']
    for response in modelSpecs['responses']:
        if response not in weight4labels:
            print 'ERROR: Cannot find the weight factor for response ', response
            exit(1)

    ## the below procedure is not very effective. We shall improve it later.
    labelWeightMatrices = dict()
    for response in modelSpecs['responses']:
        labelName, labelType, subType = config.ParseResponse(response)
        labels = LabelMatrix[response]

        ## wMatrix has dimension numRanges * numLabels, one row per sequence separation range
        wMatrix = (modelSpecs['weight4labels'][response]).astype(np.float32)
        assert wMatrix.shape[0] == len(rangeBounds)

        if not config.IsContinuousLabel(labelType):
            ## discrete label: use the label itself as the column index
            weightsPerRange = [w[labels] for w in wMatrix]
        else:
            ## real-valued label: each range has only a single weight for all values
            low, high = config.GetLabelMinMaxValues(labelName)
            isOrientation = labelName in config.allOrientationNames
            weightsPerRange = []
            for w in wMatrix:
                ## index with an all-zero matrix to fill a new matrix with w[0]
                ## (the original author marked this step as needing further examination)
                filled = w[np.zeros_like(labels, dtype=np.int16)]
                if isOrientation:
                    ## an orientation entry above the valid maximum is also invalid
                    np.putmask(filled, labels > high, 0)
                ## an entry below the valid minimum (e.g., -1) is invalid for both distance and orientation
                np.putmask(filled, labels < low, 0)
                weightsPerRange.append(filled)

        labelWeightMatrices[response] = sum(
            [mask * wm for mask, wm in zip(rangeMasks, weightsPerRange)]
        ).astype(floatType)

    return labelWeightMatrices
def CollectTemplateMatrixFeatures(d, modelSpecs):
    """Collect template-derived pairwise features for one protein.

    One feature matrix is produced for every response in
    modelSpecs['responses'] whose label name is an atom-pair name or an
    orientation name; any other response only triggers a warning.

    Args:
        d: the data dict of one protein. It must contain 'tplDistMatrix'
            (including the key 'CbCb') and, when an orientation response is
            requested, 'tplOriMatrix'.
        modelSpecs: the model specification dict; 'responses' and
            'distThreshold4Orientation' are read here.

    Returns:
        A list of feature matrices in the order of the supported responses.
        The process exits with status 1 when required template data is
        missing.
    """
    if 'tplDistMatrix' not in d:
        print 'ERROR: the data for ', d[
            'name'], ' has no tplDistMatrix, which is needed since you want to use template information'
        exit(1)

    tplDist = d['tplDistMatrix']
    if 'CbCb' not in tplDist:
        print 'ERROR: tplDistMatrix shall have distance matrices for atom pairs CbCb'
        exit(1)

    ## the CbCb distance matrix is passed to the orientation discretization below
    CbCbMatrix = tplDist['CbCb']

    features = []
    for response in modelSpecs['responses']:
        labelName, labelType, subType = config.ParseResponse(response)

        if labelName in config.allAtomPairNames:
            ## template distance -> strength = 2 / max(dist, 2)
            distMatrix = tplDist[labelName]
            strength = np.copy(distMatrix)
            strength[distMatrix < 2.0] = 2.0
            strength = 2.0 / strength
            ## an invalid entry (no valid coordinates or insertion in the sequence) has strength 0
            strength[distMatrix < 0] = 0
            features.append(strength)

        elif labelName in config.allOrientationNames:
            if 'tplOriMatrix' not in d:
                print 'ERROR: the data for ', d[
                    'name'], ' has no tplOriMatrix, which is needed since you want to use template orientation information'
                exit(1)

            ## discretize the template orientation matrix and convert it to a one-hot encoded matrix
            oriMatrix = d['tplOriMatrix'][labelName]
            oriBins = config.GetCutoffs(response)
            separateInvalid = subType.endswith(('Plus', 'Minus'))
            discretized = OrientationUtils.DiscretizeOrientationMatrix(
                oriMatrix,
                bins=oriBins,
                distThreshold4Orientation=modelSpecs[
                    'distThreshold4Orientation'],
                distMatrix=CbCbMatrix,
                invalidEntrySeparated=separateInvalid,
                asResponse=False)
            labelMatrix, _, _ = discretized
            features.append(
                LabelUtils.GetOneHotLabelMatrix(
                    labelMatrix,
                    numLabels=config.GetResponseProbDims(response)))

        else:
            print 'WARNING: template information for response not implemented: ', response

    return features
Пример #4
0
def Score(atomDistMatrix,
          potential,
          labelNames,
          outputDetails=False,
          minSeqSep=6,
          maxCstDist=None):

    totalScore = 0.0
    for response, pot in potential.iteritems():
        labelName, labelType, subType = config.ParseResponse(response)
        if labelName not in set(labelNames):
            continue
        if not atomDistMatrix.has_key(labelName):
            print 'WARNING: the atomDistMatrix does not have distance information for atom pair:', labelName
            continue
        if not labelType.startswith('Discrete'):
            print 'unsupported labelType: ', labelType
            exit(1)

        distm = atomDistMatrix[labelName]
        assert distm.shape == (
            pot.shape[0], pot.shape[1]
        ), "the size of the distance-based statitical potential not compatible with the distance matrix"

        ## discretize the distance matrix, an invalid entry -1 will have the largest label number
        labelMatrix, _, _ = DistanceUtils.DiscretizeDistMatrix(
            distm, config.distCutoffs[subType], invalidDistanceSeparated=False)

        size = pot.shape
        m = np.mgrid[0:size[0], 0:size[1]]
        scoreMatrix = pot[m[0], m[1], labelMatrix]

        if maxCstDist is not None:
            label4maxDist = DistanceUtils.LabelsOfOneDistance(
                maxCstDist, config.distCutoffs[subType])
            np.putmask(scoreMatrix, labelMatrix > label4maxDist, 0)

        totalScore = np.sum(np.triu(scoreMatrix, minSeqSep))

        if outputDetails:
            ## note that if the potential matrix is not symmetric, we have to do something more here
            indices = np.triu_indices(size[0], k=minSeqSep, m=size[1])
            scores = scoreMatrix[indices]
            labels = labelMatrix[indices]
            for i, j, s, label in zip(indices[0], indices[1], scores, labels):
                outinfo = [
                    str(i + 1),
                    str(j + 1), apt,
                    str(label), "{:.4f}".format(s)
                ] + ["{:.3f}".format(v) for v in pot[i, j]]
                outstr = ' '.join(outinfo)
                print outstr

    return totalScore
def DeriveOriContactMatrix(predOriMatrix, response):
    """Derive a contact-like matrix from a predicted orientation matrix.

    The result is, for every residue pair, the sum of the predicted
    probabilities of the labels below largestValidLabel, i.e. all labels
    except the last one (or the last two when the sub type of the response
    ends with 'Plus' or 'Minus').

    Args:
        predOriMatrix: a tensor indexed as [i, j, label] holding the
            predicted label probabilities.
        response: the response name of a discrete orientation label.

    Returns:
        A matrix over the first two dimensions of predOriMatrix. The process
        exits with status 1 when response is not a discrete orientation
        response.
    """
    labelName, labelType, subType = config.ParseResponse(response)

    if labelName not in config.allOrientationNames:
        print 'ERROR: unsupported orientation label name in', response
        exit(1)

    if not config.IsDiscreteLabel(labelType):
        print 'ERROR: unsupported orientation label type in', response
        exit(1)

    numLabels = config.GetResponseProbDims(response)
    if subType.endswith(('Plus', 'Minus')):
        largestValidLabel = numLabels - 2
    else:
        largestValidLabel = numLabels - 1

    return np.sum(predOriMatrix[:, :, :largestValidLabel], axis=2)
Пример #6
0
def main(argv):
    #printContactMatrix = False
    #nativefile = None
    newName = None
    savefolder = os.getcwd()

    try:
        #opts, args = getopt.getopt(argv,"cg:n:",["contact=", "nativefile=", "name="])
        opts, args = getopt.getopt(argv, "s:n:", ["savefolder=", "name="])
        #print opts, args
    except getopt.GetoptError:
        Usage()
        exit(1)

    if len(args) < 2:
        Usage()
        exit(1)

    baseMatrixFile = args[0]
    subMatrixFiles = args[1:]

    for opt, arg in opts:
        """
                if opt in ("-c", "--contact"):
                        printContactMatrix = True
                elif opt in ("-g", "--nativefile"):
                        nativefile = arg
                        if not os.path.isfile(nativefile):
                                print 'ERROR: invalid ground truth file for contact accuracy evaluation:', nativefile
                                exit(1)
		"""
        if opt in ("-s", "--savefolder"):
            savefolder = arg
        elif opt in ("-n", "--name"):
            newName = arg
        else:
            Usage()
            exit(1)

    baseMatrix = DistanceUtils.LoadRawDistProbFile(baseMatrixFile)
    sequence = baseMatrix[1]
    targetName = baseMatrix[0]

    ## baseMatrix and subMatrix are a tuple of 6 items
    subMatrices = []
    for subMatrixFile in subMatrixFiles:
        subMatrix = DistanceUtils.LoadRawDistProbFile(subMatrixFile)

        ## make sure that both matrix files are of the same type, although they may not equal
        if baseMatrix[4] is None:
            assert (subMatrix[4] is None)
        if baseMatrix[4] is not None:
            assert (subMatrix[4] is not None)

        subMatrices.append(subMatrix)

    ## new distance and contact matrices with response as the keys
    newDistMatrices = {}
    newContMatrices = {}

    ## replace the distance matrix
    for response, m in baseMatrix[2].iteritems():

        tmpResult = m
        for subMatrix, smfile in zip(subMatrices, subMatrixFiles):
            if not subMatrix[2].has_key(response):
                print 'WARNING: there is no response', response, ' in subMatrixFile: ', smfile
                #exit(1)

            subSequence = subMatrix[1]

            ## try by assumming that this domain has only one seq segment
            index = sequence.find(subSequence)
            if index >= 0:
                tmpResult = ReplaceSubMatrix(tmpResult, subMatrix[2][response],
                                             index)
                continue

            ## try by assuming that this domain has two seq segments
            res = FindIndexBySegments(sequence, subSequence)
            if res is None:
                print 'ERROR: cannot map domain sequence to the whole chain sequence!'
                print '    domain Seq= ', subSequence
                print '    chain  Seq= ', sequence
                exit(1)

            tmpResult = ReplaceSubMatrixBySegments(tmpResult,
                                                   subMatrix[2][response],
                                                   starts=res[0],
                                                   sizes=res[1])

        newDistMatrices[response] = tmpResult

        ## derive contact matrix from distance matrix
        labelName, labelType, subType = config.ParseResponse(response)
        if not labelType.startswith('Discrete'):
            print 'ERROR: unsupported labelType by ReplaceSubDistMatrix.py: ', labelType
            exit(1)

        if labelName in config.allAtomPairNames:
            labelOf8 = DistanceUtils.LabelsOfOneDistance(
                config.ContactDefinition, config.distCutoffs[subType])
            newContMatrices[labelName] = ContactUtils.Distance2Contact(
                newDistMatrices[response], labelOf8)

        elif labelName in config.allOrientationNames:
            newContMatrices[
                labelName] = OrientationUtils.DeriveOriContactMatrix(
                    newDistMatrices[response], response)
        else:
            print 'ERROR: unsupported labelName in replaceSubDistMatrix(): ', labelName
            exit(1)
    """
	if newName is not None:
		targetName = newName
	else:
		targetName = targetName + '.mixed'
	"""
    if newName is None:
        fileName = os.path.basename(baseMatrixFile).split('.')[0] + '-mixed'
    else:
        fileName = newName

    ## save the new result
    content4save = (targetName, sequence, newDistMatrices, newContMatrices,
                    baseMatrix[4], baseMatrix[5])

    savefile = os.path.join(savefolder, fileName + '.predictedDistMatrix.pkl')
    with open(savefile, 'wb') as fh:
        cPickle.dump(content4save, fh, protocol=cPickle.HIGHEST_PROTOCOL)
Пример #7
0
        elif bPrintOtherAtomPairs:
                contactFileName = filename + '.' + apt + '.CM.txt'
                contactCASPFileName = filename + '.' + apt + '.CASP.rr'
	else:
		continue

	contactFile = os.path.join(savefolder, contactFileName)
        np.savetxt(contactFile, m, fmt='%1.6f', delimiter=' ')

	contactCASPFile = os.path.join(savefolder, contactCASPFileName)
	if contactOnly:
        	ContactUtils.SaveContactMatrixInCASPFormat(targetName, sequence, m, contactCASPFile, distMatrix=None, probScaleFactor=1)
		continue

	responses = FindStringsStartWith(distProbMatrix.keys(), apt)
	if len(responses) != 1:
		## right now for one apt, only one response is allowed
		print 'ERROR: incorrect distance information for', apt, 'in', predFile
		exit(1)

	response = responses[0]
	labelName, labelType, subType = config.ParseResponse(response)

	if not config.IsDiscreteLabel(labelType):
		print 'ERROR: right now only discrete distance matrix is supported'
		exit(1)

	## convert distance matrix to what's needed by CASP
	distMatrix = DistanceUtils.MergeDistanceBinsBySum(distProbMatrix[response], config.distCutoffs[subType], config.distCutoffs['10C'])
        ContactUtils.SaveContactMatrixInCASPFormat(targetName, sequence, m, contactCASPFile, distMatrix=distMatrix, probScaleFactor=1)
Пример #8
0
def main(argv):
    """Convert a predicted distance prob matrix file to a coarser discretization.

    Usage: [-r response] [-s savefolder] predFile

    Only the responses in predFile with the same label name as the target
    response (default 'CbCb_Discrete14C') are converted; their distance bins
    are merged into the bins of the target sub type. The contact prob matrix
    is saved unchanged. The output file name is derived from the input file
    name by appending '4' + target sub type in front of the last extension.

    Args:
        argv: the command line arguments without the program name.
    """
    savefolder = os.getcwd()
    targetResponse = 'CbCb_Discrete14C'

    if len(argv) < 1:
        Usage()
        exit(1)

    try:
        opts, args = getopt.getopt(argv, "r:s:", ["response=", "savefolder="])
    except getopt.GetoptError:
        Usage()
        exit(1)

    if len(args) != 1:
        Usage()
        exit(1)

    for opt, arg in opts:
        if opt in ("-s", "--savefolder"):
            savefolder = arg
            ## create the save folder on demand
            if not os.path.isdir(savefolder):
                os.mkdir(savefolder)
        elif opt in ("-r", "--response"):
            targetResponse = arg
        else:
            Usage()
            exit(1)

    predFile = args[0]
    if not os.path.isfile(predFile):
        print 'ERROR: the predicted distance prob matrix file does not exist: ', predFile
        exit(1)

    parsedTarget = config.ParseResponse(targetResponse)
    targetLabelName, targetLabelType, targetSubType = parsedTarget

    if targetSubType.endswith(('Plus', 'Minus')):
        print 'ERROR: currently the target distance type cannot end with Plus or Minus'
        exit(1)

    if targetSubType not in config.distCutoffs:
        print 'ERROR: the key ', targetSubType, ' is not defined in config.distCutoffs'
        exit(1)

    dstDistCutoff = config.distCutoffs[targetSubType]

    ## NOTE(review): unpickling executes arbitrary code; predFile must come from a trusted source
    with open(predFile, 'rb') as fh:
        pred = cPickle.load(fh)
    (target, sequence, distProbMatrix, contProbMatrix, labelWeight,
     labelDistribution) = pred[:6]

    newDistProbMatrix = dict()
    newLabelWeight = dict()
    newLabelDistribution = dict()

    for response in distProbMatrix:
        labelName, labelType, subType = config.ParseResponse(response)

        ## only an atom pair response with the same label name as the target is converted
        if (labelName not in config.allAtomPairNames
                or labelName != targetLabelName):
            continue

        if subType not in config.distCutoffs:
            print 'ERROR: the dist prob matrix to be reduced has a discretization scheme undefined in config.distCutoffs: ', subType
            exit(1)

        srcDistCutoff = config.distCutoffs[subType]

        ## the probabilities and the label distribution of merged bins are summed up
        newDistProbMatrix[targetResponse] = DistanceUtils.MergeDistanceBinsBySum(
            distProbMatrix[response], srcDistCutoff, dstDistCutoff)
        newLabelDistribution[targetResponse] = DistanceUtils.MergeDistanceBinsBySum(
            labelDistribution[response], srcDistCutoff, dstDistCutoff)

        ## the label weights of merged bins are averaged, with the source label distribution passed along
        newLabelWeight[targetResponse] = DistanceUtils.MergeDistanceBinsByAverage(
            labelWeight[response], srcDistCutoff, dstDistCutoff,
            labelDistribution[response])

    ## <first field>.<middle fields>4<targetSubType>.<last field>
    fields = os.path.basename(predFile).split('.')
    middle = '.'.join(fields[1:-1]) + '4' + targetSubType
    savefile = os.path.join(savefolder,
                            '.'.join([fields[0], middle, fields[-1]]))

    content4save = (target, sequence, newDistProbMatrix, contProbMatrix,
                    newLabelWeight, newLabelDistribution)
    with open(savefile, 'wb') as fh:
        cPickle.dump(content4save, fh, protocol=cPickle.HIGHEST_PROTOCOL)
    def TopAccuracyByRange(self, zList):
        """Build the symbolic top-accuracy summary of every response.

        For each response in self.responses, the predicted probabilities in
        self.output_probList are compared with the corresponding ground truth
        in zList, protein by protein through theano.scan, and the per-protein
        results are averaged.

        The per-protein result is a stack of four rows, one for each of the
        selections LRsel, MRsel, MLRsel and SRsel (in this order), and each
        row holds [accuracy, numTops, numTruths[0]].

        Args:
            zList: a list with one ground truth tensor per response; each
                tensor must have 3 dimensions (asserted below).

        Returns:
            T.stacklists over the per-response averaged results.
        """

        ## these two variables are read by the nested functions below and are
        ## rebound in the loop at the end of this method before each theano.scan call
        currentResponse = None
        topRatio = 0.5

        ## in this function, we assume that pred is a tensor3 of floatX and truth is a matrix
        ## pred has shape (dataLen, dataLen, 2) and truth has shape (dataLen, dataLen)
        ## we also assume that label 0 is positive and label 1 is negative
        ## the result is not 100% accurate for non-symmetric response, e.g., hydrogen-bonding matrix
        def TopAccuracy2C(pred=None, truth=None, symmetric=False):

            ## selectors on the upper triangle: LR, MLR and SMLR keep the entries on and above
            ## the 24th, 12th and 6th diagonal; MR and SR are the differences between them
            M1s = T.ones_like(truth, dtype=np.int8)
            LRsel = T.triu(M1s, 24)
            MLRsel = T.triu(M1s, 12)
            SMLRsel = T.triu(M1s, 6)
            MRsel = MLRsel - LRsel
            SRsel = SMLRsel - MLRsel

            dataLen = truth.shape[0]

            ## only the predicted value of label 0 is used for ranking
            pred0 = pred[:, :, 0]

            if symmetric:
                avg_pred = (pred0 + pred0.dimshuffle(1, 0)) / 2.0
            else:
                avg_pred = pred0

            #pred_truth = T.concatenate( (avg_pred, truth.dimshuffle(0, 1, 'x') ), axis=2)
            ## pair every predicted value with its truth label along a new last axis
            pred_truth = T.stack([avg_pred, T.cast(truth, 'int32')], axis=2)

            accuracyList = []
            for Rsel in [LRsel, MRsel, MLRsel, SRsel]:
                selected_pred_truth = pred_truth[Rsel.nonzero()]

                ## sort by the predicted value for label 0 from the largest to the smallest
                selected_pred_truth_sorted = selected_pred_truth[(
                    selected_pred_truth[:, 0]).argsort()[::-1]]

                #print 'topRatio =', topRatio
                ## evaluate the top dataLen * topRatio pairs, but no more than the number of selected pairs
                numTops = T.minimum(T.iround(dataLen * topRatio),
                                    selected_pred_truth_sorted.shape[0])

                selected_sorted_truth = T.cast(
                    selected_pred_truth_sorted[:, -1], 'int32')
                ## index 0 of both counts refers to label 0, i.e., the positive label
                numTruths = T.bincount(selected_sorted_truth, minlength=2)
                numCorrects = T.bincount(selected_sorted_truth[0:numTops],
                                         minlength=2)
                #numTops = T.minimum(numTops, numTruths[0])
                ## one row per selection: [accuracy, numTops, numTruths[0]];
                ## 0.001 is added to the denominator, presumably to avoid a division by zero when numTops is 0
                accuracyList.append(
                    T.stack([
                        numCorrects[0] * 1. /
                        (numTops + 0.001), numTops, numTruths[0]
                    ],
                            axis=0))

            return T.stacklists(accuracyList)

        ## real-valued response: truth becomes label 1 when it is >= config.ContactDefinition and label 0 otherwise;
        ## pred is negated so that a smaller predicted value ranks higher in TopAccuracy2C
        def TopAccuracyNormal(pred=None, truth=None, symmetric=True):
            truth_new = T.ge(truth, config.ContactDefinition)

            if pred.ndim == 2:
                pred_new = -pred.dimshuffle(0, 1, 'x')
            else:
                pred_new = -pred

            return TopAccuracy2C(pred=pred_new,
                                 truth=truth_new,
                                 symmetric=symmetric)

        ## same as TopAccuracyNormal except that truth is compared with the log of config.ContactDefinition
        def TopAccuracyLogNormal(pred=None, truth=None, symmetric=True):
            truth_new = T.ge(truth, T.log(config.ContactDefinition))

            if pred.ndim == 2:
                pred_new = -pred.dimshuffle(0, 1, 'x')
            else:
                pred_new = -pred

            return TopAccuracy2C(pred=pred_new,
                                 truth=truth_new,
                                 symmetric=symmetric)

        ## in this function, we assume that pred is tensor3 of float and truth is a matrix of int8 or int
        ## pred has shape (dataLen, dataLen, numLabels), having the predicted probability of each label
        ## truth has shape (dataLen, dataLen)
        def TopAccuracyMultiC(pred=None,
                              truth=None,
                              subType=None,
                              symmetric=True):
            ## convert pred and truth to 2C

            distBins = config.distCutoffs[subType]
            label8 = DistanceUtils.LabelsOfOneDistance(
                config.ContactDefinition, distBins)
            ## NOTE(review): label15 is calculated but not used in the rest of this function
            label15 = DistanceUtils.LabelsOfOneDistance(
                config.InteractionLimit, distBins)

            ## a label >= label8 becomes the negative label 1
            truth1 = T.cast(T.ge(truth, label8), 'int32')
            truth_new = truth1

            ## sum the probabilities of the labels below label8 (positive) and of the remaining labels (negative)
            pred1 = T.sum(pred[:, :, :label8], axis=2, keepdims=True)
            pred2 = T.sum(pred[:, :, label8:], axis=2, keepdims=True)
            pred_new = T.concatenate((pred1, pred2), axis=2)

            return TopAccuracy2C(pred=pred_new,
                                 truth=truth_new,
                                 symmetric=symmetric)

        ## this function calculates the top accuracy of predicted orientation angles. Here the accuracy is defined as the percentage of residue pairs that is correctly to be predicted to have a valid orientation angle
        ## this is not a good definition, but this is what we can do here
        ## pred has shape (dataLen, dataLen, numLabels), having the predicted probability of each label
        ## truth has shape (dataLen, dataLen)
        def TopAccuracyOrientation(pred=None,
                                   truth=None,
                                   largestValidLabel=36,
                                   symmetric=True):

            ## the labels below largestValidLabel are summed up as the positive class, the others as the negative class
            pred1 = T.sum(pred[:, :, :largestValidLabel],
                          axis=2,
                          keepdims=True)
            pred2 = T.sum(pred[:, :, largestValidLabel:],
                          axis=2,
                          keepdims=True)
            pred_new = T.concatenate((pred1, pred2), axis=2)

            truth_new = T.cast(T.ge(truth, largestValidLabel), 'int32')

            return TopAccuracy2C(pred=pred_new,
                                 truth=truth_new,
                                 symmetric=symmetric)

        ## the function applied by theano.scan to one protein: pred_prob and truth are the prediction and
        ## ground truth of this protein and pad_len is the number of leading rows and columns to be dropped.
        ## The response is not an argument; it is taken from currentResponse in the enclosing scope.
        ##def EvaluateAccuracy(pred_prob, truth, pad_len, response):
        def EvaluateAccuracy(pred_prob, truth, pad_len):
            pred_in_correct_shape = T.cast(pred_prob[pad_len:, pad_len:],
                                           dtype=theano.config.floatX)
            truth_in_correct_shape = truth[pad_len:, pad_len:]

            ## NOTE(review): ParseResponse and GetResponseProbDims are used without the config. prefix here,
            ## while the loop below calls config.ParseResponse -- confirm that both names are imported in this module
            labelName, labelType, subType = ParseResponse(currentResponse)
            symmetric = config.IsSymmetricLabel(labelName)

            if labelName in config.allOrientationNames:
                if not config.IsDiscreteLabel(labelType):
                    print 'ERROR: unsupported label type for orientation matrix prediction: ', currentResponse
                    exit(1)

                ## the last label (or the last two labels for a Plus/Minus sub type) is excluded from the valid labels
                numLabels = GetResponseProbDims(currentResponse)
                if subType.endswith('Plus') or subType.endswith('Minus'):
                    largestValidLabel = numLabels - 2
                else:
                    largestValidLabel = numLabels - 1

                return TopAccuracyOrientation(
                    pred=pred_in_correct_shape,
                    truth=truth_in_correct_shape,
                    largestValidLabel=largestValidLabel,
                    symmetric=symmetric)

            ## 'LogNormal' has to be tested before 'Normal' is considered; the two prefixes are distinct strings,
            ## so the order only matters for readability
            if labelType.startswith('LogNormal'):
                return TopAccuracyLogNormal(pred=pred_in_correct_shape,
                                            truth=truth_in_correct_shape,
                                            symmetric=symmetric)

            elif labelType.startswith('Normal'):
                return TopAccuracyNormal(pred=pred_in_correct_shape,
                                         truth=truth_in_correct_shape,
                                         symmetric=symmetric)

            elif labelType.startswith('Discrete'):
                #subType = labelType[len('Discrete'): ]
                if subType.startswith('2C'):
                    return TopAccuracy2C(pred=pred_in_correct_shape,
                                         truth=truth_in_correct_shape,
                                         symmetric=symmetric)
                else:
                    return TopAccuracyMultiC(pred=pred_in_correct_shape,
                                             truth=truth_in_correct_shape,
                                             subType=subType,
                                             symmetric=symmetric)
            else:
                print 'ERROR: unsupported label type in EvaluateAccuracy: ', labelType
                exit(1)

        accuracyList = []
        for res, out_prob, z, ratio in zip(self.responses,
                                           self.output_probList, zList,
                                           self.modelSpecs['topRatios']):

            ## NOTE(review): the three values parsed here are not used in the rest of this loop body
            labelName, labelType, subType = config.ParseResponse(res)

            ## currently TopAccuracy only works when the dimension of each z is 3
            assert z.ndim == 3

            ## the padding length of each protein is the number of zeros in its row of mask_1d;
            ## without a mask no protein is padded
            if self.mask_1d is not None:
                paddingLens = self.mask_1d.shape[1] - T.sum(self.mask_1d,
                                                            axis=1)
            else:
                paddingLens = T.zeros_like(z[:, 0, 0], dtype=np.int32)

            ## rebind the variables read by the nested functions; this must happen before the scan call below
            ## (presumably scan invokes EvaluateAccuracy while building the graph -- TODO confirm)
            currentResponse = res
            topRatio = ratio

            ##here we use scan to calculate accuracy for each protein
            result, updates = theano.scan(fn=EvaluateAccuracy,
                                          outputs_info=None,
                                          sequences=[out_prob, z, paddingLens])
            ## average over the proteins
            accuracy = T.mean(result, axis=0)
            accuracyList.append(accuracy)

        return T.stacklists(accuracyList)
def main(argv):
    newName = None
    savefolder = os.getcwd()

    try:
        opts, args = getopt.getopt(argv, "s:n:", ["savefolder=", "name="])
        #print opts, args
    except getopt.GetoptError:
        Usage()
        exit(1)

    if len(args) < 2:
        Usage()
        exit(1)

    baseMatrixFile = args[0]
    subMatrixFiles = args[1:]

    for opt, arg in opts:
        if opt in ("-s", "--savefolder"):
            savefolder = arg
        elif opt in ("-n", "--name"):
            newName = arg
        else:
            Usage()
            exit(1)

    baseMatrix = DistanceUtils.LoadRawDistProbFile(baseMatrixFile)
    sequence = baseMatrix[1]
    targetName = baseMatrix[0]

    ## baseMatrix and subMatrix are a tuple of 6 items
    subMatrices = []
    for subMatrixFile in subMatrixFiles:
        subMatrix = DistanceUtils.LoadRawDistProbFile(subMatrixFile)

        ## make sure that both matrix files are of the same type, although they may not equal
        if baseMatrix[4] is None:
            assert (subMatrix[4] is None)
        if baseMatrix[4] is not None:
            assert (subMatrix[4] is not None)

        subMatrices.append(subMatrix)

    ## new distance and contact matrices with response as the keys
    newDistMatrices = {}
    counterMatrices = {}

    ## initialize
    for response, m in baseMatrix[2].iteritems():
        newDistMatrices[response] = deepcopy(m)
        counterMatrices[response] = np.ones(m.shape[:2], dtype=np.int32)

    ## add submatrices onto newDistMatrices
    for subMatrix, smfile in zip(subMatrices, subMatrixFiles):
        print 'Adding submatrix in ', smfile

        subSequence = subMatrix[1]

        ## try to find its position in the original sequence by assumming that this domain has only one seq segment
        index = sequence.find(subSequence)
        if index >= 0:
            for response, m in subMatrix[2].iteritems():
                if not newDistMatrices.has_key(response):
                    print 'WARNING: the original matrix does not have response', response, ' in subMatrixFile:', smfile
                    continue

                AddSubMatrix(newDistMatrices[response], m, index)
                AddSubMatrix(counterMatrices[response],
                             np.ones(m.shape[:2], dtype=np.int32), index)
        else:
            ## try to find its positions in the original sequence by assuming that this domain has two seq segments
            res = FindIndexBySegments(sequence, subSequence)
            if res is None:
                print 'ERROR: cannot map domain sequence to the whole chain sequence!'
                print '    domain Seq= ', subSequence
                print '    chain  Seq= ', sequence
                exit(1)

            for response, m in subMatrix[2].iteritems():
                if not newDistMatrices.has_key(response):
                    print 'WARNING: the original matrix does not have response', response, ' in subMatrixFile:', smfile
                    continue
                AddSubMatrixBySegments(newDistMatrices[response],
                                       m,
                                       starts=res[0],
                                       sizes=res[1])
                AddSubMatrix(counterMatrices[response],
                             np.ones(m.shape[:2], dtype=np.int32),
                             starts=res[0],
                             sizes=res[1])

    ## final processing
    for response, m in newDistMatrices.iteritems():
        newDistMatrices[response] = np.divide(
            newDistMatrices[response], counterMatrices[response][:, :,
                                                                 np.newaxis])

    ## convert distMatrix to contactMatrix
    newContMatrices = {}

    for response, m in newDistMatrices.iteritems():

        ## derive contact matrix from distance matrix
        labelName, labelType, subType = config.ParseResponse(response)
        if not config.IsDiscreteLabel(labelType):
            print 'ERROR: unsupported labelType by ReplaceSubDistMatrix.py: ', labelType
            exit(1)

        if labelName in config.allAtomPairNames:
            labelOf8 = DistanceUtils.LabelsOfOneDistance(
                config.ContactDefinition, config.distCutoffs[subType])
            newContMatrices[labelName] = ContactUtils.Distance2Contact(
                m, labelOf8)

        elif labelName in config.allOrientationNames:
            newContMatrices[
                labelName] = OrientationUtils.DeriveOriContactMatrix(
                    m, response)
        else:
            print 'ERROR: unsupported labelName in replaceSubDistMatrix(): ', labelName
            exit(1)

    content4save = (targetName, sequence, newDistMatrices, newContMatrices,
                    baseMatrix[4], baseMatrix[5])

    ## save the new result
    if newName is None:
        fileName = os.path.basename(baseMatrixFile).split('.')[0] + '-mixed'
    else:
        fileName = newName
    savefile = os.path.join(savefolder, fileName + '.predictedDistMatrix.pkl')
    with open(savefile, 'wb') as fh:
        cPickle.dump(content4save, fh, protocol=cPickle.HIGHEST_PROTOCOL)
def CalcLabelWeight(modelSpecs):
    print 'Calculating label weight ...'

    numRanges = RangeNWeight.GetNumRanges(modelSpecs)

    RangeNWeight.SetWeight4Range(modelSpecs)
    #print 'weight for range: ', modelSpecs['weight4range']

    RangeNWeight.SetWeight43C2C(modelSpecs)
    #print 'LRbias= ', modelSpecs['LRbias']
    #print 'weight43C= ', modelSpecs['weight4Discrete3C']

    allRefProbs = modelSpecs['labelDistributions']
    ##for discrete labels, we calculate their weights by inferring from the weight intialized to 3 bins: 0-8, 8-15 and >15 or -1, which makes inference easier
    modelSpecs['weight4labels'] = dict()

    for response in modelSpecs['responses']:
        labelName, labelType, subType = config.ParseResponse(response)
        numLabels = GetResponseProbDims(response)

        if config.IsContinuousLabel(labelType):
            ## just need to assign range weight for continuous response
            modelSpecs['weight4labels'][response] = modelSpecs[
                'weight4continuous']
            continue

        if not config.IsDiscreteLabel(labelType):
            print 'ERROR: unsupported response in CalcLabelWeight: ', response
            exit(1)

        if labelName in config.allOrientationNames or config.NoWeight4Label(
                modelSpecs):
            modelSpecs['weight4labels'][response] = np.multiply(
                np.ones((numRanges, numLabels), dtype=np.float32),
                modelSpecs['weight4range'])

        elif labelName in ['HB', 'Beta']:
            ## if the response is for HB and Beta-Pairing
            if subType.startswith('2C'):
                modelSpecs['weight4labels'][response] = modelSpecs['weight4' +
                                                                   response]
            else:
                print 'ERROR: unsupported label subtype in CalcLabelWeight: ', response
                exit(1)

        elif labelName in config.allAtomPairNames:
            ## calculate label weight for atom pairs Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO
            if subType.startswith('2C'):
                print 'ERROR: 2C is not supported for contact/distance prediction any more'
                exit(1)
            elif subType.startswith('3C'):
                ## if 3C is used for the response
                modelSpecs['weight4labels'][response] = modelSpecs[
                    'weight4Discrete3C']
            else:
                modelSpecs['weight4labels'][
                    response] = DistanceUtils.CalcLabelWeight(
                        modelSpecs['weight4Discrete3C'], allRefProbs[response],
                        config.distCutoffs[subType])

        else:
            print 'ERROR: unsupported label name in CalcLabelWeight: ', response
            exit(1)

        ## set the weight of the label for the invalid entry (distance or orientation) to 0
        if subType.endswith('Minus'):
            modelSpecs['weight4labels'][response][:, -1] = 0
    """
	## for log
	for response in modelSpecs['responses']:
		print 'weight4labels for response: ', response
		print modelSpecs['weight4labels'][response]
	"""

    return modelSpecs['weight4labels']