Python DataProcessor.LoadPropertyFeatures示例

编程语言: Python

类/类型: DataProcessor

方法/功能: LoadPropertyFeatures

hotexamples.com的示例: 2

Python DataProcessor.LoadPropertyFeatures - 已找到2个示例。这些是从开源项目中提取的、最受好评的 DataProcessor.LoadPropertyFeatures（来自程序包 NeMo）真实 Python 示例。您可以对示例进行评价，以帮助我们提高示例质量。

常用方法

显示/隐藏

DataProcessor(23)

SplitData2Batches(10)

LoadRealData(10)

SampleProteinInfo(8)

File(6)

AssembleOneBatch(5)

InventoryCountProcessor(5)

InventoryProcessor(5)

ProductsProcessor(3)

ProductProcessor(3)

LoadNativeDistMatrixFromFile(2)

LoadPropertyFeatures(2)

unsize_vector(2)

LoadMetaData(2)

LoadNativeDistMatrix(1)

getDailyReviewsCount(1)

get_cell(1)

get_ewma(1)

get_logistic_reg_prediction(1)

get_moving_average(1)

get_randomforest_prediction(1)

partition_data(1)

get_svc_prediction(1)

prepare_data_for_classification(1)

processFile(1)

select_best_param_svc(1)

split_around_nan(1)

time_vector(1)

truncate_data(1)

find_timeshift(1)

butterworth(1)

find_bump(1)

FitLineFromCsv(1)

LoadDistanceLabelMatrices(1)

Process(1)

Processor(1)

LoadDistanceFeatures(1)

InventoryCountsProcessor(1)

Regress32x128Color00PastDataLoader(1)

SaveListToCsv(1)

apply_svc(1)

ExtractFeaturesNLabels(1)

Database(1)

TFIDFConverter(1)

apply_knn(1)

apply_logistic_regressor(1)

apply_random_forest(1)

SplitMetaData(1)

示例#1

显示文件

文件： RunPropertyPredictor.py 项目： zhujianwei31415/RaptorX-3DModeling

def _AppendBatchResults(store, batchResult, responses, dims, names, seqLens,
                        maxSeqLen):
    """Split one batch of network output by response and append it to store.

    store is a nested dict, store[proteinName][response] = list of arrays;
    every name in names must already be a key of store.
    batchResult is indexed as batchResult[:, :, start:end], i.e. its last axis
    is the concatenation of all responses, with widths given by dims (one
    width per entry of responses, in the same order).
    seqLens holds one length per batch entry and maxSeqLen is the size of the
    second axis of the batch input.
    """
    endPositions = np.cumsum(dims)
    startPositions = endPositions - dims

    for res, start, end in zip(responses, startPositions, endPositions):
        ## every protein of this batch gets a list for this response
        for name in names:
            store[name].setdefault(res, [])

        ## remove masked positions: only the last seqLen rows of each batch
        ## entry are kept
        columns4res = batchResult[:, :, start:end]
        for res4one, seqLen, name in zip(columns4res, seqLens, names):
            store[name][res].append(res4one[maxSeqLen - seqLen:, :])


def PredictProperty(models, predictors, predFiles):
    """Run every (model, predictor) pair on predFiles and average the outputs.

    Args:
        models: sequence of model specification dicts; this function reads
            model['n_in_seq'] and model['responses'] and forwards the dict as
            modelSpecs to DataProcessor.
        predictors: sequence parallel to models; each element is a pair
            (predict, inputVariables) where predict(*batchInput) returns
            (result4prob, result).
        predFiles: passed unchanged to DataProcessor.LoadPropertyFeatures.

    Returns:
        A tuple (finalresults4prob, finalresults, allsequences):
        finalresults4prob[name][response] is the average over all models of
        the predicted probabilities; finalresults[name][response] is the
        averaged predicted value for non-discrete responses, or the output of
        PropertyUtils.Coding2String applied to the argmax of the averaged
        probabilities for responses whose label type starts with 'Discrete';
        allsequences[name] is the primary sequence of each protein.

    The process is terminated with exit status 1 when the same protein name
    appears with two different sequences.
    """
    ## local import: the file-level imports are not visible from this block
    import sys

    allsequences = dict()

    ## allresults shall be a nested dictionary, e.g, allresults[proteinName][response] = predicted_property_list
    allresults4prob = dict()
    allresults = dict()

    for model, predictor in zip(models, predictors):

        predict, inputVariables = predictor

        ## We shall load these files for each model separately since each model may use a different set of features
        predData = DataProcessor.LoadPropertyFeatures(predFiles,
                                                      modelSpecs=model,
                                                      forTrainValidation=False)

        ## make sure the input has the same number of features as the model
        ## (spot check on one randomly chosen protein)
        rindex = np.random.randint(0, high=len(predData))
        assert model['n_in_seq'] == predData[rindex]['seqFeatures'].shape[1]

        ## collecting sequences
        for d in predData:
            if d['name'] not in allsequences:
                allsequences[d['name']] = d['sequence']
            elif allsequences[d['name']] != d['sequence']:
                print 'ERROR: inconsistent primary sequence for the same protein in the protein feature files'
                sys.exit(1)

        predSeqData, batchNames = DataProcessor.SplitData2Batches(
            data=predData,
            numDataPoints=30,
            modelSpecs=model,
            forTrainValidation=False)
        print '#predData: ', len(predData), '#batches: ', len(predSeqData)

        ## the per-response widths only depend on the model, so compute them
        ## once per model instead of once per batch
        responses = model['responses']
        probDims = [
            config.responseProbDims[Response2LabelType(res)]
            for res in responses
        ]
        valueDims = [
            config.responseValueDims[Response2LabelType(res)]
            for res in responses
        ]

        for onebatch, names4onebatch in zip(predSeqData, batchNames):
            batchInput = onebatch[:len(inputVariables)]
            result4prob, result = predict(*batchInput)

            ## x1d has shape (batchSize, maxSeqLen, numFeatures) and x1dmask has shape (batchSize, #cols_to_be_masked)
            x1d, x1dmask = batchInput[0:2]
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
            maxSeqLen = x1d.shape[1]

            ## result4prob has shape (batchSize, maxSeqLen, sum of responseProbDims over the model responses)
            assert result4prob.shape[2] == sum(probDims)

            ## result has shape (batchSize, maxSeqLen, sum of responseValueDims over the model responses)
            assert result.shape[2] == sum(valueDims)

            ## both dictionaries always receive a protein at the same time
            for name in names4onebatch:
                if name not in allresults:
                    allresults[name] = dict()
                    allresults4prob[name] = dict()

            _AppendBatchResults(allresults4prob, result4prob, responses,
                                probDims, names4onebatch, seqLens, maxSeqLen)
            _AppendBatchResults(allresults, result, responses, valueDims,
                                names4onebatch, seqLens, maxSeqLen)

    ## calculate the final result, which is the average of all the predicted properties for the same protein and response name
    finalresults = dict()
    for name, results in allresults.iteritems():
        finalresults[name] = dict()
        for response, predictions in results.iteritems():
            ## discrete responses are filled in below from the probabilities
            if Response2LabelType(response).startswith('Discrete'):
                continue
            finalresults[name][response] = np.average(predictions, axis=0)

    finalresults4prob = dict()
    for name, results in allresults4prob.iteritems():
        finalresults4prob[name] = dict()
        for response, probabilities in results.iteritems():
            avgProb = np.average(probabilities, axis=0)
            finalresults4prob[name][response] = avgProb

            ## convert coding of discrete labels to more meaningful representation
            if Response2LabelType(response).startswith('Discrete'):
                finalresults[name][response] = PropertyUtils.Coding2String(
                    np.argmax(avgProb, axis=1), response)

    ## NOTE: the original source carried a disabled (string-quoted) section
    ## here that averaged model['labelRefProbs'] and the label weights over
    ## all models; it was never executed and its results were not returned.

    return finalresults4prob, finalresults, allsequences

示例#2

显示文件

文件： TrainPropertyPredictor.py 项目： zhujianwei31415/RaptorX-3DModeling

def main(argv):

    #modelSpecs = config.InitializeModelSpecs()
    modelSpecs = InitializeModelSpecs()
    modelSpecs = ParseCommandLine.ParseArguments(argv, modelSpecs)

    startTime = datetime.datetime.now()

    ##trainData and validData are a list. Each element corresponds to one protein, which is a dict()
    trainData = DataProcessor.LoadPropertyFeatures(modelSpecs['trainFile'],
                                                   modelSpecs=modelSpecs)
    validData = DataProcessor.LoadPropertyFeatures(modelSpecs['validFile'],
                                                   modelSpecs=modelSpecs)
    print '#trainData: ', len(trainData), '#validData: ', len(validData)

    ## where to add code to assign weight to each residue? We need to deal with the residues without 3D coordinates for angle and SS prediction
    ##a, b = DataProcessor.CalcLabelDistributionAndWeight(trainData, modelSpecs)

    modelSpecs['numOfTrainProteins'] = len(trainData)

    beforeBatchTime = datetime.datetime.now()
    print 'time spent on data loading: ', beforeBatchTime - startTime

    print 'Preparing batch data for training...'
    groupSize = modelSpecs['minibatchSize']
    trainSeqDataset, _ = DataProcessor.SplitData2Batches(
        data=trainData, numDataPoints=groupSize, modelSpecs=modelSpecs)
    validSeqDataset, _ = DataProcessor.SplitData2Batches(
        data=validData, numDataPoints=groupSize, modelSpecs=modelSpecs)
    #validSeqDataset = DataProcessor.SplitData2Batches(data=validData, numDataPoints=20000, modelSpecs=modelSpecs)
    print "#trainData minibatches:", len(
        trainSeqDataset), "#validData minibatches:", len(validSeqDataset)

    predSeqDataset = None
    if modelSpecs['predFile'] is not None:
        predData = DataProcessor.LoadPropertyFeatures(modelSpecs['predFile'],
                                                      modelSpecs=modelSpecs,
                                                      forTrainValidation=False)
        print '#predData: ', len(predData)
        predSeqDataset, _ = DataProcessor.SplitData2Batches(
            data=predData, numDataPoints=40, modelSpecs=modelSpecs)
        print "#predData minibatches:", len(predSeqDataset)

## Each protein in trainData contains three or four components: seqFeatures and label
    modelSpecs['n_in_seq'] = trainData[0]['seqFeatures'].shape[1]

    beforeTrainTime = datetime.datetime.now()

    print 'time spent on generating batch data:', beforeTrainTime - beforeBatchTime

    result = TrainModel(modelSpecs=modelSpecs,
                        trainSeqData=trainSeqDataset,
                        validSeqData=validSeqDataset,
                        predSeqData=predSeqDataset)

    ##merge ModelSpecs and result
    resultModel = modelSpecs.copy()
    resultModel.update(result)

    modelFile = GenerateModelFileName(resultModel)
    print 'Writing the resultant model to ', modelFile
    cPickle.dump(resultModel, file(modelFile, 'wb'), cPickle.HIGHEST_PROTOCOL)