def ValidDataLoader2(sharedQ, stopValidDataLoader, validSeqData, modelSpecs, assembleData=True, UseSharedMemory=False):

	bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)
	if any([bUseCCMraw, bUseFullMI, bUseFullCov]):
		## when full coevolution matrices are used, we shall use float16 to save memory
		floatType = np.float16
	else:
		floatType = theano.config.floatX

	#print 'validDataLoader has event: ', stopValidDataLoader
	for batch in validSeqData:
		if stopValidDataLoader.is_set() or os.getppid()==1:
			#print 'validDataLoader receives the stop signal'
			break

		 ## Load real data for one batch
                data = DataProcessor.LoadRealData(batch, modelSpecs, returnMode='list')

                ## add code here to make sure that the data has the same input dimension as the model specification
                FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)

		if assembleData:
			data = PrepareInput4Validate(data, modelSpecs, floatType=floatType, UseSharedMemory=UseSharedMemory)
		#print 'putting data to validDataLoader queue...'
		sharedQ.put(data)

	print 'validDataLoader has finished loading data'
	sharedQ.close()
# Example #2
# 0
def LoadFeaturePKL(name,
                   location='Feature4Train_2017_E001_PKL/',
                   modelSpecs=None):
    """
    Load the input features of one protein and return them as a dict.

    name: the protein name, used as the prefix of all the file names
    location: the folder containing the feature files
    modelSpecs: the model specification; it is passed to config.ParseExtraCCMmode
        to decide which extra features shall be loaded

    The basic features are read from <name>.inputFeatures.pkl. Depending on the
    flags returned by config.ParseExtraCCMmode, the following keys are added:
    CCMFnorm and CCMFnormZ, sumCCM, rawCCM (all from <name>.extraCCM.pkl),
    fullCov and fullMI (both calculated from <name>.a2m).

    When a needed file or entry is missing, an error message is printed and the
    program terminates through exit(1).
    """

    ## load up the basic input features
    filename = os.path.join(location, name + '.inputFeatures.pkl')
    if not os.path.isfile(filename):
        print 'ERROR: the input feature file does not exist: ', filename
        exit(1)
    ## a pickle file holds binary data, so it shall be opened in binary mode
    with open(filename, 'rb') as fh:
        feature = cPickle.load(fh)

    ## check to see if loading up extra features
    bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(
        modelSpecs)

    ## the extra CCM file is needed only when at least one of these three flags is on
    bUseExtraCCM = bUseCCMFnorm or bUseCCMsum or bUseCCMraw
    if bUseExtraCCM:
        extrafile = os.path.join(location, name + '.extraCCM.pkl')
        if not os.path.isfile(extrafile):
            print 'ERROR: the file for extra CCM information does not exist: ', extrafile
            exit(1)
        with open(extrafile, 'rb') as fh:
            extra = cPickle.load(fh)

    if bUseCCMFnorm:
        feature['CCMFnorm'] = extra['Fnorm']
        feature['CCMFnormZ'] = extra['FnormZ']

    seqLen = len(feature['sequence'])
    if bUseCCMsum:
        if 'sumCCM' not in extra:
            print 'ERROR: CCM summary is requested, but the file does not have it: ', extrafile
            exit(1)
        feature['sumCCM'] = CCMpredUtils.ExpandMatrix(extra['sumCCM'], seqLen)

    if bUseCCMraw:
        if 'rawCCM' not in extra:
            print 'ERROR: CCM raw matrix is requested, but the file does not have it: ', extrafile
            exit(1)
        feature['rawCCM'] = CCMpredUtils.ExpandMatrix(extra['rawCCM'], seqLen)

    ## both the full covariance matrix and the full MI matrix are derived from the same a2m file
    if bUseFullCov or bUseFullMI:
        alnfile = os.path.join(location, name + '.a2m')
        if not os.path.isfile(alnfile):
            print 'ERROR: the a2m file does not exist: ', alnfile
            exit(1)
    if bUseFullCov:
        feature['fullCov'] = MSAUtils.CalcPairMatrixFromFile(alnfile)
    if bUseFullMI:
        feature['fullMI'] = MSAUtils.CalcPairMatrixFromFile(alnfile,
                                                            matrixType='mi')

    return feature
def CollectMatrixFeatures(d, modelSpecs, returnMode='array'):
    ##collecting pairwise features...
    pairfeatures_nomean = [
    ]  # a set of pairwise features for which we do not calculate their expected value
    pairfeatures = []

    if not config.NoOldLocationFeatures(modelSpecs):
        posFeature = FeatureUtils.LocationFeature(d)
        pairfeatures_nomean.append(posFeature)

        cbrtFeature = FeatureUtils.CubeRootFeature(d)
        pairfeatures_nomean.append(cbrtFeature)

    if config.UseNewLocationFeatures(modelSpecs):
        posFeatures = FeatureUtils.NewLocationFeature(d)
        pairfeatures_nomean.extend(posFeatures)

    if config.UseCCMZ(modelSpecs):
        if not d.has_key('ccmpredZ'):
            print 'ERROR: CCMpredZ is requested, but the data for protein ', d[
                'name'], ' does not have it!'
            exit(1)
        else:
            pairfeatures.append(d['ccmpredZ'])

    if config.UseRawCCM(modelSpecs):
        if not d.has_key('ccmpred'):
            print 'ERROR: Raw CCMpred is requested, but the data for protein ', d[
                'name'], ' does not have it!'
            exit(1)
        pairfeatures.append(d['ccmpred'])

    if config.UsePSICOV(modelSpecs):
        if not d.has_key('psicovZ'):
            print 'ERROR: psicovZ is requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['psicovZ'])

    if config.UseContactPotential(modelSpecs):
        if not d.has_key('OtherPairs'):
            print 'ERROR: pairwise contact potential is requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['OtherPairs'][:, :, 0])

    if config.UseMI(modelSpecs):
        if not d.has_key('OtherPairs'):
            print 'ERROR: mutual information is requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['OtherPairs'][:, :, 1:3])

    bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(
        modelSpecs)
    if bUseCCMFnorm:
        if not d.has_key('CCMFnorm') or not d.has_key('CCMFnormZ'):
            print 'ERROR: CCM Fnorm and/or FnormZ are requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['CCMFnorm'])
        pairfeatures.append(d['CCMFnormZ'])

    if bUseCCMsum:
        if not d.has_key('sumCCM'):
            print 'ERROR: CCM summary are requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['sumCCM'])

    if bUseCCMraw:
        if not d.has_key('rawCCM'):
            print 'ERROR: CCM raw matrix is requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['rawCCM'])

    if bUseFullMI:
        if not d.has_key('fullMI'):
            print 'ERROR: full MI matrix is requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['fullMI'])

    if bUseFullCov:
        if not d.has_key('fullCov'):
            print 'ERROR: full covariance matrix is requested, but the data for protein ', d[
                'name'], ' does not have it'
            exit(1)
        pairfeatures.append(d['fullCov'])

    ##add template-based distance and orientation matrices
    if config.UseTemplate(modelSpecs):
        pairfeatures_nomean.extend(CollectTemplateMatrixFeatures(
            d, modelSpecs))

    if returnMode.lower() == 'array':
        matrixFeature = np.dstack(tuple(pairfeatures))
        if len(pairfeatures_nomean) > 0:
            matrixFeature_nomean = np.dstack(tuple(pairfeatures_nomean))
        else:
            seqLen = matrixFeature.shape[0]
            matrixFeature_nomean = np.zeros((seqLen, seqLen, 0),
                                            dtype=config.MyFloat)

        #print 'matrixFeature.shape: ', matrixFeature.shape
        return matrixFeature, matrixFeature_nomean
    else:
        return pairfeatures, pairfeatures_nomean
# Example #4
# 0
def LoadFeaturePKL(name,
                   location='Feature4Train_2017_E001_PKL/',
                   modelSpecs=None):
    """
    Load the input features of one protein and return them as a dict.

    name: the protein name, used as the prefix of all the file names
    location: the folder containing the feature files
    modelSpecs: the model specification; it is passed to config.ParseExtraCCMmode
        and config.ParseESMmode to decide which extra features shall be loaded

    The basic features are read from <name>.inputFeatures.pkl. Depending on
    modelSpecs, the following keys are added:
    CCMFnorm and CCMFnormZ, sumCCM, rawCCM (all from <name>.extraCCM.pkl),
    fullMI (calculated from <name>.a2m),
    fullCov (loaded from <name>.cov.pkl if this file exists, otherwise calculated from <name>.a2m),
    ESM (the requested layers in <name>.esm2.pkl concatenated along axis 2).

    When a needed file or entry is missing, an error message is printed and the
    program terminates through exit(1).
    """

    ## load up the basic input features
    filename = os.path.join(location, name + '.inputFeatures.pkl')
    if not os.path.isfile(filename):
        print 'ERROR: the input feature file does not exist: ', filename
        exit(1)
    ## a pickle file holds binary data, so it is opened in binary mode like the cov and ESM files below
    with open(filename, 'rb') as fh:
        feature = cPickle.load(fh)

    ## check to see if loading up extra features
    bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(
        modelSpecs)

    ## the extra CCM file is needed only when at least one of these three flags is on
    bUseExtraCCM = bUseCCMFnorm or bUseCCMsum or bUseCCMraw
    if bUseExtraCCM:
        extrafile = os.path.join(location, name + '.extraCCM.pkl')
        if not os.path.isfile(extrafile):
            print 'ERROR: the file for extra CCM information does not exist: ', extrafile
            exit(1)
        with open(extrafile, 'rb') as fh:
            extra = cPickle.load(fh)

    if bUseCCMFnorm:
        feature['CCMFnorm'] = extra['Fnorm']
        feature['CCMFnormZ'] = extra['FnormZ']

    seqLen = len(feature['sequence'])
    if bUseCCMsum:
        if 'sumCCM' not in extra:
            print 'ERROR: CCM summary is requested, but the file does not have it: ', extrafile
            exit(1)
        feature['sumCCM'] = CCMpredUtils.ExpandMatrix(extra['sumCCM'], seqLen)

    if bUseCCMraw:
        if 'rawCCM' not in extra:
            print 'ERROR: CCM raw matrix is requested, but the file does not have it: ', extrafile
            exit(1)
        feature['rawCCM'] = CCMpredUtils.ExpandMatrix(extra['rawCCM'], seqLen)

    if bUseFullMI:
        alnfile = os.path.join(location, name + '.a2m')
        if not os.path.isfile(alnfile):
            print 'ERROR: the a2m file does not exist: ', alnfile
            exit(1)
        feature['fullMI'] = MSAUtils.CalcPairMatrixFromFile(alnfile,
                                                            matrixType='mi')

    if bUseFullCov:
        ## prefer the precomputed covariance matrix; calculate it from the alignment only when it is absent
        covfile = os.path.join(location, name + '.cov.pkl')
        if not os.path.isfile(covfile):
            alnfile = os.path.join(location, name + '.a2m')
            if not os.path.isfile(alnfile):
                print 'ERROR: the a2m file does not exist:', alnfile
                exit(1)
            feature['fullCov'] = MSAUtils.CalcPairMatrixFromFile(alnfile)
        else:
            with open(covfile, 'rb') as fh:
                feature['fullCov'] = cPickle.load(fh)

    ## check to see if we shall load up ESM information
    layers = config.ParseESMmode(modelSpecs)
    if layers is not None:
        esmfile = os.path.join(location, name + '.esm2.pkl')
        if not os.path.isfile(esmfile):
            print 'ERROR: the file for ESM information does not exist: ', esmfile
            exit(1)

        with open(esmfile, 'rb') as fh:
            esm = cPickle.load(fh)

        esmfeature = []
        for layer in layers:
            ## the modulo maps a negative layer index to a non-negative key
            layer4key = layer % (esm['numModelLayers'] + 1)
            if layer4key not in esm:
                print 'ERROR: attention weight for layer ', layer, ' requested but not available in ', esmfile
                exit(1)
            esmfeature.append(esm[layer4key])

        feature['ESM'] = np.concatenate(esmfeature, axis=2)

    return feature
    def __init__(self,
                 rng,
                 seqInput,
                 matrixInput,
                 mask_seq=None,
                 mask_matrix=None,
                 embedInput=None,
                 boundingbox=None,
                 modelSpecs=None):
        """
        Build the whole network: a 1D convolution module on the sequential input,
        layers converting the 1D output to pairwise features, a 2D convolution
        module on the merged pairwise input, and one predictor per response.

        rng is passed to every layer constructor.
        seqInput has shape (batchSize, seqLen, n_in_seq)
        matrixInput has shape (batchSize, seqLen, seqLen, n_in_matrix)
        mask_seq has shape (batchSize, #cols_to_be_masked)
        mask_matrix has shape (batchSize, #rows_to_be_masked, seqLen)
        embedInput has shape (batchSize, seqLen, n_in2)
        boundingbox is a vector of 4 integer elements: top, left, bottom and right. boundingbox shall only be applied to the matrix converted from sequential features.
        modelSpecs is the model specification and must not be None.

        After construction the following attributes are available: layers,
        predictors, outputList, output_probList, output, output_prob,
        params, paramL1, paramL2, params4var, paramL14var and paramL24var.
        """

        assert (modelSpecs is not None)

        self.modelSpecs = modelSpecs
        self.responses = modelSpecs['responses']

        ## set the number of hidden neurons and number of layers
        n_in_seq = modelSpecs['n_in_seq']
        n_in_matrix = modelSpecs['n_in_matrix']
        n_hiddens_seq = modelSpecs['conv1d_hiddens']
        n_hiddens_matrix = modelSpecs['conv2d_hiddens']
        n_hiddens_logreg = modelSpecs['logreg_hiddens']
        seq_repeats = modelSpecs['conv1d_repeats']
        matrix_repeats = modelSpecs['conv2d_repeats']

        ## half win size for convolutional operation
        ## the dilation settings are defined only for DilatedResNet and are used only in the DilatedResNet branches below
        if modelSpecs['network'].startswith('DilatedResNet'):
            hwsz_matrix = modelSpecs['conv2d_hwszs']
            hwsz_seq = [modelSpecs['conv1d_hwsz']] * len(n_hiddens_seq)
            dilation_seq = [1] * len(n_hiddens_seq)
            dilation_matrix = modelSpecs['conv2d_dilations']
        else:
            hwsz_matrix = modelSpecs['halfWinSize_matrix']
            hwsz_seq = modelSpecs['halfWinSize_seq']

        ## masks to reduce impact of padding zeros
        self.mask_1d = mask_seq
        self.mask_2d = mask_matrix

        ## the order of the layers in this list decides the order of the parameters collected at the end
        self.layers = []

        ## relu is the default activation; TANH is the only alternative checked here
        act = T.nnet.relu
        if modelSpecs['activation'] == 'TANH':
            act = T.tanh

        # sequence convolution
        if modelSpecs['network'].startswith('DilatedResNet'):
            #seqConv = DilatedResNet(rng, input=seqInput, n_in=n_in_seq, n_hiddens=n_hiddens_seq, n_repeats=seq_repeats, halfWinSize=hwsz_seq, dilation=dilation_seq, mask=mask_seq, activation=act, batchNorm=modelSpecs['batchNorm'], version=modelSpecs['network'])
            seqConv = DilatedResNet(rng,
                                    input=seqInput,
                                    n_in=n_in_seq,
                                    n_hiddens=n_hiddens_seq,
                                    n_repeats=seq_repeats,
                                    halfWinSize=hwsz_seq,
                                    dilation=dilation_seq,
                                    mask=mask_seq,
                                    activation=act,
                                    modelSpecs=modelSpecs)
        else:
            seqConv = ResNet(rng,
                             input=seqInput,
                             n_in=n_in_seq,
                             n_hiddens=n_hiddens_seq,
                             n_repeats=seq_repeats,
                             halfWinSize=hwsz_seq,
                             mask=mask_seq,
                             activation=act,
                             batchNorm=modelSpecs['batchNorm'],
                             version=modelSpecs['network'])
        self.layers.append(seqConv)

        ## transform 1d sequence to 2d matrix
        seq2matrixMode = modelSpecs['seq2matrixMode']
        seq2matrixLayers = []
        embedLayers = []

        ## determine if we shall use the sequential features or not. The sequential features include sequence profile (PSSM), predicted secondary structure and predicted solvent accessibility
        ## useSequentialFeatures is True by default
        ##useSequentialFeatures = ( modelSpecs.has_key('UseSequentialFeatures') and (modelSpecs['UseSequentialFeatures'] is True) )

        ## use OuterConcatenation operation to convert sequence features into pairwise features
        ## NOTE(review): config.UseSequentialFeatures is referenced without being called; if it is a function rather than a flag, this test is always true -- confirm
        if seq2matrixMode.has_key('OuterCat') and config.UseSequentialFeatures:

            ##midpointfeature has shape (batchSize, seqLen, seqLen, n_midpoint_out)
            midpointfeature, n_midpoint_out = MidpointFeature(seqConv.output,
                                                              seqConv.n_out,
                                                              box=boundingbox)

            ##remove noise in midpointfeature
            ## mask_matrix is used to reduce noise introduced by padding positions
            ## the mask is applied twice: first to the leading rows, then (transposed) to the leading columns
            ## NOTE(review): mask_matrix defaults to None but its shape is read here unconditionally -- confirm that callers always pass a mask when OuterCat is used
            mid_subtensor = midpointfeature[:, :mask_matrix.shape[1], :, :]
            midpointfeature = T.set_subtensor(
                mid_subtensor,
                T.mul(mask_matrix.dimshuffle(0, 1, 2, 'x'), mid_subtensor))
            mid_subtensor2 = midpointfeature[:, :, :mask_matrix.shape[1], :]
            midpointfeature = T.set_subtensor(
                mid_subtensor2,
                T.mul(mask_matrix.dimshuffle(0, 2, 1, 'x'), mid_subtensor2))

            ## here we use convolution with halfWinSize=0 to reduce model complexity
            compressLayer = Conv2D4DistMatrix(
                rng,
                input=midpointfeature,
                n_in=n_midpoint_out,
                n_hiddens=seq2matrixMode['OuterCat'],
                halfWinSize=0,
                mask=mask_matrix)
            #compressLayer = Conv2D4DistMatrix(rng, input=midpointfeature, n_in=n_midpoint_out, n_hiddens=seq2matrixMode['OuterCat'], halfWinSize=0, mask=None )
            seq2matrixLayers.append(compressLayer)

        ## embedding primary sequence and/or predicted secondary structure
        if embedInput is not None:
            from EmbeddingLayer import EmbeddingLayer4AllRange

            ## Seq+SS takes precedence over SeqOnly when both are specified
            if seq2matrixMode.has_key('Seq+SS'):
                n_out_embed = seq2matrixMode['Seq+SS']
            elif seq2matrixMode.has_key('SeqOnly'):
                n_out_embed = seq2matrixMode['SeqOnly']
            else:
                print 'At least one of two embedding modes Seq+SS or SeqOnly shall be specified.'
                exit(1)

            embedLayer = EmbeddingLayer4AllRange(embedInput,
                                                 modelSpecs['n_in_embed'],
                                                 n_out_embed,
                                                 box=boundingbox)
            seq2matrixLayers.append(embedLayer)
            embedLayers.append(embedLayer)
        """
	we do not use this profile embedding any more
	## embedding the sequence profile
	if seq2matrixMode.has_key('Profile') and useSequentialFeatures:
	    from EmbeddingLayer import ProfileEmbeddingLayer
	    pEmbedLayer = ProfileEmbeddingLayer(seqConv.output, seqConv.n_out, seq2matrixMode['Profile'])
	    seq2matrixLayers.append(pEmbedLayer)
	    embedLayers.append(pEmbedLayer)
	"""

        self.layers += seq2matrixLayers

        bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(
            modelSpecs)
        if (bUseCCMraw or bUseFullMI
                or bUseFullCov) and config.CompressMatrixInput(modelSpecs):
            ## here we add a compress layer to reduce the #channels of the original matrix input.
            ## NOTE(review): compressLayer4MatrixInput is not appended to self.layers, so its parameters are not collected into self.params at the end -- confirm this is intended
            n_hiddens4MatrixCompress = modelSpecs['hiddens4MatrixCompress']
            compressLayer4MatrixInput = Conv2D4DistMatrix(
                rng,
                input=matrixInput,
                n_in=n_in_matrix,
                n_hiddens=n_hiddens4MatrixCompress,
                halfWinSize=0,
                mask=mask_matrix)
            compressedMatrixInput = compressLayer4MatrixInput.output
            n_compressedMatrix = compressLayer4MatrixInput.n_out
            input_2d = T.concatenate(
                [compressedMatrixInput] +
                [layer.output for layer in seq2matrixLayers],
                axis=3)
            n_input2d = n_compressedMatrix + sum(
                [layer.n_out for layer in seq2matrixLayers])
        else:

            ##old code for merging original matrix input and sequential input
            input_2d = T.concatenate(
                [matrixInput] + [layer.output for layer in seq2matrixLayers],
                axis=3)
            n_input2d = n_in_matrix + sum(
                [layer.n_out for layer in seq2matrixLayers])

        #print 'n_input2d=', n_input2d

        ## matrix convolution on the merged pairwise input
        if modelSpecs['network'].startswith('ResNet'):
            matrixConv = ResNet(rng,
                                input=input_2d,
                                n_in=n_input2d,
                                n_hiddens=n_hiddens_matrix,
                                n_repeats=matrix_repeats,
                                halfWinSize=hwsz_matrix,
                                mask=mask_matrix,
                                activation=act,
                                batchNorm=modelSpecs['batchNorm'],
                                version=modelSpecs['network'])

        elif modelSpecs['network'].startswith('DilatedResNet'):
            #matrixConv=DilatedResNet(rng, input=input_2d, n_in=n_input2d, n_hiddens=n_hiddens_matrix, n_repeats=matrix_repeats, halfWinSize=hwsz_matrix, dilation=dilation_matrix, mask=mask_matrix, activation=act, batchNorm=modelSpecs['batchNorm'], version=modelSpecs['network'])
            matrixConv = DilatedResNet(rng,
                                       input=input_2d,
                                       n_in=n_input2d,
                                       n_hiddens=n_hiddens_matrix,
                                       n_repeats=matrix_repeats,
                                       halfWinSize=hwsz_matrix,
                                       dilation=dilation_matrix,
                                       mask=mask_matrix,
                                       activation=act,
                                       modelSpecs=modelSpecs)
        else:
            print 'ERROR: Unimplemented deep network type: ', modelSpecs[
                'network']
            exit(1)

        self.layers.append(matrixConv)

        conv_out = matrixConv.output

        ## move the channel axis to the front, flatten the other three axes and then transpose,
        ## so that each row of selected holds the channels of one matrix entry
        selected = conv_out.dimshuffle(3, 0, 1, 2).flatten(2).dimshuffle(1, 0)
        n_in4logreg = matrixConv.n_out

        self.outputList = []
        self.output_probList = []
        self.predictors = []

        ## parameters (and their norms) used only for variance prediction; filled by the Normal/LogNormal predictors
        self.params4var = []
        self.paramL14var = 0
        self.paramL24var = 0

        ## build one predictor for each response; all of them share the same input
        for res in modelSpecs['responses']:

            labelType = Response2LabelType(res)
            predictor = None

            if labelType.startswith('Discrete'):
                assert GetResponseValueDims(res) == 1
                predictor = NN4LogReg(rng=rng,
                                      input=selected,
                                      n_in=n_in4logreg,
                                      n_out=GetResponseProbDims(res),
                                      n_hiddens=n_hiddens_logreg)

            elif labelType.startswith('LogNormal') or labelType.startswith(
                    'Normal'):
                predictor = NN4Normal(rng=rng,
                                      input=selected,
                                      n_in=n_in4logreg,
                                      n_variables=GetResponseValueDims(res),
                                      n_out=GetResponseProbDims(res),
                                      n_hiddens=n_hiddens_logreg)

                ## recording parameters specific for variance prediction
                self.params4var += predictor.params4var
                self.paramL14var += predictor.paramL14var
                self.paramL24var += predictor.paramL24var

            else:
                print 'incorrect response name or label type: ', res
                exit(1)

            self.layers.append(predictor)
            self.predictors.append(predictor)

            ## output in 2d matrix
            ## reshape the flattened prediction back to the first three dimensions of conv_out
            output_2d = predictor.y_pred.reshape(
                (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2],
                 GetResponseValueDims(res)))
            output_2d_prob = predictor.output.reshape(
                (conv_out.shape[0], conv_out.shape[1], conv_out.shape[2],
                 GetResponseProbDims(res)))

            self.outputList.append(output_2d)
            self.output_probList.append(output_2d_prob)

        ## the results of all the responses are concatenated along the last axis
        self.output = T.concatenate(self.outputList, axis=3)
        self.output_prob = T.concatenate(self.output_probList, axis=3)

        ## collect all the model parameters and their norms
        self.params = []
        self.paramL2 = 0
        self.paramL1 = 0

        for layer in self.layers:
            self.params += layer.params
            self.paramL2 += layer.paramL2
            self.paramL1 += layer.paramL1
        """
def TrainDataLoader2(sharedQ, stopTrainDataLoader, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
	#print 'trainDataLoader has event: ', stopTrainDataLoader

	bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)
	if any([bUseCCMraw, bUseFullMI, bUseFullCov]):
		## when full coevolution matrices are used, we shall use float16 to save memory
		floatType = np.float16
	else:
		floatType = theano.config.floatX

	## here we use labelPool to cache the labels of all the training proteins
	## one protein may have multiple sets of input features due to MSA sampling or sequnence-template alignment
	## but it can only have one set of label matrices, so it is worth to save all label matrices in RAM.
	labelPool = dict()
	labelWeightPool = dict()

	while True:
		if stopTrainDataLoader.is_set() or os.getppid()==1:
			#print 'trainDataLoader receives the stop signal'
			break

		trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
		numOriginals = len(trainDataLocation)
		trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
		random.shuffle(trainSeqData)

		#i = 0
		for batch in trainSeqData:
			if stopTrainDataLoader.is_set() or os.getppid()==1:
				#print 'trainDataLoader receives the stop signal'
				break

			data = []
			for protein in batch:
				name = protein['name']
				if labelPool.has_key(name):
					## label is already in the pool
					d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
					d['atomLabelMatrix'] = labelPool[name]
				else:
					d = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list')
					assert d.has_key('atomLabelMatrix')
					labelPool[name] = d['atomLabelMatrix']

				if config.UseSampleWeight(modelSpecs):
					if not labelWeightPool.has_key(name): 
						labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
						labelWeightPool[name] = labelWeightMatrix
						d['labelWeightMatrix'] = labelWeightMatrix
					else:
						d['labelWeightMatrix'] = labelWeightPool[name]

				data.append(d)

			FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
			if assembleData:
				data = PrepareInput4Train(data, modelSpecs, floatType=floatType, UseSharedMemory=UseSharedMemory)
			#print 'putting data to trainDataLoader queue...'
			sharedQ.put(data)

			"""
			i += 1
			if i%100 == 0:
				print '#batches of train data loaded: ', i
			"""

		#print 'TrainDataLoader with #PID ', os.getpid(), ' currently has ', len(labelPool), ' label matrices  and ', len(labelMatrixPool), ' label weight matrices'
	print 'TrainDataLoader has finished loading data'
	sharedQ.close()