def loadTest(self, y, yMatch):
    """
    Method to load the test data into the object.  We might be interested
    in loading new test data, so we have explicitly defined this as a
    method.

    Parameters:
    - - - - -
        y : SubjectFeatures object for a test brain

        yMatch : MatchingFeaturesTest object containing vertLib attribute
                 detailing which labels each vertex in surface y maps to
                 in the training data
    """

    load = self.load
    save = self.save
    features = self.features

    # load test subject data, save as attributes
    tObject = ld.loadH5(y, *['full'])
    ID = tObject.attrs['ID']

    parsedData = ld.parseH5(tObject, features)
    tObject.close()

    data = parsedData[ID]
    mtd = cu.mergeFeatures(data, features)

    print 'Testing shape: {}'.format(mtd.shape)

    if self.scaled:
        scaler = self.scaler
        mtd = scaler.transform(mtd)

    threshed = ld.loadMat(yMatch)

    # Computing label-vertex memberships is time consuming.
    # If already precomputed for given test data at specified threshold,
    # can supply path to load file.  Otherwise, compute label-vertex
    # memberships from scratch.
    if load and os.path.isfile(load):
        ltvm = ld.loadPick(load)
    else:
        ltvm = cu.vertexMemberships(threshed, 180)

    self.ltvm = ltvm

    # if save is provided, save label-vertex memberships to file
    if save:
        try:
            with open(save, "wb") as outFile:
                pickle.dump(ltvm, outFile, -1)
        except IOError:
            print('Cannot save label-vertex memberships to file.')

    return [threshed, mtd, ltvm]
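
# A minimal usage sketch for loadTest, assuming `model` is an instance of
# the enclosing class with `load`, `save`, `scaled`, `scaler`, and
# `features` already set (e.g. by loadTraining); the file paths below are
# hypothetical placeholders, not files shipped with this repo:
#
#   threshed, mtd, ltvm = model.loadTest(
#       '/data/TestObjects/100307.L.TestObject.h5',
#       '/data/Matches/100307.L.MatchingFeatures.mat')
#   print 'Test matrix shape: {}'.format(mtd.shape)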
def loadTest(self, y, yMatch):
    """
    Method to load the test data into the object.  We might be interested
    in loading new test data, so we have explicitly defined this as a
    method.

    Parameters:
    - - - - -
        y : SubjectFeatures object for a test brain

        yMatch : MatchingFeaturesTest object containing vertLib attribute
                 detailing which labels each vertex in surface y maps to
                 in the training data
    """

    # load test subject data, save as attributes
    tObject = ld.loadH5(y, *['full'])
    ID = tObject.attrs['ID']

    parsedData = ld.parseH5(tObject, self.features)
    tObject.close()

    data = parsedData[ID]
    mtd = cu.mergeFeatures(data, self.features)

    print 'Testing shape: {}'.format(mtd.shape)

    if self.scaled:
        scaler = self.scaler
        mtd = scaler.transform(mtd)

    threshed = ld.loadMat(yMatch)

    ltvm = cu.vertexMemberships(threshed, 180)

    return [threshed, mtd, ltvm]
def loadData(subjectList, dataMap, features, hemi):
    """
    Generates the training data from a list of subjects.

    Parameters:
    - - - - -
        subjectList : list of subjects to include in training set

        dataMap : dictionary mapping each data type ('object', 'midline',
                  'matching') to a single-entry {directory: extension}
                  dictionary locating that data type on disk

        features : list of features to include

        hemi : hemisphere to process
    """

    objDict = dataMap['object'].items()
    objDir = objDict[0][0]
    objExt = objDict[0][1]

    midDict = dataMap['midline'].items()
    midDir = midDict[0][0]
    midExt = midDict[0][1]

    matDict = dataMap['matching'].items()
    matDir = matDict[0][0]
    matExt = matDict[0][1]

    data = {}
    matches = {}

    for s in subjectList:

        # Training data
        trainObject = '{}{}.{}.{}'.format(objDir, s, hemi, objExt)
        print trainObject
        midObject = '{}{}.{}.{}'.format(midDir, s, hemi, midExt)
        matObject = '{}{}.{}.{}'.format(matDir, s, hemi, matExt)

        # Check to make sure all 3 files exist
        if os.path.isfile(trainObject) and os.path.isfile(
                midObject) and os.path.isfile(matObject):

            # Load midline indices
            # Subtract 1 for difference between Matlab and Python indexing
            mids = ld.loadMat(midObject) - 1
            mids = set(mids)

            match = ld.loadMat(matObject)

            # Load training data and training labels
            trainH5 = h5py.File(trainObject, mode='r')

            # Get data corresponding to features of interest
            subjData = ld.parseH5(trainH5, features)
            trainH5.close()

            # Exclude midline vertices from the sample set
            nSamples = set(np.arange(subjData[s][features[0]].shape[0]))
            coords = np.asarray(list(nSamples.difference(mids)))

            for f in subjData[s].keys():
                tempData = subjData[s][f]
                if tempData.ndim == 1:
                    tempData.shape += (1, )

                subjData[s][f] = np.squeeze(tempData[coords, :])

            match = match[coords, :]

            data[s] = subjData[s]
            matches[s] = match

    return [data, matches]
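
# A minimal sketch of the dataMap structure loadData expects, inferred from
# the parsing above (each entry is a single {directory: extension} pair);
# the directory names, extensions, subject IDs, and feature names below are
# hypothetical placeholders:
#
#   dataMap = {'object': {'/data/TrainingObjects/': 'TrainingObject.h5'},
#              'midline': {'/data/Midlines/': 'Midline_Indices.mat'},
#              'matching': {'/data/MatchingLibraries/': 'MatchingLibrary.mat'}}
#
#   data, matches = loadData(['100307', '100408'], dataMap,
#                            ['curv', 'sulc'], 'L')
#
# For subject 100307, the training object path resolves to
# '/data/TrainingObjects/100307.L.TrainingObject.h5'.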
def loadDataFromList(subjectList, dataDir, features, hemi):
    """
    Generates the training data for the neural network.

    Parameters:
    - - - - -
        subjectList : list of subjects to include in training set

        dataDir : main directory where data exists -- individual features
                  will exist in sub-directories here

        features : list of features to include

        hemi : hemisphere to process
    """

    hemisphere = {'Left': 'L', 'Right': 'R'}
    H = hemisphere[hemi]

    # For now, we hardcode where the data is
    trainDir = '{}TrainingObjects/FreeSurfer/'.format(dataDir)
    trainExt = '.{}.TrainingObject.aparc.a2009s.h5'.format(H)

    midDir = '{}Midlines/'.format(dataDir)
    midExt = '.{}.Midline_Indices.mat'.format(H)

    data = {}

    for s in subjectList:

        # Training data
        trainObject = '{}{}{}'.format(trainDir, s, trainExt)
        midObject = '{}{}{}'.format(midDir, s, midExt)

        # Check to make sure both files exist
        if os.path.isfile(trainObject) and os.path.isfile(midObject):

            # Load midline indices
            # Subtract 1 for difference between Matlab and Python indexing
            mids = ld.loadMat(midObject) - 1
            mids = set(mids)

            # Load training data and training labels
            trainH5 = h5py.File(trainObject, mode='r')

            # Get data corresponding to features of interest
            subjData = ld.parseH5(trainH5, features)
            trainH5.close()

            # Exclude midline vertices from the sample set
            nSamples = set(np.arange(subjData[s][features[0]].shape[0]))
            coords = np.asarray(list(nSamples.difference(mids)))

            for f in subjData[s].keys():
                tempData = subjData[s][f]
                if tempData.ndim == 1:
                    tempData.shape += (1, )

                subjData[s][f] = np.squeeze(tempData[coords, :])

            data[s] = subjData[s]

    return data
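
# A minimal usage sketch, assuming the hardcoded directory layout above
# exists under dataDir; note that hemi must be the full name ('Left' or
# 'Right'), not the single-letter code.  Paths, subject IDs, and feature
# names below are hypothetical placeholders:
#
#   data = loadDataFromList(['100307', '100408'], '/data/',
#                           ['curv', 'sulc', 'label'], 'Left')
#   # data['100307']['curv'] holds the curvature data for subject 100307
#   # with midline vertices removed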
def loadTraining(self, trainObject, dataDir, hemisphere, features):
    """
    Load the training data and attach it to the object.

    Parameters:
    - - - - -
        trainObject : input training data (either '.p' file, h5py File,
                      dictionary, or list of subject IDs)

        dataDir : main directory where data exists -- used when
                  trainObject is a list of subjects

        hemisphere : hemisphere to process

        features : list of features to include in the models
    """

    # check feature value
    if not features or not isinstance(features, list):
        raise ValueError('Features cannot be empty. Must be a list.')
    else:
        self.features = features

    # load the training data
    loadingFeatures = copy.copy(features)
    loadingFeatures.append('label')

    if isinstance(trainObject, str):
        trainData = ld.loadH5(trainObject, *['full'])
    elif isinstance(trainObject, h5py._hl.files.File) or isinstance(
            trainObject, dict):
        trainData = trainObject
    elif isinstance(trainObject, list):
        trainData = loadDataFromList(trainObject, dataDir,
                                     loadingFeatures, hemisphere)
    else:
        raise ValueError('Training object is of incorrect type.')

    if not trainData:
        raise ValueError('Training data cannot be empty.')

    if isinstance(trainData, h5py._hl.files.File):
        parseFeatures = copy.deepcopy(self.features)
        parseFeatures.append('label')

        parsedData = ld.parseH5(trainData, parseFeatures)
        trainData.close()
        trainData = parsedData

    # get subject IDs in training data
    subjects = trainData.keys()

    # if exclude_testing is set, exclude the data for these subjects
    # when fitting the models
    if self.exclude_testing:
        subjects = list(
            set(subjects).difference(set(self.exclude_testing)))

    # if random is set, select random subset of size random from
    # viable training subjects
    if not self.random:
        randomSample = len(subjects)
    else:
        randomSample = min(self.random, len(subjects))

    sample = np.random.choice(subjects, size=randomSample, replace=False)
    trainData = {s: trainData[s] for s in sample}

    training = []
    labels = []

    print 'Model features: {}'.format(features)

    # isolate the non-label features used to train the models
    nf = [f for f in self.features if f != 'label']

    for subj in trainData.keys():
        training.append(cu.mergeFeatures(trainData[subj], nf))
        labels.append(cu.mergeFeatures(trainData[subj], ['label']))

    trainData = np.squeeze(np.row_stack(training))
    labelVector = np.squeeze(np.concatenate(labels))
    self.labels = set(labelVector).difference({0, -1})

    if self.scale:
        scaler = preprocessing.StandardScaler(with_mean=True,
                                              with_std=True)
        trainData = scaler.fit_transform(trainData)
        self.scaler = scaler
        self.scaled = True

    # isolate training data corresponding to each label
    labelData = cu.partitionData(trainData, labelVector, self.labels)
    response = cu.buildResponseVector(self.labels, labelData)
    self.input_dim = labelData[labelData.keys()[0]].shape[1]

    # check quality of training data to ensure all features have same
    # length, all response vectors have the same number of samples, and
    # that all training data has the same features
    cond = True
    if not compareTrainingDataKeys(labelData, response):
        print('WARNING: Label data and label response do not have '
              'same keys.')
        cond = False

    if not compareTrainingDataSize(labelData, response):
        print('WARNING: Label data and label response are not same shape.')
        cond = False

    if not cond:
        raise ValueError('Training data is flawed.')

    return [labelData, response]
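
# A minimal end-to-end sketch, assuming `model` is an instance of the
# enclosing class with `exclude_testing`, `random`, and `scale` already
# set; the subject IDs, path, and feature names below are hypothetical
# placeholders:
#
#   labelData, response = model.loadTraining(
#       ['100307', '100408'], '/data/', 'Left', ['curv', 'sulc'])
#   # labelData maps each cortical label to its training samples;
#   # response maps each label to the corresponding response vector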