import sys

import numpy as np

# `du` below refers to this repo's data-utility module, assumed to be
# imported at the module level.


def byMinimum(data, response, matches, labels):

    """
    Downsamples the training data to match the size of the
    smallest-sample label.

    Parameters:
    - - - - -
        data : dictionary of training data, where keys are subjects
                and values are the vertex-wise data arrays
        response : dictionary of training labels, where keys are
                subjects and values are vertex-wise label assignments
        matches : dictionary of matches, where keys are subjects, and
                values are vertex-to-label frequency arrays
        labels : set of unique labels across all training subjects

    Returns:
    - - - -
        pData : dictionary of downsampled data arrays, keyed by label
        pLabels : dictionary of downsampled response vectors, keyed
                by label
        pMatches : dictionary of downsampled match arrays, keyed by label
    """

    # merge the per-subject dictionaries into single arrays
    data = du.mergeValueArrays(data)
    response = du.mergeValueLists(response)
    matches = du.mergeValueArrays(matches)

    minSize = sys.maxint

    # partition each merged array by label
    pData = du.splitArrayByResponse(data, response, labels)
    pMatches = du.splitArrayByResponse(matches, response, labels)
    pLabels = du.buildResponseVector(pData)

    # compute minimum sample size across labels
    for lab in labels:
        tempData = pData[lab]
        minSize = min(minSize, tempData.shape[0])

    # downsample each label's arrays to the minimum size
    for lab in labels:
        tempData = pData[lab]
        tempMatches = pMatches[lab]
        tempLabels = pLabels[lab]

        inds = np.random.choice(np.arange(tempData.shape[0]),
                                size=minSize, replace=False)

        pData[lab] = tempData[inds, :]
        pMatches[lab] = tempMatches[inds, :]
        pLabels[lab] = tempLabels[inds, :]

    return [pData, pLabels, pMatches]
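# A minimal standalone sketch of the balancing step above, using only
# numpy in place of the `du` helpers (toy data, hypothetical labels):
#
#   import numpy as np
#
#   toyData = np.random.rand(10, 4)              # 10 samples, 4 features
#   toyResponse = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2])
#
#   byLabel = dict((lab, toyData[toyResponse == lab]) for lab in [1, 2])
#   minSize = min(arr.shape[0] for arr in byLabel.values())
#
#   balanced = {}
#   for lab, arr in byLabel.items():
#       inds = np.random.choice(np.arange(arr.shape[0]), size=minSize,
#                               replace=False)
#       balanced[lab] = arr[inds, :]
#
# Every balanced[lab] then has minSize (here, 4) rows, so downstream
# classifiers see an equal number of samples per label.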
def validation(inputData, eval_factor):

    """
    Splits the training set into training and validation subsets.

    The validation data is used to monitor the performance of the model
    as it is trained. The validation set is withheld from model fitting
    and is kept separate from the test data; it is used only to inform
    parameter selection.

    Parameters:
    - - - - -
        inputData : list of 3 dictionaries (0 = features, 1 = labels,
                2 = matches)
        eval_factor : fraction of the training set to use as the
                validation set

    Returns:
    - - - -
        training : list of the 3 dictionaries, restricted to the
                training subjects
        validation : list of [features, labels, matches] for the
                validation subjects, each merged into a single array
    """

    data = inputData[0]
    labels = inputData[1]
    matches = inputData[2]

    subjects = data.keys()

    # by default, select at least 1 validation subject from the list
    full = len(subjects)
    val = max(1, int(np.floor(eval_factor * full)))

    print 'Total training subjects: {}'.format(full)

    # subject lists for training and validation sets
    train = list(np.random.choice(subjects, size=(full - val),
                                  replace=False))
    valid = list(set(subjects).difference(set(train)))
    inter = set(train).intersection(set(valid))

    print '{} training, {} validation.'.format(len(train), len(valid))
    print '{} overlap between groups.'.format(len(inter))

    training = du.subselectDictionary(train, [data, labels, matches])
    validation = du.subselectDictionary(valid, [data, labels, matches])

    # merge the validation dictionaries into single arrays
    validation[0] = du.mergeValueArrays(validation[0])
    validation[1] = du.mergeValueLists(validation[1])
    validation[2] = du.mergeValueArrays(validation[2])

    return [training, validation]
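# A minimal standalone sketch of the subject-level split above
# (hypothetical subject names; the merging via `du` is omitted):
#
#   import numpy as np
#
#   subjects = ['s1', 's2', 's3', 's4', 's5']
#   eval_factor = 0.2
#
#   full = len(subjects)
#   val = max(1, int(np.floor(eval_factor * full)))   # at least 1 subject
#
#   train = list(np.random.choice(subjects, size=(full - val),
#                                 replace=False))
#   valid = list(set(subjects).difference(set(train)))
#
# train and valid are disjoint by construction: 4 training subjects,
# 1 validation subject.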
def downsample(inputData, method, L=None):

    """
    Wrapper to downsample the training data.

    Parameters:
    - - - - -
        inputData : list of 3 dictionaries (0 = features, 1 = labels,
                2 = matches)
        method : downsampling scheme ('equal' or 'core')
        L : largest label value; if None, labels 1..180 are assumed

    Returns:
    - - - -
        [x, y, m] : downsampled and merged features, labels, and matches
    """

    methodFuncs = {'equal': ds.byMinimum,
                   'core': ds.byCore}

    if L is None:
        L = np.arange(1, 181)
    else:
        L = np.arange(1, L + 1)

    x = inputData[0]
    y = inputData[1]
    m = inputData[2]

    # downsample, then merge the per-label dictionaries back together
    [x, y, m] = methodFuncs[method](x, y, m, L)

    x = du.mergeValueArrays(x)
    y = du.mergeValueLists(y)
    m = du.mergeValueArrays(m)

    return [x, y, m]
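# Usage sketch (hypothetical variable names; the three dictionaries are
# the per-subject features, labels, and matches described above):
#
#   x, y, m = downsample([trainData, trainLabels, trainMatches],
#                        method='equal', L=180)
#
# 'equal' balances every label to the smallest label's sample count via
# ds.byMinimum; 'core' delegates to ds.byCore instead.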
def mapLabelsToData(dataDict, labelDict, labelSet):

    """
    Partitions the training data for all subjects by label.

    Parameters:
    - - - - -
        dataDict : dictionary mapping subject names to data arrays
        labelDict : dictionary mapping subject names to cortical maps
        labelSet : set of unique labels across all training subjects

    Returns:
    - - - -
        partData : dictionary mapping each label to the data samples
                assigned to it
    """

    # both dictionaries must describe the same set of subjects
    assert set(dataDict.keys()) == set(labelDict.keys())

    mData = du.mergeValueArrays(dataDict)
    mLabels = du.mergeValueLists(labelDict)

    partData = du.splitArrayByResponse(mData, mLabels, labelSet)

    return partData
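# A minimal standalone sketch of the partitioning step, using numpy
# directly in place of the `du` helpers (toy arrays):
#
#   import numpy as np
#
#   mData = np.random.rand(6, 3)             # 6 vertices, 3 features
#   mLabels = np.array([1, 2, 1, 3, 2, 1])
#
#   partData = {}
#   for lab in set([1, 2, 3]):
#       inds = np.where(mLabels == lab)[0]
#       partData[lab] = mData[inds, :]
#
# partData[1] then holds the 3 rows of mData labeled 1, and so on.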
def fit(self, x_train, y_train, neighbors, L,
        classifier=None, model_type='ori', **kwargs):

    """
    Method to initialize training data and fit the classifiers.

    Parameters:
    - - - - -
        x_train : training feature data, partitioned by response
        y_train : training response vectors, partitioned by response
        neighbors : dictionary mapping each label to its set of
                neighboring labels (its confusion set)
        L : largest label value
        classifier : base classifier; defaults to a random forest
        model_type : type of classification scheme for the multi-class
                prediction models ('ori', 'oVo', or 'oVr')
        kwargs : optional arguments for the classifier
    """

    # `rfc` is assumed to be the module-level alias for scikit-learn's
    # RandomForestClassifier
    if not classifier:
        classifier = rfc(n_estimators=self.n_estimators,
                         max_depth=self.max_depth,
                         n_jobs=-1)

    labels = np.arange(1, L + 1)

    self.labels = labels
    self.neighbors = neighbors

    # x_train and y_train must stay partitioned by label: the label keys
    # and per-label arrays are needed below, so record the keys first and
    # compute the feature dimension from a merged copy
    labelKeys = x_train.keys()
    self.input_dim = du.mergeValueArrays(x_train).shape[1]

    # get valid arguments for the supplied classifier, filter the
    # parameters passed by the user, and update the classifier
    classifier_params = inspect.getargspec(classifier.__init__)
    classArgs = cu.parseKwargs(classifier_params, kwargs)
    classifier.set_params(**classArgs)

    print 'depth: {}'.format(classifier.max_depth)
    print 'nEst: {}'.format(classifier.n_estimators)

    model_selector = {'oVo': OneVsOneClassifier(classifier),
                      'oVr': OneVsRestClassifier(classifier),
                      'ori': classifier}

    # save base models, one per label
    models = dict.fromkeys(labels)

    for i, lab in enumerate(labels):
        if lab in labelKeys and lab in neighbors.keys():

            # compute the confusion set of labels
            labelNeighbors = set([lab]).union(
                neighbors[lab]).intersection(labels)

            # copy the model (due to passing by object-reference)
            models[lab] = copy.deepcopy(model_selector[model_type])

            # extract data for the confusion set, train the model
            training = du.mergeValueArrays(x_train, keys=labelNeighbors)
            response = du.mergeValueLists(y_train, keys=labelNeighbors)

            models[lab].fit(training, np.squeeze(response))

    self.models = models
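# A minimal standalone sketch of the per-label "confusion set" strategy
# used in fit, written against scikit-learn directly (toy data and a
# hypothetical neighbor map):
#
#   import numpy as np
#   from sklearn.ensemble import RandomForestClassifier
#
#   x_train = {1: np.random.rand(5, 3),
#              2: np.random.rand(5, 3),
#              3: np.random.rand(5, 3)}
#   y_train = dict((lab, np.repeat(lab, 5)) for lab in x_train)
#   neighbors = {1: set([2]), 2: set([1, 3]), 3: set([2])}
#
#   models = {}
#   for lab in x_train:
#       # each label's model sees only that label and its neighbors, so
#       # the classifier concentrates on the plausible confusions
#       confusion = set([lab]).union(neighbors[lab])
#       X = np.vstack([x_train[l] for l in confusion])
#       y = np.concatenate([y_train[l] for l in confusion])
#       models[lab] = RandomForestClassifier(n_estimators=10).fit(X, y)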
df_size = pd.DataFrame(columns=cols)

for subj in subjects:

    featureFile = ''.join([featureDir, subj, featureExt])
    labelFile = ''.join([labelDir, subj, labelExt])

    if os.path.exists(featureFile) and os.path.exists(labelFile):

        print 'Processing %s' % subj

        # load the subject's feature arrays and merge the selected keys
        features = h5py.File(featureFile, mode='r')
        dataDict = features[subj]
        dataArray = du.mergeValueArrays(dataDict, keys=featureKeys)
        features.close()

        label = ld.loadGii(labelFile, darray=np.arange(1))

        # regional homogeneity and region sizes for this subject
        regSim = hmg.regionalSimilarity(dataArray, label)

        regSize = {}.fromkeys(regSim.keys())
        for k in regSize.keys():
            indx = np.where(label == k)[0]
            regSize[k] = len(indx)

        df_hmg = df_hmg.append(regSim, ignore_index=True)
        df_size = df_size.append(regSize, ignore_index=True)

if df_hmg.shape[0] != 0: