Example #1
def byMinimum(data, response, matches, labels):
    """
    Downsamples the training data to match the size of the smallest-sample label.

    Parameters:
    - - - - -
        data : dictionary of training data, where keys are subjects
               and values are the vertex-wise data arrays
        response : dictionary of training responses, where keys are subjects
                   and values are vertex-wise label assignments
        matches : dictionary of matches, where keys are subjects, and
                  values are vertex-to-label frequency arrays
        labels : set of unique labels across all training subjects
    Returns:
    - - - -
        pData : downsampled data arrays, keyed by label
        pLabels : downsampled response vectors, keyed by label
        pMatches : downsampled match arrays, keyed by label
    """

    data = du.mergeValueArrays(data)
    response = du.mergeValueLists(response)
    matches = du.mergeValueArrays(matches)

    minSize = sys.maxsize

    pData = du.splitArrayByResponse(data, response, labels)
    pMatches = du.splitArrayByResponse(matches, response, labels)

    pLabels = du.buildResponseVector(pData)

    # compute minimum size sample array
    for lab in labels:

        tempData = pData[lab]
        minSize = min(minSize, tempData.shape[0])

    # downsample remaining arrays
    for lab in labels:

        tempData = pData[lab]
        tempMatches = pMatches[lab]
        tempLabels = pLabels[lab]

        inds = np.random.choice(np.arange(tempData.shape[0]),
                                size=minSize,
                                replace=False)
        pData[lab] = tempData[inds, :]
        pMatches[lab] = tempMatches[inds, :]
        pLabels[lab] = tempLabels[inds, :]

    return [pData, pLabels, pMatches]
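# The helper above relies on the project's `du` utilities. As a point of
# reference, a minimal self-contained sketch of the same idea (subsample every
# class to the size of the smallest class) might look like the following; the
# names here are illustrative and not part of the project:

import numpy as np

def balanceByMinimum(data, response, rng=None):
    """Randomly subsample each class in `data` to the smallest class size."""
    rng = np.random.default_rng(rng)
    classes, counts = np.unique(response, return_counts=True)
    minSize = counts.min()
    keep = np.concatenate([
        rng.choice(np.where(response == c)[0], size=minSize, replace=False)
        for c in classes])
    return data[keep], response[keep]

# Usage example: classes of sizes 10, 20, and 30 each reduce to 10 samples.
# X = np.random.rand(60, 4)
# y = np.repeat([1, 2, 3], [10, 20, 30])
# Xb, yb = balanceByMinimum(X, y)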
def validation(inputData, eval_factor):
    """
    Processing the validation data from the training set.  The validation 
    data is used to monitor the performance of the model, as the model is 
    trained.  It is expected to withold the validation data 
    from the test data.  The validation is used merely to inform 
    parameter selection.
    
    Parameters:
    - - - - -
        training : list of 3 dictionaries (0 = features, 1 = labels, 2 = matches)
        eval_size : fraction of training size to use as validation set
    """

    data = inputData[0]
    labels = inputData[1]
    matches = inputData[2]

    subjects = list(data.keys())

    # By default, will select at least 1 validation subject from list
    full = len(subjects)
    val = max(1, int(np.floor(eval_factor * full)))

    print('Total training subjects: {}'.format(full))

    # subject lists for training and validation sets
    train = list(np.random.choice(subjects, size=(full - val), replace=False))
    valid = list(set(subjects).difference(set(train)))

    inter = set(train).intersection(set(valid))
    print('{} training, {} validation.'.format(len(train), len(valid)))
    print('{} overlap between groups.'.format(len(inter)))

    training = du.subselectDictionary(train, [data, labels, matches])
    validation = du.subselectDictionary(valid, [data, labels, matches])

    validation[0] = du.mergeValueArrays(validation[0])
    validation[1] = du.mergeValueLists(validation[1])
    validation[2] = du.mergeValueArrays(validation[2])

    return [training, validation]
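# Stripped of the `du` merging steps, the subject-level split above boils down
# to choosing a random subset of subject IDs for validation. A small,
# self-contained sketch of that step (illustrative names only):

import numpy as np

def splitSubjects(subjects, eval_factor, seed=None):
    """Randomly split subject IDs into training and validation lists."""
    rng = np.random.default_rng(seed)
    subjects = list(subjects)
    val = max(1, int(np.floor(eval_factor * len(subjects))))
    valid = list(rng.choice(subjects, size=val, replace=False))
    train = [s for s in subjects if s not in valid]
    return train, valid

# Usage example: 10 subjects with eval_factor=0.2 gives 8 training, 2 validation.
# train, valid = splitSubjects(['subj{:02d}'.format(i) for i in range(10)], 0.2)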
def downsample(inputData, method, L=None):
    """
    Wrapper to downsample training data.
    """

    methodFuncs = {'equal': ds.byMinimum, 'core': ds.byCore}

    if L is None:
        L = np.arange(1, 181)
    else:
        L = np.arange(1, L + 1)

    x = inputData[0]
    y = inputData[1]
    m = inputData[2]

    [x, y, m] = methodFuncs[method](x, y, m, L)

    x = du.mergeValueArrays(x)
    y = du.mergeValueLists(y)
    m = du.mergeValueArrays(m)

    return [x, y, m]
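# `downsample` dispatches on `method` through a dictionary of functions and
# defaults the label range to 1..180 when L is not given. The dispatch pattern
# in isolation, with hypothetical reducers purely for illustration:

import numpy as np

def firstHalf(values):
    return values[: values.size // 2]

def everyOther(values):
    return values[::2]

reducers = {'half': firstHalf, 'stride': everyOther}
# e.g. reducers['stride'](np.arange(1, 181)) keeps the odd-numbered labels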
def mapLabelsToData(dataDict, labelDict, labelSet):
    """
    Partitions the training data for all subjects by label.
    
    Parameters:
    - - - - -
        dataDict : dictionary mapping subject names to data arrays
        labelDict : dictionary mapping subject names to cortical maps
        labelSet : set of unique labels across all training subjects
    Returns:
    - - - -
        partData : dictionary mapping each label in labelSet to the data
                   samples assigned that label
    """

    assert dataDict.keys() == labelDict.keys()

    mData = du.mergeValueArrays(dataDict)
    mLabels = du.mergeValueLists(labelDict)

    partData = du.splitArrayByResponse(mData, mLabels, labelSet)

    return partData
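# `du.splitArrayByResponse` is part of the project's utilities. The underlying
# operation, partitioning rows of an array by their label, can be sketched in
# plain NumPy as follows (illustrative only):

import numpy as np

def partitionByLabel(data, labels, labelSet):
    """Map each label in labelSet to the rows of `data` carrying that label."""
    return {lab: data[labels == lab] for lab in labelSet
            if np.any(labels == lab)}

# Usage example:
# data = np.random.rand(6, 3)
# labels = np.array([1, 1, 2, 2, 3, 3])
# partitionByLabel(data, labels, {1, 2, 3})[2].shape == (2, 3)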
Example #5
    def fit(self,
            x_train,
            y_train,
            neighbors,
            L,
            classifier=None,
            model_type='ori',
            **kwargs):
        """
        Method to initialize training data and fit the classifiers.
        
        Parameters:
        - - - - -

            x_train : training feature data, partitioned by response
            
            y_train : training response vectors, partitioned by response
            
            model_type : type of classification scheme for multi-class 
                         A   prediction models

            kwargs : optional arguments for classifier
        """

        if not classifier:
            classifier = rfc(n_estimators=self.n_estimators,
                             max_depth=self.max_depth,
                             n_jobs=-1)

        labels = np.arange(1, L + 1)
        self.labels = labels
        self.neighbors = neighbors

        # keep x_train / y_train partitioned by label; they are merged
        # per confusion set below
        labelKeys = list(x_train.keys())
        self.input_dim = x_train[labelKeys[0]].shape[1]

        # get valid arguments for supplied classifier
        # get valid parameters passed by user
        # update classifier parameters
        # save base models
        classifier_params = inspect.getfullargspec(classifier.__init__)
        classArgs = cu.parseKwargs(classifier_params, kwargs)
        classifier.set_params(**classArgs)

        print('depth: {}'.format(classifier.max_depth))
        print('nEst: {}'.format(classifier.n_estimators))

        model_selector = {
            'oVo': OneVsOneClassifier(classifier),
            'oVr': OneVsRestClassifier(classifier),
            'ori': classifier
        }

        models = {}.fromkeys(labels)

        for lab in labels:
            if lab in labelKeys and lab in neighbors:

                # compute confusion set of labels
                labelNeighbors = set([lab]).union(
                    neighbors[lab]).intersection(labels)

                # copy the model (due to passing by object-reference)
                models[lab] = copy.deepcopy(model_selector[model_type])

                # extract data for confusion set, train model
                training = du.mergeValueArrays(x_train, keys=labelNeighbors)
                response = du.mergeValueLists(y_train, keys=labelNeighbors)

                models[lab].fit(training, np.squeeze(response))

        self.models = models
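# The loop above trains one model per label on that label's "confusion set"
# (the label plus its neighbors). A compact, self-contained sketch of the same
# scheme on synthetic data, using scikit-learn directly and bypassing the
# project's `du` helpers (all names below are illustrative):

import copy
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
x_by_label = {lab: rng.normal(loc=lab, size=(50, 4)) for lab in (1, 2, 3)}
y_by_label = {lab: np.full(50, lab) for lab in (1, 2, 3)}
neighbors = {1: {2}, 2: {1, 3}, 3: {2}}

base = RandomForestClassifier(n_estimators=10, max_depth=3, n_jobs=-1)
models = {}
for lab, nbrs in neighbors.items():
    confusion = {lab} | nbrs
    X = np.vstack([x_by_label[l] for l in sorted(confusion)])
    y = np.concatenate([y_by_label[l] for l in sorted(confusion)])
    # each label gets an independent copy of the base classifier
    models[lab] = copy.deepcopy(base).fit(X, y)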
Example #6
# df_hmg is assumed to be initialized alongside df_size (its definition is not
# shown in this snippet); both collect one row per subject.
df_hmg = pd.DataFrame(columns=cols)
df_size = pd.DataFrame(columns=cols)

for subj in subjects:

    featureFile = ''.join([featureDir, subj, featureExt])
    labelFile = ''.join([labelDir, subj, labelExt])

    if os.path.exists(featureFile) and os.path.exists(labelFile):

        print('Processing %s' % subj)

        features = h5py.File(featureFile, mode='r')
        dataDict = features[subj]

        dataArray = du.mergeValueArrays(dataDict, keys=featureKeys)
        features.close()

        label = ld.loadGii(labelFile, darray=np.arange(1))

        regSim = hmg.regionalSimilarity(dataArray, label)
        regSize = {}.fromkeys(regSim.keys())

        for k in regSize.keys():
            indx = np.where(label == k)[0]
            regSize[k] = len(indx)

        df_hmg = pd.concat([df_hmg, pd.DataFrame([regSim])], ignore_index=True)
        df_size = pd.concat([df_size, pd.DataFrame([regSize])], ignore_index=True)

if df_hmg.shape[0] != 0: