Example #1
File: crossvalsets.py Project: caiw/pyrsa
# Module-level imports these crossvalsets.py excerpts rely on (the
# default_k_rdm path is confirmed by the test in Example #4; default_k_pattern
# lives alongside it; helpers such as add_pattern_index and
# sets_k_fold_pattern are defined elsewhere in the same project):
import numpy as np
from rsatoolbox.util.inference_util import default_k_pattern, default_k_rdm


def sets_k_fold_rdm(rdms, k_rdm=None, random=True, rdm_descriptor='index'):
    """ generates training and test set combinations by splitting into k
    similar sized groups. This version splits both over rdms and over patterns
    resulting in k_rdm * k_pattern (training, test) pairs.

    Args:
        rdms(rsatoolbox.rdm.RDMs): rdms to use
        rdm_descriptor(String): descriptor to select rdm groups
        k_rdm(int): number of rdm groups
        random(bool): whether the assignment shall be randomized

    Returns:
        train_set(list): list of tuples (rdms, pattern_idx)
        test_set(list): list of tuples (rdms, pattern_idx)
        ceil_set(list): list of tuples (rdms, pattern_idx), identical to
            train_set since patterns are not split

    """
    rdm_select = rdms.rdm_descriptors[rdm_descriptor]
    rdm_select = np.unique(rdm_select)
    if k_rdm is None:
        k_rdm = default_k_rdm(len(rdm_select))
    assert k_rdm <= len(rdm_select), \
        'Can make at most as many groups as rdms'
    if random:
        np.random.shuffle(rdm_select)
    group_size_rdm = int(np.floor(len(rdm_select) / k_rdm))
    additional_rdms = len(rdm_select) % k_rdm
    train_set = []
    test_set = []
    for i_group in range(k_rdm):
        test_idx = np.arange(i_group * group_size_rdm,
                             (i_group + 1) * group_size_rdm)
        if i_group < additional_rdms:
            # append one leftover rdm by its positive index so that
            # setdiff1d below correctly excludes it from the training set
            test_idx = np.concatenate(
                (test_idx, [len(rdm_select) - 1 - i_group]))
        train_idx = np.setdiff1d(np.arange(len(rdm_select)), test_idx)
        rdm_idx_test = [rdm_select[int(idx)] for idx in test_idx]
        rdm_idx_train = [rdm_select[int(idx)] for idx in train_idx]
        rdms_test = rdms.subsample(rdm_descriptor, rdm_idx_test)
        rdms_train = rdms.subsample(rdm_descriptor, rdm_idx_train)
        train_set.append([rdms_train, np.arange(rdms_train.n_cond)])
        test_set.append([rdms_test, np.arange(rdms_test.n_cond)])
    # without a pattern split the noise-ceiling set is simply the training set
    ceil_set = train_set
    return train_set, test_set, ceil_set
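
A minimal usage sketch (not from the source; the RDMs construction and the 'session' descriptor are assumptions about the rsatoolbox API):

import numpy as np
import rsatoolbox.rdm

dissimilarities = np.random.rand(6, 10)  # 6 RDMs over 5 conditions
rdms = rsatoolbox.rdm.RDMs(
    dissimilarities,
    rdm_descriptors={'session': np.arange(6)})
train_set, test_set, ceil_set = sets_k_fold_rdm(
    rdms, k_rdm=3, rdm_descriptor='session')
print(len(train_set))  # 3 folds, each pairing an RDMs subset with pattern indices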
Example #2
File: crossvalsets.py Project: caiw/pyrsa
def sets_k_fold(rdms,
                k_rdm=None,
                k_pattern=None,
                random=True,
                pattern_descriptor='index',
                rdm_descriptor='index'):
    """ generates training and test set combinations by splitting into k
    similar sized groups. This version splits both over rdms and over patterns
    resulting in k_rdm * k_pattern (training, test) pairs.

    If a k is set to 1 the corresponding dimension is not crossvalidated.

    Args:
        rdms(rsatoolbox.rdm.RDMs): rdms to use
        pattern_descriptor(String): descriptor to select pattern groups
        rdm_descriptor(String): descriptor to select rdm groups
        k_rdm(int): number of rdm groups
        k_pattern(int): number of pattern groups
        random(bool): whether the assignment shall be randomized

    Returns:
        train_set(list): list of tuples (rdms, pattern_idx)
        test_set(list): list of tuples (rdms, pattern_idx)
        ceil_set(list): list of tuples (rdms, pattern_idx)

    """
    rdm_select = rdms.rdm_descriptors[rdm_descriptor]
    rdm_select = np.unique(rdm_select)
    if k_rdm is None:
        k_rdm = default_k_rdm(len(rdm_select))
    pattern_descriptor, pattern_select = \
        add_pattern_index(rdms, pattern_descriptor)
    if k_pattern is None:
        k_pattern = default_k_pattern(len(pattern_select))
    assert k_rdm <= len(rdm_select), \
        'Can make at most as many groups as rdms'
    if random:
        np.random.shuffle(rdm_select)
    group_size_rdm = int(np.floor(len(rdm_select) / k_rdm))
    additional_rdms = len(rdm_select) % k_rdm
    train_set = []
    test_set = []
    ceil_set = []
    for i_group in range(k_rdm):
        test_idx = np.arange(i_group * group_size_rdm,
                             (i_group + 1) * group_size_rdm)
        if i_group < additional_rdms:
            # append one leftover rdm by its positive index so that
            # setdiff1d below correctly excludes it from the training set
            test_idx = np.concatenate(
                (test_idx, [len(rdm_select) - 1 - i_group]))
        if k_rdm <= 1:
            # no crossvalidation over rdms: train on the same rdms
            train_idx = test_idx
        else:
            train_idx = np.setdiff1d(np.arange(len(rdm_select)), test_idx)
        rdm_idx_test = [rdm_select[int(idx)] for idx in test_idx]
        rdm_idx_train = [rdm_select[int(idx)] for idx in train_idx]
        rdms_test = rdms.subsample(rdm_descriptor, rdm_idx_test)
        rdms_train = rdms.subsample(rdm_descriptor, rdm_idx_train)
        train_new, test_new, _ = sets_k_fold_pattern(
            rdms_train,
            k=k_pattern,
            pattern_descriptor=pattern_descriptor,
            random=random)
        # copy the inner [rdms, pattern_idx] lists: a shallow list.copy()
        # would let the reassignment below overwrite the ceiling sets too
        ceil_new = [list(pair) for pair in test_new]
        for i_pattern in range(k_pattern):
            test_new[i_pattern][0] = rdms_test.subset_pattern(
                by=pattern_descriptor, value=test_new[i_pattern][1])
        train_set += train_new
        test_set += test_new
        ceil_set += ceil_new
    return train_set, test_set, ceil_set
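
A hedged usage sketch, reusing the rdms object from the sketch under Example #1: the full scheme yields k_rdm * k_pattern (train, test) pairs.

train_set, test_set, ceil_set = sets_k_fold(
    rdms,
    k_rdm=2,
    k_pattern=2,
    rdm_descriptor='session',
    random=False)
print(len(train_set))  # 2 * 2 = 4 pairs; ceil_set pairs training rdms with test patterns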
Example #3
File: crossvalsets.py Project: caiw/pyrsa
def sets_random(rdms,
                n_rdm=None,
                n_pattern=None,
                n_cv=2,
                pattern_descriptor='index',
                rdm_descriptor='index'):
    """ generates training and test set combinations by selecting random
    test sets of n_rdm RDMs and n_pattern patterns and using the rest of
    the data as the training set.

    If an n is set to 0 the corresponding dimension is not crossvalidated.

    Args:
        rdms(rsatoolbox.rdm.RDMs): rdms to split
        pattern_descriptor(String): descriptor to select pattern groups
        rdm_descriptor(String): descriptor to select rdm groups
        n_rdm(int): number of rdms per test set
        n_pattern(int): number of patterns per test set
        n_cv(int): number of random crossvalidation draws (default: 2)

    Returns:
        train_set(list): list of tuples (rdms, pattern_idx)
        test_set(list): list of tuples (rdms, pattern_idx)
        ceil_set(list): list of tuples (rdms, pattern_idx)

    """
    rdm_select = rdms.rdm_descriptors[rdm_descriptor]
    rdm_select = np.unique(rdm_select)
    if n_rdm is None:
        k_rdm = default_k_rdm(len(rdm_select))
        n_rdm = int(np.floor(len(rdm_select) / k_rdm))
    pattern_descriptor, pattern_select = \
        add_pattern_index(rdms, pattern_descriptor)
    if n_pattern is None:
        k_pattern = default_k_pattern(len(pattern_select))
        n_pattern = int(np.floor(len(pattern_select) / k_pattern))
    train_set = []
    test_set = []
    ceil_set = []
    for _i_group in range(n_cv):
        # shuffle
        np.random.shuffle(rdm_select)
        np.random.shuffle(pattern_select)
        # choose indices based on n_rdm
        if n_rdm == 0:
            train_idx = np.arange(len(rdm_select))
            test_idx = np.arange(len(rdm_select))
        else:
            test_idx = np.arange(n_rdm)
            train_idx = np.arange(n_rdm, len(rdm_select))
        # take subset of rdms
        rdm_idx_test = [rdm_select[int(idx)] for idx in test_idx]
        rdm_idx_train = [rdm_select[int(idx)] for idx in train_idx]
        rdms_test = rdms.subsample(rdm_descriptor, rdm_idx_test)
        rdms_train = rdms.subsample(rdm_descriptor, rdm_idx_train)
        # choose indices based on n_pattern
        if n_pattern == 0:
            train_idx = np.arange(len(pattern_select))
            test_idx = np.arange(len(pattern_select))
        else:
            test_idx = np.arange(n_pattern)
            train_idx = np.arange(n_pattern, len(pattern_select))
        pattern_idx_test = [pattern_select[int(idx)] for idx in test_idx]
        pattern_idx_train = [pattern_select[int(idx)] for idx in train_idx]
        rdms_test = rdms_test.subset_pattern(pattern_descriptor,
                                             pattern_idx_test)
        rdms_ceil = rdms_train.subset_pattern(pattern_descriptor,
                                              pattern_idx_test)
        rdms_train = rdms_train.subset_pattern(pattern_descriptor,
                                               pattern_idx_train)
        test_set.append([rdms_test, pattern_idx_test])
        train_set.append([rdms_train, pattern_idx_train])
        ceil_set.append([rdms_ceil, pattern_idx_test])
    return train_set, test_set, ceil_set
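
A hedged usage sketch with the same assumed rdms object as above: two random draws, each holding out 2 rdms and 2 patterns as the test set.

train_set, test_set, ceil_set = sets_random(
    rdms,
    n_rdm=2,
    n_pattern=2,
    n_cv=2,
    rdm_descriptor='session')
print(len(test_set))  # n_cv == 2 draws; ceil_set uses training rdms on test patterns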
Example #4
import unittest

class TestDefaultK(unittest.TestCase):
    # enclosing class reconstructed to make the excerpt runnable
    def test_default_k_rdm(self):
        from rsatoolbox.util.inference_util import default_k_rdm
        self.assertEqual(default_k_rdm(5), 2)
        self.assertEqual(default_k_rdm(11), 3)
        self.assertEqual(default_k_rdm(19), 4)
        self.assertEqual(default_k_rdm(100), 5)
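
The test pins default_k_rdm down at only four points. A minimal sketch consistent with those assertions (the exact cut-offs 6, 12 and 20 are assumptions; the real implementation in rsatoolbox.util.inference_util may place the boundaries differently):

def default_k_rdm(n_rdm):
    # threshold rule matching the four tested values above
    if n_rdm < 6:
        return 2   # default_k_rdm(5) == 2
    if n_rdm < 12:
        return 3   # default_k_rdm(11) == 3
    if n_rdm < 20:
        return 4   # default_k_rdm(19) == 4
    return 5       # default_k_rdm(100) == 5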
Example #5
# Additional module-level imports this excerpt relies on (numpy and tqdm are
# used directly below; Model and Result follow the rsatoolbox package layout;
# bootstrap_sample*, sets_random, cv_noise_ceiling, boot_noise_ceiling,
# crossval and _concat_sampling are defined elsewhere in the same project).
# These imports also cover Example #6:
import warnings

import numpy as np
import tqdm

from rsatoolbox.inference import Result
from rsatoolbox.model import Model
from rsatoolbox.util.inference_util import default_k_pattern, default_k_rdm


def bootstrap_cv_random(models,
                        data,
                        method='cosine',
                        fitter=None,
                        n_pattern=None,
                        n_rdm=None,
                        N=1000,
                        n_cv=2,
                        pattern_descriptor='index',
                        rdm_descriptor='index',
                        random=True,
                        boot_type='both',
                        use_correction=True):
    """evaluates a set of models by a evaluating a few random crossvalidation
    folds per bootstrap.

    If a k is set to 1 no crossvalidation is performed over the
    corresponding dimension.

    As especially crossvalidation over patterns/conditions creates
    variance in the cv result for a single variance the default setting
    of n_cv=1 inflates the estimated variance. Setting this value
    higher will decrease this effect at the cost of more computation time.

    by default ks are set by rsatoolbox.util.inference_util.default_k_pattern
    and rsatoolbox.util.inference_util.default_k_rdm based on the number of
    rdms and patterns provided. the ks are then in the range 2-5.

    Args:
        models(rsatoolbox.model.Model): models to be evaluated
        data(rsatoolbox.rdm.RDMs): RDM data to use
        method(string): comparison method to use
        fitter(function): fitting method for models
        n_pattern(int): number of patterns per test set
        n_rdm(int): number of rdms per test set
        N(int): number of bootstrap samples (default: 1000)
        n_cv(int): number of crossvalidation runs per sample (default: 2)
        pattern_descriptor(string): descriptor to group patterns
        rdm_descriptor(string): descriptor to group rdms
        random(bool): randomize group assignments (default: True)
        boot_type(String): which dimension to bootstrap over (default: 'both')
            alternatives: 'rdm', 'pattern'
        use_correction(bool): switch for the correction for the
            variance caused by crossvalidation (default: True)

    Returns:
        rsatoolbox.inference.Result: result object containing the
            evaluations, noise ceiling, and variance estimates

    """
    if n_pattern is None:
        n_pattern_all = len(
            np.unique(data.pattern_descriptors[pattern_descriptor]))
        k_pattern = default_k_pattern((1 - 1 / np.exp(1)) * n_pattern_all)
        n_pattern = int(np.floor(n_pattern_all / k_pattern))
    if n_rdm is None:
        n_rdm_all = len(np.unique(data.rdm_descriptors[rdm_descriptor]))
        k_rdm = default_k_rdm((1 - 1 / np.exp(1)) * n_rdm_all)
        n_rdm = int(np.floor(n_rdm_all / k_rdm))
    if isinstance(models, Model):
        models = [models]
    evaluations = np.zeros((N, len(models), n_cv))
    noise_ceil = np.zeros((2, N, n_cv))
    for i_sample in tqdm.trange(N):
        if boot_type == 'both':
            sample, rdm_idx, pattern_idx = bootstrap_sample(
                data,
                rdm_descriptor=rdm_descriptor,
                pattern_descriptor=pattern_descriptor)
        elif boot_type == 'pattern':
            sample, pattern_idx = bootstrap_sample_pattern(
                data, pattern_descriptor=pattern_descriptor)
            rdm_idx = np.unique(data.rdm_descriptors[rdm_descriptor])
        elif boot_type == 'rdm':
            sample, rdm_idx = bootstrap_sample_rdm(
                data, rdm_descriptor=rdm_descriptor)
            pattern_idx = np.unique(
                data.pattern_descriptors[pattern_descriptor])
        else:
            raise ValueError('boot_type not understood')
        if len(np.unique(rdm_idx)) > n_rdm \
           and len(np.unique(pattern_idx)) >= 3 + n_pattern:
            train_set, test_set, ceil_set = sets_random(
                sample,
                pattern_descriptor=pattern_descriptor,
                rdm_descriptor=rdm_descriptor,
                n_pattern=n_pattern,
                n_rdm=n_rdm,
                n_cv=n_cv)
            if n_rdm > 0 or n_pattern > 0:
                nc = cv_noise_ceiling(sample,
                                      ceil_set,
                                      test_set,
                                      method=method,
                                      pattern_descriptor=pattern_descriptor)
            else:
                nc = boot_noise_ceiling(sample,
                                        method=method,
                                        rdm_descriptor=rdm_descriptor)
            noise_ceil[:, i_sample] = nc
            for idx in range(len(test_set)):
                test_set[idx][1] = _concat_sampling(pattern_idx,
                                                    test_set[idx][1])
                train_set[idx][1] = _concat_sampling(pattern_idx,
                                                     train_set[idx][1])
            cv_result = crossval(models,
                                 sample,
                                 train_set,
                                 test_set,
                                 method=method,
                                 fitter=fitter,
                                 pattern_descriptor=pattern_descriptor,
                                 calc_noise_ceil=False)
            evaluations[i_sample, :, :] = cv_result.evaluations[0]
        else:  # sample does not allow desired crossvalidation
            evaluations[i_sample, :, :] = np.nan
            noise_ceil[:, i_sample] = np.nan
    if boot_type == 'both':
        cv_method = 'bootstrap_crossval'
        dof = min(data.n_rdm, data.n_cond) - 1
    elif boot_type == 'pattern':
        cv_method = 'bootstrap_crossval_pattern'
        dof = data.n_cond - 1
    elif boot_type == 'rdm':
        cv_method = 'bootstrap_crossval_rdm'
        dof = data.n_rdm - 1
    eval_ok = ~np.isnan(evaluations[:, 0, 0])
    if use_correction and n_cv > 1:
        # we essentially project from the two points for 1 repetition and
        # for n_cv repetitions to infinitely many cv repetitions
        evals_mean = np.mean(evaluations[eval_ok], -1)
        evals_1 = evaluations[eval_ok]
        noise_ceil_mean = np.mean(noise_ceil[:, eval_ok], -1)
        noise_ceil_1 = noise_ceil[:, eval_ok]
        var_mean = np.cov(np.concatenate([evals_mean.T, noise_ceil_mean]))
        var_1 = []
        for i in range(n_cv):
            var_1.append(
                np.cov(
                    np.concatenate([evals_1[:, :, i].T, noise_ceil_1[:, :,
                                                                     i]])))
        var_1 = np.mean(np.array(var_1), axis=0)
        # this is the main formula for the correction:
        variances = (n_cv * var_mean - var_1) / (n_cv - 1)
    else:
        if use_correction:
            # a raised Warning would abort here, contradicting the message;
            # warn and fall through to the uncorrected estimate instead
            warnings.warn('correction requested, but only one cv run'
                          ' per sample was requested. This is invalid!'
                          ' We do not use the correction for now.')
        evals_nonan = np.mean(np.mean(evaluations[eval_ok], -1), -1)
        noise_ceil_nonan = np.mean(noise_ceil[:, eval_ok], -1)
        variances = np.cov(np.concatenate([evals_nonan.T, noise_ceil_nonan]))
    result = Result(models,
                    evaluations,
                    method=method,
                    cv_method=cv_method,
                    noise_ceiling=noise_ceil,
                    variances=variances,
                    dof=dof)
    return result
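
A hedged end-to-end sketch (ModelFixed and the RDMs construction are assumptions based on the rsatoolbox API; the data are random and N is kept small, so this only demonstrates the call):

import numpy as np
import rsatoolbox.rdm
from rsatoolbox.model import ModelFixed

dissimilarities = np.random.rand(20, 435)  # 20 RDMs over 30 conditions
data = rsatoolbox.rdm.RDMs(dissimilarities)
model = ModelFixed('fixed', dissimilarities[0])  # predicts the first RDM
result = bootstrap_cv_random([model], data, N=20, n_cv=2)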
Example #6
def bootstrap_crossval(models,
                       data,
                       method='cosine',
                       fitter=None,
                       k_pattern=None,
                       k_rdm=None,
                       N=1000,
                       n_cv=2,
                       pattern_descriptor='index',
                       rdm_descriptor='index',
                       random=True,
                       boot_type='both',
                       use_correction=True):
    """evaluates a set of models by k-fold crossvalidation within a bootstrap

    Crossvalidation creates variance in the results for a single bootstrap
    sample, because different assignments to the training and test group
    lead to different results. To correct for this, we apply a formula
    which estimates the variance we expect if we evaluated all possible
    crossvalidation assignments from n_cv different assignments per bootstrap
    sample.
    In our statistical evaluations we saw that many bootstrap samples and
    few different crossvalidation assignments are optimal to minimize the
    variance of the variance estimate. Thus, this function by default
    applies this correction formula and sets n_cv=2, i.e. performs only two
    different assignments per bootstrap sample.
    This function nonetheless performs full crossvalidation schemes, i.e.
    in every bootstrap sample all crossvalidation folds are evaluated such
    that each RDM and each condition is in the test set n_cv times. For the
    even more optimized version which computes only two randomly chosen test
    sets see bootstrap_cv_random.

    The k_[] parameters control the cross-validation per sample. They give
    the number of crossvalidation folds to be created along this dimension.
    If a k is set to 1 no crossvalidation is performed over the
    corresponding dimension.
    By default the ks are set by rsatoolbox.util.inference_util.default_k_pattern
    and rsatoolbox.util.inference_util.default_k_rdm based on the number of
    rdms and patterns provided. The resulting ks are in the range 2-5.

    Using the []_descriptor inputs you may make the crossvalidation and
    bootstrap aware of groups of rdms or conditions to be handled en bloc.
    Conditions with the same entry will be sampled in or out of the bootstrap
    together and will be assigned to cross-validation folds together.

    Using the boot_type argument you may choose the dimension to bootstrap.
    By default both conditions and RDMs are resampled. You may alternatively
    choose to resample only one of them by passing 'rdm' or 'pattern'.

    models should be a list of models, data the RDMs object to evaluate
    against, and method the method for comparing the predictions and the
    data. fitter may provide a non-default function or list of functions
    to fit the models.

    Args:
        models(rsatoolbox.model.Model): models to be evaluated
        data(rsatoolbox.rdm.RDMs): RDM data to use
        method(string): comparison method to use
        fitter(function): fitting method for models
        k_pattern(int): #folds over patterns
        k_rdm(int): #folds over rdms
        N(int): number of bootstrap samples (default: 1000)
        n_cv(int): number of crossvalidation runs per sample (default: 2)
        pattern_descriptor(string): descriptor to group patterns
        rdm_descriptor(string): descriptor to group rdms
        random(bool): randomize group assignments (default: True)
        boot_type(String): which dimension to bootstrap over (default: 'both')
            alternatives: 'rdm', 'pattern'
        use_correction(bool): switch for the correction for the
            variance caused by crossvalidation (default: True)

    Returns:
        rsatoolbox.inference.Result: result object containing the
            evaluations, noise ceiling, and variance estimates

    """
    if k_pattern is None:
        n_pattern = len(np.unique(
            data.pattern_descriptors[pattern_descriptor]))
        k_pattern = default_k_pattern((1 - 1 / np.exp(1)) * n_pattern)
    if k_rdm is None:
        n_rdm = len(np.unique(data.rdm_descriptors[rdm_descriptor]))
        k_rdm = default_k_rdm((1 - 1 / np.exp(1)) * n_rdm)
    if isinstance(models, Model):
        models = [models]
    evaluations = np.empty((N, len(models), k_pattern * k_rdm, n_cv))
    noise_ceil = np.empty((2, N, n_cv))
    for i_sample in tqdm.trange(N):
        if boot_type == 'both':
            sample, rdm_idx, pattern_idx = bootstrap_sample(
                data,
                rdm_descriptor=rdm_descriptor,
                pattern_descriptor=pattern_descriptor)
        elif boot_type == 'pattern':
            sample, pattern_idx = bootstrap_sample_pattern(
                data, pattern_descriptor=pattern_descriptor)
            rdm_idx = np.unique(data.rdm_descriptors[rdm_descriptor])
        elif boot_type == 'rdm':
            sample, rdm_idx = bootstrap_sample_rdm(
                data, rdm_descriptor=rdm_descriptor)
            pattern_idx = np.unique(
                data.pattern_descriptors[pattern_descriptor])
        else:
            raise ValueError('boot_type not understood')
        if len(np.unique(rdm_idx)) >= k_rdm \
           and len(np.unique(pattern_idx)) >= 3 * k_pattern:
            for i_rep in range(n_cv):
                evals, cv_nc = _internal_cv(models, sample, pattern_descriptor,
                                            rdm_descriptor, pattern_idx,
                                            k_pattern, k_rdm, method, fitter)
                noise_ceil[:, i_sample, i_rep] = cv_nc
                evaluations[i_sample, :, :, i_rep] = evals[0]
        else:  # sample does not allow desired crossvalidation
            evaluations[i_sample, :, :] = np.nan
            noise_ceil[:, i_sample] = np.nan
    if boot_type == 'both':
        cv_method = 'bootstrap_crossval'
        dof = min(data.n_rdm, data.n_cond) - 1
    elif boot_type == 'pattern':
        cv_method = 'bootstrap_crossval_pattern'
        dof = data.n_cond - 1
    elif boot_type == 'rdm':
        cv_method = 'bootstrap_crossval_rdm'
        dof = data.n_rdm - 1
    eval_ok = ~np.isnan(evaluations[:, 0, 0, 0])
    if use_correction and n_cv > 1:
        # we essentially project from the two points for 1 repetition and
        # for n_cv repetitions to infinitely many cv repetitions
        evals_mean = np.mean(np.mean(evaluations[eval_ok], -1), -1)
        evals_1 = np.mean(evaluations[eval_ok], -2)
        noise_ceil_mean = np.mean(noise_ceil[:, eval_ok], -1)
        noise_ceil_1 = noise_ceil[:, eval_ok]
        var_mean = np.cov(np.concatenate([evals_mean.T, noise_ceil_mean]))
        var_1 = []
        for i in range(n_cv):
            var_1.append(
                np.cov(
                    np.concatenate([evals_1[:, :, i].T, noise_ceil_1[:, :,
                                                                     i]])))
        var_1 = np.mean(np.array(var_1), axis=0)
        # this is the main formula for the correction:
        variances = (n_cv * var_mean - var_1) / (n_cv - 1)
    else:
        if use_correction:
            # a raised Warning would abort here, contradicting the message;
            # warn and fall through to the uncorrected estimate instead
            warnings.warn('correction requested, but only one cv run'
                          ' per sample was requested. This is invalid!'
                          ' We do not use the correction for now.')
        evals_nonan = np.mean(np.mean(evaluations[eval_ok], -1), -1)
        noise_ceil_nonan = np.mean(noise_ceil[:, eval_ok], -1)
        variances = np.cov(np.concatenate([evals_nonan.T, noise_ceil_nonan]))
    result = Result(models,
                    evaluations,
                    method=method,
                    cv_method=cv_method,
                    noise_ceiling=noise_ceil,
                    variances=variances,
                    dof=dof)
    return result
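
The correction used by both bootstrap functions rests on a simple identity: if each crossvalidation assignment adds independent noise of variance v_noise around a bootstrap sample's true value with variance v_inf, then the variance of the mean over n assignments is v_inf + v_noise / n. Plugging in the two observed points n = 1 (var_1) and n = n_cv (var_mean) and solving for v_inf yields exactly variances = (n_cv * var_mean - var_1) / (n_cv - 1). A small numeric illustration (synthetic numbers, not from the source):

import numpy as np

rng = np.random.default_rng(0)
n_samples, n_cv = 100000, 2
v_inf, v_noise = 1.0, 0.5
base = rng.normal(0.0, np.sqrt(v_inf), (n_samples, 1))        # shared per sample
noise = rng.normal(0.0, np.sqrt(v_noise), (n_samples, n_cv))  # per-assignment noise
runs = base + noise
var_mean = runs.mean(axis=1).var()                            # v_inf + v_noise / n_cv
var_1 = np.mean([runs[:, i].var() for i in range(n_cv)])      # v_inf + v_noise
print((n_cv * var_mean - var_1) / (n_cv - 1))                 # ~= v_inf == 1.0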