Example #1
0
    def test_add(self):
        """PairedData.__add__ concatenates two datasets, first on top of second."""
        na, nb = 30, 20
        dim_x, dim_y = 2, 1

        Xa = np.random.randn(na, dim_x)
        Ya = np.random.rand(na, dim_y)
        Xb = np.random.rand(nb, dim_x)
        Yb = np.random.randn(nb, dim_y) + 1

        first = data.PairedData(Xa, Ya)
        second = data.PairedData(Xb, Yb)
        # merge via the overloaded + operator
        merged = first + second

        # internals: merged arrays stack the first dataset's rows, then the second's
        X = merged.X
        Y = merged.Y
        np.testing.assert_array_almost_equal(X[:na], Xa)
        np.testing.assert_array_almost_equal(X[na:], Xb)
        np.testing.assert_array_almost_equal(Y[:na], Ya)
        np.testing.assert_array_almost_equal(Y[na:], Yb)

        # the merged object is distinct from either operand
        self.assertTrue(merged != first)
        self.assertTrue(merged != second)

        # sizes: the merge adds up; the operands are left untouched
        self.assertEqual(merged.sample_size(), na + nb)
        self.assertEqual(first.sample_size(), na)
        self.assertEqual(second.sample_size(), nb)
Example #2
0
def get_problem_pickle(folder_path, prob_label):
    """
    Load a pickled paired dataset and wrap it as a resampling PairedSource.

    - folder_path: path to a folder containing the data file relative to
      fsic/data/ folder.
    - prob_label: string of the form described in parse_prob_label() so that
      fsic/data/(folder_path)/(name).p exists.
        _n%d specifies the sample size to resample in each trial.

    Return a (PairedSource object, n, is_h0). 
    """
    dataset_dir = glo.data_file(folder_path)
    if not os.path.exists(dataset_dir):
        raise ValueError('dataset directory does not exist: %s' % dataset_dir)

    pl = parse_prob_label(prob_label)
    data_path = os.path.join(dataset_dir, pl['name'] + '.p')
    if not os.path.exists(data_path):
        raise ValueError('dataset does not exist: %s' % data_path)

    # The pickle is expected to hold a dictionary {'X': ..., 'Y': ..., ...}
    loaded = glo.pickle_load(data_path)
    X = loaded['X']
    Y = loaded['Y']

    is_h0 = pl['is_h0']
    is_c = pl['is_classification']
    classes = None
    if is_c:
        # Classification problems must come with a single label column.
        assert Y.shape[1] == 1, 'Y should have one column. Shape = %s' % str(
            Y.shape)
        classes = Y[:, 0]
        # With more than two classes, replace Y by its 1-of-K coding;
        # binary labels are kept as-is.
        if len(np.unique(Y)) > 2:
            Y = util.one_of_K_code(classes)

    n = pl['n']
    ndx = pl['ndx']
    ndy = pl['ndy']

    # Standardize before resampling: standardizing a resampled subset
    # can yield a zero standard deviation.
    if pl['is_std']:
        X = util.standardize(X)
        Y = util.standardize(Y)

    pdata = data.PairedData(X, Y, label=prob_label)
    if is_c:
        # stratified resampling preserves class proportions
        ps = data.PSStraResample(pdata, classes)
    else:
        ps = data.PSResample(pdata)
    if is_h0:
        ps = data.PSNullShuffle(ps)
    if ndx != 0 or ndy != 0:
        # append pure-noise dimensions to X and/or Y
        ps = data.PSGaussNoiseDims(ps, ndx, ndy)
    return ps, n, is_h0
Example #3
0
def job_nfsicJ10_cperm_stoopt(paired_source, tr, te, r):
    """
    - Copula transform the data
    - Use permutations to simulate from the null distribution.
    """
    n_permute = 500

    with util.ContextTimer() as timer:
        # marginal-CDF (copula) transform applied to both X and Y
        cdf_map = fea.MarginalCDFMap()

        def _copula(pdata):
            # push each of X, Y of a PairedData through the empirical CDF map
            x, y = pdata.xy()
            return data.PairedData(cdf_map.gen_features(x),
                                   cdf_map.gen_features(y))

        tr = _copula(tr)
        te = _copula(te)

        # delegate to the stochastic-optimization NFSIC job with permutations
        result = job_nfsicJ10_stoopt(paired_source, tr, te, r, n_permute)
    result['time_secs'] = timer.secs
    return result
Example #4
0
    def test_sample(self):
        """Stratified resampling keeps each class count within 1 of its target."""
        for seed in [27, 91]:
            n_ori = 200
            p_fracs = [0.1, 0.5, 0.4]
            X = np.random.randn(n_ori, 3)
            # build a label column whose classes occur with fractions p_fracs
            labels = []
            for cls, frac in enumerate(p_fracs):
                labels.extend([cls] * int(frac * n_ori))
            Y = np.array(labels)[:, np.newaxis]

            pdata_ori = data.PairedData(X, Y)
            ps = data.PSStraResample(pdata_ori, Y[:, 0])

            m = 79
            sub = ps.sample(m, seed=seed)
            self.assertEqual(sub.sample_size(), m)

            # per-class counts in the subsample should track p_fracs * m
            _, y = sub.xy()
            uniq, counts = np.unique(y, return_counts=True)
            for i, _ in enumerate(uniq):
                self.assertTrue(counts[i] - int(p_fracs[i] * m) <= 1)
Example #5
0
def get_pdata_mean(n, dx=2):
    """Sample a PairedData where Y is the row mean of X plus tiny Gaussian noise.

    - n: number of sample points
    - dx: dimension of X
    """
    X = np.random.randn(n, dx)
    noise = np.random.randn(n, 1) * 0.01
    Y = np.mean(X, axis=1)[:, np.newaxis] + noise
    return data.PairedData(X, Y, label='mean')