def test_add(self):
    n1 = 30
    n2 = 20
    dx = 2
    dy = 1
    X1 = np.random.randn(n1, dx)
    Y1 = np.random.rand(n1, dy)
    X2 = np.random.rand(n2, dx)
    Y2 = np.random.randn(n2, dy) + 1
    pdata1 = data.PairedData(X1, Y1)
    pdata2 = data.PairedData(X2, Y2)
    # merge
    pdata = pdata1 + pdata2

    # check internals
    X = pdata.X
    Y = pdata.Y
    np.testing.assert_array_almost_equal(X[:n1], X1)
    np.testing.assert_array_almost_equal(X[n1:], X2)
    np.testing.assert_array_almost_equal(Y[:n1], Y1)
    np.testing.assert_array_almost_equal(Y[n1:], Y2)
    self.assertTrue(pdata != pdata1)
    self.assertTrue(pdata != pdata2)

    # test size
    self.assertEqual(pdata.sample_size(), n1 + n2)
    self.assertEqual(pdata1.sample_size(), n1)
    self.assertEqual(pdata2.sample_size(), n2)
def get_problem_pickle(folder_path, prob_label):
    """
    - folder_path: path to a folder containing the data file, relative to the
      fsic/data/ folder.
    - prob_label: string of the form described in parse_prob_label() such that
      fsic/data/(folder_path)/(name).p exists. _n%d specifies the sample size
      to resample in each trial.

    Return a (PairedSource object, n, is_h0).
    """
    dataset_dir = glo.data_file(folder_path)
    if not os.path.exists(dataset_dir):
        raise ValueError('dataset directory does not exist: %s' % dataset_dir)
    pl = parse_prob_label(prob_label)
    data_path = os.path.join(dataset_dir, pl['name'] + '.p')
    if not os.path.exists(data_path):
        raise ValueError('dataset does not exist: %s' % data_path)
    loaded = glo.pickle_load(data_path)
    # Expect "loaded" to be a dictionary {'X': ..., 'Y': ..., ...}
    X, Y = loaded['X'], loaded['Y']
    is_h0 = pl['is_h0']
    is_c = pl['is_classification']
    if is_c:
        # If the data is a classification problem, we assume that Y has one
        # column of class labels.
        assert Y.shape[1] == 1, 'Y should have one column. Shape = %s' % str(Y.shape)
        classes = Y[:, 0]
        # modify Y
        if len(np.unique(Y)) > 2:
            # Multiclass problem. Use a 1-of-K coding of the labels.
            # Only for #classes > 2.
            Y = util.one_of_K_code(classes)

    is_std = pl['is_std']
    n = pl['n']
    ndx = pl['ndx']
    ndy = pl['ndy']
    # Standardization after resampling can cause a 0 standard deviation.
    # We will do it as the first step.
    #ps = data.PSStandardize(ps) if is_std else ps
    if is_std:
        X = util.standardize(X)
        Y = util.standardize(Y)
    pdata = data.PairedData(X, Y, label=prob_label)
    ps = data.PSStraResample(pdata, classes) if is_c else data.PSResample(pdata)
    ps = data.PSNullShuffle(ps) if is_h0 else ps
    if not (ndx == 0 and ndy == 0):
        ps = data.PSGaussNoiseDims(ps, ndx, ndy)
    return ps, n, is_h0
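# Usage sketch (not part of the original code; the folder and label strings
# below are hypothetical placeholders whose exact format is defined by
# parse_prob_label()). The returned PairedSource is resampled once per trial:
#
#   ps, n, is_h0 = get_problem_pickle('some_folder', 'some_prob_label')
#   pdata = ps.sample(n, seed=0)   # PairedData resampled for one trial
#   X, Y = pdata.xy()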
def job_nfsicJ10_cperm_stoopt(paired_source, tr, te, r):
    """
    - Copula-transform the data.
    - Use permutations to simulate from the null distribution.
    """
    n_permute = 500
    with util.ContextTimer() as t:
        # copula transform to both X and Y
        cop_map = fea.MarginalCDFMap()
        xtr, ytr = tr.xy()
        xte, yte = te.xy()
        xtr = cop_map.gen_features(xtr)
        ytr = cop_map.gen_features(ytr)
        xte = cop_map.gen_features(xte)
        yte = cop_map.gen_features(yte)
        tr = data.PairedData(xtr, ytr)
        te = data.PairedData(xte, yte)

        to_return = job_nfsicJ10_stoopt(paired_source, tr, te, r, n_permute)
    to_return['time_secs'] = t.secs
    return to_return
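# Illustration of the copula transform step above (a sketch, not part of the
# original job functions; it only relies on fea.MarginalCDFMap, gen_features,
# xy(), and PairedData as used above). Each column is mapped through its
# empirical marginal CDF so that the transformed values lie in [0, 1].
def demo_copula_transform(pdata):
    cop_map = fea.MarginalCDFMap()
    X, Y = pdata.xy()
    Xc = cop_map.gen_features(X)   # empirical-CDF features of X's columns
    Yc = cop_map.gen_features(Y)   # empirical-CDF features of Y's columns
    return data.PairedData(Xc, Yc)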
def test_sample(self):
    for s in [27, 91]:
        n_ori = 200
        p_fracs = [0.1, 0.5, 0.4]
        X = np.random.randn(n_ori, 3)
        # class labels with proportions given by p_fracs
        Y = np.array(
            [0] * int(p_fracs[0] * n_ori)
            + [1] * int(p_fracs[1] * n_ori)
            + [2] * int(p_fracs[2] * n_ori)
        )[:, np.newaxis]
        pdata_ori = data.PairedData(X, Y)

        ps = data.PSStraResample(pdata_ori, Y[:, 0])
        m = 79
        pdata = ps.sample(m, seed=s)
        self.assertEqual(pdata.sample_size(), m)

        # the stratified resample should preserve the class proportions
        # up to rounding
        _, y = pdata.xy()
        yu, counts = np.unique(y, return_counts=True)
        for i, _ in enumerate(yu):
            self.assertTrue(counts[i] - int(p_fracs[i] * m) <= 1)
def get_pdata_mean(n, dx=2):
    # Y is the mean of the columns of X plus small Gaussian noise,
    # so X and Y are strongly dependent.
    X = np.random.randn(n, dx)
    Y = np.mean(X, 1)[:, np.newaxis] + np.random.randn(n, 1) * 0.01
    return data.PairedData(X, Y, label='mean')
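# Minimal usage sketch (not part of the original tests; it only uses calls
# that appear above: PSResample, sample, xy, and sample_size). Wrap the toy
# mean problem in a PSResample source and draw fresh resamples per trial, as
# the experiment code does for pickled datasets.
def demo_pdata_mean_resample():
    pdata = get_pdata_mean(n=300, dx=2)
    ps = data.PSResample(pdata)
    for trial in range(3):
        # each trial resamples 100 paired points with a different seed
        sub = ps.sample(100, seed=trial)
        X, Y = sub.xy()
        assert sub.sample_size() == 100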