def test_ordering(self):
    """Check the output of a ``BinaryWPCA`` step on two Gaussian classes.

    Two classes of 10000 samples each are drawn from bivariate normals
    with means ``[.5, -.5]`` and ``[-.5, .5]`` and a shared covariance with
    0.9 off-diagonal terms; every sample has unit weight. The test asserts
    that

    * the covariance matrix of the transformed features is diagonal, and
    * the Wasserstein distance computed for column 0 is smaller than the
      one computed for column 1.
    """
    pipeline = Pipeline()
    pipeline.add_step('wpca', BinaryWPCA())

    n_per_class = 10000
    covariance = [[1., .9], [.9, 1.]]
    # Draw class 0 first, then class 1 (the order matters for seeded runs).
    X_class0 = np.random.multivariate_normal([.5, -.5], covariance,
                                             n_per_class)
    X_class1 = np.random.multivariate_normal([-.5, .5], covariance,
                                             n_per_class)
    X = np.vstack((X_class0, X_class1))
    y = np.concatenate((np.zeros(n_per_class), np.ones(n_per_class)))
    w = np.ones(2 * n_per_class)

    pipeline.fit(X, y, w)
    Xt, _, _ = pipeline.transform(X, y, w)

    def is_diagonal(matrix):
        """Return True when all off-diagonal entries are (close to) zero."""
        off_diagonal = matrix - np.diag(np.diagonal(matrix))
        return np.allclose(off_diagonal, np.zeros(matrix.shape))

    self.assertTrue(is_diagonal(np.cov(Xt.T)))

    in_class0 = (y == 0)
    in_class1 = (y == 1)

    # NOTE(review): the class-0 sample is taken from the *untransformed* X
    # while the class-1 sample is taken from Xt. This may be a typo for
    # Xt[in_class0, ...] -- confirm against the intended BinaryWPCA ordering.
    distances = [
        wasserstein_distance(
            u_values=X[in_class0, column],
            v_values=Xt[in_class1, column],
            u_weights=w[in_class0],
            v_weights=w[in_class1])
        for column in (0, 1)
    ]
    self.assertLess(distances[0], distances[1])
def test_fit_decoupling(self):
    """Check that a step restricted to some columns leaves the others alone.

    A ``Center`` step is registered for columns 2 and 4 only. It is fitted
    on data whose columns 2 and 4 have means 2 and 4, then applied to an
    all-ones matrix. The test expects columns 2 and 4 to come out as -1
    and -3, and columns 0, 1 and 3 to stay at 1.
    """
    pipeline = Pipeline()
    indices = [2, 4]
    pipeline.add_step('center', Center(), indices)

    X = np.random.rand(5, 10)
    ramp = np.arange(5).astype(float)
    first, second = indices
    X[:, first] = ramp
    X[:, second] = ramp * 2

    # Sanity-check the training data before fitting.
    mean = np.mean(X[:, indices], axis=0)
    for position, expected in enumerate((2., 4.)):
        self.assertAlmostEqual(mean[position], expected)

    y = np.arange(5)
    w = np.arange(5)
    pipeline.fit(X, y, w)

    Xt, _, _ = pipeline.transform(np.ones_like(X), y, w)
    means = Xt.mean(axis=0)
    for column, expected in enumerate((1., 1., -1., 1., -3.)):
        self.assertAlmostEqual(means[column], expected)
def test_PCA(self):
    """Check an unstandardized ``PCA`` step on perfectly correlated data.

    The three samples lie on the line ``x1 = 2 * x0``. After fitting, the
    test asserts that the step's ``R`` attribute satisfies ``R R^T = I``,
    that labels and weights pass through ``transform`` unchanged, and that
    the covariance of the transformed data is ``[[5, 0], [0, 0]]``.
    """
    pipeline = Pipeline()
    pipeline.add_step('pca', PCA(standardize=False))

    X = np.array([[1., 2.], [2., 4.], [3., 6.]])
    w = np.array([1., 1., 1.])
    y = np.zeros(3)

    # The two input columns must be (almost) perfectly correlated.
    pearson = np.corrcoef(X[:, 0], X[:, 1])[0][1]
    self.assertGreater(abs(pearson), .99)

    pipeline.fit(X, y, w)
    R = pipeline.get_step('pca').R
    gram = R.dot(R.T)
    self.assertTrue(np.allclose(gram, np.identity(R.shape[0])))

    Xt, yt, wt = pipeline.transform(X, y, w)
    self.assertTrue(np.allclose(y, yt))
    self.assertTrue(np.allclose(w, wt))

    cov = np.cov(Xt.T)
    expected_entries = (
        ((0, 0), 5.),
        ((1, 0), 0.),
        ((0, 1), 0.),
        ((1, 1), 0.),
    )
    for (row, col), expected in expected_entries:
        self.assertAlmostEqual(cov[row, col], expected)
def test_weighting(self):
    """Check that a zero-weight sample does not influence the ``PCA`` step.

    ``X1`` and ``X2`` differ only in their first row, and that row has
    weight 0 in both data sets. The pipeline is fitted and applied to each
    data set in turn; the transformed rows that carry non-zero weight must
    then have identical per-column means and standard deviations.

    The comparison is restricted to non-zero-weight rows because the
    zero-weight row itself differs between the inputs, so statistics over
    all rows would differ even when the weighted fit is correct.
    """
    pipeline = Pipeline()
    pipeline.add_step('pca', PCA(standardize=False))

    X1 = np.array([[1., 2.], [2., 4.], [3., 6.], [4., 8.], [5., 10.]])
    w1 = np.array([0., 1., 2., 1., 1.])
    y1 = np.array([0, 0, 0, 1, 2])

    X2 = np.array([[2., 4.], [3., 6.], [3., 6.], [4., 8.], [5., 10.]])
    w2 = np.array([0., 1., 2., 1., 1.])
    y2 = np.array([0, 0, 0, 1, 2])

    pipeline.fit(X1, y1, w1)
    Xt1, _, _ = pipeline.transform(X1, y1, w1)

    pipeline.fit(X2, y2, w2)
    Xt2, _, _ = pipeline.transform(X2, y2, w2)

    # Compare the second result against the first (previously Xt1 was
    # compared with itself, which made the test vacuous).
    weighted1 = Xt1[w1 > 0]
    weighted2 = Xt2[w2 > 0]

    means1 = weighted1.mean(axis=0)
    stds1 = weighted1.std(axis=0, ddof=1)
    means2 = weighted2.mean(axis=0)
    stds2 = weighted2.std(axis=0, ddof=1)

    for mean1, mean2 in zip(means1, means2):
        self.assertAlmostEqual(mean1, mean2)
    for std1, std2 in zip(stds1, stds2):
        self.assertAlmostEqual(std1, std2)
def test_standardize(self):
    """Check that ``PCA(standardize=True)`` yields standardized columns.

    After fitting and transforming five unit-weight samples, every output
    column is expected to have mean 0 and sample standard deviation
    (``ddof=1``) equal to 1.
    """
    pipeline = Pipeline()
    pipeline.add_step('pca', PCA(standardize=True))

    X = np.array([[1., 2.], [2., 4.], [3., 6.], [4., 8.], [5., 10.]])
    w = np.array([1., 1., 1., 1., 1.])
    y = np.array([0, 0, 0, 1, 2])

    pipeline.fit(X, y, w)
    Xt, _, _ = pipeline.transform(X, y, w)

    column_means = Xt.mean(axis=0)
    column_stds = Xt.std(axis=0, ddof=1)
    for column_mean, column_std in zip(column_means, column_stds):
        self.assertAlmostEqual(column_mean, 0.)
        self.assertAlmostEqual(column_std, 1.)
def test_PCA_ignore(self):
    """Check ``PCA`` constructed with ``ignore=[1, 2]``.

    The pipeline is fitted on five collinear samples labelled 0, 0, 0, 1
    and 2. The covariance of the transformed samples whose label is
    neither 1 nor 2 is expected to be ``[[5, 0], [0, 0]]``.

    NOTE(review): ``ignore`` presumably lists class labels excluded from
    the fit -- confirm against the ``PCA`` implementation.
    """
    pipeline = Pipeline()
    pipeline.add_step('pca', PCA(ignore=[1, 2], standardize=False))

    X = np.array([[1., 2.], [2., 4.], [3., 6.], [4., 8.], [5., 10.]])
    w = np.array([1., 1., 1., 1., 1.])
    y = np.array([0, 0, 0, 1, 2])

    pipeline.fit(X, y, w)
    Xt, _, _ = pipeline.transform(X, y, w)

    # Keep only the rows whose label is neither 1 nor 2.
    kept = np.logical_and(y != 1, y != 2)
    cov = np.cov(Xt[kept].T)

    expected_entries = (
        ((0, 0), 5.),
        ((1, 0), 0.),
        ((0, 1), 0.),
        ((1, 1), 0.),
    )
    for (row, col), expected in expected_entries:
        self.assertAlmostEqual(cov[row, col], expected)
def test_standardize(self):
    """Check the fitted parameters and output of a ``Standardizer`` step.

    With weights ``[2, 3, 3, 4]`` the fitted step is expected to expose
    ``mean == [2.75, 5.5]`` and ``std == [sqrt(14.25) / 3, sqrt(57) / 3]``,
    and ``transform`` is expected to return ``(X - mean) / std`` per column.
    """
    pipeline = Pipeline()
    pipeline.add_step('std', Standardizer())

    X = np.array([[1., 2.], [2., 4.], [3., 6.], [4., 8.]])
    y = np.zeros(4)
    w = np.array([2., 3., 3., 4.])
    pipeline.fit(X, y, w)

    step = pipeline.get_step('std')
    std0 = np.sqrt(14.25) / 3.
    std1 = np.sqrt(57.) / 3.
    self.assertAlmostEqual(step.mean[0], 2.75)
    self.assertAlmostEqual(step.mean[1], 5.5)
    self.assertAlmostEqual(step.std[0], std0)
    self.assertAlmostEqual(step.std[1], std1)

    Xt, _, _ = pipeline.transform(X, y, w)

    # Build the expected result from fresh literals rather than from X,
    # so the check does not depend on whether the pipeline mutates X.
    expected = np.column_stack((
        (np.array([1., 2., 3., 4.]) - 2.75) / std0,
        (np.array([2., 4., 6., 8.]) - 5.5) / std1,
    ))
    self.assertTrue(np.allclose(expected, Xt))
self.mean = None def fit(self, X, y, w): self.mean = X.mean() def transform(self, X, y, w): X = np.ones_like(X) * self.mean return X, y, w def generate_data(m=10, n=5): X = np.random.rand(m, n) y = np.random.randint(low=0, high=2, size=m) w = np.random.rand(m) return X, y, w if __name__ == '__main__': pipeline = Pipeline() pipeline \ .add_step('step1', TestStep(), indices=[1, 3]) \ .add_step('step2', TestStep(), indices=[2, 4]) X1, y1, w1 = generate_data() pipeline.fit(X1, y1, w1) X2, y2, w2 = generate_data() Xt, yt, wt = pipeline.transform(X2, y2, w2) print(Xt)