def test_transform_with_dtype(self):
    """Check the ``dtype`` reported by ``DictRDD.transform`` results.

    A two-column DictRDD is built by zipping two parallelized numpy
    arrays (shapes (100, 4) and (100, 2), 4 partitions each) with
    ``bsize=5``. Each case transforms one or both columns, optionally
    passing ``dtype``, and checks the resulting ``dtype`` attribute
    and/or the verdict of ``check_rdd_dtype``.
    """
    first = self.sc.parallelize(np.arange(400).reshape((100, 4)), 4)
    second = self.sc.parallelize(np.arange(200).reshape((100, 2)), 4)
    X = DictRDD(first.zip(second), bsize=5)

    # No dtype argument: both columns are still reported as ndarray.
    result = X.transform(lambda x: x ** 2, column=0)
    assert_equal(result.dtype, (np.ndarray, np.ndarray))

    # Column 0 is turned into tuples and declared as such.
    result = X.transform(
        lambda x: tuple((x ** 2).tolist()),
        column=0,
        dtype=tuple,
    )
    assert_equal(result.dtype, (tuple, np.ndarray))
    assert_true(check_rdd_dtype(result, {0: tuple, 1: np.ndarray}))

    # Column 1 is declared as list while the callable itself returns an
    # array; the test expects the column to be reported as list.
    result = X.transform(lambda x: x ** 2, column=1, dtype=list)
    assert_equal(result.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(result, {0: np.ndarray, 1: list}))

    # Both columns at once, in natural order.
    result = X.transform(
        lambda a, b: (a ** 2, (b ** 0.5).tolist()),
        column=[0, 1],
        dtype=(np.ndarray, list),
    )
    assert_true(check_rdd_dtype(result, {0: np.ndarray, 1: list}))

    # Both columns in reversed order: the dtype tuple follows the order
    # of ``column``, while the resulting dtype is reported per position.
    result = X.transform(
        lambda b, a: ((b ** 0.5).tolist(), a ** 2),
        column=[1, 0],
        dtype=(list, np.ndarray),
    )
    assert_equal(result.dtype, (np.ndarray, list))
    assert_true(check_rdd_dtype(result, {0: np.ndarray, 1: list}))
def test_transform(self):
    """Compare ``DictRDD.transform`` with the same operation done locally.

    A two-column DictRDD is built by zipping two parallelized numpy
    arrays (shapes (100, 4) and (100, 2), 4 partitions each) with
    ``bsize=5``. For every case the expected tuples are computed from
    ``X.collect()`` and compared with the collected transform result.
    """
    left = self.sc.parallelize(np.arange(400).reshape((100, 4)), 4)
    right = self.sc.parallelize(np.arange(200).reshape((100, 2)), 4)
    X = DictRDD(left.zip(right), bsize=5)

    # Each case: (local reference taking both columns,
    #             callable handed to transform,
    #             keyword arguments for transform).
    cases = [
        # Whole-row transform, no column selection.
        (lambda a, b: (a, b ** 2), lambda a, b: (a, b ** 2), {}),
        # Single-column transforms.
        (lambda a, b: (a, b ** 2), lambda x: x ** 2, {"column": 1}),
        (lambda a, b: (a ** 2, b), lambda x: x ** 2, {"column": 0}),
        # Two columns, natural order.
        (
            lambda a, b: (a ** 2, b ** 0.5),
            lambda a, b: (a ** 2, b ** 0.5),
            {"column": [0, 1]},
        ),
        # Two columns, reversed order: the callable receives (b, a).
        (
            lambda a, b: (a ** 2, b ** 0.5),
            lambda b, a: (b ** 0.5, a ** 2),
            {"column": [1, 0]},
        ),
    ]

    for reference, func, kwargs in cases:
        expected = [reference(row[0], row[1]) for row in X.collect()]
        actual = X.transform(func, **kwargs).collect()
        assert_multiple_tuples_equal(expected, actual)
def test_transform(self):
    """Verify ``DictRDD.transform`` for whole-row and per-column calls.

    The DictRDD under test zips two parallelized numpy arrays (shapes
    (100, 4) and (100, 2), 4 partitions each) and is created with
    ``bsize=5``. Every scenario compares the collected transform output
    with tuples computed directly from ``X.collect()``.
    """
    col_a = np.arange(400).reshape((100, 4))
    col_b = np.arange(200).reshape((100, 2))
    zipped = self.sc.parallelize(col_a, 4).zip(self.sc.parallelize(col_b, 4))
    X = DictRDD(zipped, bsize=5)

    def check(expected_fn, func, **transform_kwargs):
        # Build the reference locally first, then run the transform.
        expected = [expected_fn(item[0], item[1]) for item in X.collect()]
        transformed = X.transform(func, **transform_kwargs)
        assert_multiple_tuples_equal(expected, transformed.collect())

    # Whole row passed to the callable.
    check(lambda a, b: (a, b**2), lambda a, b: (a, b**2))

    # One column at a time.
    check(lambda a, b: (a, b**2), lambda x: x**2, column=1)
    check(lambda a, b: (a**2, b), lambda x: x**2, column=0)

    # Both columns, in natural and then in reversed order.
    check(
        lambda a, b: (a**2, b**0.5),
        lambda a, b: (a**2, b**0.5),
        column=[0, 1],
    )
    check(
        lambda a, b: (a**2, b**0.5),
        lambda b, a: (b**0.5, a**2),
        column=[1, 0],
    )
def test_transform_with_dtype(self):
    """Exercise the ``dtype`` argument of ``DictRDD.transform``.

    The fixture is a DictRDD (``bsize=5``) zipping two parallelized
    numpy arrays of shapes (100, 4) and (100, 2), 4 partitions each.
    The assertions cover the ``dtype`` attribute of the result and the
    per-column check done by ``check_rdd_dtype``.
    """
    def assert_columns(rdd, col0, col1):
        # Per-column dtype check as reported by check_rdd_dtype.
        assert_true(check_rdd_dtype(rdd, {0: col0, 1: col1}))

    wide = np.arange(400).reshape((100, 4))
    narrow = np.arange(200).reshape((100, 2))
    pairs = self.sc.parallelize(wide, 4).zip(self.sc.parallelize(narrow, 4))
    X = DictRDD(pairs, bsize=5)

    # Without dtype the result reports ndarray for both columns.
    out = X.transform(lambda x: x ** 2, column=0)
    assert_equal(out.dtype, (np.ndarray, np.ndarray))

    # Column 0 converted to tuple.
    to_tuple = lambda x: tuple((x ** 2).tolist())
    out = X.transform(to_tuple, column=0, dtype=tuple)
    assert_equal(out.dtype, (tuple, np.ndarray))
    assert_columns(out, tuple, np.ndarray)

    # Column 1 declared as list.
    out = X.transform(lambda x: x ** 2, column=1, dtype=list)
    assert_equal(out.dtype, (np.ndarray, list))
    assert_columns(out, np.ndarray, list)

    # Two columns, natural order; only the per-column check is made.
    out = X.transform(
        lambda a, b: (a ** 2, (b ** 0.5).tolist()),
        column=[0, 1],
        dtype=(np.ndarray, list),
    )
    assert_columns(out, np.ndarray, list)

    # Two columns, reversed order; dtype is given in ``column`` order.
    out = X.transform(
        lambda b, a: ((b ** 0.5).tolist(), a ** 2),
        column=[1, 0],
        dtype=(list, np.ndarray),
    )
    assert_equal(out.dtype, (np.ndarray, list))
    assert_columns(out, np.ndarray, list)