def check(actual, expected):
    """Assert that a (Spark columns, labels) pair matches the expected pair.

    ``actual`` unpacks into Spark columns and labels; ``expected`` unpacks
    into Spark column names and labels. Each expected name is looked up in
    ``sdf`` and compared with the actual column at the same position.
    ``self`` and ``sdf`` are taken from the surrounding scope.
    """
    scols, labels = actual
    wanted_names, wanted_labels = expected

    # Check the counts first: zip() below would silently drop extras.
    self.assertEqual(len(scols), len(wanted_names))
    for scol, wanted_name in zip(scols, wanted_names):
        self.assertTrue(spark_column_equals(scol, sdf[wanted_name]))

    self.assertEqual(labels, wanted_labels)
def test_lit(self):
    """Compare ``SF.lit`` on NumPy scalars with an explicitly cast ``F.lit``.

    Each case pairs a NumPy scalar type with the Spark type that the
    matching ``F.lit(...).astype(...)`` column is cast to. A plain Python
    ``1`` is then compared against an uncast ``F.lit(1)``.
    """
    # (NumPy scalar type, plain Python value, Spark type to cast to)
    cases = (
        (np.int64, 1, LongType),
        (np.int32, 1, IntegerType),
        (np.int8, 1, ByteType),
        (np.byte, 1, ByteType),
        (np.float32, float(1), FloatType),
    )
    for numpy_type, plain_value, spark_type in cases:
        produced = SF.lit(numpy_type(1))
        reference = F.lit(plain_value).astype(spark_type())
        self.assertTrue(spark_column_equals(produced, reference))

    # A plain Python int is compared without any cast.
    self.assertTrue(spark_column_equals(SF.lit(1), F.lit(1)))
def test_from_pandas(self):
    """Exercise ``InternalFrame.from_pandas`` on four pandas frame layouts.

    For every layout the test checks the index Spark column names, the
    index names, the column labels, the data Spark column names, the Spark
    column resolved for each label, and that ``to_pandas_frame`` equals
    the input frame.
    """

    def verify(source, index_columns, index_names, labels, data_columns):
        # One shared assertion sequence, applied to each layout below.
        internal = InternalFrame.from_pandas(source)
        spark_df = internal.spark_frame
        self.assert_eq(internal.index_spark_column_names, index_columns)
        self.assert_eq(internal.index_names, index_names)
        self.assert_eq(internal.column_labels, labels)
        self.assert_eq(internal.data_spark_column_names, data_columns)
        for label, column_name in zip(labels, data_columns):
            self.assertTrue(
                spark_column_equals(
                    internal.spark_column_for(label), spark_df[column_name]
                )
            )
        self.assert_eq(internal.to_pandas_frame, source)

    # string column names
    named = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    verify(
        named,
        [SPARK_DEFAULT_INDEX_NAME],
        [None],
        [("a",), ("b",)],
        ["a", "b"],
    )

    # non-string column name
    numbered = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]})
    verify(
        numbered,
        [SPARK_DEFAULT_INDEX_NAME],
        [None],
        [(0,), (1,)],
        ["0", "1"],
    )

    # multi-index
    named.set_index("a", append=True, inplace=True)
    verify(
        named,
        [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],
        [None, ("a",)],
        [("b",)],
        ["b"],
    )

    # multi-index columns
    named.columns = pd.MultiIndex.from_tuples([("x", "b")])
    verify(
        named,
        [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],
        [None, ("a",)],
        [("x", "b")],
        ["(x, b)"],
    )