Пример #1
0
    def test_from_pandas(self):
        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

        internal = InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(internal.index_map,
                       OrderedDict({SPARK_DEFAULT_INDEX_NAME: None}))
        self.assert_eq(internal.column_labels, [("a", ), ("b", )])
        self.assert_eq(internal.data_spark_column_names, ["a", "b"])
        self.assertTrue(
            internal.spark_column_for(("a", ))._jc.equals(sdf["a"]._jc))
        self.assertTrue(
            internal.spark_column_for(("b", ))._jc.equals(sdf["b"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)

        # multi-index
        pdf.set_index("a", append=True, inplace=True)

        internal = InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(
            internal.index_map,
            OrderedDict([(SPARK_INDEX_NAME_FORMAT(0), None),
                         (SPARK_INDEX_NAME_FORMAT(1), ("a", ))]),
        )
        self.assert_eq(internal.column_labels, [("b", )])
        self.assert_eq(internal.data_spark_column_names, ["b"])
        self.assertTrue(
            internal.spark_column_for(("b", ))._jc.equals(sdf["b"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)

        # multi-index columns
        pdf.columns = pd.MultiIndex.from_tuples([("x", "b")])

        internal = InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(
            internal.index_map,
            OrderedDict([(SPARK_INDEX_NAME_FORMAT(0), None),
                         (SPARK_INDEX_NAME_FORMAT(1), ("a", ))]),
        )
        self.assert_eq(internal.column_labels, [("x", "b")])
        self.assert_eq(internal.data_spark_column_names, ["(x, b)"])
        self.assertTrue(
            internal.spark_column_for(
                ("x", "b"))._jc.equals(sdf["(x, b)"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)
Пример #2
0
    def test_from_pandas(self):
        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

        internal = InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])
        self.assert_eq(internal.index_names, [None])
        self.assert_eq(internal.column_labels, [("a",), ("b",)])
        self.assert_eq(internal.data_spark_column_names, ["a", "b"])
        self.assertTrue(internal.spark_column_for(("a",))._jc.equals(sdf["a"]._jc))
        self.assertTrue(internal.spark_column_for(("b",))._jc.equals(sdf["b"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)

        # non-string column name
        pdf1 = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]})

        internal = InternalFrame.from_pandas(pdf1)
        sdf = internal.spark_frame

        self.assert_eq(internal.index_spark_column_names, [SPARK_DEFAULT_INDEX_NAME])
        self.assert_eq(internal.index_names, [None])
        self.assert_eq(internal.column_labels, [(0,), (1,)])
        self.assert_eq(internal.data_spark_column_names, ["0", "1"])
        self.assertTrue(internal.spark_column_for((0,))._jc.equals(sdf["0"]._jc))
        self.assertTrue(internal.spark_column_for((1,))._jc.equals(sdf["1"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf1)

        # multi-index
        pdf.set_index("a", append=True, inplace=True)

        internal = InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(
            internal.index_spark_column_names,
            [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],
        )
        self.assert_eq(internal.index_names, [None, ("a",)])
        self.assert_eq(internal.column_labels, [("b",)])
        self.assert_eq(internal.data_spark_column_names, ["b"])
        self.assertTrue(internal.spark_column_for(("b",))._jc.equals(sdf["b"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)

        # multi-index columns
        pdf.columns = pd.MultiIndex.from_tuples([("x", "b")])

        internal = InternalFrame.from_pandas(pdf)
        sdf = internal.spark_frame

        self.assert_eq(
            internal.index_spark_column_names,
            [SPARK_INDEX_NAME_FORMAT(0), SPARK_INDEX_NAME_FORMAT(1)],
        )
        self.assert_eq(internal.index_names, [None, ("a",)])
        self.assert_eq(internal.column_labels, [("x", "b")])
        self.assert_eq(internal.data_spark_column_names, ["(x, b)"])
        self.assertTrue(internal.spark_column_for(("x", "b"))._jc.equals(sdf["(x, b)"]._jc))

        self.assert_eq(internal.to_pandas_frame, pdf)