示例#1
0
    def test_concat_multiindex_sort(self):
        """Check that ps.concat matches pd.concat for MultiIndex columns.

        SPARK-39314: Respect ps.concat sort parameter to follow pandas behavior.

        Every combination of ``ignore_index``, ``join`` and ``sort`` is applied
        to two orderings of a frame and its ``reset_index()`` counterpart, and
        the pandas-on-Spark result is compared with the pandas result.
        """
        idx = pd.MultiIndex.from_tuples(
            [("Y", "A"), ("Y", "B"), ("X", "C"), ("X", "D")]
        )
        pdf = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=idx)
        psdf = ps.from_pandas(pdf)

        ignore_indexes = [True, False]
        joins = ["inner", "outer"]
        # sort=False is only exercised when the installed pandas is 1.4 or newer.
        sorts = [True]
        if LooseVersion(pd.__version__) >= LooseVersion("1.4"):
            sorts += [False]
        objs = [
            ([psdf, psdf.reset_index()], [pdf, pdf.reset_index()]),
            ([psdf.reset_index(), psdf], [pdf.reset_index(), pdf]),
        ]
        for ignore_index, join, sort in itertools.product(
                ignore_indexes, joins, sorts):
            for i, (psdfs, pdfs) in enumerate(objs):
                # Run each combination in its own subTest so that a failure
                # reports the offending parameters and does not prevent the
                # remaining combinations from being checked.
                with self.subTest(ignore_index=ignore_index,
                                  join=join,
                                  sort=sort,
                                  pdfs=pdfs,
                                  pair=i):
                    self.assert_eq(
                        ps.concat(psdfs,
                                  ignore_index=ignore_index,
                                  join=join,
                                  sort=sort),
                        pd.concat(pdfs,
                                  ignore_index=ignore_index,
                                  join=join,
                                  sort=sort),
                    )
示例#2
0
 def complex_psdf(self):
     """Combine four series into one object via ``ps.concat(..., axis=1)``.

     The series are ``self.psser`` (defined outside this method) plus three
     single-element series built here. They are passed to ``ps.concat`` as a
     mapping keyed ``this_array``, ``that_array``, ``this_struct`` and
     ``that_struct``, in that order.
     """
     first_array = self.psser
     second_array = ps.Series([[2, 3, 4]])
     # Each "struct" series comes from a one-entry Index holding a tuple,
     # converted to a series whose index is then dropped.
     first_struct = ps.Index([("x", 1)]).to_series().reset_index(drop=True)
     second_struct = ps.Index([("a", 2)]).to_series().reset_index(drop=True)

     named_series = dict(
         this_array=first_array,
         that_array=second_array,
         this_struct=first_struct,
         that_struct=second_struct,
     )
     return ps.concat(named_series, axis=1)
示例#3
0
    def test_concat_column_axis(self):
        """Compare ``ps.concat(axis=1)`` with ``pd.concat(axis=1)`` on Series pairs.

        Each case is a pair of Series (flat-labelled or MultiIndex-labelled,
        some renamed). Every case runs for each combination of ``ignore_index``
        and ``join``. The two results are compared through their ``repr`` after
        sorting by all columns and dropping the index.
        """
        flat_ab = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]}, index=[1, 2, 3])
        flat_ab.columns.names = ["AB"]
        flat_cd = pd.DataFrame({"C": [1, 2, 3], "D": [4, 5, 6]}, index=[1, 3, 5])
        flat_cd.columns.names = ["CD"]
        ps_flat_ab = ps.from_pandas(flat_ab)
        ps_flat_cd = ps.from_pandas(flat_cd)

        # Copies that receive two-level column labels below.
        # NOTE(review): the C/D fixtures are built but no case in this method
        # uses them.
        ps_multi_ab = ps_flat_ab.copy()
        ps_multi_cd = ps_flat_cd.copy()
        multi_ab = flat_ab.copy()
        multi_cd = flat_cd.copy()

        ab_labels = pd.MultiIndex.from_tuples(
            [("X", "A"), ("X", "B")], names=["X", "AB"]
        )
        multi_ab.columns = ab_labels
        ps_multi_ab.columns = ab_labels

        cd_labels = pd.MultiIndex.from_tuples(
            [("X", "C"), ("X", "D")], names=["Y", "CD"]
        )
        multi_cd.columns = cd_labels
        ps_multi_cd.columns = cd_labels

        x_a = ("X", "A")
        x_b = ("X", "B")

        # Each entry: (pandas-on-Spark inputs, matching pandas inputs).
        cases = [
            (
                [ps_flat_ab.A, ps_flat_ab.A.rename("B")],
                [flat_ab.A, flat_ab.A.rename("B")],
            ),
            (
                [ps_multi_ab[x_a], ps_multi_ab[x_b]],
                [multi_ab[x_a], multi_ab[x_b]],
            ),
            (
                [ps_multi_ab[x_a], ps_multi_ab[x_b].rename("ABC")],
                [multi_ab[x_a], multi_ab[x_b].rename("ABC")],
            ),
            (
                [ps_multi_ab[x_a].rename("ABC"), ps_multi_ab[x_b]],
                [multi_ab[x_a].rename("ABC"), multi_ab[x_b]],
            ),
        ]

        def sorted_repr(frame):
            # Sort by every column and drop the index before taking the repr;
            # presumably this removes row-order differences -- confirm.
            ordered = frame.sort_values(list(frame.columns))
            return repr(ordered.reset_index(drop=True))

        for ignore_index, join in itertools.product([True, False], ["inner", "outer"]):
            for pair, (ps_objs, pd_objs) in enumerate(cases):
                with self.subTest(ignore_index=ignore_index, join=join, pdfs=pd_objs, pair=pair):
                    options = {"axis": 1, "ignore_index": ignore_index, "join": join}
                    actual = ps.concat(ps_objs, **options)
                    expected = pd.concat(pd_objs, **options)
                    self.assert_eq(sorted_repr(actual), sorted_repr(expected))
示例#4
0
    def test_concat_index_axis(self):
        """Compare ``ps.concat`` with ``pd.concat`` on the default axis.

        The test proceeds in this order:

        1. flat-column inputs for every ``ignore_index`` / ``join`` / ``sort``
           combination (``almost`` is passed to ``assert_eq`` for outer joins);
        2. argument-validation errors raised by ``ps.concat``;
        3. MultiIndex-column inputs for ``join="outer"`` with both sort values,
           then ``join="inner"`` with ``sort=True`` only;
        4. errors for mismatched column levels, invalid ``join`` values and
           duplicated labels.
        """
        pdf = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5], "C": [6, 7, 8]})
        # TODO: pdf.columns.names = ["ABC"]
        psdf = ps.from_pandas(pdf)

        ignore_indexes = [True, False]
        joins = ["inner", "outer"]
        sorts = [True, False]

        def check_concat(pair, psdfs, pdfs, ignore_index, join, sort, **assert_kwargs):
            # Run one ps.concat / pd.concat comparison inside its own subTest;
            # any extra keyword arguments are forwarded to assert_eq.
            options = {"ignore_index": ignore_index, "join": join, "sort": sort}
            with self.subTest(**options, pdfs=pdfs, pair=pair):
                self.assert_eq(
                    ps.concat(psdfs, **options),
                    pd.concat(pdfs, **options),
                    **assert_kwargs,
                )

        # Each selector builds the list of inputs from one frame, so the
        # pandas-on-Spark and pandas inputs are built the same way.
        flat_selectors = [
            lambda df: [df, df],
            lambda df: [df, df.reset_index()],
            lambda df: [df.reset_index(), df],
            lambda df: [df, df[["C", "A"]]],
            lambda df: [df[["C", "A"]], df],
            lambda df: [df, df["C"]],
            lambda df: [df["C"], df],
            lambda df: [df["C"], df, df["A"]],
            lambda df: [df, df["C"], df["A"]],
        ]
        flat_cases = [(select(psdf), select(pdf)) for select in flat_selectors]

        for ignore_index, join, sort in itertools.product(ignore_indexes, joins, sorts):
            for pair, (psdfs, pdfs) in enumerate(flat_cases):
                check_concat(
                    pair, psdfs, pdfs, ignore_index, join, sort,
                    almost=(join == "outer"),
                )

        # Argument validation.
        with self.assertRaisesRegex(TypeError, "first argument must be"):
            ps.concat(psdf)
        with self.assertRaisesRegex(TypeError, "cannot concatenate object"):
            ps.concat([psdf, 1])

        psdf2 = psdf.set_index("B", append=True)
        with self.assertRaisesRegex(ValueError, "Index type and names should be same"):
            ps.concat([psdf, psdf2])

        with self.assertRaisesRegex(ValueError, "No objects to concatenate"):
            ps.concat([])

        with self.assertRaisesRegex(ValueError, "All objects passed"):
            ps.concat([None, None])

        # Same data with two-level column labels.
        pdf3 = pdf.copy()
        psdf3 = psdf.copy()

        columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")])
        # TODO: columns.names = ["XYZ", "ABC"]
        pdf3.columns = columns
        psdf3.columns = columns

        multi_selectors = [
            lambda df: [df, df],
            lambda df: [df, df.reset_index()],
            lambda df: [df.reset_index(), df],
            lambda df: [df, df[[("Y", "C"), ("X", "A")]]],
            lambda df: [df[[("Y", "C"), ("X", "A")]], df],
        ]
        multi_cases = [(select(psdf3), select(pdf3)) for select in multi_selectors]

        for ignore_index, sort in itertools.product(ignore_indexes, sorts):
            for pair, (psdfs, pdfs) in enumerate(multi_cases):
                check_concat(pair, psdfs, pdfs, ignore_index, "outer", sort)

        # Skip tests for `join="inner" and sort=False` since pandas is flaky.
        for ignore_index in ignore_indexes:
            for pair, (psdfs, pdfs) in enumerate(multi_cases):
                check_concat(pair, psdfs, pdfs, ignore_index, "inner", True)

        # Flat and MultiIndex columns cannot be mixed.
        level_mismatch = "MultiIndex columns should have the same levels"
        with self.assertRaisesRegex(ValueError, level_mismatch):
            ps.concat([psdf, psdf3])
        with self.assertRaisesRegex(ValueError, level_mismatch):
            ps.concat([psdf3[("Y", "C")], psdf3])

        pdf4 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5], "C": [10, 20, 30]})
        psdf4 = ps.from_pandas(pdf4)

        # An unsupported `join` value is rejected for frames and for series.
        bad_join = r"Only can inner \(intersect\) or outer \(union\) join the other axis."
        with self.assertRaisesRegex(ValueError, bad_join):
            ps.concat([psdf, psdf4], join="")

        with self.assertRaisesRegex(ValueError, bad_join):
            ps.concat([psdf, psdf4], join="", axis=1)

        with self.assertRaisesRegex(ValueError, bad_join):
            ps.concat([psdf.A, psdf4.B], join="", axis=1)

        with self.assertRaisesRegex(
            ValueError,
            r"Labels have to be unique; however, got duplicated labels \['A'\].",
        ):
            ps.concat([psdf.A, psdf4.A], join="inner", axis=1)