Пример #1
0
 def test_to_html_compat(self):
     """Round-trip a small random frame through ``to_html`` and ``read_html``."""
     raw = tm.makeCustomDataframe(
         4,
         3,
         data_gen_f=lambda *args: np.random.rand(),
         c_idx_names=False,
         r_idx_names=False,
     )
     # Format every value to three decimals as text, then convert back to float.
     expected = raw.applymap("{:.3f}".format).astype(float)

     html = expected.to_html()
     tables = self.read_html(html, attrs={"class": "dataframe"}, index_col=0)
     tm.assert_frame_equal(tables[0], expected)
Пример #2
0
 def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols):
     """Compare the frames from ``_return_result_expected`` for the given index types."""
     frame = tm.makeCustomDataframe(
         nrows,
         ncols,
         r_idx_type=r_idx_type,
         c_idx_type=c_idx_type,
     )
     outcome = self._return_result_expected(frame, 1000, r_idx_type, c_idx_type)
     result, expected = outcome
     tm.assert_frame_equal(result, expected, check_names=False)
Пример #3
0
    def test_to_csv_dups_cols(self):
        """Round-trip frames with duplicated column labels through ``to_csv``.

        Three scenarios are written and read back: a single-dtype frame, a
        frame concatenated from blocks of several dtypes, and the GH3457
        case. After reading, the original duplicated labels are put back on
        the result before it is compared with the source frame.
        """
        df = DataFrame(
            np.random.randn(1000, 30),
            columns=list(range(15)) + list(range(15)),
            dtype="float64",
        )

        with tm.ensure_clean() as filename:
            df.to_csv(filename)  # single dtype, fine
            result = read_csv(filename, index_col=0)
            result.columns = df.columns
            tm.assert_frame_equal(result, df)

        df_float = DataFrame(np.random.randn(1000, 3), dtype="float64")
        # Cast explicitly: passing non-round floats together with an integer
        # dtype to the constructor is not a reliable way to get an int frame.
        df_int = DataFrame(np.random.randn(1000, 3)).astype("int64")
        df_bool = DataFrame(True, index=df_float.index, columns=range(3))
        df_object = DataFrame("foo", index=df_float.index, columns=range(3))
        df_dt = DataFrame(Timestamp("20010101"),
                          index=df_float.index,
                          columns=range(3))
        df = pd.concat([df_float, df_int, df_bool, df_object, df_dt],
                       axis=1,
                       ignore_index=True)

        # Five blocks of three columns, each labelled 0, 1, 2.
        df.columns = [0, 1, 2] * 5

        with tm.ensure_clean() as filename:
            df.to_csv(filename)
            result = read_csv(filename, index_col=0)

            # date cols
            for label in ["0.4", "1.4", "2.4"]:
                result[label] = to_datetime(result[label])

            result.columns = df.columns
            tm.assert_frame_equal(result, df)

        # GH3457

        N = 10
        df = tm.makeCustomDataframe(N, 3)
        df.columns = ["a", "a", "b"]

        with tm.ensure_clean() as filename:
            df.to_csv(filename)

            # read_csv will rename the dups columns
            result = read_csv(filename, index_col=0)
            result = result.rename(columns={"a.1": "a"})
            tm.assert_frame_equal(result, df)
Пример #4
0
 def test_to_csv_dup_cols(self, nrows):
     """Compare result/expected for a frame with duplicated row and column labels."""
     frame = tm.makeCustomDataframe(nrows, 3)

     col_labels = list(frame.columns)
     row_labels = list(frame.index)
     # Overwrite the first two and last two labels on each axis with one name.
     for labels, dupe in ((col_labels, "dupe"), (row_labels, "rdupe")):
         labels[:2] = [dupe, dupe]
         labels[-2:] = [dupe, dupe]

     frame.index = row_labels
     frame.columns = col_labels

     outcome = self._return_result_expected(frame, 1000, dupe_col=True)
     result, expected = outcome
     tm.assert_frame_equal(result, expected, check_names=False)
Пример #5
0
    def test_concat_invalid(self):
        """``concat`` of a frame with a non-NDFrame object raises ``TypeError``."""
        frame = tm.makeCustomDataframe(10, 2)
        invalid_objs = [1, {}, [1, 2], (1, 2)]

        for bad in invalid_objs:
            expected_msg = (
                f"cannot concatenate object of type '{type(bad)}'; "
                "only Series and DataFrame objs are valid"
            )
            with pytest.raises(TypeError, match=expected_msg):
                concat([frame, bad])
Пример #6
0
    def test_join_with_period_index(self, join_type):
        """Joining the columns with a row index matches the object-dtype join."""
        frame = tm.makeCustomDataframe(
            10,
            10,
            data_gen_f=lambda *args: np.random.randint(2),
            c_idx_type="p",
            r_idx_type="dt",
        )
        head_of_first_col = frame.iloc[:5, 0]
        columns = frame.columns
        other = head_of_first_col.index

        # Reference result: cast the columns to object dtype before joining.
        expected = columns.astype("O").join(other, how=join_type)
        result = columns.join(other, how=join_type)
        tm.assert_index_equal(expected, result)
Пример #7
0
 def test_does_not_convert_mixed_integer(self):
     """An outer join of columns and index stays object dtype when re-joined."""
     frame = tm.makeCustomDataframe(
         10,
         10,
         data_gen_f=lambda *args, **kwargs: np.random.randn(),
         r_idx_type="i",
         c_idx_type="dt",
     )
     outer = frame.columns.join(frame.index, how="outer")
     rejoined = outer.join(frame.columns)

     assert outer.dtype == np.dtype("O")
     assert outer.dtype == rejoined.dtype
     tm.assert_numpy_array_equal(outer.values, rejoined.values)
Пример #8
0
    def test_join_does_not_recur(self):
        """Outer join of a row index with the columns yields an object Index."""
        frame = tm.makeCustomDataframe(
            3,
            2,
            data_gen_f=lambda *args: np.random.randint(2),
            c_idx_type="p",
            r_idx_type="dt",
        )
        rows = frame.iloc[:2, 0].index
        cols = frame.columns

        res = rows.join(cols, how="outer")
        # Expected: the two row labels followed by the two column labels.
        expected = Index([rows[0], rows[1], cols[0], cols[1]], dtype=object)
        tm.assert_index_equal(res, expected)
Пример #9
0
    def test_excel_010_hemstring(
        self, merge_cells, c_idx_nlevels, r_idx_nlevels, use_headers, path
    ):
        """Write frames to Excel and check the shape of what is read back.

        A one-cell frame is checked first, then a custom frame with the
        requested number of row/column index levels. For multi-level
        columns, only the ``NotImplementedError`` with ``index=False`` is
        asserted.
        """

        def roundtrip(data, header=True, parser_hdr=0, index=True):
            """Write ``data`` to ``path`` and read the first sheet back."""
            data.to_excel(path, header=header, merge_cells=merge_cells, index=index)

            with ExcelFile(path) as xf:
                first_sheet = xf.sheet_names[0]
                return pd.read_excel(xf, sheet_name=first_sheet, header=parser_hdr)

        # Basic test.
        if use_headers:
            parser_header = 0
        else:
            parser_header = None
        basic = roundtrip(DataFrame([0]), use_headers, parser_header)

        assert basic.shape == (1, 2)
        assert basic.iloc[0, 0] is not np.nan

        # More complex tests with multi-index.
        nrows, ncols = 5, 3

        # ensure limited functionality in 0.10
        # override of gh-2370 until sorted out in 0.11
        frame = tm.makeCustomDataframe(
            nrows, ncols, r_idx_nlevels=r_idx_nlevels, c_idx_nlevels=c_idx_nlevels
        )

        # This branch will be removed once multi-column Excel writing
        # is implemented. For now fixing gh-9794.
        if c_idx_nlevels > 1:
            msg = (
                "Writing to Excel with MultiIndex columns and no index "
                "\\('index'=False\\) is not yet implemented."
            )
            with pytest.raises(NotImplementedError, match=msg):
                roundtrip(frame, use_headers, index=False)
            return

        res = roundtrip(frame, use_headers)

        # Without headers the first row is taken as columns.
        expected_rows = nrows if use_headers else nrows - 1
        assert res.shape == (expected_rows, ncols + r_idx_nlevels)

        # No NaNs.
        n_res_rows = len(res.index)
        n_res_cols = len(res.columns)
        for r in range(n_res_rows):
            for c in range(n_res_cols):
                assert res.iloc[r, c] is not np.nan
Пример #10
0
    def test_to_csv_cols_reordering(self):
        """Writing a reordered column subset reads back as that subset."""
        # GH3454
        chunk = 5
        n_rows = int(chunk * 2.5)

        frame = tm.makeCustomDataframe(n_rows, 3)
        labels = frame.columns
        # Third column first, then the first column.
        reordered = [labels[2], labels[0]]

        with tm.ensure_clean() as path:
            frame.to_csv(path, columns=reordered, chunksize=chunk)
            read_back = read_csv(path, index_col=0)

        tm.assert_frame_equal(frame[reordered], read_back, check_names=False)
Пример #11
0
    def test_loc_empty_list_indexer_is_ok(self):
        """``loc`` with an empty list matches the equivalent empty ``iloc`` slice."""

        def _assert_same(actual, expected):
            # Index and column types must match too, not just the values.
            tm.assert_frame_equal(
                actual, expected, check_index_type=True, check_column_type=True
            )

        frame = tm.makeCustomDataframe(5, 2)

        # vertical empty
        _assert_same(frame.loc[:, []], frame.iloc[:, :0])
        # horizontal empty
        _assert_same(frame.loc[[], :], frame.iloc[:0, :])
        # horizontal empty
        _assert_same(frame.loc[[]], frame.iloc[:0, :])
Пример #12
0
    def test_excel_multindex_roundtrip(
        self, ext, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels, request
    ):
        """Round-trip a custom frame through Excel three times.

        The frame is written and read back as generated, then again after
        the first row is set to NaN, and once more after the last row is
        set to NaN as well.
        """
        # see gh-4679
        with tm.ensure_clean(ext) as pth:
            named_single_level_cols = c_idx_levels == 1 and c_idx_names
            unnamed_three_level_rows = r_idx_levels == 3 and not r_idx_names
            if named_single_level_cols and not unnamed_three_level_rows:
                request.node.add_marker(
                    pytest.mark.xfail(
                        reason="Column index name cannot be serialized unless "
                        "it's a MultiIndex"
                    )
                )

            # Empty name case current read in as
            # unnamed levels, not Nones.
            check_names = r_idx_names or r_idx_levels <= 1

            df = tm.makeCustomDataframe(
                5,
                5,
                c_idx_names=c_idx_names,
                r_idx_names=r_idx_names,
                c_idx_nlevels=c_idx_levels,
                r_idx_nlevels=r_idx_levels,
            )

            # None: frame as generated; 0 and -1: also blank that row first.
            for nan_row in (None, 0, -1):
                if nan_row is not None:
                    df.iloc[nan_row, :] = np.nan
                df.to_excel(pth)

                act = pd.read_excel(
                    pth,
                    index_col=list(range(r_idx_levels)),
                    header=list(range(c_idx_levels)),
                )
                tm.assert_frame_equal(df, act, check_names=check_names)
Пример #13
0
 def test_slice_locs_with_type_mismatch(self):
     """``slice_locs`` with mismatched key types raises ``TypeError``."""
     mismatch = "^Level type mismatch"

     time_frame = tm.makeTimeDataFrame()
     idx = time_frame.stack().index
     with pytest.raises(TypeError, match=mismatch):
         idx.slice_locs((1, 3))
     with pytest.raises(TypeError, match=mismatch):
         idx.slice_locs(time_frame.index[5] + timedelta(seconds=30), (5, 2))

     custom_frame = tm.makeCustomDataframe(5, 5)
     idx = custom_frame.stack().index
     with pytest.raises(TypeError, match=mismatch):
         idx.slice_locs(timedelta(seconds=30))
     # TODO: Try creating a UnicodeDecodeError in exception message
     with pytest.raises(TypeError, match=mismatch):
         idx.slice_locs(custom_frame.index[1], (16, "a"))
Пример #14
0
    def test_to_csv_new_dupe_cols(self):
        """CSV round trip of a frame with duplicate columns, with and without selection."""
        import pandas as pd

        chunksize = 5
        N = int(chunksize * 2.5)

        def _check_df(df, cols=None):
            """Write ``df`` (optionally only ``cols``) and compare what is read back."""
            with tm.ensure_clean() as path:
                df.to_csv(path, columns=cols, chunksize=chunksize)
                rs_c = pd.read_csv(path, index_col=0)

                if cols is None:
                    # wrote in the same order
                    rs_c.columns = df.columns
                    tm.assert_frame_equal(df, rs_c, check_names=False)
                    return

                # we wrote them in a different order
                # so compare them in that order
                if df.columns.is_unique:
                    rs_c.columns = cols
                else:
                    indexer, _ = df.columns.get_indexer_non_unique(cols)
                    rs_c.columns = df.columns.take(indexer)

                for label in cols:
                    from_df = df[label]
                    from_rs = rs_c[label]
                    # A duplicated label selects a frame, a unique one a Series.
                    if isinstance(from_df, Series):
                        tm.assert_series_equal(from_df, from_rs)
                    else:
                        tm.assert_frame_equal(from_df, from_rs, check_names=False)

        # dupe cols
        frame = tm.makeCustomDataframe(N, 3)
        frame.columns = ["a", "a", "b"]
        _check_df(frame, None)

        # dupe cols with selection
        _check_df(frame, ["b", "a"])
Пример #15
0
def test_header_multi_index(all_parsers):
    """Parse a CSV with four header rows and two index columns."""
    data = """\
C0,,C_l0_g0,C_l0_g1,C_l0_g2

C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
"""
    expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)

    buffer = StringIO(data)
    result = all_parsers.read_csv(buffer, header=[0, 1, 2, 3], index_col=[0, 1])
    tm.assert_frame_equal(result, expected)
Пример #16
0
    def test_register_writer(self):
        """Register a dummy ``ExcelWriter`` subclass and check ``to_excel`` uses it.

        ``DummyClass`` records on the class object which of its hooks were
        invoked. The writer is exercised twice: once selected through the
        ``io.excel.xlsx.writer`` option and once through ``engine="dummy"``.
        """
        class DummyClass(ExcelWriter):
            # Class-level flags flipped by the hooks below.
            called_save = False
            called_write_cells = False
            called_sheets = False
            _supported_extensions = ("xlsx", "xls")
            _engine = "dummy"

            def book(self):
                # Stub: does nothing.
                pass

            def _save(self):
                # Record the call on the class, not on the instance.
                type(self).called_save = True

            def _write_cells(self, *args, **kwargs):
                # Record the call on the class, not on the instance.
                type(self).called_write_cells = True

            @property
            def sheets(self):
                # Accessing this property is recorded; it returns None.
                type(self).called_sheets = True

            @classmethod
            def assert_called_and_reset(cls):
                # Save and write must have been called, ``sheets`` must not.
                assert cls.called_save
                assert cls.called_write_cells
                assert not cls.called_sheets
                # Only the two "called" flags asserted true are reset here.
                cls.called_save = False
                cls.called_write_cells = False

        register_writer(DummyClass)

        # First pass: select the dummy writer through the option.
        with option_context("io.excel.xlsx.writer", "dummy"):
            path = "something.xlsx"
            with tm.ensure_clean(path) as filepath:
                with ExcelWriter(filepath) as writer:
                    assert isinstance(writer, DummyClass)
                df = tm.makeCustomDataframe(1, 1)
                df.to_excel(filepath)
            DummyClass.assert_called_and_reset()

        # Second pass: reuse ``df`` and select the writer by engine name.
        with tm.ensure_clean("something.xls") as filepath:
            df.to_excel(filepath, engine="dummy")
        DummyClass.assert_called_and_reset()
Пример #17
0
def test_tabulator_dataframe_replace_data(document, comm):
    """Replacing ``Tabulator.value`` updates the model's columns and data."""
    table = Tabulator(makeMixedDataFrame())
    model = table.get_root(document, comm)

    table.value = makeCustomDataframe(2, 2)

    expected_fields = ['R0', 'C_l0_g0', 'C_l0_g1']

    assert len(model.columns) == 3
    for column, field in zip(model.columns, expected_fields):
        assert column.field == field

    assert model.configuration == {
        'columns': [{'field': field} for field in expected_fields],
        'selectable': True
    }

    expected_data = {
        'C_l0_g0': np.array(['R0C0', 'R1C0'], dtype=object),
        'C_l0_g1': np.array(['R0C1', 'R1C1'], dtype=object),
        'R0': np.array(['R_l0_g0', 'R_l0_g1'], dtype=object)
    }
    for name, column_values in model.source.data.items():
        np.testing.assert_array_equal(column_values, expected_data[name])
Пример #18
0
def make_dataframe(
        nrows: int,
        ncols: int,
        data_gen_f: Optional[Callable[[int, int], Any]] = None) -> DataFrame:
    """Local mapping of `pandas._testing.makeCustomDataframe`.

    Resulting `DataFrame` will have neither a columns name nor an index
    name. Indices will be a zero-based integer list.

    Parameter names and descriptions are based on those found in
    `pandas._testing.py`.
        https://github.com/pandas-dev/pandas/blob/b687cd4d9e520666a956a60849568a98dd00c672/pandas/_testing.py#L1956

    Args:
        nrows (int): Number of rows.
        ncols (int): Number of columns.
        data_gen_f (func): Function f(row,col) that returns a value for
            the given position. When omitted, the single-column case falls
            back to this module's `make_dataframe_value`.

    Returns:
        DataFrame: Generated `DataFrame` object.
    """
    # pandas bug (?) in makeCustomIndex when nentries = 1
    if ncols == 1:
        # Honour a caller-supplied generator instead of silently ignoring it.
        gen = make_dataframe_value if data_gen_f is None else data_gen_f
        return DataFrame({"C_l0_g0": [gen(row, 0) for row in range(nrows)]})

    # Imported only on this path, the only place the pandas helper is needed.
    from pandas._testing import makeCustomDataframe

    return makeCustomDataframe(
        nrows,
        ncols,
        c_idx_names=False,
        r_idx_names=False,
        data_gen_f=data_gen_f,
        r_idx_type="i",
    )
Пример #19
0
def df(request):
    """Build the DataFrame selected by ``request.param``.

    Recognised values are ``"delims"``, ``"utf8"``, ``"utf16"``,
    ``"string"``, ``"long"``, ``"nonascii"``, ``"colwidth"``, ``"mixed"``,
    ``"float"`` and ``"int"``.

    Raises:
        ValueError: If ``request.param`` is not one of the values above.
    """
    data_type = request.param

    def _custom_frame(nrows, data_gen_f=None):
        """Return an ``nrows`` x 3 custom frame with the shared index options."""
        return tm.makeCustomDataframe(
            nrows,
            3,
            data_gen_f=data_gen_f,
            c_idx_type="s",
            r_idx_type="i",
            c_idx_names=[None],
            r_idx_names=[None],
        )

    if data_type == "delims":
        return pd.DataFrame({"a": ['"a,\t"b|c', "d\tef´"], "b": ["hi'j", "k''lm"]})
    elif data_type == "utf8":
        return pd.DataFrame({"a": ["µasd", "Ωœ∑´"], "b": ["øπ∆˚¬", "œ∑´®"]})
    elif data_type == "utf16":
        return pd.DataFrame(
            {"a": ["\U0001f44d\U0001f44d", "\U0001f44d\U0001f44d"], "b": ["abc", "def"]}
        )
    elif data_type == "string":
        return _custom_frame(5)
    elif data_type == "long":
        # One row more than the ``display.max_rows`` option.
        max_rows = get_option("display.max_rows")
        return _custom_frame(max_rows + 1, data_gen_f=lambda *args: randint(2))
    elif data_type == "nonascii":
        return pd.DataFrame({"en": "in English".split(), "es": "en español".split()})
    elif data_type == "colwidth":
        # Every cell is one character longer than ``display.max_colwidth``.
        _cw = get_option("display.max_colwidth") + 1
        return _custom_frame(5, data_gen_f=lambda *args: "x" * _cw)
    elif data_type == "mixed":
        return DataFrame(
            {
                "a": np.arange(1.0, 6.0) + 0.01,
                "b": np.arange(1, 6).astype(np.int64),
                "c": list("abcde"),
            }
        )
    elif data_type == "float":
        return _custom_frame(5, data_gen_f=lambda r, c: float(r) + 0.01)
    elif data_type == "int":
        return _custom_frame(5, data_gen_f=lambda *args: randint(2))
    else:
        # Name the offending parameter so a bad parametrisation is easy to spot.
        raise ValueError(f"unknown data type: {data_type!r}")
Пример #20
0
 def test_to_csv_nrows(self, nrows):
     """Compare result/expected for ``nrows`` rows with "dt" rows and "s" columns."""
     frame = tm.makeCustomDataframe(
         nrows,
         4,
         r_idx_type="dt",
         c_idx_type="s",
     )
     outcome = self._return_result_expected(frame, 1000, "dt", "s")
     result, expected = outcome
     tm.assert_frame_equal(result, expected, check_names=False)
Пример #21
0
    def test_dups_fancy_indexing(self):
        """Label-based selection on frames with duplicated labels.

        Covers column selection with duplicate names, construction with
        duplicate column names across dtypes, ``loc`` on a duplicated index,
        and the ``KeyError`` raised when any requested label is missing.
        """
        missing_msg = "with any missing labels"

        # GH 3455

        frame = tm.makeCustomDataframe(10, 3)
        frame.columns = ["a", "a", "b"]
        selected_cols = frame[["b", "a"]].columns
        tm.assert_index_equal(selected_cols, Index(["b", "a", "a"]))

        # across dtypes
        via_ctor = DataFrame(
            [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa")
        )
        via_ctor.head()
        str(via_ctor)
        via_setter = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]])
        via_setter.columns = list("aaaaaaa")

        # TODO(wesm): unused?
        df_v = via_ctor.iloc[:, 4]  # noqa
        res_v = via_setter.iloc[:, 4]  # noqa

        tm.assert_frame_equal(via_ctor, via_setter)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {
                "test": [5, 7, 9, 11],
                "test1": [4.0, 5, 6, 7],
                "other": list("abcd"),
            },
            index=["A", "A", "B", "C"],
        )
        rows = ["C", "B"]
        expected = DataFrame(
            {"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]},
            index=rows,
        )
        tm.assert_frame_equal(df.loc[rows], expected)
        tm.assert_frame_equal(df.loc[Index(rows)], expected)

        # The second list: see GH5553, make sure we use the right indexer
        for rows in (["C", "B", "E"], ["F", "G", "H", "C", "B", "E"]):
            with pytest.raises(KeyError, match=missing_msg):
                df.loc[rows]

        # List containing only missing label
        dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD"))
        only_missing_msg = re.escape(
            "\"None of [Index(['E'], dtype='object')] are in the [index]\""
        )
        with pytest.raises(KeyError, match=only_missing_msg):
            dfnu.loc[["E"]]

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        for values in ([0, 1, 2], list("abc")):
            df = DataFrame({"A": values})
            with pytest.raises(KeyError, match=missing_msg):
                df.loc[[0, 8, 0]]

        # non unique with non unique selector
        df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"])
        with pytest.raises(KeyError, match=missing_msg):
            df.loc[["A", "A", "E"]]
Пример #22
0
 def test_to_csv_chunksize(self):
     """Compare result/expected for a two-level row index just over half a chunk."""
     chunk = 1000
     n_rows = chunk // 2 + 1
     frame = tm.makeCustomDataframe(n_rows, 2, r_idx_nlevels=2)

     outcome = self._return_result_expected(frame, chunk, rnlvl=2)
     result, expected = outcome
     tm.assert_frame_equal(result, expected, check_names=False)
Пример #23
0
    def test_to_csv_multiindex(self, float_frame, datetime_frame):
        """Exercise ``to_csv`` / ``read_csv`` with MultiIndex rows and columns.

        The first ``with`` block replaces the index of the two fixture frames
        with a MultiIndex, round-trips them and restores the original index.
        The second block covers MultiIndex columns (GH3571, GH1651, GH3141)
        and the invalid-option errors. The last block writes an empty slice.
        """

        frame = float_frame
        old_index = frame.index
        # Two integer levels, each as long as the original index.
        arrays = np.arange(len(old_index) * 2).reshape(2, -1)
        new_index = MultiIndex.from_arrays(arrays, names=["first", "second"])
        frame.index = new_index

        with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:

            # These two writes are not read back; they only need to succeed.
            frame.to_csv(path, header=False)
            frame.to_csv(path, columns=["A", "B"])

            # round trip
            frame.to_csv(path)

            df = self.read_csv(path, index_col=[0, 1], parse_dates=False)

            # TODO to_csv drops column name
            tm.assert_frame_equal(frame, df, check_names=False)
            assert frame.index.names == df.index.names

            # needed if setUp becomes a class method
            float_frame.index = old_index

            # try multiindex with dates
            tsframe = datetime_frame
            old_index = tsframe.index
            new_index = [old_index, np.arange(len(old_index))]
            tsframe.index = MultiIndex.from_arrays(new_index)

            tsframe.to_csv(path, index_label=["time", "foo"])
            recons = self.read_csv(path, index_col=[0, 1])

            # TODO to_csv drops column name
            tm.assert_frame_equal(tsframe, recons, check_names=False)

            # do not load index
            tsframe.to_csv(path)
            recons = self.read_csv(path, index_col=None)
            # The two index levels come back as two extra ordinary columns.
            assert len(recons.columns) == len(tsframe.columns) + 2

            # no index
            tsframe.to_csv(path, index=False)
            recons = self.read_csv(path, index_col=None)
            tm.assert_almost_equal(recons.values, datetime_frame.values)

            # needed if setUp becomes class method
            datetime_frame.index = old_index

        with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
            # GH3571, GH1651, GH3141

            def _make_frame(names=None):
                # Build a 3x3 int64 frame with two-level columns; ``names=True``
                # selects the default level names "first" and "second".
                if names is True:
                    names = ["first", "second"]
                return DataFrame(
                    np.random.randint(0, 10, size=(3, 3)),
                    columns=MultiIndex.from_tuples(
                        [("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names
                    ),
                    dtype="int64",
                )

            # column & index are multi-index
            df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1])
            tm.assert_frame_equal(df, result)

            # column is mi
            df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=0)
            tm.assert_frame_equal(df, result)

            # dup column names?
            df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2])
            tm.assert_frame_equal(df, result)

            # writing with no index
            df = _make_frame()
            df.to_csv(path, index=False)
            result = read_csv(path, header=[0, 1])
            tm.assert_frame_equal(df, result)

            # we lose the names here
            df = _make_frame(True)
            df.to_csv(path, index=False)
            result = read_csv(path, header=[0, 1])
            assert com.all_none(*result.columns.names)
            # Restore the level names before comparing.
            result.columns.names = df.columns.names
            tm.assert_frame_equal(df, result)

            # whatsnew example
            df = _make_frame()
            df.to_csv(path)
            result = read_csv(path, header=[0, 1], index_col=[0])
            tm.assert_frame_equal(df, result)

            df = _make_frame(True)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1], index_col=[0])
            tm.assert_frame_equal(df, result)

            # invalid options
            df = _make_frame(True)
            df.to_csv(path)

            # Asking for more header rows than the file has lines must fail.
            for i in [6, 7]:
                msg = f"len of {i}, but only 5 lines in file"
                with pytest.raises(ParserError, match=msg):
                    read_csv(path, header=list(range(i)), index_col=0)

            # write with cols
            msg = "cannot specify cols with a MultiIndex"
            with pytest.raises(TypeError, match=msg):
                df.to_csv(path, columns=["foo", "bar"])

        with tm.ensure_clean("__tmp_to_csv_multiindex__") as path:
            # empty
            # ``tsframe`` is the frame bound in the first ``with`` block.
            tsframe[:0].to_csv(path)
            recons = self.read_csv(path)

            exp = tsframe[:0]
            exp.index = []

            tm.assert_index_equal(recons.columns, exp.columns)
            assert len(recons) == 0
Пример #24
0
    def test_to_csv_moar(self):
        """Round-trip many custom frames through ``to_csv`` / ``read_csv``.

        ``_do_test`` writes a frame in chunks, reads it back, normalises the
        index and column labels of both frames to comparable dtypes and
        asserts equality. The loops below call it for a range of row counts
        around ``N``, ``2 * N`` and ``chunksize // ncols``, for several index
        types, duplicate labels and multi-level indexes.
        """
        def _do_test(
            df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False
        ):
            # df: frame to round-trip.
            # r_dtype / c_dtype: index type codes used to normalise the labels.
            # rnlvl / cnlvl: number of row / column index levels.
            # dupe_col: the frame has duplicated column labels.

            kwargs = dict(parse_dates=False)
            if cnlvl:
                if rnlvl is not None:
                    kwargs["index_col"] = list(range(rnlvl))
                kwargs["header"] = list(range(cnlvl))

                with tm.ensure_clean("__tmp_to_csv_moar__") as path:
                    df.to_csv(path, encoding="utf8", chunksize=chunksize)
                    recons = self.read_csv(path, **kwargs)
            else:
                kwargs["header"] = 0

                with tm.ensure_clean("__tmp_to_csv_moar__") as path:
                    df.to_csv(path, encoding="utf8", chunksize=chunksize)
                    recons = self.read_csv(path, **kwargs)

            def _to_uni(x):
                # Decode anything that is not already ``str`` as UTF-8.
                if not isinstance(x, str):
                    return x.decode("utf8")
                return x

            if dupe_col:
                # read_Csv disambiguates the columns by
                # labeling them dupe.1,dupe.2, etc'. monkey patch columns
                recons.columns = df.columns
            if rnlvl and not cnlvl:
                # Rebuild the row MultiIndex from the parsed index plus the
                # leading data columns, then drop those columns.
                delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)]
                ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
                recons.index = ix
                recons = recons.iloc[:, rnlvl - 1 :]

            # Index type code -> numpy dtype used for the fallback branches.
            type_map = dict(i="i", f="f", s="O", u="O", dt="O", p="O")
            if r_dtype:
                if r_dtype == "u":  # unicode
                    r_dtype = "O"
                    recons.index = np.array(
                        [_to_uni(label) for label in recons.index], dtype=r_dtype
                    )
                    df.index = np.array(
                        [_to_uni(label) for label in df.index], dtype=r_dtype
                    )
                elif r_dtype == "dt":  # datetime
                    r_dtype = "O"
                    recons.index = np.array(
                        [Timestamp(label) for label in recons.index], dtype=r_dtype
                    )
                    df.index = np.array(
                        [Timestamp(label) for label in df.index], dtype=r_dtype
                    )
                elif r_dtype == "p":  # period
                    r_dtype = "O"
                    idx_list = to_datetime(recons.index)
                    recons.index = np.array(
                        [Timestamp(label) for label in idx_list], dtype=r_dtype
                    )
                    df.index = np.array(
                        list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype
                    )
                else:
                    r_dtype = type_map.get(r_dtype)
                    recons.index = np.array(recons.index, dtype=r_dtype)
                    df.index = np.array(df.index, dtype=r_dtype)
            if c_dtype:
                # Same normalisation as above, applied to the column labels.
                if c_dtype == "u":
                    c_dtype = "O"
                    recons.columns = np.array(
                        [_to_uni(label) for label in recons.columns], dtype=c_dtype
                    )
                    df.columns = np.array(
                        [_to_uni(label) for label in df.columns], dtype=c_dtype
                    )
                elif c_dtype == "dt":
                    c_dtype = "O"
                    recons.columns = np.array(
                        [Timestamp(label) for label in recons.columns], dtype=c_dtype
                    )
                    df.columns = np.array(
                        [Timestamp(label) for label in df.columns], dtype=c_dtype
                    )
                elif c_dtype == "p":
                    c_dtype = "O"
                    col_list = to_datetime(recons.columns)
                    recons.columns = np.array(
                        [Timestamp(label) for label in col_list], dtype=c_dtype
                    )
                    col_list = df.columns.to_timestamp()
                    df.columns = np.array(
                        [Timestamp(label) for label in col_list], dtype=c_dtype
                    )
                else:
                    c_dtype = type_map.get(c_dtype)
                    recons.columns = np.array(recons.columns, dtype=c_dtype)
                    df.columns = np.array(df.columns, dtype=c_dtype)

            tm.assert_frame_equal(df, recons, check_names=False)

        N = 100
        # ``chunksize`` is read by ``_do_test`` through its closure.
        chunksize = 1000
        ncols = 4
        base = chunksize // ncols
        # Datetime rows, string columns.
        for nrows in [
            2,
            10,
            N - 1,
            N,
            N + 1,
            N + 2,
            2 * N - 2,
            2 * N - 1,
            2 * N,
            2 * N + 1,
            2 * N + 2,
            base - 1,
            base,
            base + 1,
        ]:
            _do_test(
                tm.makeCustomDataframe(nrows, ncols, r_idx_type="dt", c_idx_type="s"),
                "dt",
                "s",
            )

        # Several row/column index type combinations and column counts.
        for r_idx_type, c_idx_type in [("i", "i"), ("s", "s"), ("u", "dt"), ("p", "p")]:
            for ncols in [1, 2, 3, 4]:
                base = chunksize // ncols
                for nrows in [
                    2,
                    10,
                    N - 1,
                    N,
                    N + 1,
                    N + 2,
                    2 * N - 2,
                    2 * N - 1,
                    2 * N,
                    2 * N + 1,
                    2 * N + 2,
                    base - 1,
                    base,
                    base + 1,
                ]:
                    _do_test(
                        tm.makeCustomDataframe(
                            nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type
                        ),
                        r_idx_type,
                        c_idx_type,
                    )

        # Default index types.
        for ncols in [1, 2, 3, 4]:
            base = chunksize // ncols
            for nrows in [
                10,
                N - 2,
                N - 1,
                N,
                N + 1,
                N + 2,
                2 * N - 2,
                2 * N - 1,
                2 * N,
                2 * N + 1,
                2 * N + 2,
                base - 1,
                base,
                base + 1,
            ]:
                _do_test(tm.makeCustomDataframe(nrows, ncols))

        # Duplicated row and column labels.
        for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]:
            df = tm.makeCustomDataframe(nrows, 3)
            cols = list(df.columns)
            cols[:2] = ["dupe", "dupe"]
            cols[-2:] = ["dupe", "dupe"]
            ix = list(df.index)
            ix[:2] = ["rdupe", "rdupe"]
            ix[-2:] = ["rdupe", "rdupe"]
            df.index = ix
            df.columns = cols
            _do_test(df, dupe_col=True)

        # A frame with an index but no columns.
        _do_test(DataFrame(index=np.arange(10)))
        _do_test(
            tm.makeCustomDataframe(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2
        )
        # Two-level rows, two-level columns, and both together.
        for ncols in [2, 3, 4]:
            base = int(chunksize // ncols)
            for nrows in [
                10,
                N - 2,
                N - 1,
                N,
                N + 1,
                N + 2,
                2 * N - 2,
                2 * N - 1,
                2 * N,
                2 * N + 1,
                2 * N + 2,
                base - 1,
                base,
                base + 1,
            ]:
                _do_test(tm.makeCustomDataframe(nrows, ncols, r_idx_nlevels=2), rnlvl=2)
                _do_test(tm.makeCustomDataframe(nrows, ncols, c_idx_nlevels=2), cnlvl=2)
                _do_test(
                    tm.makeCustomDataframe(
                        nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2
                    ),
                    rnlvl=2,
                    cnlvl=2,
                )
Пример #25
0
 def test_to_csv_params(self, nrows, df_params, func_params, ncols):
     """Compare result/expected for parametrized frame and helper keyword arguments."""
     frame = tm.makeCustomDataframe(nrows, ncols, **df_params)
     outcome = self._return_result_expected(frame, 1000, **func_params)
     result, expected = outcome
     tm.assert_frame_equal(result, expected, check_names=False)
Пример #26
0
 def test_select_dtypes_typecodes(self):
     """``select_dtypes`` given numpy float typecodes keeps every float column."""
     # GH 11990
     float_codes = list(np.typecodes["AllFloat"])
     frame = tm.makeCustomDataframe(
         30,
         3,
         data_gen_f=lambda x, y: np.random.random(),
     )
     selected = frame.select_dtypes(float_codes)
     tm.assert_frame_equal(selected, frame)