Пример #1
0
    def compare(self, current_data, all_data, vf, version):
        """Read the msgpack file ``vf`` and compare it against ``current_data``.

        Every top-level key of the unpacked data must be present in
        ``all_data`` and every second-level key must be present in
        ``current_data[typ]``.  Each value is compared with a dedicated
        ``compare_<typ>_<dt>`` method of this object when one exists,
        otherwise with ``check_arbitrary``.

        Returns the unpacked data.
        """
        # GH12277 encoding default used to be latin-1, now utf-8
        read_kwargs = {}
        if LooseVersion(version) < "0.18.0":
            read_kwargs["encoding"] = "latin-1"
        data = read_msgpack(vf, **read_kwargs)

        self.check_min_structure(data, version)

        for typ, dv in data.items():
            assert typ in all_data, 'unpacked data contains extra key "{0}"'.format(
                typ
            )
            for dt, result in dv.items():
                assert (
                    dt in current_data[typ]
                ), 'data["{0}"] contains extra key "{1}"'.format(typ, dt)
                try:
                    expected = current_data[typ][dt]
                except KeyError:
                    continue

                # Prefer a type-specific comparator when this object defines
                # one; fall back to the generic check otherwise.
                specific = getattr(
                    self, "compare_{typ}_{dt}".format(typ=typ, dt=dt), None
                )
                if specific is None:
                    check_arbitrary(result, expected)
                else:
                    specific(result, expected, typ, version)

        return data
Пример #2
0
    def test_string_io(self):
        """Round-trip a random DataFrame through several msgpack entry points.

        Covers, in order: ``df.to_msgpack(None)``, ``df.to_msgpack()``,
        reading from a ``BytesIO`` wrapper around the packed value, the
        module-level ``to_msgpack(None, df)``, and finally reading back from
        a file written to disk.
        """
        df = DataFrame(np.random.randn(10, 2))
        s = df.to_msgpack(None)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        s = df.to_msgpack()
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        s = df.to_msgpack()
        result = read_msgpack(BytesIO(s))
        tm.assert_frame_equal(result, df)

        s = to_msgpack(None, df)
        result = read_msgpack(s)
        tm.assert_frame_equal(result, df)

        with ensure_clean(self.path) as p:

            s = df.to_msgpack()
            # Use a context manager so the handle is closed even when the
            # write fails; the file must be closed before it is read back.
            with open(p, "wb") as fh:
                fh.write(s)
            result = read_msgpack(p)
            tm.assert_frame_equal(result, df)
Пример #3
0
    def get_dataset(self, name, apply_exclusion=False):
        """Retrieve a dataset as a DataFrame.

        ``name`` is first passed through ``self.dataset_exists`` and the value
        it returns is used for all further lookups.  How the data is loaded
        depends on ``self.data_format``:

        * ``"msg_pack"``: the archive member is read with
          ``mbf_pandas_msgpack.read_msgpack``.
        * ``"parquet"``: members ``<name>/0``, ``<name>/1``, ... are loaded and
          concatenated; if none exist, ``<name>`` itself is loaded.  Columns
          that were categorical in any part are converted back to categorical
          after concatenation.

        If ``apply_exclusion`` is true, ``self.apply_exclusion`` is applied to
        the result; when that raises ``CantApplyExclusion`` the unfiltered
        frame is returned instead.

        Raises:
            ImportError: ``mbf_pandas_msgpack`` is needed but cannot be
                imported.
            ValueError: the data format is unknown, neither ``pyarrow`` nor
                ``fastparquet`` is importable, or the msgpack reader failed
                with the "category" KeyError that indicates an old pandas.
            KeyError: any other KeyError raised by the msgpack reader.
        """
        name = self.dataset_exists(name)
        if self.data_format == "msg_pack":
            try:
                import mbf_pandas_msgpack
            except (ImportError, AttributeError) as err:
                raise ImportError(
                    "Please install mbf-pandas-msgpack to read this old school biobank file"
                ) from err
            with self.zf.open(name) as op:
                try:
                    df = mbf_pandas_msgpack.read_msgpack(op.read())
                except KeyError as e:
                    if "KeyError: u'category'" in str(e):
                        raise ValueError(
                            "Your pandas is too old. You need at least version 0.18"
                        ) from e
                    # Any other KeyError used to be swallowed here, leaving
                    # `df` unbound and failing later with an unrelated
                    # UnboundLocalError.  Surface the real error instead.
                    raise
        elif self.data_format == "parquet":
            # The imports are only an availability probe for a parquet engine.
            try:
                import pyarrow  # noqa: F401
            except ImportError:
                try:
                    import fastparquet  # noqa: F401
                except ImportError as err:
                    raise ValueError(
                        "marburg_biobank needs either pyarrow or fastparquet"
                    ) from err

            # Set membership keeps the probing loop below linear in the
            # number of parts.
            members = set(self.zf.namelist())
            ii = 0
            dfs = []
            sub_name = name + "/" + str(ii)
            while sub_name in members:
                dfs.append(self.__load_df_from_parquet(sub_name))
                ii += 1
                sub_name = name + "/" + str(ii)
            if not dfs:  # not actually a unit splitted dataframe - meta?
                df = self.__load_df_from_parquet(name)
            elif len(dfs) == 1:
                df = dfs[0]
            else:
                # Collect every column that is categorical in at least one
                # part, then restore the categorical dtype on the combined
                # frame.
                categoricals = set()
                for part in dfs:
                    for c, dt in part.dtypes.items():
                        if dt.name == "category":
                            categoricals.add(c)
                df = pd.concat(dfs)
                reps = {c: pd.Categorical(df[c]) for c in categoricals}
                if reps:
                    df = df.assign(**reps)
        else:
            raise ValueError(
                "Unexpected data format. Do you need to upgrade marburg_biobank?"
            )
        if apply_exclusion:
            try:
                df = self.apply_exclusion(name, df)
            except CantApplyExclusion:
                # Best effort: fall through and return the unfiltered frame.
                pass
        return df
Пример #4
0
    def test_iterator(self):
        """Pack several objects into one file and read them back one by one.

        The objects written are the ``"float"`` frame, its ``A`` and ``B``
        attributes, and ``None``; each item yielded by the iterator is
        checked against the object written at the same position.
        """
        self.setUp()
        originals = [
            self.frame["float"],
            self.frame["float"].A,
            self.frame["float"].B,
            None,
        ]

        with ensure_clean(self.path) as path:
            to_msgpack(path, *originals)
            unpacked_stream = read_msgpack(path, iterator=True)
            for position, packed in enumerate(unpacked_stream):
                check_arbitrary(packed, originals[position])
Пример #5
0
def pd_read(fn, index_column=None, **kwargs):
    """Read a table into a DataFrame, picking the reader from the extension.

    The reader is chosen from the extension of ``fn``.  For a name ending in
    ``.zip`` the extension in front of it is used instead (``data.csv.zip``
    is read as csv); ``fn`` itself is passed to the reader unchanged.

    * ``.msgpack`` is read with ``mbf_pandas_msgpack.read_msgpack``; note
      that ``kwargs`` are not forwarded to it.
    * Any other extension ``.xyz`` is dispatched to ``pandas.read_xyz`` with
      ``kwargs`` forwarded; an unknown extension raises ``AttributeError``
      from that lookup.

    Args:
        fn: ``str`` or ``os.PathLike`` path of the file to read.
        index_column: if truthy, passed to ``DataFrame.set_index`` on the
            result.
        **kwargs: forwarded to the pandas reader.

    Returns:
        The loaded DataFrame.
    """
    # Take the root from splitext instead of slicing `fn[:-4]`, so that
    # pathlib.Path inputs work for zipped files as well.
    root, ext = os.path.splitext(fn)
    if ext == ".zip":
        ext = os.path.splitext(root)[-1]
    if ext == ".msgpack":
        from mbf_pandas_msgpack import read_msgpack

        df = read_msgpack(fn)
    else:
        reader = getattr(pd, f"read_{ext[1:]}")
        df = reader(fn, **kwargs)
    if index_column:
        df = df.set_index(index_column)
    return df
Пример #6
0
    def test_invalid_arg(self):
        """``read_msgpack`` must raise ValueError for unusable ``path_or_buf``.

        Checked inputs: ``None``, an empty dict, and an object whose ``read``
        attribute is an integer rather than a method.
        """
        # GH10369
        class A(object):
            def __init__(self):
                self.read = 0

        for bad_input in (None, {}, A()):
            with pytest.raises(ValueError):
                read_msgpack(path_or_buf=bad_input)
Пример #7
0
 def test_1_3(self):
     """Read the stored pandas 1.3.0 sample and compare it with ``supposed``."""
     sample_path = "samples/sample_pandas_1.3.0.msgpack"
     loaded = mbf_pandas_msgpack.read_msgpack(sample_path)
     assert_frame_equal(loaded, supposed)
Пример #8
0
    def test_iterator_with_string_io(self):
        """Pack five random frames in memory and iterate over them on read.

        Each frame yielded by ``read_msgpack(..., iterator=True)`` must equal
        the frame packed at the same position.
        """
        dfs = [DataFrame(np.random.randn(10, 2)) for _ in range(5)]
        packed = to_msgpack(None, *dfs)
        unpacked_stream = read_msgpack(packed, iterator=True)
        for position, result in enumerate(unpacked_stream):
            tm.assert_frame_equal(result, dfs[position])
Пример #9
0
 def encode_decode(self, x, compress=None, **kwargs):
     """Write ``x`` with ``to_msgpack`` to a temporary path and read it back.

     ``compress`` is passed to the writer only; ``kwargs`` are passed to both
     the writer and the reader.  Returns whatever ``read_msgpack`` returns.
     """
     with ensure_clean(self.path) as tmp_path:
         to_msgpack(tmp_path, x, compress=compress, **kwargs)
         decoded = read_msgpack(tmp_path, **kwargs)
         return decoded