Пример #1
0
 def assert_eq(df, df_expected=None, raw=False):
     """Check that ``df`` survives a serialize/deserialize round trip.

     :param df: dataframe passed through ``serialize_df`` and then
         ``deserialize_df``
     :param df_expected: dataframe the round-tripped result is compared
         with; when None, ``df`` itself is used
     :param raw: when true, the ``native`` attributes of both dataframes
         are compared with ``==``; otherwise ``df_eq`` is called with
         ``throw=True``
     """
     expected = df if df_expected is None else df_expected
     round_tripped = deserialize_df(serialize_df(df))
     if not raw:
         df_eq(expected, round_tripped, throw=True)
         return
     # Raw mode compares the ``native`` attributes directly.
     assert expected.native == round_tripped.native
Пример #2
0
def test_serialize_df(tmpdir):
    """Exercise ``serialize_df``/``deserialize_df`` round trips.

    Checks in-memory round trips for array, iterable and pandas dataframes,
    the ``InvalidOperationError`` expected when ``0`` is passed as the second
    argument without a path, round trips that are given a file path under
    ``tmpdir`` (with and without an explicit ``FileSystem``), and the
    ``ValueError`` expected for the payload ``'{"x":1}'``.
    """

    def check_round_trip(source, expected=None, raw=False):
        # Fall back to comparing against the input itself.
        target = source if expected is None else expected
        restored = deserialize_df(serialize_df(source))
        if not raw:
            df_eq(target, restored, throw=True)
            return
        # Raw mode compares the ``native`` attributes directly.
        assert target.native == restored.native

    def nested_rows():
        # Build fresh row data on every call so that no two dataframes
        # share the same list objects.
        return [[None, [1, 2], dict(x=1)]]

    def serialize_without_path():
        return serialize_df(ArrayDataFrame([], "a:int,b:int"), 0)

    flat_schema = "a:int,b:int"
    mixed_schema = "a:int,b:str"
    nested_schema = "a:int,b:[int],c:{x:int}"

    fs = FileSystem()
    assert deserialize_df(serialize_df(None)) is None

    # Plain in-memory round trips on array-backed dataframes.
    for rows, schema in (
        ([], flat_schema),
        ([[None, None]], flat_schema),
        ([[None, "abc"]], mixed_schema),
    ):
        check_round_trip(ArrayDataFrame(rows, schema))

    # Nested list/dict columns are compared through ``native``.
    check_round_trip(ArrayDataFrame(nested_rows(), nested_schema), raw=True)
    check_round_trip(
        IterableDataFrame(nested_rows(), nested_schema),
        ArrayDataFrame(nested_rows(), nested_schema),
        raw=True,
    )

    for rows, schema in (
        ([[None, None]], flat_schema),
        ([[None, "abc"]], mixed_schema),
    ):
        check_round_trip(PandasDataFrame(rows, schema))

    # Passing 0 as the second argument without a path must be rejected.
    raises(InvalidOperationError, serialize_without_path)

    pickle_path = os.path.join(tmpdir, "1.pkl")
    original = ArrayDataFrame([[None, None]], flat_schema)

    # Serialize with an explicit file system, then read back both ways.
    blob = serialize_df(original, 0, pickle_path, fs)
    df_eq(original, deserialize_df(blob, fs), throw=True)
    df_eq(original, deserialize_df(blob), throw=True)

    # Same path, this time without passing a file system.
    blob = serialize_df(original, 0, pickle_path)
    df_eq(original, deserialize_df(blob), throw=True)

    raises(ValueError, lambda: deserialize_df('{"x":1}'))
Пример #3
0
 def run(self, cursor: PartitionCursor,
         df: LocalDataFrame) -> LocalDataFrame:
     """Serialize ``df`` and return it as a single-row dataframe.

     The row is ``cursor.key_value_array`` followed by the value returned
     from ``serialize_df(df, self.to_file_threshold, self.temp_path)``,
     and the result is built with ``self.output_schema``.

     :param cursor: partition cursor supplying ``key_value_array``
     :param df: dataframe to serialize
     :return: an ``ArrayDataFrame`` holding exactly one row
     """
     serialized = serialize_df(
         df,
         self.to_file_threshold,
         self.temp_path,
     )
     return ArrayDataFrame(
         [cursor.key_value_array + [serialized]],
         self.output_schema,
     )