예제 #1
0
파일: test_io.py 프로젝트: WangCHX/fugue
def test_parquet_io(tmpdir):
    """Exercise parquet save/load through ``save_df`` and ``load_df``.

    Covers, in order: round trips for flat, list and struct columns; column
    selection and casting on load; loading a whole directory and a list of
    paths; and the error paths for ``mode="error"``, overwriting a
    directory, and an unknown mode.

    :param tmpdir: base directory under which all test files are written
    """
    # The same file is reused (overwritten) by every round trip below.
    path = os.path.join(tmpdir, "a.parquet")

    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = ArrayDataFrame([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because pyarrow lib has issue
    df3 = ArrayDataFrame([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        save_df(df, path)
        actual = load_df(path)
        df_eq(df, actual, throw=True)

    # NOTE(review): every comparison below now passes throw=True, like the
    # round-trip check above. Without it the result of df_eq was discarded,
    # so a mismatch presumably went unnoticed -- confirm against df_eq.
    save_df(df1, path)
    # columns given as a list: select and reorder
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str", throw=True)
    # columns given as a schema expression: select, reorder and cast
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int", throw=True)
    # can't specify wrong columns
    raises(Exception, lambda: load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    folder = os.path.join(tmpdir, "folder")
    fs.makedirs(folder)
    f0 = os.path.join(folder, "_SUCCESS")
    f1 = os.path.join(folder, "1.parquet")
    f2 = os.path.join(folder, "3.parquet")
    # an empty non-parquet marker file sits next to the data files
    fs.touch(f0)
    save_df(df1, f1)
    save_df(df1, f2)
    expected_rows = [["1", 2, 3], ["1", 2, 3]]
    actual = load_df(folder, "parquet")
    df_eq(actual, expected_rows, "a:str,b:int,c:long", throw=True)

    # load multiple paths
    actual = load_df([f1, f2], "parquet")
    df_eq(actual, expected_rows, "a:str,b:int,c:long", throw=True)

    # overwrite = False
    raises(FileExistsError, lambda: save_df(df1, f1, mode="error"))
    # can't overwrite directory
    raises(
        IsADirectoryError,
        lambda: save_df(df1, folder, format_hint="parquet", mode="overwrite"),
    )
    # wrong mode
    raises(NotImplementedError, lambda: save_df(df1, f1, mode="dummy"))
예제 #2
0
def test_parquet_io(tmpdir, spark_session):
    """Exercise parquet save/load through ``SparkIO``.

    Covers, in order: round trips for flat, list and struct columns; column
    selection and casting on load; loading a whole directory and a list of
    paths (with and without column casting); and the error paths for
    ``mode="error"`` and an unknown mode.

    :param tmpdir: base directory under which all test files are written
    :param spark_session: session handed to ``SparkIO``
    """
    si = SparkIO(spark_session, FileSystem())
    # The same path is reused (overwritten) by every round trip below.
    path = os.path.join(tmpdir, "a.parquet")

    df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = _df([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because pyarrow lib has issue
    df3 = _df([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        si.save_df(df, path)
        actual = si.load_df(path)
        df_eq(df, actual, throw=True)

    # NOTE(review): every comparison below now passes throw=True, like the
    # round-trip check above. Without it the result of df_eq was discarded,
    # so a mismatch presumably went unnoticed -- confirm against df_eq.
    si.save_df(df1, path)
    # columns given as a list: select and reorder
    actual = si.load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str", throw=True)
    # columns given as a schema expression: select, reorder and cast
    actual = si.load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int", throw=True)
    # can't specify wrong columns
    raises(Exception, lambda: si.load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    folder = os.path.join(tmpdir, "folder")
    fs.makedirs(folder)
    f0 = os.path.join(folder, "_SUCCESS")
    f1 = os.path.join(folder, "1.parquet")
    f2 = os.path.join(folder, "3.parquet")
    # an empty non-parquet marker file sits next to the data files
    fs.touch(f0)
    si.save_df(df1, f1, force_single=True)
    si.save_df(df1, f2, force_single=True)
    # with force_single=True the output is expected to be a single file
    assert fs.isfile(f1)
    expected_rows = [["1", 2, 3], ["1", 2, 3]]
    actual = si.load_df(folder, "parquet")
    df_eq(actual, expected_rows, "a:str,b:int,c:long", throw=True)

    # load multiple paths
    actual = si.load_df([f1, f2], "parquet")
    df_eq(actual, expected_rows, "a:str,b:int,c:long", throw=True)
    # Only b and a are requested, both cast to str, so the expected schema
    # must be "b:str,a:str" (it was previously the full three-column
    # schema, which cannot describe the two-column expected rows).
    actual = si.load_df([f1, f2], "parquet", columns="b:str,a:str")
    df_eq(actual, [["2", "1"], ["2", "1"]], "b:str,a:str", throw=True)

    # overwrite = False
    raises(
        (FileExistsError, AnalysisException),
        lambda: si.save_df(df1, f1, mode="error"),
    )
    # wrong mode
    raises(Exception, lambda: si.save_df(df1, f1, mode="dummy"))