Example #1
def test_csv_io(tmpdir):
    fs = FileSystem()
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.csv")
    # without header
    save_df(df1, path)
    assert fs.readtext(path).startswith("1,2,3")
    raises(InvalidOperationError, lambda: load_df(path, header=False))
    actual = load_df(path,
                     columns=["a", "b", "c"],
                     header=False,
                     infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    assert actual.schema == "a:long,b:long,c:long"
    actual = load_df(path, columns="a:double,b:str,c:str", header=False)
    assert [[1.0, "2", "3"]] == actual.as_array()
    assert actual.schema == "a:double,b:str,c:str"
    # with header
    save_df(df1, path, header=True)
    assert fs.readtext(path).startswith("a,b,c")
    actual = load_df(path, header=True)
    assert [["1", "2", "3"]] == actual.as_array()
    actual = load_df(path, header=True, infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    actual = load_df(path, columns=["b", "a"], header=True, infer_schema=True)
    assert [[2, 1]] == actual.as_array()
    actual = load_df(path, columns="b:str,a:double", header=True)
    assert [["2", 1.0]] == actual.as_array()
    raises(KeyError,
           lambda: load_df(path, columns="b:str,x:double", header=True))

    raises(NotImplementedError,
           lambda: load_df(path, columns="b:str,x:double", header=2))
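These tests reference several names without showing their imports. Below is a minimal import sketch that would make the test examples here self-contained; the module paths are assumptions inferred from the fugue/triad ecosystem, not confirmed against the original files:

import os

from pytest import raises
from triad.collections.fs import FileSystem  # assumed location
from triad.exceptions import InvalidOperationError  # assumed location
from fugue.dataframe import ArrayDataFrame, PandasDataFrame
from fugue.dataframe.utils import _df_eq as df_eq  # assumed alias for df_eq
from fugue.exceptions import FugueDataFrameOperationError  # used in Example #8
from fugue._utils.io import load_df, save_df  # assumed home of the io helpers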
Example #2
def test_json(tmpdir):
    fs = FileSystem()
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.json")
    save_df(df1, path)
    actual = load_df(path)
    df_eq(actual, [[1, 2, 3]], "a:long,b:long,c:long")
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))
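Both tests lean on the same convention: the columns argument of load_df does double duty. A compact restatement of the two call shapes exercised above (path stands for any supported file):

# columns as a list: select/reorder only; types come from the file or infer_schema
load_df(path, columns=["b", "a"])
# columns as a schema string: select, reorder, and cast in one step
load_df(path, columns="b:str,a:double")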
Example #3
def save_df(
    self,
    df: DataFrame,
    path: str,
    format_hint: Any = None,
    mode: str = "overwrite",
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    force_single: bool = False,
    **kwargs: Any,
) -> None:
    if not partition_spec.empty:
        self.log.warning(  # pragma: no cover
            f"partition_spec is not respected in {self}.save_df"
        )
    df = self.to_df(df).as_local()
    save_df(df, path, format_hint=format_hint, mode=mode, fs=self.fs, **kwargs)
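This engine-level method ignores any partition spec (with a warning), converts the input to a local dataframe, and delegates to the module-level save_df helper exercised in the tests above. Judging only from the call sites in these examples, that helper plausibly looks like the sketch below; this is an inference, not the confirmed definition:

# Plausible signature of the shared io helper, inferred from the call sites
# in Examples #1-#3; treat as a sketch, not the real definition.
def save_df(df, uri, format_hint=None, mode="overwrite", fs=None, **kwargs):
    ...  # pick a writer from the extension or format_hint, honor mode, write via fs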
Example #4
def save_df(
    self,
    df: SparkDataFrame,
    uri: str,
    format_hint: Optional[str] = None,
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    mode: str = "overwrite",
    force_single: bool = False,
    **kwargs: Any,
) -> None:
    if not force_single:
        # distributed write through Spark's DataFrameWriter
        p = FileParser(uri, format_hint)
        writer = self._get_writer(df.native, partition_spec)
        writer.format(p.file_format).options(**kwargs).mode(mode)
        writer.save(uri)
    else:
        # collect to the driver and write a single local file
        ldf = df.as_local()
        save_df(ldf, uri, format_hint=format_hint, mode=mode, fs=self._fs, **kwargs)
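The Spark variant takes two paths: by default it writes a distributed dataset through Spark's DataFrameWriter (typically a directory of part files), and only with force_single does it collect to a local dataframe and reuse the shared helper. A hypothetical call showing the difference (engine and sdf are assumed names):

# a directory of part files, written in parallel by Spark
engine.save_df(sdf, "/tmp/out.parquet")
# a single local file, collected to the driver first -- costly for large data
engine.save_df(sdf, "/tmp/out.parquet", force_single=True)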
Example #5
def save_df(
    self,
    df: DataFrame,
    path: str,
    format_hint: Any = None,
    mode: str = "overwrite",
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    force_single: bool = False,
    **kwargs: Any,
) -> None:
    if not partition_spec.empty:
        self.log.warning(  # pragma: no cover
            "partition_spec is not respected in %s.save_df", self)
    self.fs.makedirs(os.path.dirname(path), recreate=True)
    df = self.to_df(df)
    save_df(df, path, format_hint=format_hint, mode=mode, fs=self.fs, **kwargs)
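Compared with Example #3, this variant also creates the parent directory before writing. With a pyfilesystem2-style filesystem, which self.fs appears to be given the call above, recreate=True makes that call idempotent:

# pyfilesystem2 semantics: recreate=True means "do not fail if it already exists"
fs.makedirs("/data/out/nested", recreate=True)  # hypothetical path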
Example #6
def test_parquet_io(tmpdir):
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = ArrayDataFrame([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because of a pyarrow issue
    df3 = ArrayDataFrame([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        path = os.path.join(tmpdir, "a.parquet")
        save_df(df, path)
        actual = load_df(path)
        df_eq(df, actual, throw=True)

    save_df(df1, path)
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    # can't specify wrong columns
    raises(Exception, lambda: load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    for name in ["folder.parquet", "folder"]:
        folder = os.path.join(tmpdir, name)
        fs.makedirs(folder)
        f0 = os.path.join(folder, "_SUCCESS")
        f1 = os.path.join(folder, "1.parquet")
        f2 = os.path.join(folder, "3.parquet")
        fs.touch(f0)
        save_df(df1, f1)
        save_df(df1, f2)

    actual = load_df(folder, "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load multiple paths
    actual = load_df([f1, f2], "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # a directory whose name ends with .parquet can be loaded without a format hint
    actual = load_df(os.path.join(tmpdir, "folder.parquet"))
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load pattern
    actual = load_df(os.path.join(tmpdir, "folder", "*.parquet"))
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # overwrite folder with single file
    save_df(actual, os.path.join(tmpdir, "folder.parquet"), mode="overwrite")
    actual = load_df(os.path.join(tmpdir, "folder.parquet"))
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # mode="error": refuse to overwrite an existing file or directory
    raises(FileExistsError, lambda: save_df(df1, f1, mode="error"))
    raises(
        FileExistsError,
        lambda: save_df(
            df1, os.path.join(tmpdir, "folder.parquet"), mode="error"),
    )

    # wrong mode
    raises(NotImplementedError, lambda: save_df(df1, f1, mode="dummy"))
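The tail of this test pins down the mode contract that the engine methods above rely on. Restated as calls, with the behavior asserted by the test in comments:

save_df(df1, f1, mode="overwrite")  # the default: replace whatever is there
# save_df(df1, f1, mode="error")    # raises FileExistsError if the target exists
# save_df(df1, f1, mode="dummy")    # any other string raises NotImplementedError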
Example #7
def test_avro_io(tmpdir):
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.avro")
    save_df(df1, path)
    actual = load_df(path)

    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
    actual = load_df(path, columns=["a", "b"])
    df_eq(actual, [["1", 2]], "a:str,b:long")

    actual = load_df(path, columns="a:str,b:int,c:long")
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")

    actual = load_df(path, columns=["b", "c"], infer_schema=True)
    df_eq(actual, [[2, 3]], "b:long,c:long")

    # provide schema and columns -> throw error
    raises(
        Exception,
        lambda: load_df(
            path,
            columns="a:str,b:int,c:long",
            schema={
                "type": "record",
                "name": "Root",
                "fields": [
                    {"name": "station", "type": "string"},
                    {"name": "time", "type": "long"},
                    {"name": "temp", "type": "int"},
                ],
            },
        ),
    )

    # provide schema and infer_schema is True -> throw error
    raises(
        Exception,
        lambda: load_df(
            path,
            columns=None,
            schema={
                "type": "record",
                "name": "Root",
                "fields": [
                    {"name": "station", "type": "string"},
                    {"name": "time", "type": "long"},
                    {"name": "temp", "type": "int"},
                ],
            },
            infer_schema=True,
        ),
    )
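Both negative cases pass a fastavro-style record schema; per the comments, an explicit Avro schema is not expected to combine with either a target columns schema or infer_schema=True. Restated compactly (avro_schema standing in for the dict above):

raises(Exception, lambda: load_df(path, columns="a:str,b:int,c:long", schema=avro_schema))
raises(Exception, lambda: load_df(path, schema=avro_schema, infer_schema=True))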
Example #8
def test_avro_io(tmpdir):
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = PandasDataFrame([["hello", 2, 3]], "a:str,b:int,c:long")
    path1 = os.path.join(tmpdir, "df1.avro")
    path2 = os.path.join(tmpdir, "df2.avro")
    save_df(df1, path1)
    actual = load_df(path1)

    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
    actual = load_df(path1, columns=["a", "b"])
    df_eq(actual, [["1", 2]], "a:str,b:long")

    actual = load_df(path1, columns="a:str,b:int,c:long")
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")

    actual = load_df(path1, columns="a:str,b:int,c:long",
                     infer_schema=True)  # TODO raise error when both provided?
    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")

    actual = load_df(path1, columns=["b", "c"], infer_schema=True)
    df_eq(actual, [[2, 3]], "b:long,c:long")

    # save in append mode
    path3 = os.path.join(tmpdir, "append.avro")
    save_df(df1, path3)
    save_df(df2, path3, append=True)
    actual = load_df(path3, columns="a:str,b:int,c:long")
    df_eq(actual, [['1', 2, 3], ['hello', 2, 3]], "a:str,b:int,c:long")

    # save with times_as_micros=False (i.e. timestamps in milliseconds instead of microseconds)
    df4 = PandasDataFrame([["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
    path4 = os.path.join(tmpdir, "df4.avro")
    save_df(df4, path4)
    actual = load_df(path4, columns="a:datetime,b:int,c:long")
    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
    save_df(df4, path4, times_as_micros=False)
    actual = load_df(path4, columns="a:datetime,b:int,c:long")
    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")

    # provide avro schema
    schema = {
        'type': 'record',
        'name': 'Root',
        'fields': [
            {'name': 'a', 'type': 'string'},
            {'name': 'b', 'type': 'int'},
            {'name': 'c', 'type': 'long'},
        ],
    }
    save_df(df1, path1, schema=schema)
    actual = load_df(path1, columns="a:str,b:int,c:long")
    df_eq(actual, [['1', 2, 3]], "a:str,b:int,c:long")

    # provide wrong types in columns arg
    save_df(df2, path2, schema=schema)
    raises(
        FugueDataFrameOperationError,
        lambda: load_df(path2, columns="a:int,b:int,c:long"),
    )

    # load with process_record function
    actual = load_df(
        path2,
        columns="a:str,b:int,c:long",
        process_record=lambda s: {'a': s['a'].upper(), 'b': s['b'], 'c': s['c']},
    )
    df_eq(actual, [['HELLO', 2, 3]], "a:str,b:int,c:long")

    # provide wrong type in avro schema
    schema = {
        'type': 'record',
        'name': 'Root',
        'fields': [
            {'name': 'a', 'type': 'int'},
            {'name': 'b', 'type': 'int'},
            {'name': 'c', 'type': 'long'},
        ],
    }
    raises(TypeError, lambda: save_df(df2, path2, schema=schema))
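The final case shows the Avro schema being enforced at write time: declaring field a as "int" while df2 holds the string "hello" fails inside the writer. A minimal repro under the same assumptions as the test above:

bad_schema = {
    'type': 'record',
    'name': 'Root',
    'fields': [{'name': 'a', 'type': 'int'}],
}
# df2's column a holds "hello"; writing it under an int field raises TypeError
raises(TypeError, lambda: save_df(df2, path2, schema=bad_schema))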