Example #1
def test_csv_io(tmpdir):
    fs = FileSystem()
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.csv")
    # without header
    save_df(df1, path)
    assert fs.readtext(path).startswith("1,2,3")
    raises(InvalidOperationError, lambda: load_df(path, header=False))
    actual = load_df(path,
                     columns=["a", "b", "c"],
                     header=False,
                     infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    assert actual.schema == "a:long,b:long,c:long"
    actual = load_df(path, columns="a:double,b:str,c:str", header=False)
    assert [[1.0, "2", "3"]] == actual.as_array()
    assert actual.schema == "a:double,b:str,c:str"
    # with header
    save_df(df1, path, header=True)
    assert fs.readtext(path).startswith("a,b,c")
    actual = load_df(path, header=True)
    assert [["1", "2", "3"]] == actual.as_array()
    actual = load_df(path, header=True, infer_schema=True)
    assert [[1, 2, 3]] == actual.as_array()
    actual = load_df(path, columns=["b", "a"], header=True, infer_schema=True)
    assert [[2, 1]] == actual.as_array()
    actual = load_df(path, columns="b:str,a:double", header=True)
    assert [["2", 1.0]] == actual.as_array()
    raises(KeyError,
           lambda: load_df(path, columns="b:str,x:double", header=True))

    raises(NotImplementedError,
           lambda: load_df(path, columns="b:str,x:double", header=2))
Example #2
def _safe_load_json(path: str, **kwargs: Any) -> pd.DataFrame:
    kw = {"orient": "records", "lines": True, **kwargs}
    try:
        return pd.read_json(path, **kw)
    except (IsADirectoryError, PermissionError):
        fs = FileSystem()
        return pd.concat([
            pd.read_json(pfs.path.join(path, os.path.basename(x.path)), **kw)
            for x in fs.opendir(path).glob("*.json")
        ])
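The fallback above relies on the library's FileSystem abstraction to enumerate *.json files when the path turns out to be a directory. A minimal sketch of the same file-or-directory pattern using only pandas and the standard library (the helper name read_json_any is made up for illustration):

import glob
import os
from typing import Any

import pandas as pd


def read_json_any(path: str, **kwargs: Any) -> pd.DataFrame:
    # Default to line-delimited JSON records, as in the example above.
    kw = {"orient": "records", "lines": True, **kwargs}
    if os.path.isdir(path):
        # Concatenate every *.json file directly under the directory.
        files = sorted(glob.glob(os.path.join(path, "*.json")))
        return pd.concat([pd.read_json(f, **kw) for f in files], ignore_index=True)
    return pd.read_json(path, **kw)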
Example #3
 def __init__(self, conf: Any = None):
     p = ParamDict(FUGUE_DASK_DEFAULT_CONF)
     p.update(ParamDict(conf))
     super().__init__(p)
     self._fs = FileSystem()
     self._log = logging.getLogger()
     self._default_sql_engine = QPDDaskEngine(self)
Example #4
 def test_load_csv_folder(self):
     e = self.engine
     native = NativeExecutionEngine()
     a = ArrayDataFrame([[6.1, 1.1]], "c:double,a:double")
     b = ArrayDataFrame([[2.1, 7.1], [4.1, 8.1]], "c:double,a:double")
     path = os.path.join(self.tmpdir, "a", "b")
     native.save_df(a,
                    os.path.join(path, "a.csv"),
                    format_hint="csv",
                    header=True)
     native.save_df(b,
                    os.path.join(path, "b.csv"),
                    format_hint="csv",
                    header=True)
     FileSystem().touch(os.path.join(path, "_SUCCESS"))
     c = e.load_df(
         path,
         format_hint="csv",
         header=True,
         infer_schema=True,
         columns=["a", "c"],
     )
     df_eq(c, [[1.1, 6.1], [7.1, 2.1], [8.1, 4.1]],
           "a:double,c:double",
           throw=True)
Example #5
def save_df(
    df: DaskDataFrame,
    uri: str,
    format_hint: Optional[str] = None,
    mode: str = "overwrite",
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> None:
    assert_or_throw(
        mode in ["overwrite", "error"],
        lambda: NotImplementedError(f"{mode} is not supported"),
    )
    p = FileParser(uri, format_hint).assert_no_glob()
    if fs is None:
        fs = FileSystem()
    if fs.exists(uri):
        assert_or_throw(mode == "overwrite", FileExistsError(uri))
        try:
            fs.remove(uri)
        except Exception:
            try:
                fs.removetree(uri)
            except Exception:  # pragma: no cover
                pass
    _FORMAT_SAVE[p.file_format](df, p, **kwargs)
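The mode handling above fails fast on "error" and replaces existing output on "overwrite". A rough equivalent for plain local paths using only the standard library, without the FileSystem abstraction (the helper name prepare_output is hypothetical):

import os
import shutil


def prepare_output(path: str, mode: str = "overwrite") -> None:
    if mode not in ("overwrite", "error"):
        raise NotImplementedError(f"{mode} is not supported")
    if os.path.exists(path):
        if mode == "error":
            raise FileExistsError(path)
        # "overwrite": remove the existing file or directory tree first.
        if os.path.isdir(path):
            shutil.rmtree(path)
        else:
            os.remove(path)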
Example #6
def test_csv_io(tmpdir, spark_session):
    fs = FileSystem()
    si = SparkIO(spark_session, fs)
    df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.csv")
    # without header
    si.save_df(df1, path)
    raises(InvalidOperationError, lambda: si.load_df(path, header=False))
    actual = si.load_df(path, columns=["a", "b", "c"], header=False)
    assert [["1", "2", "3"]] == actual.as_array()
    assert actual.schema == "a:str,b:str,c:str"
    actual = si.load_df(path, columns="a:double,b:str,c:str", header=False)
    assert [[1.0, "2", "3"]] == actual.as_array()
    assert actual.schema == "a:double,b:str,c:str"
    # with header
    si.save_df(df1, path, header=True)
    actual = si.load_df(path, header=True)
    assert [["1", "2", "3"]] == actual.as_array()
    actual = si.load_df(path, columns=["b", "a"], header=True)
    assert [["2", "1"]] == actual.as_array()
    actual = si.load_df(path, columns="b:str,a:double", header=True)
    assert [["2", 1.0]] == actual.as_array()
    raises(Exception,
           lambda: si.load_df(path, columns="b:str,x:double", header=True))

    raises(NotImplementedError,
           lambda: si.load_df(path, columns="b:str,x:double", header=2))
Example #7
 def __init__(self, conf: Any = None):
     p = ParamDict(FUGUE_DASK_DEFAULT_CONF)
     p.update(ParamDict(conf))
     super().__init__(p)
     self._fs = FileSystem()
     self._log = logging.getLogger()
     self._native = NativeExecutionEngine(conf=conf)
Example #8
 def test_load_parquet_folder(self):
     e = self.engine
     native = NativeExecutionEngine()
     a = ArrayDataFrame([[6, 1]], "c:int,a:long")
     b = ArrayDataFrame([[2, 7], [4, 8]], "c:int,a:long")
     path = os.path.join(self.tmpdir, "a", "b")
     native.save_df(a, os.path.join(path, "a.parquet"))
     native.save_df(b, os.path.join(path, "b.parquet"))
     FileSystem().touch(os.path.join(path, "_SUCCESS"))
     c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
     df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True)
Example #9
 def test_load_avro_folder(self):
     # TODO: switch to c:int,a:long when we can preserve schema to avro
     e = self.engine
     native = NativeExecutionEngine()
     a = ArrayDataFrame([[6, 1]], "c:long,a:long")
     b = ArrayDataFrame([[2, 7], [4, 8]], "c:long,a:long")
     path = os.path.join(self.tmpdir, "a", "b")
     native.save_df(a, os.path.join(path, "a.avro"))
     native.save_df(b, os.path.join(path, "b.avro"))
     FileSystem().touch(os.path.join(path, "_SUCCESS"))
     c = e.load_df(path, format_hint="avro", columns=["a", "c"])
     df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:long", throw=True)
Example #10
 def _compute(
     self,
     df: Iterable[Dict[str, Any]],
     entrypoint: Callable[[str, Dict[str, Any]], Any],
 ) -> Iterable[Dict[str, Any]]:
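     # makedirs(..., recreate=True) returns a sub-filesystem rooted at the checkpoint directory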
     ck_fs = FileSystem().makedirs(self._checkpoint_path, recreate=True)
     for row in df:
         for trial in get_trials_from_row(row):
             rjudge = RemoteTrialJudge(entrypoint)
             self._objective.copy().run(trial, rjudge, ck_fs)
             if rjudge.report is not None:
                 yield rjudge.report.fill_dict(dict(row))
Example #11
def test_json(tmpdir):
    fs = FileSystem()
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.json")
    save_df(df1, path)
    actual = load_df(path)
    df_eq(actual, [[1, 2, 3]], "a:long,b:long,c:long")
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))
Example #12
def test_json_io(tmpdir, spark_session):
    fs = FileSystem()
    si = SparkIO(spark_session, fs)
    df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.json")
    si.save_df(df1, path)
    actual = si.load_df(path)
    df_eq(actual, [[1, 2, 3]], "a:long,b:long,c:long")
    actual = si.load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = si.load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    raises(Exception, lambda: si.load_df(path, columns="bb:str,a:int"))
Example #13
def _load_single_avro(path: str, **kwargs: Any) -> pd.DataFrame:
    from fastavro import reader

    kw = ParamDict(kwargs)
    process_record = None
    if "process_record" in kw:
        process_record = kw["process_record"]
        del kw["process_record"]

    fs = FileSystem()
    with fs.openbin(path) as fp:
        # Configure Avro reader
        avro_reader = reader(fp)
        # Load records in memory
        if process_record:
            records = [process_record(r) for r in avro_reader]
        else:
            records = list(avro_reader)

        # Populate pandas.DataFrame with records
        return pd.DataFrame.from_records(records)
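For reference, a self-contained fastavro round trip (no FileSystem involved) that writes a small file the loader above could read; the schema and temp path are made up for illustration:

import os
import tempfile

import pandas as pd
from fastavro import parse_schema, reader, writer

schema = parse_schema({
    "name": "Row",
    "type": "record",
    "fields": [{"name": "a", "type": "int"}, {"name": "b", "type": "string"}],
})
records = [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]
path = os.path.join(tempfile.gettempdir(), "demo.avro")

with open(path, "wb") as fo:
    writer(fo, schema, records)
with open(path, "rb") as fp:
    df = pd.DataFrame.from_records(list(reader(fp)))  # two rows: (1, "x"), (2, "y")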
Example #14
def _get_single_files(fp: Iterable[FileParser],
                      fs: Optional[FileSystem]) -> Iterable[FileParser]:
    if fs is None:
        fs = FileSystem()
    for f in fp:
        if f.glob_pattern != "":
            files = [
                FileParser(pfs.path.join(f.uri, os.path.basename(x.path)))
                for x in fs.opendir(f.uri).glob(f.glob_pattern)
            ]
            yield from _get_single_files(files, fs)
        else:
            yield f
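The generator above expands any glob component into concrete single-file parsers. Roughly the same effect for local paths, sketched with pathlib only and assuming the glob pattern sits in the last path component (the function name expand_globs is hypothetical):

from pathlib import Path
from typing import Iterable, Iterator


def expand_globs(paths: Iterable[str]) -> Iterator[str]:
    for p in paths:
        path = Path(p)
        if any(ch in path.name for ch in "*?[]"):
            # Expand the pattern within its parent directory.
            yield from (str(x) for x in sorted(path.parent.glob(path.name)))
        else:
            yield p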
Example #15
def test_save_with_partition(tmpdir, spark_session):
    si = SparkIO(spark_session, FileSystem())
    df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
    path = os.path.join(tmpdir, "a.parquet")
    si.save_df(df1, path, partition_spec=PartitionSpec(num=2))
    actual = si.load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    si.save_df(df1, path, partition_spec=PartitionSpec(by=["a"]))
    actual = si.load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    si.save_df(df1, path, partition_spec=PartitionSpec(by=["a"], num=2))
    actual = si.load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
Example #16
def _load_avro(p: FileParser,
               columns: Any = None,
               **kwargs: Any) -> Tuple[pd.DataFrame, Any]:
    path = p.uri
    try:
        pdf = _load_single_avro(path, **kwargs)
    except (IsADirectoryError, PermissionError, FileExpected):
        fs = FileSystem()
        pdf = pd.concat([
            _load_single_avro(pfs.path.join(path, os.path.basename(x.path)),
                              **kwargs)
            for x in fs.opendir(path).glob("*.avro")
        ])

    if columns is None:
        return pdf, None
    if isinstance(columns, list):  # column names
        return pdf[columns], None

    schema = Schema(columns)

    # Return created DataFrame
    return pdf[schema.names], schema
Example #17
def save_df(
    df: LocalDataFrame,
    uri: str,
    format_hint: Optional[str] = None,
    mode: str = "overwrite",
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> None:
    assert_or_throw(mode in ["overwrite", "error"],
                    NotImplementedError(f"{mode} is not supported"))
    p = FileParser(uri, format_hint)
    if fs is None:
        fs = FileSystem()
    if fs.exists(uri):
        assert_or_throw(mode == "overwrite", FileExistsError(uri))
    _FORMAT_SAVE[p.file_format](df, p, **kwargs)
Example #18
def test_parquet_io(tmpdir, spark_session):
    si = SparkIO(spark_session, FileSystem())
    df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = _df([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because of a pyarrow issue
    df3 = _df([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        path = os.path.join(tmpdir, "a.parquet")
        si.save_df(df, path)
        actual = si.load_df(path)
        df_eq(df, actual, throw=True)

    si.save_df(df1, path)
    actual = si.load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = si.load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    raises(Exception, lambda: si.load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    folder = os.path.join(tmpdir, "folder")
    fs.makedirs(folder)
    f0 = os.path.join(folder, "_SUCCESS")
    f1 = os.path.join(folder, "1.parquet")
    f2 = os.path.join(folder, "3.parquet")
    fs.touch(f0)
    si.save_df(df1, f1, force_single=True)
    si.save_df(df1, f2, force_single=True)
    assert fs.isfile(f1)
    actual = si.load_df(folder, "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load multiple paths
    actual = si.load_df([f1, f2], "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")
    actual = si.load_df([f1, f2], "parquet", columns="b:str,a:str")
    df_eq(actual, [["2", "1"], ["2", "1"]], "a:str,b:int,c:long")

    # overwrite = False
    raises((FileExistsError, AnalysisException),
           lambda: si.save_df(df1, f1, mode="error"))
    # wrong mode
    raises(Exception, lambda: si.save_df(df1, f1, mode="dummy"))
Example #19
def validate_iterative_objective(
    func: IterativeObjectiveFunc,
    trial: Trial,
    budgets: List[float],
    validator: Callable[[List[TrialReport]], None],
    continuous: bool = False,
    checkpoint_path: str = "",
    monitor: Optional[Monitor] = None,
) -> None:
    path = checkpoint_path if checkpoint_path != "" else tempfile.gettempdir()
    basefs = FileSystem().makedirs(os.path.join(path, str(uuid4())),
                                   recreate=True)
    j = _Validator(monitor, budgets, continuous=continuous)
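    # The pickle round trip below produces an independent copy of the objective for each run (and exercises its serializability).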
    if continuous:
        f = pickle.loads(pickle.dumps(func)).copy()
        f.run(trial, j, checkpoint_basedir_fs=basefs)
    else:
        for _ in budgets:
            f = pickle.loads(pickle.dumps(func)).copy()
            f.run(trial, j, checkpoint_basedir_fs=basefs)
    validator(j.reports)
Example #20
 def __init__(self,
              spark_session: Optional[SparkSession] = None,
              conf: Any = None):
     if spark_session is None:
         spark_session = SparkSession.builder.getOrCreate()
     self._spark_session = spark_session
     cf = dict(FUGUE_SPARK_DEFAULT_CONF)
     cf.update({
         x[0]: x[1]
         for x in spark_session.sparkContext.getConf().getAll()
     })
     cf.update(ParamDict(conf))
     super().__init__(cf)
     self._fs = FileSystem()
     self._log = logging.getLogger()
     self._default_sql_engine = SparkSQLEngine(self)
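     # RunOnce memoizes by key; keying on id(args[0]) means each dataframe is broadcast/persisted/registered at most once.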
     self._broadcast_func = RunOnce(self._broadcast,
                                    lambda *args, **kwargs: id(args[0]))
     self._persist_func = RunOnce(self._persist,
                                  lambda *args, **kwargs: id(args[0]))
     self._register_func = RunOnce(self._register,
                                   lambda *args, **kwargs: id(args[0]))
     self._io = SparkIO(self.spark_session, self.fs)
Example #21
def test_parquet_io(tmpdir):
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = ArrayDataFrame([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because of a pyarrow issue
    df3 = ArrayDataFrame([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        path = os.path.join(tmpdir, "a.parquet")
        save_df(df, path)
        actual = load_df(path)
        df_eq(df, actual, throw=True)

    save_df(df1, path)
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    # can't specify wrong columns
    raises(Exception, lambda: load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    folder = os.path.join(tmpdir, "folder")
    fs.makedirs(folder)
    f0 = os.path.join(folder, "_SUCCESS")
    f1 = os.path.join(folder, "1.parquet")
    f2 = os.path.join(folder, "3.parquet")
    fs.touch(f0)
    save_df(df1, f1)
    save_df(df1, f2)
    actual = load_df(folder, "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load multiple paths
    actual = load_df([f1, f2], "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # overwrite = False
    raises(FileExistsError, lambda: save_df(df1, f1, mode="error"))
    # can't overwrite directory
    raises(
        IsADirectoryError,
        lambda: save_df(df1, folder, format_hint="parquet", mode="overwrite"),
    )
    # wrong mode
    raises(NotImplementedError, lambda: save_df(df1, f1, mode="dummy"))
Example #22
 def load_dir() -> pd.DataFrame:
     fs = FileSystem()
     return pd.concat([
         pd.read_csv(pfs.path.join(path, os.path.basename(x.path)),
                     **kwargs) for x in fs.opendir(path).glob("*.csv")
     ])
Example #23
 def __init__(self, conf: Any = None):
     super().__init__(conf)
     self._fs = FileSystem()
     self._log = logging.getLogger()
Example #24
 def __init__(self, conf: Any = None):
     super().__init__(conf)
     self._fs = FileSystem()
     self._log = logging.getLogger()
     self._default_sql_engine = SqliteEngine(self)