예제 #1
0
 def _delegate(self, path) -> Tuple[FSBase, str]:
     """Resolve ``path`` to a mounted filesystem, creating it on first use.

     The root of ``path`` (as parsed by ``_FSPath``) is looked up in
     ``self._fs_store``; when it is missing, a filesystem is built with
     ``self.create_fs`` and mounted under ``to_uuid(root)``. The call is then
     forwarded to the parent ``_delegate`` using the rewritten mount path
     ``to_uuid(root) + "/" + relative_path``.

     :param path: the path to resolve
     :return: whatever the parent class ``_delegate`` returns for the
       rewritten path
     """
     with self._fs_lock:
         # While a creation is in progress, fall straight through to the
         # parent implementation (presumably to stop re-entrant calls made
         # during create_fs/mount from recursing -- confirm against callers).
         if self._in_create:  # pragma: no cover
             return super()._delegate(path)
         self._in_create = True
         try:
             fp = _FSPath(path)
             if fp.root not in self._fs_store:
                 self._fs_store[fp.root] = self.create_fs(fp.root)
                 self.mount(to_uuid(fp.root), self._fs_store[fp.root])
         finally:
             # Always clear the flag; otherwise one failed creation would
             # leave every later call taking the short-circuit branch above.
             self._in_create = False
     m_path = to_uuid(fp.root) + "/" + fp.relative_path
     return super()._delegate(m_path)
예제 #2
0
 def __init__(
     self,
     file_id: str,
     deterministic: bool,
     permanent: bool,
     lazy: bool = False,
     partition: Any = None,
     single: bool = False,
     namespace: Any = None,
     **save_kwargs: Any,
 ):
     """Initialise the parent as a file-based instance and set up the yielded file.

     :param file_id: raw identifier, combined with ``namespace`` via
       ``to_uuid`` to form ``self._file_id``
     :param deterministic: forwarded to the parent initialiser
     :param permanent: forwarded to the parent initialiser
     :param lazy: forwarded to the parent initialiser
     :param partition: wrapped in ``PartitionSpec`` before being forwarded
     :param single: forwarded to the parent initialiser
     :param namespace: forwarded to the parent and used for the file id
     :param save_kwargs: extra keyword arguments, forwarded as a fresh dict
     """
     partition_spec = PartitionSpec(partition)
     extra_save_args = dict(save_kwargs)
     super().__init__(
         to_file=True,
         deterministic=deterministic,
         permanent=permanent,
         lazy=lazy,
         fmt="",
         partition=partition_spec,
         single=single,
         namespace=namespace,
         save_kwargs=extra_save_args,
     )
     self._yield_func: Any = None
     self._file_id = to_uuid(file_id, namespace)
     self._yielded = YieldedFile(self._file_id)
예제 #3
0
파일: _tasks.py 프로젝트: gityow/fugue
 def __uuid__(self) -> str:
     """Return an id built from the parent id, the outputter, its params and its partition spec."""
     parent_id = super().__uuid__()
     outputter = self._outputter
     return to_uuid(
         parent_id,
         outputter,
         outputter._params,
         outputter._partition_spec,
     )
예제 #4
0
파일: _tasks.py 프로젝트: gityow/fugue
 def __uuid__(self) -> str:
     """Return an id built from the parent id, the processor, its params and its partition spec."""
     parent_id = super().__uuid__()
     processor = self._processor
     return to_uuid(
         parent_id,
         processor,
         processor._params,
         processor._partition_spec,
     )
예제 #5
0
 def __uuid__(self) -> str:
     """Return an id derived from the wrapper, the two ``_need_*`` flags and the stringified output schema."""
     components = (
         self._wrapper,
         self._need_engine,
         self._need_output_schema,
         str(self._output_schema),
     )
     return to_uuid(*components)
예제 #6
0
 def __uuid__(self) -> str:
     """Return an id derived from the wrapper, engine param, ``_use_dfs``,
     ``_need_output_schema`` and the stringified output schema.
     """
     components = (
         self._wrapper,
         self._engine_param,
         self._use_dfs,
         self._need_output_schema,
         str(self._output_schema),
     )
     return to_uuid(*components)
예제 #7
0
 def __uuid__(self) -> str:
     """Return this object's id, computing and caching it in ``self._id`` on first call.

     When ``self.deterministic`` is truthy the id is ``to_uuid`` of the spec,
     configs and inputs; otherwise it is a fresh random ``uuid4`` string.
     """
     if self._id != "":
         # Already computed on an earlier call.
         return self._id
     self._ensure_fully_connected()
     self._id = (
         to_uuid(self.spec, self.configs, self.inputs)
         if self.deterministic
         else str(uuid4())
     )
     return self._id
예제 #8
0
 def __uuid__(self) -> str:
     """Return an id derived from configs, inputs, outputs, the full type path
     of ``self.func``, metadata, the deterministic/lazy flags and the node spec.
     """
     components = (
         self.configs,
         self.inputs,
         self.outputs,
         get_full_type_path(self.func),
         self.metadata,
         self.deterministic,
         self.lazy,
         self._node_spec,
     )
     return to_uuid(*components)
예제 #9
0
def test__to_processor_determinism():
    """Processors built from the same source get equal uuids; different sources differ."""
    from_func = _to_processor(t1, None)
    from_func_again = _to_processor(t1, None)
    from_name = _to_processor("t1", None)
    from_other_name = _to_processor("t2", None)
    assert from_func is not from_func_again
    assert to_uuid(from_func) == to_uuid(from_func_again)
    assert from_func is not from_name
    assert to_uuid(from_func) == to_uuid(from_name)
    assert to_uuid(from_func) != to_uuid(from_other_name)

    # Class object versus the class referenced by name.
    from_class = _to_processor(MockProcessor)
    from_class_name = _to_processor("MockProcessor")
    assert from_class is not from_class_name
    assert to_uuid(from_class) == to_uuid(from_class_name)
예제 #10
0
def test__to_outputter_determinism():
    """Outputters built from the same source get equal uuids; different sources differ."""
    from_func = _to_outputter(t1)
    from_func_again = _to_outputter(t1)
    from_name = _to_outputter("t1")
    from_other_name = _to_outputter("t2")
    assert from_func is not from_func_again
    assert to_uuid(from_func) == to_uuid(from_func_again)
    assert from_func is not from_name
    assert to_uuid(from_func) == to_uuid(from_name)
    assert to_uuid(from_func) != to_uuid(from_other_name)

    # Class object versus the class referenced by name.
    from_class = _to_outputter(MockOutputter)
    from_class_name = _to_outputter("MockOutputter")
    assert from_class is not from_class_name
    assert to_uuid(from_class) == to_uuid(from_class_name)
예제 #11
0
def test__to_creator_determinism():
    """Creators built from the same source get equal uuids; different sources differ."""
    from_func = _to_creator(t1, None)
    from_func_again = _to_creator(t1, None)
    from_name = _to_creator("t1", None)
    from_other_name = _to_creator("t2", None)
    assert from_func is not from_func_again
    assert to_uuid(from_func) == to_uuid(from_func_again)
    assert from_func is not from_name
    assert to_uuid(from_func) == to_uuid(from_name)
    assert to_uuid(from_func) != to_uuid(from_other_name)

    # Class object versus the class referenced by name.
    from_class = _to_creator(T0)
    from_class_name = _to_creator("T0")
    assert from_class is not from_class_name
    assert to_uuid(from_class) == to_uuid(from_class_name)
예제 #12
0
def test__to_transformer_determinism():
    """Transformers built from equivalent inputs must produce equal uuids."""
    from_func = _to_transformer(t1, None)
    from_func_again = _to_transformer(t1, None)
    from_name = _to_transformer("t1", None)
    assert from_func is not from_func_again
    assert to_uuid(from_func) == to_uuid(from_func_again)
    assert from_func is not from_name
    assert to_uuid(from_func) == to_uuid(from_name)

    # Schema given as an expression string versus a Schema object.
    with_str_schema = _to_transformer(t4, "a:int,b:int")
    with_schema_obj = _to_transformer("t4", Schema("a:int,b:int"))
    assert with_str_schema is not with_schema_obj
    assert to_uuid(with_str_schema) == to_uuid(with_schema_obj)

    # Class object versus the class referenced by name.
    from_class = _to_transformer(MockTransformer)
    from_class_name = _to_transformer("MockTransformer")
    assert from_class is not from_class_name
    assert to_uuid(from_class) == to_uuid(from_class_name)

    t7_by_obj = _to_transformer(t7, "a:int,b:int")
    t7_by_name = _to_transformer("t7", "a:int,b:int")
    assert t7_by_obj is not t7_by_name
    assert to_uuid(t7_by_obj) == to_uuid(t7_by_name)
def test__to_output_transformer_determinism():
    """Output transformers built from equivalent inputs must produce equal uuids."""
    from_func = _to_output_transformer(t1)
    from_func_again = _to_output_transformer(t1)
    from_name = _to_output_transformer("t1")
    assert from_func is not from_func_again
    assert to_uuid(from_func) == to_uuid(from_func_again)
    assert from_func is not from_name
    assert to_uuid(from_func) == to_uuid(from_name)

    t4_by_obj = _to_output_transformer(t4)
    t4_by_name = _to_output_transformer("t4")
    assert t4_by_obj is not t4_by_name
    assert to_uuid(t4_by_obj) == to_uuid(t4_by_name)

    # Class object versus the class referenced by name.
    from_class = _to_output_transformer(MockTransformer)
    from_class_name = _to_output_transformer("MockTransformer")
    assert from_class is not from_class_name
    assert to_uuid(from_class) == to_uuid(from_class_name)

    t7_by_obj = _to_output_transformer(t7)
    t7_by_name = _to_output_transformer("t7")
    assert t7_by_obj is not t7_by_name
    assert to_uuid(t7_by_obj) == to_uuid(t7_by_name)
예제 #14
0
파일: _tasks.py 프로젝트: gityow/fugue
 def __uuid__(self) -> str:
     """Return an id derived from configs, inputs, outputs, metadata, the
     deterministic/lazy flags, the dependency uuid and the broadcast flag.
     """
     # _checkpoint is not part of determinism
     # _yield_name is not part of determinism
     # The full type path of self.func is currently left out as well.
     components = (
         self.configs,
         self.inputs,
         self.outputs,
         self.metadata,
         self.deterministic,
         self.lazy,
         self._get_dependency_uuid(),
         self._broadcast,
     )
     return to_uuid(*components)
예제 #15
0
파일: _tasks.py 프로젝트: gityow/fugue
 def _get_dependency_uuid(self) -> Any:
     """Return a cached id summarising this node's dependencies.

     Each value in ``self.node_spec.dependency`` is split on its first "."
     into a task key and a remainder. The dependency key, the remainder and
     the referenced task's ``__uuid__()`` are collected in iteration order
     and hashed with ``to_uuid``. The result is stored in
     ``self._dependency_uuid`` and reused on later calls.
     """
     # TODO: this should be a part of adagio!!
     cached = self._dependency_uuid
     if cached is not None:
         return cached
     parts: List[Any] = []
     for dep_name, dep_ref in self.node_spec.dependency.items():
         pieces = dep_ref.split(".", 1)
         assert_or_throw(len(pieces) == 2)
         task_key, remainder = pieces
         upstream = self.parent_workflow.tasks[task_key]
         parts.extend((dep_name, remainder, upstream.__uuid__()))
     self._dependency_uuid = to_uuid(parts)
     return self._dependency_uuid
예제 #16
0
    def map(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        """Apply ``map_func`` to each partition of ``df`` and wrap the result
        in a ``DaskDataFrame``.

        With no ``partition_by`` keys the frame is repartitioned with
        ``partition_spec`` and mapped via ``map_partitions``. Otherwise it is
        repartitioned by ``num_partitions`` only and mapped per key group
        through ``self.pl_utils.safe_groupby_apply``.

        :param df: input dataframe, converted with ``self.to_df``
        :param map_func: called with a partition cursor and the partition data
        :param output_schema: anything ``Schema`` accepts; schema of the result
        :param partition_spec: partition keys, presort and partition count
        :param metadata: passed through to the resulting ``DaskDataFrame``
        :param on_init: optional callback wrapped in ``RunOnce`` and invoked
          with ``(0, input_df)`` before ``map_func``
        :return: a ``DaskDataFrame`` holding the mapped result
        """
        sort_spec = partition_spec.presort
        sort_cols = list(sort_spec.keys())
        sort_asc = list(sort_spec.values())
        output_schema = Schema(output_schema)
        input_schema = df.schema
        if on_init is None:
            on_init_once: Any = None
        else:
            # Keyed by the identity of on_init and of its first argument.
            on_init_once = RunOnce(
                on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
            )

        def _map(pdf: Any) -> pd.DataFrame:
            # Empty partitions short-circuit to an empty frame with the output schema.
            if pdf.shape[0] == 0:
                return PandasDataFrame([], output_schema).as_pandas()
            if sort_cols:
                pdf = pdf.sort_values(sort_cols, ascending=sort_asc)
            input_df = PandasDataFrame(
                pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
            )
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            return map_func(cursor, input_df).as_pandas()

        df = self.to_df(df)
        if len(partition_spec.partition_by) > 0:
            df = self.repartition(df, PartitionSpec(num=partition_spec.num_partitions))
            result = self.pl_utils.safe_groupby_apply(
                df.native,
                partition_spec.partition_by,
                _map,
                meta=output_schema.pandas_dtype,
            )
        else:
            pdf = self.repartition(df, partition_spec)
            result = pdf.native.map_partitions(_map, meta=output_schema.pandas_dtype)
        return DaskDataFrame(result, output_schema, metadata)
예제 #17
0
    def _map_by_pandas_udf(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        """Apply ``map_func`` to each partition through Spark ``mapInPandas``.

        The input is repartitioned with ``partition_spec``. Each partition
        arrives as an iterable of pandas frames; the non-empty ones are wrapped
        into a ``LocalDataFrameIterableDataFrame`` and handed to ``map_func``.

        :param df: input dataframe
        :param map_func: called with a partition cursor and the partition data
        :param output_schema: anything ``Schema`` accepts; schema of the result
        :param partition_spec: used for repartitioning and to build the cursor
        :param metadata: passed through to the resulting ``SparkDataFrame``
        :param on_init: optional callback wrapped in ``RunOnce`` and invoked
          with ``(0, input_df)`` before ``map_func``
        :return: a ``SparkDataFrame`` wrapping the mapped result
        """
        df = self.to_df(self.repartition(df, partition_spec))
        output_schema = Schema(output_schema)
        input_schema = df.schema
        on_init_once: Any = (
            None
            if on_init is None
            else RunOnce(
                on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
            )
        )

        def _udf(
            dfs: Iterable[pd.DataFrame],
        ) -> Iterable[pd.DataFrame]:  # pragma: no cover
            def get_dfs() -> Iterable[LocalDataFrame]:
                # Skip empty chunks; only non-empty ones are wrapped.
                for pdf in dfs:
                    if pdf.shape[0] > 0:
                        yield PandasDataFrame(
                            pdf.reset_index(drop=True),
                            input_schema,
                            pandas_df_wrapper=True,
                        )

            input_df = LocalDataFrameIterableDataFrame(get_dfs(), input_schema)
            if input_df.empty:
                # This function is a generator, so a value given to ``return``
                # would never reach the consumer. Yield the empty frame
                # explicitly, then stop.
                yield PandasDataFrame([], output_schema).as_pandas()
                return
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            output_df = map_func(cursor, input_df)
            if isinstance(output_df, LocalDataFrameIterableDataFrame):
                # Stream the result chunk by chunk instead of materialising it.
                for res in output_df.native:
                    yield res.as_pandas()
            else:
                yield output_df.as_pandas()

        df = self.to_df(df)
        sdf = df.native.mapInPandas(_udf, schema=to_spark_schema(output_schema))
        return SparkDataFrame(sdf, metadata=metadata)
def test_output():
    """Walk an ``_Output`` through its failed, successful and skipped states."""

    def check_state(out, is_set, skipped, successful, failed):
        # Compare the four status flags by truthiness, in the original order.
        assert bool(out.is_set) is is_set
        assert bool(out.is_skipped) is skipped
        assert bool(out.is_successful) is successful
        assert bool(out.is_failed) is failed

    task = MockTaskForVar()
    spec = OutputSpec("o", dict, False)
    out = _Output(task, spec)
    assert to_uuid(task, spec) == out.__uuid__()
    check_state(out, False, False, False, False)

    raises(ValueError, lambda: out.set(1))
    check_state(out, True, False, False, True)
    assert isinstance(out.exception, ValueError)
    assert out.trace is not None

    out.set({})  # when is_set, setting again will do nothing
    check_state(out, True, False, False, True)
    assert isinstance(out.exception, ValueError)

    out = _Output(task, spec)
    # setting a bad value will cause exception on both setters and getters
    raises(ValueError, lambda: out.set(None))
    check_state(out, True, False, False, True)
    assert isinstance(out.exception, ValueError)

    # With True as the third OutputSpec argument, None is accepted.
    spec2 = OutputSpec("o", dict, True)
    out = _Output(task, spec2)
    out.set(None)
    check_state(out, True, False, True, False)
    assert out.exception is None

    spec2 = OutputSpec("o", dict, True)
    out = _Output(task, spec2)
    out.skip()
    check_state(out, True, True, False, False)
    assert out.exception is None
예제 #19
0
파일: _tasks.py 프로젝트: zywillc/fugue
 def __uuid__(self) -> str:
     """Return an id derived from configs, inputs, outputs, metadata, the
     deterministic/lazy flags, the node spec, and the persist, broadcast and
     checkpoint settings.
     """
     # The full type path of self.func is currently left out.
     components = (
         self.configs,
         self.inputs,
         self.outputs,
         self.metadata,
         self.deterministic,
         self.lazy,
         self.node_spec,
         str(self._persist),
         self._broadcast,
         self._checkpoint,
         self._checkpoint_namespace,
     )
     return to_uuid(*components)
예제 #20
0
def test_determinism():
    """Equivalent ``PartitionSpec`` objects share a uuid; key order matters."""
    zero_num = PartitionSpec(num=0)
    default_spec = PartitionSpec()
    assert to_uuid(zero_num) == to_uuid(default_spec)

    # Keyword order and int-versus-string num must not matter.
    int_num = PartitionSpec(by=["a"], num=2)
    str_num = PartitionSpec(num="2", by=["a"])
    assert to_uuid(int_num) == to_uuid(str_num)

    # The order of the partition keys does matter.
    a_then_b = PartitionSpec(by=["a", "b"])
    b_then_a = PartitionSpec(by=["b", "a"])
    assert to_uuid(a_then_b) != to_uuid(b_then_a)
예제 #21
0
    def _group_map_by_pandas_udf(
        self,
        df: DataFrame,
        map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
        output_schema: Any,
        partition_spec: PartitionSpec,
        metadata: Any = None,
        on_init: Optional[Callable[[int, DataFrame], Any]] = None,
    ) -> DataFrame:
        """Apply ``map_func`` to each key group through Spark ``applyInPandas``.

        The frame is grouped by ``partition_spec.partition_by``. Each group is
        optionally sorted by the presort columns before ``map_func`` is called.

        :param df: input dataframe, converted with ``self.to_df``
        :param map_func: called with a partition cursor and the group data
        :param output_schema: anything ``Schema`` accepts; schema of the result
        :param partition_spec: supplies grouping keys, presort and the cursor
        :param metadata: passed through to the resulting ``SparkDataFrame``
        :param on_init: optional callback wrapped in ``RunOnce`` and invoked
          with ``(0, input_df)`` before ``map_func``
        :return: a ``SparkDataFrame`` wrapping the mapped result
        """
        sort_spec = partition_spec.presort
        sort_cols = list(sort_spec.keys())
        sort_asc = list(sort_spec.values())
        output_schema = Schema(output_schema)
        input_schema = df.schema
        if on_init is None:
            on_init_once: Any = None
        else:
            # Keyed by the identity of on_init and of its first argument.
            on_init_once = RunOnce(
                on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
            )

        def _udf(pdf: Any) -> pd.DataFrame:  # pragma: no cover
            # Empty groups short-circuit to an empty frame with the output schema.
            if pdf.shape[0] == 0:
                return PandasDataFrame([], output_schema).as_pandas()
            if sort_cols:
                pdf = pdf.sort_values(sort_cols, ascending=sort_asc)
            input_df = PandasDataFrame(
                pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
            )
            if on_init_once is not None:
                on_init_once(0, input_df)
            cursor = partition_spec.get_cursor(input_schema, 0)
            cursor.set(input_df.peek_array(), 0, 0)
            return map_func(cursor, input_df).as_pandas()

        df = self.to_df(df)

        sdf = df.native.groupBy(*partition_spec.partition_by).applyInPandas(
            _udf, schema=to_spark_schema(output_schema)
        )
        return SparkDataFrame(sdf, metadata=metadata)
예제 #22
0
 def __uuid__(self) -> str:
     """Return an id derived from the function's full type path, ``_params`` and ``_rt``."""
     func_path = get_full_type_path(self._func)
     return to_uuid(func_path, self._params, self._rt)
예제 #23
0
 def __uuid__(self) -> str:
     """Return an id derived from the wrapper and the ``_need_engine`` / ``_use_dfs`` flags."""
     components = (self._wrapper, self._need_engine, self._use_dfs)
     return to_uuid(*components)
예제 #24
0
 def __uuid__(self) -> str:
     """Return an id derived solely from this object's full type path."""
     type_path = get_full_type_path(self)
     return to_uuid(type_path)
예제 #25
0
파일: partition.py 프로젝트: zywillc/fugue
 def __uuid__(self) -> str:
     """Return the deterministic unique id of this object, computed from ``self.jsondict``."""
     json_form = self.jsondict
     return to_uuid(json_form)
예제 #26
0
 def __uuid__(self) -> str:
     """Return an id derived from the wrapper's own uuid and the output schema argument."""
     wrapper_id = self._wrapper.__uuid__()
     return to_uuid(wrapper_id, self._output_schema_arg)
예제 #27
0
파일: convert.py 프로젝트: gityow/fugue
 def __uuid__(self) -> str:
     """Return an id derived from the wrapper, the engine param and the ``_use_dfs`` flag."""
     components = (self._wrapper, self._engine_param, self._use_dfs)
     return to_uuid(*components)
예제 #28
0
파일: _tasks.py 프로젝트: gityow/fugue
 def __uuid__(self) -> str:
     """Return an id built from the parent id, the creator and the creator's params."""
     parent_id = super().__uuid__()
     creator = self._creator
     return to_uuid(parent_id, creator, creator._params)
예제 #29
0
def test_function_wrapper_determinism():
    """Two wrappers built with identical arguments are distinct objects with equal uuids."""
    first = FunctionWrapper(f20, "^[ldsp][ldsp]$", "[ldsp]")
    second = FunctionWrapper(f20, "^[ldsp][ldsp]$", "[ldsp]")
    assert first is not second
    assert to_uuid(first) == to_uuid(second)
예제 #30
0
 def __uuid__(self) -> str:
     """Return an id derived from ``code``, ``annotation`` and ``_type``."""
     components = (self.code, self.annotation, self._type)
     return to_uuid(*components)