Example #1
# "self" and "ctx" are captured from the enclosing parser method.
def _func(name: str, x: WorkflowDataFrame) -> WorkflowDataFrame:
    yield_name = self.ctxToStr(
        ctx.name) if ctx.name is not None else name
    assert_or_throw(yield_name is not None,
                    "yield name is not specified")
    if ctx.DATAFRAME() is None:
        x.yield_file_as(yield_name)
    else:
        x.yield_dataframe_as(yield_name)
    return x
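Both yield calls above come from Fugue's WorkflowDataFrame API. A minimal standalone sketch of the same two calls (assuming fugue is installed; the toy dataframe and the name "result" are made up, and the with-block follows the pattern used in Example #6 below):

from fugue import FugueWorkflow

with FugueWorkflow() as dag:
    df = dag.df([[0, "a"]], "x:int,y:str")
    df.yield_dataframe_as("result")  # keep the result in memory under a name
    # df.yield_file_as("result")     # or persist it to a file instead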
Example #2
def tune(  # noqa: C901
    params_df: WorkflowDataFrame,
    tunable: Any,
    distributable: Optional[bool] = None,
    objective_runner: Optional[ObjectiveRunner] = None,
) -> WorkflowDataFrame:
    t = _to_tunable(  # type: ignore
        tunable, *get_caller_global_local_vars(), distributable)
    if distributable is None:
        distributable = t.distributable

    if objective_runner is None:
        objective_runner = ObjectiveRunner()

    # input_has: __fmin_params__:str
    # schema: *,__fmin_value__:double,__fmin_metadata__:str
    def compute_transformer(
            df: Iterable[Dict[str, Any]]) -> Iterable[Dict[str, Any]]:
        for row in df:
            dfs: Dict[str, Any] = {}
            dfs_keys: Set[str] = set()
            for k, v in row.items():
                if k.startswith("__df_"):
                    key = k[len("__df_"):]
                    if v is not None:
                        dfs[key] = pd.read_parquet(v)
                    dfs_keys.add(key)
            for params in json.loads(row["__fmin_params__"]):
                p = decode(params)
                best = objective_runner.run(  # type: ignore
                    t, dict(**dfs, **p), set(p.keys()))
                res = dict(row)
                res["__fmin_params__"] = json.dumps(best["hp"])
                res["__fmin_value__"] = best["error"]
                res["__fmin_metadata__"] = json.dumps(best["metadata"])
                yield res

    # input_has: __fmin_params__:str
    def compute_processor(engine: ExecutionEngine, df: DataFrame) -> DataFrame:
        def get_rows() -> Iterable[Any]:
            keys = list(
                df.schema.names) + ["__fmin_value__", "__fmin_metadata__"]
            for row in compute_transformer(df.as_dict_iterable()):
                yield [row[k] for k in keys]

        t._execution_engine = engine  # type:ignore
        return ArrayDataFrame(
            get_rows(),
            df.schema + "__fmin_value__:double,__fmin_metadata__:str")

    if not distributable:
        return params_df.process(compute_processor)
    else:
        return params_df.partition(num="ROWCOUNT",
                                   algo="even").transform(compute_transformer)
Example #3
# "self" and "ctx" are captured from the enclosing parser method.
def _func(name: str, x: WorkflowDataFrame) -> WorkflowDataFrame:
    data = self.get_dict(ctx, "ns", "partition", "single", "params")

    x.deterministic_checkpoint(
        lazy=ctx.LAZY() is not None,
        partition=data.get("partition"),
        single="single" in data,
        namespace=data.get("ns"),
        **data.get("params", {}),
    )
    return x
Example #4
    def _serialize_df(self, df: WorkflowDataFrame,
                      name: str) -> WorkflowDataFrame:
        pre_partition = df.partition_spec
        path = self._path

        def _get_temp_path(p: str, conf: ParamDict) -> str:
            if p is not None and p != "":
                return p
            return conf.get_or_throw(TUNE_TEMP_PATH, str)

        if len(pre_partition.partition_by) == 0:

            def save_single_file(e: ExecutionEngine,
                                 _input: DataFrame) -> DataFrame:
                p = _get_temp_path(path, e.conf)
                fp = os.path.join(p, str(uuid4()) + ".parquet")
                e.save_df(_input, fp, force_single=True)
                return ArrayDataFrame([[fp]],
                                      f"{TUNE_DATASET_DF_PREFIX}{name}:str")

            return df.process(save_single_file)
        else:

            class SavePartition(Transformer):
                def get_output_schema(self, df: DataFrame) -> Any:
                    dfn = self.params.get_or_throw("name", str)
                    return self.key_schema + f"{TUNE_DATASET_DF_PREFIX}{dfn}:str"

                def transform(self, df: LocalDataFrame) -> LocalDataFrame:
                    p = _get_temp_path(self.params.get("path", ""),
                                       self.workflow_conf)
                    fp = os.path.join(p, str(uuid4()) + ".parquet")
                    first = df.peek_dict()
                    keys = [first[x] for x in self.key_schema.names]
                    df.as_pandas().to_parquet(fp)
                    return ArrayDataFrame([keys + [fp]], self.output_schema)

            return df.transform(SavePartition,
                                params={
                                    "path": path,
                                    "name": name
                                })
Example #5
def select_best(df: WorkflowDataFrame, top: int = 1) -> WorkflowDataFrame:
    def _top(df: pd.DataFrame, n: int) -> pd.DataFrame:
        keys = [
            k for k in df.columns
            if not k.startswith("__df_") and not k.startswith("__fmin_")
        ]
        if len(keys) == 0:
            return df.sort_values("__fmin_value__").head(n)
        else:
            return df.sort_values("__fmin_value__").groupby(keys).head(n)

    return df.process(_top, params=dict(n=top))
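A small self-contained check of the grouped top-n idiom that _top relies on, in plain pandas (the toy data is made up): sort by "__fmin_value__", then take head(n) within each group of non-internal columns.

import pandas as pd

df = pd.DataFrame({
    "model": ["a", "a", "b", "b"],
    "__fmin_value__": [0.3, 0.1, 0.5, 0.2],
})
keys = [
    k for k in df.columns
    if not k.startswith("__df_") and not k.startswith("__fmin_")
]
# best row per "model": (a, 0.1) and (b, 0.2)
print(df.sort_values("__fmin_value__").groupby(keys).head(1))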
Example #6
def visualize_top_n(df: WorkflowDataFrame, top: int = 0) -> None:
    if top <= 0:
        return

    def outputter(df: LocalDataFrame) -> None:
        keys = [
            k for k in df.schema.names
            if not k.startswith("__df_") and not k.startswith("__fmin_")
        ]

        def show(subdf: pd.DataFrame) -> None:
            if subdf.shape[0] == 0:  # pragma: no cover
                return
            subdf = subdf.sort_values("__fmin_value__").head(top)
            title = (json.dumps({k: str(subdf[k].iloc[0])
                                 for k in keys}) if len(keys) > 0 else "")
            pdf = pd.DataFrame(
                [json.loads(x) for x in subdf["__fmin_params__"]])
            fig = plt.figure(figsize=(12, 3 * len(pdf.columns)))
            if len(keys) > 0:
                fig.suptitle(
                    title,
                    va="center",
                    size=15,
                    weight="bold",
                    y=0.93,
                )
            for i in range(len(pdf.columns)):
                ax = fig.add_subplot(len(pdf.columns), 1, i + 1)
                pdf[pdf.columns[i]].hist(ax=ax).set_title(pdf.columns[i])
                plt.subplots_adjust(hspace=0.5)

        if len(keys) == 0:
            show(df.as_pandas())
        else:
            with FugueWorkflow() as dag:
                dag.df(df).partition(by=keys).out_transform(show)

    df.output(outputter)
Example #7
def serialize_df(df: WorkflowDataFrame,
                 name: str,
                 path: str = "") -> WorkflowDataFrame:
    pre_partition = df.partition_spec

    def _get_temp_path(p: str, conf: ParamDict) -> str:
        if p is not None and p != "":
            return p
        return conf.get_or_throw(FUGUE_TUNE_TEMP_PATH,
                                 str)  # TODO: remove hard code

    if len(pre_partition.partition_by) == 0:

        def save_single_file(e: ExecutionEngine,
                             _input: DataFrame) -> DataFrame:
            p = _get_temp_path(path, e.conf)
            fp = os.path.join(p, str(uuid4()) + ".parquet")
            e.save_df(_input, fp, force_single=True)
            return ArrayDataFrame([[fp]], f"__df_{name}:str")

        return df.process(save_single_file)
    else:

        class SavePartition(Transformer):
            def get_output_schema(self, df: DataFrame) -> Any:
                dfn = self.params.get_or_throw("name", str)
                return self.key_schema + f"__df_{dfn}:str"

            def transform(self, df: LocalDataFrame) -> LocalDataFrame:
                p = _get_temp_path(self.params.get("path", ""),
                                   self.workflow_conf)
                fp = os.path.join(p, str(uuid4()) + ".parquet")
                df.as_pandas().to_parquet(fp)
                return ArrayDataFrame([self.cursor.key_value_array + [fp]],
                                      self.output_schema)

        return df.transform(SavePartition, params={"path": path, "name": name})
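Examples #4 and #7 appear to be two revisions of the same routine; note that #4 uses the TUNE_DATASET_DF_PREFIX constant where #7 hardcodes "__df_" next to a "remove hard code" TODO. Their shared save pattern, sketched with plain pandas and the Fugue engine plumbing stripped out (save_partition and temp_dir are made-up names):

import os
from uuid import uuid4

import pandas as pd

def save_partition(pdf: pd.DataFrame, temp_dir: str) -> str:
    # write each partition to a uniquely named file and hand back only
    # the path; needs pyarrow or fastparquet for to_parquet
    fp = os.path.join(temp_dir, str(uuid4()) + ".parquet")
    pdf.to_parquet(fp)
    return fp

Downstream steps then recover the data with pd.read_parquet, as Example #2 does.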
Example #8
    def _process_assignable(self, df: WorkflowDataFrame, ctx: Tree):
        data = self.get_dict(ctx, "assign", "checkpoint", "broadcast", "y")
        if "assign" in data:
            varname, _ = data["assign"]
        else:
            varname = None
        if "checkpoint" in data:
            data["checkpoint"](varname, df)
        if "broadcast" in data:
            df = df.broadcast()
        if "y" in data:
            data["y"](varname, df)
        if varname is not None:
            self.variables[varname] = df  # type: ignore
        self._last = df
Example #9
def p3(df: WorkflowDataFrame) -> WorkflowDataFrame:
    return df.process(process)
Example #10
def p1(wf: FugueSQLWorkflow, df: WorkflowDataFrame) -> WorkflowDataFrame:
    return df.process(process)
Example #11
    def __init__(self, data: WorkflowDataFrame, dfs: List[str],
                 keys: List[str]):
        self._data = data.persist()
        self._dfs = dfs
        self._keys = keys
Example #12
    def __init__(self, dataset: TuneDataset, result: WorkflowDataFrame):
        self._dataset = dataset
        self._result = (
            result.persist()
            .partition_by(TUNE_REPORT_ID, presort=TUNE_REPORT_METRIC)
            .take(1)
            .persist()
        )
Example #13
def out1(wf: FugueWorkflow, df: WorkflowDataFrame) -> None:
    df.show()