def _func(name: str, x: WorkflowDataFrame) -> WorkflowDataFrame:
    yield_name = self.ctxToStr(ctx.name) if ctx.name is not None else name
    assert_or_throw(yield_name is not None, "yield name is not specified")
    if ctx.DATAFRAME() is None:
        x.yield_file_as(yield_name)
    else:
        x.yield_dataframe_as(yield_name)
    return x
def tune(  # noqa: C901
    params_df: WorkflowDataFrame,
    tunable: Any,
    distributable: Optional[bool] = None,
    objective_runner: Optional[ObjectiveRunner] = None,
) -> WorkflowDataFrame:
    t = _to_tunable(  # type: ignore
        tunable, *get_caller_global_local_vars(), distributable
    )
    if distributable is None:
        distributable = t.distributable
    if objective_runner is None:
        objective_runner = ObjectiveRunner()

    # input_has: __fmin_params__:str
    # schema: *,__fmin_value__:double,__fmin_metadata__:str
    def compute_transformer(
        df: Iterable[Dict[str, Any]]
    ) -> Iterable[Dict[str, Any]]:
        for row in df:
            # load any serialized DataFrames referenced by __df_* columns
            dfs: Dict[str, Any] = {}
            dfs_keys: Set[str] = set()
            for k, v in row.items():
                if k.startswith("__df_"):
                    key = k[len("__df_"):]
                    if v is not None:
                        dfs[key] = pd.read_parquet(v)
                    dfs_keys.add(key)
            # run the objective on every candidate configuration in this row
            for params in json.loads(row["__fmin_params__"]):
                p = decode(params)
                best = objective_runner.run(  # type: ignore
                    t, dict(**dfs, **p), set(p.keys())
                )
                res = dict(row)
                res["__fmin_params__"] = json.dumps(best["hp"])
                res["__fmin_value__"] = best["error"]
                res["__fmin_metadata__"] = json.dumps(best["metadata"])
                yield res

    # input_has: __fmin_params__:str
    def compute_processor(engine: ExecutionEngine, df: DataFrame) -> DataFrame:
        def get_rows() -> Iterable[Any]:
            keys = list(df.schema.names) + ["__fmin_value__", "__fmin_metadata__"]
            for row in compute_transformer(df.as_dict_iterable()):
                yield [row[k] for k in keys]

        # the driver-side (non-distributable) path keeps access to the engine
        t._execution_engine = engine  # type:ignore
        return ArrayDataFrame(
            get_rows(),
            df.schema + "__fmin_value__:double,__fmin_metadata__:str",
        )

    if not distributable:
        return params_df.process(compute_processor)
    else:
        return params_df.partition(num="ROWCOUNT", algo="even").transform(
            compute_transformer
        )
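# Hedged usage sketch (not part of the source): one way `tune` could be wired
# together with `serialize_df`, `select_best` and `visualize_top_n` defined
# below. `space_df` (a DataFrame carrying the JSON-encoded
# "__fmin_params__:str" column that `tune` reads) and `my_objective` are
# assumed/hypothetical inputs whose construction is out of scope here, and the
# temp path is only illustrative.
#
#   data_df = serialize_df(train_df, name="train", path="/tmp/fugue_tune")
#   trials_df = data_df.cross_join(space_df)
#   result = tune(trials_df, tunable=my_objective, distributable=False)
#   visualize_top_n(select_best(result, top=3), top=3)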
def _func(name: str, x: WorkflowDataFrame) -> WorkflowDataFrame:
    data = self.get_dict(ctx, "ns", "partition", "single", "params")
    x.deterministic_checkpoint(
        lazy=ctx.LAZY() is not None,
        partition=data.get("partition"),
        single="single" in data,
        namespace=data.get("ns"),
        **data.get("params", {}),
    )
    return x
def _serialize_df(self, df: WorkflowDataFrame, name: str) -> WorkflowDataFrame:
    pre_partition = df.partition_spec
    path = self._path

    def _get_temp_path(p: str, conf: ParamDict) -> str:
        if p is not None and p != "":
            return p
        return conf.get_or_throw(TUNE_TEMP_PATH, str)

    if len(pre_partition.partition_by) == 0:
        # no partition keys: save the whole DataFrame as one parquet file
        # and emit a single-row DataFrame holding its path
        def save_single_file(e: ExecutionEngine, _input: DataFrame) -> DataFrame:
            p = _get_temp_path(path, e.conf)
            fp = os.path.join(p, str(uuid4()) + ".parquet")
            e.save_df(_input, fp, force_single=True)
            return ArrayDataFrame([[fp]], f"{TUNE_DATASET_DF_PREFIX}{name}:str")

        return df.process(save_single_file)
    else:
        # partitioned: save each partition separately, keyed by its partition values
        class SavePartition(Transformer):
            def get_output_schema(self, df: DataFrame) -> Any:
                dfn = self.params.get_or_throw("name", str)
                return self.key_schema + f"{TUNE_DATASET_DF_PREFIX}{dfn}:str"

            def transform(self, df: LocalDataFrame) -> LocalDataFrame:
                p = _get_temp_path(self.params.get("path", ""), self.workflow_conf)
                fp = os.path.join(p, str(uuid4()) + ".parquet")
                first = df.peek_dict()
                keys = [first[x] for x in self.key_schema.names]
                df.as_pandas().to_parquet(fp)
                return ArrayDataFrame([keys + [fp]], self.output_schema)

        return df.transform(SavePartition, params={"path": path, "name": name})
def select_best(df: WorkflowDataFrame, top: int = 1) -> WorkflowDataFrame:
    def _top(df: pd.DataFrame, n: int) -> pd.DataFrame:
        keys = [
            k
            for k in df.columns
            if not k.startswith("__df_") and not k.startswith("__fmin_")
        ]
        if len(keys) == 0:
            return df.sort_values("__fmin_value__").head(n)
        else:
            return df.sort_values("__fmin_value__").groupby(keys).head(n)

    return df.process(_top, params=dict(n=top))
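# Hedged example (not from the source): exercise `select_best` on a tiny
# hand-built result DataFrame. It relies on the module-level imports already
# used above (FugueWorkflow, pd); the "model" grouping column and the metric
# values are made up for illustration.
def _select_best_example() -> None:
    with FugueWorkflow() as dag:
        results = dag.df(
            [["a", 0.3], ["a", 0.1], ["b", 0.2]],
            "model:str,__fmin_value__:double",
        )
        # keeps the lowest __fmin_value__ row within each "model" group
        select_best(results, top=1).show()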
def visualize_top_n(df: WorkflowDataFrame, top: int = 0) -> None:
    if top <= 0:
        return

    def outputter(df: LocalDataFrame) -> None:
        keys = [
            k
            for k in df.schema.names
            if not k.startswith("__df_") and not k.startswith("__fmin_")
        ]

        def show(subdf: pd.DataFrame) -> None:
            if subdf.shape[0] == 0:  # pragma: no cover
                return
            # keep the best `top` rows and plot a histogram per hyperparameter
            subdf = subdf.sort_values("__fmin_value__").head(top)
            title = (
                json.dumps({k: str(subdf[k].iloc[0]) for k in keys})
                if len(keys) > 0
                else ""
            )
            pdf = pd.DataFrame([json.loads(x) for x in subdf["__fmin_params__"]])
            fig = plt.figure(figsize=(12, 3 * len(pdf.columns)))
            if len(keys) > 0:
                fig.suptitle(
                    title,
                    va="center",
                    size=15,
                    weight="bold",
                    y=0.93,
                )
            for i in range(len(pdf.columns)):
                ax = fig.add_subplot(len(pdf.columns), 1, i + 1)
                pdf[pdf.columns[i]].hist(ax=ax).set_title(pdf.columns[i])
                plt.subplots_adjust(hspace=0.5)

        if len(keys) == 0:
            show(df.as_pandas())
        else:
            # plot each group (defined by the non-internal columns) separately
            with FugueWorkflow() as dag:
                dag.df(df).partition(by=keys).out_transform(show)

    df.output(outputter)
def serialize_df(df: WorkflowDataFrame, name: str, path: str = "") -> WorkflowDataFrame:
    pre_partition = df.partition_spec

    def _get_temp_path(p: str, conf: ParamDict) -> str:
        if p is not None and p != "":
            return p
        return conf.get_or_throw(FUGUE_TUNE_TEMP_PATH, str)  # TODO: remove hard code

    if len(pre_partition.partition_by) == 0:
        # no partition keys: save the whole DataFrame as one parquet file
        def save_single_file(e: ExecutionEngine, input: DataFrame) -> DataFrame:
            p = _get_temp_path(path, e.conf)
            fp = os.path.join(p, str(uuid4()) + ".parquet")
            e.save_df(input, fp, force_single=True)
            return ArrayDataFrame([[fp]], f"__df_{name}:str")

        return df.process(save_single_file)
    else:
        # partitioned: save each partition separately, keyed by its partition values
        class SavePartition(Transformer):
            def get_output_schema(self, df: DataFrame) -> Any:
                dfn = self.params.get_or_throw("name", str)
                return self.key_schema + f"__df_{dfn}:str"

            def transform(self, df: LocalDataFrame) -> LocalDataFrame:
                p = _get_temp_path(self.params.get("path", ""), self.workflow_conf)
                fp = os.path.join(p, str(uuid4()) + ".parquet")
                df.as_pandas().to_parquet(fp)
                return ArrayDataFrame(
                    [self.cursor.key_value_array + [fp]], self.output_schema
                )

        return df.transform(SavePartition, params={"path": path, "name": name})
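# Hedged example (not from the source): serialize a per-key partition of a
# small DataFrame so only file paths flow through the tuning workflow.
# "/tmp/fugue_tune_demo" is an illustrative, writable temp directory (not a
# library default) and must already exist before running.
def _serialize_df_example() -> None:
    with FugueWorkflow() as dag:
        train = dag.df([[0, 1.0], [0, 2.0], [1, 3.0]], "g:int,v:double")
        # one parquet file per "g" value; output schema becomes g:int,__df_train:str
        serialize_df(
            train.partition(by=["g"]), name="train", path="/tmp/fugue_tune_demo"
        ).show()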
def _process_assignable(self, df: WorkflowDataFrame, ctx: Tree):
    data = self.get_dict(ctx, "assign", "checkpoint", "broadcast", "y")
    if "assign" in data:
        varname, _ = data["assign"]
    else:
        varname = None
    if "checkpoint" in data:
        data["checkpoint"](varname, df)
    if "broadcast" in data:
        df = df.broadcast()
    if "y" in data:
        data["y"](varname, df)
    if varname is not None:
        self.variables[varname] = df  # type: ignore
    self._last = df
def p3(df: WorkflowDataFrame) -> WorkflowDataFrame:
    return df.process(process)
def p1(wf: FugueSQLWorkflow, df: WorkflowDataFrame) -> WorkflowDataFrame:
    return df.process(process)
def __init__(self, data: WorkflowDataFrame, dfs: List[str], keys: List[str]):
    self._data = data.persist()
    self._dfs = dfs
    self._keys = keys
def __init__(self, dataset: TuneDataset, result: WorkflowDataFrame):
    self._dataset = dataset
    self._result = (
        result.persist()
        .partition_by(TUNE_REPORT_ID, presort=TUNE_REPORT_METRIC)
        .take(1)
        .persist()
    )
def out1(wf: FugueWorkflow, df: WorkflowDataFrame) -> None:
    df.show()