def visitFugueModuleTask(self, ctx: fp.FugueModuleTaskContext) -> None:
    data = self.get_dict(ctx, "assign", "dfs", "using", "params")
    sub = _to_module(
        data["using"],
        global_vars=self.global_vars,
        local_vars=self.local_vars,
    )
    varname = data["assign"][0] if "assign" in data else None
    if varname is not None:
        assert_or_throw(
            sub.has_single_output or sub.has_multiple_output,
            FugueSQLSyntaxError("invalid assignment for module without output"),
        )
    if sub.has_input:
        dfs = data["dfs"] if "dfs" in data else WorkflowDataFrames(self.last)
    else:
        dfs = WorkflowDataFrames()
    p = data["params"] if "params" in data else {}
    if sub.has_dfs_input:
        result = sub(dfs, **p)
    elif len(dfs) == 0:
        result = sub(self.workflow, **p)
    elif len(dfs) == 1 or not dfs.has_key:
        result = sub(*list(dfs.values()), **p)
    else:
        result = sub(**dfs, **p)
    if sub.has_single_output or sub.has_multiple_output:
        self.variables[varname] = result
    if sub.has_single_output:
        self._last = result
def str_to_type(
    s: str,
    expected_base_type: Optional[type] = None,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
) -> type:
    """Given a string expression, find the matching type from the caller's
    global and local variables and imported modules. If the expression
    contains `.`, it is treated as a relative or full path of the type,
    including its modules.

    :param s: type expression, for example `triad.utils.iter.Slicer` or `str`
    :param expected_base_type: base class the found type must satisfy, defaults to None
    :param global_vars: overriding global variables, if None, it will use the
        caller's globals(), defaults to None
    :param local_vars: overriding local variables, if None, it will use the
        caller's locals(), defaults to None
    :raises TypeError: unable to find a matching type
    :return: found type
    """
    global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
    try:
        obj = str_to_object(s, global_vars, local_vars)
    except ValueError:
        raise TypeError(f"{s} is not a type")
    assert_or_throw(isinstance(obj, type), TypeError(f"{obj} is not a type"))
    assert_or_throw(
        expected_base_type is None or issubclass(obj, expected_base_type),
        TypeError(f"{obj} is not a subtype of {expected_base_type}"),
    )
    return obj
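# A minimal usage sketch of str_to_type, not part of the source above. The
# import path triad.utils.convert is an assumption; adjust if the layout differs.
from triad.utils.convert import str_to_type

class _Dummy:
    pass

assert str_to_type("_Dummy") is _Dummy          # resolved from the caller's scope
assert str_to_type("_Dummy", object) is _Dummy  # base-type check passes
try:
    str_to_type("_Dummy", Exception)            # _Dummy is not a subtype of Exception
    raise AssertionError("expected TypeError")
except TypeError:
    pass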
def to_function(
    func: Any,
    global_vars: Optional[Dict[str, Any]] = None,
    local_vars: Optional[Dict[str, Any]] = None,
) -> Any:  # noqa: C901
    """For a string expression or a callable, find the matching function.

    :param func: a string expression or a callable
    :param global_vars: overriding global variables, if None, it will use the
        caller's globals(), defaults to None
    :param local_vars: overriding local variables, if None, it will use the
        caller's locals(), defaults to None
    :raises AttributeError: if unable to find such a function
    :return: the matching function
    """
    if isinstance(func, str):
        global_vars, local_vars = get_caller_global_local_vars(global_vars, local_vars)
        try:
            func = str_to_object(func, global_vars, local_vars)
        except ValueError:
            raise AttributeError(f"{func} is not a function")
    assert_or_throw(
        callable(func) and not isinstance(func, six.class_types),
        AttributeError(f"{func} is not a function"),
    )
    return func
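# A hedged usage sketch for to_function, not part of the source above. The
# import path triad.utils.convert is an assumption; adjust if it differs.
from triad.utils.convert import to_function

def _dummy_func():
    return 1

assert to_function(_dummy_func) is _dummy_func    # callables pass through unchanged
assert to_function("_dummy_func") is _dummy_func  # strings are resolved from the caller's scope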
def from_func(
    func: Callable, schema: Any, validation_rules: Dict[str, Any]
) -> "_FuncAsCoTransformer":
    assert_or_throw(
        len(validation_rules) == 0,
        NotImplementedError("CoTransformer does not support validation rules"),
    )
    if schema is None:
        schema = parse_output_schema_from_comment(func)
    if isinstance(schema, Schema):  # to be less strict on determinism
        schema = str(schema)
    if isinstance(schema, str):
        assert_or_throw(
            "*" not in schema,
            FugueInterfacelessError("* can't be used on cotransformer output schema"),
        )
    assert_arg_not_none(schema, "schema")
    tr = _FuncAsCoTransformer()
    tr._wrapper = FunctionWrapper(  # type: ignore
        func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspq]$"
    )
    tr._dfs_input = tr._wrapper.input_code[0] == "c"  # type: ignore
    tr._output_schema_arg = schema  # type: ignore
    tr._validation_rules = {}  # type: ignore
    tr._uses_callback = "f" in tr._wrapper.input_code.lower()  # type: ignore
    tr._requires_callback = "F" in tr._wrapper.input_code  # type: ignore
    return tr
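# Illustration of the input-code pattern above using plain re. FunctionWrapper
# encodes each parameter annotation as a single letter; per the code above,
# "c" marks a DataFrames-collection input and "f"/"F" mark an optional/required
# callback. The sample code strings below are made up for illustration only.
import re

_pattern = re.compile("^(c|[lspq]+)[fF]?x*z?$")
assert _pattern.match("c")       # a single DataFrames-style input
assert _pattern.match("llF")     # two per-dataframe inputs plus a required callback
assert not _pattern.match("cl")  # the collection form can't be mixed with per-df inputs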
def _validate_callback(ctx: Any) -> None:
    if ctx._requires_callback:
        assert_or_throw(
            ctx.has_callback,
            FugueInterfacelessError(f"Callback is required but not provided: {ctx}"),
        )
def fillna(
    self,
    df: DataFrame,
    value: Any,
    subset: List[str] = None,
    metadata: Any = None,
) -> DataFrame:
    assert_or_throw(
        (not isinstance(value, list)) and (value is not None),
        ValueError("fillna value can not be a list or None"),
    )
    if isinstance(value, dict):
        assert_or_throw(
            (None not in value.values()) and (any(value.values())),
            ValueError(
                "fillna dict can not contain None and needs at least one value"
            ),
        )
        mapping = value
    else:
        # If subset is None, apply to all columns
        subset = subset or df.schema.names
        mapping = {col: value for col in subset}
    d = self.to_df(df).native.fillna(mapping)
    return self.to_df(d, df.schema, metadata)
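# Sketch of how the non-dict branch above expands value/subset into a
# per-column mapping (pure Python, no engine involved; names are illustrative).
schema_names = ["a", "b", "c"]
value, subset = 0, ["a", "c"]
mapping = {c: value for c in (subset or schema_names)}
assert mapping == {"a": 0, "c": 0}
# With subset=None, every column gets the fill value.
assert {c: value for c in (None or schema_names)} == {"a": 0, "b": 0, "c": 0}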
def take(
    self,
    df: DataFrame,
    n: int,
    presort: str,
    na_position: str = "last",
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    metadata: Any = None,
) -> DataFrame:
    assert_or_throw(
        isinstance(n, int),
        ValueError("n needs to be an integer"),
    )
    d = self.to_df(df).native
    nulls_last = bool(na_position == "last")

    if presort:
        presort = parse_presort_exp(presort)
    # Use presort over partition_spec.presort if possible
    _presort: IndexedOrderedDict = presort or partition_spec.presort

    def _presort_to_col(_col: str, _asc: bool) -> Any:
        if nulls_last:
            if _asc:
                return col(_col).asc_nulls_last()
            else:
                return col(_col).desc_nulls_last()
        else:
            if _asc:
                return col(_col).asc_nulls_first()
            else:
                return col(_col).desc_nulls_first()

    # If no partition
    if len(partition_spec.partition_by) == 0:
        if len(_presort.keys()) > 0:
            d = d.orderBy(
                [_presort_to_col(_col, _presort[_col]) for _col in _presort.keys()]
            )
        d = d.limit(n)
    # If partition exists
    else:
        w = Window.partitionBy([col(x) for x in partition_spec.partition_by])
        if len(_presort.keys()) > 0:
            w = w.orderBy(
                [_presort_to_col(_col, _presort[_col]) for _col in _presort.keys()]
            )
        else:
            # row_number() still needs an orderBy
            w = w.orderBy(lit(1))
        d = (
            d.select(col("*"), row_number().over(w).alias("__row_number__"))
            .filter(col("__row_number__") <= n)
            .drop("__row_number__")
        )

    return self.to_df(d, df.schema, metadata)
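# A standalone PySpark sketch of what the partitioned branch above builds:
# rank the rows inside each partition by the presort columns and keep the
# first n. The data, column names, and n=1 here are made up for illustration.
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([("a", 3), ("a", 1), ("b", 2)], ["k", "v"])
w = Window.partitionBy(col("k")).orderBy(col("v").asc_nulls_last())
top1 = (
    sdf.select(col("*"), row_number().over(w).alias("__row_number__"))
    .filter(col("__row_number__") <= 1)
    .drop("__row_number__")
)
# top1 keeps ("a", 1) and ("b", 2): one row per key, smallest v first.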
def _process_assignable(self, df: WorkflowDataFrame, ctx: Tree):
    data = self.get_dict(ctx, "assign", "persist", "broadcast")
    if "assign" in data:
        varname, sign = data["assign"]
    else:
        varname, sign = None, None
    need_checkpoint = sign == "??"
    if "persist" in data:
        is_checkpoint, value = data["persist"]
        if need_checkpoint or is_checkpoint:
            assert_or_throw(
                is_checkpoint,
                FugueSQLSyntaxError("can't persist when checkpoint is specified"),
            )
            df = df.checkpoint(value)
        else:
            df = df.persist(value)
    elif need_checkpoint:
        df = df.checkpoint()
    if "broadcast" in data:
        df = df.broadcast()
    if varname is not None:
        self.variables[varname] = df
    self._last = df
def _sql(
    self, code: str, *args: Any, **kwargs: Any
) -> Dict[str, WorkflowDataFrame]:
    # TODO: move dict construction to triad
    params: Dict[str, Any] = {}
    for a in args:
        assert_or_throw(isinstance(a, Dict), f"args can only have dict: {a}")
        params.update(a)
    params.update(kwargs)
    code = fill_sql_template(code, params)
    sql = FugueSQL(
        code,
        "fugueLanguage",
        ignore_case=self.conf.get_or_throw("fugue.sql.compile.ignore_case", bool),
        simple_assign=self.conf.get_or_throw("fugue.sql.compile.simple_assign", bool),
    )
    dfs = {k: v for k, v in params.items() if isinstance(v, WorkflowDataFrame)}
    v = _Extensions(sql, FugueSQLHooks(), self, dfs)
    v.visit(sql.tree)
    return v.variables
def _sql(
    self, code: str, *args: Any, **kwargs: Any
) -> Dict[str, WorkflowDataFrame]:
    # TODO: move dict construction to triad
    params: Dict[str, Any] = {}
    for a in args:
        assert_or_throw(isinstance(a, Dict), f"args can only have dict: {a}")
        params.update(a)
    params.update(kwargs)
    template_params = dict(params)
    if "self" in template_params:
        del template_params["self"]
    code = fill_sql_template(code, template_params)
    sql = FugueSQL(
        code,
        "fugueLanguage",
        ignore_case=self.conf.get_or_throw(FUGUE_SQL_CONF_IGNORE_CASE, bool),
        simple_assign=self.conf.get_or_throw(FUGUE_SQL_CONF_SIMPLE_ASSIGN, bool),
    )
    dfs = {k: v for k, v in params.items() if isinstance(v, WorkflowDataFrame)}
    v = _Extensions(sql, FugueSQLHooks(), self, dfs, local_vars=params)
    v.visit(sql.tree)
    return v.variables
def from_func(func: Callable, schema: Any) -> "_FuncAsCreator":
    # pylint: disable=W0201
    if schema is None:
        schema = parse_output_schema_from_comment(func)
    tr = _FuncAsCreator()
    tr._wrapper = FunctionWrapper(func, "^e?x*z?$", "^[dlspq]$")  # type: ignore
    tr._need_engine = tr._wrapper.input_code.startswith("e")
    tr._need_output_schema = "s" == tr._wrapper.output_code
    tr._output_schema = Schema(schema)
    if len(tr._output_schema) == 0:
        assert_or_throw(
            not tr._need_output_schema,
            FugueInterfacelessError(
                f"schema must be provided for return type {tr._wrapper._rt}"
            ),
        )
    else:
        assert_or_throw(
            tr._need_output_schema,
            FugueInterfacelessError(
                f"schema must not be provided for return type {tr._wrapper._rt}"
            ),
        )
    return tr
def assert_not_empty(self) -> None:
    """Assert this dataframe is not empty

    :raises FugueDataFrameEmptyError: if it is empty
    """
    assert_or_throw(not self.empty, FugueDataFrameEmptyError("dataframe is empty"))
def to_validation_rules(data: Dict[str, Any]) -> Dict[str, Any]:
    res: Dict[str, Any] = {}
    for k, v in data.items():
        if k in ["partitionby_has", "partitionby_is"]:
            if isinstance(v, str):
                v = [x.strip() for x in v.split(",")]
            res[k] = PartitionSpec(by=v).partition_by
        elif k in ["presort_has", "presort_is"]:
            res[k] = list(parse_presort_exp(v).items())
        elif k in ["input_has"]:
            if isinstance(v, str):
                res[k] = v.replace(" ", "").split(",")
            else:
                assert_or_throw(
                    isinstance(v, list),
                    lambda: SyntaxError(f"{v} is neither a string nor a list"),
                )
                res[k] = [x.replace(" ", "") for x in v]
        elif k in ["input_is"]:
            try:
                res[k] = str(Schema(v))
            except SyntaxError:
                raise SyntaxError(  # pylint: disable=W0707
                    f"for input_is, the input must be a schema expression {v}"
                )
        else:
            raise NotImplementedError(k)
    return res
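# A quick illustration of the input_has branch above (assuming to_validation_rules
# from this module is in scope; the rule values are made up):
assert to_validation_rules({"input_has": "a , b"}) == {"input_has": ["a", "b"]}
assert to_validation_rules({"input_has": ["a ", " b"]}) == {"input_has": ["a", "b"]}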
def to_cast_expression(
    schema1: Any, schema2: Any, allow_name_mismatch: bool
) -> Tuple[bool, List[str]]:
    schema1 = to_spark_schema(schema1)
    schema2 = to_spark_schema(schema2)
    assert_or_throw(
        len(schema1) == len(schema2),
        lambda: ValueError(f"schema mismatch: {schema1}, {schema2}"),
    )
    expr: List[str] = []
    has_cast = False
    for i in range(len(schema1)):
        name_match = schema1[i].name == schema2[i].name
        assert_or_throw(
            name_match or allow_name_mismatch,
            lambda: ValueError(f"schema name mismatch: {schema1}, {schema2}"),
        )
        if schema1[i].dataType != schema2[i].dataType:
            type2 = schema2[i].dataType.simpleString()
            if isinstance(schema1[i].dataType, pt.FractionalType) and isinstance(
                schema2[i].dataType, pt.StringType
            ):
                expr.append(
                    f"CAST(IF(isnan({schema1[i].name}), NULL, {schema1[i].name})"
                    f" AS {type2}) {schema2[i].name}"
                )
            else:
                expr.append(f"CAST({schema1[i].name} AS {type2}) {schema2[i].name}")
            has_cast = True
        else:
            if schema1[i].name != schema2[i].name:
                expr.append(f"{schema1[i].name} AS {schema2[i].name}")
                has_cast = True
            else:
                expr.append(schema1[i].name)
    return has_cast, expr
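# Standalone mimic of the per-column decision above, using (name, type) string
# pairs instead of real Spark schemas; the fractional-to-string NaN special case
# is omitted. Illustrative only, not the actual API.
def _cast_expr(c1, c2):
    (n1, t1), (n2, t2) = c1, c2
    if t1 != t2:
        return f"CAST({n1} AS {t2}) {n2}", True
    if n1 != n2:
        return f"{n1} AS {n2}", True
    return n1, False

assert _cast_expr(("a", "int"), ("a", "long")) == ("CAST(a AS long) a", True)
assert _cast_expr(("a", "int"), ("b", "int")) == ("a AS b", True)
assert _cast_expr(("a", "int"), ("a", "int")) == ("a", False)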
def save_df(
    df: DaskDataFrame,
    uri: str,
    format_hint: Optional[str] = None,
    mode: str = "overwrite",
    fs: Optional[FileSystem] = None,
    **kwargs: Any,
) -> None:
    assert_or_throw(
        mode in ["overwrite", "error"],
        lambda: NotImplementedError(f"{mode} is not supported"),
    )
    p = FileParser(uri, format_hint).assert_no_glob()
    if fs is None:
        fs = FileSystem()
    if fs.exists(uri):
        assert_or_throw(mode == "overwrite", FileExistsError(uri))
        try:
            fs.remove(uri)
        except Exception:
            try:
                fs.removetree(uri)
            except Exception:  # pragma: no cover
                pass
    _FORMAT_SAVE[p.file_format](df, p, **kwargs)
def to_kv_iterable(  # noqa: C901
    data: Any, none_as_empty: bool = True
) -> Iterable[Tuple[Any, Any]]:
    """Convert data to an iterable of key value pairs

    :param data: input object, it can be a dict, Iterable[Tuple[Any, Any]]
        or Iterable[List[Any]]
    :param none_as_empty: whether to treat None as an empty iterable

    :raises ValueError: if input is None and `none_as_empty==False`
    :raises TypeError or ValueError: if input data type is not acceptable

    :yield: key value pairs as tuples
    """
    if data is None:
        assert_or_throw(none_as_empty, ValueError("data can't be None"))
    elif isinstance(data, Dict):
        for k, v in data.items():
            yield k, v
    elif isinstance(data, Iterable):
        ei = make_empty_aware(data)
        if not ei.empty:
            first = ei.peek()
            if isinstance(first, tuple):
                for k, v in ei:
                    yield k, v
            elif isinstance(first, List):
                for arr in ei:
                    if len(arr) == 2:
                        yield arr[0], arr[1]
                    else:
                        raise TypeError(f"{arr} is not an acceptable item")
            else:
                raise TypeError(f"{first} is not an acceptable item")
    else:
        raise TypeError(f"{type(data)} is not supported")
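# Usage examples derived directly from the branches above. The import path is
# a guess (triad.utils.iter); adjust to wherever to_kv_iterable actually lives.
from triad.utils.iter import to_kv_iterable

assert list(to_kv_iterable({"a": 1})) == [("a", 1)]
assert list(to_kv_iterable([("a", 1), ("b", 2)])) == [("a", 1), ("b", 2)]
assert list(to_kv_iterable([["a", 1]])) == [("a", 1)]
assert list(to_kv_iterable(None)) == []  # none_as_empty defaults to True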
def yielded_file(self) -> YieldedFile:
    assert_or_throw(
        self.permanent,
        lambda: FugueWorkflowCompileError(f"yield is not allowed for {self}"),
    )
    return self._yielded
def __init__(self, path: str, format_hint: Optional[str] = None):
    last = len(path)
    has_glob = False
    for i in range(len(path)):
        if path[i] in ["/", "\\"]:
            last = i
        if path[i] in ["*", "?"]:
            has_glob = True
            break
    if not has_glob:
        self._uri = urlparse(path)
        self._glob_pattern = ""
        self._path = self._uri.path
    else:
        self._uri = urlparse(path[:last])
        self._glob_pattern = path[last + 1:]
        self._path = pfs.path.join(self._uri.path, self._glob_pattern)

    if format_hint is None or format_hint == "":
        for k, v in _FORMAT_MAP.items():
            if self.suffix.endswith(k):
                self._format = v
                return
        raise NotImplementedError(f"{self.suffix} is not supported")
    else:
        assert_or_throw(
            format_hint in _FORMAT_MAP.values(),
            NotImplementedError(f"{format_hint} is not supported"),
        )
        self._format = format_hint
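# Standalone sketch of the glob split in the loop above (not calling FileParser):
# the path is cut at the last separator seen before the first glob character.
def _split_glob(path):
    last, has_glob = len(path), False
    for i, c in enumerate(path):
        if c in ("/", "\\"):
            last = i
        if c in ("*", "?"):
            has_glob = True
            break
    return (path, "") if not has_glob else (path[:last], path[last + 1:])

assert _split_glob("/tmp/data/*.parquet") == ("/tmp/data", "*.parquet")
assert _split_glob("/tmp/data/x.parquet") == ("/tmp/data/x.parquet", "")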
def _sql(
    self, code: str, *args: Any, **kwargs: Any
) -> Dict[str, Tuple[WorkflowDataFrame, WorkflowDataFrames, LazyWorkflowDataFrame]]:
    # TODO: move dict construction to triad
    params: Dict[str, Any] = {}
    for a in args:
        assert_or_throw(isinstance(a, Dict), lambda: f"args can only have dict: {a}")
        params.update(a)
    params.update(kwargs)
    params, dfs = self._split_params(params)
    code = fill_sql_template(code, params)
    sql = FugueSQL(
        code,
        "fugueLanguage",
        ignore_case=self.conf.get_or_throw(FUGUE_SQL_CONF_IGNORE_CASE, bool),
        simple_assign=self.conf.get_or_throw(FUGUE_SQL_CONF_SIMPLE_ASSIGN, bool),
    )
    v = _Extensions(
        sql, FugueSQLHooks(), self, dfs, local_vars=params  # type: ignore
    )
    v.visit(sql.tree)
    return v.variables
def from_func(
    func: Callable, schema: Any, validation_rules: Dict[str, Any]
) -> "_FuncAsProcessor":
    if schema is None:
        schema = parse_output_schema_from_comment(func)
    validation_rules.update(parse_validation_rules_from_comment(func))
    tr = _FuncAsProcessor()
    tr._wrapper = FunctionWrapper(  # type: ignore
        func, "^e?(c|[dlspq]+)x*z?$", "^[dlspq]$"
    )
    tr._engine_param = (
        tr._wrapper._params.get_value_by_index(0)
        if tr._wrapper.input_code.startswith("e")
        else None
    )
    tr._use_dfs = "c" in tr._wrapper.input_code
    tr._need_output_schema = tr._wrapper.need_output_schema
    tr._validation_rules = validation_rules
    tr._output_schema = Schema(schema)
    if len(tr._output_schema) == 0:
        assert_or_throw(
            tr._need_output_schema is None or not tr._need_output_schema,
            FugueInterfacelessError(
                f"schema must be provided for return type {tr._wrapper._rt}"
            ),
        )
    else:
        assert_or_throw(
            tr._need_output_schema is None or tr._need_output_schema,
            FugueInterfacelessError(
                f"schema must not be provided for return type {tr._wrapper._rt}"
            ),
        )
    return tr
def _parse_function(
    self, func: Callable, params_re: str = ".*", return_re: str = ".*"
) -> Tuple[bool, IndexedOrderedDict[str, "_FuncParam"], "_FuncParam"]:
    sig = inspect.signature(func)
    annotations = get_type_hints(func)
    res: IndexedOrderedDict[str, "_FuncParam"] = IndexedOrderedDict()
    class_method = False
    for k, w in sig.parameters.items():
        if k == "self":
            res[k] = _SelfParam(w)
            class_method = True
        else:
            anno = annotations.get(k, w.annotation)
            res[k] = self._parse_param(anno, w)
    anno = annotations.get("return", sig.return_annotation)
    rt = self._parse_param(anno, None, none_as_other=False)
    params_str = "".join(x.code for x in res.values())
    assert_or_throw(
        re.match(params_re, params_str), TypeError(f"Input types not valid {res}")
    )
    assert_or_throw(
        re.match(return_re, rt.code), TypeError(f"Return type not valid {rt}")
    )
    return class_method, res, rt
def parse_output_schema_from_comment(func: Callable) -> Optional[str]:
    """Parse schema hint from the comments above the function. It tries to find
    comment lines starting with `schema:` from bottom up, and uses the first
    occurrence as the hint.

    :param func: the function
    :return: schema hint string

    :Example:

    .. code-block:: python

        # schema: a:int,b:str
        #schema:a:int,b:int # more comment
        # some comment
        def dummy():
            pass

        assert "a:int,b:int" == parse_output_schema_from_comment(dummy)
    """
    res = parse_comment_annotation(func, _COMMENT_SCHEMA_ANNOTATION)
    if res is None:
        return None
    assert_or_throw(res != "", SyntaxError("incorrect schema annotation"))
    return res.replace(" ", "")
def fillna(
    self,
    df: DataFrame,
    value: Any,
    subset: List[str] = None,
    metadata: Any = None,
) -> DataFrame:
    assert_or_throw(
        (not isinstance(value, list)) and (value is not None),
        ValueError("fillna value can not be None or a list"),
    )
    if isinstance(value, dict):
        assert_or_throw(
            (None not in value.values()) and (any(value.values())),
            ValueError(
                "fillna dict can not contain None and needs at least one value"
            ),
        )
        mapping = value
    else:
        # If subset is None, apply to all columns
        subset = subset or df.schema.names
        mapping = {col: value for col in subset}
    d = df.as_pandas().fillna(mapping, inplace=False)
    return PandasDataFrame(d.reset_index(drop=True), df.schema, metadata)
def to_output_df(self, output: LocalDataFrame, schema: Any, ctx: Any) -> DataFrame:
    assert_or_throw(
        schema is None or output.schema == schema,
        lambda: f"Output schema mismatch {output.schema} vs {schema}",
    )
    return output
def validate_on_compile(self):
    n = self.params.get_or_none("n", int)
    frac = self.params.get_or_none("frac", float)
    assert_or_throw(
        (n is None and frac is not None) or (n is not None and frac is None),
        ValueError("one and only one of n and frac should be set"),
    )
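# The compile-time check above is an exclusive-or on n and frac; a quick
# truth-table sketch of the accepted combinations (pure Python, illustrative).
def _valid(n, frac):
    return (n is None and frac is not None) or (n is not None and frac is None)

assert _valid(10, None) and _valid(None, 0.5)
assert not _valid(None, None) and not _valid(10, 0.5)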
def process(self, dfs: DataFrames) -> DataFrame:
    assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
    if_exists = self.params.get("if_exists", False)
    columns = self.params.get_or_throw("columns", list)
    if if_exists:
        columns = set(columns).intersection(dfs[0].schema.keys())
    return dfs[0].drop(list(columns))
def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
    data = df.as_array(type_safe=True)
    assert_or_throw(
        len(data) == 1,
        FugueBug("each comap partition can have one and only one row"),
    )
    dfs = DataFrames(list(self._get_dfs(data[0])))
    return self.func(cursor, dfs)
def _append(self, value: Any):
    assert_or_throw(
        not self.has_key,
        InvalidOperationError("can't append to a DataFrames that has keys"),
    )
    assert_or_throw(
        isinstance(value, DataFrame),
        lambda: ValueError(f"{value} is not a DataFrame"),
    )
    super().__setitem__("_" + str(len(self)), value)
def process(self, dfs: DataFrames) -> DataFrame:
    assert_or_throw(len(dfs) == 1, FugueWorkflowError("not single input"))
    columns = self.params.get_or_throw("columns", ColumnsSelect)
    where = None if "where" not in self.params else self.params["where"]
    having = None if "having" not in self.params else self.params["having"]
    return self.execution_engine.select(
        df=dfs[0], cols=columns, where=where, having=having
    )
def get_temp_file(self, file_id: str, permanent: bool) -> str:
    path = self._path if permanent else self._temp_path
    assert_or_throw(
        path != "",
        FugueWorkflowRuntimeError(
            f"{FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH} is not set"
        ),
    )
    return os.path.join(path, file_id + ".parquet")