def to_local_df(df: Any, schema: Any = None, metadata: Any = None) -> LocalDataFrame:
    """Convert a data structure to :class:`~fugue.dataframe.dataframe.LocalDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFrame,
      or a list or iterable of arrays
    :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
      :class:`~fugue.dataframe.dataframe.DataFrame` type
    :param metadata: dict-like object with string keys, defaults to None
    :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
      but you set ``schema`` or ``metadata``
    :raises TypeError: if ``df`` is not compatible
    :return: the dataframe itself if it's
      :class:`~fugue.dataframe.dataframe.LocalDataFrame` else a converted one

    :Examples:

    >>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
    >>> assert to_local_df(a) is a
    >>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
    """
    assert_arg_not_none(df, "df")
    if isinstance(df, DataFrame):
        aot(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        return df.as_local()
    if isinstance(df, pd.DataFrame):
        return PandasDataFrame(df, schema, metadata)
    if isinstance(df, List):
        return ArrayDataFrame(df, schema, metadata)
    if isinstance(df, Iterable):
        return IterableDataFrame(df, schema, metadata)
    raise TypeError(f"{df} cannot convert to a LocalDataFrame")
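
# Hedged usage sketch (not part of the original module): shows the three main
# conversion paths of to_local_df. It assumes pandas is imported as ``pd`` and
# the fugue classes referenced above are importable; the helper name below is
# hypothetical and for illustration only.
def _to_local_df_usage_example() -> None:
    pdf = pd.DataFrame([[0, "a"], [1, "b"]], columns=["a", "b"])
    from_pandas = to_local_df(pdf, "a:int,b:str")  # -> PandasDataFrame
    from_list = to_local_df([[0, "a"], [1, "b"]], "a:int,b:str")  # -> ArrayDataFrame
    assert to_local_df(from_list) is from_list  # LocalDataFrame passes through as-is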
def to_taskspec(
    obj: Any, parent_workflow_spec: Optional[WorkflowSpec] = None
) -> TaskSpec:
    assert_arg_not_none(obj, "obj")
    if isinstance(obj, str):
        return to_taskspec(json.loads(obj))
    if isinstance(obj, TaskSpec):
        return obj
    if isinstance(obj, Dict):
        d: Dict[str, Any] = dict(obj)
        node_spec: Optional[_NodeSpec] = None
        if "node_spec" in d:
            aot(
                parent_workflow_spec is not None,
                lambda: InvalidOperationError("parent workflow must be set"),
            )
            node_spec = _NodeSpec(
                workflow=parent_workflow_spec, **d["node_spec"]  # type: ignore
            )
            del d["node_spec"]
        if "tasks" in d:
            ts: TaskSpec = WorkflowSpec(**d)
        else:
            ts = TaskSpec(**d)
        if node_spec is not None:
            ts._node_spec = node_spec
        return ts
    raise TypeError(f"can't convert {obj} to TaskSpec")  # pragma: no cover
def __init__(self, *args: Any, **kwargs: Any):
    p = ParamDict()
    for a in args:
        if a is None:
            continue
        elif isinstance(a, PartitionSpec):
            self._update_dict(p, a.jsondict)
        elif isinstance(a, Dict):
            self._update_dict(p, a)
        elif isinstance(a, str):
            self._update_dict(p, json.loads(a))
        else:
            raise TypeError(f"{a} is not supported")
    self._update_dict(p, kwargs)
    self._num_partitions = p.get("num_partitions", "0")
    self._algo = p.get("algo", "").lower()
    self._partition_by = p.get("partition_by", [])
    aot(
        len(self._partition_by) == len(set(self._partition_by)),
        SyntaxError(f"{self._partition_by} has duplicated keys"),
    )
    self._presort = self._parse_presort_exp(p.get_or_none("presort", object))
    if any(x in self._presort for x in self._partition_by):
        raise SyntaxError(
            "partition by overlap with presort: "
            + f"{self._partition_by}, {self._presort}"
        )
    # TODO: currently, size limit not in use
    self._size_limit = to_size(p.get("size_limit", "0"))
    self._row_limit = p.get("row_limit", 0)
def _parse_spec(self, obj: Any, to_type: Type[T]) -> T:
    if isinstance(obj, to_type):
        return obj
    if isinstance(obj, str):
        obj = json.loads(obj)
    aot(isinstance(obj, dict), lambda: f"{obj} is not dict")
    return to_type(**obj)
def get(self) -> Any:
    if self.dependency is not None:
        return self.dependency.get()  # type:ignore
    if not self.is_set:
        aot(not self.spec.required, lambda: f"{self} is required but not set")
        return self.spec.default_value
    return self.value
def validate_value(self, obj: Any) -> Any:
    if obj is not None:
        aot(
            isinstance(obj, self.data_type),
            lambda: TypeError(f"{obj} mismatches type {self.paramdict}"),
        )
        return obj
    aot(self.nullable, lambda: f"Can't set None to {self}")
    return obj
def _parse_spec_collection(
    self, obj: Any, to_type: Type[T]
) -> IndexedOrderedDict[str, T]:
    res: IndexedOrderedDict[str, T] = IndexedOrderedDict()
    if obj is None:
        return res
    aot(isinstance(obj, List), "Spec collection must be a list")
    for v in obj:
        s = self._parse_spec(v, to_type)
        aot(s.name not in res, KeyError(f"Duplicated key {s.name}"))
        res[s.name] = s
    return res
def add_task(
    self,
    name: str,
    task: Any,
    dependency: Optional[Dict[str, str]] = None,
    config: Optional[Dict[str, Any]] = None,
    config_dependency: Optional[Dict[str, str]] = None,
) -> TaskSpec:
    _t = to_taskspec(task)
    aot(_t._node_spec is None, "node_spec must not be set")
    _t._node_spec = _NodeSpec(self, name, dependency, config, config_dependency)
    return self._append_task(_t)
def validate_spec(self, spec: "OutputSpec") -> "OutputSpec":
    if not self.nullable:
        aot(
            not spec.nullable,
            lambda: TypeError(f"{self} - {spec} are not compatible on nullable"),
        )
    aot(
        issubclass(spec.data_type, self.data_type),
        lambda: TypeError(f"{self} - {spec} are not compatible on data_type"),
    )
    return spec
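
# Hedged sketch (not part of the original module): how validate_spec decides
# compatibility between two specs. It assumes OutputSpec can be constructed as
# OutputSpec(name, data_type, nullable, metadata), mirroring the
# super().__init__ call in the initializers later in this file; the helper
# name is hypothetical and for illustration only.
def _validate_spec_usage_example() -> None:
    upstream = OutputSpec("x", int, False, None)      # produces a non-null int
    downstream = OutputSpec("x", object, True, None)  # accepts any nullable value
    downstream.validate_spec(upstream)  # ok: int is a subclass of object
    # upstream.validate_spec(downstream) would raise: nullable mismatch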
def link(self, output: str, to_expr: str):
    try:
        aot(
            output in self.outputs,
            lambda: f"{output} is not an output of the workflow",
        )
        aot(
            output not in self.internal_dependency,
            lambda: f"{output} is already defined",
        )
        t = to_expr.split(".", 1)
        if len(t) == 1:
            aot(
                t[0] in self.inputs,
                lambda: f"{t[0]} is not an input of the workflow",
            )
            self.outputs[output].validate_spec(self.inputs[t[0]])
        else:  # len(t) == 2
            node = self.tasks[t[0]]
            aot(t[1] in node.outputs, lambda: f"{t[1]} is not an output of {node}")
            self.outputs[output].validate_spec(node.outputs[t[1]])
        self.internal_dependency[output] = to_expr
    except Exception as e:
        raise DependencyDefinitionError(e)
def _ensure_fully_connected(self) -> None:  # pragma: no cover
    """By design, this should only be called when the workflow is fully
    connected; if it fails, it means there is a bug in the framework itself.
    """
    for k, v in self.configs.items():
        try:
            v.get()
        except Exception:
            raise WorkflowBug(f"BUG: config {k}'s value or dependency is not set")
    for k, vv in self.inputs.items():
        aot(
            vv.dependency is not None,
            lambda: WorkflowBug(f"BUG: input {k}'s dependency is not set"),
        )
def _make_top_level_workflow(
    spec: WorkflowSpec, ctx: WorkflowContext, configs: Dict[str, Any]
) -> _Workflow:
    aot(
        len(spec.inputs) == 0,
        InvalidOperationError("Can't have inputs for top level workflow"),
    )
    wf = _Workflow(spec, ctx)
    for k, vv in configs.items():
        wf.configs[k].set(vv)
    for k, v in wf.configs.items():
        try:
            v.get()
        except Exception:
            raise InvalidOperationError(
                f"config {k}'s value is not set for top level workflow"
            )
    wf._init_tasks()
    return wf
def __init__(self, *args: Any, **kwargs: Any):  # noqa: C901
    p = ParamDict()
    if (
        len(args) == 1
        and len(kwargs) == 0
        and isinstance(args[0], str)
        and args[0].lower() == "per_row"
    ):
        p["algo"] = "even"
        p["num_partitions"] = "ROWCOUNT"
    else:
        for a in args:
            if a is None:
                continue
            elif isinstance(a, PartitionSpec):
                self._update_dict(p, a.jsondict)
            elif isinstance(a, Dict):
                self._update_dict(p, a)
            elif isinstance(a, str):
                self._update_dict(p, json.loads(a))
            else:
                raise TypeError(f"{a} is not supported")
        self._update_dict(p, kwargs)
    self._num_partitions = p.get("num_partitions", "0")
    self._algo = p.get("algo", "").lower()
    if "partition_by" not in p:
        self._partition_by: List[str] = []
    elif isinstance(p["partition_by"], str):
        self._partition_by = [p["partition_by"]]
    elif isinstance(p["partition_by"], (list, tuple)):
        self._partition_by = list(p["partition_by"])
    else:
        raise SyntaxError(p["partition_by"])
    aot(
        len(self._partition_by) == len(set(self._partition_by)),
        SyntaxError(f"{self._partition_by} has duplicated keys"),
    )
    self._presort = parse_presort_exp(p.get_or_none("presort", object))
    if any(x in self._presort for x in self._partition_by):
        raise SyntaxError(
            "partition by overlap with presort: "
            + f"{self._partition_by}, {self._presort}"
        )
    # TODO: currently, size limit not in use
    self._size_limit = to_size(p.get("size_limit", "0"))
    self._row_limit = p.get("row_limit", 0)
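
# Hedged usage sketch (not part of the original module): a few equivalent ways
# to construct a PartitionSpec based on the parsing logic above (kwargs, JSON
# string, and dict all feed the same ParamDict), plus the "per_row" shortcut.
# The helper name is hypothetical and for illustration only.
def _partition_spec_usage_example() -> None:
    p1 = PartitionSpec(by=["a"], presort="b desc")
    p2 = PartitionSpec('{"partition_by": ["a"], "presort": "b desc"}')
    p3 = PartitionSpec(dict(partition_by=["a"]), presort="b desc")
    assert p1.partition_by == p2.partition_by == p3.partition_by == ["a"]
    assert p1.presort == {"b": False}  # "desc" parses to False
    # the "per_row" shortcut sets algo="even" and num_partitions="ROWCOUNT"
    per_row = PartitionSpec("per_row")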
def __init__(
    self,
    name: str,
    data_type: Any,
    nullable: bool,
    required: bool = True,
    default_value: Any = None,
    metadata: Any = None,
):
    super().__init__(name, data_type, nullable, metadata)
    self.required = required
    self.default_value = default_value
    if required:
        aot(default_value is None, "required var can't have default_value")
    elif default_value is None:
        aot(nullable, "default_value can't be None because it's not nullable")
    else:
        self.default_value = as_type(self.default_value, self.data_type)
def __init__(
    self,
    name: str,
    data_type: Any,
    nullable: bool,
    required: bool = True,
    default_value: Any = None,
    timeout: Any = 0,
    default_on_timeout: bool = False,
    metadata: Any = None,
):
    super().__init__(name, data_type, nullable, required, default_value, metadata)
    self.timeout = to_timedelta(timeout).total_seconds()
    self.default_on_timeout = default_on_timeout
    aot(self.timeout >= 0, "timeout can't be negative")
    if required:
        aot(not default_on_timeout, "default is not allowed for required input")
def _append_task(self, task: TaskSpec) -> TaskSpec:
    name = task.name
    assert_triad_var_name(name)
    aot(
        name not in self.tasks,
        lambda: KeyError(f"{name} already exists in workflow"),
    )
    aot(
        task.parent_workflow is self,
        lambda: InvalidOperationError(f"{task} has mismatching node_spec"),
    )
    try:
        task._validate_config()
        task._validate_dependency()
    except DependencyDefinitionError:
        raise
    except Exception as e:
        raise DependencyDefinitionError(e)
    self.tasks[name] = task
    return task
def get_sorts(self, schema: Schema) -> IndexedOrderedDict[str, bool]:
    """Get keys for sorting in a partition; it's the combination of partition
    keys plus the presort keys

    :param schema: the dataframe schema this partition spec to operate on
    :return: an ordered dictionary of key, order pairs

    :Example:

    >>> p = PartitionSpec(by=["a"],presort="b , c dESc")
    >>> schema = Schema("a:int,b:int,c:int,d:int")
    >>> assert p.get_sorts(schema) == {"a":True, "b":True, "c": False}
    """
    d: IndexedOrderedDict[str, bool] = IndexedOrderedDict()
    for p in self.partition_by:
        aot(p in schema, KeyError(f"{p} not in {schema}"))
        d[p] = True
    for p, v in self.presort.items():
        aot(p in schema, KeyError(f"{p} not in {schema}"))
        d[p] = v
    return d
def _validate_dependency(self):
    if set(self.node_spec.dependency.keys()) != set(self.inputs.keys()):
        raise DependencyNotDefinedError(
            self.name + " input",
            self.inputs.keys(),
            self.node_spec.dependency.keys(),
        )
    for k, v in self.node_spec.dependency.items():
        t = v.split(".", 1)
        if len(t) == 1:
            aot(
                t[0] in self.parent_workflow.inputs,
                lambda: f"{t[0]} is not an input of the workflow",
            )
            self.inputs[k].validate_spec(self.parent_workflow.inputs[t[0]])
        else:  # len(t) == 2
            aot(
                t[0] != self.name,
                lambda: f"{v} tries to connect to self node {self.name}",
            )
            task = self.parent_workflow.tasks[t[0]]
            aot(
                t[1] in task.outputs,
                lambda: f"{t[1]} is not an output of node {task.name}",
            )
            self.inputs[k].validate_spec(task.outputs[t[1]])
def _validate_config(self):
    for k, v in self.node_spec.config.items():
        self.configs[k].validate_value(v)
    defined = set(self.node_spec.config.keys())
    for k, t in self.node_spec.config_dependency.items():
        aot(
            k not in defined,
            lambda: f"can't redefine config {k} in node {self.name}",
        )
        defined.add(k)
        aot(
            t in self.parent_workflow.configs,
            lambda: f"{t} is not a config of the workflow",
        )
        self.configs[k].validate_spec(self.parent_workflow.configs[t])
    for k in set(self.configs.keys()).difference(defined):
        aot(
            not self.configs[k].required,
            lambda: f"config {k} in node {self.name} is required but not defined",
        )
def get_join_schemas(
    df1: DataFrame, df2: DataFrame, how: str, on: Iterable[str]
) -> Tuple[Schema, Schema]:
    """Get :class:`~triad:triad.collections.schema.Schema` object after joining
    ``df1`` and ``df2``. If ``on`` is not empty, it is used mainly for
    validation purposes.

    :param df1: first dataframe
    :param df2: second dataframe
    :param how: can accept ``semi``, ``left_semi``, ``anti``, ``left_anti``,
      ``inner``, ``left_outer``, ``right_outer``, ``full_outer``, ``cross``
    :param on: it can always be inferred, but if you provide it, it will be
      validated against the inferred keys.
    :return: the pair of the key schema and the schema after join

    .. note::

        In Fugue, the joined schema can always be inferred because Fugue always
        uses the input dataframes' common keys as the join keys. So make sure to
        :meth:`~fugue.dataframe.dataframe.DataFrame.rename` the input dataframes
        so they follow this rule.
    """
    assert_arg_not_none(how, "how")
    how = how.lower()
    aot(
        how
        in [
            "semi",
            "left_semi",
            "anti",
            "left_anti",
            "inner",
            "left_outer",
            "right_outer",
            "full_outer",
            "cross",
        ],
        ValueError(f"{how} is not a valid join type"),
    )
    on = list(on)
    aot(len(on) == len(set(on)), f"{on} has duplication")
    if how != "cross" and len(on) == 0:
        on = list(df1.schema.intersect(df2.schema.names).names)
        aot(
            len(on) > 0,
            lambda: SchemaError(
                f"no common columns between {df1.schema} and {df2.schema}"
            ),
        )
    schema2 = df2.schema
    aot(
        how != "outer",
        ValueError(
            "'how' must use left_outer, right_outer, full_outer for outer joins"
        ),
    )
    if how in ["semi", "left_semi", "anti", "left_anti"]:
        schema2 = schema2.extract(on)
    aot(
        on in df1.schema and on in schema2,
        lambda: SchemaError(
            f"{on} is not the intersection of {df1.schema} & {df2.schema}"
        ),
    )
    cm = df1.schema.intersect(on)
    if how == "cross":
        aot(
            len(df1.schema.intersect(schema2)) == 0,
            SchemaError("can't specify on for cross join"),
        )
    else:
        aot(len(on) > 0, SchemaError("on must be specified"))
    return cm, (df1.schema.union(schema2))
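
# Hedged usage sketch (not part of the original module): join key inference for
# an inner join, using ArrayDataFrame from this codebase. The helper name is
# hypothetical and for illustration only; the expected schemas follow the
# intersect/union logic above.
def _get_join_schemas_usage_example() -> None:
    df1 = ArrayDataFrame([[0, "x"]], "a:int,b:str")
    df2 = ArrayDataFrame([[0, 1.0]], "a:int,c:double")
    key_schema, joined_schema = get_join_schemas(df1, df2, how="inner", on=[])
    assert key_schema == "a:int"  # the common column is inferred as the join key
    assert joined_schema == "a:int,b:str,c:double"  # union of the two schemas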
def parse_presort_exp(presort: Any) -> IndexedOrderedDict[str, bool]:  # noqa: C901
    """Returns ordered column sorting direction where ascending order
    would return as true, and descending as false.

    :param presort: string that contains column and sorting direction, or
      list of tuples that contain column and boolean sorting direction
    :type presort: Any
    :return: column and boolean sorting direction of column, order matters.
    :rtype: IndexedOrderedDict[str, bool]

    :Example:

    >>> parse_presort_exp("b asc, c desc")
    >>> parse_presort_exp([("b", True), ("c", False)])

    Both return IndexedOrderedDict([("b", True), ("c", False)])
    """
    if isinstance(presort, IndexedOrderedDict):
        return presort
    presort_list: List[Tuple[str, bool]] = []
    res: IndexedOrderedDict[str, bool] = IndexedOrderedDict()
    if presort is None:
        return res
    elif isinstance(presort, str):
        presort = presort.strip()
        if presort == "":
            return res
        for p in presort.split(","):
            pp = p.strip().split()
            key = pp[0].strip()
            if len(pp) == 1:
                presort_list.append((key, True))
            elif len(pp) == 2:
                if pp[1].strip().lower() == "asc":
                    presort_list.append((key, True))
                elif pp[1].strip().lower() == "desc":
                    presort_list.append((key, False))
                else:
                    raise SyntaxError(f"Invalid expression {presort}")
            else:
                raise SyntaxError(f"Invalid expression {presort}")
    elif isinstance(presort, list):
        for p in presort:
            if isinstance(p, str):
                aot(
                    len(p.strip().split()) == 1,
                    SyntaxError(f"Invalid expression {presort}"),
                )
                presort_list.append((p.strip(), True))
            else:
                aot(len(p) == 2, SyntaxError(f"Invalid expression {presort}"))
                aot(
                    isinstance(p, tuple)
                    and isinstance(p[0], str)
                    and isinstance(p[1], bool),
                    SyntaxError(f"Invalid expression {presort}"),
                )
                presort_list.append((p[0].strip(), p[1]))
    for key, value in presort_list:
        if key in res:
            raise SyntaxError(f"Invalid expression {presort} duplicated key {key}")
        res[key] = value
    return res
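
# Hedged usage sketch (not part of the original module): the accepted presort
# forms, per the parsing logic above. The helper name is hypothetical and for
# illustration only.
def _parse_presort_exp_usage_example() -> None:
    assert parse_presort_exp("b") == {"b": True}  # bare column defaults to ascending
    assert parse_presort_exp("b asc, c desc") == {"b": True, "c": False}
    assert parse_presort_exp([("b", True), ("c", False)]) == {"b": True, "c": False}
    assert parse_presort_exp(None) == {}  # None and "" both yield an empty dict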
def validate_value(self, obj: Any) -> Any:
    if obj is not None:
        return super().validate_value(obj)
    aot(self.nullable, lambda: f"Can't set None to {self.paramdict}")
    return obj
def validate_dependency(self, other: "_Dependency") -> None:
    aot(
        isinstance(other, _ConfigVar),
        lambda: TypeError(f"{other} is not Input or Output"),
    )
    self.spec.validate_spec(other.spec)  # type:ignore