Example #1
def to_local_df(df: Any, schema: Any = None, metadata: Any = None) -> LocalDataFrame:
    """Convert a data structure to :class:`~fugue.dataframe.dataframe.LocalDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFrame, or a
      list or iterable of arrays
    :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
      :class:`~fugue.dataframe.dataframe.DataFrame` type
    :param metadata: dict-like object with string keys, defaults to None
    :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
      but you set ``schema`` or ``metadata``
    :raises TypeError: if ``df`` is not compatible
    :return: the dataframe itself if it's
      :class:`~fugue.dataframe.dataframe.LocalDataFrame` else a converted one

    :Examples:

    >>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
    >>> assert to_local_df(a) is a
    >>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
    """
    assert_arg_not_none(df, "df")
    if isinstance(df, DataFrame):
        aot(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        return df.as_local()
    if isinstance(df, pd.DataFrame):
        return PandasDataFrame(df, schema, metadata)
    if isinstance(df, List):
        return ArrayDataFrame(df, schema, metadata)
    if isinstance(df, Iterable):
        return IterableDataFrame(df, schema, metadata)
    raise TypeError(f"{df} cannot convert to a LocalDataFrame")
Example #2
def to_taskspec(
        obj: Any,
        parent_workflow_spec: Optional[WorkflowSpec] = None) -> TaskSpec:
    assert_arg_not_none(obj, "obj")
    if isinstance(obj, str):
        return to_taskspec(json.loads(obj))
    if isinstance(obj, TaskSpec):
        return obj
    if isinstance(obj, Dict):
        d: Dict[str, Any] = dict(obj)
        node_spec: Optional[_NodeSpec] = None
        if "node_spec" in d:
            aot(
                parent_workflow_spec is not None,
                lambda: InvalidOperationError("parent workflow must be set"),
            )
            node_spec = _NodeSpec(
                workflow=parent_workflow_spec,
                **d["node_spec"]  # type: ignore
            )
            del d["node_spec"]
        if "tasks" in d:
            ts: TaskSpec = WorkflowSpec(**d)
        else:
            ts = TaskSpec(**d)
        if node_spec is not None:
            ts._node_spec = node_spec
        return ts
    raise TypeError(f"can't convert {obj} to TaskSpec")  # pragma: no cover
Example #3
 def __init__(self, *args: Any, **kwargs: Any):
     p = ParamDict()
     for a in args:
         if a is None:
             continue
         elif isinstance(a, PartitionSpec):
             self._update_dict(p, a.jsondict)
         elif isinstance(a, Dict):
             self._update_dict(p, a)
         elif isinstance(a, str):
             self._update_dict(p, json.loads(a))
         else:
             raise TypeError(f"{a} is not supported")
     self._update_dict(p, kwargs)
     self._num_partitions = p.get("num_partitions", "0")
     self._algo = p.get("algo", "").lower()
     self._partition_by = p.get("partition_by", [])
     aot(
         len(self._partition_by) == len(set(self._partition_by)),
         SyntaxError(f"{self._partition_by} has duplicated keys"),
     )
     self._presort = self._parse_presort_exp(p.get_or_none("presort", object))
     if any(x in self._presort for x in self._partition_by):
         raise SyntaxError(
             "partition by overlap with presort: "
             + f"{self._partition_by}, {self._presort}"
         )
     # TODO: currently, size limit not in use
     self._size_limit = to_size(p.get("size_limit", "0"))
     self._row_limit = p.get("row_limit", 0)
Example #4
 def _parse_spec(self, obj: Any, to_type: Type[T]) -> T:
     if isinstance(obj, to_type):
         return obj
     if isinstance(obj, str):
         obj = json.loads(obj)
     aot(isinstance(obj, dict), lambda: f"{obj} is not dict")
     return to_type(**obj)
Example #5
 def get(self) -> Any:
     if self.dependency is not None:
         return self.dependency.get()  # type:ignore
     if not self.is_set:
         aot(not self.spec.required,
             lambda: f"{self} is required but not set")
         return self.spec.default_value
     return self.value
Example #6
 def validate_value(self, obj: Any) -> Any:
     if obj is not None:
         aot(
             isinstance(obj, self.data_type),
             lambda: TypeError(f"{obj} mismatches type {self.paramdict}"),
         )
         return obj
     aot(self.nullable, lambda: f"Can't set None to {self}")
     return obj
Example #7
 def _parse_spec_collection(self, obj: Any,
                            to_type: Type[T]) -> IndexedOrderedDict[str, T]:
     res: IndexedOrderedDict[str, T] = IndexedOrderedDict()
     if obj is None:
         return res
     aot(isinstance(obj, List), "Spec collection must be a list")
     for v in obj:
         s = self._parse_spec(v, to_type)
         aot(s.name not in res, KeyError(f"Duplicated key {s.name}"))
         res[s.name] = s
     return res
Example #8
 def add_task(
     self,
     name: str,
     task: Any,
     dependency: Optional[Dict[str, str]] = None,
     config: Optional[Dict[str, Any]] = None,
     config_dependency: Optional[Dict[str, str]] = None,
 ) -> TaskSpec:
     _t = to_taskspec(task)
     aot(_t._node_spec is None, "node_spec must not be set")
     _t._node_spec = _NodeSpec(self, name, dependency, config,
                               config_dependency)
     return self._append_task(_t)
Example #9
 def validate_spec(self, spec: "OutputSpec") -> "OutputSpec":
     if not self.nullable:
         aot(
             not spec.nullable,
             lambda: TypeError(
                 f"{self} - {spec} are not compatible on nullable"),
         )
     aot(
         issubclass(spec.data_type, self.data_type),
         lambda: TypeError(
             f"{self} - {spec} are not compatible on data_type"),
     )
     return spec
Example #10
 def link(self, output: str, to_expr: str):
     try:
         aot(
             output in self.outputs,
             lambda: f"{output} is not an output of the workflow",
         )
         aot(
             output not in self.internal_dependency,
             lambda: f"{output} is already defined",
         )
         t = to_expr.split(".", 1)
         if len(t) == 1:
             aot(
                 t[0] in self.inputs,
                 lambda: f"{t[0]} is not an input of the workflow",
             )
             self.outputs[output].validate_spec(self.inputs[t[0]])
         else:  # len(t) == 2
             node = self.tasks[t[0]]
             aot(t[1] in node.outputs,
                 lambda: f"{t[1]} is not an output of {node}")
             self.outputs[output].validate_spec(node.outputs[t[1]])
         self.internal_dependency[output] = to_expr
     except Exception as e:
         raise DependencyDefinitionError(e)
Example #11
 def _ensure_fully_connected(self) -> None:  # pragma: no cover
     """By design, this should be called always when fully connected,
     but if this failed, it means there is a bug in the framework itself.
     """
     for k, v in self.configs.items():
         try:
             v.get()
         except Exception:
             raise WorkflowBug(
                 f"BUG: config {k}'s value or dependency is not set")
     for k, vv in self.inputs.items():
         aot(
             vv.dependency is not None,
             lambda: WorkflowBug(f"BUG: input {k}'s dependency is not set"),
         )
Example #12
def _make_top_level_workflow(spec: WorkflowSpec, ctx: WorkflowContext,
                             configs: Dict[str, Any]) -> _Workflow:
    aot(
        len(spec.inputs) == 0,
        InvalidOperationError("Can't have inputs for top level workflow"),
    )
    wf = _Workflow(spec, ctx)
    for k, vv in configs.items():
        wf.configs[k].set(vv)
    for k, v in wf.configs.items():
        try:
            v.get()
        except Exception:
            raise InvalidOperationError(
                f"config {k}'s value is not set for top level workflow")
    wf._init_tasks()
    return wf
Example #13
File: partition.py  Project: gityow/fugue
 def __init__(self, *args: Any, **kwargs: Any):  # noqa: C901
     p = ParamDict()
     if (
         len(args) == 1
         and len(kwargs) == 0
         and isinstance(args[0], str)
         and args[0].lower() == "per_row"
     ):
         p["algo"] = "even"
         p["num_partitions"] = "ROWCOUNT"
     else:
         for a in args:
             if a is None:
                 continue
             elif isinstance(a, PartitionSpec):
                 self._update_dict(p, a.jsondict)
             elif isinstance(a, Dict):
                 self._update_dict(p, a)
             elif isinstance(a, str):
                 self._update_dict(p, json.loads(a))
             else:
                 raise TypeError(f"{a} is not supported")
         self._update_dict(p, kwargs)
     self._num_partitions = p.get("num_partitions", "0")
     self._algo = p.get("algo", "").lower()
     if "partition_by" not in p:
         self._partition_by: List[str] = []
     elif isinstance(p["partition_by"], str):
         self._partition_by = [p["partition_by"]]
     elif isinstance(p["partition_by"], (list, tuple)):
         self._partition_by = list(p["partition_by"])
     else:
         raise SyntaxError(p["partition_by"])
     aot(
         len(self._partition_by) == len(set(self._partition_by)),
         SyntaxError(f"{self._partition_by} has duplicated keys"),
     )
     self._presort = parse_presort_exp(p.get_or_none("presort", object))
     if any(x in self._presort for x in self._partition_by):
         raise SyntaxError(
             "partition by overlap with presort: "
             + f"{self._partition_by}, {self._presort}"
         )
     # TODO: currently, size limit not in use
     self._size_limit = to_size(p.get("size_limit", "0"))
     self._row_limit = p.get("row_limit", 0)
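For reference, a hedged usage sketch of the constructor above, based on the keys it reads (algo, num_partitions, partition_by, presort); the import path is an assumption and may differ between fugue versions.

# Hedged usage sketch; the import path is an assumption.
from fugue.collections.partition import PartitionSpec

# The "per_row" shortcut maps to algo="even" with num_partitions="ROWCOUNT".
p1 = PartitionSpec("per_row")

# Keyword form: values are collected into a ParamDict and then validated.
p2 = PartitionSpec(partition_by=["a"], presort="b, c desc", num_partitions=4)

# Specs can be merged; later arguments override earlier ones.
p3 = PartitionSpec(p2, {"algo": "hash"})

# Duplicated partition keys, or overlap between partition_by and presort,
# raise SyntaxError:
#   PartitionSpec(partition_by=["a", "a"])
#   PartitionSpec(partition_by=["a"], presort="a")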
Example #14
 def __init__(
     self,
     name: str,
     data_type: Any,
     nullable: bool,
     required: bool = True,
     default_value: Any = None,
     metadata: Any = None,
 ):
     super().__init__(name, data_type, nullable, metadata)
     self.required = required
     self.default_value = default_value
     if required:
         aot(default_value is None, "required var can't have default_value")
     elif default_value is None:
         aot(nullable,
             "default_value can't be None because it's not nullable")
     else:
         self.default_value = as_type(self.default_value, self.data_type)
Example #15
 def __init__(
     self,
     name: str,
     data_type: Any,
     nullable: bool,
     required: bool = True,
     default_value: Any = None,
     timeout: Any = 0,
     default_on_timeout: bool = False,
     metadata: Any = None,
 ):
     super().__init__(name, data_type, nullable, required, default_value,
                      metadata)
     self.timeout = to_timedelta(timeout).total_seconds()
     self.default_on_timeout = default_on_timeout
     aot(self.timeout >= 0, "timeout can't be negative")
     if required:
         aot(not default_on_timeout,
             "default is not allowed for required input")
Example #16
 def _append_task(self, task: TaskSpec) -> TaskSpec:
     name = task.name
     assert_triad_var_name(name)
     aot(
         name not in self.tasks,
         lambda: KeyError(f"{name} already exists in workflow"),
     )
     aot(
         task.parent_workflow is self,
         lambda: InvalidOperationError(f"{task} has mismatching node_spec"),
     )
     try:
         task._validate_config()
         task._validate_dependency()
     except DependencyDefinitionError:
         raise
     except Exception as e:
         raise DependencyDefinitionError(e)
     self.tasks[name] = task
     return task
Example #17
    def get_sorts(self, schema: Schema) -> IndexedOrderedDict[str, bool]:
        """Get keys for sorting in a partition, it's the combination of partition
        keys plus the presort keys

        :param schema: the dataframe schema this partition spec to operate on
        :return: an ordered dictionary of key, order pairs

        :Example:

        >>> p = PartitionSpec(by=["a"],presort="b , c dESc")
        >>> schema = Schema("a:int,b:int,c:int,d:int")
        >>> assert p.get_sorts(schema) == {"a":True, "b":True, "c": False}
        """
        d: IndexedOrderedDict[str, bool] = IndexedOrderedDict()
        for p in self.partition_by:
            aot(p in schema, KeyError(f"{p} not in {schema}"))
            d[p] = True
        for p, v in self.presort.items():
            aot(p in schema, KeyError(f"{p} not in {schema}"))
            d[p] = v
        return d
Example #18
 def _validate_dependency(self):
     if set(self.node_spec.dependency.keys()) != set(self.inputs.keys()):
         raise DependencyNotDefinedError(
             self.name + " input",
             self.inputs.keys(),
             self.node_spec.dependency.keys(),
         )
     for k, v in self.node_spec.dependency.items():
         t = v.split(".", 1)
         if len(t) == 1:
             aot(
                 t[0] in self.parent_workflow.inputs,
                 lambda: f"{t[0]} is not an input of the workflow",
             )
             self.inputs[k].validate_spec(self.parent_workflow.inputs[t[0]])
         else:  # len(t) == 2
             aot(
                 t[0] != self.name,
                 lambda: f"{v} tries to connect to self node {self.name}",
             )
             task = self.parent_workflow.tasks[t[0]]
             aot(
                 t[1] in task.outputs,
                 lambda: f"{t[1]} is not an output of node {task.name}",
             )
             self.inputs[k].validate_spec(task.outputs[t[1]])
Example #19
 def _validate_config(self):
     for k, v in self.node_spec.config.items():
         self.configs[k].validate_value(v)
     defined = set(self.node_spec.config.keys())
     for k, t in self.node_spec.config_dependency.items():
         aot(
             k not in defined,
             lambda: f"can't redefine config {k} in node {self.name}",
         )
         defined.add(k)
         aot(
             t in self.parent_workflow.configs,
             lambda: f"{t} is not a config of the workflow",
         )
         self.configs[k].validate_spec(self.parent_workflow.configs[t])
     for k in set(self.configs.keys()).difference(defined):
         aot(
             not self.configs[k].required,
             lambda:
             f"config {k} in node {self.name} is required but not defined",
         )
Example #20
def get_join_schemas(df1: DataFrame, df2: DataFrame, how: str,
                     on: Iterable[str]) -> Tuple[Schema, Schema]:
    """Get :class:`~triad:triad.collections.schema.Schema` object after
    joining ``df1`` and ``df2``. If ``on`` is not empty, it is mainly for
    validation purposes.

    :param df1: first dataframe
    :param df2: second dataframe
    :param how: can accept ``semi``, ``left_semi``, ``anti``, ``left_anti``,
      ``inner``, ``left_outer``, ``right_outer``, ``full_outer``, ``cross``
    :param on: it can always be inferred, but if you provide it, it will be
      validated against the inferred keys.
    :return: the pair of the key schema and the schema after the join

    .. note::

        In Fugue, the joined schema can always be inferred because Fugue always uses
        the input dataframes' common keys as the join keys. So you must make sure to
        :meth:`~fugue.dataframe.dataframe.DataFrame.rename` the input dataframes so
        that they follow this rule.
    """
    assert_arg_not_none(how, "how")
    how = how.lower()
    aot(
        how in [
            "semi",
            "left_semi",
            "anti",
            "left_anti",
            "inner",
            "left_outer",
            "right_outer",
            "full_outer",
            "cross",
        ],
        ValueError(f"{how} is not a valid join type"),
    )
    on = list(on)
    aot(len(on) == len(set(on)), f"{on} has duplication")
    if how != "cross" and len(on) == 0:
        on = list(df1.schema.intersect(df2.schema.names).names)
        aot(
            len(on) > 0,
            lambda: SchemaError(
                f"no common columns between {df1.schema} and {df2.schema}"),
        )
    schema2 = df2.schema
    aot(
        how != "outer",
        ValueError(
            "'how' must use left_outer, right_outer, full_outer for outer joins"
        ),
    )
    if how in ["semi", "left_semi", "anti", "left_anti"]:
        schema2 = schema2.extract(on)
    aot(
        on in df1.schema and on in schema2,
        lambda: SchemaError(
            f"{on} is not the intersection of {df1.schema} & {df2.schema}"),
    )
    cm = df1.schema.intersect(on)
    if how == "cross":
        aot(
            len(df1.schema.intersect(schema2)) == 0,
            SchemaError("can't specify on for cross join"),
        )
    else:
        aot(len(on) > 0, SchemaError("on must be specified"))
    return cm, (df1.schema.union(schema2))
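A hedged usage sketch of the inference path above; the imports and the string comparison of schemas are assumptions about the fugue/triad versions these snippets come from.

# Hedged usage sketch; import paths may differ across fugue versions.
from fugue.dataframe import ArrayDataFrame
from fugue.dataframe.utils import get_join_schemas

df1 = ArrayDataFrame([[0, "x"]], "a:int,b:str")
df2 = ArrayDataFrame([[0, "y"]], "a:int,c:str")

# With an empty `on`, the join keys are inferred from the common columns.
key_schema, joined_schema = get_join_schemas(df1, df2, how="inner", on=[])
assert str(key_schema) == "a:int"
assert str(joined_schema) == "a:int,b:str,c:str"

# Providing `on` explicitly only validates it against the inferred keys.
get_join_schemas(df1, df2, how="left_outer", on=["a"])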
Example #21
def parse_presort_exp(
        presort: Any) -> IndexedOrderedDict[str, bool]:  # noqa [C901]
    """Returns ordered column sorting direction where ascending order
    would return as true, and descending as false.

    :param presort: string that contains column and sorting direction or
        list of tuple that contains column and boolean sorting direction
    :type presort: Any

    :return: columns mapped to their boolean sorting directions; order matters.
    :rtype: IndexedOrderedDict[str, bool]

    :Example:

    >>> parse_presort_exp("b desc, c asc")
    >>> parse_presort_exp([("b", False), ("c", True)])
    both return IndexedOrderedDict([("b", False), ("c", True)])
    """

    if isinstance(presort, IndexedOrderedDict):
        return presort

    presort_list: List[Tuple[str, bool]] = []
    res: IndexedOrderedDict[str, bool] = IndexedOrderedDict()
    if presort is None:
        return res

    elif isinstance(presort, str):
        presort = presort.strip()
        if presort == "":
            return res
        for p in presort.split(","):
            pp = p.strip().split()
            key = pp[0].strip()
            if len(pp) == 1:
                presort_list.append((key, True))
            elif len(pp) == 2:
                if pp[1].strip().lower() == "asc":
                    presort_list.append((key, True))
                elif pp[1].strip().lower() == "desc":
                    presort_list.append((key, False))
                else:
                    raise SyntaxError(f"Invalid expression {presort}")
            else:
                raise SyntaxError(f"Invalid expression {presort}")

    elif isinstance(presort, list):
        for p in presort:
            if isinstance(p, str):
                aot(
                    len(p.strip().split()) == 1,
                    SyntaxError(f"Invalid expression {presort}"),
                )
                presort_list.append((p.strip(), True))
            else:
                aot(len(p) == 2, SyntaxError(f"Invalid expression {presort}"))
                aot(
                    isinstance(p, tuple)
                    & (isinstance(p[0], str) & (isinstance(p[1], bool))),
                    SyntaxError(f"Invalid expression {presort}"),
                )
                presort_list.append((p[0].strip(), p[1]))

    for key, value in presort_list:
        if key in res:
            raise SyntaxError(
                f"Invalid expression {presort} duplicated key {key}")
        res[key] = value
    return res
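A quick hedged check of the two accepted input forms, consistent with the mapping above (asc becomes True, desc becomes False); the import path is an assumption.

# Hedged usage sketch; the import path is an assumption.
from fugue.collections.partition import parse_presort_exp

# The string form and the list-of-tuples form produce the same ordered mapping.
d1 = parse_presort_exp("b desc, c asc")
d2 = parse_presort_exp([("b", False), ("c", True)])
assert list(d1.items()) == list(d2.items()) == [("b", False), ("c", True)]

# Bare column names default to ascending; duplicated keys raise SyntaxError.
assert list(parse_presort_exp("b, c").items()) == [("b", True), ("c", True)]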
Example #22
 def validate_value(self, obj: Any) -> Any:
     if obj is not None:
         return super().validate_value(obj)
     aot(self.nullable, lambda: f"Can't set None to {self.paramdict}")
     return obj
Example #23
 def validate_dependency(self, other: "_Dependency") -> None:
     aot(
         isinstance(other, (_ConfigVar)),
         lambda: TypeError(f"{other} is not Input or Output"),
     )
     self.spec.validate_spec(other.spec)  # type:ignore