def __init__(self, *args: Any, **kwargs: Any):
    p = ParamDict()
    # merge all positional specs in order; kwargs are applied last and win
    for a in args:
        if a is None:
            continue
        elif isinstance(a, PartitionSpec):
            self._update_dict(p, a.jsondict)
        elif isinstance(a, Dict):
            self._update_dict(p, a)
        elif isinstance(a, str):
            self._update_dict(p, json.loads(a))
        else:
            raise TypeError(f"{a} is not supported")
    self._update_dict(p, kwargs)
    self._num_partitions = p.get("num_partitions", "0")
    self._algo = p.get("algo", "").lower()
    self._partition_by = p.get("partition_by", [])
    aot(
        len(self._partition_by) == len(set(self._partition_by)),
        SyntaxError(f"{self._partition_by} has duplicated keys"),
    )
    self._presort = self._parse_presort_exp(p.get_or_none("presort", object))
    # a column may appear in either partition_by or presort, not both
    if any(x in self._presort for x in self._partition_by):
        raise SyntaxError(
            "partition by overlap with presort: "
            + f"{self._partition_by}, {self._presort}"
        )
    # TODO: currently, size limit not in use
    self._size_limit = to_size(p.get("size_limit", "0"))
    self._row_limit = p.get("row_limit", 0)
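# Usage sketch (not from the source): the constructor merges positional specs
# left to right and applies kwargs last, so later values overwrite earlier
# ones. Assumes fugue's public PartitionSpec with its num_partitions and
# partition_by properties.
from fugue import PartitionSpec

spec = PartitionSpec(
    {"num_partitions": "4"},  # plain dict
    '{"algo": "hash"}',       # JSON string
    partition_by=["a", "b"],  # kwargs win over positional args
)
assert spec.num_partitions == "4"
assert spec.partition_by == ["a", "b"]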
def _load_csv(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
    kw = ParamDict(kwargs)
    # translate the engine-level option name into the Spark reader's option
    infer_schema = kw.get("infer_schema", False)
    if infer_schema:
        kw["inferSchema"] = True
    if "infer_schema" in kw:
        del kw["infer_schema"]
    # get_or_none returns None when the key is absent, so str(...) yields "none"
    header = str(kw.get_or_none("header", object)).lower()
    if "header" in kw:
        del kw["header"]
    reader = self._session.read.format("csv")
    reader.options(**kw)
    if header == "true":
        reader.option("header", "true")
        if columns is None:
            return SparkDataFrame(reader.load(p))
        if isinstance(columns, list):  # column names
            return SparkDataFrame(reader.load(p)[columns])
        schema = Schema(columns)
        return SparkDataFrame(reader.load(p)[schema.names], schema)
    if header in ["false", "none"]:
        reader.option("header", "false")
        if columns is None:
            raise InvalidOperationError("columns must be set if without header")
        if isinstance(columns, list):  # column names
            sdf = reader.load(p)
            inferred = to_schema(sdf)
            renames = [f"{k} AS {v}" for k, v in zip(inferred.names, columns)]
            return SparkDataFrame(sdf.selectExpr(*renames))
        schema = Schema(columns)
        sdf = reader.schema(to_spark_schema(schema)).load(p)
        return SparkDataFrame(sdf, schema)
    else:
        raise NotImplementedError(f"{header} is not supported")
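# Standalone sketch of the option normalization above, runnable with only
# triad installed (no Spark session needed). It shows how infer_schema is
# renamed to Spark's inferSchema and how header is canonicalized to a string.
from triad.collections.dict import ParamDict

kw = ParamDict({"infer_schema": True, "header": True, "sep": ","})
if kw.get("infer_schema", False):
    kw["inferSchema"] = True  # the Spark CSV reader's option name
if "infer_schema" in kw:
    del kw["infer_schema"]
header = str(kw.get_or_none("header", object)).lower()  # "true", "false" or "none"
if "header" in kw:
    del kw["header"]
assert header == "true"
assert kw == {"inferSchema": True, "sep": ","}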
def __init__(self, *args: Any, **kwargs: Any):  # noqa: C901
    p = ParamDict()
    # shortcut: PartitionSpec("per_row") means one partition per row
    if (
        len(args) == 1
        and len(kwargs) == 0
        and isinstance(args[0], str)
        and args[0].lower() == "per_row"
    ):
        p["algo"] = "even"
        p["num_partitions"] = "ROWCOUNT"
    else:
        for a in args:
            if a is None:
                continue
            elif isinstance(a, PartitionSpec):
                self._update_dict(p, a.jsondict)
            elif isinstance(a, Dict):
                self._update_dict(p, a)
            elif isinstance(a, str):
                self._update_dict(p, json.loads(a))
            else:
                raise TypeError(f"{a} is not supported")
        self._update_dict(p, kwargs)
    self._num_partitions = p.get("num_partitions", "0")
    self._algo = p.get("algo", "").lower()
    # partition_by may be a single column name or a list/tuple of names
    if "partition_by" not in p:
        self._partition_by: List[str] = []
    elif isinstance(p["partition_by"], str):
        self._partition_by = [p["partition_by"]]
    elif isinstance(p["partition_by"], (list, tuple)):
        self._partition_by = list(p["partition_by"])
    else:
        raise SyntaxError(p["partition_by"])
    aot(
        len(self._partition_by) == len(set(self._partition_by)),
        SyntaxError(f"{self._partition_by} has duplicated keys"),
    )
    self._presort = parse_presort_exp(p.get_or_none("presort", object))
    if any(x in self._presort for x in self._partition_by):
        raise SyntaxError(
            "partition by overlap with presort: "
            + f"{self._partition_by}, {self._presort}"
        )
    # TODO: currently, size limit not in use
    self._size_limit = to_size(p.get("size_limit", "0"))
    self._row_limit = p.get("row_limit", 0)
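# Quick check of the per_row shortcut introduced above (assumes fugue's
# public PartitionSpec; algo and num_partitions are read-only properties):
from fugue import PartitionSpec

spec = PartitionSpec("per_row")
assert spec.algo == "even"
assert spec.num_partitions == "ROWCOUNT"
# partition_by now also accepts a single column name
assert PartitionSpec(partition_by="a").partition_by == ["a"]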
def _load_csv(
    p: FileParser, columns: Any = None, **kwargs: Any
) -> Tuple[pd.DataFrame, Any]:
    kw = ParamDict(kwargs)
    infer_schema = kw.get("infer_schema", False)
    if not infer_schema:
        # keep everything as strings unless the caller asked for inference
        kw["dtype"] = object
    if "infer_schema" in kw:
        del kw["infer_schema"]
    header: Any = False
    if "header" in kw:
        header = kw["header"]
        del kw["header"]
    if str(header) in ["True", "0"]:
        pdf = _safe_load_csv(p.uri, **{"index_col": False, "header": 0, **kw})
        if columns is None:
            return pdf, None
        if isinstance(columns, list):  # column names
            return pdf[columns], None
        schema = Schema(columns)
        return pdf[schema.names], schema
    if header is None or str(header) == "False":
        if columns is None:
            raise InvalidOperationError("columns must be set if without header")
        if isinstance(columns, list):  # column names
            pdf = _safe_load_csv(
                p.uri,
                **{"index_col": False, "header": None, "names": columns, **kw},
            )
            return pdf, None
        schema = Schema(columns)
        pdf = _safe_load_csv(
            p.uri,
            **{"index_col": False, "header": None, "names": schema.names, **kw},
        )
        return pdf, schema
    else:
        raise NotImplementedError(f"{header} is not supported")
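# Plain-pandas sketch of the two header branches above; _safe_load_csv is
# assumed to forward these keyword arguments to pd.read_csv. dtype=object
# mirrors the infer_schema=False default.
import io

import pandas as pd

with_header = io.StringIO("a,b\n1,2\n")
no_header = io.StringIO("1,2\n")

# header present (header in {True, "0"}): pandas reads the names itself
pdf1 = pd.read_csv(with_header, index_col=False, header=0, dtype=object)
# header absent (header in {None, False}): the caller must supply names
pdf2 = pd.read_csv(no_header, index_col=False, header=None, names=["a", "b"], dtype=object)
assert list(pdf1.columns) == list(pdf2.columns) == ["a", "b"]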
import json

from pytest import raises
from triad.collections.dict import ParamDict
from triad.exceptions import InvalidOperationError


def test_param_dict():
    d = ParamDict([("a", 1), ("b", 2)])
    assert 1 == d["a"]
    assert 1 == d[0]
    assert 2 == d["b"]
    assert "2" == d.get_or_throw(1, str)
    # if giving index, it should ignore the throw flag and always throw
    raises(IndexError, lambda: d.get(2, "x"))
    raises(IndexError, lambda: d.get_or_none(2, str))

    d = {"a": "b", "b": {"x": 1, "y": "d"}}
    p = ParamDict(d)
    print({"test": p})
    d["b"]["x"] = 2
    assert 1 == p["b"]["x"]  # deep copy by default, source changes are not seen
    p = ParamDict(d, deep=False)
    d["b"]["x"] = 3
    assert 3 == p["b"]["x"]
    pp = ParamDict(p, deep=False)
    p["b"]["x"] = 4
    assert 4 == pp["b"]["x"]
    pp = ParamDict(p, deep=True)
    p["b"]["x"] = 5
    assert 4 == pp["b"]["x"]

    assert 2 == len(p)
    assert "a,b" == ",".join([k for k, _ in p.items()])
    del p["a"]
    assert 1 == len(p)
    p["c"] = 1
    assert 2 == len(p)
    assert "c" in p
    assert "a" not in p
    raises(ValueError, lambda: p.get("c", None))
    assert 1 == p.get("c", 2)
    assert "1" == p.get("c", "2")
    assert 1.0 == p.get("c", 2.0)
    raises(TypeError, lambda: p.get("c", ParamDict()))
    assert 2 == p.get("d", 2)
    p["arr"] = [1, 2]
    assert [1, 2] == p.get("arr", [])
    assert [] == p.get("arr2", [])
    assert p.get_or_none("e", int) is None
    assert 1 == p.get_or_none("c", int)
    assert "1" == p.get_or_none("c", str)
    # exists but can't convert type
    raises(TypeError, lambda: p.get_or_none("c", ParamDict))
    raises(KeyError, lambda: p.get_or_throw("e", int))
    assert 1 == p.get_or_throw("c", int)
    assert "1" == p.get_or_throw("c", str)
    # exists but can't convert type
    raises(TypeError, lambda: p.get_or_throw("c", ParamDict))

    p = ParamDict()
    assert 0 == len(p)
    for x in p:
        pass
    raises(TypeError, lambda: ParamDict("abc"))

    a = ParamDict({"a": 1, "b": 2})
    b = ParamDict({"b": 2, "a": 1})
    c = ParamDict({"b": 2})
    assert a == a
    assert a != b  # order sensitive against another ordered dict
    assert a != c
    assert a == {"b": 2, "a": 1}  # order insensitive against a plain dict
    assert a != {"b": 1, "a": 1}
    assert a != None
    assert not (a == None)

    p = ParamDict(
        {
            "a": "True",
            "b": True,
            "c": "true",
            "d": "False",
            "e": False,
            "f": "false",
            "g": "yes",
            "h": "NO",
            "i": 0,
            "j": "1",
            "k": "",
        }
    )
    assert p.get_or_throw("a", bool)
    assert p.get_or_throw("b", bool)
    assert p.get_or_throw("c", bool)
    assert not p.get_or_throw("d", bool)
    assert not p.get_or_throw("e", bool)
    assert not p.get_or_throw("f", bool)
    assert p.get_or_throw("g", bool)
    assert not p.get_or_throw("h", bool)
    assert not p.get_or_throw("i", bool)
    assert p.get_or_throw("j", bool)
    raises(TypeError, lambda: p.get_or_throw("k", bool))

    s = '{"a":false,"b":10,"c":"cd"}'
    p = ParamDict(json.loads(s))
    assert not p.get_or_throw("a", bool)
    assert "10" == p.get_or_throw("b", str)
    assert "cd" == p.get_or_throw("c", str)
    raises(KeyError, lambda: p.get_or_throw("d", str))
    print(p.to_json())
    print(p.to_json(True))

    # update
    p = ParamDict(dict(a=1, b=2))
    p1 = ParamDict(dict(b=3, c=4))
    p.update(p1)
    assert dict(a=1, b=3, c=4) == p
    p = ParamDict(dict(a=1, b=2))
    p.update(p1, ParamDict.OVERWRITE)
    assert dict(a=1, b=3, c=4) == p
    p = ParamDict(dict(a=1, b=2))
    p.update(p1, ParamDict.IGNORE)
    assert dict(a=1, b=2, c=4) == p
    p = ParamDict(dict(a=1, b=2))
    raises(KeyError, lambda: p.update(p1, ParamDict.THROW))
    raises(ValueError, lambda: p.update(p1, 100))
    p.set_readonly()
    raises(InvalidOperationError, lambda: p.update(p1, 100))
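# Why "a != b" passes above while "a == {'b': 2, 'a': 1}" also passes:
# ParamDict is built on an ordered dict, and Python compares ordered dicts
# order-sensitively against each other but order-insensitively against
# plain dicts, as this stdlib-only check shows.
from collections import OrderedDict

assert OrderedDict(a=1, b=2) != OrderedDict(b=2, a=1)
assert OrderedDict(a=1, b=2) == dict(b=2, a=1)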