def __init__(self, conf: Any):
    _conf = ParamDict(conf)
    self._conf = ParamDict({**FUGUE_DEFAULT_CONF, **_conf})
    self._rpc_server = make_rpc_server(self.conf)
    self._engine_start_lock = RLock()
    self._engine_start = 0
    self._sql_engine: Optional[SQLEngine] = None
def test_flask_service():
    # fugue.rpc.flask.FlaskRPCServer
    conf = ParamDict(
        {
            "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer",
            "fugue.rpc.flask_server.host": "127.0.0.1",
            "fugue.rpc.flask_server.port": "1234",
            "fugue.rpc.flask_server.timeout": "2 sec",
        }
    )

    def k(value: str) -> str:
        return value + "x"

    def kk(a: int, b: int) -> int:
        return a + b

    def kkk(f: callable, a: int) -> int:
        return f(a)

    with make_rpc_server(conf).start() as server:
        assert "1234" == server.conf["fugue.rpc.flask_server.port"]
        with server.start():  # recursive start will take no effect
            client1 = cloudpickle.loads(cloudpickle.dumps(server.make_client(k)))
            assert "dddx" == client1("ddd")
            client2 = cloudpickle.loads(cloudpickle.dumps(server.make_client(kk)))
            assert 3 == client2(1, 2)
            assert "dddx" == client1("ddd")
            client3 = cloudpickle.loads(cloudpickle.dumps(server.make_client(kkk)))
            assert 3 == client3(lambda x: x + 1, 2)
            assert 3 == client2(1, 2)
        server.stop()  # extra stop in the end will take no effect
def __init__(self, conf: Any = None, use_sqlite: bool = False):
    configs = _process_confs(ParamDict(conf))
    super().__init__(configs)
    if not use_sqlite:
        self.set_sql_engine(KaggleSQLEngineWrapper(self, QPDPandasEngine(self)))
    else:  # pragma: no cover
        self.set_sql_engine(KaggleSQLEngineWrapper(self, SqliteEngine(self)))
def __init__(
    self,
    data: Any,
    schema: Any = None,
    metadata: Any = None,
    deterministic: bool = True,
    data_determiner: Optional[Callable[[Any], str]] = None,
    lazy: bool = True,
):
    self._validate_data(data, schema, metadata)
    self._data = data
    self._schema = None if schema is None else Schema(schema)
    self._metadata = None if metadata is None else ParamDict(metadata)
    did = "" if data_determiner is None else data_determiner(data)
    super().__init__(
        params=dict(
            schema=self._schema,
            metadata=self._metadata,
            determinism_id=did,
        ),
        input_n=0,
        output_n=1,
        deterministic=deterministic,
        lazy=lazy,
    )
def __init__(self, conf: Any = None):
    configs = _process_confs(
        {FUGUE_DASK_CONF_DATAFRAME_DEFAULT_PARTITIONS: _get_optimal_partition()},
        ParamDict(conf),
    )
    super().__init__(conf=configs)
    try:
        from dask_sql.integrations.fugue import DaskSQLEngine

        self.set_sql_engine(KaggleSQLEngineWrapper(self, DaskSQLEngine(self)))
        print("dask-sql is set as the SQL Engine for Dask")
    except ImportError:
        self.set_sql_engine(KaggleSQLEngineWrapper(self, QPDDaskEngine(self)))
def make_rpc_server(conf: Any) -> RPCServer:
    """Make :class:`~.RPCServer` based on configuration.
    If ``fugue.rpc.server`` is set, its value will be used as the server
    type for the initialization. Otherwise, a :class:`~.NativeRPCServer`
    instance will be returned.

    :param conf: |FugueConfig|
    :return: the RPC server
    """
    conf = ParamDict(conf)
    tp = conf.get_or_none("fugue.rpc.server", str)
    t_server = NativeRPCServer if tp is None else to_type(tp, RPCServer)
    return t_server(conf)  # type: ignore
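# A minimal usage sketch for make_rpc_server (an illustration, not from the
# source): with no "fugue.rpc.server" key a NativeRPCServer is constructed,
# and with the key set the named class is used instead, as in the Flask test
# above. Plain dicts are fine as conf since they are wrapped in ParamDict.
default_server = make_rpc_server({})  # -> NativeRPCServer

flask_server = make_rpc_server(
    {
        "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer",
        "fugue.rpc.flask_server.host": "127.0.0.1",
        "fugue.rpc.flask_server.port": "1234",
        "fugue.rpc.flask_server.timeout": "2 sec",
    }
)  # -> FlaskRPCServer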
def __init__(
    self,
    creator: Any,
    schema: Any = None,
    params: Any = None,
    deterministic: bool = True,
    lazy: bool = True,
):
    self._creator = _to_creator(creator, schema)
    self._creator._params = ParamDict(params, deep=False)
    super().__init__(
        params=params, input_n=0, output_n=1, deterministic=deterministic, lazy=lazy
    )
def _to_trail_row(data: Dict[str, Any], metadata: Dict[str, Any]) -> Dict[str, Any]:
    key_names = sorted(k for k in data.keys() if not k.startswith(TUNE_PREFIX))
    keys = [data[k] for k in key_names]
    trials: Dict[str, Dict[str, Any]] = {}
    for param in pickle.loads(data[TUNE_DATASET_PARAMS_PREFIX]):
        p = ParamDict(sorted(((k, v) for k, v in param.items()), key=lambda x: x[0]))
        tid = to_uuid(keys, p)
        trials[tid] = Trial(
            trial_id=tid, params=p, metadata=metadata, keys=keys
        ).jsondict
    data[TUNE_DATASET_TRIALS] = json.dumps(list(trials.values()))
    del data[TUNE_DATASET_PARAMS_PREFIX]
    return data
def __init__(self, spark_session: Optional[SparkSession] = None, conf: Any = None):
    if spark_session is None:
        spark_session = SparkSession.builder.getOrCreate()
    self._spark_session = spark_session
    cf = dict(FUGUE_SPARK_DEFAULT_CONF)
    cf.update({x[0]: x[1] for x in spark_session.sparkContext.getConf().getAll()})
    cf.update(ParamDict(conf))
    super().__init__(cf)
    self._fs = FileSystem()
    self._log = logging.getLogger()
    self._broadcast_func = RunOnce(
        self._broadcast, lambda *args, **kwargs: id(args[0])
    )
    self._persist_func = RunOnce(self._persist, lambda *args, **kwargs: id(args[0]))
    self._register_func = RunOnce(
        self._register, lambda *args, **kwargs: id(args[0])
    )
    self._io = SparkIO(self.spark_session, self.fs)
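# A minimal construction sketch for the engine above; the import path
# (fugue_spark) is an assumption, not shown in this source. Both arguments
# are optional: the conf dict is merged on top of FUGUE_SPARK_DEFAULT_CONF
# and the SparkSession's own config.
from pyspark.sql import SparkSession
from fugue_spark import SparkExecutionEngine

session = SparkSession.builder.master("local[*]").getOrCreate()
engine = SparkExecutionEngine(session, {"fugue.spark.use_pandas_udf": True})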
def __init__(self, spark_session: Optional[SparkSession] = None, conf: Any = None):
    configs = _process_confs(
        {
            "fugue.spark.use_pandas_udf": True,
            "spark.driver.memory": _get_optimal_mem(),
            "spark.sql.shuffle.partitions": _get_optimal_partition(),
            "spark.sql.execution.arrow.pyspark.fallback.enabled": True,
            "spark.driver.extraJavaOptions": "-Dio.netty.tryReflectionSetAccessible=true",  # noqa: E501
            "spark.executor.extraJavaOptions": "-Dio.netty.tryReflectionSetAccessible=true",  # noqa: E501
        },
        ParamDict(conf),
    )
    builder = SparkSession.builder.master("local[*]")
    for k, v in configs.items():
        builder = builder.config(k, v)
    spark_session = builder.getOrCreate()
    super().__init__(spark_session=spark_session, conf=configs)
    self.set_sql_engine(KaggleSQLEngineWrapper(self, SparkSQLEngine(self)))
def get_engine(self, line: str, lc: Dict[str, Any]) -> ExecutionEngine:
    line = line.strip()
    p = line.find("{")
    if p >= 0:
        engine = line[:p].strip()
        conf = json.loads(line[p:])
    else:
        parts = line.split(" ", 1)
        engine = parts[0]
        conf = ParamDict(None if len(parts) == 1 else lc[parts[1]])
    cf = dict(self._pre_conf)
    cf.update(conf)
    for k, v in self._post_conf.items():
        if k in cf and cf[k] != v:
            raise ValueError(
                f"{k} must be {v}, but you set it to {cf[k]}, you may unset it"
            )
        cf[k] = v
    if "+" in engine:
        return make_execution_engine(tuple(engine.split("+", 1)), cf)
    return make_execution_engine(engine, cf)
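# Illustrative examples (assumptions, not from the source) of the engine
# lines the parser above accepts, e.g. from a notebook magic cell:
#
#   "spark"                    -> make_execution_engine("spark", cf)
#   'spark {"some.conf": 1}'   -> engine "spark", conf parsed from inline JSON
#   "spark my_conf"            -> conf taken from the local variable lc["my_conf"]
#   "engine+sql_engine"        -> make_execution_engine(("engine", "sql_engine"), cf)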
def __init__(
    self,
    input_n: int,
    outputter: Any,
    params: Any,
    pre_partition: Any = None,
    deterministic: bool = True,
    lazy: bool = False,
    input_names: Optional[List[str]] = None,
):
    assert_or_throw(input_n > 0, FugueWorkflowError("must have at least one input"))
    self._outputter = _to_outputter(outputter)
    self._outputter._params = ParamDict(params)
    self._outputter._partition_spec = PartitionSpec(pre_partition)
    self._outputter.validate_on_compile()
    super().__init__(
        params=params,
        input_n=input_n,
        output_n=1,
        deterministic=deterministic,
        lazy=lazy,
        input_names=input_names,
    )
def __init__(
    self,
    input_n: int,
    processor: Any,
    schema: Any,
    params: Any,
    pre_partition: Any = None,
    deterministic: bool = True,
    lazy: bool = False,
    input_names: Optional[List[str]] = None,
):
    self._processor = _to_processor(processor, schema)
    self._processor._params = ParamDict(params)
    self._processor._partition_spec = PartitionSpec(pre_partition)
    self._processor.validate_on_compile()
    super().__init__(
        params=params,
        input_n=input_n,
        output_n=1,
        deterministic=deterministic,
        lazy=lazy,
        input_names=input_names,
    )
    FUGUE_CONF_WORKFLOW_AUTO_PERSIST,
    FUGUE_CONF_WORKFLOW_AUTO_PERSIST_VALUE,
    FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE,
    FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT,
    FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE,
    FUGUE_CONF_SQL_IGNORE_CASE,
])

_FUGUE_GLOBAL_CONF = ParamDict(
    {
        FUGUE_CONF_WORKFLOW_CONCURRENCY: 1,
        FUGUE_CONF_WORKFLOW_AUTO_PERSIST: False,
        FUGUE_CONF_WORKFLOW_EXCEPTION_HIDE: "fugue.,six,adagio.,pandas,"
        "fugue_dask.,dask.,fugue_spark.,pyspark.,antlr4,_qpd_antlr,qpd,triad,"
        "fugue_notebook.,ipython.,jupyter.,ipykernel,_pytest,pytest,fugue_ibis.",
        FUGUE_CONF_WORKFLOW_EXCEPTION_INJECT: 3,
        FUGUE_CONF_WORKFLOW_EXCEPTION_OPTIMIZE: True,
        FUGUE_CONF_SQL_IGNORE_CASE: False,
    }
)


def register_global_conf(
    conf: Dict[str, Any], on_dup: int = ParamDict.OVERWRITE
) -> None:
    """Register global Fugue configs that can be picked up by any
    Fugue execution engine as the base configs.

    :param conf: the config dictionary
def __init__(self, conf: Any):
    super().__init__()
    self._conf = ParamDict(conf)
    self._handlers: Dict[str, RPCHandler] = {}
def report(self, result: Dict[str, Any]) -> None:
    self._error = float(result["error"])
    self._hp = ParamDict(result.get("hp", None))
    self._metadata = ParamDict(result.get("metadata", None))
def _get_temp_path(p: str, conf: ParamDict) -> str:
    if p is not None and p != "":
        return p
    return conf.get_or_throw(FUGUE_TUNE_TEMP_PATH, str)  # TODO: remove hard code
def _get_temp_path(p: str, conf: ParamDict) -> str:
    if p is not None and p != "":
        return p
    return conf.get_or_throw(TUNE_TEMP_PATH, str)