def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    """
    Register a new table in the context from a ``CREATE TABLE`` statement.

    Honors ``IF NOT EXISTS`` (silently skip an existing table) and
    ``OR REPLACE`` (overwrite); any other clash raises. The remaining SQL
    kwargs are forwarded to ``Context.create_table``.
    """
    name = str(sql.getTableName())

    # Existing table: skip, replace, or fail depending on the statement flags.
    if name in context.tables:
        if sql.getIfNotExists():
            return
        if not sql.getReplace():
            raise RuntimeError(
                f"A table with the name {name} is already present."
            )

    options = convert_sql_kwargs(sql.getKwargs())
    logger.debug(
        f"Creating new table with name {name} and parameters {options}"
    )

    # "format" is optional and normalized to lower case; avoid shadowing the
    # builtin by using a distinct local name.
    table_format = options.pop("format", None)
    if table_format:  # pragma: no cover
        table_format = table_format.lower()

    persist = options.pop("persist", False)

    # "location" is the one mandatory parameter.
    if "location" not in options:
        raise AttributeError("Parameters must include a 'location' parameter.")
    location = options.pop("location")

    context.create_table(
        name, location, format=table_format, persist=persist, **options
    )
def convert(self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context") -> DataContainer:
    """
    Create and run an experiment from a ``CREATE EXPERIMENT`` statement.

    Either tunes a single model's hyperparameters (``model_class`` together
    with ``experiment_class``) or runs an AutoML search (``automl_class``)
    on the data produced by the statement's SELECT. The best estimator is
    registered as a model under the experiment's name, the full result table
    is registered as an experiment, and that table is returned as a
    ``DataContainer``.
    """
    select = sql.getSelect()
    schema_name, experiment_name = context.fqn(sql.getExperimentName())
    kwargs = convert_sql_kwargs(sql.getKwargs())

    # IF NOT EXISTS -> silently skip; only OR REPLACE may overwrite.
    if experiment_name in context.schema[schema_name].experiments:
        if sql.getIfNotExists():
            return
        elif not sql.getReplace():
            raise RuntimeError(
                f"A experiment with the name {experiment_name} is already present."
            )
    logger.debug(
        f"Creating Experiment {experiment_name} from query {select} with options {kwargs}"
    )
    # Exactly one of the two modes must be selected via the SQL kwargs.
    model_class = None
    automl_class = None
    experiment_class = None
    if "model_class" in kwargs:
        model_class = kwargs.pop("model_class")
        # when model class was provided, must provide experiment_class also for tuning
        if "experiment_class" not in kwargs:
            raise ValueError(
                f"Parameters must include a 'experiment_class' parameter for tuning {model_class}."
            )
        experiment_class = kwargs.pop("experiment_class")
    elif "automl_class" in kwargs:
        automl_class = kwargs.pop("automl_class")
    else:
        raise ValueError(
            "Parameters must include a 'model_class' or 'automl_class' parameter."
        )
    # Optional knobs; everything not popped here stays in `kwargs` unused.
    target_column = kwargs.pop("target_column", "")
    tune_fit_kwargs = kwargs.pop("tune_fit_kwargs", {})
    parameters = kwargs.pop("tune_parameters", {})
    experiment_kwargs = kwargs.pop("experiment_kwargs", {})
    automl_kwargs = kwargs.pop("automl_kwargs", {})
    logger.info(parameters)

    # Materialize the training data by executing the SELECT through the context.
    select_query = context._to_sql_string(select)
    training_df = context.sql(select_query)
    # Both tuning and AutoML below are supervised: a target column is required.
    if not target_column:
        raise ValueError(
            "Unsupervised Algorithm cannot be tuned Automatically,"
            "Consider providing 'target column'")
    non_target_columns = [
        col for col in training_df.columns if col != target_column
    ]
    X = training_df[non_target_columns]
    y = training_df[target_column]

    if model_class and experiment_class:
        try:
            ModelClass = import_class(model_class)
        except ImportError:
            raise ValueError(
                f"Can not import model {model_class}. Make sure you spelled it correctly and have installed all packages."
            )
        try:
            ExperimentClass = import_class(experiment_class)
        except ImportError:
            raise ValueError(
                f"Can not import tuner {experiment_class}. Make sure you spelled it correctly and have installed all packages."
            )
        try:
            from dask_ml.wrappers import ParallelPostFit
        except ImportError:  # pragma: no cover
            raise ValueError(
                "dask_ml must be installed to use automl and tune hyperparameters"
            )

        model = ModelClass()

        # Run the hyperparameter search over the supplied parameter grid.
        search = ExperimentClass(model, {**parameters}, **experiment_kwargs)
        logger.info(tune_fit_kwargs)
        search.fit(X, y, **tune_fit_kwargs)
        df = pd.DataFrame(search.cv_results_)
        df["model_class"] = model_class

        # Register the tuned best estimator as a usable model.
        context.register_model(
            experiment_name,
            ParallelPostFit(estimator=search.best_estimator_),
            X.columns,
            schema_name=schema_name,
        )
    if automl_class:
        try:
            AutoMLClass = import_class(automl_class)
        except ImportError:
            raise ValueError(
                f"Can not import automl model {automl_class}. Make sure you spelled it correctly and have installed all packages."
            )
        try:
            from dask_ml.wrappers import ParallelPostFit
        except ImportError:  # pragma: no cover
            raise ValueError(
                "dask_ml must be installed to use automl and tune hyperparameters"
            )

        automl = AutoMLClass(**automl_kwargs)
        # should be avoided if data doesn't fit in memory
        automl.fit(X.compute(), y.compute())
        # One row per evaluated candidate, with the candidate name in "models".
        df = (pd.DataFrame(
            automl.evaluated_individuals_).T.reset_index().rename(
                {"index": "models"}, axis=1))

        # Register the fitted AutoML pipeline as a usable model.
        context.register_model(
            experiment_name,
            ParallelPostFit(estimator=automl.fitted_pipeline_),
            X.columns,
            schema_name=schema_name,
        )

    # Store the result table and hand it back as a single-partition frame.
    context.register_experiment(experiment_name,
                                experiment_results=df,
                                schema_name=schema_name)
    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    """
    Create and train a model from a ``CREATE MODEL`` statement.

    Instantiates ``model_class`` with the remaining SQL kwargs, optionally
    wraps it in dask-ml's ``Incremental`` (``wrap_fit``) and/or
    ``ParallelPostFit`` (``wrap_predict``), trains it on the data produced
    by the statement's SELECT, and registers it under ``model_name``.

    Raises:
        RuntimeError: the model exists and neither IF NOT EXISTS nor
            OR REPLACE was given.
        ValueError: ``model_class`` is missing, cannot be imported, or
            wrapping was requested without dask-ml installed.
    """
    select = sql.getSelect()
    model_name = str(sql.getModelName())
    kwargs = convert_sql_kwargs(sql.getKwargs())

    # Honor IF NOT EXISTS / OR REPLACE semantics for pre-existing models.
    if model_name in context.models:
        if sql.getIfNotExists():
            return
        elif not sql.getReplace():
            raise RuntimeError(
                f"A model with the name {model_name} is already present."
            )

    logger.debug(
        f"Creating model {model_name} from query {select} with options {kwargs}"
    )

    try:
        model_class = kwargs.pop("model_class")
    except KeyError:
        raise ValueError("Parameters must include a 'model_class' parameter.")

    target_column = kwargs.pop("target_column", "")
    wrap_predict = kwargs.pop("wrap_predict", False)
    wrap_fit = kwargs.pop("wrap_fit", False)
    fit_kwargs = kwargs.pop("fit_kwargs", {})

    try:
        ModelClass = import_class(model_class)
    except ImportError:
        raise ValueError(
            f"Can not import model {model_class}. Make sure you spelled it correctly and have installed all packages."
        )

    # Any kwargs not popped above are treated as model constructor arguments.
    model = ModelClass(**kwargs)
    if wrap_fit:
        # FIX: guard the optional dask-ml dependency with a clear error,
        # consistent with the other CREATE MODEL converter in this file;
        # previously a missing dask-ml surfaced as a raw ImportError.
        try:
            from dask_ml.wrappers import Incremental
        except ImportError:  # pragma: no cover
            raise ValueError("Wrapping requires dask-ml to be installed.")

        model = Incremental(estimator=model)
    if wrap_predict:
        try:
            from dask_ml.wrappers import ParallelPostFit
        except ImportError:  # pragma: no cover
            raise ValueError("Wrapping requires dask-ml to be installed.")

        model = ParallelPostFit(estimator=model)

    # Materialize the training data by executing the SELECT through the context.
    select_query = context._to_sql_string(select)
    training_df = context.sql(select_query)

    if target_column:
        non_target_columns = [
            col for col in training_df.columns if col != target_column
        ]
        X = training_df[non_target_columns]
        y = training_df[target_column]
    else:
        # Unsupervised case: train on the full frame with no labels.
        X = training_df
        y = None

    model.fit(X, y, **fit_kwargs)
    context.register_model(model_name, model, X.columns)
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    """
    Create and train a model from a ``CREATE MODEL`` statement
    (schema-qualified variant).

    Instantiates ``model_class`` with the remaining SQL kwargs, optionally
    wraps it in dask-ml's ``Incremental`` (``wrap_fit``) and/or
    ``ParallelPostFit`` (``wrap_predict``), trains it on the data produced
    by the statement's SELECT, and registers it in the given schema.

    Raises:
        RuntimeError: the model exists and neither IF NOT EXISTS nor
            OR REPLACE was given.
        ValueError: ``model_class`` is missing, cannot be imported, or
            wrapping was requested without dask-ml installed.
    """
    select = sql.getSelect()
    schema_name, model_name = context.fqn(sql.getModelName())
    kwargs = convert_sql_kwargs(sql.getKwargs())

    # Honor IF NOT EXISTS / OR REPLACE semantics for pre-existing models.
    if model_name in context.schema[schema_name].models:
        if sql.getIfNotExists():
            return
        elif not sql.getReplace():
            raise RuntimeError(
                f"A model with the name {model_name} is already present.")

    logger.debug(
        f"Creating model {model_name} from query {select} with options {kwargs}"
    )

    try:
        model_class = kwargs.pop("model_class")
    except KeyError:
        raise ValueError(
            "Parameters must include a 'model_class' parameter.")

    target_column = kwargs.pop("target_column", "")
    wrap_predict = kwargs.pop("wrap_predict", False)
    wrap_fit = kwargs.pop("wrap_fit", False)
    fit_kwargs = kwargs.pop("fit_kwargs", {})

    # Materialize the training data by executing the SELECT through the context.
    select_query = context._to_sql_string(select)
    training_df = context.sql(select_query)

    if target_column:
        non_target_columns = [
            col for col in training_df.columns if col != target_column
        ]
        X = training_df[non_target_columns]
        y = training_df[target_column]
    else:
        # Unsupervised case: train on the full frame with no labels.
        X = training_df
        y = None

    try:
        ModelClass = import_class(model_class)
    except ImportError:
        raise ValueError(
            f"Can not import model {model_class}. Make sure you spelled it correctly and have installed all packages."
        )

    # Any kwargs not popped above are treated as model constructor arguments.
    model = ModelClass(**kwargs)
    if wrap_fit:
        try:
            from dask_ml.wrappers import Incremental
        except ImportError:  # pragma: no cover
            raise ValueError("Wrapping requires dask-ml to be installed.")

        model = Incremental(estimator=model)
    if wrap_predict:
        try:
            from dask_ml.wrappers import ParallelPostFit
        except ImportError:  # pragma: no cover
            raise ValueError("Wrapping requires dask-ml to be installed.")

        # When `wrap_predict` is set to True we train on single partition frames
        # because this is only useful for non dask distributed models
        # Training via delayed fit ensures that we dont have to transfer
        # data back to the client for training
        X_d = X.repartition(npartitions=1).to_delayed()
        if y is not None:
            y_d = y.repartition(npartitions=1).to_delayed()
        else:
            # FIX: the original set y_d = None and then zipped over it, which
            # raises TypeError for unsupervised models; pair each X partition
            # with None instead so fit(x, None) is called as in the plain path.
            y_d = [None] * len(X_d)

        # FIX: forward fit_kwargs in the delayed path too — the original only
        # applied them in the non-wrapped `model.fit` branch below.
        delayed_model = [
            delayed(model.fit)(x_p, y_p, **fit_kwargs)
            for x_p, y_p in zip(X_d, y_d)
        ]
        model = delayed_model[0].compute()
        model = ParallelPostFit(estimator=model)
    else:
        model.fit(X, y, **fit_kwargs)
    context.register_model(model_name, model, X.columns, schema_name=schema_name)