def c(
    df_simple,
    df,
    user_table_1,
    user_table_2,
    long_table,
    user_table_inf,
    user_table_nan,
    string_table,
    datetime_table,
):
    dfs = {
        "df_simple": df_simple,
        "df": df,
        "user_table_1": user_table_1,
        "user_table_2": user_table_2,
        "long_table": long_table,
        "user_table_inf": user_table_inf,
        "user_table_nan": user_table_nan,
        "string_table": string_table,
        "datetime_table": datetime_table,
    }

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    for df_name, df in dfs.items():
        dask_df = dd.from_pandas(df, npartitions=3)
        c.create_table(df_name, dask_df)

    yield c
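# Usage sketch (hypothetical test, not part of the original conftest): any test in the
# suite can request the fixture above by naming `c` as a parameter; pytest then yields
# the fully populated Context, so queries against the registered tables work directly.
def test_fixture_tables_are_queryable(c):
    result = c.sql("SELECT COUNT(*) AS cnt FROM df_simple").compute()
    assert len(result) == 1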
def test_sort_with_nan_many_partitions(gpu):
    c = Context()
    df = pd.DataFrame(
        {
            "a": [float("nan"), 1] * 30,
            "b": [1, 2, 3] * 20,
        }
    )
    c.create_table("df", dd.from_pandas(df, npartitions=10), gpu=gpu)

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST")

    assert_eq(
        df_result,
        pd.DataFrame(
            {
                "a": [float("nan")] * 30 + [1] * 30,
                "b": [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10,
            }
        ),
        check_index=False,
    )

    df = pd.DataFrame({"a": [float("nan"), 1] * 30})
    c.create_table("df", dd.from_pandas(df, npartitions=10))

    df_result = c.sql("SELECT * FROM df ORDER BY a")

    assert_eq(
        df_result,
        pd.DataFrame(
            {
                "a": [1] * 30 + [float("nan")] * 30,
            }
        ),
        check_index=False,
    )
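# Sketch (assumption about the surrounding test module, not original code): the `gpu`
# argument of tests like the one above is usually supplied through a pytest
# parametrization so each test runs once against CPU tables and once against
# GPU-backed tables. The test name and the "gpu" marker are hypothetical; the sketch
# relies on the same module-level imports as the tests above (pytest, pd, Context, assert_eq).
@pytest.mark.parametrize("gpu", [False, pytest.param(True, marks=pytest.mark.gpu)])
def test_order_by_runs_on_both_backends(gpu):
    c = Context()
    c.create_table("t", pd.DataFrame({"a": [3, 1, 2]}), gpu=gpu)
    assert_eq(
        c.sql("SELECT * FROM t ORDER BY a"),
        pd.DataFrame({"a": [1, 2, 3]}),
        check_index=False,
    )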
def test_select(hive_cursor):
    c = Context()
    c.create_table("df", hive_cursor)

    result_df = c.sql("SELECT * FROM df").compute().reset_index(drop=True)
    df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32")

    assert_frame_equal(df, result_df)
def test_select(hive_cursor):
    c = Context()
    c.create_table("df", hive_cursor)

    result_df = c.sql("SELECT * FROM df")
    expected_df = pd.DataFrame({"i": [1, 2], "j": [2, 4]}).astype("int32")

    assert_eq(result_df, expected_df, check_index=False)
def test_intake_location(intake_catalog_location):
    c = Context()
    c.create_table(
        "df",
        intake_catalog_location,
        format="intake",
        intake_table_name="intake_table",
    )

    check_read_table(c)
def main(): # pragma: no cover """ CLI version of the :func:`run_server` function. """ parser = ArgumentParser() parser.add_argument( "--host", default="0.0.0.0", help="The host interface to listen on (defaults to all interfaces)", ) parser.add_argument( "--port", default=8080, help="The port to listen on (defaults to 8080)" ) parser.add_argument( "--scheduler-address", default=None, help="Connect to this dask scheduler if given", ) parser.add_argument( "--log-level", default=None, help="Set the log level of the server. Defaults to info.", choices=uvicorn.config.LOG_LEVELS, ) parser.add_argument( "--load-test-data", default=False, action="store_true", help="Preload some test data.", ) parser.add_argument( "--startup", default=False, action="store_true", help="Wait until Apache Calcite was properly loaded", ) args = parser.parse_args() client = None if args.scheduler_address: client = dask.distributed.Client(args.scheduler_address) context = Context() if args.load_test_data: df = dask.datasets.timeseries(freq="1d").reset_index(drop=False) context.create_table("timeseries", df.persist()) run_server( context=context, client=client, host=args.host, port=args.port, startup=args.startup, log_level=args.log_level, )
def select(
    self, dfs: fugue.dataframe.DataFrames, statement: str
) -> fugue.dataframe.DataFrame:
    """Send the SQL command to the dask-sql context and register all temporary dataframes"""
    c = Context()

    for k, v in dfs.items():
        c.create_table(k, self.execution_engine.to_df(v).native)

    df = c.sql(statement)
    return fugue_dask.dataframe.DaskDataFrame(df)
def c(
    df_simple,
    df_wide,
    df,
    user_table_1,
    user_table_2,
    long_table,
    user_table_inf,
    user_table_nan,
    string_table,
    datetime_table,
    parquet_ddf,
    gpu_user_table_1,
    gpu_df,
    gpu_long_table,
    gpu_string_table,
    gpu_datetime_table,
):
    dfs = {
        "df_simple": df_simple,
        "df_wide": df_wide,
        "df": df,
        "user_table_1": user_table_1,
        "user_table_2": user_table_2,
        "long_table": long_table,
        "user_table_inf": user_table_inf,
        "user_table_nan": user_table_nan,
        "string_table": string_table,
        "datetime_table": datetime_table,
        "parquet_ddf": parquet_ddf,
        "gpu_user_table_1": gpu_user_table_1,
        "gpu_df": gpu_df,
        "gpu_long_table": gpu_long_table,
        "gpu_string_table": gpu_string_table,
        "gpu_datetime_table": gpu_datetime_table,
    }

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    for df_name, df in dfs.items():
        if df is None:
            continue

        if hasattr(df, "npartitions"):
            # df is already a dask collection
            dask_df = df
        else:
            dask_df = dd.from_pandas(df, npartitions=3)

        c.create_table(df_name, dask_df)

    yield c
def test_sort_with_nan_more_columns(gpu):
    c = Context()
    df = pd.DataFrame(
        {
            "a": [1, 1, 2, 2, float("nan"), float("nan")],
            "b": [1, 1, 2, float("nan"), float("inf"), 5],
            "c": [1, float("nan"), 3, 4, 5, 6],
        }
    )
    c.create_table("df", df, gpu=gpu)

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST"
    )
    assert_eq(
        df_result,
        pd.DataFrame(
            {
                "a": [float("nan"), float("nan"), 1, 1, 2, 2],
                "b": [float("inf"), 5, 1, 1, 2, float("nan")],
                "c": [5, 6, float("nan"), 1, 3, 4],
            }
        ),
        check_index=False,
    )

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST"
    )
    assert_eq(
        df_result,
        pd.DataFrame(
            {
                "a": [1, 1, 2, 2, float("nan"), float("nan")],
                "b": [1, 1, float("nan"), 2, float("inf"), 5],
                "c": [1, float("nan"), 4, 3, 5, 6],
            }
        ),
        check_index=False,
    )

    df_result = c.sql(
        "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c DESC NULLS LAST"
    )
    assert_eq(
        df_result,
        pd.DataFrame(
            {
                "a": [float("nan"), float("nan"), 1, 1, 2, 2],
                "b": [float("inf"), 5, 1, 1, 2, float("nan")],
                "c": [5, 6, 1, float("nan"), 3, 4],
            }
        ),
        check_index=False,
    )
def main():  # pragma: no cover
    parser = ArgumentParser()
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite has been properly loaded",
    )
    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    cmd_loop(
        context=context, client=client, startup=args.startup, log_level=args.log_level
    )
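# Programmatic sketch (assumed embedding scenario, helper name is hypothetical): a
# prepared Context can be handed to cmd_loop directly to start the interactive SQL
# prompt, mirroring the --load-test-data path of the CLI above.
def _run_example_repl():  # pragma: no cover
    context = Context()
    df = timeseries(freq="1d").reset_index(drop=False)
    context.create_table("timeseries", df.persist())
    cmd_loop(context=context)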
def test_sort_with_nan_many_partitions():
    c = Context()
    df = pd.DataFrame(
        {
            "a": [float("nan"), 1] * 30,
            "b": [1, 2, 3] * 20,
        }
    )
    c.create_table("df", dd.from_pandas(df, npartitions=10))

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS FIRST, b ASC NULLS FIRST")
        .compute()
        .reset_index(drop=True)
    )

    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {
                "a": [float("nan")] * 30 + [1] * 30,
                "b": [1] * 10 + [2] * 10 + [3] * 10 + [1] * 10 + [2] * 10 + [3] * 10,
            }
        ),
        check_names=False,
    )
def test_sort_with_nan_more_columns():
    c = Context()
    df = pd.DataFrame(
        {
            "a": [1, 1, 2, 2, float("nan"), float("nan")],
            "b": [1, 1, 2, float("nan"), float("inf"), 5],
            "c": [1, float("nan"), 3, 4, 5, 6],
        }
    )
    c.create_table("df", df)

    # `.c` selects the "c" column of the resulting dask dataframe,
    # so only that column is compared against the expected ordering
    df_result = (
        c.sql(
            "SELECT * FROM df ORDER BY a ASC NULLS FIRST, b DESC NULLS LAST, c ASC NULLS FIRST"
        )
        .c.compute()
        .reset_index(drop=True)
    )
    assert_series_equal(
        df_result, pd.Series([5, 6, float("nan"), 1, 3, 4]), check_names=False
    )

    df_result = (
        c.sql(
            "SELECT * FROM df ORDER BY a ASC NULLS LAST, b DESC NULLS FIRST, c DESC NULLS LAST"
        )
        .c.compute()
        .reset_index(drop=True)
    )
    assert_series_equal(
        df_result, pd.Series([1, float("nan"), 4, 3, 5, 6]), check_names=False
    )
def test_sort_with_nan():
    c = Context()
    df = pd.DataFrame(
        {"a": [1, 2, float("nan"), 2], "b": [4, float("nan"), 5, float("inf")]}
    )
    c.create_table("df", df)

    df_result = c.sql("SELECT * FROM df ORDER BY a").compute().reset_index(drop=True)
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS FIRST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 1, 2, 2], "b": [5, 4, float("nan"), float("inf")]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a NULLS LAST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a ASC").compute().reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 1, 2, 2], "b": [5, 4, float("nan"), float("inf")]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a DESC").compute().reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 2, 2, 1], "b": [5, float("inf"), float("nan"), 4]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 2, 2, 1], "b": [5, float("inf"), float("nan"), 4]}
        ),
    )

    df_result = (
        c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST")
        .compute()
        .reset_index(drop=True)
    )
    assert_frame_equal(
        df_result,
        pd.DataFrame(
            {"a": [2, 2, 1, float("nan")], "b": [float("inf"), float("nan"), 4, 5]}
        ),
    )
def test_sort_with_nan(gpu):
    c = Context()
    df = pd.DataFrame(
        {"a": [1, 2, float("nan"), 2], "b": [4, float("nan"), 5, float("inf")]}
    )
    c.create_table("df", df, gpu=gpu)

    df_result = c.sql("SELECT * FROM df ORDER BY a")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 1, 2, 2], "b": [5, 4, float("nan"), float("inf")]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 1, 2, 2], "b": [5, 4, float("nan"), float("inf")]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a ASC NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [1, 2, 2, float("nan")], "b": [4, float("nan"), float("inf"), 5]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 2, 2, 1], "b": [5, float("nan"), float("inf"), 4]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS FIRST")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [float("nan"), 2, 2, 1], "b": [5, float("nan"), float("inf"), 4]}
        ),
        check_index=False,
    )

    df_result = c.sql("SELECT * FROM df ORDER BY a DESC NULLS LAST")
    assert_eq(
        df_result,
        pd.DataFrame(
            {"a": [2, 2, 1, float("nan")], "b": [float("nan"), float("inf"), 4, 5]}
        ),
        check_index=False,
    )
def test_intake_catalog(intake_catalog_location):
    catalog = intake.open_catalog(intake_catalog_location)
    c = Context()
    c.create_table("df", catalog, intake_table_name="intake_table")

    check_read_table(c)
def create_meta_data(c: Context):
    """
    Creates the schema, table and column data for the prestodb JDBC driver so that
    the data can be viewed in a database tool like DBeaver. It does not create a
    catalog entry (although JDBC expects one), as dask-sql does not support catalogs.
    For both catalogs and procedures, empty placeholder tables are created.

    The meta data is placed in a separate schema called system_jdbc, largely because
    the JDBC driver tries to access system.jdbc: the name is close enough while not
    clashing with other schemas. A function is required in /v1/statement to rewrite
    system.jdbc to system_jdbc and to ignore ORDER BY statements sent by the driver
    (see adjust_for_presto_sql above).

    :param c: Context containing created tables
    :return:
    """
    if c is None:
        logger.warning("Context None: jdbc meta data not created")
        return

    catalog = ""
    system_schema = "system_jdbc"
    c.create_schema(system_schema)

    # TODO: add support for catalogs in presto interface
    # see https://github.com/dask-contrib/dask-sql/pull/351
    # if catalog and len(catalog.strip()) > 0:
    #     catalogs = pd.DataFrame().append(create_catalog_row(catalog), ignore_index=True)
    #     c.create_table("catalogs", catalogs, schema_name=system_schema)

    schemas = pd.DataFrame().append(create_schema_row(), ignore_index=True)
    c.create_table("schemas", schemas, schema_name=system_schema)
    schema_rows = []

    tables = pd.DataFrame().append(create_table_row(), ignore_index=True)
    c.create_table("tables", tables, schema_name=system_schema)
    table_rows = []

    columns = pd.DataFrame().append(create_column_row(), ignore_index=True)
    c.create_table("columns", columns, schema_name=system_schema)
    column_rows = []

    for schema_name, schema in c.schema.items():
        schema_rows.append(create_schema_row(catalog, schema_name))

        for table_name, dc in schema.tables.items():
            df = dc.df
            logger.info(f"schema {schema_name}, table {table_name}, {df}")
            table_rows.append(create_table_row(catalog, schema_name, table_name))

            pos: int = 0
            for column in df.columns:
                pos = pos + 1
                logger.debug(f"column {column}")

                dtype = "VARCHAR"
                if df[column].dtype == "int64" or df[column].dtype == "int":
                    dtype = "INTEGER"
                elif df[column].dtype == "float64" or df[column].dtype == "float":
                    dtype = "FLOAT"
                elif (
                    df[column].dtype == "datetime"
                    or df[column].dtype == "datetime64[ns]"
                ):
                    dtype = "TIMESTAMP"

                column_rows.append(
                    create_column_row(
                        catalog,
                        schema_name,
                        table_name,
                        dtype,
                        df[column].name,
                        str(pos),
                    )
                )

    schemas = pd.DataFrame(schema_rows)
    c.create_table("schemas", schemas, schema_name=system_schema)

    tables = pd.DataFrame(table_rows)
    c.create_table("tables", tables, schema_name=system_schema)

    columns = pd.DataFrame(column_rows)
    c.create_table("columns", columns, schema_name=system_schema)

    logger.info(f"jdbc meta data ready for {len(table_rows)} tables")
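# Inspection sketch (assumption: dask-sql resolves schema-qualified names such as
# system_jdbc.tables; the helper name is hypothetical): after create_meta_data has
# run, the generated meta data can be queried like any other registered table.
def _show_jdbc_metadata(c: Context):  # pragma: no cover
    for table in ("schemas", "tables", "columns"):
        print(c.sql(f"SELECT * FROM system_jdbc.{table}").compute())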
def assert_query_gives_same_result(engine):
    np.random.seed(42)

    df1 = dd.from_pandas(
        pd.DataFrame(
            {
                "user_id": np.random.choice([1, 2, 3, 4, pd.NA], 100),
                "a": np.random.rand(100),
                "b": np.random.randint(-10, 10, 100),
            }
        ),
        npartitions=3,
    )
    df1["user_id"] = df1["user_id"].astype("Int64")

    df2 = dd.from_pandas(
        pd.DataFrame(
            {
                "user_id": np.random.choice([1, 2, 3, 4], 100),
                "c": np.random.randint(20, 30, 100),
                "d": np.random.choice(["a", "b", "c", None], 100),
            }
        ),
        npartitions=3,
    )

    df3 = dd.from_pandas(
        pd.DataFrame(
            {
                "s": [
                    "".join(np.random.choice(["a", "B", "c", "D"], 10))
                    for _ in range(100)
                ]
                + [None]
            }
        ),
        npartitions=3,
    )

    # the other is an Int64, that makes joining simpler
    df2["user_id"] = df2["user_id"].astype("Int64")

    # add some NaNs
    df1["a"] = df1["a"].apply(
        lambda a: float("nan") if a > 0.8 else a, meta=("a", "float")
    )
    df1["b_bool"] = df1["b"].apply(
        lambda b: pd.NA if b > 5 else b < 0, meta=("a", "bool")
    )

    # Lazy import, otherwise the pytest framework has problems
    from dask_sql.context import Context

    c = Context()
    c.create_table("df1", df1)
    c.create_table("df2", df2)
    c.create_table("df3", df3)

    df1.compute().to_sql("df1", engine, index=False, if_exists="replace")
    df2.compute().to_sql("df2", engine, index=False, if_exists="replace")
    df3.compute().to_sql("df3", engine, index=False, if_exists="replace")

    def _assert_query_gives_same_result(query, sort_columns=None, **kwargs):
        sql_result = pd.read_sql_query(query, engine)
        dask_result = c.sql(query).compute()

        # allow that the names are different
        # as expressions are handled differently
        dask_result.columns = sql_result.columns

        if sort_columns:
            sql_result = sql_result.sort_values(sort_columns)
            dask_result = dask_result.sort_values(sort_columns)

        sql_result = sql_result.reset_index(drop=True)
        dask_result = dask_result.reset_index(drop=True)

        assert_frame_equal(sql_result, dask_result, check_dtype=False, **kwargs)

    return _assert_query_gives_same_result
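# Usage sketch (hypothetical test, not part of the original fixture module): a test
# requests the fixture above by name and calls the returned helper; passing
# sort_columns makes the comparison order-insensitive, which is useful for joins where
# dask-sql and the SQL engine may return rows in a different order.
def test_join_gives_same_result(assert_query_gives_same_result):
    assert_query_gives_same_result(
        "SELECT df1.user_id, df2.c FROM df1 JOIN df2 ON df1.user_id = df2.user_id",
        sort_columns=["user_id", "c"],
    )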