class TimeDaskSQLJoins:
    params = [[10**4, 10**5], [10], [3], [6], [0.75]]

    param_names = [
        "rows",
        "cols",
        "number of join columns",
        "number of chained joins",
        "ratio of dictinct elements",
    ]

    def setup(self, N, ncols, njoin_columns, njoins, distinct_r):
        self.dfs = _generate_dataframes(N, ncols, njoin_columns, njoins,
                                        distinct_r)
        self.dfs = [dd.from_pandas(d, npartitions=1) for d in self.dfs]
        self.join_cols = [
            c for c in self.dfs[0].columns if c in self.dfs[1].columns
        ]
        self.ctx = Context()
        self._create_tables()
        self._create_sql_query()

    def _create_tables(self):
        self.tables = []
        for i, df in enumerate(self.dfs):
            _table_name = f"table_{i:03}"
            self.ctx.create_table(_table_name, df)
            _table = table(_table_name, *[column(c) for c in df.columns])
            self.tables.append(_table)

    def _create_sql_query(self):
        left = self.tables[0]
        joinq = left
        select_cols = list(left.c)
        for right in self.tables[1:]:
            on = and_(*[
                left.c.get(col) == right.c.get(col) for col in self.join_cols
            ])
            joinq = joinq.join(right, on)
            select_cols += [c for c in right.c if c.name not in self.join_cols]
        query = select(*select_cols).select_from(joinq)
        self.sql_query = str(
            query.compile(
                dialect=postgresql.dialect(),
                compile_kwargs={"literal_binds": True},
            ))

    def time_joins(self, N, ncols, njoin_columns, njoins, distinct_r):
        start = time.perf_counter()
        print(f"Processing SQL query: {self.sql_query}")
        res = self.ctx.sql(self.sql_query)
        stop = time.perf_counter()
        print(f"Processing SQL query took {stop-start:0.4f} s.")
        start = time.perf_counter()
        print("Computing dask dataframe")
        res.compute()
        stop = time.perf_counter()
        print(f"Computing dask dataframe took {stop-start:0.4f} s.")
        # Visualize task graph
        # res.visualize('taskgraph.png')
        return res
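The helper `_generate_dataframes` is not defined in this snippet (the benchmark also assumes `time`, `dask.dataframe as dd`, `dask_sql.Context` and SQLAlchemy's `table`, `column`, `and_`, `select` and `postgresql` dialect have been imported). A minimal sketch of one way such a helper could look; all column names and the random seed are illustrative assumptions:

import numpy as np
import pandas as pd


def _generate_dataframes(N, ncols, njoin_columns, njoins, distinct_r):
    """Hypothetical helper: build njoins + 1 DataFrames that share
    njoin_columns key columns and carry (ncols - njoin_columns) extra
    columns each, with roughly N * distinct_r distinct key values."""
    rng = np.random.default_rng(42)
    n_distinct = max(1, int(N * distinct_r))
    # the key columns are identical across all frames, so every join matches
    keys = {
        f"key_{k}": rng.integers(0, n_distinct, size=N)
        for k in range(njoin_columns)
    }
    dfs = []
    for i in range(njoins + 1):
        data = dict(keys)
        for j in range(ncols - njoin_columns):
            data[f"data_{i}_{j}"] = rng.random(N)
        dfs.append(pd.DataFrame(data))
    return dfs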
Example #2
def test_tables(gpu):
    c = Context()
    c.create_table("table", pd.DataFrame(), gpu=gpu)

    result_df = c.sql(f'SHOW TABLES FROM "{c.schema_name}"')
    expected_df = pd.DataFrame({"Table": ["table"]})

    assert_eq(result_df, expected_df, check_index=False)
Example #3
def test_show_tables_no_schema(c):
    c = Context()

    df = pd.DataFrame({"id": [0, 1]})
    c.create_table("test", df)

    actual_df = c.sql("show tables").compute()
    expected_df = pd.DataFrame({"Table": ["test"]})
    assert_eq(actual_df, expected_df)
Example #4
def main():  # pragma: no cover
    """
    CLI version of the :func:`run_server` function.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="The host interface to listen on (defaults to all interfaces)",
    )
    parser.add_argument(
        "--port", default=8080, help="The port to listen on (defaults to 8080)"
    )
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=uvicorn.config.LOG_LEVELS,
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = dask.distributed.Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = dask.datasets.timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    run_server(
        context=context,
        client=client,
        host=args.host,
        port=args.port,
        startup=args.startup,
        log_level=args.log_level,
    )
Example #5
    def select(self, dfs: fugue.dataframe.DataFrames,
               statement: str) -> fugue.dataframe.DataFrame:
        """Send the SQL command to the dask-sql context and register all temporary dataframes"""
        c = Context()

        for k, v in dfs.items():
            c.create_table(k, self.execution_engine.to_df(v).native)

        df = c.sql(statement)
        return fugue_dask.dataframe.DaskDataFrame(df)
Example #7
def test_explain():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame)

    sql_string = c.explain("SELECT * FROM df")

    assert (
        sql_string
        == f"LogicalProject(a=[$0]){os.linesep}  LogicalTableScan(table=[[schema, df]]){os.linesep}"
    )
Example #8
def test_dask_sql_sg_logistic_regression(
    datatype,
    nrows,
    ncols,
    n_parts,
    wrap_predict
):
    if wrap_predict:
        cuml.set_global_output_type("input")
    else:
        cuml.set_global_output_type("cudf")

    X, y = make_classification(
        n_samples=nrows, n_features=ncols, n_informative=5, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    train_df = cudf.DataFrame(
        X_train, dtype=datatype, columns=[chr(i) for i in range(ncols)]
    )
    train_df["target"] = y_train
    train_ddf = dask_cudf.from_cudf(train_df, npartitions=n_parts)

    c = Context()
    c.create_table("train_df", train_ddf)

    train_query = f"""
        CREATE MODEL model WITH (
            model_class = 'cuml.linear_model.LogisticRegression',
            wrap_predict = {wrap_predict},
            target_column = 'target'
        ) AS (
            SELECT * FROM train_df
        )
    """

    c.sql(train_query)

    skmodel = LogisticRegression().fit(X_train, y_train)

    test_df = cudf.DataFrame(
        X_test, dtype=datatype, columns=[chr(i) for i in range(ncols)]
    )
    test_ddf = dask_cudf.from_cudf(test_df, npartitions=n_parts)
    c.create_table("test_df", test_ddf)

    inference_query = """
        SELECT * FROM PREDICT(
            MODEL model,
            SELECT * FROM test_df
        )
    """

    preds = c.sql(inference_query).compute()
    score = cuml.metrics.accuracy_score(y_test, preds["target"].to_numpy())

    assert score >= skmodel.score(X_test, y_test) - 0.022
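For readers without a GPU stack, the same `CREATE MODEL` / `PREDICT` statements work with CPU estimators as well. A minimal sketch assuming scikit-learn (and dask-ml for `wrap_predict`) is installed; the table and model names are illustrative:

import pandas as pd
from dask_sql import Context

c = Context()
train_df = pd.DataFrame({"x": [0.0, 1.0, 2.0, 3.0], "target": [0, 0, 1, 1]})
c.create_table("train_df", train_df)

# Train a scikit-learn model through SQL
c.sql("""
    CREATE MODEL my_model WITH (
        model_class = 'sklearn.linear_model.LogisticRegression',
        wrap_predict = True,
        target_column = 'target'
    ) AS (
        SELECT * FROM train_df
    )
""")

# Predict through SQL; the result gains a "target" column with predictions
preds = c.sql("""
    SELECT * FROM PREDICT(
        MODEL my_model,
        SELECT x FROM train_df
    )
""").compute()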
Example #9
def app_client():
    c = Context()
    c.sql("SELECT 1 + 1").compute()
    _init_app(app, c)

    # late import for the importskip
    from fastapi.testclient import TestClient

    yield TestClient(app)

    # don't disconnect the client if using an independent cluster
    if os.getenv("DASK_SQL_TEST_SCHEDULER", None) is None:
        app.client.close()
Example #10
def test_add_remove_tables():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame(), npartitions=1)

    c.create_table("table", data_frame)
    assert "table" in c.tables

    c.drop_table("table")
    assert "table" not in c.tables

    with pytest.raises(KeyError):
        c.drop_table("table")

    c.create_table("table", [data_frame])
    assert "table" in c.tables
Example #11
def create_context_distributed(sched):
    from dask_sql import Context
    from dask.distributed import Client

    # Need dev version 1.18+ of reticulate
    # Error: C stack usage   is too close to the limit
    # devtools::install_github('rstudio/reticulate')
    if sched:
        client = Client(sched)
    else:
        client = Client()
    ctx = Context()

    # FIX: develop better client handling on the R side
    ctx.dors_client = client
    return ctx
Example #12
    def setUp(self):
        super().setUp()

        app.c = Context()
        self.client = TestClient(app)

        self.f = os.path.join(tempfile.gettempdir(), os.urandom(24).hex())
Example #13
def eq_sqlite(sql, **dfs):
    c = Context()
    engine = sqlite3.connect(":memory:")

    for name, df in dfs.items():
        c.create_table(name, df)
        df.to_sql(name, engine, index=False)

    dask_result = c.sql(sql).compute().reset_index(drop=True)
    sqlite_result = pd.read_sql(sql, engine).reset_index(drop=True)

    # Make sure SQL and Dask use the same "NULL" value
    dask_result = dask_result.fillna(np.NaN)
    sqlite_result = sqlite_result.fillna(np.NaN)

    assert_frame_equal(dask_result, sqlite_result, check_dtype=False)
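For reference, a hypothetical invocation of this helper: each keyword argument becomes a table of the same name on both the dask-sql and SQLite side, so the query is checked against both engines.

import pandas as pd

t = pd.DataFrame({"a": [1, 1, 2], "b": [10, 20, 30]})
eq_sqlite("SELECT a, SUM(b) AS s FROM t GROUP BY a ORDER BY a", t=t)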
Example #14
def _init_app(
    app: FastAPI,
    context: dask_sql.Context = None,
    client: dask.distributed.Client = None,
):
    app.c = context or Context()
    app.future_list = {}
    app.client = client or dask.distributed.Client()
Example #15
def run_server(context: Context = None,
               host: str = "0.0.0.0",
               port: int = 8080):  # pragma: no cover
    """
    Run an HTTP server for answering SQL queries using ``dask-sql``.
    It uses the `Presto Wire Protocol <https://github.com/prestodb/presto/wiki/HTTP-Protocol>`_
    for communication.
    This means it has a single POST endpoint `v1/statement`, which answers
    SQL queries (sent as a string in the body) with the output as JSON
    (in the format described in the documentation above).
    Every SQL expression that ``dask-sql`` understands can be used here.

    Note:
        The presto protocol also includes some statistics on the query
        in the response.
        These statistics are currently only filled with placeholder variables.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        host (:obj:`str`): The host interface to listen on (defaults to all interfaces)
        port (:obj:`int`): The port to listen on (defaults to 8080)

    Example:
        It is possible to run an SQL server by using the CLI script in ``dask_sql.server.app``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import run_server

            # Create your pre-filled context
            c = Context()
            ...

            run_server(context=c)

        After starting the server, it is possible to send queries to it, e.g. with the
        `presto CLI <https://prestosql.io/docs/current/installation/cli.html>`_
        or via sqlalchemy (e.g. using the `PyHive <https://github.com/dropbox/PyHive#sqlalchemy>`_ package):

        .. code-block:: python

            from sqlalchemy.engine import create_engine
            engine = create_engine('presto://localhost:8080/')

            import pandas as pd
            pd.read_sql_query("SELECT 1 + 1", con=engine)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.
    """
    if context is None:
        context = Context()

    app.c = context

    uvicorn.run(app, host=host, port=port)
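Beyond the sqlalchemy/PyHive route shown in the docstring, the endpoint can also be exercised directly: the SQL string is POSTed to `v1/statement` and the answer is Presto-style JSON. A minimal sketch, assuming the server above is listening on localhost:8080; the exact response fields follow the Presto protocol and may include a `nextUri` for polling:

import requests

response = requests.post(
    "http://localhost:8080/v1/statement",
    data="SELECT 1 + 1",
)
payload = response.json()
# Presto-style responses carry "columns", "data" and "stats"
print(payload.get("columns"))
print(payload.get("data"))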
Example #16
def c():
    c = Context()
    c.create_schema(schema)
    row = create_table_row()
    tables = pd.DataFrame().append(row, ignore_index=True)
    tables = tables.astype({"AN_INT": "int64"})
    c.create_table(table, tables, schema_name=schema)

    yield c

    c.drop_schema(schema)
Example #17
def eq_sqlite(sql, **dfs):
    c = Context()
    engine = sqlite3.connect(":memory:")

    for name, df in dfs.items():
        c.create_table(name, df)
        df.to_sql(name, engine, index=False)

    dask_result = c.sql(sql).reset_index(drop=True)
    sqlite_result = pd.read_sql(sql, engine).reset_index(drop=True)

    # casting to object to ensure equality with sql-lite
    # which returns object dtype for datetime inputs
    dask_result = cast_datetime_to_string(dask_result)

    # Make sure SQL and Dask use the same "NULL" value
    dask_result = dask_result.fillna(np.NaN)
    sqlite_result = sqlite_result.fillna(np.NaN)

    assert_eq(dask_result, sqlite_result, check_dtype=False)
Example #18
def test_aggregation_adding():
    c = Context()

    assert not c.schema[c.schema_name].function_lists
    assert not c.schema[c.schema_name].functions

    f = lambda x: x
    c.register_aggregation(f, "f", [("x", int)], float)

    assert "f" in c.schema[c.schema_name].functions
    assert c.schema[c.schema_name].functions["f"] == f
    assert len(c.schema[c.schema_name].function_lists) == 2
    assert c.schema[c.schema_name].function_lists[0].name == "F"
    assert c.schema[c.schema_name].function_lists[0].parameters == [("x", int)]
    assert c.schema[c.schema_name].function_lists[0].return_type == float
    assert c.schema[c.schema_name].function_lists[0].aggregation
    assert c.schema[c.schema_name].function_lists[1].name == "f"
    assert c.schema[c.schema_name].function_lists[1].parameters == [("x", int)]
    assert c.schema[c.schema_name].function_lists[1].return_type == float
    assert c.schema[c.schema_name].function_lists[1].aggregation

    # Without replacement
    c.register_aggregation(f, "f", [("x", float)], int, replace=False)

    assert "f" in c.schema[c.schema_name].functions
    assert c.schema[c.schema_name].functions["f"] == f
    assert len(c.schema[c.schema_name].function_lists) == 4
    assert c.schema[c.schema_name].function_lists[2].name == "F"
    assert c.schema[c.schema_name].function_lists[2].parameters == [("x", float)]
    assert c.schema[c.schema_name].function_lists[2].return_type == int
    assert c.schema[c.schema_name].function_lists[2].aggregation
    assert c.schema[c.schema_name].function_lists[3].name == "f"
    assert c.schema[c.schema_name].function_lists[3].parameters == [("x", float)]
    assert c.schema[c.schema_name].function_lists[3].return_type == int
    assert c.schema[c.schema_name].function_lists[3].aggregation

    # With replacement
    f = lambda x: x + 1
    c.register_aggregation(f, "f", [("x", str)], str, replace=True)

    assert "f" in c.schema[c.schema_name].functions
    assert c.schema[c.schema_name].functions["f"] == f
    assert len(c.schema[c.schema_name].function_lists) == 2
    assert c.schema[c.schema_name].function_lists[0].name == "F"
    assert c.schema[c.schema_name].function_lists[0].parameters == [("x", str)]
    assert c.schema[c.schema_name].function_lists[0].return_type == str
    assert c.schema[c.schema_name].function_lists[0].aggregation
    assert c.schema[c.schema_name].function_lists[1].name == "f"
    assert c.schema[c.schema_name].function_lists[1].parameters == [("x", str)]
    assert c.schema[c.schema_name].function_lists[1].return_type == str
    assert c.schema[c.schema_name].function_lists[1].aggregation
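The test above only checks the registration bookkeeping. To actually execute a custom aggregation, dask-sql's documented pattern is to register a dask `Aggregation` (with a per-partition "chunk" step and a cross-partition "agg" step) and call it from SQL; a minimal sketch with illustrative names:

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask_sql import Context

c = Context()
ddf = dd.from_pandas(pd.DataFrame({"g": [0, 0, 1], "x": [1, 2, 3]}), npartitions=1)
c.create_table("t", ddf)

# "chunk" and "agg" steps of the custom aggregation
my_sum = dd.Aggregation("my_sum", lambda s: s.sum(), lambda s: s.sum())
c.register_aggregation(my_sum, "my_sum", [("x", np.int64)], np.float64)

result = c.sql("SELECT g, my_sum(x) AS s FROM t GROUP BY g").compute()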
Example #19
def main():  # pragma: no cover
    parser = ArgumentParser()
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = dask.distributed.Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = dask.datasets.timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    cmd_loop(
        context=context, client=client, startup=args.startup, log_level=args.log_level
    )
Example #20
def test_tables_from_stack():
    c = Context()

    assert not c._get_tables_from_stack()

    df = pd.DataFrame()

    assert "df" in c._get_tables_from_stack()

    def f():
        df2 = pd.DataFrame()

        assert "df" in c._get_tables_from_stack()
        assert "df2" in c._get_tables_from_stack()

    f()

    def g():
        df = pd.DataFrame({"a": [1]})

        assert "df" in c._get_tables_from_stack()
        assert c._get_tables_from_stack()["df"].columns == ["a"]
Example #21
def test_join_case_projection_subquery():
    c = Context()

    # Tables for query
    demo = pd.DataFrame({"demo_sku": [], "hd_dep_count": []})
    site_page = pd.DataFrame({"site_page_sk": [], "site_char_count": []})
    sales = pd.DataFrame({
        "sales_hdemo_sk": [],
        "sales_page_sk": [],
        "sold_time_sk": []
    })
    t_dim = pd.DataFrame({"t_time_sk": [], "t_hour": []})

    c.create_table("demos", demo, persist=False)
    c.create_table("site_page", site_page, persist=False)
    c.create_table("sales", sales, persist=False)
    c.create_table("t_dim", t_dim, persist=False)

    c.sql("""
    SELECT CASE WHEN pmc > 0.0 THEN CAST (amc AS DOUBLE) / CAST (pmc AS DOUBLE) ELSE -1.0 END AS am_pm_ratio
    FROM
    (
        SELECT SUM(amc1) AS amc, SUM(pmc1) AS pmc
        FROM
        (
            SELECT
                CASE WHEN t_hour BETWEEN 7 AND 8 THEN COUNT(1) ELSE 0 END AS amc1,
                CASE WHEN t_hour BETWEEN 19 AND 20 THEN COUNT(1) ELSE 0 END AS pmc1
            FROM sales ws
            JOIN demos hd ON (hd.demo_sku = ws.sales_hdemo_sk and hd.hd_dep_count = 5)
            JOIN site_page sp ON (sp.site_page_sk = ws.sales_page_sk and sp.site_char_count BETWEEN 5000 AND 6000)
            JOIN t_dim td ON (td.t_time_sk = ws.sold_time_sk and td.t_hour IN (7,8,19,20))
            GROUP BY t_hour
        ) cnt_am_pm
    ) sum_am_pm
    """).compute()
Example #22
def test_tables_from_stack(gpu):
    c = Context()

    assert not c._get_tables_from_stack()

    df = pd.DataFrame() if not gpu else cudf.DataFrame()

    assert "df" in c._get_tables_from_stack()

    def f(gpu):
        df2 = pd.DataFrame() if not gpu else cudf.DataFrame()

        assert "df" in c._get_tables_from_stack()
        assert "df2" in c._get_tables_from_stack()

    f(gpu=gpu)

    def g(gpu=gpu):
        df = pd.DataFrame({"a": [1]}) if not gpu else cudf.DataFrame({"a": [1]})

        assert "df" in c._get_tables_from_stack()
        assert c._get_tables_from_stack()["df"].columns == ["a"]

    g(gpu=gpu)
Example #23
    def get_context(cls, new=False):
        if cls._context is None or new:
            if not config["RAS"].getboolean("synchronous", False):
                cls._create_client()
            cls._context = Context()
            # We register an aggregate function called len which applies to string columns
            # Used for example in `test_probabilistic_frontend:test_postprob_conjunct_with_wlq_result`
            cls._context.register_aggregation(len, "len",
                                              [("x", pd.StringDtype())],
                                              np.int32)
            # We also register a sum which applies to objects (i.e. `Symbol` or sets)
            # since by default sum applies only to numbers in SQL and Calcite will
            # try to cast objects to float before applying the default sum op.
            cls._context.register_aggregation(sum, "sum", [("x", np.object_)],
                                              np.object_)
        return cls._context
Example #24
def test_sql():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame)

    result = c.sql("SELECT * FROM df")
    assert isinstance(result, dd.DataFrame)

    result = c.sql("SELECT * FROM df", return_futures=False)
    assert isinstance(result, pd.DataFrame)
Example #25
def test_query_case_sensitivity():
    c = Context()
    df = pd.DataFrame({"id": [0, 1]})

    c.create_table("test", df)

    try:
        c.sql(
            "select ID from test",
            config_options={"sql.identifier.case_sensitive": False},
        )
    except ParsingException as pe:
        assert False, f"Queries should be case insensitve but raised exception {pe}"
Example #26
def test_sql():
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame)

    result = c.sql("SELECT * FROM df")
    assert isinstance(result, dd.DataFrame)
    assert_frame_equal(result.compute(), data_frame.compute())

    result = c.sql("SELECT * FROM df", return_futures=False)
    assert isinstance(result, pd.DataFrame)
    assert_frame_equal(result, data_frame.compute())

    result = c.sql("SELECT * FROM other_df",
                   dataframes={"other_df": data_frame})
    assert isinstance(result, dd.DataFrame)
    assert_frame_equal(result.compute(), data_frame.compute())
Example #27
def test_sql(gpu):
    c = Context()

    data_frame = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
    c.create_table("df", data_frame, gpu=gpu)

    result = c.sql("SELECT * FROM df")
    assert isinstance(result, dd.DataFrame)
    assert_eq(result, data_frame)

    result = c.sql("SELECT * FROM df", return_futures=False)
    assert not isinstance(result, dd.DataFrame)
    assert_eq(result, data_frame)

    result = c.sql(
        "SELECT * FROM other_df", dataframes={"other_df": data_frame}, gpu=gpu
    )
    assert isinstance(result, dd.DataFrame)
    assert_eq(result, data_frame)
Example #28
def test_deprecation_warning():
    c = Context()
    data_frame = dd.from_pandas(pd.DataFrame(), npartitions=1)

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")

        c.register_dask_table(data_frame, "table")

        assert len(w) == 1
        assert issubclass(w[-1].category, DeprecationWarning)

    assert "table" in c.tables

    c.drop_table("table")
    assert "table" not in c.tables
Example #29
def test_fsql():
    def assert_eq(df: pd.DataFrame) -> None:
        assert_frame_equal(df, pd.DataFrame({"a": [1]}))

    # the simplest case: the SQL does not use any input and does not generate output
    fsql("""
    CREATE [[0],[1]] SCHEMA a:long
    SELECT * WHERE a>0
    OUTPUT USING assert_eq
    """)

    # it can directly use the dataframes inside dask-sql Context
    c = Context()
    c.create_table(
        "df",
        dd.from_pandas(pd.DataFrame([[0], [1]], columns=["a"]), npartitions=2))

    fsql(
        """
    SELECT * FROM df WHERE a>0
    OUTPUT USING assert_eq
    """,
        c,
    )

    # for dataframes with name, they can register back to the Context (register=True)
    # the return of fsql is the dict of all dask dataframes with explicit names
    result = fsql(
        """
    x=SELECT * FROM df WHERE a>0
    OUTPUT USING assert_eq
    """,
        c,
        register=True,
    )
    assert isinstance(result["x"], dd.DataFrame)
    assert "x" in c.tables

    # integration test with fugue transformer extension
    c = Context()
    c.create_table(
        "df1",
        dd.from_pandas(pd.DataFrame([[0, 1], [1, 2]], columns=["a", "b"]),
                       npartitions=2),
    )
    c.create_table(
        "df2",
        dd.from_pandas(pd.DataFrame([[1, 2], [3, 4], [-4, 5]],
                                    columns=["a", "b"]),
                       npartitions=2),
    )

    # schema: *
    def cumsum(df: pd.DataFrame) -> pd.DataFrame:
        return df.cumsum()

    fsql(
        """
    data = SELECT * FROM df1 WHERE a>0 UNION ALL SELECT * FROM df2 WHERE a>0 PERSIST
    result1 = TRANSFORM data PREPARTITION BY a PRESORT b USING cumsum
    result2 = TRANSFORM data PREPARTITION BY b PRESORT a USING cumsum
    PRINT result1, result2
    """,
        c,
        register=True,
    )
    assert "result1" in c.tables
    assert "result2" in c.tables
Example #30
def cmd_loop(
    context: Context = None,
    client: dask.distributed.Client = None,
    startup=False,
    log_level=None,
):  # pragma: no cover
    """
    Run a REPL for answering SQL queries using ``dask-sql``.
    Every SQL expression that ``dask-sql`` understands can be used here.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        startup (:obj:`bool`): Whether to wait until Apache Calcite was loaded
        log_level (:obj:`str`): The log level of the server and dask-sql

    Example:
        It is possible to run a REPL by using the CLI script in ``dask-sql``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import cmd_loop

            # Create your pre-filled context
            c = Context()
            ...

            cmd_loop(context=c)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.
    """
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", None)

    logging.basicConfig(level=log_level)

    client = client or dask.distributed.Client()
    context = context or Context()

    if startup:
        context.sql("SELECT 1 + 1").compute()

    session = PromptSession(lexer=PygmentsLexer(SqlLexer))

    while True:
        try:
            text = session.prompt("(dask-sql) > ")
        except KeyboardInterrupt:
            continue
        except EOFError:
            break

        text = text.rstrip(";").strip()

        if not text:
            continue

        try:
            df = context.sql(text, return_futures=False)
            print(df)
        except Exception as e:
            print(e)