Example #1
def test_complex_query(c):
    df = timeseries(freq="1d").persist()
    c.create_table("timeseries", df)

    result = c.sql("""
        SELECT
            lhs.name,
            lhs.id,
            lhs.x
        FROM
            timeseries AS lhs
        JOIN
            (
                SELECT
                    name AS max_name,
                    MAX(x) AS max_x
                FROM timeseries
                GROUP BY name
            ) AS rhs
        ON
            lhs.name = rhs.max_name AND
            lhs.x = rhs.max_x
    """).compute()

    assert len(result) > 0
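
The test above takes a dask-sql context c and assumes the dask demo timeseries generator is in scope. A minimal sketch of that setup, with the import paths inferred from the other examples on this page rather than from this snippet itself:

from dask.datasets import timeseries
from dask_sql import Context

# Build a fresh dask-sql context and run the join/aggregation test against it.
c = Context()
test_complex_query(c)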
Example #2
def timeseries_df(c):
    pdf = timeseries(freq="1d").compute().reset_index(drop=True)

    # insert NaNs into the pandas dataframe
    col1_index = np.random.randint(0, 30, size=int(pdf.shape[0] * 0.2))
    col2_index = np.random.randint(0, 30, size=int(pdf.shape[0] * 0.3))
    pdf.loc[col1_index, "x"] = np.nan
    pdf.loc[col2_index, "y"] = np.nan

    c.create_table("timeseries", pdf, persist=True)

    return None
Example #3
def main():  # pragma: no cover
    parser = ArgumentParser()
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    cmd_loop(
        context=context,
        client=client,
        startup=args.startup,
        log_level=args.log_level,
    )
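
This main() omits its imports; presumably it relies on something like the following, with the dask_sql paths assumed rather than confirmed by the snippet:

from argparse import ArgumentParser

from dask.datasets import timeseries
from dask.distributed import Client
from dask_sql import Context
from dask_sql.cmd import cmd_loop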
Example #4
import time

from dask.dataframe.shuffle import shuffle
from dask.datasets import timeseries
from dask.distributed import Client, wait

if __name__ == "__main__":
    client = Client("127.0.0.1:8786")
    ddf_h = timeseries(start='2000-01-01', end='2000-01-02', partition_freq='1min')
    result = shuffle(ddf_h, "id", shuffle="tasks")
    ddf = client.persist(result)
    _ = wait(ddf)
    client.shutdown()
    time.sleep(0.5)
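
dask.dataframe.shuffle.shuffle is an older entry point; on recent dask releases the same task-based shuffle is exposed as a DataFrame method, so an equivalent call (assuming a dask version where DataFrame.shuffle is available) would be:

result = ddf_h.shuffle("id")  # task-based shuffle on the "id" column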
Example #5
def gpu_training_df(c):
    if dask_cudf:
        df = timeseries(freq="1d").reset_index(drop=True)
        df = dask_cudf.from_dask_dataframe(df)
        c.create_table("timeseries", input_table=df)
    return None
Example #6
def training_df(c):
    df = timeseries(freq="1d").reset_index(drop=True)
    c.create_table("timeseries", df, persist=True)

    return None
Example #7
    @classmethod
    def setUpClass(cls):
        cls.c = Context()

        df = timeseries(freq="1d").persist()
        cls.c.register_dask_table(df, "timeseries")
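
register_dask_table is the older dask-sql spelling; the other examples on this page use create_table. If the installed dask-sql version has dropped the old name, the equivalent call would presumably be:

cls.c.create_table("timeseries", df)  # newer name for the same registration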
Example #8
dd = pytest.importorskip("dask.dataframe")
pyspark = pytest.importorskip("pyspark")
pytest.importorskip("pyarrow")
pytest.importorskip("fastparquet")

from dask.dataframe.utils import assert_eq

if not sys.platform.startswith("linux"):
    pytest.skip(
        "Unnecessary, and hard to get spark working on non-linux platforms",
        allow_module_level=True,
    )

# pyspark auto-converts timezones -- round-tripping timestamps is easier if
# we set everything to UTC.
pdf = timeseries(freq="1H").compute()
pdf.index = pdf.index.tz_localize("UTC")
pdf = pdf.reset_index()


@pytest.fixture(scope="module")
def spark_session():
    # Spark registers a global signal handler that can cause problems elsewhere
    # in the test suite. In particular, the handler fails if the spark session
    # is stopped (a bug in pyspark).
    prev = signal.getsignal(signal.SIGINT)
    # Create a spark session. Note that we set the timezone to UTC to avoid
    # conversion to local time when reading parquet files.
    spark = (
        pyspark.sql.SparkSession.builder.master("local")
        .appName("Dask Testing")
        .config("spark.sql.session.timeZone", "UTC")
        .getOrCreate()
    )