Example no. 1
def test_conform():
    from snapflow.modules import core

    TestSchemaA = create_quick_schema("TestSchemaA", [("a", "Integer"),
                                                      ("b", "Integer")],
                                      namespace="core")
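    # TestSchemaB declares an Implementation of TestSchemaA: A's "b" values are
    # supplied by B's "c" column (see the expected output below).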
    TestSchemaB = create_quick_schema(
        "TestSchemaB",
        [("a", "Integer"), ("c", "Integer"), ("d", "Text")],
        implementations=[Implementation("TestSchemaA", {"b": "c"})],
        namespace="core",
    )

    core.add_schema(TestSchemaA)
    core.add_schema(TestSchemaB)

    input_data = """
        a,c,d
        1,2,i
        1,3,i
        1,4,i
        2,2,i
    """
    expected = """
        a,b
        1,2
        1,3
        1,4
        2,2
    """
    # expected_df = str_as_dataframe(expected, schema=core.schemas.CoreTestSchema)
    data_input = DataInput(input_data, schema=TestSchemaB)
    s = get_tmp_sqlite_db_url()
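    # Conform the same static input to TestSchemaA with both the dataframe and
    # the SQL implementations, writing output to a temporary sqlite storage.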
    for p in [
            dataframe_conform_to_schema,
            sql_conform_to_schema,
    ]:
        with produce_snap_output_for_static_input(
                p,
                input=data_input,
                target_storage=s,
                params={"schema": "TestSchemaA"}) as dbs:
            assert len(dbs) == 1
            db = dbs[0]
            expected_df = DataInput(expected, schema=TestSchemaA).as_dataframe(
                db.manager.env)
            df = db.as_dataframe()
            print(expected_df)
            print(df)
Example no. 2
def test_cast_to_schema(cast_level, inferred, nominal, expected):
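    # Cast an inferred schema to a nominal schema at the given cast level; the
    # expected outcome is a realized schema, a SchemaTypeError, or a warning.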
    inferred = create_quick_schema("Inf", fields=inferred)
    nominal = create_quick_schema("Nom", fields=nominal)
    if expected not in (ERROR, WARN):
        expected = create_quick_schema("Exp", fields=expected)
    env = make_test_env()
    with env.md_api.begin():
        if expected == ERROR:
            with pytest.raises(SchemaTypeError):
                s = cast_to_realized_schema(env, inferred, nominal, cast_level)
        elif expected == WARN:
            with pytest.warns(UserWarning):
                s = cast_to_realized_schema(env, inferred, nominal, cast_level)
        else:
            s = cast_to_realized_schema(env, inferred, nominal, cast_level)
            for f in s.fields:
                e = expected.get_field(f.name)
                assert f == e
Example no. 3
def dict_to_rough_schema(name: str,
                         d: Dict,
                         convert_to_snake_case=True,
                         **kwargs):
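    # Infer each field's type by wrapping its sample value in a single-element
    # pandas Series, optionally converting field names to snake_case.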
    fields = []
    for k, v in d.items():
        if convert_to_snake_case:
            k = title_to_snake_case(k)
        fields.append((k, pandas_series_to_field_type(pd.Series([v]))))
    fields = sorted(fields)
    return create_quick_schema(name, fields, **kwargs)
Example no. 4
from snapflow.core.data_block import DataBlock
from snapflow.core.environment import Environment, SnapflowSettings
from snapflow.core.execution import DataFunctionContext, ExecutionManager
from snapflow.core.execution.executable import (
    ExecutableConfiguration,
    ExecutionConfiguration,
    ExecutionContext,
)
from snapflow.core.function_interface import SelfReference
from snapflow.core.graph import Graph
from snapflow.core.module import SnapflowModule
from snapflow.core.runtime import Runtime, RuntimeClass, RuntimeEngine
from snapflow.core.streams import DataBlockStream, Stream
from snapflow.utils.typing import T

TestSchema1 = create_quick_schema("TestSchema1", [("f1", "Text")], namespace="_test")
TestSchema2 = create_quick_schema("TestSchema2", [("f1", "Text")], namespace="_test")
TestSchema3 = create_quick_schema("TestSchema3", [("f1", "Text")], namespace="_test")
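# Unlike the single-field schemas above, TestSchema4 has two fields and is unique on "f1".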
TestSchema4 = create_quick_schema(
    "TestSchema4",
    [("f1", "Text"), ("f2", "Integer")],
    unique_on=["f1"],
    namespace="_test",
)


def make_test_env(**kwargs) -> Environment:
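    # Default to a temporary sqlite database for metadata storage unless one is supplied.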
    if "metadata_storage" not in kwargs:
        url = get_tmp_sqlite_db_url()
        metadata_storage = Storage.from_url(url)
        kwargs["metadata_storage"] = metadata_storage
Example no. 5
File: utils.py Project: kvh/dcp
date_ = date(2020, 1, 1)
datestr = "1/1/2020"
dateisostr = "2020-01-01"
datetime_ = datetime(2020, 1, 1)
datetimestr = "2017-02-17T15:09:26-08:00"
timestr = "15:09:26"
time_ = time(20, 1, 1)
long_text = "helloworld" * int(65536 / 9)
json_ = {"hello": "world"}


test_records_schema = create_quick_schema(
    "TestRecordsSchema",
    [
        ("f1", "Text"),
        ("f2", "Integer"),
        ("f3", "Text"),
        ("f4", "Date"),
        ("f5", "Text"),
    ],
)

test_records = [
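    # Sample records for TestRecordsSchema; the final record's "f5" value is
    # intentionally malformed.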
    {"f1": "hi", "f2": 1, "f3": None, "f4": "2020-01-01", "f5": "2020-01-01 00:00:00"},
    {"f1": "bye", "f2": 2, "f3": None, "f4": "2020-01-01", "f5": "2020-01-01 00:00:00"},
    {"f1": None, "f2": 2, "f3": None, "f4": "2020-01-01", "f5": "2020-01-01 00:00:00"},
    {
        "f1": "bye",
        "f2": 3,
        "f3": None,
        "f4": "2020-01-01",
        "f5": "202001 bad data",
Example no. 6
from loguru import logger
from pandas._testing import assert_almost_equal
from snapflow import DataBlock, datafunction
from snapflow.core.environment import Environment, produce
from snapflow.core.execution import DataFunctionContext
from snapflow.core.function_interface import Consumable, Reference
from snapflow.core.graph import Graph
from snapflow.core.node import DataBlockLog, DataFunctionLog, NodeState
from snapflow.core.sql.sql_function import sql_function_factory
from snapflow.modules import core
from sqlalchemy import select

logger.enable("snapflow")

Customer = create_quick_schema(
    "Customer", [("name", "Text"), ("joined", "DateTime"), ("metadata", "Json")]
)
Metric = create_quick_schema("Metric", [("metric", "Text"), ("value", "Decimal(12,2)")])


@datafunction
def shape_metrics(i1: DataBlock) -> Records[Metric]:
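    # Report the input block's row and column counts as Metric records.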
    df = i1.as_dataframe()
    return [
        {"metric": "row_count", "value": len(df)},
        {"metric": "col_count", "value": len(df.columns)},
    ]


@datafunction
def aggregate_metrics(