Exemplo n.º 1
0
def test_schema_identifiers():
    """Verify name, key, identifier and field lookup of quick schemas.

    Covers one schema created without ``module_name`` and one created
    with ``module_name="m1"``.
    """
    field_specs = (("f1", "Unicode"), ("f2", "Integer"))

    # No module_name given: the key is expected to be prefixed with
    # DEFAULT_LOCAL_MODULE_NAME.
    local_schema = create_quick_schema("T1", fields=list(field_specs))
    assert local_schema.name == "T1"
    assert local_schema.key == f"{DEFAULT_LOCAL_MODULE_NAME}.T1"

    # Explicit module_name: key and identifier are derived from "m1".
    module_schema = create_quick_schema(
        "TestSchema",
        fields=list(field_specs),
        module_name="m1",
    )
    assert module_schema.name == "TestSchema"
    assert module_schema.key == "m1.TestSchema"
    assert module_schema.get_identifier() == "m1_test_schema"
    assert module_schema.get_field("f1").name == "f1"

    # Looking up a field that was never declared must raise NameError.
    with pytest.raises(NameError):
        assert module_schema.get_field("f3")
def dict_to_rough_schema(name: str,
                         d: Dict,
                         convert_to_snake_case=True,
                         **kwargs):
    """Build a quick schema whose fields are inferred from a sample dict.

    Each key of ``d`` becomes a field name and each value is wrapped in a
    one-element ``pd.Series`` so that ``pandas_series_to_sqlalchemy_type``
    can pick a field type for it.

    Args:
        name: Name given to the created schema.
        d: Sample mapping of field name to example value.
        convert_to_snake_case: When true, every key is passed through
            ``title_to_snake_case`` before being used as a field name.
        **kwargs: Forwarded unchanged to ``create_quick_schema``.

    Returns:
        Whatever ``create_quick_schema`` returns for the sorted field list.
    """

    def _field_name(raw_key):
        # Optional key normalisation, controlled by the caller.
        return title_to_snake_case(raw_key) if convert_to_snake_case else raw_key

    # Sorting the (name, type) pairs makes the field order independent of
    # the dict's insertion order.
    inferred_fields = sorted(
        (_field_name(key), pandas_series_to_sqlalchemy_type(pd.Series([value])))
        for key, value in d.items()
    )
    return create_quick_schema(name, inferred_fields, **kwargs)
Exemplo n.º 3
0
def test_schema_translation():
    """Verify the translation computed from an implementing schema to its base.

    ``t_impl`` declares an implementation of ``t_base`` that maps the base
    fields ``f1``/``f2`` to its own ``g1``/``g2``; translating from
    ``t_impl`` to ``t_base`` is expected to yield the inverse mapping.
    """
    env = make_test_env()

    base_schema = create_quick_schema(
        "t_base",
        fields=[("f1", "Unicode"), ("f2", "Integer")],
    )
    base_to_impl_fields = {"f1": "g1", "f2": "g2"}
    impl_schema = create_quick_schema(
        "t_impl",
        fields=[("g1", "Unicode"), ("g2", "Integer")],
        implementations=[Implementation("t_base", base_to_impl_fields)],
    )

    # Register the base first, then the implementation.
    for schema in (base_schema, impl_schema):
        env.add_schema(schema)

    with env.session_scope() as sess:
        result = get_schema_translation(
            env,
            sess,
            source_schema=impl_schema,
            target_schema=base_schema,
        )
        assert result.translation == {"g1": "f1", "g2": "f2"}
Exemplo n.º 4
0
from __future__ import annotations

from pandas import DataFrame
from snapflow.core.data_block import DataBlock
from snapflow.core.environment import Environment
from snapflow.core.execution import ExecutionManager, PipeContext, RunContext
from snapflow.core.graph import Graph
from snapflow.core.module import SnapflowModule
from snapflow.core.runtime import Runtime, RuntimeClass, RuntimeEngine
from snapflow.core.streams import DataBlockStream
from snapflow.schema.base import create_quick_schema
from snapflow.storage.storage import Storage, StorageClass, StorageEngine
from snapflow.utils.common import rand_str
from snapflow.utils.typing import T

# Fixture schemas, all created with module_name="_test".
# TestSchema1-3 each declare a single ``f1`` field; they differ only by name.
TestSchema1 = create_quick_schema(
    "TestSchema1",
    fields=[("f1", "Unicode(256)")],
    module_name="_test",
)
TestSchema2 = create_quick_schema(
    "TestSchema2",
    fields=[("f1", "Unicode(256)")],
    module_name="_test",
)
TestSchema3 = create_quick_schema(
    "TestSchema3",
    fields=[("f1", "Unicode(256)")],
    module_name="_test",
)
# TestSchema4 adds an integer field and declares ``f1`` in ``unique_on``.
TestSchema4 = create_quick_schema(
    "TestSchema4",
    fields=[("f1", "Unicode(256)"), ("f2", "Integer")],
    unique_on=["f1"],
    module_name="_test",
)


def make_test_env(**kwargs) -> Environment:
    # NOTE(review): this definition appears to be cut off in this listing;
    # only the default-handling branch below is visible, and `url` is not
    # used in the lines shown. Presumably the remainder builds and returns
    # an Environment from `kwargs` -- confirm against the full source.
    if "metadata_storage" not in kwargs:
        # Caller supplied no metadata storage: start from a bare SQLite URL.
        url = "sqlite://"
from snapflow.storage.data_formats import Records, RecordsIterator
from snapflow.storage.storage import new_local_python_storage


def test_example():
    """Run a single ``extract_dataframe`` node and compare its output.

    A ten-row DataFrame is handed to the node through its config; the
    DataFrame read back from the produced output is compared against the
    original with ``assert_almost_equal``.
    """
    environment = Environment(metadata_storage="sqlite://")
    graph = Graph(environment)
    environment.add_module(core)

    # Two identical integer columns, 0..9.
    source_df = pd.DataFrame({column: range(10) for column in ("a", "b")})
    node_config = {"dataframe": source_df}
    graph.create_node(key="n1", pipe="extract_dataframe", config=node_config)

    produced = environment.produce("n1", graph)
    assert_almost_equal(produced.as_dataframe(), source_df)


# Example schemas declared at module level for the snippets that follow.
Customer = create_quick_schema(
    "Customer",
    fields=[
        ("name", "Unicode"),
        ("joined", "DateTime"),
        ("metadata", "JSON"),
    ],
)
Metric = create_quick_schema(
    "Metric",
    fields=[
        ("metric", "Unicode"),
        ("value", "Numeric(12,2)"),
    ],
)


@pipe
def shape_metrics(i1: DataBlock) -> Records[Metric]:
    df = i1.as_dataframe()
    return [
        {
            "metric": "row_count",
            "value": len(df)
        },
        {
            "metric": "col_count",