Exemplo n.º 1
0
"""Asset definitions for the multi_type_lakehouse example."""
import pandas as pd
from lakehouse import Column, computed_table, source_table
from pandas import DataFrame as PandasDF
from pyarrow import date32, float64, string
from pyspark.sql import DataFrame as SparkDF
from pyspark.sql import Window
from pyspark.sql import functions as f

# Source table declared at path ("sfo_q2_weather_sample",) with two columns:
# a float64 "tmpf" column and a string "valid_date" column.
sfo_q2_weather_sample_table = source_table(
    path=("sfo_q2_weather_sample",),
    columns=[
        Column("tmpf", float64()),
        Column("valid_date", string()),
    ],
)


@computed_table(
    input_assets=[sfo_q2_weather_sample_table],
    columns=[Column("valid_date", date32()),
             Column("max_tmpf", float64())],
)
def daily_temperature_highs_table(sfo_q2_weather_sample: PandasDF) -> PandasDF:
    """Computes the temperature high for each day.

    Args:
        sfo_q2_weather_sample: Weather observations. Must contain a ``valid``
            column that ``pd.to_datetime`` can parse.

    Returns:
        A frame indexed by ``valid_date`` holding the per-group maximum of
        every remaining column, with ``tmpf`` renamed to ``max_tmpf``.

    Raises:
        KeyError: If the input frame has no ``valid`` column.
    """
    # Build the grouping key on a copy (``assign``) instead of writing a new
    # column into the caller's frame, so computing this table has no side
    # effect on the input.
    # NOTE(review): the source table above declares a "valid_date" column,
    # but this reads "valid" -- confirm which name the stored data uses.
    # NOTE(review): groups are formed on the parsed value as-is; if "valid"
    # carries a time-of-day component this yields one row per timestamp
    # rather than per day -- confirm against the data.
    with_dates = sfo_q2_weather_sample.assign(
        valid_date=pd.to_datetime(sfo_q2_weather_sample["valid"]))
    group_max = with_dates.groupby("valid_date").max()
    return group_max.rename(columns={"tmpf": "max_tmpf"})


@computed_table(
    input_assets=[daily_temperature_highs_table],
Exemplo n.º 2
0
    @computed_asset(storage_key='filesystem', input_assets=[source_asset1, source_asset2])
    def casset(b_: int, c_: float) -> str:
        return str(b_) + str(c_)

    assert casset.computation
    assert casset.path == ('casset',)
    assert casset.computation.output_in_memory_type == str
    assert set(casset.computation.deps.keys()) == set(['b_', 'c_'])
    assert casset.computation.deps['b_'].in_memory_type == int
    assert casset.computation.deps['b_'].asset == source_asset1
    assert casset.computation.deps['c_'].in_memory_type == float
    assert casset.computation.deps['c_'].asset == source_asset2


# Column list passed to `computed_table` and compared against `casset.columns`
# in the test below.
COLUMNS = [
    Column('a', str),
    Column('bb', int),
]


def test_computed_table_no_deps():
    """Check the asset built by `computed_table` from a zero-argument function.

    The asset must carry a computation, take its path from the function name,
    report `str` as the in-memory output type, have no deps, and expose the
    column list it was declared with.
    """
    @computed_table(storage_key='filesystem', columns=COLUMNS)
    def casset() -> str:
        return 'a'

    computation = casset.computation

    assert computation
    assert casset.path == ('casset',)
    assert computation.output_in_memory_type == str
    # The decorated function takes no parameters, so no deps are expected.
    assert len(computation.deps.keys()) == 0
    assert casset.columns == COLUMNS


def test_computed_table_path():
Exemplo n.º 3
0
'''Asset definitions for the simple_lakehouse example.'''
import pandas as pd
from lakehouse import Column, computed_table, source_table
from pyarrow import date32, float64, string

# Source table declared on the 'filesystem' storage at path ('data',) with two
# columns: a float64 'tmpf' column and a string 'valid_date' column.
sfo_q2_weather_sample_table = source_table(
    storage_key='filesystem',
    path=('data',),
    columns=[
        Column('tmpf', float64()),
        Column('valid_date', string()),
    ],
)


@computed_table(
    storage_key='filesystem',
    input_assets=[sfo_q2_weather_sample_table],
    columns=[Column('valid_date', date32()),
             Column('max_tmpf', float64())],
)
def daily_temperature_highs_table(
        sfo_q2_weather_sample: pd.DataFrame) -> pd.DataFrame:
    '''Computes the temperature high for each day.

    Args:
        sfo_q2_weather_sample: Weather observations. Must contain a ``valid``
            column that ``pd.to_datetime`` can parse.

    Returns:
        A frame indexed by ``valid_date`` holding the per-group maximum of
        every remaining column, with ``tmpf`` renamed to ``max_tmpf``.

    Raises:
        KeyError: If the input frame has no ``valid`` column.
    '''
    # Build the grouping key on a copy (``assign``) instead of writing a new
    # column into the caller's frame, so computing this table has no side
    # effect on the input.
    # NOTE(review): the source table above declares a 'valid_date' column,
    # but this reads 'valid' -- confirm which name the stored data uses.
    # NOTE(review): groups are formed on the parsed value as-is; if 'valid'
    # carries a time-of-day component this yields one row per timestamp
    # rather than per day -- confirm against the data.
    with_dates = sfo_q2_weather_sample.assign(
        valid_date=pd.to_datetime(sfo_q2_weather_sample['valid']))
    group_max = with_dates.groupby('valid_date').max()
    return group_max.rename(columns={'tmpf': 'max_tmpf'})
Exemplo n.º 4
0
    @computed_asset(storage_key="filesystem", input_assets=[source_asset1, source_asset2])
    def casset(b_: int, c_: float) -> str:
        return str(b_) + str(c_)

    assert casset.computation
    assert casset.path == ("casset",)
    assert casset.computation.output_in_memory_type == str
    assert set(casset.computation.deps.keys()) == set(["b_", "c_"])
    assert casset.computation.deps["b_"].in_memory_type == int
    assert casset.computation.deps["b_"].asset == source_asset1
    assert casset.computation.deps["c_"].in_memory_type == float
    assert casset.computation.deps["c_"].asset == source_asset2


# Column list passed to `computed_table` and compared against `casset.columns`
# in the test below.
COLUMNS = [
    Column("a", str),
    Column("bb", int),
]


def test_computed_table_no_deps():
    """Check the asset built by `computed_table` from a zero-argument function.

    The asset must carry a computation, take its path from the function name,
    report `str` as the in-memory output type, have no deps, and expose the
    column list it was declared with.
    """
    @computed_table(storage_key="filesystem", columns=COLUMNS)
    def casset() -> str:
        return "a"

    computation = casset.computation

    assert computation
    assert casset.path == ("casset",)
    assert computation.output_in_memory_type == str
    # The decorated function takes no parameters, so no deps are expected.
    assert len(computation.deps.keys()) == 0
    assert casset.columns == COLUMNS


def test_computed_table_path():