Example #1
def datasets(spark_session, expected_ds, result_ds, project_key=None):
    """Validate that two data sets are the same.

    Args:
        spark_session (SparkSession): Spark session used to load the data frames.
        expected_ds (str): The expected data set to load.
        result_ds (str): The result data set to load.
        project_key (str): Used if the data sets live in a separate Dataiku project.
    """
    expected_df = dataframe.get(spark_session, expected_ds, prefix=project_key)
    result_df = dataframe.get(spark_session, result_ds, prefix=project_key)
    diff_ret = dfdiff.diff(expected_df, result_df)
    assert not diff_ret, "Dataframe diff: " + diff_ret
    print("Test successful")
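A hedged usage sketch (not from the source): calling the helper above from a pytest test, with the dataset names and project key invented purely for illustration.

def test_contracts_match_expected(spark_session):
    # Hypothetical dataset names; compares a recipe's output with a fixture.
    datasets(spark_session,
             expected_ds='expected_contracts',
             result_ds='contracts',
             project_key='NEWSLTD')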
Example #2
def test_get_with_schema(
        dataframe_source,
        fixtures,  # noqa F401
        expected,  # noqa F401
        spark_session,
        default_schema,
        wrong_types_schema,
        wrong_fields_schema,
        wrong_num_fields_schema,
        one_field_schema):
    dataframe_source.write(fixtures, 'assert_example')
    assert_partial = partial(assert_schema, spark_session, 'assert_example',
                             expected, dataframe_source)
    assertion_error_partial = partial(assertion_error, spark_session,
                                      'assert_example', expected,
                                      dataframe_source)
    assert_partial(default_schema)
    # A schema with wrong types should raise a Py4JJavaError once the data is materialized
    with pytest.raises(Py4JJavaError):
        result = dataframe.get(spark_session,
                               'assert_example',
                               schema=wrong_types_schema,
                               dataframe_source=dataframe_source)
        result.collect()  # Force processing to trigger schema issues
    # Test assertion errors
    assertion_error_partial(wrong_fields_schema,
                            ".*Cols diff.*notletter,notnumber.*")
    assertion_error_partial(wrong_num_fields_schema,
                            ".*Cols diff.*Expected: letter,number,foo.*")
    assertion_error_partial(one_field_schema,
                            ".*Cols diff.*Expected: letter.*")
Example #3
def assert_schema(
        spark_session,
        dataset_name,
        expected,  # noqa F401
        dataframe_source,
        schema):
    result = dataframe.get(spark_session,
                           dataset_name,
                           schema=schema,
                           dataframe_source=dataframe_source)
    assert not dfdiff.diff(result, expected)
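The assertion_error helper used in Example #2 is not shown on this page. Based on the partial() arguments and the regex patterns passed to it, a plausible sketch (the name, argument order and use of pytest.raises are assumptions) is:

def assertion_error(
        spark_session,
        dataset_name,
        expected,  # noqa F401
        dataframe_source,
        schema,
        match):
    # Expect the schema check to fail with a message matching `match`.
    with pytest.raises(AssertionError, match=match):
        assert_schema(spark_session, dataset_name, expected,
                      dataframe_source, schema)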
Example #4
def test_get_with_schema_not_nullable(
        dataframe_source,
        fixtures,  # noqa F401
        expected,  # noqa F401
        spark_session,
        default_schema,
        wrong_nulls_schema):
    fixtures_w_nulls = fixtures.withColumn(
        'letter',
        F.when(F.col("letter") != 'a', F.col('letter')).otherwise(F.lit(None)))
    dataframe_source.write(fixtures_w_nulls, 'assert_example_not_nullable')
    dataframe.get(spark_session,
                  'assert_example_not_nullable',
                  schema=default_schema,
                  dataframe_source=dataframe_source)
    # This test currently does not raise a type error, since read with schema
    # doesn't raise an error on null values. Spark feature?
    dataframe.get(spark_session,
                  'assert_example_not_nullable',
                  schema=wrong_nulls_schema,
                  dataframe_source=dataframe_source)
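Since reading with a schema does not enforce nullability (per the comment above), a test that wants to catch the nulls would have to inspect the data itself. A minimal, purely illustrative sketch, assuming wrong_nulls_schema declares 'letter' as non-nullable:

result = dataframe.get(spark_session,
                       'assert_example_not_nullable',
                       schema=wrong_nulls_schema,
                       dataframe_source=dataframe_source)
# Count nulls in the column declared non-nullable and fail if any are present.
null_count = result.filter(F.col('letter').isNull()).count()
assert null_count == 0, 'Unexpected nulls in non-nullable column letter'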
Example #5
def test_equal(spark_session, fixtures, expected):
    s3_source = S3Source(format='parquet')
    dataset_name = "fixtures"
    s3_dir = "s3://birgittatestbucket/sourcetests"
    fixtures_mock = MagicMock()
    fixtures_mock.write.format().mode().save.return_value = None
    dataframe.write(fixtures_mock,
                    dataset_name,
                    prefix=s3_dir,
                    dataframe_source=s3_source)
    spark_session_mock = MagicMock()
    spark_session_mock.read.format().load.return_value = fixtures
    out_df = dataframe.get(spark_session_mock,
                           dataset_name,
                           prefix=s3_dir,
                           dataframe_source=s3_source)
    assert not dfdiff.diff(out_df, expected)
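The mocks above rely on MagicMock's chained-call behaviour: configuring mock.write.format().mode().save.return_value stubs out the whole write chain, whatever arguments are passed at each step. A standalone illustration (not part of the test):

from unittest.mock import MagicMock

m = MagicMock()
m.write.format().mode().save.return_value = None
# Any arguments hit the same child mocks, so the full chain returns None.
assert m.write.format('parquet').mode('overwrite').save('s3://bucket/path') is None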
Example #6
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import warnings

from birgitta import spark as bspark
from birgitta.dataframe import dataframe
from newsltd_etl.projects.chronicle.datasets.contract_data import dataset as ds_contract_data  # noqa 501
from newsltd_etl.projects.chronicle.datasets.contracts import dataset as ds_contracts  # noqa 501
from pyspark.sql import functions as F
warnings.filterwarnings('ignore')  # suppress python warnings

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
spark_session = bspark.session()
contract_data_df = dataframe.get(spark_session, ds_contract_data.name)

with_flag = contract_data_df.withColumn('current_flag', F.lit(1))

to_output_df = with_flag.select(
    F.col('customerid').alias('customer_id'),
    F.concat(F.lit('G47'), F.col('cellphone')).alias('phone'),
    F.col('accountid').alias('chronicle_account_id'),
    F.col('groupid').alias('group_account_id'), F.col('priceplan_code'),
    F.col('startdate_yyyymmdd').cast('date').alias('start_date'),
    F.col('enddate_yyyymmdd').cast('date').alias('end_date'),
    F.col('current_flag'),
    F.col('status').alias('client_status_code'))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
dataframe.write(to_output_df, ds_contracts.name, schema=ds_contracts.schema)
#
# Documentation goes here...

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
from birgitta import spark as bspark
from birgitta.dataframe import dataframe
from pyspark.sql import functions as F
from newsltd_etl.projects.tribune.datasets.filtered_contracts import dataset as ds_filtered_contracts # noqa 501
from newsltd_etl.projects.tribune.datasets.contracts import dataset as ds_contracts # noqa 501

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Get or create the spark session
spark_session = bspark.session()
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
contracts = dataframe.get(spark_session,
                          ds_contracts.name,
                          cast_binary_to_str=True)
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# ## Convert timestamps to dates

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE

contractsf = contracts.filter(contracts.brand_code != 44) \
        .withColumnRenamed("contract_prod_code", "product_code") \
        .withColumn("start_date", F.col("start_date").cast('date')) \
        .withColumn("end_date", F.col("end_date").cast('date'))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN

# ## Add product category
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import datetime

from birgitta import spark as bspark
from birgitta.dataframe import dataframe
from birgitta.recipe import params as recipe_params
from birgitta.recipe.debug import dataframe as dfdbg
from newsltd_etl.projects.tribune.datasets.daily_contract_states import dataset as ds_daily_contract_states  # noqa 501
from newsltd_etl.projects.tribune.datasets.filtered_contracts import dataset as ds_filtered_contracts  # noqa 501
from pyspark.sql import functions as f
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
analysis_start_date = datetime.date(2016, 1, 1)
today_date = recipe_params.today()
spark_session = bspark.session()
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
filtered_contracts = dataframe.get(spark_session,
                                   ds_filtered_contracts.name,
                                   cast_binary_to_str=True)
datedim = dataframe.get(spark_session, "date_dim")

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
sel_cols = [
    'customer_id', 'product_code', 'product', 'segment', 'product_name',
    'brand_name', 'start_date', 'end_date', 'shop_code', 'product_category'
]

contracts = filtered_contracts.select(*sel_cols)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# ## Convert timestamps to dates
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
contracts = contracts.withColumn("start_date",