示例#1
0
def validate(spark):
    """
    Validate the data loaded into the data warehouse.

    Loads the fact table and the seven dimension tables from their parquet
    locations through a ``Validator`` built on ``spark``, then checks each
    loaded table with ``Validator.contain_data``.

    Args:
        spark: object passed straight to ``Validator`` (presumably the
            active SparkSession -- confirm against ``Validator``).

    Raises:
        AssertionError: if ``contain_data`` returns a falsy value for any
            table. The message names the offending table and its path.
    """

    parquetpaths = {
        'i94': 'parquets/i94.parquet',
        'airports': 'parquets/airports.parquet',
        'cities': 'parquets/cities.parquet',
        'visa': 'parquets/visas.parquet',
        'countries': 'parquets/countries.parquet',
        'transport_modes': 'parquets/trasnp_modes.parquet',
        'ports': 'parquets/ports.parquet',
        'states': 'parquets/states.parquet'
    }

    validator = Validator(spark)

    df_i94_loaded = validator.get_facts(parquetpaths['i94'])
    (df_airport_loaded, df_cities_loaded, df_visas_loaded,
     df_transp_modes_loaded, df_countries_loaded, df_states_loaded,
     df_ports_loaded) = validator.get_dimensions(
        parquetpaths['airports'], parquetpaths['cities'], parquetpaths['visa'],
        parquetpaths['transport_modes'], parquetpaths['countries'],
        parquetpaths['states'], parquetpaths['ports'])

    # Checked in the same order as before: fact table first, then dimensions.
    loaded_tables = (
        ('i94', df_i94_loaded),
        ('airports', df_airport_loaded),
        ('cities', df_cities_loaded),
        ('visa', df_visas_loaded),
        ('transport_modes', df_transp_modes_loaded),
        ('countries', df_countries_loaded),
        ('states', df_states_loaded),
        ('ports', df_ports_loaded),
    )

    for table_name, df_loaded in loaded_tables:
        # Raise explicitly instead of using the `assert` statement: `assert`
        # is removed when Python runs with -O, which would silently turn this
        # whole validation into a no-op. AssertionError is kept so existing
        # callers that catch it continue to work.
        if not validator.contain_data(df_loaded):
            raise AssertionError(
                "Validation failed: table '{}' loaded from '{}' "
                "contains no data".format(table_name,
                                          parquetpaths[table_name]))