def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Uppercase columns
    - Rename dataframe
    - Control input
    - Correct country names
    - Get country id
    - Add decade column
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get(
        'temperatures_by_country')
    country_mapping_path = config.get('country_mapping_path')

    df = read_with_meta(spark, df_meta=config['input_meta'], header=True)
    df = uppercase_columns(df=df, col_list=['Country'])
    df = rename(df=df)
    df = control_input(df=df)
    df = correct_country_names(df=df,
                               country_col='country_name',
                               country_mapping_path=country_mapping_path)
    df = get_country_id(spark, df=df, config=config)
    df = add_decade_column(df=df, date_col='date')

    write_with_meta(df=df, df_meta=config['output_meta'])
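

# The helpers called above are project-internal and not shown in this listing.
# A minimal sketch of two of them, assuming uppercase_columns simply upper-cases
# and trims the given string columns and add_decade_column truncates the year of
# `date_col` to a decade; treat both as illustrative assumptions, not the actual
# implementations.
import logging
from typing import List

from pyspark.sql import DataFrame
import pyspark.sql.functions as F


def uppercase_columns(df: DataFrame, col_list: List[str]) -> DataFrame:
    """Upper-case and trim the given string columns."""
    for col in col_list:
        df = df.withColumn(col, F.upper(F.trim(F.col(col))))
    logging.info(f"{col_list} columns are upper-cased")
    return df


def add_decade_column(df: DataFrame, date_col: str) -> DataFrame:
    """Derive a 'decade' column (e.g. 1994 -> 1990) from `date_col`."""
    return df.withColumn(
        'decade', (F.floor(F.year(F.col(date_col)) / 10) * 10).cast('int'))
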
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Convert dates from sas format to datetime
    - Uppercase columns
    - Rename dataframe
    - Correct country names
    - Get origin country id
    - Control input
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get('immigration')
    country_mapping_path = config.get('country_mapping_path')

    df = read_with_meta(spark, df_meta=config['input_meta'])
    df = convert_sas_to_date(df=df)
    df = uppercase_columns(df=df, col_list=['i94port', 'i94addr', 'occup', 'gender'])
    df = rename(df=df)
    df = correct_country_names(df=df, country_col='country_name',
                               country_mapping_path=country_mapping_path)
    df = get_country_id(spark, df=df, config=config)
    df = control_input(df=df)
    df = df.withColumnRenamed('country_id', 'origin_country_id')

    write_with_meta(df=df, df_meta=config['output_meta'])
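

# A minimal sketch of convert_sas_to_date, assuming the I94 SAS date fields
# ('arrdate', 'depdate') hold day offsets from 1960-01-01; the column names and
# the offset convention are assumptions, not confirmed by this listing.
from pyspark.sql import DataFrame
import pyspark.sql.functions as F


def convert_sas_to_date(df: DataFrame, date_cols=('arrdate', 'depdate')) -> DataFrame:
    """Convert SAS numeric dates (days since 1960-01-01) to proper date columns."""
    for col in date_cols:
        df = df.withColumn(
            col, F.expr(f"date_add(to_date('1960-01-01'), cast({col} as int))"))
    return df
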
Example #3
def read_data(spark, config: dict) -> tuple:
    """
    Read all dataframes that include country columns

    :param spark: Spark session
    :param config: config including input meta
    :return: dataframe tuple
    """
    gdp_per_capita = read_with_meta(spark,
                                    df_meta=config['gdp_per_capita_meta'],
                                    header=True)
    human_capital_index = read_with_meta(
        spark, df_meta=config['human_capital_index_meta'], header=True)
    press_freedom_index = read_with_meta(
        spark, df_meta=config['press_freedom_index_meta'], header=True)
    temperatures_by_country = read_with_meta(
        spark, df_meta=config['temperatures_by_country_meta'], header=True)
    immigration = read_with_meta(spark,
                                 df_meta=config['immigration_meta'],
                                 header=True)
    return (gdp_per_capita, human_capital_index, press_freedom_index,
            temperatures_by_country, immigration)
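

# A minimal sketch of read_with_meta, assuming df_meta is a dict carrying at
# least a 'data_path' and a 'data_format' key; the key names and meta layout are
# assumptions, since the helper itself is not part of this listing.
from pyspark.sql import DataFrame, SparkSession


def read_with_meta(spark: SparkSession, df_meta: dict, **options) -> DataFrame:
    """Read a dataframe from the location described by its meta entry,
    forwarding reader options such as header or sep."""
    return (spark.read
            .format(df_meta.get('data_format', 'parquet'))
            .options(**options)
            .load(df_meta['data_path']))
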
Example #4
def get_country_id(spark, df: DataFrame, config: Dict) -> DataFrame:
    """
    Get country id from its dimension table
    :param spark: Spark session
    :param df: dataframe
    :param config: config dict including the country dimension meta
    :return: dataframe
    """
    country = read_with_meta(spark, config['country_meta'])
    key_col = 'country_name'
    df = df.join(country, on=key_col, how='inner')
    df = df.drop(key_col)
    logging.info("Country name is converted to id from dimension table")
    return df
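

# get_country_id above uses an inner join, so rows whose country name has no
# match in the dimension table are silently dropped; correct_country_names is
# there to minimise those losses. A minimal sketch of it, assuming
# country_mapping_path points to a JSON file of {raw_name: canonical_name}
# pairs (the file format is an assumption, not shown in this listing).
import json

from pyspark.sql import DataFrame


def correct_country_names(df: DataFrame, country_col: str,
                          country_mapping_path: str) -> DataFrame:
    """Replace known aliases/misspellings in `country_col` with canonical names."""
    with open(country_mapping_path) as f:
        mapping = json.load(f)
    return df.replace(mapping, subset=[country_col])
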
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Uppercase columns
    - Rename dataframe
    - Correct country names
    - Get country id
    - Convert wide dataframe to long
    - Add rank column
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get(
        'human_capital_index')
    country_mapping_path = config.get('country_mapping_path')

    df = read_with_meta(spark, df_meta=config['input_meta'], header=True)
    df = uppercase_columns(df=df, col_list=['Country Name'])
    df = df.withColumnRenamed("Country Name", "country_name")
    df = correct_country_names(df=df,
                               country_col='country_name',
                               country_mapping_path=country_mapping_path)
    df = get_country_id(spark, df=df, config=config)

    df_long = melt(df=df,
                   key_cols=['country_id'],
                   value_cols=[str(i) for i in range(2010, 2021)],
                   var_name='year',
                   value_name='human_capital_index')
    df_long = add_rank_column(df=df_long,
                              partition_col='year',
                              order_by_col='human_capital_index',
                              rank_col='human_capital_rank',
                              ascending=False)

    write_with_meta(df=df_long, df_meta=config['output_meta'])
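

# melt and add_rank_column are not shown in this listing either. A minimal
# sketch of both, following the usual Spark wide-to-long pattern (explode over
# an array of structs) and a window-based dense rank; the signatures mirror the
# calls above, the bodies are assumptions.
from typing import List

from pyspark.sql import DataFrame, Window
import pyspark.sql.functions as F


def melt(df: DataFrame, key_cols: List[str], value_cols: List[str],
         var_name: str, value_name: str) -> DataFrame:
    """Unpivot `value_cols` into (var_name, value_name) rows, keeping key_cols."""
    pairs = F.explode(F.array(*[
        F.struct(F.lit(c).alias(var_name), F.col(c).alias(value_name))
        for c in value_cols])).alias('pair')
    return (df.select(*key_cols, pairs)
            .select(*key_cols, f'pair.{var_name}', f'pair.{value_name}'))


def add_rank_column(df: DataFrame, partition_col: str, order_by_col: str,
                    rank_col: str, ascending: bool = True) -> DataFrame:
    """Add a dense rank per partition, ordered by `order_by_col`."""
    order = F.col(order_by_col).asc() if ascending else F.col(order_by_col).desc()
    window = Window.partitionBy(partition_col).orderBy(order)
    return df.withColumn(rank_col, F.dense_rank().over(window))
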
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Uppercase columns
    - Rename dataframe
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get('us_cities_demographics')

    df = read_with_meta(spark, df_meta=config['input_meta'], header=True, sep=';')
    df = uppercase_columns(df=df, col_list=['City', 'State', 'Race'])
    df = rename(df=df)

    write_with_meta(df=df, df_meta=config['output_meta'])
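

# The rename helper is defined per script with its own column map; a minimal,
# illustrative sketch for this script, where the source column names below are
# assumptions about the CSV rather than the real mapping.
from pyspark.sql import DataFrame


def rename(df: DataFrame) -> DataFrame:
    """Rename source columns to the snake_case names used downstream."""
    column_map = {
        'City': 'city',
        'State': 'state',
        'Median Age': 'median_age',
        'Race': 'race',
        'Count': 'count',
    }
    for old, new in column_map.items():
        df = df.withColumnRenamed(old, new)
    return df
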
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Replace ids with values
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get(
        'immigration_mapping')
    mapping_config_path = config.get('mapping_config_path')

    df = read_with_meta(spark, df_meta=config['input_meta'])
    df = replace_ids_with_values(df=df,
                                 mapping_config_path=mapping_config_path)

    write_with_meta(df=df, df_meta=config['output_meta'])
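

# A minimal sketch of replace_ids_with_values, assuming mapping_config_path
# points to a JSON file of {column_name: {id: value}} lookups; the file layout
# and format are assumptions, not shown in this listing.
import json

from pyspark.sql import DataFrame


def replace_ids_with_values(df: DataFrame, mapping_config_path: str) -> DataFrame:
    """Swap coded ids for human-readable values, column by column."""
    with open(mapping_config_path) as f:
        mappings = json.load(f)
    for column, mapping in mappings.items():
        df = df.replace(mapping, subset=[column])
    return df
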
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Rename dataframe
    - Add decade column
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get(
        'global_temperatures')

    df = read_with_meta(spark, df_meta=config['input_meta'], header=True)
    df = rename(df=df)
    df = add_decade_column(df=df, date_col='date')

    write_with_meta(df, df_meta=config['output_meta'])
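

# A minimal sketch of write_with_meta, assuming the output meta carries a
# 'data_path', a 'data_format', and optional 'partition_cols'; these key names
# are assumptions, since the helper is not part of this listing.
from pyspark.sql import DataFrame


def write_with_meta(df: DataFrame, df_meta: dict) -> None:
    """Write a dataframe to the location described by its meta entry."""
    writer = df.write.format(df_meta.get('data_format', 'parquet')).mode('overwrite')
    partition_cols = df_meta.get('partition_cols')
    if partition_cols:
        writer = writer.partitionBy(*partition_cols)
    writer.save(df_meta['data_path'])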