import logging
import os

from pyspark.sql.types import StructField, StructType, StringType

from constants import dict_dbs_locations, edge_node_path
from helper_functions.initialize_spark_session import initialize_spark_session


def load_lz_standard_lookup(table_name):
    landing_zone_location = dict_dbs_locations.get('LANDING_ZONE_LOC')

    # Initializing a Spark session
    spark = initialize_spark_session('load_lz_' + table_name)

    # Loading the standard lookups with the same schema in the landing zone
    try:
        # Standard schema shared by all the standard lookups
        schema_lookups_schema = StructType([
            StructField("Code", StringType(), True),
            StructField("Description", StringType(), True)
        ])

        # Read every CSV file staged for this table on the edge node
        df_file = spark \
            .read \
            .schema(schema_lookups_schema) \
            .option("header", "true") \
            .csv(os.path.join(edge_node_path, table_name, '*.csv'))

        # Overwrite the table's directory in the landing zone
        df_file.write.format("csv") \
            .mode("overwrite") \
            .option("sep", ",") \
            .option('header', 'true') \
            .save(os.path.join(landing_zone_location, table_name))

        logging.info(f'{table_name} has been loaded in the landing zone.')
    except Exception as e:
        logging.error(f'Failed to load {table_name} in the landing zone, {e}')
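# --- Illustrative usage sketch (not part of the original source) ---
# Driving load_lz_standard_lookup for a single lookup table. L_AIRPORT is the
# only lookup named elsewhere in this repo; any other name here would be
# hypothetical.
if __name__ == '__main__':
    load_lz_standard_lookup('L_AIRPORT')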
from pyspark.sql.functions import col

from constants import dict_dbs_locations, dict_dbs_names
from helper_functions.initialize_spark_session import initialize_spark_session


def load_l_airport(spark, integration_layer_loc, landing_zone_name):
    # Target Delta table in the integration layer
    delta_l_airport = DeltaTable.forPath(spark,
                                         integration_layer_loc + '/L_AIRPORT')

    # Source rows from the landing zone
    df_LZ_l_airport = spark.sql(f"""
        SELECT CODE,
               DESCRIPTION
        FROM {landing_zone_name}.L_AIRPORT
    """)

    # Upsert: refresh descriptions for existing codes, insert new codes
    delta_l_airport.alias("oldData") \
        .merge(df_LZ_l_airport.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_l_airport')
    # Imported after the session is initialized so the Delta extensions
    # configured on the session are already in place
    from delta.tables import *

    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_airport(spark, integration_layer_loc, landing_zone_name)
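# --- Standalone sketch of the Delta MERGE pattern used above ---
# Not part of the original source. A minimal, self-contained demo of the same
# upsert semantics, assuming the delta-spark pip package is installed; the
# /tmp path and sample rows are hypothetical.
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

builder = SparkSession.builder \
    .appName('merge_demo') \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog")
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Seed a tiny Delta table, then upsert one changed row and one new row
spark.createDataFrame([("JFK", "John F Kennedy Intl")],
                      ["CODE", "DESCRIPTION"]) \
    .write.format("delta").mode("overwrite").save("/tmp/merge_demo")

target = DeltaTable.forPath(spark, "/tmp/merge_demo")
updates = spark.createDataFrame(
    [("JFK", "John F. Kennedy International"), ("LAX", "Los Angeles Intl")],
    ["CODE", "DESCRIPTION"])

target.alias("oldData") \
    .merge(updates.alias("newData"), "oldData.CODE = newData.CODE") \
    .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
    .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                  "DESCRIPTION": col("newData.DESCRIPTION")}) \
    .execute()

target.toDF().show()  # JFK updated in place, LAX inserted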
import logging
import os

from pyspark.sql.functions import col

from constants import dict_dbs_locations, dict_dbs_names
from helper_functions.initialize_spark_session import initialize_spark_session
# Assumed module path, mirroring sql_queries.landing_zone_ddl
from sql_queries.presentation_layer_ddl import ddl_create_presentation_layer_db

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s: %(levelname)s: %(message)s ")

if __name__ == '__main__':
    spark = initialize_spark_session('create_presentation_layer')
    from delta.tables import *

    # Creating the presentation_layer database in Spark SQL
    try:
        db_name = dict_dbs_names.get('PRESENTATION_LAYER_NAME')
        db_loc = dict_dbs_locations.get('PRESENTATION_LAYER_LOC')
        spark.sql(
            ddl_create_presentation_layer_db.format(
                presentation_layer_db_name=db_name,
                presentation_layer_db_loc=db_loc))
        spark.sql(f'USE {db_name}')
        logging.info(f'{db_name} has been created.')
    except Exception as e:
        logging.error(f'Failed to create the {db_name} db in spark sql, {e}')
        spark.stop()
        raise Exception(f'Failed to create the {db_name}, {e}')
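# --- Illustrative sketch (not part of the original source) ---
# ddl_create_presentation_layer_db is not shown in this excerpt; given how it
# is .format()-ed above, it is presumably a template along these lines:
ddl_create_presentation_layer_db = """
    CREATE DATABASE IF NOT EXISTS {presentation_layer_db_name}
    LOCATION '{presentation_layer_db_loc}'
"""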
        logging.info(
            'CITY_DEMOGRAPHICS has been loaded in the Presentation layer')
    except Exception as e:
        logging.error(
            'Failed to load CITY_DEMOGRAPHICS in the Presentation Layer')
        spark.stop()
        raise Exception(
            f'Failed to load CITY_DEMOGRAPHICS in the Presentation Layer, {e}')


if __name__ == '__main__':
    spark = initialize_spark_session('load_pl_city_demographics')
    from delta.tables import *

    try:
        presentation_layer_loc = dict_dbs_locations.get(
            'PRESENTATION_LAYER_LOC')
        presentation_layer_name = dict_dbs_names.get('PRESENTATION_LAYER_NAME')
        integration_layer_name = dict_dbs_names.get('INTEGRATION_LAYER_NAME')
    except Exception as e:
        logging.error('Failed to retrieve environment variables')
        spark.stop()
        raise Exception(
            f'Failed to load CITY_DEMOGRAPHICS in the Presentation Layer, {e}')

    load_pl_city_demographics(spark, presentation_layer_name,
                              presentation_layer_loc, integration_layer_name)
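# --- Hypothetical sketch (not part of the original source) ---
# The opening of load_pl_city_demographics is cut off in this excerpt. Based
# on the L_AIRPORT loader above, it presumably merges integration-layer rows
# into a presentation-layer Delta table; the merge key and the use of the
# *All() shortcuts below are assumptions, not the author's actual code.
from delta.tables import DeltaTable


def load_pl_city_demographics_sketch(spark, presentation_layer_loc,
                                     integration_layer_name):
    delta_target = DeltaTable.forPath(
        spark, presentation_layer_loc + '/CITY_DEMOGRAPHICS')
    df_source = spark.sql(
        f'SELECT * FROM {integration_layer_name}.CITY_DEMOGRAPHICS')
    delta_target.alias("oldData") \
        .merge(df_source.alias("newData"), "oldData.CITY = newData.CITY") \
        .whenMatchedUpdateAll() \
        .whenNotMatchedInsertAll() \
        .execute()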
import logging
import os

from pyspark.sql.types import StructField, StructType, StringType

from constants import dict_dbs_locations, edge_node_path
from helper_functions.initialize_spark_session import initialize_spark_session
from helper_functions.loop_files import loop_files
from helper_functions.zip_csv_to_gzip_parquet import zip_csv_to_gzip_parquet
from sql_queries.landing_zone_ddl import list_landing_zone_standard_lookups

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s: %(levelname)s: %(message)s ")

if __name__ == '__main__':
    # Initializing a Spark session
    spark = initialize_spark_session('load_landing_zone')

    landing_zone_location = dict_dbs_locations.get('LANDING_ZONE_LOC')

    # Loading the standard lookups with the same schema in the landing zone
    try:
        # Standard schema shared by all the standard lookups
        schema_lookups_schema = StructType([
            StructField("Code", StringType(), True),
            StructField("Description", StringType(), True)
        ])

        # Loop over all the standard lookups to load them
        for table_name in list_landing_zone_standard_lookups:
            df_file = spark \
                .read \
                .schema(schema_lookups_schema) \
                .option("header", "true") \
                .csv(os.path.join(edge_node_path, table_name, '*.csv'))

            # Same write pattern as load_lz_standard_lookup
            df_file.write.format("csv") \
                .mode("overwrite") \
                .option("sep", ",") \
                .option('header', 'true') \
                .save(os.path.join(landing_zone_location, table_name))

            logging.info(f'{table_name} has been loaded in the landing zone.')
    except Exception as e:
        logging.error(f'Failed to load {table_name} in the landing zone, {e}')
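# --- Illustrative sketch (not part of the original source) ---
# list_landing_zone_standard_lookups lives in sql_queries.landing_zone_ddl and
# is not shown here; it is presumably a list of lookup table names. L_AIRPORT
# is the only name confirmed elsewhere in this repo; the rest would follow the
# same Code/Description layout.
list_landing_zone_standard_lookups = [
    'L_AIRPORT',
    # ... further Code/Description lookups staged on the edge node
]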
import logging

from sql_queries.landing_zone_ddl import ddl_create_land_zone_db, dict_landing_zone_ddls
from constants import dict_dbs_locations, dict_dbs_names
from helper_functions.initialize_spark_session import initialize_spark_session

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s: %(levelname)s: %(message)s ")

if __name__ == '__main__':
    spark = initialize_spark_session('create_landing_zone')

    # Creating the landing_zone database in Spark SQL
    try:
        db_name = dict_dbs_names.get('LANDING_ZONE_NAME')
        db_loc = dict_dbs_locations.get('LANDING_ZONE_LOC')
        spark.sql(
            ddl_create_land_zone_db.format(landing_zone_db_name=db_name,
                                           landing_zone_db_loc=db_loc))
        logging.info(f'{db_name} has been created.')
    except Exception as e:
        logging.error(f'Failed to create the {db_name} db in spark sql, {e}')
        spark.stop()
        raise Exception(f'Failed to create the {db_name}, {e}')

    # Creating the landing zone tables
    try:
        # dict_landing_zone_ddls is assumed to map table names to their DDL
        # statements (its definition is not shown in this excerpt)
        for table_name, ddl in dict_landing_zone_ddls.items():
            spark.sql(ddl)
            logging.info(f'{table_name} has been created.')
    except Exception as e:
        logging.error(f'Failed to create the landing zone tables, {e}')
        spark.stop()
        raise Exception(f'Failed to create the landing zone tables, {e}')
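# --- Illustrative sketch (not part of the original source) ---
# A hedged guess at the shape of dict_landing_zone_ddls, keyed by table name.
# The L_AIRPORT entry mirrors the Code/Description schema the loaders write as
# headered CSV; the exact DDL text is an assumption.
dict_landing_zone_ddls = {
    'L_AIRPORT': """
        CREATE TABLE IF NOT EXISTS L_AIRPORT (
            Code STRING,
            Description STRING
        )
        USING CSV
        OPTIONS (header 'true', sep ',')
    """,
}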
import logging
import os

from pyspark.sql.functions import col

from constants import dict_dbs_locations, dict_dbs_names
from helper_functions.initialize_spark_session import initialize_spark_session
# Assumed module path, mirroring sql_queries.landing_zone_ddl
from sql_queries.integration_layer_ddl import ddl_create_integration_layer_db

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s: %(levelname)s: %(message)s ")

if __name__ == '__main__':
    spark = initialize_spark_session('create_integration_layer')
    from delta.tables import *

    # Creating the integration_layer database in Spark SQL
    try:
        db_name = dict_dbs_names.get('INTEGRATION_LAYER_NAME')
        db_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
        spark.sql(
            ddl_create_integration_layer_db.format(
                integration_layer_db_name=db_name,
                integration_layer_db_loc=db_loc))
        spark.sql(f'USE {db_name}')
        logging.info(f'{db_name} has been created.')
    except Exception as e:
        logging.error(f'Failed to create the {db_name} db in spark sql, {e}')
        spark.stop()
        raise Exception(f'Failed to create the {db_name}, {e}')
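# --- Illustrative sketch (not part of the original source) ---
# A quick post-creation check on the same session, using standard PySpark
# catalog APIs (Catalog.databaseExists requires PySpark 3.3+):
assert spark.catalog.databaseExists(db_name)
spark.sql('SHOW DATABASES').show()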