def execute(self, context):  # sourcery skip: none-compare
    hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)
    spark, sc = init_spark(
        app_name=f'concatenate_{self.dag_name}_{self.date}',
        step='concatenate',
        env=self.env,
        dag_name=self.dag_name,
        layer=self.layer,
        path_ojdbc=self.path_ojdbc,
        path_spark_avro=self.path_spark_avro,
        path_native_lib=self.path_native_lib,
        executor_cores=self.executor_cores,
        executor_memory=self.executor_memory,
        executor_instances='1',
        driver_memory=self.driver_memory)

    if self.date is None:
        path = hdfs.generate_hdfs_path(dag_id=self.dag_name,
                                       env=self.env,
                                       layer=self.layer,
                                       is_partitioned=False)
    else:
        path = hdfs.generate_hdfs_path(dag_id=self.dag_name,
                                       env=self.env,
                                       layer=self.layer,
                                       date=self.date)

    list_filename = hdfs.list_filename(path)
    self.log.info(f'list_filename = {list_filename}')

    if len(list_filename) > 1:
        avro_schema = hdfs.read_avro_schema(
            path_avro_schema=self.path_avro_schema,
            layer=self.layer,
            dag_name=self.dag_name)
        df = hdfs.load_pyspark_df(spark=spark,
                                  data_format=self.format_data,
                                  path=f'hdfs://{path}')
        total_registry = df.count()
        self.log.info(f'Total records = {total_registry}')

        if self.date is not None:
            Variable.set(
                key=f'{self.dag_name}_{self.layer}_total_registry_{self.date}',
                value=total_registry)

        n_files = calculate_partitions(
            total_registry=total_registry,
            max_registry_by_avro=int(self.max_registry_by_avro))
        self.log.info(f'Concatenating {total_registry} records at {path} '
                      f'into {n_files} files.')
        hdfs.save_pyspark_df(df=df,
                             avro_schema=avro_schema,
                             compress_type=self.compress_type,
                             mode='append',
                             format=self.format_data,
                             partitions=n_files,
                             hdfs_path=path)
        self.log.info(f'Deleting original files at {path}')
        hdfs.remove_list_files(path=path, list_filename=list_filename)
    else:
        self.log.info('Files already concatenated')
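
# `calculate_partitions` is used above but not defined in this listing.
# A minimal sketch of what it presumably computes, assuming one output file
# per `max_registry_by_avro` records (the real implementation may differ):
import math

def calculate_partitions(total_registry: int, max_registry_by_avro: int) -> int:
    """Return how many output files are needed for `total_registry` records."""
    return max(1, math.ceil(total_registry / max_registry_by_avro))
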
def execute(self, context):
    template = '-' * 79
    hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)
    spark, sc = init_spark(app_name=f'sync_data-{self.dag_name}',
                           step='sync',
                           env=self.env,
                           dag_name=self.dag_name,
                           layer=self.layer,
                           path_ojdbc=self.path_ojdbc,
                           path_spark_avro=self.path_spark_avro,
                           path_native_lib=self.path_native_lib,
                           executor_cores=self.executor_cores,
                           executor_memory=self.executor_memory,
                           executor_instances=self.executor_instances,
                           driver_memory=self.driver_memory)
    avro_schema = hdfs.read_avro_schema(
        path_avro_schema=self.path_avro_schema,
        layer=self.layer,
        dag_name=self.dag_name)
    hdfs_path = hdfs.generate_hdfs_path(env=self.env,
                                        layer=self.layer,
                                        dag_id=self.dag_name,
                                        is_partitioned=False)
    sql_get_data = f'''
        SELECT {self.col_name_control_var}, {self.col_name_dt_ref}
        FROM {self.db_name}.{self.table_name}
        WHERE TO_DATE(to_char({self.col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
            < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')
        ORDER BY {self.col_name_control_var} ASC
    '''

    self.log.info(f'\n{template}\nGetting data from Oracle\n{template}')
    df_oracle_table = OracleHelper(self.oracle_conn_id) \
        .get_pyspark_df_from_table(oracle_driver=self.oracle_driver,
                                   spark=spark,
                                   table=f'({sql_get_data})',
                                   partition_col=self.col_name_control_var,
                                   n_partitions=250) \
        .orderBy(self.col_name_control_var) \
        .withColumn(self.col_name_control_var,
                    col(self.col_name_control_var).cast(LongType())) \
        .withColumn(self.col_name_dt_ref,
                    col(self.col_name_dt_ref).cast(StringType()))
    total_oracle = df_oracle_table.count()
    self.log.info(f'Total rows from Oracle = {total_oracle}')

    self.log.info(f'\n{template}\nGetting data from HDFS\n{template}')
    hdfs.mv_files(hdfs_src_path=hdfs_path,
                  hdfs_dst_path=f'{hdfs_path}/../.tmp_{self.dag_name}')
    df_hdfs = hdfs \
        .load_pyspark_df(spark=spark,
                         data_format='parquet',
                         path=f'{hdfs_path}/../.tmp_{self.dag_name}') \
        .orderBy(self.col_name_control_var) \
        .withColumn(self.col_name_control_var,
                    col(self.col_name_control_var).cast(LongType())) \
        .withColumn(self.col_name_dt_ref,
                    col(self.col_name_dt_ref).cast(StringType()))
    df_hdfs_filtered = df_hdfs \
        .select(col(self.col_name_control_var), col(self.col_name_dt_ref))
    total_hdfs = df_hdfs_filtered.count()
    self.log.info(f'Total rows from HDFS = {total_hdfs}')

    if total_hdfs > total_oracle:
        self.log.warning(f'\n{template}\nRow counts do not match!\n{template}')
        self.log.warning(f'\nOracle = {total_oracle}\nHDFS = {total_hdfs}')

        self.log.info(
            f'\n{template}\nExecuting: df_hdfs - df_oracle_table\n{template}')
        df_row_to_delete_hdfs = df_hdfs_filtered.subtract(df_oracle_table)
        list_row_to_delete_hdfs = [
            row[0] for row in df_row_to_delete_hdfs
            .select(self.col_name_control_var).collect()
        ]
        self.log.info(f'Total rows to delete = {df_row_to_delete_hdfs.count()}')

        self.log.info(f'\n{template}\nDeleting rows from HDFS\n{template}')
        df = df_hdfs.filter(
            ~df_hdfs[self.col_name_control_var].isin(list_row_to_delete_hdfs))
        total_registry = df.count()
        self.log.info(f'Total rows in new df = {total_registry}')
        df.show(n=1, truncate=False)
        n_files = calculate_partitions(
            total_registry=total_registry,
            max_registry_by_avro=int(self.max_registry_by_file))

        self.log.info(f'\n{template}\nWriting table in HDFS\n{template}')
        hdfs.save_pyspark_df(df=df,
                             format='parquet',
                             avro_schema=avro_schema,
                             compress_type=self.compress_type,
                             mode='overwrite',
                             partitions=n_files,
                             hdfs_path=hdfs_path)
        hdfs.remove_all_files(hdfs_path=f'{hdfs_path}/../.tmp_{self.dag_name}')

    # Restore the staged files if no rewrite happened; once the .tmp dir has
    # been emptied this move fails and is ignored.
    try:
        hdfs.mv_files(hdfs_src_path=f'{hdfs_path}/../.tmp_{self.dag_name}',
                      hdfs_dst_path=hdfs_path)
    except Exception as e:
        self.log.error(e)
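
# The block above moves the current files aside to a hidden .tmp dir, rewrites
# the target path, and finally tries to move the .tmp files back as a fallback.
# A hypothetical helper capturing that pattern, assuming only the HdfsHelper
# methods already used above:
from contextlib import contextmanager

@contextmanager
def staged_rewrite(hdfs, hdfs_path, tmp_path):
    """Move existing files to tmp_path; restore them if the rewrite fails."""
    hdfs.mv_files(hdfs_src_path=hdfs_path, hdfs_dst_path=tmp_path)
    try:
        yield tmp_path  # caller reads from tmp_path and rewrites hdfs_path
        hdfs.remove_all_files(hdfs_path=tmp_path)
    except Exception:
        hdfs.mv_files(hdfs_src_path=tmp_path, hdfs_dst_path=hdfs_path)
        raise
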
def execute(self, context):
    template = '-' * 79
    start = time.time()
    hdfs = HdfsHelper(hdfs_conn=context['hdfs_conn_id'])
    total_registry = int(
        Variable.get(f'{context["dag_name"]}'
                     f'_{context["origin_layer"]}'
                     f'_total_registry'
                     f'_{context["date"]}',
                     default_var='1'))
    n_partitions = calculate_partitions(
        total_registry=total_registry,
        max_registry_by_avro=int(context['max_registry_by_avro']))
    spark, sc = init_spark(
        app_name=f'pre_process_{context["dag_name"]}-{context["date"]}',
        step=context['step'],
        env=context['env'],
        dag_name=context['dag_name'],
        layer=context['origin_layer'],
        path_spark_avro=context['path_spark_avro'],
        path_spark_xml=context['path_spark_xml'],
        path_ojdbc=context['path_ojdbc'],
        path_native_lib=context['path_native_lib'],
        executor_cores=context['executor_cores'],
        executor_memory=context['executor_memory'],
        executor_instances='1',
        driver_memory=context['driver_memory'])
    avro_schema = hdfs.read_avro_schema(
        path_avro_schema=context['path_avro_schema'],
        layer=context['dest_layer'],
        dag_name=context['dag_name'])
    hdfs_raw_path = hdfs.generate_hdfs_path(dag_id=context['dag_name'],
                                            env=context['env'],
                                            layer=context['origin_layer'],
                                            date=context['date'])
    hdfs_pre_processed_path = hdfs.generate_hdfs_path(
        dag_id=context['dag_name'],
        env=context['env'],
        layer=context['dest_layer'],
        date=context['date'])

    self.log.info(
        f'\n{template}\nPre-processing data: {context["date"]}\n{template}')
    df = hdfs.load_pyspark_df(spark=spark,
                              data_format='avro',
                              path=hdfs_raw_path)
    df_pre_processed = pre_process_content(
        df=df,
        result_col_name='file',
        table_blob_col_blob=context['table_blob_col_blob'])

    self.log.info('Cleaning up records...')
    df_cleansing = cleansing_data(df=df_pre_processed,
                                  pre_processed='file',
                                  result_col_name='file')

    self.log.info(f'Writing data at {hdfs_pre_processed_path}')
    hdfs.save_pyspark_df(df=df_cleansing,
                         avro_schema=avro_schema,
                         compress_type='snappy',
                         mode='overwrite',
                         format='avro',
                         partitions=n_partitions,
                         hdfs_path=hdfs_pre_processed_path)
    self.log.info(f'\n***** REPORT *****\n'
                  f'Date = {context["date"]}\n'
                  f'Location = {hdfs_pre_processed_path}\n'
                  f'Total time = {time.time() - start} sec\n'
                  f'Total rows = {df_cleansing.count()}\n'
                  f'Total partitions = {df_cleansing.rdd.getNumPartitions()}')
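
# This step reads its settings from the `context` dict instead of `self`.
# A hypothetical example of the keys it expects (names taken from the lookups
# above; all values are illustrative):
pre_process_context = {
    'hdfs_conn_id': 'webhdfs_default',
    'dag_name': 'docs',
    'date': '2021-01-01',
    'step': 'pre_process',
    'env': 'prod',
    'origin_layer': 'raw',
    'dest_layer': 'pre_processed',
    'max_registry_by_avro': '100000',
    'table_blob_col_blob': 'BLOB_DOC',
    'path_avro_schema': '/schemas/avro',
    # ...plus path_spark_avro, path_spark_xml, path_ojdbc, path_native_lib,
    # executor_cores, executor_memory and driver_memory.
}
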
def execute(self, context):
    template = '-' * 79
    hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)
    spark, sc = init_spark(app_name=f'{self.step}_{self.dag_name}',
                           step=self.step,
                           env=self.env,
                           dag_name=self.dag_name,
                           layer=self.layer,
                           path_ojdbc=self.path_ojdbc,
                           path_spark_avro=self.path_spark_avro,
                           path_native_lib=self.path_native_lib,
                           executor_cores=self.executor_cores,
                           executor_memory=self.executor_memory,
                           executor_instances=self.executor_instances,
                           driver_memory=self.driver_memory)
    avro_schema = hdfs.read_avro_schema(
        path_avro_schema=self.path_avro_schema,
        layer=self.layer,
        dag_name=self.dag_name)

    for date in self.list_current_dates:
        start = time.time()
        query_blob = Variable.get(f'{self.current_dag_name}_sql_blob_{date}')
        # One page corresponds to 1000 rows.
        n_partitions = int(
            Variable.get(f'{self.current_dag_name}_total_pg_{date}'))
        hdfs_path = hdfs.generate_hdfs_path(dag_id=self.dag_name,
                                            env=self.env,
                                            layer=self.layer,
                                            date=date)
        self.log.info(
            f'\n{template}\nExtracting from Oracle: {date}\n{template}')

        try:
            if self.dag_name in ['acc', 'dimp']:
                raise ValueError(
                    'Switching execution from PySpark to pandas...')

            df = get_pyspark_df_to_process(
                oracle_conn_id=self.oracle_conn_id,
                oracle_conn_blob=self.oracle_conn_blob,
                oracle_driver=self.oracle_driver,
                spark=spark,
                n_partitions=n_partitions,
                query_blob=query_blob,
                table_blob_col_pk=self.table_blob_col_pk,
                table_blob_col_blob=self.table_blob_col_blob,
                current_dag_name=self.current_dag_name,
                extra_cols=self.extra_cols,
                date=date)
            df_preprocessed = preprocess_data_doc(
                df=df,
                col_type_pk=self.col_type_pk,
                table_blob_col_pk=self.table_blob_col_pk,
                table_blob_col_blob=self.table_blob_col_blob)

            # Loading the dataframe into HDFS before preprocessing reduces
            # shuffle operations.
            self.log.info(f'\n{template}\nWriting avro in HDFS\n{template}')
            df_preprocessed.explain()
            hdfs.save_pyspark_df(df=df_preprocessed,
                                 avro_schema=avro_schema,
                                 compress_type=self.compress_type,
                                 mode='append',
                                 format='avro',
                                 partitions=n_partitions,
                                 hdfs_path=hdfs_path)
            self.log.info(
                f'\n***** REPORT *****\n'
                f'Date = {date}\n'
                f'Location = {hdfs_path}\n'
                f'Total time = {time.time() - start} sec\n'
                f'Total rows = {df_preprocessed.count()}\n'
                f'Total partitions = {df_preprocessed.rdd.getNumPartitions()}')
        except (ValueError, Py4JJavaError,
                pyspark.sql.utils.PythonException) as err:
            # e.g. ValueError: can not serialize object larger than 2G
            self.log.error(
                f'\n{template}\n{err.__class__.__name__} occurred!\n{template}\n')
            self.log.info(f'\n{template}\nFalling back to pandas\n{template}')
            pdf = get_pandas_df_to_process(
                oracle_conn_id=self.oracle_conn_id,
                oracle_conn_blob=self.oracle_conn_blob,
                query_blob=query_blob,
                table_blob_col_pk=self.table_blob_col_pk,
                table_blob_col_blob=self.table_blob_col_blob,
                extra_cols=self.extra_cols,
                current_dag_name=self.current_dag_name,
                date=date)

            # TODO: extract this dispatch into its own function.
            if self.dag_name == 'acc':
                data = pandas_preprocess_acc(
                    pdf=pdf,
                    table_blob_col_blob=self.table_blob_col_blob,
                    compress_type=self.compress_type,
                    avro_schema=self.path_local_avro_schemas)
            elif self.dag_name == 'dimp':
                data = pandas_preprocess_dimp(
                    pdf=pdf,
                    table_blob_col_blob=self.table_blob_col_blob,
                    compress_type=self.compress_type,
                    avro_schema=self.path_local_avro_schemas)
            else:
                data = pandas_preprocess_data_doc(
                    pdf=pdf,
                    table_blob_col_pk=self.table_blob_col_pk,
                    table_blob_col_blob=self.table_blob_col_blob,
                    compress_type=self.compress_type,
                    avro_schema=self.path_local_avro_schemas)

            self.log.info(f'\n{template}\nWriting avro in HDFS\n{template}')
            hdfs.load_data(
                hdfs_path=hdfs_path,
                data=data,
                hdfs_filename=f'part-0-mapred_{self.current_dag_name}_{date}.avro')
            self.log.info(f'\n***** REPORT *****\n'
                          f'Date = {date}\n'
                          f'Location = {hdfs_path}\n'
                          f'Total time = {time.time() - start} sec\n'
                          f'Total rows = {len(pdf.index)}')
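
# The pandas fallback above hands `hdfs.load_data` a pre-serialized Avro
# payload. A minimal sketch of how the pandas_preprocess_* helpers might
# produce such bytes with fastavro (hypothetical; the real helpers also
# decode and clean the BLOB column first):
import io
import fastavro

def pandas_df_to_avro_bytes(pdf, avro_schema, compress_type='snappy'):
    """Serialize a pandas DataFrame to Avro bytes, one record per row."""
    buffer = io.BytesIO()
    fastavro.writer(buffer,
                    avro_schema,
                    pdf.to_dict(orient='records'),
                    codec=compress_type)
    return buffer.getvalue()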