Example #1
    def execute(self, context):
        template = '-' * 79
        start = time.time()
        hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)
        oracle = OracleHelper(self.oracle_conn_id)

        spark, sc = init_spark(app_name=f'{self.step}_{self.dag_name}',
                               step=self.step,
                               env=self.env,
                               dag_name=self.dag_name,
                               layer=self.layer,
                               path_ojdbc=self.path_ojdbc,
                               path_spark_avro=self.path_spark_avro,
                               path_native_lib=self.path_native_lib,
                               executor_cores=self.executor_cores,
                               executor_memory=self.executor_memory,
                               executor_instances=self.executor_instances,
                               driver_memory=self.driver_memory)
        avro_schema = hdfs \
            .read_avro_schema(path_avro_schema=self.path_avro_schema,
                              layer=self.layer,
                              dag_name=self.dag_name)

        hdfs_path = hdfs \
            .generate_hdfs_path(dag_id=self.dag_name,
                                env=self.env,
                                layer=self.layer,
                                is_partitioned=False)

        self.log.info(f'\n{template}\nGetting data from Oracle\n{template}')
        self.log.info(
            f'query: {self.sql_get_data}\nparameters:\n{self.dict_bind}')
        records = oracle.get_rows_with_bind(sql=self.sql_get_data,
                                            bind=self.dict_bind)
        list_dict_cols = read_data_config(
            self.dag_name)[self.dag_name]['hdfs_data_schema']['raw']['cols']
        df_oracle_table = convert_type_oracle_to_spark(
            spark=spark, records=records, list_dict_cols=list_dict_cols)

        df_preprocessed = preprocess_data_table(df_oracle_table)
        df_preprocessed.explain()
        df_preprocessed.printSchema()
        df_preprocessed.show(n=1)

        total_registry = df_oracle_table.count()
        n_partitions = calculate_partitions(total_registry=total_registry,
                                            max_registry_by_avro=int(
                                                self.max_registry_by_file))
        # TODO: analyze and test ORC (supports ACID transactions)
        self.log.info(f'\n{template}\nWriting table in HDFS\n{template}')
        hdfs.save_pyspark_df(df=df_preprocessed,
                             format='parquet',
                             avro_schema=avro_schema,
                             compress_type=self.compress_type,
                             mode='append',
                             partitions=n_partitions,
                             hdfs_path=hdfs_path)

        self.log.info(
            f'\n***** REPORT *****\n'
            f'Local            = {hdfs_path}\n'
            f'Total time       = {time.time() - start} sec\n'
            f'Total rows       = {total_registry}\n'
            f'Total partitions = {df_preprocessed.rdd.getNumPartitions()}')
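
This example sizes its output through `calculate_partitions`, which receives the total row count and the maximum number of rows allowed per file. A minimal sketch of what such a helper could look like, assuming it simply divides and rounds up (the project's real implementation may differ):

import math

def calculate_partitions(total_registry: int, max_registry_by_avro: int) -> int:
    # Hypothetical sketch: number of output files so that no single file
    # holds more than max_registry_by_avro rows; always at least one.
    if total_registry <= 0:
        return 1
    return max(1, math.ceil(total_registry / max_registry_by_avro))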
Example #2
    def execute(self, context):  # sourcery skip: none-compare
        hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)

        spark, sc = init_spark(
            app_name=f'concatenate_{self.dag_name}_{self.date}',
            step='concatenate',
            env=self.env,
            dag_name=self.dag_name,
            layer=self.layer,
            path_ojdbc=self.path_ojdbc,
            path_spark_avro=self.path_spark_avro,
            path_native_lib=self.path_native_lib,
            executor_cores=self.executor_cores,
            executor_memory=self.executor_memory,
            executor_instances='1',
            driver_memory=self.driver_memory
        )

        if self.date is None:
            path = hdfs.generate_hdfs_path(
                dag_id=self.dag_name,
                env=self.env,
                layer=self.layer,
                is_partitioned=False
            )
        else:
            path = hdfs.generate_hdfs_path(
                dag_id=self.dag_name,
                env=self.env,
                layer=self.layer,
                date=self.date
            )

        list_filename = hdfs.list_filename(path)
        self.log.info(f'list_filename = {list_filename}')

        if len(list_filename) > 1:
            avro_schema = hdfs.read_avro_schema(
                path_avro_schema=self.path_avro_schema,
                layer=self.layer,
                dag_name=self.dag_name
            )
            df = hdfs.load_pyspark_df(
                spark=spark,
                data_format=self.format_data,
                path=f'hdfs://{path}'
            )
            total_registry = df.count()
            self.log.info(f'Total registry = {total_registry}')

            if self.date is not None:
                Variable.set(key=f'{self.dag_name}_{self.layer}_total_registry_{self.date}',
                             value=total_registry)

            n_files = calculate_partitions(total_registry=total_registry,
                                           max_registry_by_avro=int(self.max_registry_by_avro))

            self.log.info(
                f'Concatenating {total_registry} registry at {path}. Generating {n_files} files.')
            hdfs.save_pyspark_df(
                df=df,
                avro_schema=avro_schema,
                compress_type=self.compress_type,
                mode='append',
                format=self.format_data,
                partitions=n_files,
                hdfs_path=path
            )

            self.log.info(f'Deleting file {path}')
            hdfs.remove_list_files(path=path, list_filename=list_filename)

        else:
            self.log.info('Files already concatenated')
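
For context, an operator like the one above is normally instantiated inside an Airflow DAG. The snippet below is only an illustration; the operator name `ConcatenateFilesOperator`, its import path, and the connection id are hypothetical, inferred from the attributes the example reads:

from datetime import datetime

from airflow import DAG
from operators.concatenate_files import ConcatenateFilesOperator  # hypothetical module

with DAG(dag_id='my_dag_concatenate',
         start_date=datetime(2021, 1, 1),
         schedule_interval='@daily',
         catchup=False) as dag:
    concatenate = ConcatenateFilesOperator(
        task_id='concatenate_files',
        hdfs_conn_id='webhdfs_default',     # assumed Airflow connection id
        dag_name='my_dag',
        env='prod',
        layer='raw',
        date='{{ ds }}',                    # assumes `date` is a templated field
        format_data='avro',
        compress_type='snappy',
        max_registry_by_avro='1000000',
        path_avro_schema='/schemas',        # placeholder paths
        path_ojdbc='/jars/ojdbc8.jar',
        path_spark_avro='/jars/spark-avro.jar',
        path_native_lib='/opt/hadoop/lib/native',
        executor_cores='2',
        executor_memory='4g',
        driver_memory='2g')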
Example #3
    def execute(self, **context):
        template = '-' * 79
        hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)
        spark, sc = init_spark(app_name=f'sync_data-{self.dag_name}',
                               step='sync',
                               env=self.env,
                               dag_name=self.dag_name,
                               layer=self.layer,
                               path_ojdbc=self.path_ojdbc,
                               path_spark_avro=self.path_spark_avro,
                               path_native_lib=self.path_native_lib,
                               executor_cores=self.executor_cores,
                               executor_memory=self.executor_memory,
                               executor_instances=self.executor_instances,
                               driver_memory=self.driver_memory)
        avro_schema = hdfs.read_avro_schema(
            path_avro_schema=self.path_avro_schema,
            layer=self.layer,
            dag_name=self.dag_name)
        hdfs_path = hdfs.generate_hdfs_path(env=self.env,
                                            layer=self.layer,
                                            dag_id=self.dag_name,
                                            is_partitioned=False)

        sql_get_data = f'''
        SELECT
            {self.col_name_control_var},
            {self.col_name_dt_ref}
        FROM {self.db_name}.{self.table_name}
        WHERE
            TO_DATE(to_char({self.col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
            < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')
        ORDER BY {self.col_name_control_var} ASC
        '''

        self.log.info(f'\n{template}\nGetting data from Oracle\n{template}')
        df_oracle_table = OracleHelper(self.oracle_conn_id) \
            .get_pyspark_df_from_table(oracle_driver=self.oracle_driver,
                                       spark=spark,
                                       table=f'({sql_get_data})',
                                       partition_col=self.col_name_control_var,
                                       n_partitions=250) \
            .orderBy(self.col_name_control_var) \
            .withColumn(self.col_name_control_var,
                        col(self.col_name_control_var).cast(LongType())) \
            .withColumn(self.col_name_dt_ref,
                        col(self.col_name_dt_ref).cast(StringType()))

        total_oracle = df_oracle_table.count()
        self.log.info(f'Total row from Oracle = {total_oracle}')

        self.log.info(f'\n{template}\nGetting data from HDFS\n{template}')
        hdfs.mv_files(hdfs_src_path=hdfs_path,
                      hdfs_dst_path=f'{hdfs_path}/../.tmp_{self.dag_name}')

        df_hdfs = hdfs \
            .load_pyspark_df(spark=spark,
                             data_format='parquet',
                             path=f'../../{hdfs_path}/../.tmp_{self.dag_name}') \
            .orderBy(self.col_name_control_var) \
            .withColumn(self.col_name_control_var, col(self.col_name_control_var).cast(LongType())) \
            .withColumn(self.col_name_dt_ref, col(self.col_name_dt_ref).cast(StringType()))

        df_hdfs_filtered = df_hdfs \
            .select(col(self.col_name_control_var),
                    col(self.col_name_dt_ref))

        total_hdfs = df_hdfs_filtered.count()
        self.log.info(f'Total row from HDFS = {total_hdfs}')

        if total_hdfs > total_oracle:
            self.log.warning(
                f'\n{template}\nRow counts do not match!\n{template}')
            self.log.warning(f'\nOracle = {total_oracle}'
                             f'\nHDFS   = {total_hdfs}')

            self.log.info(
                f'\n{template}\nExecuting: df_hdfs - df_oracle_table\n{template}'
            )
            df_row_to_delete_hdfs = df_hdfs_filtered.subtract(df_oracle_table)
            list_row_to_delete_hdfs = [
                row[0] for row in df_row_to_delete_hdfs.select(
                    self.col_name_control_var).collect()
            ]
            self.log.info(
                f'Total row to delete = {df_row_to_delete_hdfs.count()}')

            self.log.info(f'\n{template}\nDeleting rows from HDFS\n{template}')
            df = df_hdfs.filter(~df_hdfs[self.col_name_control_var].isin(
                list_row_to_delete_hdfs))
            total_registry = df.count()
            self.log.info(f'Total row new df = {total_registry}')
            df.show(n=1, truncate=False)

            n_files = calculate_partitions(total_registry=total_registry,
                                           max_registry_by_avro=int(
                                               self.max_registry_by_file))

            self.log.info(f'\n{template}\nWriting table in HDFS\n{template}')
            hdfs.save_pyspark_df(df=df,
                                 format='parquet',
                                 avro_schema=avro_schema,
                                 compress_type=self.compress_type,
                                 mode='overwrite',
                                 partitions=n_files,
                                 hdfs_path=hdfs_path)
            hdfs.remove_all_files(
                hdfs_path=f'{hdfs_path}/../.tmp_{self.dag_name}')

        try:
            hdfs.mv_files(hdfs_src_path=f'{hdfs_path}/../.tmp_{self.dag_name}',
                          hdfs_dst_path=hdfs_path)
        except Exception as e:
            self.log.error(e)
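
The heart of this sync step is the set difference (`subtract`) between the HDFS key columns and the Oracle key columns, followed by filtering those keys out of the HDFS dataframe. A self-contained, minimal reproduction of that pattern with made-up column names:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName('sync_subtract_demo').getOrCreate()

# Rows currently stored in HDFS (keys 1-4) and rows still present in Oracle (keys 1-3).
df_hdfs = spark.createDataFrame([(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')], ['id', 'dt_ref'])
df_oracle = spark.createDataFrame([(1, 'a'), (2, 'b'), (3, 'c')], ['id', 'dt_ref'])

# Keys present in HDFS but gone from Oracle: these rows must be removed.
df_to_delete = df_hdfs.select('id', 'dt_ref').subtract(df_oracle)
ids_to_delete = [row[0] for row in df_to_delete.select('id').collect()]

# Keep only the rows whose key is NOT in the deletion list.
df_synced = df_hdfs.filter(~col('id').isin(ids_to_delete))
df_synced.show()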
Example #4
    def execute(self, **context):
        template = '-' * 79
        start = time.time()
        hdfs = HdfsHelper(hdfs_conn=context['hdfs_conn_id'])
        total_registry = int(
            Variable.get(
                f'{context["dag_name"]}'
                f'_{context["origin_layer"]}'
                f'_total_registry'
                f'_{context["date"]}',
                default_var='1'))
        n_partitions = calculate_partitions(
            total_registry=total_registry,
            max_registry_by_avro=int(context['max_registry_by_avro']))
        spark, sc = init_spark(
            app_name=f'pre_process_{context["dag_name"]}-{context["date"]}',
            step=context['step'],
            env=context['env'],
            dag_name=context['dag_name'],
            layer=context['origin_layer'],
            path_spark_avro=context['path_spark_avro'],
            path_spark_xml=context['path_spark_xml'],
            path_ojdbc=context['path_ojdbc'],
            path_native_lib=context['path_native_lib'],
            executor_cores=context['executor_cores'],
            executor_memory=context['executor_memory'],
            executor_instances='1',
            driver_memory=context['driver_memory'])
        avro_schema = hdfs.read_avro_schema(
            path_avro_schema=context['path_avro_schema'],
            layer=context['dest_layer'],
            dag_name=context['dag_name'])
        hdfs_raw_path = hdfs.generate_hdfs_path(dag_id=context["dag_name"],
                                                env=context["env"],
                                                layer=context["origin_layer"],
                                                date=context['date'])
        hdfs_pre_processed_path = hdfs.generate_hdfs_path(
            dag_id=f'{context["dag_name"]}',
            env=context["env"],
            layer=context["dest_layer"],
            date=context['date'])

        self.log.info(
            f'\n{template}\npre_processing data: {context["date"]}\n{template}'
        )
        df = hdfs.load_pyspark_df(spark=spark,
                                  data_format='avro',
                                  path=hdfs_raw_path)

        df_pre_processed = pre_process_content(
            df=df,
            result_col_name='file',
            table_blob_col_blob=context['table_blob_col_blob'])

        self.log.info('Cleaning up registries...')
        df_cleansing = cleansing_data(df=df_pre_processed,
                                      pre_processed='file',
                                      result_col_name='file')

        self.log.info(f'Writing data at {hdfs_pre_processed_path}')
        hdfs.save_pyspark_df(df=df_cleansing,
                             avro_schema=avro_schema,
                             compress_type='snappy',
                             mode='overwrite',
                             format='avro',
                             partitions=n_partitions,
                             hdfs_path=hdfs_pre_processed_path)

        self.log.info(
            f'\n***** REPORT *****\n'
            f'Date             = {context["date"]}\n'
            f'Local            = {hdfs_pre_processed_path}\n'
            f'Total time       = {time.time() - start} sec\n'
            f'Total rows       = {df_cleansing.count()}\n'
            f'Total partitions = {df_cleansing.rdd.getNumPartitions()}')
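
Every example obtains `spark, sc` from an `init_spark` helper that takes jar paths and executor settings. A plausible sketch of such a helper, built on the standard `SparkSession` builder, is shown below; the actual project helper may set additional options (e.g. a YARN queue or serializer):

from pyspark.sql import SparkSession

def init_spark(app_name, step, env, dag_name, layer,
               path_ojdbc, path_spark_avro, path_native_lib,
               executor_cores, executor_memory, executor_instances,
               driver_memory, **kwargs):
    # Hypothetical sketch: build a SparkSession with the given jars and
    # resource settings and return both the session and its SparkContext.
    spark = (SparkSession.builder
             .appName(app_name)
             .config('spark.jars', f'{path_ojdbc},{path_spark_avro}')
             .config('spark.driver.extraLibraryPath', path_native_lib)
             .config('spark.executor.cores', executor_cores)
             .config('spark.executor.memory', executor_memory)
             .config('spark.executor.instances', executor_instances)
             .config('spark.driver.memory', driver_memory)
             .getOrCreate())
    return spark, spark.sparkContext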
Example #5
    def execute(self, context):
        template = '-' * 79
        hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)
        spark, sc = init_spark(app_name=f'{self.step}_{self.dag_name}',
                               step=self.step,
                               env=self.env,
                               dag_name=self.dag_name,
                               layer=self.layer,
                               path_ojdbc=self.path_ojdbc,
                               path_spark_avro=self.path_spark_avro,
                               path_native_lib=self.path_native_lib,
                               executor_cores=self.executor_cores,
                               executor_memory=self.executor_memory,
                               executor_instances=self.executor_instances,
                               driver_memory=self.driver_memory)
        avro_schema = hdfs.read_avro_schema(
            path_avro_schema=self.path_avro_schema,
            layer=self.layer,
            dag_name=self.dag_name)

        for date in self.list_current_dates:
            start = time.time()
            query_blob = Variable.get(
                f'{self.current_dag_name}_sql_blob_{date}')
            n_partitions = int(
                Variable.get(f'{self.current_dag_name}_total_pg_{date}')
            )  # 1 pg = 1000 rows
            hdfs_path = hdfs.generate_hdfs_path(dag_id=self.dag_name,
                                                env=self.env,
                                                layer=self.layer,
                                                date=date)

            self.log.info(
                f'\n{template}\nExtracting from Oracle: {date}\n{template}')
            try:
                if self.dag_name in ['acc', 'dimp']:
                    raise ValueError('Switch execution pyspark to pandas ...')

                df = get_pyspark_df_to_process(
                    oracle_conn_id=self.oracle_conn_id,
                    oracle_conn_blob=self.oracle_conn_blob,
                    oracle_driver=self.oracle_driver,
                    spark=spark,
                    n_partitions=n_partitions,
                    query_blob=query_blob,
                    table_blob_col_pk=self.table_blob_col_pk,
                    table_blob_col_blob=self.table_blob_col_blob,
                    current_dag_name=self.current_dag_name,
                    extra_cols=self.extra_cols,
                    date=date)

                df_preprocessed = preprocess_data_doc(
                    df=df,
                    col_type_pk=self.col_type_pk,
                    table_blob_col_pk=self.table_blob_col_pk,
                    table_blob_col_blob=self.table_blob_col_blob)

                # loading the dataframe into HDFS before further preprocessing reduces shuffle operations
                self.log.info(
                    f'\n{template}\nWriting avro in HDFS\n{template}')
                df_preprocessed.explain()

                hdfs.save_pyspark_df(df=df_preprocessed,
                                     avro_schema=avro_schema,
                                     compress_type=self.compress_type,
                                     mode='append',
                                     format='avro',
                                     partitions=n_partitions,
                                     hdfs_path=hdfs_path)

                self.log.info(
                    f'\n***** REPORT *****\n'
                    f'Date             = {date}\n'
                    f'Local            = {hdfs_path}\n'
                    f'Total time       = {time.time() - start} sec\n'
                    f'Total rows       = {df_preprocessed.count()}\n'
                    f'Total partitions = {df_preprocessed.rdd.getNumPartitions()}'
                )

            except (ValueError, Py4JJavaError,
                    pyspark.sql.utils.PythonException) as err:
                "E.g: ValueError: can not serialize object larger than 2G"
                self.log.error(
                    f"\n{template}\n{err.__class__} occurred !!!\n{template}\n"
                )
                self.log.info(f'\n{template}\nExecuting Pandas\n{template}')

                pdf = get_pandas_df_to_process(
                    oracle_conn_id=self.oracle_conn_id,
                    oracle_conn_blob=self.oracle_conn_blob,
                    query_blob=query_blob,
                    table_blob_col_pk=self.table_blob_col_pk,
                    table_blob_col_blob=self.table_blob_col_blob,
                    extra_cols=self.extra_cols,
                    current_dag_name=self.current_dag_name,
                    date=date)

                # TODO: create function
                # >========================================================
                if self.dag_name == 'acc':
                    data = pandas_preprocess_acc(
                        pdf=pdf,
                        table_blob_col_blob=self.table_blob_col_blob,
                        compress_type=self.compress_type,
                        avro_schema=self.path_local_avro_schemas)
                elif self.dag_name == 'dimp':
                    data = pandas_preprocess_dimp(
                        pdf=pdf,
                        table_blob_col_blob=self.table_blob_col_blob,
                        compress_type=self.compress_type,
                        avro_schema=self.path_local_avro_schemas)
                else:
                    data = pandas_preprocess_data_doc(
                        pdf=pdf,
                        table_blob_col_pk=self.table_blob_col_pk,
                        table_blob_col_blob=self.table_blob_col_blob,
                        compress_type=self.compress_type,
                        avro_schema=self.path_local_avro_schemas)
                # >========================================================

                self.log.info(
                    f'\n{template}\nWriting avro in HDFS\n{template}')
                hdfs \
                    .load_data(hdfs_path=hdfs_path,
                               data=data,
                               hdfs_filename=f'part-0-mapred_{self.current_dag_name}_{date}.avro')

                self.log.info(f'\n***** REPORT *****\n'
                              f'Date             = {date}\n'
                              f'Local            = {hdfs_path}\n'
                              f'Total time       = {time.time() - start} sec\n'
                              f'Total rows       = {len(pdf.index)}')
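
In the pandas fallback branch, each `pandas_preprocess_*` call is expected to return the bytes of an Avro container, which `hdfs.load_data` then uploads as a single file. A hedged sketch of how a pandas DataFrame could be serialized to Avro bytes with `fastavro` (the schema path, column handling, and function name are assumptions, not the project's actual helpers):

import io
import json

import fastavro
import pandas as pd

def pandas_df_to_avro_bytes(pdf: pd.DataFrame, avro_schema_path: str,
                            compress_type: str = 'snappy') -> bytes:
    # Hypothetical sketch: serialize a pandas DataFrame into an in-memory Avro
    # container using the given schema and codec ('snappy' needs python-snappy).
    with open(avro_schema_path) as f:
        schema = fastavro.parse_schema(json.load(f))
    buf = io.BytesIO()
    fastavro.writer(buf, schema, pdf.to_dict(orient='records'), codec=compress_type)
    return buf.getvalue()

# e.g. data = pandas_df_to_avro_bytes(pdf, '/schemas/my_dag.avsc', 'snappy')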