Example No. 1
    def execute(self, **context):
        template = '-' * 79
        hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn)
        self.log.info(f'\n{template}\nGetting paths\n{template}')
        list_path = [
            hdfs.generate_hdfs_path(dag_id=context["dag_name"],
                                    env=context["env"],
                                    layer=context["layer"],
                                    date=date)
            for date in context['list_all_dates']
        ]
        self.log.info(list_path)

        self.log.info(f'\n{template}\nRemoving _SUCCESS\n{template}')
        hdfs.remove_empty_files(list_hdfs_path=list_path)

        self.log.info(f'\n{template}\nGetting paths to concat\n{template}')
        if context['agg_by'] == 'month':
            hdfs.create_dir_first_day(list_hdfs_path=list_path)

        self.log.info(list_path)

        if context['agg_by'] == 'month':
            self.log.info(f'\n{template}\nCreating folder YEAR/MONTH/01 each month\n{template}')
            list_path_to_mv = hdfs.remove_path_first_day(list_path)

            self.log.info(f'list_path_to_mv = {list_path_to_mv}')
            self.log.info(f'\n{template}\nMoving files to first day each month\n{template}')
            hdfs.mv_files_to_first_day(list_path_to_mv)
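Note: every example on this page calls HdfsHelper.generate_hdfs_path, whose implementation is not shown here. A minimal hypothetical sketch, assuming the /data/<env>/<layer>/<dag_id>/<YEAR>/<MONTH>/<DAY> layout implied by the hdfs_path used in Example No. 9 and the 'YEAR/MONTH/01' folders above (the real helper may differ):

def generate_hdfs_path(dag_id: str, env: str, layer: str,
                       date: str = None, is_partitioned: bool = True) -> str:
    # Hypothetical sketch only; base layout taken from Example No. 9.
    base = f'/data/{env}/{layer}/{dag_id}'
    if is_partitioned and date:
        # Dates elsewhere on this page are formatted 'DD-MM-YYYY'.
        day, month, year = date.split('-')
        return f'{base}/{year}/{month}/{day}'
    return base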
Example No. 2
    def execute(self, **context):
        hive = HiveHelper(hive_conn=context['hive_conn_id'])
        impala = ImpalaHelper(hive_conn=context['impala_conn_id'])
        list_all_dates = AirflowMetaStoreHelper().set_granularity(
            list_all_dates=context['list_all_dates'],
            agg_by=context['agg_by']
        )

        for date in list_all_dates:
            hdfs_path = HdfsHelper().generate_hdfs_path(env=context["env"],
                                                        layer=context['layer'],
                                                        dag_id=context['dag_name'],
                                                        date=date)

            self.log.info("HIVE: Creating partition")
            hive.add_partition(date=date,
                               db=context['dag_name'],
                               table=context['layer'],
                               hdfs_path=hdfs_path)

            self.log.info("IMPALA: Creating partition")
            impala.add_partition(date=date,
                                 db=context['dag_name'],
                                 table=context['layer'],
                                 hdfs_path=hdfs_path)

            self.log.info("IMPALA: Refreshing table")
            impala.execute_refresh_table(db=context['dag_name'],
                                         table=context['layer'])
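Note: AirflowMetaStoreHelper.set_granularity is used in several examples to reduce the full list of dates to the chosen aggregation level, but its code is not included on this page. A hypothetical sketch, assuming 'DD-MM-YYYY' strings and that agg_by='month' collapses each month to its first day (matching the YEAR/MONTH/01 folders of Example No. 1):

def set_granularity(list_all_dates: list, agg_by: str) -> list:
    # Hypothetical sketch; not the project's actual implementation.
    if agg_by == 'month':
        # Keep one entry per month, with the day forced to '01'.
        return sorted({'01-' + dt.split('-', 1)[1] for dt in list_all_dates})
    return sorted(set(list_all_dates))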
Example No. 3
    def generate_all_partitions(self, oracle_conn: str, table_ctrl: str,
                                table_ctrl_col_dt_ref: str, agg_by: str,
                                env: str, layer: str, data: str) -> None:
        """
        Generates all partitions in hive and impala.
        Useful when it is necessary to recreate/change dbs, tables or data directories in HDFS
        """
        list_all_dates = OracleHelper(oracle_conn).get_all_dates(
            table_ctrl=table_ctrl, table_ctrl_col_dt_ref=table_ctrl_col_dt_ref)
        list_all_dates = [dt for dt in list_all_dates if dt is not None]
        list_dates = AirflowMetaStoreHelper().set_granularity(
            list_all_dates=list_all_dates, agg_by=agg_by)

        for date in list_dates:
            hdfs_path = HdfsHelper().generate_hdfs_path(env=env,
                                                        layer=layer,
                                                        dag_id=data,
                                                        date=date)

            self.log.info(f"Creating partition:")
            self.add_partition(date=date,
                               db=data,
                               table=layer,
                               hdfs_path=hdfs_path)
Example No. 4
    def execute(self, context):
        template = '-' * 79
        start = time.time()
        hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)
        oracle = OracleHelper(self.oracle_conn_id)

        spark, sc = init_spark(app_name=f'{self.step}_{self.dag_name}',
                               step=self.step,
                               env=self.env,
                               dag_name=self.dag_name,
                               layer=self.layer,
                               path_ojdbc=self.path_ojdbc,
                               path_spark_avro=self.path_spark_avro,
                               path_native_lib=self.path_native_lib,
                               executor_cores=self.executor_cores,
                               executor_memory=self.executor_memory,
                               executor_instances=self.executor_instances,
                               driver_memory=self.driver_memory)
        avro_schema = hdfs \
            .read_avro_schema(path_avro_schema=self.path_avro_schema,
                              layer=self.layer,
                              dag_name=self.dag_name)

        hdfs_path = hdfs \
            .generate_hdfs_path(dag_id=self.dag_name,
                                env=self.env,
                                layer=self.layer,
                                is_partitioned=False)

        self.log.info(f'\n{template}\nGetting data from Oracle\n{template}')
        self.log.info(
            f'query:{self.sql_get_data}\n parameters:\n{self.dict_bind}')
        records = oracle.get_rows_with_bind(sql=self.sql_get_data,
                                            bind=self.dict_bind)
        list_dict_cols = read_data_config(
            self.dag_name)[self.dag_name]['hdfs_data_schema']['raw']['cols']
        df_oracle_table = convert_type_oracle_to_spark(
            spark=spark, records=records, list_dict_cols=list_dict_cols)

        df_preprocessed = preprocess_data_table(df_oracle_table)
        df_preprocessed.explain()
        df_preprocessed.printSchema()
        df_preprocessed.show(n=1)

        total_registry = df_oracle_table.count()
        n_partitions = calculate_partitions(total_registry=total_registry,
                                            max_registry_by_avro=int(
                                                self.max_registry_by_file))
        # TODO: analyze and test ORC (accept ACID)
        self.log.info(f'\n{template}\nWriting table in HDFS\n{template}')
        hdfs.save_pyspark_df(df=df_preprocessed,
                             format='parquet',
                             avro_schema=avro_schema,
                             compress_type=self.compress_type,
                             mode='append',
                             partitions=n_partitions,
                             hdfs_path=hdfs_path)

        self.log.info(
            f'\n***** REPORT *****\n'
            f'Local            = {hdfs_path}\n'
            f'Total time       = {time.time() - start} sec\n'
            f'Total rows       = {total_registry}\n'
            f'Total partitions = {df_preprocessed.rdd.getNumPartitions()}')
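Note: calculate_partitions decides how many output files Spark writes in this and the following examples; its implementation is not shown on this page. A plausible minimal version, assuming it simply divides the row count by the maximum rows allowed per file and rounds up:

import math

def calculate_partitions(total_registry: int, max_registry_by_avro: int) -> int:
    # Hypothetical sketch; guarantees at least one partition.
    return max(1, math.ceil(total_registry / max_registry_by_avro))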
Example No. 5
def main():
    template = '-' * 79
    list_data = get_list_docs(path=f'{here}/../../configs/data/')

    for tag in ('oracle', 'redis', 'big_data'):
        logging.info(f'\n{template}\nCreating connections\n{template}')
        for conn_name in dict_config_conn[tag]:
            c = dict_config_conn[tag][conn_name]
            AirflowMetaStoreHelper(airflow_conn=c['conn']) \
                .create_conn(conn_type=c['conn_type'],
                             host=c['host'],
                             login=c['login'],
                             passwd=c['password'],
                             port=c['port'],
                             extra=c['extra'],
                             describe=c['describe'])

    impala = ImpalaHelper(hive_conn='impala')
    hive = HiveHelper(hive_conn='hive')
    hdfs = HdfsHelper(hdfs_conn='webhdfs')

    Variable.set(key='total_rows', value='100000')
    Variable.set(key='env', value=os.environ.get('ENV').lower())

    for data_name in list_data:
        dict_cfg_data = read_data_config(data_name=data_name)
        cfg = dict_cfg_data[data_name]
        cfg_hdfs = dag_cfg["paths"]["path_hdfs"]

        list_all_layers = list(cfg["hdfs_data_schema"].keys())
        list_layers = [
            layer for layer in list_all_layers if layer != 'db_comment'
        ]

        for layer in list_layers:

            # control var
            try:
                logging.info(
                    f'\n{template}\nLoading {data_name} control_var to airflow variables\n{template}'
                )
                val_control_var = hdfs.get_control_var(data_name=data_name,
                                                       layer=layer)
                Variable.set(key=f'{data_name}_control_var',
                             value=val_control_var)
                logging.info(f'control_var = {val_control_var}')
            except Exception as err:
                if layer == 'raw':
                    logging.error(err.__class__)
                    logging.info('File does not exist. Creating it ...')
                    hdfs.create_file_state(data_name=data_name, layer=layer)

            # avro schema
            logging.info(
                f'\n{template}\nGenerating and saving {data_name} avro schema\n{template}'
            )
            avro_schema = prepare_avro_schema(
                layer=layer,
                data_name=data_name,
                template=template,
                path_ojdbc=
                f"{dag_cfg['paths']['path_libs']}/{dag_cfg['libs']['ojdbc']}",
                path_native_lib=dag_cfg['paths']['path_native_lib'],
                doc_type=cfg['doc_type'],
                list_dict_cols=cfg["hdfs_data_schema"][layer]["cols"])
            hdfs \
                .save_avro_schema_hdfs(data_name=data_name,
                                       hdfs_path=cfg_hdfs["path_avro_schemas"],
                                       layer=layer,
                                       avro_schema=avro_schema)
            hdfs \
                .download_file(hdfs_path=f'{cfg_hdfs["path_avro_schemas"]}/{layer}/{data_name}.avsc',
                               local_path=f'{dag_cfg["paths"]["path_local_avro_schemas"]}/{layer}')

            # db, tables, partitions
            hive \
                .hive_prepare_env(env='prod',
                                  layer=layer,
                                  data_name=data_name,
                                  cfg=dict_cfg_data[data_name],
                                  cfg_hdfs=cfg_hdfs,
                                  template=template)

            logging.info(f'\n{template}\nUpdating DB statistics\n{template}')
            impala \
                .execute_invalidate_metadata()

            logging.info(
                f'\n{template}\nDownload {data_name} avro schemas to'
                f' {dag_cfg["paths"]["path_local_avro_schemas"]}\n{template}')
Example No. 6
    def execute(self, context):  # sourcery skip: none-compare
        hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)

        spark, sc = init_spark(
            app_name=f'concatenate_{self.dag_name}_{self.date}',
            step='concatenate',
            env=self.env,
            dag_name=self.dag_name,
            layer=self.layer,
            path_ojdbc=self.path_ojdbc,
            path_spark_avro=self.path_spark_avro,
            path_native_lib=self.path_native_lib,
            executor_cores=self.executor_cores,
            executor_memory=self.executor_memory,
            executor_instances='1',
            driver_memory=self.driver_memory
        )

        if self.date is None:
            path = hdfs.generate_hdfs_path(
                dag_id=self.dag_name,
                env=self.env,
                layer=self.layer,
                is_partitioned=False
            )
        else:
            path = hdfs.generate_hdfs_path(
                dag_id=self.dag_name,
                env=self.env,
                layer=self.layer,
                date=self.date
            )

        list_filename = hdfs.list_filename(path)
        self.log.info(f'list_filename = {list_filename}')

        if len(list_filename) > 1:
            avro_schema = hdfs.read_avro_schema(
                path_avro_schema=self.path_avro_schema,
                layer=self.layer,
                dag_name=self.dag_name
            )
            df = hdfs.load_pyspark_df(
                spark=spark,
                data_format=self.format_data,
                path=f'hdfs://{path}'
            )
            total_registry = df.count()
            self.log.info(f'Total registry = {total_registry}')

            if self.date is not None:
                Variable.set(key=f'{self.dag_name}_{self.layer}_total_registry_{self.date}',
                             value=total_registry)

            n_files = calculate_partitions(total_registry=total_registry,
                                           max_registry_by_avro=int(self.max_registry_by_avro))

            self.log.info(
                f'Concatenating {total_registry} registry at {path}. Generating {n_files} files.')
            hdfs.save_pyspark_df(
                df=df,
                avro_schema=avro_schema,
                compress_type=self.compress_type,
                mode='append',
                format=self.format_data,
                partitions=n_files,
                hdfs_path=path
            )

            self.log.info(f'Deleting original files at {path}')
            hdfs.remove_list_files(path=path, list_filename=list_filename)

        else:
            self.log.info('Files already concatenated')
Example No. 7
    def execute(self, **context):
        template = '-' * 79
        start = time.time()
        hdfs = HdfsHelper(hdfs_conn=context['hdfs_conn_id'])
        total_registry = int(
            Variable.get(
                f'{context["dag_name"]}'
                f'_{context["origin_layer"]}'
                f'_total_registry'
                f'_{context["date"]}',
                default_var='1'))
        n_partitions = calculate_partitions(
            total_registry=total_registry,
            max_registry_by_avro=int(context['max_registry_by_avro']))
        spark, sc = init_spark(
            app_name=f'pre_process_{context["dag_name"]}-{context["date"]}',
            step=context['step'],
            env=context['env'],
            dag_name=context['dag_name'],
            layer=context['origin_layer'],
            path_spark_avro=context['path_spark_avro'],
            path_spark_xml=context['path_spark_xml'],
            path_ojdbc=context['path_ojdbc'],
            path_native_lib=context['path_native_lib'],
            executor_cores=context['executor_cores'],
            executor_memory=context['executor_memory'],
            executor_instances='1',
            driver_memory=context['driver_memory'])
        avro_schema = hdfs.read_avro_schema(
            path_avro_schema=context['path_avro_schema'],
            layer=context['dest_layer'],
            dag_name=context['dag_name'])
        hdfs_raw_path = hdfs.generate_hdfs_path(dag_id=context["dag_name"],
                                                env=context["env"],
                                                layer=context["origin_layer"],
                                                date=context['date'])
        hdfs_pre_processed_path = hdfs.generate_hdfs_path(
            dag_id=f'{context["dag_name"]}',
            env=context["env"],
            layer=context["dest_layer"],
            date=context['date'])

        self.log.info(
            f'\n{template}\npre_processing data: {context["date"]}\n{template}'
        )
        df = hdfs.load_pyspark_df(spark=spark,
                                  data_format='avro',
                                  path=hdfs_raw_path)

        df_pre_processed = pre_process_content(
            df=df,
            result_col_name='file',
            table_blob_col_blob=context['table_blob_col_blob'])

        self.log.info('Cleaning up records...')
        df_cleansing = cleansing_data(df=df_pre_processed,
                                      pre_processed='file',
                                      result_col_name='file')

        self.log.info(f'Writing data at {hdfs_pre_processed_path}')
        hdfs.save_pyspark_df(df=df_cleansing,
                             avro_schema=avro_schema,
                             compress_type='snappy',
                             mode='overwrite',
                             format='avro',
                             partitions=n_partitions,
                             hdfs_path=hdfs_pre_processed_path)

        self.log.info(
            f'\n***** REPORT *****\n'
            f'Date             = {context["date"]}\n'
            f'Local            = {hdfs_pre_processed_path}\n'
            f'Total time       = {time.time() - start} sec\n'
            f'Total rows       = {df_cleansing.count()}\n'
            f'Total partitions = {df_cleansing.rdd.getNumPartitions()}')
Example No. 8
def create_dag(
    dag_name: str,
    agg_by: str,
    cache_blob: str,
    path_avro_schema: str,
    executor_cores: str,
    executor_memory: str,
    driver_memory: str,
    max_registry_by_avro: str,
    table_id: str,
    table_id_col_fk: str,
    table_id_col_dt_ref: str,
    table_id_col_dt_created: str,
    table_blob_col_blob: str
) -> airflow.models.dag.DAG:
    # -----------------
    #        DAG
    # -----------------
    args = {
        'owner': 'job',
        'run_as_user': '******',
        'start_date': datetime(2021, 4, 12),
        'do_xcom_push': False,
        'depends_on_past': True,
        'retries': 100,
        'retry_delay': timedelta(seconds=90),
        'dag_name': dag_name
    }

    with DAG(dag_id=f'{step}_{dag_name}',
             description=f'Pre-processes and cleans the {dag_name} data',
             schedule_interval=None,
             default_args=args) as dag:
        dag.doc_md = __doc__
        dag.doc_md = """![image alt <](http://www.xpto.company/layout/site/images/logo_xpto.png)"""

        origin_layer = 'raw'
        dest_layer = 'pre_processed'
        env = Variable.get('env', default_var='dev')
        control_var = f"{int(Variable.get(f'{dag_name}_control_var', default_var='000000000000000')):015d}"
        last_control_var = Variable.get(f'{dag_name}_last_control_var', default_var='000000000000000')
        current_dag_name = dag_name + '_' + control_var
        list_all_dates = eval(Variable.get(f'{dag_name}_list_all_dates', default_var='[]'))
        list_partitions = eval(Variable.get(f'{dag_name}_list_partitions', default_var='[]'))
        list_inconsistency = eval(Variable.get(f'{dag_name}_{dest_layer}_list_inconsistency', default_var='[]'))

        # -----------------
        #      TASKS
        # -----------------
        task_start = PythonOperator(
            task_id='start',
            python_callable=start_time,
            depends_on_past=False,
            op_kwargs={'dag_name': dag_name,
                       'execution_date': '{{ ts }}'}
        )
        
        with TaskGroup(group_id='group_pre_process_clean_load') as group_pre_process_clean_load:
            list_all_dates = AirflowMetaStoreHelper().set_granularity(list_all_dates=list_all_dates,
                                                                      agg_by=agg_by)
            for date in list_all_dates:
                PythonOperator(
                    task_id=f'pre_process_clean_load-{date}',
                    python_callable=pre_processAndCleanData().execute,
                    trigger_rule=TriggerRule.ALL_SUCCESS,
                    op_kwargs={
                        'dag_id': dag.dag_id,
                        'step': step,
                        'dag_name': dag_name,
                        'date': date,
                        'env': env,
                        'origin_layer': origin_layer,
                        'dest_layer': dest_layer,
                        'table_blob_col_blob': table_blob_col_blob,
                        'path_spark_xml': path_spark_xml,
                        'path_spark_avro': path_spark_avro,
                        'path_native_lib': path_native_lib,
                        'path_ojdbc': path_ojdbc,
                        'executor_cores': executor_cores,
                        'executor_memory': executor_memory,
                        'driver_memory': driver_memory,
                        'hdfs_conn_id': 'webhdfs',
                        'path_avro_schema': path_avro_schema,
                        'list_all_dates': list_all_dates,
                        'max_registry_by_avro': max_registry_by_avro
                    }
                )
        
        task_create_partitions = PythonOperator(
            task_id='create_partitions',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=CreatePartitions().execute,
            op_kwargs={'dag_name': dag_name,
                       'current_dag_name': current_dag_name,
                       'list_all_dates': list_all_dates,
                       'agg_by': agg_by,
                       'env': env,
                       'hive_conn_id': 'hive',
                       'impala_conn_id': 'impala',
                       'layer': dest_layer}
        )
        
        with TaskGroup(group_id='group_compare_data') as group_compare_data:
            list_all_dates = AirflowMetaStoreHelper().set_granularity(list_all_dates=list_all_dates,
                                                                      agg_by=agg_by)
        
            for date in list_all_dates:
                PythonOperator(
                    task_id=f'compare_{origin_layer}_and_{dest_layer}_{date}',
                    python_callable=CompareDataImpalaImpala().execute,
                    trigger_rule=TriggerRule.ALL_SUCCESS,
                    op_kwargs={'dag_name': dag_name,
                               'control_var': control_var,
                               'last_control_var': last_control_var,
                               'date': date,
                               'hive_conn_id': 'impala',
                               'origin_layer': origin_layer,
                               'dest_layer': dest_layer,
                               'redis_conn_id': cache_blob,
                               'table_id': table_id,
                               'dt_ref': table_id_col_dt_ref,
                               'agg_by': agg_by,
                               'table_id_col_fk': table_id_col_fk,
                               'table_id_col_dt_created': table_id_col_dt_created,
                               'list_partitions': list_partitions,
                               'list_all_dates': list_all_dates}
                )
        
        task_check_if_contains_inconsistency = BranchPythonOperator(
            task_id=f'check_if_contains_inconsistency',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            wait_for_downstream=True,
            python_callable=AirflowMetaStoreHelper('airflow_db').check_if_contains_inconsistency,
            op_kwargs={'dag_name': dag_name,
                       'last_control_var': last_control_var,
                       'layer': dest_layer,
                       'true_case': 'generate_list_inconsistency',
                       'false_case': 'clear_airflow_var',
                       'redis_conn_id': cache_blob,
                       'get_redis_key': f'{dag_name}_inconsistency_date'}
        )
        
        task_generate_list_inconsistency = PythonOperator(
            task_id=f'generate_list_inconsistency',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=AirflowMetaStoreHelper('airflow_db').generate_list_inconsistency,
            op_kwargs={'dag_name': dag_name,
                       'last_control_var': last_control_var,
                       'layer': dest_layer}
        )
        
        task_hdfs_clear_inconsistency_data = PythonOperator(
            task_id='hdfs_clear_inconsistency_data',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=HdfsHelper('webhdfs').clear_inconsistency_data,
            op_kwargs={'dag_name': dag_name,
                       'list_inconsistency': list_inconsistency,
                       'layer': dest_layer}
        )
        
        task_crash_dag = PythonOperator(
            task_id=f'crash_dag',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=crash_dag
        )

        task_clear_airflow_var = PythonOperator(
            task_id='clear_airflow_var',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=clear_airflow_var,
            op_kwargs={'airflow_conn_id': 'airflow_db',
                       'list_pattern_name_var': f"["
                                                f" '{dag_name}_%_total_row_id',"
                                                f" '{dag_name}_raw_total_registry_%',"
                                                f" '{dag_name}_list_dags',"
                                                f" '{dag_name}_list_path_to_concat',"
                                                f" '{dag_name}_list_path_to_rename',"
                                                f" '{dag_name}_list_all_dates'"
                                                f"]"}
        )

        task_no_inconsistencies_were_found = DummyOperator(
            task_id='no_inconsistencies_were_found'
        )

        task_end = PythonOperator(
            task_id='end',
            python_callable=end_time,
            op_kwargs={'current_dag_name': current_dag_name,
                       'dag_name': dag_name,
                       'last_control_var_name': f'{dag_name}_last_control_var',
                       'list_dates': f'{current_dag_name}_list_dates',
                       'postgres_conn_id': 'airflow_db'}
        )

    # -----------------
    #      GRAPH
    # -----------------
    task_start >> group_pre_process_clean_load >> task_create_partitions >> group_compare_data >> task_check_if_contains_inconsistency >> task_generate_list_inconsistency >> task_hdfs_clear_inconsistency_data >> task_crash_dag
    task_start >> group_pre_process_clean_load >> task_create_partitions >> group_compare_data >> task_check_if_contains_inconsistency >> task_clear_airflow_var >> task_no_inconsistencies_were_found >> task_end
    task_start >> task_clear_airflow_var >> task_no_inconsistencies_were_found >> task_end

    return dag
Example No. 9
def create_dag(dag_name: str, agg_by: str, doc_type: str, cache_blob: str,
               path_avro_schema: str, path_local_avro_schemas: str,
               executor_cores: str, executor_memory: str,
               executor_instances: str, driver_memory: str, col_type_pk: str,
               extra_cols: str, max_registry_by_file: str, oracle_conn_id: str,
               table_ctrl: str, table_ctrl_col_control_var: str,
               table_ctrl_col_fk: str, table_ctrl_col_dt_ref: str,
               table_ctrl_col_dt_created: str, oracle_conn_blob: str,
               table_blob: str, table_blob_col_pk: str,
               table_blob_col_blob: str) -> airflow.models.dag.DAG:
    # -----------------
    #        DAG
    # -----------------
    args = {
        'owner': 'job',
        'run_as_user': '******',
        'start_date': datetime(2021, 8, 17),
        'do_xcom_push': False,
        'depends_on_past': True,
        'retries': 10,
        'retry_delay': timedelta(seconds=60),
        'dag_name': dag_name
    }

    with DAG(dag_id=f'{step}_{dag_name}',
             description=f'Imports data from {dag_name}',
             schedule_interval='00 19 * * *',
             catchup=False,
             default_args=args) as dag:
        dag.doc_md = __doc__
        dag.doc_md = """![image alt <](../big_data.wiki/.attachments/xpto_company.png)"""

        layer = 'raw'
        env = Variable.get('env', default_var='dev')
        control_var = f"{int(Variable.get(f'{dag_name}_control_var', default_var='000000000000000')):015d}"
        last_control_var = Variable.get(f'{dag_name}_last_control_var',
                                        default_var='000000000000000')
        current_dag_name = dag_name + '_' + control_var
        total_pg = int(
            Variable.get(f'{current_dag_name}_total_pg', default_var=1))
        list_all_dates = eval(
            Variable.get(f'{dag_name}_list_all_dates', default_var='[]'))
        list_current_dates = eval(
            Variable.get(f'{current_dag_name}_current_dates',
                         default_var='[]'))
        list_dags = eval(
            Variable.get(f'{dag_name}_list_dags', default_var='[]'))
        total_rows = Variable.get('total_rows', default_var='100000')
        items_by_query = 1000

        sql_id = f'''
        SELECT
            {table_ctrl_col_fk} id,
            {table_ctrl_col_control_var} control_var,
            to_char({table_ctrl_col_dt_ref}, 'DD-MM-YYYY') dt_ref
        FROM {table_ctrl}
        WHERE
           {table_ctrl_col_control_var} > :control_var
           AND TO_DATE(to_char({table_ctrl_col_dt_created}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
                < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')
        ORDER BY {table_ctrl_col_control_var} ASC
        FETCH FIRST :total_rows ROWS ONLY'''
        dict_bind_sql_get_data = {
            'control_var': f'{control_var}',
            'total_rows': f'{total_rows}'
        }

        sql_count_id = f'''
        SELECT COUNT({table_ctrl_col_fk})
        FROM {table_ctrl}
        WHERE
           {table_ctrl_col_control_var} > :control_var
           AND TO_DATE(to_char({table_ctrl_col_dt_created}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
                < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')'''
        dict_bind_sql_count_id = {'control_var': f'{control_var}'}

        # -----------------
        #      TASKS
        # -----------------
        task_start = PythonOperator(task_id='start',
                                    python_callable=start_time,
                                    depends_on_past=False,
                                    op_kwargs={
                                        'dag_name': dag_name,
                                        'execution_date': '{{ ts }}'
                                    })

        task_oracle_execute_count = OracleGetResults(
            task_id='oracle_execute_count',
            current_dag_name=current_dag_name,
            oracle_conn_id=oracle_conn_id,
            sql_count_id=sql_count_id,
            dict_bind=dict_bind_sql_count_id)

        task_check_if_contains_data_in_oracle = BranchPythonOperator(
            task_id='check_if_contains_data_in_oracle',
            python_callable=AirflowMetaStoreHelper(
            ).check_if_contains_data_in_oracle,
            op_kwargs={
                'control_var': control_var,
                'last_control_var': last_control_var,
                'current_dag_name': current_dag_name,
                'redis_conn_id': cache_blob,
                'redis_key': f'{dag_name}_original',
                'true_case': 'get_id',
                'false_case': 'check_len_list_processed_dates'
            })

        task_get_id = OracleToRedisTransfer(
            task_id='get_id',
            oracle_conn_id=oracle_conn_id,
            redis_conn_id=cache_blob,
            sql=sql_id,
            dict_bind=dict_bind_sql_get_data,
            name_redis_key=f'{dag_name}_original')

        task_fill_data_gap = PythonOperator(
            task_id='fill_data_gap',
            python_callable=RedisHelper(cache_blob).fill_data_gaps,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'redis_conn_id': cache_blob,
                'redis_key': f'{dag_name}_original'
            })

        task_get_dag_name = PythonOperator(
            task_id='get_dag_name',
            python_callable=AirflowMetaStoreHelper().get_dag_name,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'name_list_dags': f'{dag_name}_list_dags',
                'list_dags': list_dags
            })

        task_get_date = PythonOperator(
            task_id='get_date',
            python_callable=RedisHelper(cache_blob).get_date,
            op_kwargs={
                'dag_name': dag_name,
                'current_dag_name': current_dag_name,
                'list_columns': "['id', 'control_var', 'date']",
                'redis_key': current_dag_name
            })

        task_split_id_by_date = PythonOperator(
            task_id='split_id_by_date',
            python_callable=RedisHelper(cache_blob).split_id_by_date,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'list_current_dates': list_current_dates,
                'redis_key': current_dag_name
            })

        task_generate_pagination = PythonOperator(
            task_id='generate_pagination',
            python_callable=RedisHelper(cache_blob).generate_pagination,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'items_by_query': items_by_query,
                'list_current_dates': list_current_dates,
                'redis_key': current_dag_name
            })

        task_generate_sql_by_date = PythonOperator(
            task_id='generate_sql_by_date',
            python_callable=RedisHelper(cache_blob).generate_sql_by_date,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'list_current_dates': list_current_dates,
                'oracle_conn': oracle_conn_blob,
                'table_ctrl': table_ctrl,
                'table_ctrl_col_fk': table_ctrl_col_fk,
                'table_blob': table_blob,
                'table_blob_col_pk': table_blob_col_pk,
                'table_blob_col_blob': table_blob_col_blob,
                'items_by_query': items_by_query,
                'total_pg': total_pg,
                'extra_cols': extra_cols,
                'redis_key': current_dag_name
            })

        task_extract_decompress_load = OracleBlobToHdfsTransfer(
            task_id=f'extract_decompress_load',
            retries=20,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            oracle_conn_id=oracle_conn_id,
            query_id=sql_id,
            table_ctrl_col_fk=table_ctrl_col_fk,
            extra_cols=extra_cols,
            oracle_conn_blob=oracle_conn_blob,
            table_blob_col_pk=table_blob_col_pk,
            table_blob_col_blob=table_blob_col_blob,
            path_avro_schema=path_avro_schema,
            path_local_avro_schemas=
            f'{path_local_avro_schemas}/{layer}/{dag_name}.avsc',
            total_pg=total_pg,
            layer=layer,
            env=env,
            step=step,
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            executor_instances=executor_instances,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            col_type_pk=col_type_pk,
            compress_type='snappy',
            hdfs_conn_id='webhdfs',
            oracle_driver='oracle.jdbc.driver.OracleDriver',
            list_current_dates=list_current_dates)

        task_update_control_var = PythonOperator(
            task_id='update_control_var',
            python_callable=AirflowMetaStoreHelper().update_control_var,
            trigger_rule=TriggerRule.ALL_SUCCESS,
            depends_on_past=True,
            op_kwargs={
                'control_var': control_var,
                'dag_name': dag_name,
                'current_dag_name': current_dag_name,
                'redis_conn_id': cache_blob,
                'last_control_var': last_control_var,
                'list_dags': list_dags,
                'total_pg': total_pg,
                'list_current_dates': list_current_dates,
                'list_all_dates': list_all_dates,
                'redis_key': current_dag_name
            })

        task_clear_environment = PythonOperator(
            task_id='clear_environment',
            python_callable=clear_environment,
            trigger_rule=TriggerRule.ALL_SUCCESS,
            op_kwargs={
                'control_var': control_var,
                'dag_name': dag_name,
                'redis_conn_id': cache_blob,
                'airflow_conn_id': 'airflow_db',
                'last_control_var': last_control_var,
                'list_dags': list_dags,
                'redis_key': current_dag_name
            })

        task_check_len_list_processed_dates = BranchPythonOperator(
            task_id='check_len_list_processed_dates',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=check_len_list_processed_dates,
            op_kwargs={
                'dag_name': dag_name,
                'list_all_dates': list_all_dates,
                'true_case': 'prepare_execution',
                'false_case': 'waiting_execution'
            })

        task_prepare_execution = DummyOperator(task_id='prepare_execution')

        with TaskGroup(
                group_id='group_hdfs_concat_file') as group_hdfs_concat_file:
            task_hdfs_prepare_concat = PythonOperator(
                task_id='hdfs_prepare_concat',
                trigger_rule=TriggerRule.ALL_SUCCESS,
                python_callable=HdfsPrepareConcat('webhdfs').execute,
                op_kwargs={
                    'dag_name': dag_name,
                    'current_dag_name': current_dag_name,
                    'hdfs_path': f'/data/{env}/{layer}/{dag_name}',
                    'agg_by': agg_by,
                    'layer': layer,
                    'env': env,
                    'list_all_dates': list_all_dates,
                    'path_avro_tools': path_avro_tools
                })

            # TODO: refactor -> create a task
            list_all_dates = AirflowMetaStoreHelper().set_granularity(
                list_all_dates=list_all_dates, agg_by=agg_by)
            for date in list_all_dates:
                task_concat_file = HdfsConcatFiles(
                    task_id=f'hdfs_concat_file-{date}',
                    retries=100,
                    dag_name=dag_name,
                    date=date,
                    layer=layer,
                    env=env,
                    col_name_control_var=table_ctrl_col_control_var,
                    path_avro_schema=path_avro_schema,
                    hdfs_conn_id='webhdfs',
                    executor_cores=executor_cores,
                    executor_memory=executor_memory,
                    driver_memory=driver_memory,
                    path_ojdbc=path_ojdbc,
                    path_spark_avro=path_spark_avro,
                    path_native_lib=path_native_lib,
                    format_data='avro',
                    compress_type='snappy',
                    max_registry_by_avro=max_registry_by_file)
                task_hdfs_prepare_concat >> task_concat_file

        task_create_partitions = PythonOperator(
            task_id='create_partitions',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=CreatePartitions().execute,
            op_kwargs={
                'dag_name': dag_name,
                'current_dag_name': current_dag_name,
                'list_all_dates': list_all_dates,
                'hive_conn_id': 'hive',
                'impala_conn_id': 'impala',
                'agg_by': agg_by,
                'layer': layer,
                'env': env
            })

        task_save_execution_state_hdfs = PythonOperator(
            task_id='save_execution_state_hdfs',
            python_callable=HdfsHelper('webhdfs').save_execution_state_hdfs,
            op_kwargs={
                'dag_name': dag_name,
                'layer': layer,
                'control_var': control_var
            })

        with TaskGroup(group_id='group_generate_statistics'
                       ) as group_generate_statistics:
            # TODO: refactor -> create a task
            list_all_dates = AirflowMetaStoreHelper().set_granularity(
                list_all_dates=list_all_dates, agg_by=agg_by)

            for date in list_all_dates:
                PythonOperator(task_id=f'generate_statistics-{date}',
                               retries=50,
                               python_callable=GenerateStatistics().execute,
                               op_kwargs={
                                   'dag_name': dag_name,
                                   'date': date,
                                   'layer': layer,
                                   'impala_conn_id': 'impala',
                                   'hive_conn_id': 'hive'
                               })

        with TaskGroup(group_id='group_check_data_quality'
                       ) as group_check_data_quality:
            # TODO: refactor -> create a task
            list_all_dates = AirflowMetaStoreHelper().set_granularity(
                list_all_dates=list_all_dates, agg_by=agg_by)

            for date in list_all_dates:
                CompareDataOracleImpala(
                    task_id=f'compare_oracle_impala_{date}',
                    retries=100,
                    dag_name=dag_name,
                    last_control_var=last_control_var,
                    layer=layer,
                    date=date,
                    table_ctrl=table_ctrl,
                    dt_ref=table_ctrl_col_dt_ref,
                    agg_by=agg_by,
                    oracle_conn_id=oracle_conn_id,
                    hive_conn='impala',
                    table_ctrl_col_fk=table_ctrl_col_fk,
                    table_ctrl_col_dt_created=table_ctrl_col_dt_created)

        task_check_if_contains_inconsistency = BranchPythonOperator(
            task_id=f'check_if_contains_inconsistency',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            wait_for_downstream=True,
            python_callable=AirflowMetaStoreHelper(
                'airflow_db').check_if_contains_inconsistency,
            op_kwargs={
                'dag_name': dag_name,
                'last_control_var': last_control_var,
                'layer': layer,
                'true_case': 'prepare_reprocessing_inconsistency_data',
                'false_case': f'check_next_dag',
                'redis_conn_id': cache_blob,
                'redis_key': f'{dag_name}_inconsistency_date'
            })

        task_prepare_reprocessing_inconsistency_data = PrepareReprocessingInconsistencyData(
            task_id=f'prepare_reprocessing_inconsistency_data',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            layer=layer,
            last_control_var=last_control_var,
            list_all_dates=list_all_dates,
            table_ctrl=table_ctrl,
            table_ctrl_col_fk=table_ctrl_col_fk,
            table_ctrl_col_control_var=table_ctrl_col_control_var,
            table_ctrl_col_dt_ref=table_ctrl_col_dt_ref,
            table_ctrl_col_dt_created=table_ctrl_col_dt_created,
            hive_conn_id='impala',
            hdfs_conn_id='webhdfs',
            airflow_conn_id='airflow_db',
            oracle_conn_id=oracle_conn_id)

        task_crash_dag = PythonOperator(
            task_id=f'crash_dag',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=crash_dag,
        )

        task_check_next_dag = BranchPythonOperator(
            task_id='check_next_dag',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=check_next_dag,
            op_kwargs={
                'dag_name': dag_name,
                'doc_type': doc_type,
                'true_case': f'trigger_pre_process_{dag_name}',
                'false_case': f'trigger_parser_{dag_name}'
            })

        task_trigger_pre_process = TriggerDagRunOperator(
            task_id=f'trigger_pre_process_{dag_name}',
            trigger_dag_id=f"pre_process_{dag_name}")

        task_trigger_parser = TriggerDagRunOperator(
            task_id=f'trigger_parser_{dag_name}',
            trigger_dag_id=f"parser_{dag_name}")

        task_trigger_import_file = TriggerDagRunOperator(
            task_id=f'trigger_import_file_{dag_name}',
            trigger_dag_id=dag.dag_id)

        task_waiting_execution = DummyOperator(
            trigger_rule=TriggerRule.ALL_DONE, task_id='waiting_execution')

        task_end = PythonOperator(task_id='end',
                                  python_callable=end_time,
                                  op_kwargs={
                                      'current_dag_name': current_dag_name,
                                      'dag_name': dag_name,
                                      'last_control_var_name':
                                      f'{dag_name}_last_control_var',
                                      'list_dates':
                                      f'{current_dag_name}_list_dates',
                                      'postgres_conn_id': 'airflow_db'
                                  })

    # -----------------
    #      GRAPH
    # -----------------
    # task_check_if_contains_data_in_oracle: true
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_get_id >> task_fill_data_gap >> [
        task_get_date, task_get_dag_name
    ] >> task_split_id_by_date >> task_generate_pagination >> task_generate_sql_by_date >> task_extract_decompress_load >> task_update_control_var >> [
        task_clear_environment, task_trigger_import_file
    ] >> task_waiting_execution >> task_end

    # task_check_if_contains_data_in_oracle: false
    #   task_check_len_list_processed_dates: true
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_check_len_list_processed_dates >> task_prepare_execution >> [
        group_hdfs_concat_file, task_save_execution_state_hdfs
    ] >> task_create_partitions >> [
        group_check_data_quality, group_generate_statistics
    ] >> task_check_if_contains_inconsistency
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_check_len_list_processed_dates >> task_prepare_execution >> [
        group_hdfs_concat_file, task_save_execution_state_hdfs
    ] >> task_create_partitions >> task_check_if_contains_inconsistency >> task_prepare_reprocessing_inconsistency_data >> task_crash_dag

    # task_check_next_dag: true
    task_check_if_contains_inconsistency >> task_check_next_dag >> task_trigger_pre_process >> task_waiting_execution >> task_end
    # task_check_next_dag: false
    task_check_if_contains_inconsistency >> task_check_next_dag >> task_trigger_parser >> task_waiting_execution >> task_end

    # task_check_if_contains_data_in_oracle: false
    #   task_check_len_list_processed_dates: false
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_check_len_list_processed_dates >> task_waiting_execution >> task_end

    return dag
Example No. 10
    def execute(self, **context):
        template = '-' * 79
        hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)
        spark, sc = init_spark(app_name=f'sync_data-{self.dag_name}',
                               step='sync',
                               env=self.env,
                               dag_name=self.dag_name,
                               layer=self.layer,
                               path_ojdbc=self.path_ojdbc,
                               path_spark_avro=self.path_spark_avro,
                               path_native_lib=self.path_native_lib,
                               executor_cores=self.executor_cores,
                               executor_memory=self.executor_memory,
                               executor_instances=self.executor_instances,
                               driver_memory=self.driver_memory)
        avro_schema = hdfs.read_avro_schema(
            path_avro_schema=self.path_avro_schema,
            layer=self.layer,
            dag_name=self.dag_name)
        hdfs_path = hdfs.generate_hdfs_path(env=self.env,
                                            layer=self.layer,
                                            dag_id=self.dag_name,
                                            is_partitioned=False)

        sql_get_data = f'''
        SELECT
            {self.col_name_control_var},
            {self.col_name_dt_ref}
        FROM {self.db_name}.{self.table_name}
        WHERE
            TO_DATE(to_char({self.col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
            < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')
        ORDER BY {self.col_name_control_var} ASC
        '''

        self.log.info(f'\n{template}\nGetting data from Oracle\n{template}')
        df_oracle_table = OracleHelper(self.oracle_conn_id) \
            .get_pyspark_df_from_table(oracle_driver=self.oracle_driver,
                                       spark=spark,
                                       table=f'({sql_get_data})',
                                       partition_col=self.col_name_control_var,
                                       n_partitions=250) \
            .orderBy(self.col_name_control_var) \
            .withColumn(self.col_name_control_var,
                        col(self.col_name_control_var).cast(LongType())) \
            .withColumn(self.col_name_dt_ref,
                        col(self.col_name_dt_ref).cast(StringType()))

        total_oracle = df_oracle_table.count()
        self.log.info(f'Total row from Oracle = {total_oracle}')

        self.log.info(f'\n{template}\nGetting data from HDFS\n{template}')
        hdfs.mv_files(hdfs_src_path=hdfs_path,
                      hdfs_dst_path=f'{hdfs_path}/../.tmp_{self.dag_name}')

        df_hdfs = hdfs \
            .load_pyspark_df(spark=spark,
                             data_format='parquet',
                             path=f'../../{hdfs_path}/../.tmp_{self.dag_name}') \
            .orderBy(self.col_name_control_var) \
            .withColumn(self.col_name_control_var, col(self.col_name_control_var).cast(LongType())) \
            .withColumn(self.col_name_dt_ref, col(self.col_name_dt_ref).cast(StringType()))

        df_hdfs_filtered = df_hdfs \
            .select(col(self.col_name_control_var),
                    col(self.col_name_dt_ref))

        total_hdfs = df_hdfs_filtered.count()
        self.log.info(f'Total row from HDFS = {total_hdfs}')

        if total_hdfs > total_oracle:
            self.log.warning(
                f'\n{template}\nRow counts do not match!\n{template}')
            self.log.warning(f'\nOracle = {total_oracle}'
                             f'\nHDFS   = {total_hdfs}')

            self.log.info(
                f'\n{template}\nExecuting: df_hdfs - df_oracle_table\n{template}'
            )
            df_row_to_delete_hdfs = df_hdfs_filtered.subtract(df_oracle_table)
            list_row_to_delete_hdfs = [
                row[0] for row in df_row_to_delete_hdfs.select(
                    self.col_name_control_var).collect()
            ]
            self.log.info(
                f'Total row to delete = {df_row_to_delete_hdfs.count()}')

            self.log.info(f'\n{template}\nDeleting rows from HDFS\n{template}')
            df = df_hdfs.filter(~df_hdfs[self.col_name_control_var].isin(
                list_row_to_delete_hdfs))
            total_registry = df.count()
            self.log.info(f'Total row new df = {total_registry}')
            df.show(n=1, truncate=False)

            n_files = calculate_partitions(total_registry=total_registry,
                                           max_registry_by_avro=int(
                                               self.max_registry_by_file))

            self.log.info(f'\n{template}\nWriting table in HDFS\n{template}')
            hdfs.save_pyspark_df(df=df,
                                 format='parquet',
                                 avro_schema=avro_schema,
                                 compress_type=self.compress_type,
                                 mode='overwrite',
                                 partitions=n_files,
                                 hdfs_path=hdfs_path)
            hdfs.remove_all_files(
                hdfs_path=f'{hdfs_path}/../.tmp_{self.dag_name}')

        try:
            hdfs.mv_files(hdfs_src_path=f'{hdfs_path}/../.tmp_{self.dag_name}',
                          hdfs_dst_path=hdfs_path)
        except Exception as e:
            self.log.warning(e)
Example No. 11
def create_dag(dag_name: str, max_registry_by_file: str, db_name: str,
               table_name: str, col_control_var: str, col_name_dt_ref: str,
               oracle_conn_id: str, path_avro_schema: str,
               path_config_docs: str, executor_cores: str,
               executor_memory: str, executor_instances: str,
               driver_memory: str) -> airflow.models.dag.DAG:
    # -----------------
    #        DAG
    # -----------------
    args = {
        'owner': 'job',
        'run_as_user': '******',
        'start_date': datetime(2021, 8, 1),
        'do_xcom_push': False,
        'depends_on_past': True,
        'retries': 10,
        'retry_delay': timedelta(seconds=90),
        'dag_name': dag_name
    }

    with DAG(
            dag_id=f'{step}_{dag_name}',
            description=f'Imports data from {dag_name}',
            # schedule_interval=None,
            schedule_interval='00 19 * * *',
            catchup=False,
            default_args=args) as dag:
        dag.doc_md = __doc__
        dag.doc_md = """![image alt <](../big_data.wiki/.attachments/xpto_company.png)"""
        layer = 'raw'
        env = Variable.get('env', default_var='prod')
        last_control_var = Variable.get(f'{dag_name}_last_control_var',
                                        default_var='000000000000000')
        control_var = f"{int(Variable.get(f'{dag_name}_control_var', default_var='000000000000000')):015d}"
        current_dag_name = dag_name + '_' + control_var
        total_rows = Variable.get('total_rows', default_var='100000')

        sql_get_data = f'''
        SELECT
            *
        FROM {db_name}.{table_name}
        WHERE
            {col_control_var} > :control_var
            AND TO_DATE(to_char({col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
                < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')
        ORDER BY {col_control_var} ASC
        FETCH FIRST :total_rows ROWS ONLY'''
        dict_bind_sql_get_data = {
            'control_var': control_var,
            'total_rows': total_rows
        }

        sql_count_id = f'''
        SELECT COUNT({col_control_var})
        FROM {db_name}.{table_name}
        WHERE
            {col_control_var} > :control_var
            AND TO_DATE(to_char({col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
                < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')'''
        dict_bind_sql_count_id = {'control_var': control_var}

        # -----------------
        #      TASKS
        # -----------------
        task_start = PythonOperator(task_id='start',
                                    python_callable=start_time,
                                    depends_on_past=False,
                                    op_kwargs={
                                        'dag_name': dag_name,
                                        'execution_date': '{{ ts }}'
                                    })

        task_oracle_execute_count = OracleGetResults(
            task_id='oracle_execute_count',
            current_dag_name=current_dag_name,
            oracle_conn_id=oracle_conn_id,
            sql_count_id=sql_count_id,
            dict_bind=dict_bind_sql_count_id)

        task_check_if_contains_data_in_oracle = BranchPythonOperator(
            task_id='check_if_contains_data_in_oracle',
            python_callable=AirflowMetaStoreHelper(
            ).check_if_contains_data_in_oracle,
            op_kwargs={
                'control_var': control_var,
                'last_control_var': last_control_var,
                'current_dag_name': current_dag_name,
                'true_case': 'extract_transform_load',
                'false_case': 'sync_data'
            })

        task_extract_transform_load = OracleTableToHdfsTransfer(
            task_id='extract_transform_load',
            retries=20,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            sql_get_data=sql_get_data,
            dict_bind=dict_bind_sql_get_data,
            col_control_var=col_control_var,
            path_avro_schema=path_avro_schema,
            layer=layer,
            env=env,
            step=step,
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            executor_instances=executor_instances,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            compress_type='snappy',
            oracle_conn_id=oracle_conn_id,
            hdfs_conn_id='webhdfs',
            oracle_driver='oracle.jdbc.driver.OracleDriver',
            max_registry_by_file=max_registry_by_file)

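        # No pending rows this run: take the sync/compaction path instead of a new extraction.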
        task_sync_data = SyncData(
            task_id='sync_data',
            dag_name=dag_name,
            db_name=db_name,
            table_name=table_name,
            col_name_control_var=col_control_var,
            col_name_dt_ref=col_name_dt_ref,
            path_avro_schema=path_avro_schema,
            layer=layer,
            env=env,
            hdfs_conn_id='webhdfs',
            oracle_conn_id=oracle_conn_id,
            oracle_driver='oracle.jdbc.driver.OracleDriver',
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            executor_instances=executor_instances,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            compress_type='snappy',
            max_registry_by_file=max_registry_by_file)

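        # Concatenate the small batch files, rewriting them as snappy-compressed Parquet.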
        task_concat_file = HdfsConcatFiles(
            task_id='hdfs_concat_file',
            retries=100,
            dag_name=dag_name,
            layer=layer,
            env=env,
            col_name_control_var=col_control_var,
            path_avro_schema=path_avro_schema,
            hdfs_conn_id='webhdfs',
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            format_data='parquet',
            compress_type='snappy',
            max_registry_by_avro=max_registry_by_file)

        task_save_execution_state_hdfs = PythonOperator(
            task_id='save_execution_state_hdfs',
            python_callable=HdfsHelper('webhdfs').save_execution_state_hdfs,
            op_kwargs={
                'dag_name': dag_name,
                'control_var': control_var,
                'layer': layer
            })

        task_update_statistics = UpdateStasByTable(
            task_id='update_statistics',
            retries=50,
            db_name=dag_name,
            table_name=layer,
            impala_conn_id='impala')

        task_clear_airflow_var = PythonOperator(
            task_id='clear_airflow_var',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=clear_airflow_var,
            op_kwargs={
                'airflow_conn_id': 'airflow_db',
                'list_pattern_name_var': (
                    "["
                    f" '{dag_name}_%_total_row_id',"
                    f" '{dag_name}_raw_total_registry_%',"
                    f" '{dag_name}_list_dags',"
                    f" '{dag_name}_list_path_to_concat',"
                    f" '{dag_name}_list_path_to_rename',"
                    f" '{dag_name}_list_all_dates'"
                    "]"
                )
            })

        task_update_control_var = OracleUpdateControlVar(
            task_id='update_control_var',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            depends_on_past=True,
            control_var=control_var,
            last_control_var=last_control_var,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            col_control_var=col_control_var,
            oracle_conn_id=oracle_conn_id,
            dict_bind=dict_bind_sql_get_data,
            sql=sql_get_data)

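        # Re-trigger this same DAG so the next control_var window is imported until no rows remain.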
        task_trigger_import_file = TriggerDagRunOperator(
            task_id=f'trigger_import_file_{dag_name}',
            trigger_dag_id=dag.dag_id)

        task_end = PythonOperator(task_id='end',
                                  trigger_rule=TriggerRule.ALL_DONE,
                                  python_callable=end_time,
                                  op_kwargs={
                                      'current_dag_name': current_dag_name,
                                      'dag_name': dag_name,
                                      'last_control_var_name':
                                      f'{dag_name}_last_control_var',
                                      'postgres_conn_id': 'airflow_db'
                                  })

    # -----------------
    #      GRAPH
    # -----------------
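    # Branch with new data: extract -> update control var -> re-trigger this DAG.
    # Branch without new data: sync -> (concat files | save state) -> update stats -> clear vars.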
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle \
        >> task_extract_transform_load >> task_update_control_var \
        >> task_trigger_import_file >> task_end
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle \
        >> task_sync_data >> [task_concat_file, task_save_execution_state_hdfs] \
        >> task_update_statistics >> task_clear_airflow_var >> task_end

    return dag
Exemplo n.º 12
0
    def execute(self, context):
        template = '-' * 79
        hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)
        spark, sc = init_spark(app_name=f'{self.step}_{self.dag_name}',
                               step=self.step,
                               env=self.env,
                               dag_name=self.dag_name,
                               layer=self.layer,
                               path_ojdbc=self.path_ojdbc,
                               path_spark_avro=self.path_spark_avro,
                               path_native_lib=self.path_native_lib,
                               executor_cores=self.executor_cores,
                               executor_memory=self.executor_memory,
                               executor_instances=self.executor_instances,
                               driver_memory=self.driver_memory)
        avro_schema = hdfs.read_avro_schema(
            path_avro_schema=self.path_avro_schema,
            layer=self.layer,
            dag_name=self.dag_name)

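        # One iteration per reference date; the per-date BLOB query and page count were
        # published as Airflow Variables earlier in the pipeline.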
        for date in self.list_current_dates:
            start = time.time()
            query_blob = Variable.get(
                f'{self.current_dag_name}_sql_blob_{date}')
            n_partitions = int(
                Variable.get(f'{self.current_dag_name}_total_pg_{date}')
            )  # 1 pg = 1000 rows
            hdfs_path = hdfs.generate_hdfs_path(dag_id=self.dag_name,
                                                env=self.env,
                                                layer=self.layer,
                                                date=date)

            self.log.info(
                f'\n{template}\nExtracting from Oracle: {date}\n{template}')
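            # Prefer the PySpark extraction; 'acc' and 'dimp' payloads are forced straight
            # to the pandas fallback below, as are Spark serialization failures.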
            try:
                if self.dag_name in ['acc', 'dimp']:
                    raise ValueError('Switch execution pyspark to pandas ...')

                df = get_pyspark_df_to_process(
                    oracle_conn_id=self.oracle_conn_id,
                    oracle_conn_blob=self.oracle_conn_blob,
                    oracle_driver=self.oracle_driver,
                    spark=spark,
                    n_partitions=n_partitions,
                    query_blob=query_blob,
                    table_blob_col_pk=self.table_blob_col_pk,
                    table_blob_col_blob=self.table_blob_col_blob,
                    current_dag_name=self.current_dag_name,
                    extra_cols=self.extra_cols,
                    date=date)

                df_preprocessed = preprocess_data_doc(
                    df=df,
                    col_type_pk=self.col_type_pk,
                    table_blob_col_pk=self.table_blob_col_pk,
                    table_blob_col_blob=self.table_blob_col_blob)

                # persisting the dataframe to HDFS at this point reduces shuffle operations downstream
                self.log.info(
                    f'\n{template}\nWriting avro in HDFS\n{template}')
                df_preprocessed.explain()

                hdfs.save_pyspark_df(df=df_preprocessed,
                                     avro_schema=avro_schema,
                                     compress_type=self.compress_type,
                                     mode='append',
                                     format='avro',
                                     partitions=n_partitions,
                                     hdfs_path=hdfs_path)

                self.log.info(
                    f'\n***** REPORT *****\n'
                    f'Date             = {date}\n'
                    f'Local            = {hdfs_path}\n'
                    f'Total time       = {time.time() - start} sec\n'
                    f'Total rows       = {df_preprocessed.count()}\n'
                    f'Total partitions = {df_preprocessed.rdd.getNumPartitions()}'
                )

            except (ValueError, Py4JJavaError,
                    pyspark.sql.utils.PythonException) as err:
                "E.g: ValueError: can not serialize object larger than 2G"
                self.log.error(
                    f"\n{template}\n{err.__class__} occurred !!!\n{template}\n"
                )
                self.log.info(f'\n{template}\nExecuting Pandas\n{template}')

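                # Pandas fallback: re-fetch the same page without Spark, e.g. when a row is
                # too large for Spark to serialize.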
                pdf = get_pandas_df_to_process(
                    oracle_conn_id=self.oracle_conn_id,
                    oracle_conn_blob=self.oracle_conn_blob,
                    query_blob=query_blob,
                    table_blob_col_pk=self.table_blob_col_pk,
                    table_blob_col_blob=self.table_blob_col_blob,
                    extra_cols=self.extra_cols,
                    current_dag_name=self.current_dag_name,
                    date=date)

                # TODO: create function
                # >========================================================
                if self.dag_name == 'acc':
                    data = pandas_preprocess_acc(
                        pdf=pdf,
                        table_blob_col_blob=self.table_blob_col_blob,
                        compress_type=self.compress_type,
                        avro_schema=self.path_local_avro_schemas)
                elif self.dag_name == 'dimp':
                    data = pandas_preprocess_dimp(
                        pdf=pdf,
                        table_blob_col_blob=self.table_blob_col_blob,
                        compress_type=self.compress_type,
                        avro_schema=self.path_local_avro_schemas)
                else:
                    data = pandas_preprocess_data_doc(
                        pdf=pdf,
                        table_blob_col_pk=self.table_blob_col_pk,
                        table_blob_col_blob=self.table_blob_col_blob,
                        compress_type=self.compress_type,
                        avro_schema=self.path_local_avro_schemas)
                # >========================================================

                self.log.info(
                    f'\n{template}\nWriting avro in HDFS\n{template}')
                hdfs.load_data(
                    hdfs_path=hdfs_path,
                    data=data,
                    hdfs_filename=f'part-0-mapred_{self.current_dag_name}_{date}.avro')

                self.log.info(f'\n***** REPORT *****\n'
                              f'Date             = {date}\n'
                              f'Local            = {hdfs_path}\n'
                              f'Total time       = {time.time() - start} sec\n'
                              f'Total rows       = {len(pdf.index)}')