Example #1
def get_pyspark_df_to_process(oracle_conn_id: str, oracle_conn_blob: str,
                              oracle_driver: str,
                              spark: pyspark.sql.session.SparkSession,
                              n_partitions: int, query_blob: str,
                              table_blob_col_pk: str, table_blob_col_blob: str,
                              current_dag_name: str, extra_cols: str,
                              date: str) -> pyspark.sql.dataframe.DataFrame:
    df = OracleHelper(oracle_conn_blob) \
        .get_pyspark_df_from_table(oracle_driver=oracle_driver,
                                   spark=spark,
                                   table=f'({query_blob})',
                                   partition_col='COL_PARTITION',
                                   n_partitions=n_partitions * 5) \
        .select(table_blob_col_pk,
                table_blob_col_blob)

    if len(extra_cols) > 0:
        query_extra_col = Variable.get(
            f'{current_dag_name}_sql_extra_cols_{date}')
        df_extra_cols = OracleHelper(oracle_conn_id) \
            .get_pyspark_df(spark=spark,
                            oracle_driver=oracle_driver,
                            sql=query_extra_col)

        return join_pyspark_df(df=df,
                               df_extra_cols=df_extra_cols,
                               id_df=table_blob_col_pk)

    return df
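
# join_pyspark_df is imported from elsewhere in the project and not shown here.
# A minimal sketch of what it might do, assuming it simply left-joins the extra
# columns onto the BLOB dataframe by its primary-key column (an illustration,
# not the project's actual implementation):
def join_pyspark_df(df: pyspark.sql.dataframe.DataFrame,
                    df_extra_cols: pyspark.sql.dataframe.DataFrame,
                    id_df: str) -> pyspark.sql.dataframe.DataFrame:
    # keep every BLOB row and attach the extra columns where the ids match
    return df.join(df_extra_cols, on=id_df, how='left')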
Example #2
def get_pandas_df_to_process(oracle_conn_id: str,
                             oracle_conn_blob: str,
                             query_blob: str,
                             table_blob_col_pk: str,
                             table_blob_col_blob: str,
                             extra_cols: str,
                             current_dag_name: str,
                             date: str) -> pd.DataFrame:
    df = OracleHelper(oracle_conn_blob) \
        .get_pandas_df(query_blob)

    df = df[
        [table_blob_col_pk, table_blob_col_blob]
    ]

    if len(extra_cols) > 0:
        query_extra_col = Variable.get(f'{current_dag_name}_sql_extra_cols_{date}')
        df_extra_cols = OracleHelper(oracle_conn_id) \
            .get_pandas_df(sql=query_extra_col)

        return join_pandas_df(pdf=df,
                              pdf_extra_cols=df_extra_cols,
                              id_df=table_blob_col_pk.upper())

    return df
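
# join_pandas_df is likewise not shown. A sketch under the same assumption
# (a left merge on the primary-key column; illustrative only):
def join_pandas_df(pdf: pd.DataFrame,
                   pdf_extra_cols: pd.DataFrame,
                   id_df: str) -> pd.DataFrame:
    # keep every BLOB row and attach the extra columns where the ids match
    return pdf.merge(pdf_extra_cols, on=id_df, how='left')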
Example #3
    def execute(self, context):
        oracle = OracleHelper(self.oracle_conn_id)
        self.log.info(
            f"Executing SQL:{self.sql_count_id}\nParameters: {self.dict_bind}")
        count_id = oracle.get_rows_with_bind(sql=self.sql_count_id,
                                             bind=self.dict_bind)[0][0]

        Variable.set(key=f'{self.current_dag_name}_total_row_id',
                     value=count_id)
        self.log.info(f"{count_id} rows are not in HDFS.")
Example #4
    def execute(self, context):
        oracle = OracleHelper(self.oracle_conn_id)
        redis = RedisHook(self.redis_conn_id)
        self.log.info(f"Executing SQL:{self.sql}")

        self.log.info("Extracting data from Oracle")
        conn_redis = redis.get_conn()
        records = oracle.get_rows_with_bind(sql=self.sql,
                                            bind=self.dict_bind)

        self.log.info("Inserting rows into Redis")
        pipe = conn_redis.pipeline()
        for row in records:
            pipe.lpush(self.name_redis_key, str(row))
        pipe.execute()
        self.log.info(f"Inserted {len(records)} rows.")
Example #5
def test_oracle_conn_db_transfers():
    """
    Tests whether the connection in Airflow has been correctly created
    and validates that the connection is open
    """
    assert 1 == OracleHelper('DB_trans') \
        .get_rows('SELECT 1 FROM db_name.table_name FETCH FIRST 1 ROWS ONLY')
Example #6
def prepare_avro_schema(layer: str, data_name: str, template: str,
                        path_ojdbc: str, path_native_lib: str, doc_type: str,
                        list_dict_cols: list) -> dict:
    if doc_type != 'table_test':
        return generate_avro_schema(data_name=data_name,
                                    layer=layer,
                                    list_dict_cols=list_dict_cols)

    "Generate dynamic avro schema from tables"
    spark, sc = init_spark(app_name='generate_data_schema',
                           step='generate_data_schema',
                           dag_name=data_name,
                           layer=layer,
                           env=env,
                           path_ojdbc=path_ojdbc,
                           path_native_lib=path_native_lib,
                           executor_cores='4',
                           executor_memory='4g',
                           executor_instances='2',
                           driver_memory='1g')

    logging.info(f'\n{template}\nGetting data from Oracle\n{template}')
    df_oracle_table = OracleHelper(context['oracle_conn_table']) \
        .get_pyspark_df(spark=spark,
                        oracle_driver='oracle.jdbc.driver.OracleDriver',
                        sql=f'SELECT * FROM {data_name}')
    df_preprocessed = preprocess_data_table(df_oracle_table)

    return generate_avro_schema_from_df(dag_name=data_name,
                                        layer=layer,
                                        df=df_preprocessed)
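
# generate_avro_schema_from_df is not shown either. A rough sketch of the idea,
# assuming it maps the Spark schema of the preprocessed dataframe onto an Avro
# record (the type mapping below is illustrative and deliberately incomplete):
def generate_avro_schema_from_df(dag_name: str, layer: str,
                                 df: pyspark.sql.dataframe.DataFrame) -> dict:
    spark_to_avro = {'string': 'string', 'int': 'int', 'bigint': 'long',
                     'double': 'double', 'timestamp': 'string'}
    return {
        'type': 'record',
        'name': dag_name,
        'namespace': layer,
        'fields': [
            # every column is declared nullable, hence the union with "null"
            {'name': name, 'type': ['null', spark_to_avro.get(dtype, 'string')]}
            for name, dtype in df.dtypes
        ],
    }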
Example #7
    def generate_sql_by_date(self, **context) -> None:
        self.log.info('Generating SQL ...')
        items_by_query = int(context['items_by_query'])

        for date in context['list_current_dates']:
            list_records_unsorted = self.get_list_redis(context['redis_key'] + '_' + date)
            list_records = sorted(list_records_unsorted, key=lambda tup: tup[1])
            total_pg_date = eval(
                Variable.get(context['current_dag_name'] + '_' + 'total_pg' + '_' + date))

            self.log.info(f'Getting {context["redis_key"] + "_" + date} in Redis')

            sql_blob = OracleHelper(context['oracle_conn']) \
                .generate_sql_get_data(total_pg_date=total_pg_date,
                                       list_id_by_date=[x[-3] for x in list_records],
                                       items_by_query=items_by_query,
                                       date=date,
                                       table_blob=context['table_blob'],
                                       table_blob_col_pk=context['table_blob_col_pk'],
                                       table_blob_col_blob=context['table_blob_col_blob'])
            Variable.set(key=context['current_dag_name'] + '_' + 'sql_blob' + '_' + date,
                         value=sql_blob)

            if len(context['extra_cols']) > 0:
                self.log.info('Generating extra cols ...')
                sql_id_extra_cols = OracleHelper(context['oracle_conn']) \
                    .generate_sql_get_data(total_pg_date=total_pg_date,
                                           list_id_by_date=[x[-3] for x in list_records],
                                           date=date,
                                           items_by_query=items_by_query,
                                           table_ctrl=context['table_ctrl'],
                                           table_ctrl_col_fk=context['table_ctrl_col_fk'],
                                           has_extra_cols=True,
                                           extra_cols=context['extra_cols'])
                Variable.set(key=context['current_dag_name'] + '_' + 'sql_extra_cols' + '_' + date,
                             value=sql_id_extra_cols)
Example #8
    def execute(self, context):
        oracle = OracleHelper(self.oracle_conn_id)
        sql = f"SELECT MAX({self.col_control_var}) FROM ({self.sql})"
        self.log.info(f"Executing SQL:\n{sql}")
        self.log.info(f"Parameters:\n{self.dict_bind}")

        max_value = f"{oracle.get_rows_with_bind(sql=sql, bind=self.dict_bind)[0][0]:015d}"  # 000.000.000.000.000

        Variable.set(key=f'{self.dag_name}_control_var', value=max_value)
        Variable.set(key=f'{self.dag_name}_last_control_var',
                     value=self.control_var)

        self.log.info(f'Updated Airflow variable:\n'
                      f'current_dag_name: {self.current_dag_name}\n'
                      f'last_control_var to: {self.control_var}\n'
                      f'control_var to: {max_value}')
Example #9
    def generate_all_partitions(self, oracle_conn: str, table_ctrl: str,
                                table_ctrl_col_dt_ref: str, agg_by: str,
                                env: str, layer: str, data: str) -> None:
        """
        Generates all partitions in Hive and Impala.
        Useful when it is necessary to recreate/change databases, tables or data directories in HDFS.
        """
        list_all_dates = OracleHelper(oracle_conn).get_all_dates(
            table_ctrl=table_ctrl, table_ctrl_col_dt_ref=table_ctrl_col_dt_ref)
        list_all_dates = [dt for dt in list_all_dates if dt is not None]
        list_dates = AirflowMetaStoreHelper().set_granularity(
            list_all_dates=list_all_dates, agg_by=agg_by)

        for date in list_dates:
            hdfs_path = HdfsHelper().generate_hdfs_path(env=env,
                                                        layer=layer,
                                                        dag_id=data,
                                                        date=date)

            self.log.info(f"Creating partition:")
            self.add_partition(date=date,
                               db=data,
                               table=layer,
                               hdfs_path=hdfs_path)
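
    # add_partition is defined elsewhere in this class. The DDL it has to issue
    # is roughly the statement below; the partition column name ("dt") and how
    # the statement reaches Hive/Impala are assumptions of this sketch, not the
    # project's actual code:
    def add_partition(self, date: str, db: str, table: str,
                      hdfs_path: str) -> None:
        ddl = (f"ALTER TABLE {db}.{table} "
               f"ADD IF NOT EXISTS PARTITION (dt='{date}') "
               f"LOCATION '{hdfs_path}'")
        self.log.info(ddl)
        # e.g. hand `ddl` to a HiveServer2 or Impala cursor in the real operator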
Example #10
    def execute(self, context):
        template = '-' * 79
        start = time.time()
        hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)
        oracle = OracleHelper(self.oracle_conn_id)

        spark, sc = init_spark(app_name=f'{self.step}_{self.dag_name}',
                               step=self.step,
                               env=self.env,
                               dag_name=self.dag_name,
                               layer=self.layer,
                               path_ojdbc=self.path_ojdbc,
                               path_spark_avro=self.path_spark_avro,
                               path_native_lib=self.path_native_lib,
                               executor_cores=self.executor_cores,
                               executor_memory=self.executor_memory,
                               executor_instances=self.executor_instances,
                               driver_memory=self.driver_memory)
        avro_schema = hdfs \
            .read_avro_schema(path_avro_schema=self.path_avro_schema,
                              layer=self.layer,
                              dag_name=self.dag_name)

        hdfs_path = hdfs \
            .generate_hdfs_path(dag_id=self.dag_name,
                                env=self.env,
                                layer=self.layer,
                                is_partitioned=False)

        self.log.info(f'\n{template}\nGetting data from Oracle\n{template}')
        self.log.info(
            f'query:{self.sql_get_data}\n parameters:\n{self.dict_bind}')
        records = oracle.get_rows_with_bind(sql=self.sql_get_data,
                                            bind=self.dict_bind)
        list_dict_cols = read_data_config(
            self.dag_name)[self.dag_name]['hdfs_data_schema']['raw']['cols']
        df_oracle_table = convert_type_oracle_to_spark(
            spark=spark, records=records, list_dict_cols=list_dict_cols)

        df_preprocessed = preprocess_data_table(df_oracle_table)
        df_preprocessed.explain()
        df_preprocessed.printSchema()
        df_preprocessed.show(n=1)

        total_registry = df_oracle_table.count()
        n_partitions = calculate_partitions(total_registry=total_registry,
                                            max_registry_by_avro=int(
                                                self.max_registry_by_file))
        # TODO: analyze and test ORC (supports ACID)
        self.log.info(f'\n{template}\nWriting table in HDFS\n{template}')
        hdfs.save_pyspark_df(df=df_preprocessed,
                             format='parquet',
                             avro_schema=avro_schema,
                             compress_type=self.compress_type,
                             mode='append',
                             partitions=n_partitions,
                             hdfs_path=hdfs_path)

        self.log.info(
            f'\n***** REPORT *****\n'
            f'Location         = {hdfs_path}\n'
            f'Total time       = {time.time() - start} sec\n'
            f'Total rows       = {total_registry}\n'
            f'Total partitions = {df_preprocessed.rdd.getNumPartitions()}')
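
# calculate_partitions is imported from elsewhere in the project. Judging by the
# call sites, its contract is "enough output files that none holds more than
# max_registry_by_avro rows"; a minimal sketch of that idea:
import math

def calculate_partitions(total_registry: int, max_registry_by_avro: int) -> int:
    # at least one partition, even for an empty extraction
    return max(1, math.ceil(total_registry / max_registry_by_avro))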
Example #11
    def execute(self, **context):
        template = '-' * 79
        hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)
        spark, sc = init_spark(app_name=f'sync_data-{self.dag_name}',
                               step='sync',
                               env=self.env,
                               dag_name=self.dag_name,
                               layer=self.layer,
                               path_ojdbc=self.path_ojdbc,
                               path_spark_avro=self.path_spark_avro,
                               path_native_lib=self.path_native_lib,
                               executor_cores=self.executor_cores,
                               executor_memory=self.executor_memory,
                               executor_instances=self.executor_instances,
                               driver_memory=self.driver_memory)
        avro_schema = hdfs.read_avro_schema(
            path_avro_schema=self.path_avro_schema,
            layer=self.layer,
            dag_name=self.dag_name)
        hdfs_path = hdfs.generate_hdfs_path(env=self.env,
                                            layer=self.layer,
                                            dag_id=self.dag_name,
                                            is_partitioned=False)

        sql_get_data = f'''
        SELECT
            {self.col_name_control_var},
            {self.col_name_dt_ref}
        FROM {self.db_name}.{self.table_name}
        WHERE
            TO_DATE(to_char({self.col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
            < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')
        ORDER BY {self.col_name_control_var} ASC
        '''

        self.log.info(f'\n{template}\nGetting data from Oracle\n{template}')
        df_oracle_table = OracleHelper(self.oracle_conn_id) \
            .get_pyspark_df_from_table(oracle_driver=self.oracle_driver,
                                       spark=spark,
                                       table=f'({sql_get_data})',
                                       partition_col=self.col_name_control_var,
                                       n_partitions=250) \
            .orderBy(self.col_name_control_var) \
            .withColumn(self.col_name_control_var,
                        col(self.col_name_control_var).cast(LongType())) \
            .withColumn(self.col_name_dt_ref,
                        col(self.col_name_dt_ref).cast(StringType()))

        total_oracle = df_oracle_table.count()
        self.log.info(f'Total rows from Oracle = {total_oracle}')

        self.log.info(f'\n{template}\nGetting data from HDFS\n{template}')
        hdfs.mv_files(hdfs_src_path=hdfs_path,
                      hdfs_dst_path=f'{hdfs_path}/../.tmp_{self.dag_name}')

        df_hdfs = hdfs \
            .load_pyspark_df(spark=spark,
                             data_format='parquet',
                             path=f'../../{hdfs_path}/../.tmp_{self.dag_name}') \
            .orderBy(self.col_name_control_var) \
            .withColumn(self.col_name_control_var, col(self.col_name_control_var).cast(LongType())) \
            .withColumn(self.col_name_dt_ref, col(self.col_name_dt_ref).cast(StringType()))

        df_hdfs_filtered = df_hdfs \
            .select(col(self.col_name_control_var),
                    col(self.col_name_dt_ref))

        total_hdfs = df_hdfs_filtered.count()
        self.log.info(f'Total rows from HDFS = {total_hdfs}')

        if total_hdfs > total_oracle:
            self.log.warning(
                f'\n{template}\nRow counts do not match!\n{template}')
            self.log.warning(f'\nOracle = {total_oracle}'
                             f'\nHDFS   = {total_hdfs}')

            self.log.info(
                f'\n{template}\nExecuting: df_hdfs - df_oracle_table\n{template}'
            )
            df_row_to_delete_hdfs = df_hdfs_filtered.subtract(df_oracle_table)
            list_row_to_delete_hdfs = [
                row[0] for row in df_row_to_delete_hdfs.select(
                    self.col_name_control_var).collect()
            ]
            self.log.info(
                f'Total rows to delete = {df_row_to_delete_hdfs.count()}')

            self.log.info(f'\n{template}\nDeleting rows from HDFS\n{template}')
            df = df_hdfs.filter(~df_hdfs[self.col_name_control_var].isin(
                list_row_to_delete_hdfs))
            total_registry = df.count()
            self.log.info(f'Total rows in new df = {total_registry}')
            df.show(n=1, truncate=False)

            n_files = calculate_partitions(total_registry=total_registry,
                                           max_registry_by_avro=int(
                                               self.max_registry_by_file))

            self.log.info(f'\n{template}\nWriting table in HDFS\n{template}')
            hdfs.save_pyspark_df(df=df,
                                 format='parquet',
                                 avro_schema=avro_schema,
                                 compress_type=self.compress_type,
                                 mode='overwrite',
                                 partitions=n_files,
                                 hdfs_path=hdfs_path)
            hdfs.remove_all_files(
                hdfs_path=f'{hdfs_path}/../.tmp_{self.dag_name}')

        try:
            hdfs.mv_files(hdfs_src_path=f'{hdfs_path}/../.tmp_{self.dag_name}',
                          hdfs_dst_path=hdfs_path)
        except Exception as e:
            self.log.warning(e)