def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwm_oride_driver_act_m_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name,
                                                 hdfs_path, "true", "true")
Example #2
def execution_data_task_id(ds, ds_nodash, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_opay_bd_agent_report_d_sql_task(ds, ds_nodash)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name,
                                                 hdfs_path, "true", "true")
Example #3
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dim_oride_passenger_whitelist_base_sql_task(ds)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Circuit breaker: validate key data before publishing
    check_key_data_task(ds)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")
def execution_act_driver_task(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_oride_act_driver_cohort_m_sql_task(ds)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    hdfs_path = get_table_info(3)[1]
    TaskTouchzSuccess().countries_touchz_success(ds, "oride_dw",
                                                 get_table_info(3)[0],
                                                 hdfs_path, "true", "true")
Example #5
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = ods_sqoop_base_adjustment_increase_record_di_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name,
                                                 hdfs_path, "false", "true")
Example #6
def execution_data_task_id(ds, **kwargs):
    v_date = kwargs.get('v_execution_date')
    v_day = kwargs.get('v_execution_day')
    v_hour = kwargs.get('v_execution_hour')

    hive_hook = HiveCliHook()
    """
            #功能函数
            alter语句: alter_partition
            删除分区: delete_partition
            生产success: touchz_success

            #参数
            第一个参数true: 所有国家是否上线。false 没有
            第二个参数true: 数据目录是有country_code分区。false 没有
            第三个参数true: 数据有才生成_SUCCESS false 数据没有也生成_SUCCESS 

            #读取sql
            %_sql(ds,v_hour)

            第一个参数ds: 天级任务
            第二个参数v_hour: 小时级任务,需要使用

        """
    cf = CountriesPublicFrame("true", ds, db_name, table_name, hdfs_path, "true", "true")

    # 删除分区
    cf.delete_partition()

    # 读取sql
    _sql = "\n" + cf.alter_partition() + "\n" + dm_oride_order_base_d_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # 执行Hive
    hive_hook.run_cli(_sql)

    # 熔断数据,如果数据不能为0
    # check_key_data_cnt_task(ds)

    # 生产success
    cf.touchz_success()
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwm_oride_passenger_act_w_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(
        '{pt}'.format(pt=airflow.macros.ds_add(ds, +6)), db_name, table_name,
        hdfs_path, "true", "true")
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_opay_owealth_report_d_19_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(airflow.macros.ds_add(ds, +1),
                                                 db_name, table_name,
                                                 hdfs_path, "true", "true")
Example #9
def execution_new_user_task(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_oride_new_user_cohort_w_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    pt = airflow.macros.ds_add(ds, +6)
    hdfs_path = get_table_info(0)[1]
    TaskTouchzSuccess().countries_touchz_success(pt, "oride_dw", get_table_info(0)[0], hdfs_path, "true", "true")
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_opos_user_shop_remain_month_m_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Circuit breaker
    # check_key_data_task(ds)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")
    def test_load_df_with_data_types(self, mock_run_cli):
        d = OrderedDict()
        d['b'] = [True]
        d['i'] = [-1]
        d['t'] = [1]
        d['f'] = [0.0]
        d['c'] = ['c']
        d['M'] = [datetime.datetime(2018, 1, 1)]
        d['O'] = [object()]
        d['S'] = ['STRING'.encode('utf-8')]
        d['U'] = ['STRING']
        d['V'] = [None]
        df = pd.DataFrame(d)

        hook = HiveCliHook()
        hook.load_df(df, 't')

        query = """
            CREATE TABLE IF NOT EXISTS t (
                b BOOLEAN,
                i BIGINT,
                t BIGINT,
                f DOUBLE,
                c STRING,
                M TIMESTAMP,
                O STRING,
                S STRING,
                U STRING,
                V STRING)
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            STORED AS textfile
            ;
        """

        def _trim(s):
            return re.sub(r"\s+", " ", s.strip())

        self.assertEqual(_trim(mock_run_cli.call_args_list[0][0][0]),
                         _trim(query))
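Outside the test harness, load_df infers the Hive schema from the DataFrame dtypes, as the CREATE TABLE statement above shows. A minimal usage sketch, assuming a configured 'hive_cli_default' connection and a throwaway table name:

import pandas as pd
from airflow.hooks.hive_hooks import HiveCliHook

df = pd.DataFrame({'id': [1, 2], 'name': ['a', 'b']})
hook = HiveCliHook(hive_cli_conn_id='hive_cli_default')
# int64 columns become BIGINT, object columns become STRING, and the table is
# created with CREATE TABLE IF NOT EXISTS before the data is loaded.
hook.load_df(df, table='tmp_load_df_example', delimiter=',')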
Example #12
def execution_data_task_id(ds, execution_date, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwd_oride_location_driver_event_hi_sql_task(ds, execution_date)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Circuit breaker
    # check_key_data_task(ds, execution_date)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "false",
                                                 execution_date.strftime("%H"))
Example #13
def execution_data_task_id(ds, **kwargs):
    hive_hook = HiveCliHook()

    v_hour = kwargs.get('v_execution_hour')

    # Read the SQL
    _sql = dwd_driver_track_data_hi_sql_task(ds, v_hour)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name,
                                                 hdfs_path, "true", "false",
                                                 v_hour)
Example #14
    def test_load_df(self, mock_to_csv, mock_load_file):
        df = pd.DataFrame({"c": ["foo", "bar", "baz"]})
        table = "t"
        delimiter = ","
        encoding = "utf-8"

        hook = HiveCliHook()
        hook.load_df(df=df,
                     table=table,
                     delimiter=delimiter,
                     encoding=encoding)

        mock_to_csv.assert_called_once()
        kwargs = mock_to_csv.call_args[1]
        self.assertEqual(kwargs["header"], False)
        self.assertEqual(kwargs["index"], False)
        self.assertEqual(kwargs["sep"], delimiter.encode(encoding))

        mock_load_file.assert_called_once()
        kwargs = mock_load_file.call_args[1]
        self.assertEqual(kwargs["delimiter"], delimiter)
        self.assertEqual(kwargs["field_dict"], {"c": u"STRING"})
        self.assertEqual(kwargs["table"], table)
Example #15
    def execute(self, context=None):
        try:
            metastore_hook = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
            table_metadata = metastore_hook.get_table(self.table, db=self.schema)
            is_partitioned = len(table_metadata.partitionKeys) > 0
            column_string = ', '.join([col.name for col in table_metadata.sd.cols])

            where_clause = 'WHERE {}'.format(self.partition) if is_partitioned else ''
            self.hql = "SELECT COUNT(col2sum) FROM (SELECT COUNT(1) AS col2sum FROM {}.{} {} GROUP BY {}) t2 " \
                       "WHERE t2.col2sum > 1".format(self.schema, self.table, where_clause, column_string)

            hook = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id, mapred_queue=self.mapred_queue)
            hook.hive_cli_params = '-S'  # suppress hive junk output
            output = hook.run_cli(hql=self.hql, schema=self.schema)
            output_row = int(output.strip())

            if output_row > self.max_duplicates:
                raise AirflowException('There are {} duplicate records found whereas the max number of duplicates'
                                       ' can be {}'.format(output_row, self.max_duplicates))

        except Exception as e:
            raise AirflowException('An error occurred with the following duplicate check query:\n\t{}\n{}'
                                   .format(self.hql, e))
def merge_pre_hi_data_task(hive_db, hive_all_hi_table_name, hive_hi_table_name, is_must_have_data, pt, now_hour,
                           pre_hour_day, pre_hour,
                           **kwargs):
    sqoopSchema = SqoopSchemaUpdate()
    hive_columns = sqoopSchema.get_hive_column_name(hive_db, hive_all_hi_table_name)

    hql = ADD_HI_SQL.format(
        db_name=hive_db,
        hive_all_hi_table_name=hive_all_hi_table_name,
        hive_hi_table_name=hive_hi_table_name,
        pt=pt,
        now_hour=now_hour,
        pre_hour_day=pre_hour_day,
        pre_hour=pre_hour,
        columns=',\n'.join(hive_columns)
    )

    hive_hook = HiveCliHook()

    # Log the SQL
    logging.info('Executing: %s', hql)

    # Run Hive
    hive_hook.run_cli(hql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(pt, hive_db, hive_all_hi_table_name,
                                                 ALL_HI_OSS_PATH % hive_all_hi_table_name,
                                                 "false",
                                                 is_must_have_data,
                                                 now_hour)
Example #17
    def test_load_df_with_data_types(self, mock_run_cli):
        d = OrderedDict()
        d['b'] = [True]
        d['i'] = [-1]
        d['t'] = [1]
        d['f'] = [0.0]
        d['c'] = ['c']
        d['M'] = [datetime.datetime(2018, 1, 1)]
        d['O'] = [object()]
        d['S'] = [b'STRING']
        d['U'] = ['STRING']
        d['V'] = [None]
        df = pd.DataFrame(d)

        hook = HiveCliHook()
        hook.load_df(df, 't')

        query = """
            CREATE TABLE IF NOT EXISTS t (
                b BOOLEAN,
                i BIGINT,
                t BIGINT,
                f DOUBLE,
                c STRING,
                M TIMESTAMP,
                O STRING,
                S STRING,
                U STRING,
                V STRING)
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            STORED AS textfile
            ;
        """
        assertEqualIgnoreMultipleSpaces(self,
                                        mock_run_cli.call_args_list[0][0][0],
                                        query)
Example #18
    def test_load_df(self, mock_to_csv, mock_load_file):
        df = pd.DataFrame({"c": ["foo", "bar", "baz"]})
        table = "t"
        delimiter = ","
        encoding = "utf-8"

        hook = HiveCliHook()
        hook.load_df(df=df,
                     table=table,
                     delimiter=delimiter,
                     encoding=encoding)

        assert mock_to_csv.call_count == 1
        kwargs = mock_to_csv.call_args[1]
        self.assertEqual(kwargs["header"], False)
        self.assertEqual(kwargs["index"], False)
        self.assertEqual(kwargs["sep"], delimiter)

        assert mock_load_file.call_count == 1
        kwargs = mock_load_file.call_args[1]
        self.assertEqual(kwargs["delimiter"], delimiter)
        self.assertEqual(kwargs["field_dict"], {"c": "STRING"})
        self.assertTrue(isinstance(kwargs["field_dict"], OrderedDict))
        self.assertEqual(kwargs["table"], table)
Example #19
    def test_load_df_with_data_types(self, mock_run_cli):
        ord_dict = OrderedDict()
        ord_dict['b'] = [True]
        ord_dict['i'] = [-1]
        ord_dict['t'] = [1]
        ord_dict['f'] = [0.0]
        ord_dict['c'] = ['c']
        ord_dict['M'] = [datetime.datetime(2018, 1, 1)]
        ord_dict['O'] = [object()]
        ord_dict['S'] = [b'STRING']
        ord_dict['U'] = ['STRING']
        ord_dict['V'] = [None]
        df = pd.DataFrame(ord_dict)

        hook = HiveCliHook()
        hook.load_df(df, 't')

        query = """
            CREATE TABLE IF NOT EXISTS t (
                `b` BOOLEAN,
                `i` BIGINT,
                `t` BIGINT,
                `f` DOUBLE,
                `c` STRING,
                `M` TIMESTAMP,
                `O` STRING,
                `S` STRING,
                `U` STRING,
                `V` STRING)
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            STORED AS textfile
            ;
        """
        assert_equal_ignore_multiple_spaces(
            self, mock_run_cli.call_args_list[0][0][0], query)
Example #20
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwd_active_user_week_di_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Circuit breaker
    # check_key_data_task(ds)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    after_6_day = airflow.macros.ds_add(ds, +6)
    TaskTouchzSuccess().countries_touchz_success(after_6_day, db_name,
                                                 table_name, hdfs_path, "true",
                                                 "true")
    def execute(self, context):
        # Downloading file from S3
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.log.info("Downloading S3 file")

        if self.wildcard_match:
            if not self.s3.check_for_wildcard_key(self.s3_key):
                raise AirflowException("No key matches {0}"
                                       .format(self.s3_key))
            s3_key_object = self.s3.get_wildcard_key(self.s3_key)
        else:
            if not self.s3.check_for_key(self.s3_key):
                raise AirflowException(
                    "The key {0} does not exists".format(self.s3_key))
            s3_key_object = self.s3.get_key(self.s3_key)

        _, file_ext = os.path.splitext(s3_key_object.key)
        if (self.select_expression and self.input_compressed and
                file_ext.lower() != '.gz'):
            raise AirflowException("GZIP is the only compression " +
                                   "format Amazon S3 Select supports")

        with TemporaryDirectory(prefix='tmps32hive_') as tmp_dir,\
                NamedTemporaryFile(mode="wb",
                                   dir=tmp_dir,
                                   suffix=file_ext) as f:
            self.log.info(
                "Dumping S3 key %s contents to local file %s", s3_key_object.key, f.name
            )
            if self.select_expression:
                option = {}
                if self.headers:
                    option['FileHeaderInfo'] = 'USE'
                if self.delimiter:
                    option['FieldDelimiter'] = self.delimiter

                input_serialization = {'CSV': option}
                if self.input_compressed:
                    input_serialization['CompressionType'] = 'GZIP'

                content = self.s3.select_key(
                    bucket_name=s3_key_object.bucket_name,
                    key=s3_key_object.key,
                    expression=self.select_expression,
                    input_serialization=input_serialization
                )
                f.write(content.encode("utf-8"))
            else:
                s3_key_object.download_fileobj(f)
            f.flush()

            if self.select_expression or not self.headers:
                self.log.info("Loading file %s into Hive", f.name)
                self.hive.load_file(
                    f.name,
                    self.hive_table,
                    field_dict=self.field_dict,
                    create=self.create,
                    partition=self.partition,
                    delimiter=self.delimiter,
                    recreate=self.recreate,
                    tblproperties=self.tblproperties)
            else:
                # Decompressing file
                if self.input_compressed:
                    self.log.info("Uncompressing file %s", f.name)
                    fn_uncompressed = uncompress_file(f.name,
                                                      file_ext,
                                                      tmp_dir)
                    self.log.info("Uncompressed to %s", fn_uncompressed)
                    # uncompressed file available now so deleting
                    # compressed file to save disk space
                    f.close()
                else:
                    fn_uncompressed = f.name

                # Testing if header matches field_dict
                if self.check_headers:
                    self.log.info("Matching file header against field_dict")
                    header_list = self._get_top_row_as_list(fn_uncompressed)
                    if not self._match_headers(header_list):
                        raise AirflowException("Header check failed")

                # Deleting top header row
                self.log.info("Removing header from file %s", fn_uncompressed)
                headless_file = (
                    self._delete_top_row_and_compress(fn_uncompressed,
                                                      file_ext,
                                                      tmp_dir))
                self.log.info("Headless file %s", headless_file)
                self.log.info("Loading file %s into Hive", headless_file)
                self.hive.load_file(headless_file,
                                    self.hive_table,
                                    field_dict=self.field_dict,
                                    create=self.create,
                                    partition=self.partition,
                                    delimiter=self.delimiter,
                                    recreate=self.recreate,
                                    tblproperties=self.tblproperties)
Example #22
 def test_run_cli(self):
     hook = HiveCliHook()
     hook.run_cli("SHOW DATABASES")
Example #23
 def get_hook(self):
     if self.conn_type == 'mysql':
         from airflow.hooks.mysql_hook import MySqlHook
         return MySqlHook(mysql_conn_id=self.conn_id)
     elif self.conn_type == 'google_cloud_platform':
         from airflow.gcp.hooks.bigquery import BigQueryHook
         return BigQueryHook(bigquery_conn_id=self.conn_id)
     elif self.conn_type == 'postgres':
         from airflow.hooks.postgres_hook import PostgresHook
         return PostgresHook(postgres_conn_id=self.conn_id)
     elif self.conn_type == 'pig_cli':
         from airflow.hooks.pig_hook import PigCliHook
         return PigCliHook(pig_cli_conn_id=self.conn_id)
     elif self.conn_type == 'hive_cli':
         from airflow.hooks.hive_hooks import HiveCliHook
         return HiveCliHook(hive_cli_conn_id=self.conn_id)
     elif self.conn_type == 'presto':
         from airflow.hooks.presto_hook import PrestoHook
         return PrestoHook(presto_conn_id=self.conn_id)
     elif self.conn_type == 'hiveserver2':
         from airflow.hooks.hive_hooks import HiveServer2Hook
         return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
     elif self.conn_type == 'sqlite':
         from airflow.hooks.sqlite_hook import SqliteHook
         return SqliteHook(sqlite_conn_id=self.conn_id)
     elif self.conn_type == 'jdbc':
         from airflow.hooks.jdbc_hook import JdbcHook
         return JdbcHook(jdbc_conn_id=self.conn_id)
     elif self.conn_type == 'mssql':
         from airflow.hooks.mssql_hook import MsSqlHook
         return MsSqlHook(mssql_conn_id=self.conn_id)
     elif self.conn_type == 'oracle':
         from airflow.hooks.oracle_hook import OracleHook
         return OracleHook(oracle_conn_id=self.conn_id)
     elif self.conn_type == 'vertica':
         from airflow.contrib.hooks.vertica_hook import VerticaHook
         return VerticaHook(vertica_conn_id=self.conn_id)
     elif self.conn_type == 'cloudant':
         from airflow.contrib.hooks.cloudant_hook import CloudantHook
         return CloudantHook(cloudant_conn_id=self.conn_id)
     elif self.conn_type == 'jira':
         from airflow.contrib.hooks.jira_hook import JiraHook
         return JiraHook(jira_conn_id=self.conn_id)
     elif self.conn_type == 'redis':
         from airflow.contrib.hooks.redis_hook import RedisHook
         return RedisHook(redis_conn_id=self.conn_id)
     elif self.conn_type == 'wasb':
         from airflow.contrib.hooks.wasb_hook import WasbHook
         return WasbHook(wasb_conn_id=self.conn_id)
     elif self.conn_type == 'docker':
         from airflow.hooks.docker_hook import DockerHook
         return DockerHook(docker_conn_id=self.conn_id)
     elif self.conn_type == 'azure_data_lake':
         from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
         return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
     elif self.conn_type == 'azure_cosmos':
         from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
         return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
     elif self.conn_type == 'cassandra':
         from airflow.contrib.hooks.cassandra_hook import CassandraHook
         return CassandraHook(cassandra_conn_id=self.conn_id)
     elif self.conn_type == 'mongo':
         from airflow.contrib.hooks.mongo_hook import MongoHook
         return MongoHook(conn_id=self.conn_id)
     elif self.conn_type == 'gcpcloudsql':
         from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook
         return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
     elif self.conn_type == 'grpc':
         from airflow.contrib.hooks.grpc_hook import GrpcHook
         return GrpcHook(grpc_conn_id=self.conn_id)
     raise AirflowException("Unknown hook type {}".format(self.conn_type))
Example #24
 def get_hook(self):
     return HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id,
                        run_as=self.run_as,
                        mapred_queue=self.mapred_queue,
                        mapred_queue_priority=self.mapred_queue_priority,
                        mapred_job_name=self.mapred_job_name)
def execution_data_task_id(ds, dag, **kwargs):
    v_date = kwargs.get('v_execution_date')
    v_day = kwargs.get('v_execution_day')
    v_hour = kwargs.get('v_execution_hour')

    hive_hook = HiveCliHook()

    # Read the SQL
    # _sql = app_otrade_b2b_order_target_retailer_di_sql_task(ds)

    # logging.info('Executing: %s', _sql)

    # Run Hive
    # hive_hook.run_cli(_sql)

    # Circuit breaker
    # check_key_data_task(ds)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": generate _SUCCESS only when data exists; "false": generate _SUCCESS even if there is no data.
    """
    # TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "false")
    """
    # Helper functions
        ALTER statement: alter_partition()
        Drop partition: delete_partition()
        Produce _SUCCESS: touchz_success()

    # Parameters
        is_countries_online -- whether the multi-country business is live, default "true" (live)
        db_name -- Hive database name
        table_name -- Hive table name
        data_oss_path -- OSS path of the data directory
        is_country_partition -- whether there is a country_code partition, default "true" (has a country_code partition)
        is_result_force_exist -- whether output is mandatory, default "true" (_SUCCESS only when data exists); "false": generate _SUCCESS even if there is no data
        execute_time -- execution time of the current script (%Y-%m-%d %H:%M:%S)
        is_hour_task -- whether this is an hourly task, default "false"
        frame_type -- template type (only effective when is_hour_task is "true"): "utc" produces UTC-time partitions, "local" produces local-time partitions, default "utc"
        is_offset -- whether to enable a time offset (affects the _SUCCESS file)
        execute_time_offset -- execution-time offset (-1, 0, 1), shifts the current execution time to produce the previous or next hourly partition
        business_key -- product-line name

    # Reading the SQL
        %_sql(ds,v_hour)

    """

    args = [{
        "dag": dag,
        "is_countries_online": "true",
        "db_name": db_name,
        "table_name": table_name,
        "data_oss_path": hdfs_path,
        "is_country_partition": "true",
        "is_result_force_exist": "false",
        "execute_time": v_date,
        "is_hour_task": "false",
        "frame_type": "local",
        "is_offset": "true",
        "execute_time_offset": -1,
        "business_key": "otrade"
    }]

    cf = CountriesAppFrame(args)

    # Drop the partition
    cf.delete_partition()

    # Read the SQL
    _sql = "\n" + cf.alter_partition() + "\n" + app_otrade_b2b_order_target_retailer_di_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Produce _SUCCESS
    cf.touchz_success()
Example #26
def move_data_mssql(templates_dict, **kwargs):

    tmp_table = 'clean_mydata_tmp'
    table = 'clean_mydata'
    delimiter = "','"
    quote_char = "'\"'"
    escape_char = "'\\\\'"
    number_of_mappers = '4'
    mssql_table = 'mydata'

    # -------Convert Hive Table from PARQUET to TEXT --------------------------------------------

    hook = HiveCliHook()

    query = """
        DROP TABLE {tmp_table};

        CREATE TABLE {tmp_table}
            ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
            WITH SERDEPROPERTIES (
               "separatorChar" = {delimiter},
               "quoteChar"     = {quote_char},
               "escapeChar"    = {escape_char}
            )
        STORED AS TEXTFILE
        AS
        SELECT column1, column2, column3, column4, column5, column6,column7, column8, column9, column10, source_id
        FROM {table}

    """.format(tmp_table=tmp_table, table=table,
               delimiter=delimiter, quote_char=quote_char, escape_char=escape_char)

    hook.run_cli(schema=templates_dict['schema'], hql=query)

    # --------------Run sqoop--------------------------------------------------------------------

    # get default mssql connection
    sql_conn = BaseHook.get_connection('mssql_default')
    conn_str = "'jdbc:sqlserver://{host};databaseName={database}'".format(
        host=sql_conn.host,
        database=sql_conn.extra_dejson.get('database')
    )

    # get default hive cli connection
    hive_conn = BaseHook.get_connection('hive_cli_default')
    hdfs_export_dir = 'hdfs://{host}:{port}/user/hive/warehouse/my_hive_db.db/{table}'.format(
        host=hive_conn.host,
        port=8020,
        table=tmp_table
    )

    cmd = ['/usr/bin/sqoop', 'export', '--connect', conn_str,
           '--username', sql_conn.login, '--password', sql_conn.password,
           '--table', mssql_table,
           '--export-dir', hdfs_export_dir,
           '-m', number_of_mappers,
           '--input-fields-terminated-by', delimiter,
           '--input-enclosed-by', quote_char,
           '--input-escaped-by', escape_char,
           '--input-null-string', r"'\\N'"]
    cmd = ' '.join(cmd)

    print(cmd)

    logging.info("Executing sqoop")
    sp = Popen(cmd, shell=True)

    sp.wait()
    logging.info("Command exited with return code {0}".format(sp.returncode))
Example #27
def run_check_table(schema_table_db_name, schema_table_name,
                    target_table_db_name, target_table_name, conn_id,
                    hive_table_name, server_name, **kwargs):
    # SHOW TABLES in oride_db LIKE 'data_aa'
    check_sql = 'SHOW TABLES in %s LIKE \'%s\'' % (HIVE_DB, hive_table_name)
    hive2_conn = HiveServer2Hook().get_conn()
    cursor = hive2_conn.cursor()
    cursor.execute(check_sql)
    if len(cursor.fetchall()) == 0:
        logging.info('Create Hive Table: %s.%s', HIVE_DB, hive_table_name)
        # get table column
        column_sql = '''
                SELECT
                    COLUMN_NAME,
                    DATA_TYPE,
                    NUMERIC_PRECISION,
                    NUMERIC_SCALE,
                    COLUMN_COMMENT
                FROM
                    information_schema.columns
                WHERE
                    table_schema='{db_name}' and table_name='{table_name}'
            '''.format(db_name=schema_table_db_name,
                       table_name=schema_table_name)
        mysql_hook = MySqlHook(conn_id)
        mysql_conn = mysql_hook.get_conn()
        mysql_cursor = mysql_conn.cursor()
        mysql_cursor.execute(column_sql)
        results = mysql_cursor.fetchall()
        rows = []
        for result in results:
            if result[0] == 'dt':
                col_name = '_dt'
            else:
                col_name = result[0]
            if result[1] in ('timestamp', 'varchar', 'char', 'text', 'longtext',
                             'mediumtext', 'json', 'datetime'):
                data_type = 'string'
            elif result[1] == 'decimal':
                data_type = result[1] + "(" + str(result[2]) + "," + str(
                    result[3]) + ")"
            else:
                data_type = result[1]
            rows.append("`%s` %s comment '%s'" %
                        (col_name, data_type, str(result[4]).replace(
                            '\n', '').replace('\r', '')))
        mysql_conn.close()

        # hive create table
        hive_hook = HiveCliHook()
        sql = ODS_CREATE_TABLE_SQL.format(
            db_name=HIVE_DB,
            table_name=hive_table_name,
            columns=",\n".join(rows),
            oss_path=OSS_PATH % ("{server_name}.{db_name}.{table_name}".format(
                server_name=server_name,
                db_name=target_table_db_name,
                table_name=target_table_name)))
        logging.info('Executing: %s', sql)
        hive_hook.run_cli(sql)

    else:
        sqoopSchema = SqoopSchemaUpdate()
        response = sqoopSchema.append_hive_schema(
            hive_db=HIVE_DB,
            hive_table=hive_table_name,
            mysql_db=schema_table_db_name,
            mysql_table=schema_table_name,
            mysql_conn=conn_id,
            oss_path=OSS_PATH % ("{server_name}.{db_name}.{table_name}".format(
                server_name=server_name,
                db_name=target_table_db_name,
                table_name=target_table_name)))
        if response:
            return True
    return
Example #28
 def get_hook(self):
     return HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id,
                        run_as=self.run_as)
Example #29
 def ddl(self):
     table = request.args.get("table")
     sql = "SHOW CREATE TABLE {table};".format(table=table)
     h = HiveCliHook(HIVE_CLI_CONN_ID)
     return h.run_cli(sql)
    def execute(self, context):
        # Downloading file from S3
        self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
        self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        logging.info("Downloading S3 file")

        if self.wildcard_match:
            if not self.s3.check_for_wildcard_key(self.s3_key):
                raise AirflowException("No key matches {0}".format(
                    self.s3_key))
            s3_key_object = self.s3.get_wildcard_key(self.s3_key)
        else:
            if not self.s3.check_for_key(self.s3_key):
                raise AirflowException("The key {0} does not exists".format(
                    self.s3_key))
            s3_key_object = self.s3.get_key(self.s3_key)
        root, file_ext = os.path.splitext(s3_key_object.key)
        with TemporaryDirectory(prefix='tmps32hive_') as tmp_dir,\
                NamedTemporaryFile(mode="w",
                                   dir=tmp_dir,
                                   suffix=file_ext) as f:
            logging.info("Dumping S3 key {0} contents to local"
                         " file {1}".format(s3_key_object.key, f.name))
            s3_key_object.get_contents_to_file(f)
            f.flush()
            self.s3.connection.close()
            if not self.headers:
                logging.info("Loading file {0} into Hive".format(f.name))
                self.hive.load_file(f.name,
                                    self.hive_table,
                                    field_dict=self.field_dict,
                                    create=self.create,
                                    partition=self.partition,
                                    delimiter=self.delimiter,
                                    recreate=self.recreate,
                                    tblproperties=self.tblproperties)
            else:
                # Decompressing file
                if self.input_compressed:
                    logging.info("Uncompressing file {0}".format(f.name))
                    fn_uncompressed = uncompress_file(f.name, file_ext,
                                                      tmp_dir)
                    logging.info("Uncompressed to {0}".format(fn_uncompressed))
                    # uncompressed file available now so deleting
                    # compressed file to save disk space
                    f.close()
                else:
                    fn_uncompressed = f.name

                # Testing if header matches field_dict
                if self.check_headers:
                    logging.info("Matching file header against field_dict")
                    header_list = self._get_top_row_as_list(fn_uncompressed)
                    if not self._match_headers(header_list):
                        raise AirflowException("Header check failed")

                # Deleting top header row
                logging.info(
                    "Removing header from file {0}".format(fn_uncompressed))
                headless_file = (self._delete_top_row_and_compress(
                    fn_uncompressed, file_ext, tmp_dir))
                logging.info("Headless file {0}".format(headless_file))
                logging.info(
                    "Loading file {0} into Hive".format(headless_file))
                self.hive.load_file(headless_file,
                                    self.hive_table,
                                    field_dict=self.field_dict,
                                    create=self.create,
                                    partition=self.partition,
                                    delimiter=self.delimiter,
                                    recreate=self.recreate,
                                    tblproperties=self.tblproperties)