def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwm_oride_driver_act_m_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")

def execution_data_task_id(ds, ds_nodash, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_opay_bd_agent_report_d_sql_task(ds, ds_nodash)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")

def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dim_oride_passenger_whitelist_base_sql_task(ds)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Data circuit breaker (abort if the key-data check fails)
    check_key_data_task(ds)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")

def execution_act_driver_task(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_oride_act_driver_cohort_m_sql_task(ds)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    hdfs_path = get_table_info(3)[1]
    TaskTouchzSuccess().countries_touchz_success(ds, "oride_dw", get_table_info(3)[0], hdfs_path, "true", "true")

def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = ods_sqoop_base_adjustment_increase_record_di_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "false", "true")

def execution_data_task_id(ds, **kwargs):
    v_date = kwargs.get('v_execution_date')
    v_day = kwargs.get('v_execution_day')
    v_hour = kwargs.get('v_execution_hour')

    hive_hook = HiveCliHook()

    """
    # Helper functions
    ALTER statement:  alter_partition
    Drop partition:   delete_partition
    Produce _SUCCESS: touchz_success

    # Arguments
    First argument  "true": all countries are live; "false": they are not.
    Second argument "true": the data directory has a country_code partition; "false": it does not.
    Third argument  "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.

    # Reading the SQL: %_sql(ds, v_hour)
    First argument ds:      used by daily tasks
    Second argument v_hour: required for hourly tasks
    """
    cf = CountriesPublicFrame("true", ds, db_name, table_name, hdfs_path, "true", "true")

    # Drop the partition
    cf.delete_partition()

    # Read the SQL
    _sql = "\n" + cf.alter_partition() + "\n" + dm_oride_order_base_d_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Data circuit breaker (the row count must not be 0)
    # check_key_data_cnt_task(ds)

    # Produce _SUCCESS
    cf.touchz_success()

def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwm_oride_passenger_act_w_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(
        '{pt}'.format(pt=airflow.macros.ds_add(ds, +6)),
        db_name, table_name, hdfs_path, "true", "true")

def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_opay_owealth_report_d_19_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(airflow.macros.ds_add(ds, +1), db_name, table_name, hdfs_path,
                                                 "true", "true")

def execution_new_user_task(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_oride_new_user_cohort_w_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    pt = airflow.macros.ds_add(ds, +6)
    hdfs_path = get_table_info(0)[1]
    TaskTouchzSuccess().countries_touchz_success(pt, "oride_dw", get_table_info(0)[0], hdfs_path, "true", "true")

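# A minimal aside (hypothetical dates, not part of the original tasks) on why several snippets
# here shift `ds` before stamping _SUCCESS: airflow.macros.ds_add(ds, days) returns the
# execution-date string offset by the given number of days, so a weekly task can write its
# marker under the closing day of the week and a T+1 report under the next day's partition.
from airflow.macros import ds_add

assert ds_add('2019-09-02', 6) == '2019-09-08'   # week-closing partition, as in ds_add(ds, +6)
assert ds_add('2019-09-02', 1) == '2019-09-03'   # next-day partition, as in ds_add(ds, +1)
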
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_opos_user_shop_remain_month_m_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Data circuit breaker
    # check_key_data_task(ds)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")

def test_load_df_with_data_types(self, mock_run_cli):
    d = OrderedDict()
    d['b'] = [True]
    d['i'] = [-1]
    d['t'] = [1]
    d['f'] = [0.0]
    d['c'] = ['c']
    d['M'] = [datetime.datetime(2018, 1, 1)]
    d['O'] = [object()]
    d['S'] = ['STRING'.encode('utf-8')]
    d['U'] = ['STRING']
    d['V'] = [None]
    df = pd.DataFrame(d)

    hook = HiveCliHook()
    hook.load_df(df, 't')

    query = """
        CREATE TABLE IF NOT EXISTS t (
            b BOOLEAN,
            i BIGINT,
            t BIGINT,
            f DOUBLE,
            c STRING,
            M TIMESTAMP,
            O STRING,
            S STRING,
            U STRING,
            V STRING)
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ','
        STORED AS textfile
        ;
    """

    def _trim(s):
        return re.sub(r"\s+", " ", s.strip())

    self.assertEqual(_trim(mock_run_cli.call_args_list[0][0][0]), _trim(query))

def execution_data_task_id(ds, execution_date, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwd_oride_location_driver_event_hi_sql_task(ds, execution_date)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Data circuit breaker
    # check_key_data_task(ds, execution_date)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "false",
                                                 execution_date.strftime("%H"))

def execution_data_task_id(ds, **kwargs):
    hive_hook = HiveCliHook()
    v_hour = kwargs.get('v_execution_hour')

    # Read the SQL
    _sql = dwd_driver_track_data_hi_sql_task(ds, v_hour)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "false", v_hour)

def test_load_df(self, mock_to_csv, mock_load_file): df = pd.DataFrame({"c": ["foo", "bar", "baz"]}) table = "t" delimiter = "," encoding = "utf-8" hook = HiveCliHook() hook.load_df(df=df, table=table, delimiter=delimiter, encoding=encoding) mock_to_csv.assert_called_once() kwargs = mock_to_csv.call_args[1] self.assertEqual(kwargs["header"], False) self.assertEqual(kwargs["index"], False) self.assertEqual(kwargs["sep"], delimiter.encode(encoding)) mock_load_file.assert_called_once() kwargs = mock_load_file.call_args[1] self.assertEqual(kwargs["delimiter"], delimiter) self.assertEqual(kwargs["field_dict"], {"c": u"STRING"}) self.assertEqual(kwargs["table"], table)
def execute(self, context=None):
    try:
        metastore_hook = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table_metadata = metastore_hook.get_table(self.table, db=self.schema)
        is_partitioned = len(table_metadata.partitionKeys) > 0
        column_string = ', '.join([col.name for col in table_metadata.sd.cols])
        where_clause = 'WHERE {}'.format(self.partition) if is_partitioned else ''
        self.hql = "SELECT COUNT(col2sum) FROM (SELECT COUNT(1) AS col2sum FROM {}.{} {} GROUP BY {}) t2 " \
                   "WHERE t2.col2sum > 1".format(self.schema, self.table, where_clause, column_string)

        hook = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id,
                           mapred_queue=self.mapred_queue)
        hook.hive_cli_params = '-S'  # suppress hive junk output
        output = hook.run_cli(hql=self.hql, schema=self.schema)
        output_row = int(output.strip())

        if output_row > self.max_duplicates:
            raise AirflowException('There are {} duplicate records found whereas the max number of duplicates'
                                   ' can be {}'.format(output_row, self.max_duplicates))
    except Exception as e:
        raise AirflowException('An error occurred with the following duplicate check query:\n\t{}\n{}'
                               .format(self.hql, e))

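# A worked example (assumed table, columns, and partition spec, not taken from the original
# operator) of the duplicate-check HQL the execute() above builds, for a hypothetical table
# `my_db.orders` with columns (order_id, status) and partition ds='2019-09-01':
#
#   SELECT COUNT(col2sum)
#   FROM (SELECT COUNT(1) AS col2sum
#         FROM my_db.orders WHERE ds='2019-09-01'
#         GROUP BY order_id, status) t2
#   WHERE t2.col2sum > 1
#
# The inner query counts rows per full column tuple; the outer query counts how many tuples
# occur more than once, and run_cli() returns that single number as the CLI's stdout.
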
def merge_pre_hi_data_task(hive_db, hive_all_hi_table_name, hive_hi_table_name, is_must_have_data, pt, now_hour,
                           pre_hour_day, pre_hour, **kwargs):
    sqoopSchema = SqoopSchemaUpdate()
    hive_columns = sqoopSchema.get_hive_column_name(hive_db, hive_all_hi_table_name)

    hql = ADD_HI_SQL.format(
        db_name=hive_db,
        hive_all_hi_table_name=hive_all_hi_table_name,
        hive_hi_table_name=hive_hi_table_name,
        pt=pt,
        now_hour=now_hour,
        pre_hour_day=pre_hour_day,
        pre_hour=pre_hour,
        columns=',\n'.join(hive_columns)
    )

    hive_hook = HiveCliHook()

    # Log the SQL
    logging.info('Executing: %s', hql)

    # Run Hive
    hive_hook.run_cli(hql)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(pt, hive_db, hive_all_hi_table_name,
                                                 ALL_HI_OSS_PATH % hive_all_hi_table_name,
                                                 "false", is_must_have_data, now_hour)

def test_load_df_with_data_types(self, mock_run_cli):
    d = OrderedDict()
    d['b'] = [True]
    d['i'] = [-1]
    d['t'] = [1]
    d['f'] = [0.0]
    d['c'] = ['c']
    d['M'] = [datetime.datetime(2018, 1, 1)]
    d['O'] = [object()]
    d['S'] = [b'STRING']
    d['U'] = ['STRING']
    d['V'] = [None]
    df = pd.DataFrame(d)

    hook = HiveCliHook()
    hook.load_df(df, 't')

    query = """
        CREATE TABLE IF NOT EXISTS t (
            b BOOLEAN,
            i BIGINT,
            t BIGINT,
            f DOUBLE,
            c STRING,
            M TIMESTAMP,
            O STRING,
            S STRING,
            U STRING,
            V STRING)
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ','
        STORED AS textfile
        ;
    """
    assertEqualIgnoreMultipleSpaces(self, mock_run_cli.call_args_list[0][0][0], query)

def test_load_df(self, mock_to_csv, mock_load_file): df = pd.DataFrame({"c": ["foo", "bar", "baz"]}) table = "t" delimiter = "," encoding = "utf-8" hook = HiveCliHook() hook.load_df(df=df, table=table, delimiter=delimiter, encoding=encoding) assert mock_to_csv.call_count == 1 kwargs = mock_to_csv.call_args[1] self.assertEqual(kwargs["header"], False) self.assertEqual(kwargs["index"], False) self.assertEqual(kwargs["sep"], delimiter) assert mock_load_file.call_count == 1 kwargs = mock_load_file.call_args[1] self.assertEqual(kwargs["delimiter"], delimiter) self.assertEqual(kwargs["field_dict"], {"c": "STRING"}) self.assertTrue(isinstance(kwargs["field_dict"], OrderedDict)) self.assertEqual(kwargs["table"], table)
def test_load_df_with_data_types(self, mock_run_cli):
    ord_dict = OrderedDict()
    ord_dict['b'] = [True]
    ord_dict['i'] = [-1]
    ord_dict['t'] = [1]
    ord_dict['f'] = [0.0]
    ord_dict['c'] = ['c']
    ord_dict['M'] = [datetime.datetime(2018, 1, 1)]
    ord_dict['O'] = [object()]
    ord_dict['S'] = [b'STRING']
    ord_dict['U'] = ['STRING']
    ord_dict['V'] = [None]
    df = pd.DataFrame(ord_dict)

    hook = HiveCliHook()
    hook.load_df(df, 't')

    query = """
        CREATE TABLE IF NOT EXISTS t (
            `b` BOOLEAN,
            `i` BIGINT,
            `t` BIGINT,
            `f` DOUBLE,
            `c` STRING,
            `M` TIMESTAMP,
            `O` STRING,
            `S` STRING,
            `U` STRING,
            `V` STRING)
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ','
        STORED AS textfile
        ;
    """
    assert_equal_ignore_multiple_spaces(
        self, mock_run_cli.call_args_list[0][0][0], query)

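# For reference (inferred from the three load_df tests above, not from library documentation):
# load_df maps pandas dtypes to Hive column types roughly as bool -> BOOLEAN, int64 -> BIGINT,
# float64 -> DOUBLE, datetime64 -> TIMESTAMP, and everything else (object, str, bytes, None)
# -> STRING, then issues the CREATE TABLE ... ROW FORMAT DELIMITED statement shown in `query`
# before loading the CSV dump of the DataFrame.
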
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwd_active_user_week_di_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Data circuit breaker
    # check_key_data_task(ds)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    after_6_day = airflow.macros.ds_add(ds, +6)
    TaskTouchzSuccess().countries_touchz_success(after_6_day, db_name, table_name, hdfs_path, "true", "true")

def execute(self, context):
    # Downloading file from S3
    self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    self.log.info("Downloading S3 file")

    if self.wildcard_match:
        if not self.s3.check_for_wildcard_key(self.s3_key):
            raise AirflowException("No key matches {0}".format(self.s3_key))
        s3_key_object = self.s3.get_wildcard_key(self.s3_key)
    else:
        if not self.s3.check_for_key(self.s3_key):
            raise AirflowException("The key {0} does not exists".format(self.s3_key))
        s3_key_object = self.s3.get_key(self.s3_key)

    _, file_ext = os.path.splitext(s3_key_object.key)
    if (self.select_expression and self.input_compressed and
            file_ext.lower() != '.gz'):
        raise AirflowException("GZIP is the only compression "
                               "format Amazon S3 Select supports")

    with TemporaryDirectory(prefix='tmps32hive_') as tmp_dir,\
            NamedTemporaryFile(mode="wb", dir=tmp_dir, suffix=file_ext) as f:
        self.log.info("Dumping S3 key %s contents to local file %s",
                      s3_key_object.key, f.name)
        if self.select_expression:
            option = {}
            if self.headers:
                option['FileHeaderInfo'] = 'USE'
            if self.delimiter:
                option['FieldDelimiter'] = self.delimiter

            input_serialization = {'CSV': option}
            if self.input_compressed:
                input_serialization['CompressionType'] = 'GZIP'

            content = self.s3.select_key(
                bucket_name=s3_key_object.bucket_name,
                key=s3_key_object.key,
                expression=self.select_expression,
                input_serialization=input_serialization
            )
            f.write(content.encode("utf-8"))
        else:
            s3_key_object.download_fileobj(f)
        f.flush()

        if self.select_expression or not self.headers:
            self.log.info("Loading file %s into Hive", f.name)
            self.hive.load_file(
                f.name,
                self.hive_table,
                field_dict=self.field_dict,
                create=self.create,
                partition=self.partition,
                delimiter=self.delimiter,
                recreate=self.recreate,
                tblproperties=self.tblproperties)
        else:
            # Decompressing file
            if self.input_compressed:
                self.log.info("Uncompressing file %s", f.name)
                fn_uncompressed = uncompress_file(f.name, file_ext, tmp_dir)
                self.log.info("Uncompressed to %s", fn_uncompressed)
                # uncompressed file available now so deleting
                # compressed file to save disk space
                f.close()
            else:
                fn_uncompressed = f.name

            # Testing if header matches field_dict
            if self.check_headers:
                self.log.info("Matching file header against field_dict")
                header_list = self._get_top_row_as_list(fn_uncompressed)
                if not self._match_headers(header_list):
                    raise AirflowException("Header check failed")

            # Deleting top header row
            self.log.info("Removing header from file %s", fn_uncompressed)
            headless_file = self._delete_top_row_and_compress(fn_uncompressed, file_ext, tmp_dir)
            self.log.info("Headless file %s", headless_file)
            self.log.info("Loading file %s into Hive", headless_file)
            self.hive.load_file(headless_file,
                                self.hive_table,
                                field_dict=self.field_dict,
                                create=self.create,
                                partition=self.partition,
                                delimiter=self.delimiter,
                                recreate=self.recreate,
                                tblproperties=self.tblproperties)

def test_run_cli(self):
    hook = HiveCliHook()
    hook.run_cli("SHOW DATABASES")

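# A hedged sketch (hypothetical helper; connection id is Airflow's stock 'hive_cli_default')
# of run_cli's keyword form, as exercised by the duplicate-check and sqoop-export snippets in
# this section: the HQL text plus an optional target schema, with the Hive CLI's stdout
# returned as a string.
def run_cli_with_schema_example():
    hook = HiveCliHook(hive_cli_conn_id='hive_cli_default')
    # The returned string is whatever the hive CLI printed, e.g. the table list here
    return hook.run_cli(hql="SHOW TABLES", schema="default")
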
def get_hook(self):
    if self.conn_type == 'mysql':
        from airflow.hooks.mysql_hook import MySqlHook
        return MySqlHook(mysql_conn_id=self.conn_id)
    elif self.conn_type == 'google_cloud_platform':
        from airflow.gcp.hooks.bigquery import BigQueryHook
        return BigQueryHook(bigquery_conn_id=self.conn_id)
    elif self.conn_type == 'postgres':
        from airflow.hooks.postgres_hook import PostgresHook
        return PostgresHook(postgres_conn_id=self.conn_id)
    elif self.conn_type == 'pig_cli':
        from airflow.hooks.pig_hook import PigCliHook
        return PigCliHook(pig_cli_conn_id=self.conn_id)
    elif self.conn_type == 'hive_cli':
        from airflow.hooks.hive_hooks import HiveCliHook
        return HiveCliHook(hive_cli_conn_id=self.conn_id)
    elif self.conn_type == 'presto':
        from airflow.hooks.presto_hook import PrestoHook
        return PrestoHook(presto_conn_id=self.conn_id)
    elif self.conn_type == 'hiveserver2':
        from airflow.hooks.hive_hooks import HiveServer2Hook
        return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
    elif self.conn_type == 'sqlite':
        from airflow.hooks.sqlite_hook import SqliteHook
        return SqliteHook(sqlite_conn_id=self.conn_id)
    elif self.conn_type == 'jdbc':
        from airflow.hooks.jdbc_hook import JdbcHook
        return JdbcHook(jdbc_conn_id=self.conn_id)
    elif self.conn_type == 'mssql':
        from airflow.hooks.mssql_hook import MsSqlHook
        return MsSqlHook(mssql_conn_id=self.conn_id)
    elif self.conn_type == 'oracle':
        from airflow.hooks.oracle_hook import OracleHook
        return OracleHook(oracle_conn_id=self.conn_id)
    elif self.conn_type == 'vertica':
        from airflow.contrib.hooks.vertica_hook import VerticaHook
        return VerticaHook(vertica_conn_id=self.conn_id)
    elif self.conn_type == 'cloudant':
        from airflow.contrib.hooks.cloudant_hook import CloudantHook
        return CloudantHook(cloudant_conn_id=self.conn_id)
    elif self.conn_type == 'jira':
        from airflow.contrib.hooks.jira_hook import JiraHook
        return JiraHook(jira_conn_id=self.conn_id)
    elif self.conn_type == 'redis':
        from airflow.contrib.hooks.redis_hook import RedisHook
        return RedisHook(redis_conn_id=self.conn_id)
    elif self.conn_type == 'wasb':
        from airflow.contrib.hooks.wasb_hook import WasbHook
        return WasbHook(wasb_conn_id=self.conn_id)
    elif self.conn_type == 'docker':
        from airflow.hooks.docker_hook import DockerHook
        return DockerHook(docker_conn_id=self.conn_id)
    elif self.conn_type == 'azure_data_lake':
        from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
        return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
    elif self.conn_type == 'azure_cosmos':
        from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
        return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
    elif self.conn_type == 'cassandra':
        from airflow.contrib.hooks.cassandra_hook import CassandraHook
        return CassandraHook(cassandra_conn_id=self.conn_id)
    elif self.conn_type == 'mongo':
        from airflow.contrib.hooks.mongo_hook import MongoHook
        return MongoHook(conn_id=self.conn_id)
    elif self.conn_type == 'gcpcloudsql':
        from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook
        return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
    elif self.conn_type == 'grpc':
        from airflow.contrib.hooks.grpc_hook import GrpcHook
        return GrpcHook(grpc_conn_id=self.conn_id)
    raise AirflowException("Unknown hook type {}".format(self.conn_type))

def get_hook(self):
    return HiveCliHook(
        hive_cli_conn_id=self.hive_cli_conn_id,
        run_as=self.run_as,
        mapred_queue=self.mapred_queue,
        mapred_queue_priority=self.mapred_queue_priority,
        mapred_job_name=self.mapred_job_name)

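# A hedged sketch (not the actual operator code) of how the hook returned by get_hook() is
# typically consumed inside the same operator's execute(): the templated HQL is handed to
# run_cli. `self.hql` and `self.schema` are assumed attributes of the surrounding operator.
def execute(self, context):
    hook = self.get_hook()
    hook.run_cli(hql=self.hql, schema=self.schema)
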
def execution_data_task_id(ds, dag, **kwargs):
    v_date = kwargs.get('v_execution_date')
    v_day = kwargs.get('v_execution_day')
    v_hour = kwargs.get('v_execution_hour')

    hive_hook = HiveCliHook()

    # Read the SQL
    # _sql = app_otrade_b2b_order_target_retailer_di_sql_task(ds)
    # logging.info('Executing: %s', _sql)

    # Run Hive
    # hive_hook.run_cli(_sql)

    # Data circuit breaker
    # check_key_data_task(ds)

    # Generate _SUCCESS
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": only generate _SUCCESS when data exists; "false": generate _SUCCESS even when there is no data.
    """
    # TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "false")

    """
    # Helper functions
    ALTER statement:  alter_partition()
    Drop partition:   delete_partition()
    Produce _SUCCESS: touchz_success()

    # Arguments
    is_countries_online   -- whether the multi-country business is live; default "true"
    db_name               -- Hive database name
    table_name            -- Hive table name
    data_oss_path         -- OSS data directory path
    is_country_partition  -- whether there is a country_code partition; default "true"
    is_result_force_exist -- whether output must exist; default "true" (generate _SUCCESS only when data exists),
                             "false" generates _SUCCESS even when there is no data
    execute_time          -- execution time of the current script (%Y-%m-%d %H:%M:%S)
    is_hour_task          -- whether this is an hourly task; default "false"
    frame_type            -- template type (only effective when is_hour_task is "true"):
                             "utc" produces partitions in UTC time, "local" in local time; default "utc"
    is_offset             -- whether to enable a time offset (affects the _SUCCESS file)
    execute_time_offset   -- execution-time offset (-1, 0, 1), shifting the original time to produce
                             the previous or next hourly partition
    business_key          -- product-line name

    # Reading the SQL: %_sql(ds, v_hour)
    """
    args = [
        {
            "dag": dag,
            "is_countries_online": "true",
            "db_name": db_name,
            "table_name": table_name,
            "data_oss_path": hdfs_path,
            "is_country_partition": "true",
            "is_result_force_exist": "false",
            "execute_time": v_date,
            "is_hour_task": "false",
            "frame_type": "local",
            "is_offset": "true",
            "execute_time_offset": -1,
            "business_key": "otrade"
        }
    ]

    cf = CountriesAppFrame(args)

    # Drop the partition
    cf.delete_partition()

    # Read the SQL
    _sql = "\n" + cf.alter_partition() + "\n" + app_otrade_b2b_order_target_retailer_di_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run Hive
    hive_hook.run_cli(_sql)

    # Produce _SUCCESS
    cf.touchz_success()

def move_data_mssql(templates_dict, **kwargs):
    tmp_table = 'clean_mydata_tmp'
    table = 'clean_mydata'
    delimiter = "','"
    quote_char = "'\"'"
    escape_char = "'\\\\'"
    number_of_mappers = '4'
    mssql_table = 'mydata'

    # ------- Convert Hive table from PARQUET to TEXT ------------------------------------------
    hook = HiveCliHook()
    query = """
        DROP TABLE {tmp_table};
        CREATE TABLE {tmp_table}
        ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
        WITH SERDEPROPERTIES (
            "separatorChar" = {delimiter},
            "quoteChar" = {quote_char},
            "escapeChar" = {escape_char}
        )
        STORED AS TEXTFILE
        AS SELECT column1, column2, column3, column4, column5,
                  column6, column7, column8, column9, column10, source_id
        FROM {table}
    """.format(tmp_table=tmp_table, table=table, delimiter=delimiter,
               quote_char=quote_char, escape_char=escape_char)
    hook.run_cli(schema=templates_dict['schema'], hql=query)

    # ------- Run sqoop -------------------------------------------------------------------------
    # get default mssql connection
    sql_conn = BaseHook.get_connection('mssql_default')
    conn_str = "'jdbc:sqlserver://{host};databaseName={database}'".format(
        host=sql_conn.host,
        database=sql_conn.extra_dejson.get('database')
    )

    # get default hive cli connection
    hive_conn = BaseHook.get_connection('hive_cli_default')
    hdfs_export_dir = 'hdfs://{host}:{port}/user/hive/warehouse/my_hive_db.db/{table}'.format(
        host=hive_conn.host,
        port=8020,
        table=tmp_table
    )

    cmd = ['/usr/bin/sqoop', 'export',
           '--connect', conn_str,
           '--username', sql_conn.login,
           '--password', sql_conn.password,
           '--table', mssql_table,
           '--export-dir', hdfs_export_dir,
           '-m', number_of_mappers,
           '--input-fields-terminated-by', delimiter,
           '--input-enclosed-by', quote_char,
           '--input-escaped-by', escape_char,
           '--input-null-string', "'\\\\N'"]
    cmd = ' '.join(cmd)
    print(cmd)

    logging.info("Executing sqoop")
    sp = Popen(cmd, shell=True)
    sp.wait()
    logging.info("Command exited with return code {0}".format(sp.returncode))

def run_check_table(schema_table_db_name, schema_table_name, target_table_db_name,
                    target_table_name, conn_id, hive_table_name, server_name, **kwargs):
    # SHOW TABLES in oride_db LIKE 'data_aa'
    check_sql = 'SHOW TABLES in %s LIKE \'%s\'' % (HIVE_DB, hive_table_name)
    hive2_conn = HiveServer2Hook().get_conn()
    cursor = hive2_conn.cursor()
    cursor.execute(check_sql)

    if len(cursor.fetchall()) == 0:
        logging.info('Create Hive Table: %s.%s', HIVE_DB, hive_table_name)
        # get table column
        column_sql = '''
            SELECT
                COLUMN_NAME,
                DATA_TYPE,
                NUMERIC_PRECISION,
                NUMERIC_SCALE,
                COLUMN_COMMENT
            FROM information_schema.columns
            WHERE table_schema='{db_name}' and table_name='{table_name}'
        '''.format(db_name=schema_table_db_name, table_name=schema_table_name)

        mysql_hook = MySqlHook(conn_id)
        mysql_conn = mysql_hook.get_conn()
        mysql_cursor = mysql_conn.cursor()
        mysql_cursor.execute(column_sql)
        results = mysql_cursor.fetchall()

        rows = []
        for result in results:
            if result[0] == 'dt':
                col_name = '_dt'
            else:
                col_name = result[0]
            if result[1] in ('timestamp', 'varchar', 'char', 'text', 'longtext',
                             'mediumtext', 'json', 'datetime'):
                data_type = 'string'
            elif result[1] == 'decimal':
                data_type = result[1] + "(" + str(result[2]) + "," + str(result[3]) + ")"
            else:
                data_type = result[1]
            rows.append("`%s` %s comment '%s'" % (
                col_name, data_type,
                str(result[4]).replace('\n', '').replace('\r', '')))
        mysql_conn.close()

        # hive create table
        hive_hook = HiveCliHook()
        sql = ODS_CREATE_TABLE_SQL.format(
            db_name=HIVE_DB,
            table_name=hive_table_name,
            columns=",\n".join(rows),
            oss_path=OSS_PATH % ("{server_name}.{db_name}.{table_name}".format(
                server_name=server_name,
                db_name=target_table_db_name,
                table_name=target_table_name)))
        logging.info('Executing: %s', sql)
        hive_hook.run_cli(sql)
    else:
        sqoopSchema = SqoopSchemaUpdate()
        response = sqoopSchema.append_hive_schema(
            hive_db=HIVE_DB,
            hive_table=hive_table_name,
            mysql_db=schema_table_db_name,
            mysql_table=schema_table_name,
            mysql_conn=conn_id,
            oss_path=OSS_PATH % ("{server_name}.{db_name}.{table_name}".format(
                server_name=server_name,
                db_name=target_table_db_name,
                table_name=target_table_name)))
        if response:
            return True
    return

def get_hook(self):
    return HiveCliHook(
        hive_cli_conn_id=self.hive_cli_conn_id,
        run_as=self.run_as)

def ddl(self):
    table = request.args.get("table")
    sql = "SHOW CREATE TABLE {table};".format(table=table)
    h = HiveCliHook(HIVE_CLI_CONN_ID)
    return h.run_cli(sql)

def execute(self, context):
    # Downloading file from S3
    self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
    self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    logging.info("Downloading S3 file")

    if self.wildcard_match:
        if not self.s3.check_for_wildcard_key(self.s3_key):
            raise AirflowException("No key matches {0}".format(self.s3_key))
        s3_key_object = self.s3.get_wildcard_key(self.s3_key)
    else:
        if not self.s3.check_for_key(self.s3_key):
            raise AirflowException("The key {0} does not exists".format(self.s3_key))
        s3_key_object = self.s3.get_key(self.s3_key)

    root, file_ext = os.path.splitext(s3_key_object.key)

    with TemporaryDirectory(prefix='tmps32hive_') as tmp_dir,\
            NamedTemporaryFile(mode="w", dir=tmp_dir, suffix=file_ext) as f:
        logging.info("Dumping S3 key {0} contents to local"
                     " file {1}".format(s3_key_object.key, f.name))
        s3_key_object.get_contents_to_file(f)
        f.flush()
        self.s3.connection.close()

        if not self.headers:
            logging.info("Loading file {0} into Hive".format(f.name))
            self.hive.load_file(f.name,
                                self.hive_table,
                                field_dict=self.field_dict,
                                create=self.create,
                                partition=self.partition,
                                delimiter=self.delimiter,
                                recreate=self.recreate,
                                tblproperties=self.tblproperties)
        else:
            # Decompressing file
            if self.input_compressed:
                logging.info("Uncompressing file {0}".format(f.name))
                fn_uncompressed = uncompress_file(f.name, file_ext, tmp_dir)
                logging.info("Uncompressed to {0}".format(fn_uncompressed))
                # uncompressed file available now so deleting
                # compressed file to save disk space
                f.close()
            else:
                fn_uncompressed = f.name

            # Testing if header matches field_dict
            if self.check_headers:
                logging.info("Matching file header against field_dict")
                header_list = self._get_top_row_as_list(fn_uncompressed)
                if not self._match_headers(header_list):
                    raise AirflowException("Header check failed")

            # Deleting top header row
            logging.info("Removing header from file {0}".format(fn_uncompressed))
            headless_file = self._delete_top_row_and_compress(fn_uncompressed, file_ext, tmp_dir)
            logging.info("Headless file {0}".format(headless_file))
            logging.info("Loading file {0} into Hive".format(headless_file))
            self.hive.load_file(headless_file,
                                self.hive_table,
                                field_dict=self.field_dict,
                                create=self.create,
                                partition=self.partition,
                                delimiter=self.delimiter,
                                recreate=self.recreate,
                                tblproperties=self.tblproperties)