def execute(self, context):
    """Stage the query result in a Hive text table and load it into Druid.

    The statement in ``self.sql`` is materialised as an uncompressed,
    tab-delimited TEXTFILE table named ``druid.<task_instance_key_str>``.
    The table's column names and storage location are then read from the
    metastore and handed to ``DruidHook.load_from_hdfs``.

    :param context: task context; only ``context['task_instance_key_str']``
        is read, to build the staging table name.
    :raises ValueError: if the staging table's location does not contain
        ``/user``, because no HDFS path can be derived from it.
    """
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    logging.info("Extracting data from Hive")
    hive_table = 'druid.' + context['task_instance_key_str']
    # A trailing ';' would end the CREATE TABLE ... AS statement early.
    sql = self.sql.strip().strip(';')
    hql = """\
    set mapred.output.compress=false;
    set hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE AS
    {sql};
    """.format(hive_table=hive_table, sql=sql)
    hive.run_cli(hql)

    # One metastore lookup supplies both the column list and the location.
    m = HiveMetastoreHook(self.metastore_conn_id)
    t = m.get_table(hive_table)
    columns = [col.name for col in t.sd.cols]
    hdfs_uri = t.sd.location

    # Keep only the part of the URI starting at '/user'. str.find returns
    # -1 when it is absent, which would otherwise slice off a single
    # trailing character and pass a meaningless path to Druid.
    pos = hdfs_uri.find('/user')
    if pos == -1:
        raise ValueError(
            "Could not find '/user' in the location of Hive table "
            "{}: {}".format(hive_table, hdfs_uri))
    static_path = hdfs_uri[pos:]

    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
    logging.info("Inserting rows into Druid")
    druid.load_from_hdfs(
        datasource=self.druid_datasource,
        intervals=self.intervals,
        static_path=static_path,
        ts_dim=self.ts_dim,
        columns=columns,
        metric_spec=self.metric_spec)
    logging.info("Load seems to have succeeded!")
def execute(self, context):
    """Stage the query result in a Hive text table and load it into Druid.

    The statement in ``self.sql`` is materialised as an uncompressed,
    tab-delimited TEXTFILE table named ``druid.<task_instance_key_str>``.
    The table's column names and storage location are then read from the
    metastore and handed to ``DruidHook.load_from_hdfs``.

    :param context: task context; only ``context['task_instance_key_str']``
        is read, to build the staging table name.
    :raises ValueError: if the staging table's location does not contain
        ``/user``, because no HDFS path can be derived from it.
    """
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    logging.info("Extracting data from Hive")
    hive_table = 'druid.' + context['task_instance_key_str']
    # A trailing ';' would end the CREATE TABLE ... AS statement early.
    sql = self.sql.strip().strip(';')
    hql = """\
    set mapred.output.compress=false;
    set hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE AS
    {sql};
    """.format(hive_table=hive_table, sql=sql)
    # Build the staging table; the metastore lookup below reads the
    # metadata of the table this statement creates.
    hive.run_cli(hql)

    # One metastore lookup supplies both the column list and the location.
    m = HiveMetastoreHook(self.metastore_conn_id)
    t = m.get_table(hive_table)
    columns = [col.name for col in t.sd.cols]
    hdfs_uri = t.sd.location

    # Keep only the part of the URI starting at '/user'. str.find returns
    # -1 when it is absent, which would otherwise slice off a single
    # trailing character and pass a meaningless path to Druid.
    pos = hdfs_uri.find('/user')
    if pos == -1:
        raise ValueError(
            "Could not find '/user' in the location of Hive table "
            "{}: {}".format(hive_table, hdfs_uri))
    static_path = hdfs_uri[pos:]

    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
    logging.info("Inserting rows into Druid")
    druid.load_from_hdfs(
        datasource=self.druid_datasource,
        intervals=self.intervals,
        static_path=static_path,
        ts_dim=self.ts_dim,
        columns=columns,
        metric_spec=self.metric_spec)
    logging.info("Load seems to have succeeded!")
def execute(self, context):
    """Stage the query result in Hive, load it into Druid, then clean up.

    The statement in ``self.sql`` is materialised as an uncompressed,
    tab-delimited TEXTFILE table named ``druid.<task_instance_key_str>``
    with ``serialization.null.format`` set to the empty string. The
    table's column names and storage location are read from the metastore
    and handed to ``DruidHook.load_from_hdfs``. After the load returns,
    the staging table is dropped.

    :param context: task context; only ``context['task_instance_key_str']``
        is read, to build the staging table name.
    :raises ValueError: if the staging table's location does not contain
        ``/user``, because no HDFS path can be derived from it.
    """
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    logging.info("Extracting data from Hive")
    hive_table = 'druid.' + context['task_instance_key_str']
    # A trailing ';' would end the CREATE TABLE ... AS statement early.
    sql = self.sql.strip().strip(';')
    hql = """\
    set mapred.output.compress=false;
    set hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    TBLPROPERTIES ('serialization.null.format' = '')
    AS
    {sql}
    """.format(hive_table=hive_table, sql=sql)
    hive.run_cli(hql)

    # One metastore lookup supplies both the column list and the location.
    m = HiveMetastoreHook(self.metastore_conn_id)
    t = m.get_table(hive_table)
    columns = [col.name for col in t.sd.cols]
    hdfs_uri = t.sd.location

    # Keep only the part of the URI starting at '/user'. str.find returns
    # -1 when it is absent, which would otherwise slice off a single
    # trailing character and pass a meaningless path to Druid.
    pos = hdfs_uri.find('/user')
    if pos == -1:
        raise ValueError(
            "Could not find '/user' in the location of Hive table "
            "{}: {}".format(hive_table, hdfs_uri))
    static_path = hdfs_uri[pos:]

    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
    logging.info("Inserting rows into Druid")
    logging.info("HDFS path: " + static_path)
    druid.load_from_hdfs(
        datasource=self.druid_datasource,
        intervals=self.intervals,
        static_path=static_path,
        ts_dim=self.ts_dim,
        columns=columns,
        metric_spec=self.metric_spec,
        hadoop_dependency_coordinates=self.hadoop_dependency_coordinates)
    logging.info("Load seems to have succeeded!")

    logging.info(
        "Cleaning up by dropping the temp "
        "Hive table {}".format(hive_table))
    hql = "DROP TABLE IF EXISTS {}".format(hive_table)
    # Execute the drop so the staging table does not outlive the task.
    hive.run_cli(hql)
def execute(self, context):
    """Stage the query result in Hive, load it into Druid, then clean up.

    Steps, in order:

    1. Run ``self.sql`` as a CREATE TABLE ... AS statement producing an
       uncompressed, tab-delimited TEXTFILE table named
       ``druid.<task_instance_key_str>`` whose
       ``serialization.null.format`` is the empty string.
    2. Read the table's column names and storage location from the
       metastore.
    3. Pass them to ``DruidHook.load_from_hdfs``.
    4. Drop the staging table.

    :param context: task context; only ``context['task_instance_key_str']``
        is read, to build the staging table name.
    :raises ValueError: if the staging table's location does not contain
        ``/user``, because no HDFS path can be derived from it.
    """
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    logging.info("Extracting data from Hive")
    hive_table = 'druid.' + context['task_instance_key_str']
    # A trailing ';' would end the CREATE TABLE ... AS statement early.
    sql = self.sql.strip().strip(';')
    hql = """\
    set mapred.output.compress=false;
    set hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    TBLPROPERTIES ('serialization.null.format' = '')
    AS
    {sql}
    """.format(hive_table=hive_table, sql=sql)
    hive.run_cli(hql)

    # One metastore lookup supplies both the column list and the location.
    m = HiveMetastoreHook(self.metastore_conn_id)
    t = m.get_table(hive_table)
    columns = [col.name for col in t.sd.cols]
    hdfs_uri = t.sd.location

    # Keep only the part of the URI starting at '/user'. str.find returns
    # -1 when it is absent, which would otherwise slice off a single
    # trailing character and pass a meaningless path to Druid.
    pos = hdfs_uri.find('/user')
    if pos == -1:
        raise ValueError(
            "Could not find '/user' in the location of Hive table "
            "{}: {}".format(hive_table, hdfs_uri))
    static_path = hdfs_uri[pos:]

    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
    logging.info("Inserting rows into Druid")
    logging.info("HDFS path: " + static_path)
    druid.load_from_hdfs(
        datasource=self.druid_datasource,
        intervals=self.intervals,
        static_path=static_path,
        ts_dim=self.ts_dim,
        columns=columns,
        metric_spec=self.metric_spec,
        hadoop_dependency_coordinates=self.hadoop_dependency_coordinates)
    logging.info("Load seems to have succeeded!")

    logging.info("Cleaning up by dropping the temp "
                 "Hive table {}".format(hive_table))
    hql = "DROP TABLE IF EXISTS {}".format(hive_table)
    # Execute the drop so the staging table does not outlive the task.
    hive.run_cli(hql)
def table(self):
    """Render the detail page for the table named in the request.

    Reads the ``table`` request argument, fetches that table through a
    ``HiveMetastoreHook`` and renders ``metastore_browser/table.html``
    with the table object, its name, and the ``datetime`` and ``int``
    callables exposed to the template.
    """
    requested_name = request.args.get("table")
    metastore = HiveMetastoreHook(METASTORE_CONN_ID)
    template_args = {
        'table': metastore.get_table(requested_name),
        'table_name': requested_name,
        'datetime': datetime,
        'int': int,
    }
    return self.render("metastore_browser/table.html", **template_args)
def table(self):
    """Show the metastore entry for the table given in the query string.

    The ``table`` request argument is looked up via ``HiveMetastoreHook``
    and the result is rendered with ``metastore_browser/table.html``; the
    template also receives the table name plus ``datetime`` and ``int``.
    """
    name = request.args.get("table")
    hive_table = HiveMetastoreHook(METASTORE_CONN_ID).get_table(name)
    return self.render(
        "metastore_browser/table.html",
        table=hive_table,
        table_name=name,
        datetime=datetime,
        int=int,
    )
def execute(self, context=None):
    """Compute per-column statistics for a partition and store them.

    Column names and types of ``self.table`` are read from the Hive
    metastore. For each column the expressions come from
    ``self.assignment_func`` when it is set and returns something other
    than ``None``, otherwise from ``self.get_default_exprs``. The
    expressions (plus a row count and ``self.extra_exprs``) are evaluated
    in one Presto query restricted to ``self.partition``. The resulting
    single row is pivoted into one ``hive_stats`` row per expression and
    inserted through MySQL, after deleting rows left by a previous run
    for the same table, partition and ``dttm``.

    :param context: unused by this method.
    :raises Exception: if the Presto query returns no row.
    """
    hive_meta = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
    hive_table = hive_meta.get_table(table_name=self.table)
    field_types = {c.name: c.type for c in hive_table.sd.cols}

    # Keys are (column, metric) pairs; values are SQL expressions.
    exprs = {('', 'count'): 'COUNT(*)'}
    for col, col_type in field_types.items():
        col_exprs = None
        if self.assignment_func:
            col_exprs = self.assignment_func(col, col_type)
        # A missing assignment_func, or one returning None, means "use
        # the defaults for this column".
        if col_exprs is None:
            col_exprs = self.get_default_exprs(col, col_type)
        exprs.update(col_exprs)
    exprs.update(self.extra_exprs)
    exprs = OrderedDict(exprs)

    exprs_str = ",\n ".join(
        expr + " AS " + key[0] + '__' + key[1]
        for key, expr in exprs.items())
    where_clause = " AND\n ".join(
        "{0} = '{1}'".format(name, value)
        for name, value in self.partition.items())
    sql = """
    SELECT
        {exprs_str}
    FROM {self.table}
    WHERE
        {where_clause};
    """.format(self=self, exprs_str=exprs_str, where_clause=where_clause)

    presto = PrestoHook(presto_conn_id=self.presto_conn_id)
    logging.info('Executing SQL check: ' + sql)
    row = presto.get_first(hql=sql)
    logging.info("Record: " + str(row))
    if not row:
        raise Exception("The query returned None")

    part_json = json.dumps(self.partition, sort_keys=True)

    logging.info("Deleting rows from previous runs if they exist")
    mysql = MySqlHook(self.mysql_conn_id)
    sql = """
    SELECT 1 FROM hive_stats
    WHERE
        table_name='{self.table}' AND
        partition_repr='{part_json}' AND
        dttm='{self.dttm}'
    LIMIT 1;
    """.format(self=self, part_json=part_json)
    if mysql.get_records(sql):
        sql = """
        DELETE FROM hive_stats
        WHERE
            table_name='{self.table}' AND
            partition_repr='{part_json}' AND
            dttm='{self.dttm}';
        """.format(self=self, part_json=part_json)
        mysql.run(sql)

    logging.info("Pivoting and loading cells into the Airflow db")
    # The result columns follow the iteration order of exprs, so zipping
    # the keys with the row pairs each value with its (column, metric).
    run_prefix = (self.ds, self.dttm, self.table, part_json)
    rows = [
        run_prefix + (key[0], key[1], value)
        for key, value in zip(exprs, row)
    ]
    target_fields = [
        'ds',
        'dttm',
        'table_name',
        'partition_repr',
        'col',
        'metric',
        'value',
    ]
    mysql.insert_rows(
        table='hive_stats', rows=rows, target_fields=target_fields)
def execute(self, context=None):
    """Gather statistics for one partition and write them to ``hive_stats``.

    Outline:

    * read the columns of ``self.table`` from the Hive metastore;
    * build one SQL expression per (column, metric), using
      ``self.assignment_func`` when it is set and returns a non-``None``
      value, else ``self.get_default_exprs``; add a row count and
      ``self.extra_exprs``;
    * run them as a single Presto query filtered by ``self.partition``;
    * delete ``hive_stats`` rows from an earlier run with the same table,
      partition and ``dttm``;
    * insert one ``hive_stats`` row per expression.

    :param context: unused by this method.
    :raises AirflowException: if the Presto query returns no row.
    """
    metastore_hook = HiveMetastoreHook(
        metastore_conn_id=self.metastore_conn_id)
    table_meta = metastore_hook.get_table(table_name=self.table)
    field_types = {column.name: column.type for column in table_meta.sd.cols}

    # Keys are (column, metric) pairs; values are SQL expressions.
    exprs = {('', 'count'): 'COUNT(*)'}
    for col, col_type in field_types.items():
        if self.assignment_func:
            chosen = self.assignment_func(col, col_type)
            # None from the custom function means "fall back to defaults".
            if chosen is None:
                chosen = self.get_default_exprs(col, col_type)
        else:
            chosen = self.get_default_exprs(col, col_type)
        exprs.update(chosen)
    exprs.update(self.extra_exprs)
    exprs = OrderedDict(exprs)

    aliased = []
    for key, expression in exprs.items():
        aliased.append(expression + " AS " + key[0] + '__' + key[1])
    exprs_str = ",\n ".join(aliased)

    predicates = []
    for part_key, part_value in self.partition.items():
        predicates.append("{0} = '{1}'".format(part_key, part_value))
    where_clause = " AND\n ".join(predicates)

    sql = """
    SELECT
        {exprs_str}
    FROM {self.table}
    WHERE
        {where_clause};
    """.format(self=self, exprs_str=exprs_str, where_clause=where_clause)

    hook = PrestoHook(presto_conn_id=self.presto_conn_id)
    logging.info('Executing SQL check: ' + sql)
    row = hook.get_first(hql=sql)
    logging.info("Record: " + str(row))
    if not row:
        raise AirflowException("The query returned None")

    part_json = json.dumps(self.partition, sort_keys=True)

    logging.info("Deleting rows from previous runs if they exist")
    mysql = MySqlHook(self.mysql_conn_id)
    sql = """
    SELECT 1 FROM hive_stats
    WHERE
        table_name='{self.table}' AND
        partition_repr='{part_json}' AND
        dttm='{self.dttm}'
    LIMIT 1;
    """.format(self=self, part_json=part_json)
    previous_run_exists = mysql.get_records(sql)
    if previous_run_exists:
        sql = """
        DELETE FROM hive_stats
        WHERE
            table_name='{self.table}' AND
            partition_repr='{part_json}' AND
            dttm='{self.dttm}';
        """.format(self=self, part_json=part_json)
        mysql.run(sql)

    logging.info("Pivoting and loading cells into the Airflow db")
    # The result columns follow the iteration order of exprs, so zipping
    # the keys with the row pairs each value with its (column, metric).
    rows = []
    for key, cell in zip(exprs, row):
        rows.append(
            (self.ds, self.dttm, self.table, part_json)
            + (key[0], key[1], cell))
    mysql.insert_rows(
        table='hive_stats',
        rows=rows,
        target_fields=[
            'ds',
            'dttm',
            'table_name',
            'partition_repr',
            'col',
            'metric',
            'value',
        ])