def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    logging.info("Extracting data from Hive")
    hive_table = "druid." + context["task_instance_key_str"].replace(".", "_")
    sql = self.sql.strip().strip(";")
    hql = """\
    set mapred.output.compress=false;
    set hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    TBLPROPERTIES ('serialization.null.format' = '')
    AS
    {sql}
    """.format(**locals())
    logging.info("Running command:\n {}".format(hql))
    hive.run_cli(hql)

    m = HiveMetastoreHook(self.metastore_conn_id)
    t = m.get_table(hive_table)
    columns = [col.name for col in t.sd.cols]

    hdfs_uri = m.get_table(hive_table).sd.location
    pos = hdfs_uri.find("/user")
    static_path = hdfs_uri[pos:]

    schema, table = hive_table.split(".")

    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
    logging.info("Inserting rows into Druid")
    logging.info("HDFS path: " + static_path)

    try:
        druid.load_from_hdfs(
            datasource=self.druid_datasource,
            intervals=self.intervals,
            static_path=static_path,
            ts_dim=self.ts_dim,
            columns=columns,
            num_shards=self.num_shards,
            target_partition_size=self.target_partition_size,
            query_granularity=self.query_granularity,
            segment_granularity=self.segment_granularity,
            metric_spec=self.metric_spec,
            hadoop_dependency_coordinates=self.hadoop_dependency_coordinates,
        )
        logging.info("Load seems to have succeeded!")
    finally:
        logging.info(
            "Cleaning up by dropping the temp Hive table {}".format(hive_table))
        hql = "DROP TABLE IF EXISTS {}".format(hive_table)
        hive.run_cli(hql)
def execute(self, context):
    hook = DruidHook(
        druid_ingest_conn_id=self.conn_id,
        max_ingestion_time=self.max_ingestion_time
    )
    self.log.info("Submitting %s", self.index_spec_str)
    hook.submit_indexing_job(self.index_spec_str)
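# A minimal standalone sketch of the hook usage the execute() above relies on, for
# running the same submission outside an operator. The import path, connection id and
# spec path below are assumptions, not taken from the snippet itself.
import json

from airflow.hooks.druid_hook import DruidHook  # path differs in newer provider packages


def submit_spec_from_file(spec_path, conn_id="druid_ingest_default"):
    # Load an indexing spec from disk and hand it to the Druid overlord via the hook.
    with open(spec_path) as f:
        index_spec = json.load(f)
    hook = DruidHook(druid_ingest_conn_id=conn_id, max_ingestion_time=None)
    hook.submit_indexing_job(json.dumps(index_spec, sort_keys=True, indent=4))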
def druid_indexing(task_id, **kwargs):
    task = 'init_task_{}'.format(task_id)
    ti = kwargs['ti']
    # xcom_keys is expected to be defined at module scope
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from the XCom object
    index_template = xcom_values['index_template']
    # index_template = kwargs.get('templates_dict').get('index_template', None)
    # site = kwargs.get('templates_dict').get('site', None)

    tmp_index_template = index_template.replace(
        "_template.json", "_template_{}.json".format(task_id))
    print(tmp_index_template)

    hook = DruidHook(
        druid_ingest_conn_id='druid_ingest_default',
        max_ingestion_time=None
    )

    with open(tmp_index_template) as data_file:
        index_spec = json.load(data_file)

    index_spec_str = json.dumps(
        index_spec,
        sort_keys=True,
        indent=4,
        separators=(',', ': ')
    )
    log.info("Submitting %s", index_spec_str)
    hook.submit_indexing_job(index_spec_str)
def DruidIndexTask(**kwargs):
    task = 'boot_task'
    ti = kwargs['ti']
    xcom_keys = "index_template,source,date,log_enrich_path"
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from the XCom object
    index_template = xcom_values['index_template']
    source = xcom_values['source']
    date = xcom_values['date']
    enrichDataPath = xcom_values['log_enrich_path']
    # read arguments from templates_dict instead, if preferred:
    # index_template = kwargs.get('templates_dict').get('index_template', None)
    # source = kwargs.get('templates_dict').get('source', None)
    # date = kwargs.get('templates_dict').get('date', None)
    # enrichDataPath = kwargs.get('templates_dict').get('enrichedDir', None)

    hdfs_path = buildDates(date, enrichDataPath)
    jReturn = json.loads(hdfs_path)
    interval = jReturn['interval']
    hdfs_path = jReturn['path'][0]

    hook = DruidHook(
        druid_ingest_conn_id='druid_ingest_default',
        max_ingestion_time=None
    )

    # Read the index template into a buffer
    with open(index_template, "r") as jsonFile:
        data = json.load(jsonFile)

    # Work with the buffered content
    jData = data['spec']['dataSchema']['dataSource']
    data['spec']['dataSchema']['dataSource'] = str(jData).replace(jData, source)

    jData = data['spec']['dataSchema']['granularitySpec']['intervals'][0]
    data['spec']['dataSchema']['granularitySpec']['intervals'] = [str(jData).replace(jData, interval)]

    jData = data['spec']['ioConfig']['inputSpec']['paths']
    data['spec']['ioConfig']['inputSpec']['paths'] = str(jData).replace(jData, hdfs_path)

    index_spec_str = json.dumps(
        data,
        sort_keys=True,
        indent=4,
        separators=(',', ': ')
    )
    log.info("Submitting %s", index_spec_str)
    hook.submit_indexing_job(index_spec_str)

    # Save our changes back to the template file
    with open(index_template, "w+") as jsonFile:
        jsonFile.write(json.dumps(data, indent=4, sort_keys=True))

    return index_template
def test_get_conn_url(self, mock_get_connection):
    get_conn_value = MagicMock()
    get_conn_value.host = 'test_host'
    get_conn_value.conn_type = 'https'
    get_conn_value.port = '1'
    get_conn_value.extra_dejson = {'endpoint': 'ingest'}
    mock_get_connection.return_value = get_conn_value
    hook = DruidHook(timeout=0, max_ingestion_time=5)
    self.assertEqual(hook.get_conn_url(), 'https://test_host:1/ingest')
def test_get_conn_url(self, mock_get_connection):
    get_conn_value = MagicMock()
    get_conn_value.host = 'test_host'
    get_conn_value.conn_type = 'https'
    get_conn_value.port = '1'
    get_conn_value.extra_dejson = {'endpoint': 'ingest'}
    mock_get_connection.return_value = get_conn_value
    hook = DruidHook(timeout=1, max_ingestion_time=5)
    self.assertEqual(hook.get_conn_url(), 'https://test_host:1/ingest')
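# The two test_get_conn_url variants above receive a `mock_get_connection` argument,
# which presumably comes from a patch decorator on the hook's connection lookup. A
# sketch of that assumed setup (the exact patch target varies between Airflow versions):
import unittest
from unittest import mock
from unittest.mock import MagicMock

from airflow.hooks.druid_hook import DruidHook  # path differs in newer provider packages


class TestDruidHookConnUrl(unittest.TestCase):

    @mock.patch('airflow.hooks.base_hook.BaseHook.get_connection')
    def test_get_conn_url(self, mock_get_connection):
        conn = MagicMock()
        conn.host = 'test_host'
        conn.conn_type = 'https'
        conn.port = '1'
        conn.extra_dejson = {'endpoint': 'ingest'}
        mock_get_connection.return_value = conn
        hook = DruidHook(timeout=1, max_ingestion_time=5)
        self.assertEqual(hook.get_conn_url(), 'https://test_host:1/ingest')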
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    self.log.info("Extracting data from Hive")
    hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
    sql = self.sql.strip().strip(';')
    tblproperties = ''.join(
        ", '{}' = '{}'".format(k, v)
        for k, v in self.hive_tblproperties.items()
    )
    hql = """\
    SET mapred.output.compress=false;
    SET hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
    AS
    {sql}
    """.format(**locals())
    self.log.info("Running command:\n %s", hql)
    hive.run_cli(hql)

    m = HiveMetastoreHook(self.metastore_conn_id)

    # Get the Hive table and extract the columns
    t = m.get_table(hive_table)
    columns = [col.name for col in t.sd.cols]

    # Get the path on hdfs
    hdfs_uri = m.get_table(hive_table).sd.location
    pos = hdfs_uri.find('/user')
    static_path = hdfs_uri[pos:]

    schema, table = hive_table.split('.')

    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)

    try:
        index_spec = self.construct_ingest_query(
            static_path=static_path,
            columns=columns,
        )

        self.log.info("Inserting rows into Druid, hdfs path: %s", static_path)

        druid.submit_indexing_job(index_spec)

        self.log.info("Load seems to have succeeded!")
    finally:
        self.log.info(
            "Cleaning up by dropping the temp Hive table %s",
            hive_table
        )
        hql = "DROP TABLE IF EXISTS {}".format(hive_table)
        hive.run_cli(hql)
def test_submit_ok(self, m):
    hook = DruidHook()
    m.post(
        'http://druid-overlord:8081/druid/indexer/v1/task',
        text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}'
    )
    m.get(
        'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
        text='{"status":{"status": "SUCCESS"}}'
    )

    # Exists just as it should
    hook.submit_indexing_job('Long json file')
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    _log.info("Extracting data from Hive")
    hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
    sql = self.sql.strip().strip(';')
    hql = """\
    set mapred.output.compress=false;
    set hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    TBLPROPERTIES ('serialization.null.format' = '')
    AS
    {sql}
    """.format(**locals())
    _log.info("Running command:\n {}".format(hql))
    hive.run_cli(hql)

    m = HiveMetastoreHook(self.metastore_conn_id)
    t = m.get_table(hive_table)
    columns = [col.name for col in t.sd.cols]

    hdfs_uri = m.get_table(hive_table).sd.location
    pos = hdfs_uri.find('/user')
    static_path = hdfs_uri[pos:]

    schema, table = hive_table.split('.')

    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
    _log.info("Inserting rows into Druid")
    _log.info("HDFS path: " + static_path)

    try:
        druid.load_from_hdfs(
            datasource=self.druid_datasource,
            intervals=self.intervals,
            static_path=static_path,
            ts_dim=self.ts_dim,
            columns=columns,
            num_shards=self.num_shards,
            target_partition_size=self.target_partition_size,
            query_granularity=self.query_granularity,
            segment_granularity=self.segment_granularity,
            metric_spec=self.metric_spec,
            hadoop_dependency_coordinates=self.hadoop_dependency_coordinates)
        _log.info("Load seems to have succeeded!")
    finally:
        _log.info("Cleaning up by dropping the temp "
                  "Hive table {}".format(hive_table))
        hql = "DROP TABLE IF EXISTS {}".format(hive_table)
        hive.run_cli(hql)
def test_submit_gone_wrong(self, m):
    hook = DruidHook()
    m.post(
        'http://druid-overlord:8081/druid/indexer/v1/task',
        text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}'
    )
    m.get(
        'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
        text='{"status":{"status": "FAILED"}}'
    )

    # The job failed for some reason
    with self.assertRaises(AirflowException):
        hook.submit_indexing_job('Long json file')
def test_submit_timeout(self, m):
    hook = DruidHook(timeout=0, max_ingestion_time=5)
    m.post(
        'http://druid-overlord:8081/druid/indexer/v1/task',
        text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}'
    )
    m.get(
        'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
        text='{"status":{"status": "RUNNING"}}'
    )

    # Because the job keeps running
    with self.assertRaises(AirflowException):
        hook.submit_indexing_job('Long json file')
def test_submit_unknown_response(self, m):
    hook = DruidHook()
    m.post(
        'http://druid-overlord:8081/druid/indexer/v1/task',
        text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}'
    )
    m.get(
        'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
        text='{"status":{"status": "UNKNOWN"}}'
    )

    # An unknown error code
    with self.assertRaises(AirflowException):
        hook.submit_indexing_job('Long json file')
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    self.log.info("Extracting data from Hive")
    hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
    sql = self.sql.strip().strip(';')
    tblproperties = ''.join([", '{}' = '{}'".format(k, v)
                             for k, v in self.hive_tblproperties.items()])
    hql = """\
    SET mapred.output.compress=false;
    SET hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
    AS
    {sql}
    """.format(hive_table=hive_table, tblproperties=tblproperties, sql=sql)
    self.log.info("Running command:\n %s", hql)
    hive.run_cli(hql)

    m = HiveMetastoreHook(self.metastore_conn_id)

    # Get the Hive table and extract the columns
    t = m.get_table(hive_table)
    columns = [col.name for col in t.sd.cols]

    # Get the path on hdfs
    static_path = m.get_table(hive_table).sd.location

    schema, table = hive_table.split('.')

    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)

    try:
        index_spec = self.construct_ingest_query(
            static_path=static_path,
            columns=columns,
        )

        self.log.info("Inserting rows into Druid, hdfs path: %s", static_path)

        druid.submit_indexing_job(index_spec)

        self.log.info("Load seems to have succeeded!")
    finally:
        self.log.info(
            "Cleaning up by dropping the temp Hive table %s",
            hive_table
        )
        hql = "DROP TABLE IF EXISTS {}".format(hive_table)
        hive.run_cli(hql)
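# construct_ingest_query() is not shown in this section, but the helper functions below
# manipulate specs with the same shape (spec.dataSchema.dataSource,
# spec.dataSchema.granularitySpec.intervals, spec.ioConfig.inputSpec.paths). A minimal
# Hadoop index-task spec in that shape might look like this sketch; every value is a
# placeholder, not output captured from the operator.
example_index_spec = {
    "type": "index_hadoop",
    "spec": {
        "dataSchema": {
            "dataSource": "example_datasource",
            "granularitySpec": {
                "intervals": ["2017-01-01/2017-01-02"]
            }
        },
        "ioConfig": {
            "type": "hadoop",
            "inputSpec": {
                "type": "static",
                "paths": "/user/hive/warehouse/druid.db/example_table"
            }
        }
    }
}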
def test_submit_ok(self, m):
    hook = DruidHook()
    m.post(
        'http://druid-overlord:8081/druid/indexer/v1/task',
        text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}'
    )
    m.get(
        'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
        text='{"status":{"status": "SUCCESS"}}'
    )

    # Exists just as it should
    hook.submit_indexing_job('Long json file')
def test_submit_gone_wrong(self, m):
    hook = DruidHook()
    m.post(
        'http://druid-overlord:8081/druid/indexer/v1/task',
        text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}'
    )
    m.get(
        'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
        text='{"status":{"status": "FAILED"}}'
    )

    # The job failed for some reason
    with self.assertRaises(AirflowException):
        hook.submit_indexing_job('Long json file')
def test_submit_unknown_response(self, m):
    hook = DruidHook()
    m.post(
        'http://druid-overlord:8081/druid/indexer/v1/task',
        text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}'
    )
    m.get(
        'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
        text='{"status":{"status": "UNKNOWN"}}'
    )

    # An unknown error code
    with self.assertRaises(AirflowException):
        hook.submit_indexing_job('Long json file')
def test_submit_timeout(self, m):
    hook = DruidHook(timeout=0, max_ingestion_time=5)
    m.post(
        'http://druid-overlord:8081/druid/indexer/v1/task',
        text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}'
    )
    m.get(
        'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
        text='{"status":{"status": "RUNNING"}}'
    )

    # Because the job keeps running
    with self.assertRaises(AirflowException):
        hook.submit_indexing_job('Long json file')
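# In the submit_indexing_job tests above, `m` is presumably a requests_mock adapter
# injected by a decorator on each test method; a sketch of that assumed harness around
# one of the failure cases (import paths may differ between Airflow versions):
import unittest

import requests_mock

from airflow.exceptions import AirflowException
from airflow.hooks.druid_hook import DruidHook  # path differs in newer provider packages


class TestDruidSubmit(unittest.TestCase):

    @requests_mock.mock()
    def test_submit_gone_wrong(self, m):
        hook = DruidHook()
        m.post(
            'http://druid-overlord:8081/druid/indexer/v1/task',
            text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}'
        )
        m.get(
            'http://druid-overlord:8081/druid/indexer/v1/task/'
            '9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
            text='{"status":{"status": "FAILED"}}'
        )
        # The overlord reports FAILED, so the hook should raise
        with self.assertRaises(AirflowException):
            hook.submit_indexing_job('Long json file')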
def execute(self, context):
    hook = DruidHook(druid_ingest_conn_id=self.conn_id)
    hook.submit_indexing_job(json.dumps(self.index_spec))
def updatejsonfileDelta(**kwargs):
    # ti = kwargs['ti']
    # task_instance = kwargs['templates_dict']
    index_template = kwargs.get('index_template', None)
    source = kwargs.get('source', None)
    site = kwargs.get('site', None)
    delta = kwargs.get('delta', None)

    tmp_index_template = index_template.replace(
        "_template.json", "_template_{}.json".format(site))

    # Open the JSON template for reading and read it into a buffer
    with open(index_template, "r") as jsonFile:
        data = json.load(jsonFile)

    path = []
    intervals = []
    interval = ""
    for deltaDate, deltaPath in delta.items():
        path.append(deltaPath['path'])
        date = deltaPath['interval']
        print('{} ---- {}'.format(path, str(date).replace("/", "_")))
        segments = list_segments(source, str(date).replace("/", "_"))
        print(segments)
        interval = segments[0]['interval']
        intervals.append(segments[0]['interval'])

    jData = data['spec']['dataSchema']['dataSource']
    data['spec']['dataSchema']['dataSource'] = str(jData).replace(jData, source)

    jData = data['spec']['ioConfig']['inputSpec']['children'][0]['paths']
    data['spec']['ioConfig']['inputSpec']['children'][1]['ingestionSpec']['dataSource'] = source
    # data['spec']['ioConfig']['inputSpec']['children'][1]['ingestionSpec']['segments'] = segments
    data['spec']['ioConfig']['inputSpec']['children'][1]['ingestionSpec']['intervals'] = intervals
    data['spec']['ioConfig']['inputSpec']['children'][0]['paths'] = str(jData).replace(jData, ','.join(path))

    jData = data['spec']['dataSchema']['granularitySpec']['intervals'][0]
    data['spec']['dataSchema']['granularitySpec']['intervals'] = [str(jData).replace(jData, interval)]

    hook = DruidHook(
        druid_ingest_conn_id='druid_ingest_default',
        max_ingestion_time=None
    )
    index_spec_str = json.dumps(
        data,
        sort_keys=True,
        indent=4,
        separators=(',', ': ')
    )
    log.info("Submitting %s", index_spec_str)
    # hook.submit_indexing_job(index_spec_str)
    print(index_spec_str)

    # Save our changes to the per-site template file
    with open(tmp_index_template, "w+") as jsonFile:
        jsonFile.write(json.dumps(data, indent=4, sort_keys=True))

    return tmp_index_template
def updatejsonfileDeltas(**kwargs):
    index_template = kwargs.get('index_template', None)
    source = kwargs.get('source', None)
    site = kwargs.get('site', None)
    delta = json.loads(kwargs.get('delta', None))

    tmp_index_template = index_template.replace(
        "_template.json", "_template_{}.json".format(site))

    # Build and (optionally) submit one index spec per delta source
    for i, (deltaSource, deltaPath) in enumerate(delta.items()):
        print("template file {} {}".format(
            deltaSource, str(index_template).replace("src", deltaSource).lower()))

        with open(str(index_template).replace("src", deltaSource).lower(), "r") as jsonFile:
            data = json.load(jsonFile)

        ingestion_spec = {
            'dataSource': str(deltaSource).lower(),
            'intervals': deltaPath['interval'],
            # 'segments': deltaPath['segments']
        }
        static_input = {
            'type': 'static',
            'paths': ','.join(deltaPath['paths'])
        }
        datasource_input = {
            'type': 'dataSource',
            'ingestionSpec': ingestion_spec
        }

        data['spec']['dataSchema']['granularitySpec']['intervals'] = deltaPath['interval']
        data['spec']['ioConfig']['inputSpec']['children'][1] = static_input
        data['spec']['ioConfig']['inputSpec']['children'][0] = datasource_input

        index_spec_str = json.dumps(
            data,
            sort_keys=True,
            indent=4,
            separators=(',', ': ')
        )
        hook = DruidHook(
            druid_ingest_conn_id='druid_ingest_default',
            max_ingestion_time=None
        )
        log.info("Submitting %s", index_spec_str)
        # if str(deltaSource).lower() == "src":
        #     hook.submit_indexing_job(index_spec_str)

        with open("delta.json", "w+") as jsonFile:
            jsonFile.write(json.dumps(data, indent=4, sort_keys=True))
def druid_ingest(self):
    druid_hook = DruidHook(druid_ingest_conn_id=self.druid_conn_id)
    self.log.info("Submitting druid task: %s", json.dumps(self.druid_ingest_spec))
    druid_hook.submit_indexing_job(self.druid_ingest_spec)