Example #1
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        logging.info("Extracting data from Hive")
        hive_table = "druid." + context["task_instance_key_str"].replace(".", "_")
        sql = self.sql.strip().strip(";")
        hql = """\
        set mapred.output.compress=false;
        set hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = '')
        AS
        {sql}
        """.format(
            **locals()
        )
        logging.info("Running command:\n {}".format(hql))
        hive.run_cli(hql)

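        # Read the temp table back from the metastore to get its columns and HDFS location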
        m = HiveMetastoreHook(self.metastore_conn_id)
        t = m.get_table(hive_table)

        columns = [col.name for col in t.sd.cols]

        hdfs_uri = m.get_table(hive_table).sd.location
        pos = hdfs_uri.find("/user")
        static_path = hdfs_uri[pos:]

        schema, table = hive_table.split(".")

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
        logging.info("Inserting rows into Druid")
        logging.info("HDFS path: " + static_path)

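        # Submit the HDFS-backed ingestion; the temp table is dropped in the finally block either way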
        try:
            druid.load_from_hdfs(
                datasource=self.druid_datasource,
                intervals=self.intervals,
                static_path=static_path,
                ts_dim=self.ts_dim,
                columns=columns,
                num_shards=self.num_shards,
                target_partition_size=self.target_partition_size,
                query_granularity=self.query_granularity,
                segment_granularity=self.segment_granularity,
                metric_spec=self.metric_spec,
                hadoop_dependency_coordinates=self.hadoop_dependency_coordinates,
            )
            logging.info("Load seems to have succeeded!")
        finally:
            logging.info("Cleaning up by dropping the temp " "Hive table {}".format(hive_table))
            hql = "DROP TABLE IF EXISTS {}".format(hive_table)
            hive.run_cli(hql)
Example #2
    def execute(self, context):
        hook = DruidHook(
            druid_ingest_conn_id=self.conn_id,
            max_ingestion_time=self.max_ingestion_time
        )
        self.log.info("Submitting %s", self.index_spec_str)
        hook.submit_indexing_job(self.index_spec_str)
Example #3
    def execute(self, context):
        hook = DruidHook(
            druid_ingest_conn_id=self.conn_id,
            max_ingestion_time=self.max_ingestion_time
        )
        self.log.info("Submitting %s", self.index_spec_str)
        hook.submit_indexing_job(self.index_spec_str)
Example #4
def druid_indexing(task_id, **kwargs):
    task = 'init_task_{}'.format(task_id)
    ti = kwargs['ti']
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from xcom object
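    # xcom_keys is presumably defined at module level alongside pullXcom (it is not set in this function)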
    index_template = xcom_values['index_template']

    # index_template = kwargs.get('templates_dict').get('index_template', None)
    # site = kwargs.get('templates_dict').get('site', None)
    tmp_index_template = index_template.replace("_template.json", "_template_{}.json".format(task_id))
    print(tmp_index_template)
    hook = DruidHook(
        druid_ingest_conn_id='druid_ingest_default',
        max_ingestion_time=None
    )

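    # Load the per-task index template and submit it to Druid as an indexing job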
    with open(tmp_index_template) as data_file:
        index_spec = json.load(data_file)
    index_spec_str = json.dumps(
        index_spec,
        sort_keys=True,
        indent=4,
        separators=(',', ': ')
    )
    log.info("Sumitting %s", index_spec_str)
    hook.submit_indexing_job(index_spec_str)
Example #5
def DruidIndexTask(**kwargs):
    task = 'boot_task'
    ti = kwargs['ti']
    xcom_keys = "index_template,source,date,log_enrich_path"
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from xcom object
    index_template = xcom_values['index_template']
    source = xcom_values['source']
    date = xcom_values['date']
    enrichDataPath = xcom_values['log_enrich_path']
    # read arguments
    # index_template = kwargs.get('templates_dict').get('index_template', None)
    # source = kwargs.get('templates_dict').get('source', None)

    # date = kwargs.get('templates_dict').get('date', None)
    # enrichDataPath = kwargs.get('templates_dict').get('enrichedDir', None)

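    # buildDates is expected to return a JSON string with an 'interval' and a list of HDFS 'path' entries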
    hdfs_path = buildDates(date, enrichDataPath)
    jReturn = json.loads(hdfs_path)

    interval = jReturn['interval']
    hdfs_path = jReturn['path'][0]

    hook = DruidHook(
        druid_ingest_conn_id='druid_ingest_default',
        max_ingestion_time=None
    )

    # Open the JSON file for reading
    jsonFile = open(index_template, "r")

    # Read the JSON into the buffer
    data = json.load(jsonFile)

    # Close the JSON file
    jsonFile.close()

    ## Working with buffered content
    jData = data['spec']['dataSchema']['dataSource']
    data['spec']['dataSchema']['dataSource'] = str(jData).replace(jData, source)
    jData = data['spec']['dataSchema']['granularitySpec']['intervals'][0]
    data['spec']['dataSchema']['granularitySpec']['intervals'] = [str(jData).replace(jData, interval)]
    jData = data['spec']['ioConfig']['inputSpec']['paths']
    data['spec']['ioConfig']['inputSpec']['paths'] = str(jData).replace(jData, hdfs_path)

    index_spec_str = json.dumps(
        data,
        sort_keys=True,
        indent=4,
        separators=(',', ': ')
    )
    log.info("Sumitting %s", index_spec_str)
    hook.submit_indexing_job(index_spec_str)

    ## Save our changes to JSON file
    jsonFile = open(index_template, "w+")
    jsonFile.write(json.dumps(data, indent=4, sort_keys=True))
    jsonFile.close()
    return index_template
Example #6
    def test_get_conn_url(self, mock_get_connection):
        get_conn_value = MagicMock()
        get_conn_value.host = 'test_host'
        get_conn_value.conn_type = 'https'
        get_conn_value.port = '1'
        get_conn_value.extra_dejson = {'endpoint': 'ingest'}
        mock_get_connection.return_value = get_conn_value
        hook = DruidHook(timeout=0, max_ingestion_time=5)
        self.assertEqual(hook.get_conn_url(), 'https://test_host:1/ingest')
Example #7
    def test_get_conn_url(self, mock_get_connection):
        get_conn_value = MagicMock()
        get_conn_value.host = 'test_host'
        get_conn_value.conn_type = 'https'
        get_conn_value.port = '1'
        get_conn_value.extra_dejson = {'endpoint': 'ingest'}
        mock_get_connection.return_value = get_conn_value
        hook = DruidHook(timeout=1, max_ingestion_time=5)
        self.assertEqual(hook.get_conn_url(), 'https://test_host:1/ingest')
Example #8
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.log.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
        sql = self.sql.strip().strip(';')
        tblproperties = ''.join(
            ", '{}' = '{}'".format(k, v)
            for k, v in self.hive_tblproperties.items()
        )

        hql = """\
        SET mapred.output.compress=false;
        SET hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
        AS
        {sql}
        """.format(**locals())
        self.log.info("Running command:\n %s", hql)
        hive.run_cli(hql)

        m = HiveMetastoreHook(self.metastore_conn_id)

        # Get the Hive table and extract the columns
        t = m.get_table(hive_table)
        columns = [col.name for col in t.sd.cols]

        # Get the path on hdfs
        hdfs_uri = m.get_table(hive_table).sd.location
        pos = hdfs_uri.find('/user')
        static_path = hdfs_uri[pos:]

        schema, table = hive_table.split('.')

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)

        try:
            index_spec = self.construct_ingest_query(
                static_path=static_path,
                columns=columns,
            )

            self.log.info("Inserting rows into Druid, hdfs path: %s", static_path)

            druid.submit_indexing_job(index_spec)

            self.log.info("Load seems to have succeeded!")
        finally:
            self.log.info(
                "Cleaning up by dropping the temp Hive table %s",
                hive_table
            )
            hql = "DROP TABLE IF EXISTS {}".format(hive_table)
            hive.run_cli(hql)
Example #9
    def test_submit_ok(self, m):
        hook = DruidHook()
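        # `m` stubs the overlord's submit and status endpoints (e.g. via requests_mock)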
        m.post('http://druid-overlord:8081/druid/indexer/v1/task',
               text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}')
        m.get(
            'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
            text='{"status":{"status": "SUCCESS"}}')

        # Exists just as it should
        hook.submit_indexing_job('Long json file')
Example #10
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        _log.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str'].replace(
            '.', '_')
        sql = self.sql.strip().strip(';')
        hql = """\
        set mapred.output.compress=false;
        set hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = '')
        AS
        {sql}
        """.format(**locals())
        _log.info("Running command:\n {}".format(hql))
        hive.run_cli(hql)

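        # Look up the temp table in the metastore for its columns and HDFS location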
        m = HiveMetastoreHook(self.metastore_conn_id)
        t = m.get_table(hive_table)

        columns = [col.name for col in t.sd.cols]

        hdfs_uri = m.get_table(hive_table).sd.location
        pos = hdfs_uri.find('/user')
        static_path = hdfs_uri[pos:]

        schema, table = hive_table.split('.')

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
        _log.info("Inserting rows into Druid")
        _log.info("HDFS path: " + static_path)

        try:
            druid.load_from_hdfs(
                datasource=self.druid_datasource,
                intervals=self.intervals,
                static_path=static_path,
                ts_dim=self.ts_dim,
                columns=columns,
                num_shards=self.num_shards,
                target_partition_size=self.target_partition_size,
                query_granularity=self.query_granularity,
                segment_granularity=self.segment_granularity,
                metric_spec=self.metric_spec,
                hadoop_dependency_coordinates=self.hadoop_dependency_coordinates)
            _log.info("Load seems to have succeeded!")
        finally:
            _log.info("Cleaning up by dropping the temp "
                      "Hive table {}".format(hive_table))
            hql = "DROP TABLE IF EXISTS {}".format(hive_table)
            hive.run_cli(hql)
Example #11
    def test_submit_gone_wrong(self, m):
        hook = DruidHook()
        m.post('http://druid-overlord:8081/druid/indexer/v1/task',
               text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}')
        m.get(
            'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
            text='{"status":{"status": "FAILED"}}')

        # The job failed for some reason
        with self.assertRaises(AirflowException):
            hook.submit_indexing_job('Long json file')
Example #12
    def test_submit_timeout(self, m):
        hook = DruidHook(timeout=0, max_ingestion_time=5)
        m.post('http://druid-overlord:8081/druid/indexer/v1/task',
               text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}')
        m.get(
            'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
            text='{"status":{"status": "RUNNING"}}')

        # Because the jobs keeps running
        with self.assertRaises(AirflowException):
            hook.submit_indexing_job('Long json file')
Example #13
    def test_submit_unknown_response(self, m):
        hook = DruidHook()
        m.post('http://druid-overlord:8081/druid/indexer/v1/task',
               text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}')
        m.get(
            'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
            text='{"status":{"status": "UNKNOWN"}}')

        # An unknown error code
        with self.assertRaises(AirflowException):
            hook.submit_indexing_job('Long json file')
Example #14
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.log.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
        sql = self.sql.strip().strip(';')
        tblproperties = ''.join(
            ", '{}' = '{}'".format(k, v)
            for k, v in self.hive_tblproperties.items()
        )
        hql = """\
        SET mapred.output.compress=false;
        SET hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
        AS
        {sql}
        """.format(hive_table=hive_table, tblproperties=tblproperties, sql=sql)
        self.log.info("Running command:\n %s", hql)
        hive.run_cli(hql)

        m = HiveMetastoreHook(self.metastore_conn_id)

        # Get the Hive table and extract the columns
        t = m.get_table(hive_table)
        columns = [col.name for col in t.sd.cols]

        # Get the path on hdfs
        static_path = m.get_table(hive_table).sd.location

        schema, table = hive_table.split('.')

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)

        try:
            index_spec = self.construct_ingest_query(
                static_path=static_path,
                columns=columns,
            )

            self.log.info("Inserting rows into Druid, hdfs path: %s", static_path)

            druid.submit_indexing_job(index_spec)

            self.log.info("Load seems to have succeeded!")
        finally:
            self.log.info(
                "Cleaning up by dropping the temp Hive table %s",
                hive_table
            )
            hql = "DROP TABLE IF EXISTS {}".format(hive_table)
            hive.run_cli(hql)
Example #15
    def test_submit_ok(self, m):
        hook = DruidHook()
        m.post(
            'http://druid-overlord:8081/druid/indexer/v1/task',
            text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}'
        )
        m.get(
            'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
            text='{"status":{"status": "SUCCESS"}}'
        )

        # Exists just as it should
        hook.submit_indexing_job('Long json file')
Example #16
    def test_submit_gone_wrong(self, m):
        hook = DruidHook()
        m.post(
            'http://druid-overlord:8081/druid/indexer/v1/task',
            text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}'
        )
        m.get(
            'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
            text='{"status":{"status": "FAILED"}}'
        )

        # The job failed for some reason
        with self.assertRaises(AirflowException):
            hook.submit_indexing_job('Long json file')
Example #17
    def test_submit_unknown_response(self, m):
        hook = DruidHook()
        m.post(
            'http://druid-overlord:8081/druid/indexer/v1/task',
            text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}'
        )
        m.get(
            'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
            text='{"status":{"status": "UNKNOWN"}}'
        )

        # An unknown error code
        with self.assertRaises(AirflowException):
            hook.submit_indexing_job('Long json file')
Example #18
    def test_submit_timeout(self, m):
        hook = DruidHook(timeout=0, max_ingestion_time=5)
        m.post(
            'http://druid-overlord:8081/druid/indexer/v1/task',
            text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}'
        )
        m.get(
            'http://druid-overlord:8081/druid/indexer/v1/task/9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status',
            text='{"status":{"status": "RUNNING"}}'
        )

        # Because the jobs keeps running
        with self.assertRaises(AirflowException):
            hook.submit_indexing_job('Long json file')
Example #19
    def execute(self, context):
        hook = DruidHook(druid_ingest_conn_id=self.conn_id)
        hook.submit_indexing_job(json.dumps(self.index_spec))
Example #20
def updatejsonfileDelta(**kwargs):
    # ti = kwargs['ti']
    # task_instance = kwargs['templates_dict']
    index_template = kwargs.get('index_template', None)
    source = kwargs.get('source', None)
    site = kwargs.get('site', None)
    delta = kwargs.get('delta', None)

    jsonFile = open(index_template, "r")  # Open the JSON file for reading
    tmp_index_template = index_template.replace("_template.json", "_template_{}.json".format(site))
    data = json.load(jsonFile)  # Read the JSON into the buffer
    path = []
    intervals = []
    interval = ""
    # print("Delta -->{}".format(delta)

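    # Collect the HDFS path and published segment interval for each delta date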
    for deltaDate, deltaPath in delta.items():
        # print('{} ---> {}'.format(deltaDate,deltaPath)
        path.append(deltaPath['path'])
        date = deltaPath['interval']
        print('{} ---- {}'.format(path, str(date).replace("/","_")))
        segments = list_segments(source,  str(date).replace("/","_"))
        print(segments)
        interval = segments[0]['interval']
        intervals.append(segments[0]['interval'])
        jData = data['spec']['dataSchema']['dataSource']
        data['spec']['dataSchema']['dataSource'] = str(jData).replace(jData, source)
        jData = data['spec']['dataSchema']['granularitySpec']['intervals'][0]

        jData = data['spec']['ioConfig']['inputSpec']['children'][0]['paths']
        data['spec']['ioConfig']['inputSpec']['children'][1]['ingestionSpec']['dataSource'] = source
        # data['spec']['ioConfig']['inputSpec']['children'][1]['ingestionSpec']['segments'] = segments


        # d = datetime.strptime(deltaDate, "%m-%d-%Y")
        # print(d.date()
        # # print(deltaDate.replace("-", "")
        # hdfs_path = buildDates(str(d.date()).replace("-", ""), deltaPath)
        # print(hdfs_path
    data['spec']['ioConfig']['inputSpec']['children'][1]['ingestionSpec']['intervals'] = intervals
    # str(jData).replace(jData, intervals)]
    data['spec']['ioConfig']['inputSpec']['children'][0]['paths'] = str(jData).replace(jData, ','.join(path))
    # data['spec']['ioConfig']['inputSpec']['children'][0]['paths'] = ','.join(path)
    jData = data['spec']['dataSchema']['granularitySpec']['intervals'][0]
    data['spec']['dataSchema']['granularitySpec']['intervals'] = [str(jData).replace(jData, interval)]
    # data['spec']['dataSchema']['granularitySpec']['intervals'] = [interval]
    hook = DruidHook(
        druid_ingest_conn_id='druid_ingest_default',
        max_ingestion_time=None
    )
    index_spec_str = json.dumps(
        data,
        sort_keys=True,
        indent=4,
        separators=(',', ': ')
    )
    log.info("Sumitting %s", index_spec_str)
    # hook.submit_indexing_job(index_spec_str)
    print(index_spec_str)
    jsonFile = open(tmp_index_template, "w+")
    jsonFile.write(json.dumps(data, indent=4, sort_keys=True))
    jsonFile.close()
    # exit(0)
    # interval = kwargs.get('interval', None)
    # date = kwargs.get('date', None)
    # print(date.replace("-", "")
    # enriched_dir = kwargs.get('enriched_dir', None)
    # hdfs_path = buildDates(date.replace("-", ""), enriched_dir)
    # jReturn = json.loads(hdfs_path)
    #
    # # interval = jReturn['interval']
    # hdfs_path = jReturn['path'][0]
    # # hdfs_path = index_template
    # jReturn = json.load(jsonFile)
    # # interval = jReturn['interval']
    #
    # # hdfs_path = jReturn['path'][0]
    # segments = list_segments(source, date)
    # interval = segments[0]['interval']
    # print(segments[0]['interval']
    # jsonFile = open(index_template, "r")  # Open the JSON file for reading
    # tmp_index_template = index_template.replace("_template.json", "_template_{}.json".format(site))
    # data = json.load(jsonFile)  # Read the JSON into the buffer
    # jsonFile.close()  # Close the JSON file
    #
    # ## Working with buffered content
    #
    # jData = data['spec']['dataSchema']['dataSource']
    # data['spec']['dataSchema']['dataSource'] = str(jData).replace(jData, source)
    # jData = data['spec']['dataSchema']['granularitySpec']['intervals'][0]
    # data['spec']['dataSchema']['granularitySpec']['intervals'] = [str(jData).replace(jData, interval)]
    # jData = data['spec']['ioConfig']['inputSpec']['children'][0]['paths']
    # data['spec']['ioConfig']['inputSpec']['children'][0]['paths'] = str(jData).replace(jData, hdfs_path)
    # data['spec']['ioConfig']['inputSpec']['children'][1]['ingestionSpec']['dataSource'] = source
    # data['spec']['ioConfig']['inputSpec']['children'][1]['ingestionSpec']['segments'] = segments
    # data['spec']['ioConfig']['inputSpec']['children'][1]['ingestionSpec']['intervals'] = [
    #     str(jData).replace(jData, interval)]
    # ## Save our changes to JSON file
    #
    # jsonFile = open(tmp_index_template, "w+")
    # jsonFile.write(json.dumps(data, indent=4, sort_keys=True))
    # jsonFile.close()
    return jsonFile
Example #21
def updatejsonfileDeltas(**kwargs):
    # ti = kwargs['ti']
    # task_instance = kwargs['templates_dict']
    index_template = kwargs.get('index_template', None)
    source = kwargs.get('source', None)
    site = kwargs.get('site', None)
    # print('This is dict {}'.format(kwargs.get('delta', None)))
    delta = json.loads(kwargs.get('delta', None))
    jsonFile = open(index_template, "r")  # Open the JSON file for reading
    tmp_index_template = index_template.replace("_template.json", "_template_{}.json".format(site))
    data = json.load(jsonFile)  # Read the JSON into the buffer
    path = []
    intervals = []
    interval = ""
    # print("Delta -->{}".format(delta)
    # print(data['spec']['ioConfig']['inputSpec']['children'][0]['ingestionSpec']['intervals']
    # print(data['spec']['ioConfig']['inputSpec']['children'][1]['paths']

    # print(index_spec_str
    jData  = data
    paths = []
    ingestionSpec = []
    # print(data['spec']['ioConfig']['inputSpec']['children']
    # for i, (deltaDate, deltaPath) in enumerate(delta.iteritems()):
    #     data['spec']['ioConfig']['inputSpec']['children'].append(i)
    # for i, (deltaDate, deltaPath) in enumerate(delta.iteritems()):
    #     # print('{} {} ---> {} {} '.format(i, deltaDate,deltaPath['interval'],deltaPath['paths'])
    #     # print('---------------------'
    #     x = {
    #         'dataSource': deltaDate,
    #         'intervals': deltaPath['interval']
    #     }
    #     ingestionSpec.append(x)
    #     # data['spec']['ioConfig']['inputSpec']['children'][0]['ingestionSpec'] = [{
    #     #     'dataSource': deltaDate,
    #     #     'intervals': deltaPath['interval']
    #     # }]
    #     paths.append(deltaPath['paths'])
    #     # data['spec']['ioConfig']['inputSpec']['children'][0]['ingestionSpec'] = {
    #     #     'dataSource': deltaDate,
    #     #     'intervals': deltaPath['interval']
    #     # }
    #     # data['spec']['ioConfig']['inputSpec']['children'][1]['paths'] = deltaPath['paths']
    #     # data['spec']['ioConfig']['inputSpec']['children'].append({
    #     #     'dataSource': deltaDate,
    #     #     'intervals': deltaPath['interval']
    #     # })


    finalPath = []
    for path in paths:
        # print("--{}--".format(path)
        for p in path:
            # print(p
            finalPath.append(p)
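    # Rebuild each source-specific template: interval, static input paths, and the dataSource ingestion spec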
    for i, (deltaSource, deltaPath) in enumerate(delta.items()):

        print("template file {} {} ".format(deltaSource,str(index_template).replace("src",deltaSource).lower()))
        jsonFile = open(str(index_template).replace("src",deltaSource).lower(), "r")
        data = json.load(jsonFile)

        x = {
            'dataSource': str(deltaSource).lower(),
            'intervals': deltaPath['interval'],
            # 'segments': deltaPath['segments']
        }
        y = {
            'type': 'static',
            'paths': ','.join(deltaPath['paths'])
        }
        z = {
            "type" : "dataSource",
            "ingestionSpec": x
        }

        data['spec']['dataSchema']['granularitySpec']['intervals'] = deltaPath['interval']
        data['spec']['ioConfig']['inputSpec']['children'][1] = y
        data['spec']['ioConfig']['inputSpec']['children'][0] = z
        # data['spec']['ioConfig']['inputSpec']['children'][0]['ingestionSpec']['segments'] = deltaPath['segments']
        index_spec_str = json.dumps(
            data,
            sort_keys=True,
            indent=4,
            separators=(',', ': ')
        )
        hook = DruidHook(
            druid_ingest_conn_id='druid_ingest_default',
            max_ingestion_time=None
        )
        log.info("Sumitting %s", index_spec_str)
        # if str(deltaSource).lower() == "src":
        #     hook.submit_indexing_job(index_spec_str)

    jsonFile = open("delta.json", "w+")
    jsonFile.write(json.dumps(data, indent=4, sort_keys=True))
    jsonFile.close()
Example #22
    def execute(self, context):
        hook = DruidHook(druid_ingest_conn_id=self.conn_id)
        hook.submit_indexing_job(json.dumps(self.index_spec))
Example #23
    def druid_ingest(self):
        druid_hook = DruidHook(druid_ingest_conn_id=self.druid_conn_id)

        self.log.info("Submitting druid task: %s", json.dumps(self.druid_ingest_spec))
        druid_hook.submit_indexing_job(self.druid_ingest_spec)