Exemplo n.º 1
0
 def execute(self, context):
     """Submit the operator's stored indexing spec to the Druid overlord.

     Builds a DruidHook from the operator's connection id and maximum
     ingestion time, then submits ``self.index_spec_str`` as an
     indexing job.
     """
     hook = DruidHook(
         druid_ingest_conn_id=self.conn_id,
         max_ingestion_time=self.max_ingestion_time
     )
     # Fixed typo in log message ("Sumitting" -> "Submitting").
     self.log.info("Submitting %s", self.index_spec_str)
     hook.submit_indexing_job(self.index_spec_str)
Exemplo n.º 2
0
 def execute(self, context):
     """Submit the operator's stored indexing spec to the Druid overlord.

     Builds a DruidHook from the operator's connection id and maximum
     ingestion time, then submits ``self.index_spec_str`` as an
     indexing job.
     """
     hook = DruidHook(
         druid_ingest_conn_id=self.conn_id,
         max_ingestion_time=self.max_ingestion_time
     )
     # Fixed typo in log message ("Sumitting" -> "Submitting").
     self.log.info("Submitting %s", self.index_spec_str)
     hook.submit_indexing_job(self.index_spec_str)
Exemplo n.º 3
0
def druid_indexing(task_id, **kwargs):
    """Submit a per-task Druid indexing job.

    Pulls the shared index-template path from XCom, derives the
    per-task template file name, loads it as JSON and submits the
    pretty-printed spec to the Druid overlord.

    :param task_id: suffix identifying both the upstream init task and
        the per-task template file
    :param kwargs: Airflow context; must contain the task instance ``ti``
    """
    task = 'init_task_{}'.format(task_id)
    ti = kwargs['ti']
    # NOTE(review): xcom_keys is assumed to be a module-level constant
    # listing the XCom keys to pull — confirm against module scope.
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from xcom object
    index_template = xcom_values['index_template']

    # Derive the per-task template file name from the shared template.
    tmp_index_template = index_template.replace(
        "_template.json", "_template_{}.json".format(task_id))
    # Use the module logger instead of a bare print().
    log.info("Using template %s", tmp_index_template)

    hook = DruidHook(
        druid_ingest_conn_id='druid_ingest_default',
        max_ingestion_time=None  # wait indefinitely for the job
    )

    with open(tmp_index_template) as data_file:
        index_spec = json.load(data_file)
    index_spec_str = json.dumps(
        index_spec,
        sort_keys=True,
        indent=4,
        separators=(',', ': ')
    )
    # Fixed typo in log message ("Sumitting" -> "Submitting").
    log.info("Submitting %s", index_spec_str)
    hook.submit_indexing_job(index_spec_str)
Exemplo n.º 4
0
def DruidIndexTask(**kwargs):
    """Build a Druid indexing spec from a template and submit it.

    Pulls the template path, data source name, date and enriched-data
    path from XCom, resolves the ingestion interval and HDFS path via
    ``buildDates``, patches the template's dataSource / intervals /
    paths fields, submits the job to Druid, and persists the patched
    template back to disk.

    :param kwargs: Airflow context; must contain the task instance ``ti``
    :return: path of the (rewritten) template file
    """
    task = 'boot_task'
    ti = kwargs['ti']
    xcom_keys = "index_template,source,date,log_enrich_path"
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from xcom object
    index_template = xcom_values['index_template']
    source = xcom_values['source']
    date = xcom_values['date']
    enrichDataPath = xcom_values['log_enrich_path']

    # buildDates returns a JSON string holding the ingestion interval
    # and the candidate HDFS path(s); we only use the first path.
    jReturn = json.loads(buildDates(date, enrichDataPath))
    interval = jReturn['interval']
    hdfs_path = jReturn['path'][0]

    hook = DruidHook(
        druid_ingest_conn_id='druid_ingest_default',
        max_ingestion_time=None  # wait indefinitely for the job
    )

    # Load the template; the context manager guarantees the handle is
    # closed even if JSON parsing fails.
    with open(index_template, "r") as json_file:
        data = json.load(json_file)

    # Patch the template in place. The original code used
    # str(x).replace(x, y), which is just an assignment of y.
    data['spec']['dataSchema']['dataSource'] = source
    data['spec']['dataSchema']['granularitySpec']['intervals'] = [interval]
    data['spec']['ioConfig']['inputSpec']['paths'] = hdfs_path

    index_spec_str = json.dumps(
        data,
        sort_keys=True,
        indent=4,
        separators=(',', ': ')
    )
    # Fixed typo in log message ("Sumitting" -> "Submitting").
    log.info("Submitting %s", index_spec_str)
    hook.submit_indexing_job(index_spec_str)

    # Persist the patched spec back to the template file.
    with open(index_template, "w+") as json_file:
        json_file.write(json.dumps(data, indent=4, sort_keys=True))
    return index_template
Exemplo n.º 5
0
    def execute(self, context):
        """Extract data from Hive into a temp table and load it into Druid.

        Materializes ``self.sql`` as an uncompressed TEXTFILE Hive table,
        reads the table's columns and HDFS location from the metastore,
        submits a Druid indexing job for that path, and always drops the
        temp table afterwards.
        """
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.log.info("Extracting data from Hive")
        # Unique per task instance so concurrent runs cannot collide.
        hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
        sql = self.sql.strip().strip(';')
        tblproperties = ''.join(
            ", '{}' = '{}'".format(k, v)
            for k, v in self.hive_tblproperties.items()
        )

        hql = """\
        SET mapred.output.compress=false;
        SET hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
        AS
        {sql}
        """.format(hive_table=hive_table, tblproperties=tblproperties, sql=sql)
        # Explicit keyword formatting replaces the fragile
        # ``.format(**locals())`` of the original.
        self.log.info("Running command:\n %s", hql)
        hive.run_cli(hql)

        m = HiveMetastoreHook(self.metastore_conn_id)

        # Get the Hive table and extract the columns.
        t = m.get_table(hive_table)
        columns = [col.name for col in t.sd.cols]

        # HDFS path of the table; reuse the table object already fetched
        # instead of querying the metastore a second time. Druid only
        # needs the path relative to '/user'.
        hdfs_uri = t.sd.location
        pos = hdfs_uri.find('/user')
        static_path = hdfs_uri[pos:]

        schema, table = hive_table.split('.')

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)

        try:
            index_spec = self.construct_ingest_query(
                static_path=static_path,
                columns=columns,
            )

            self.log.info("Inserting rows into Druid, hdfs path: %s", static_path)

            druid.submit_indexing_job(index_spec)

            self.log.info("Load seems to have succeeded!")
        finally:
            # Always drop the staging table, even if ingestion failed.
            self.log.info(
                "Cleaning up by dropping the temp Hive table %s",
                hive_table
            )
            hive.run_cli("DROP TABLE IF EXISTS {}".format(hive_table))
    def test_submit_ok(self, m):
        """Submitting succeeds when the overlord reports SUCCESS."""
        task_url = 'http://druid-overlord:8081/druid/indexer/v1/task'
        status_url = (
            'http://druid-overlord:8081/druid/indexer/v1/task/'
            '9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status'
        )
        hook = DruidHook()
        m.post(task_url, text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}')
        m.get(status_url, text='{"status":{"status": "SUCCESS"}}')

        # Should return without raising.
        hook.submit_indexing_job('Long json file')
    def test_submit_gone_wrong(self, m):
        """A FAILED status from the overlord raises AirflowException."""
        task_url = 'http://druid-overlord:8081/druid/indexer/v1/task'
        status_url = (
            'http://druid-overlord:8081/druid/indexer/v1/task/'
            '9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status'
        )
        hook = DruidHook()
        m.post(task_url, text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}')
        m.get(status_url, text='{"status":{"status": "FAILED"}}')

        # The job failed, so submission must raise.
        with self.assertRaises(AirflowException):
            hook.submit_indexing_job('Long json file')
    def test_submit_timeout(self, m):
        """A job stuck in RUNNING past max_ingestion_time raises."""
        task_url = 'http://druid-overlord:8081/druid/indexer/v1/task'
        status_url = (
            'http://druid-overlord:8081/druid/indexer/v1/task/'
            '9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status'
        )
        hook = DruidHook(timeout=0, max_ingestion_time=5)
        m.post(task_url, text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}')
        m.get(status_url, text='{"status":{"status": "RUNNING"}}')

        # The job never finishes, so the hook must give up and raise.
        with self.assertRaises(AirflowException):
            hook.submit_indexing_job('Long json file')
    def test_submit_unknown_response(self, m):
        """An unrecognized status code raises AirflowException."""
        task_url = 'http://druid-overlord:8081/druid/indexer/v1/task'
        status_url = (
            'http://druid-overlord:8081/druid/indexer/v1/task/'
            '9f8a7359-77d4-4612-b0cd-cc2f6a3c28de/status'
        )
        hook = DruidHook()
        m.post(task_url, text='{"task":"9f8a7359-77d4-4612-b0cd-cc2f6a3c28de"}')
        m.get(status_url, text='{"status":{"status": "UNKNOWN"}}')

        # Unknown status is treated as an error.
        with self.assertRaises(AirflowException):
            hook.submit_indexing_job('Long json file')
Exemplo n.º 10
0
    def execute(self, context):
        """Stage the SQL result in a temp Hive table, then index it in Druid.

        The staging table is always dropped in the ``finally`` clause,
        whether or not the Druid ingestion succeeded.
        """
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.log.info("Extracting data from Hive")
        # Derive a per-task-instance table name so runs cannot collide.
        hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
        sql = self.sql.strip().strip(';')
        tblproperties = ''.join(
            ", '{}' = '{}'".format(prop, value)
            for prop, value in self.hive_tblproperties.items()
        )
        hql = """\
        SET mapred.output.compress=false;
        SET hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
        AS
        {sql}
        """.format(hive_table=hive_table, tblproperties=tblproperties, sql=sql)
        self.log.info("Running command:\n %s", hql)
        hive.run_cli(hql)

        metastore = HiveMetastoreHook(self.metastore_conn_id)

        # Column names come from the metastore's storage descriptor.
        table_ref = metastore.get_table(hive_table)
        columns = [col.name for col in table_ref.sd.cols]

        # HDFS location of the freshly created staging table.
        static_path = metastore.get_table(hive_table).sd.location

        schema, table = hive_table.split('.')

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)

        try:
            index_spec = self.construct_ingest_query(
                static_path=static_path,
                columns=columns,
            )

            self.log.info("Inserting rows into Druid, hdfs path: %s", static_path)
            druid.submit_indexing_job(index_spec)
            self.log.info("Load seems to have succeeded!")
        finally:
            # Clean up the staging table regardless of the outcome above.
            self.log.info(
                "Cleaning up by dropping the temp Hive table %s",
                hive_table
            )
            hive.run_cli("DROP TABLE IF EXISTS {}".format(hive_table))
Exemplo n.º 11
0
    def test_submit_ok(self, m):
        """SUCCESS status: submit_indexing_job returns normally."""
        base = 'http://druid-overlord:8081/druid/indexer/v1/task'
        ingest_id = '9f8a7359-77d4-4612-b0cd-cc2f6a3c28de'
        hook = DruidHook()
        m.post(base, text='{{"task":"{}"}}'.format(ingest_id))
        m.get(
            '{}/{}/status'.format(base, ingest_id),
            text='{"status":{"status": "SUCCESS"}}'
        )

        # Must complete without raising.
        hook.submit_indexing_job('Long json file')
Exemplo n.º 12
0
    def test_submit_gone_wrong(self, m):
        """FAILED status: submit_indexing_job raises AirflowException."""
        base = 'http://druid-overlord:8081/druid/indexer/v1/task'
        ingest_id = '9f8a7359-77d4-4612-b0cd-cc2f6a3c28de'
        hook = DruidHook()
        m.post(base, text='{{"task":"{}"}}'.format(ingest_id))
        m.get(
            '{}/{}/status'.format(base, ingest_id),
            text='{"status":{"status": "FAILED"}}'
        )

        # The overlord reports failure, so the hook must raise.
        with self.assertRaises(AirflowException):
            hook.submit_indexing_job('Long json file')
Exemplo n.º 13
0
    def test_submit_unknown_response(self, m):
        """UNKNOWN status: submit_indexing_job raises AirflowException."""
        base = 'http://druid-overlord:8081/druid/indexer/v1/task'
        ingest_id = '9f8a7359-77d4-4612-b0cd-cc2f6a3c28de'
        hook = DruidHook()
        m.post(base, text='{{"task":"{}"}}'.format(ingest_id))
        m.get(
            '{}/{}/status'.format(base, ingest_id),
            text='{"status":{"status": "UNKNOWN"}}'
        )

        # Unrecognized status codes are treated as errors.
        with self.assertRaises(AirflowException):
            hook.submit_indexing_job('Long json file')
Exemplo n.º 14
0
    def test_submit_timeout(self, m):
        """RUNNING past max_ingestion_time: the hook gives up and raises."""
        base = 'http://druid-overlord:8081/druid/indexer/v1/task'
        ingest_id = '9f8a7359-77d4-4612-b0cd-cc2f6a3c28de'
        hook = DruidHook(timeout=0, max_ingestion_time=5)
        m.post(base, text='{{"task":"{}"}}'.format(ingest_id))
        m.get(
            '{}/{}/status'.format(base, ingest_id),
            text='{"status":{"status": "RUNNING"}}'
        )

        # The mocked job never leaves RUNNING, exceeding the deadline.
        with self.assertRaises(AirflowException):
            hook.submit_indexing_job('Long json file')
Exemplo n.º 15
0
 def execute(self, context):
     """Serialize the operator's index spec and submit it to Druid."""
     druid_hook = DruidHook(druid_ingest_conn_id=self.conn_id)
     payload = json.dumps(self.index_spec)
     druid_hook.submit_indexing_job(payload)
Exemplo n.º 16
0
 def execute(self, context):
     """Serialize the operator's index spec and hand it to the overlord."""
     DruidHook(druid_ingest_conn_id=self.conn_id).submit_indexing_job(
         json.dumps(self.index_spec)
     )
Exemplo n.º 17
0
    def druid_ingest(self):
        """Submit the configured ingestion spec to the Druid overlord."""
        hook = DruidHook(druid_ingest_conn_id=self.druid_conn_id)
        spec = self.druid_ingest_spec
        self.log.info("Submitting druid task: %s", json.dumps(spec))
        hook.submit_indexing_job(spec)