Пример #1
0
 def execute(self, context):
     """Submit the operator's JSON index spec to Druid as an indexing job.

     The text held in ``self.json_index_file`` is logged, decoded with
     ``json.loads`` and the resulting object is passed to
     ``DruidHook.submit_indexing_job``.

     :param context: task context supplied by the caller; not read here.
     """
     druid_hook = DruidHook(
         druid_ingest_conn_id=self.conn_id,
         max_ingestion_time=self.max_ingestion_time,
     )
     self.log.info("Submitting %s", self.json_index_file)
     # Decode only after logging so the raw spec is in the log if parsing fails.
     decoded_spec = json.loads(self.json_index_file)
     druid_hook.submit_indexing_job(decoded_spec)
Пример #2
0
 def execute(self, context):
     """Hand the stored index spec string to Druid as an indexing job.

     ``self.index_spec_str`` is logged and then passed unchanged to
     ``DruidHook.submit_indexing_job``.

     :param context: task context supplied by the caller; not read here.
     """
     ingest_hook = DruidHook(druid_ingest_conn_id=self.conn_id,
                             max_ingestion_time=self.max_ingestion_time)
     self.log.info("Submitting %s", self.index_spec_str)
     ingest_hook.submit_indexing_job(self.index_spec_str)
Пример #3
0
 def execute(self, context: Dict[Any, Any]) -> None:
     """Submit ``self.json_index_file`` to Druid as an indexing job.

     A ``DruidHook`` is built from the operator's connection id, timeout and
     maximum ingestion time; the spec is logged and then passed unchanged to
     ``submit_indexing_job``.

     :param context: task context supplied by the caller; not read here.
     """
     druid_client = DruidHook(druid_ingest_conn_id=self.conn_id,
                              timeout=self.timeout,
                              max_ingestion_time=self.max_ingestion_time)
     self.log.info("Submitting %s", self.json_index_file)
     druid_client.submit_indexing_job(self.json_index_file)
Пример #4
0
 def test_get_conn_url(self, mock_get_connection):
     """Check that ``get_conn_url`` returns ``https://test_host:1/ingest``.

     The stub connection carries the host, connection type, port and the
     ``endpoint`` extra that the expected URL is made of.

     :param mock_get_connection: mock whose return value stands in for the
         looked-up connection (presumably injected by a patch decorator that
         is outside this view).
     """
     # Constructor keywords configure the same attributes as assignment would.
     stub_connection = MagicMock(
         host='test_host',
         conn_type='https',
         port='1',
         extra_dejson={'endpoint': 'ingest'},
     )
     mock_get_connection.return_value = stub_connection

     expected_url = 'https://test_host:1/ingest'
     druid_hook = DruidHook(timeout=1, max_ingestion_time=5)
     self.assertEqual(druid_hook.get_conn_url(), expected_url)
Пример #5
0
    def execute(self, context: Dict[str, Any]) -> None:
        """Stage the query result in a temporary Hive table and index it into Druid.

        Steps performed:

        1. Run ``self.sql`` as a ``CREATE TABLE ... AS`` statement into a
           tab-delimited text table named ``druid.<task_instance_key_str>``
           (dots in the key replaced by underscores).
        2. Read the table's column names and storage location from the Hive
           metastore.
        3. Build the ingestion spec with ``self.construct_ingest_query`` and
           submit it through ``DruidHook.submit_indexing_job``.
        4. Drop the temporary table, whether or not steps 2-3 succeeded.

        :param context: task context; must contain ``task_instance_key_str``.
        """
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.log.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str'].replace(
            '.', '_')
        # A trailing ';' would terminate the CTAS statement early.
        sql = self.sql.strip().strip(';')
        # Each extra property is rendered with a leading comma so the result
        # can be appended directly after the fixed 'serialization.null.format'.
        tblproperties = ''.join(
            f", '{k}' = '{v}'"
            for k, v in self.hive_tblproperties.items()
        )
        hql = f"""\
        SET mapred.output.compress=false;
        SET hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
        AS
        {sql}
        """
        self.log.info("Running command:\n %s", hql)
        hive.run_cli(hql)

        # From here on the temp table may exist, so everything that can fail
        # (metastore lookup, hook construction, ingestion) sits inside the
        # try block and the table is always dropped afterwards.
        try:
            meta_hook = HiveMetastoreHook(self.metastore_conn_id)

            # One metastore lookup provides both the columns and the location.
            table = meta_hook.get_table(hive_table)
            columns = [col.name for col in table.sd.cols]
            static_path = table.sd.location

            druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)

            index_spec = self.construct_ingest_query(
                static_path=static_path,
                columns=columns,
            )

            self.log.info("Inserting rows into Druid, hdfs path: %s",
                          static_path)

            druid.submit_indexing_job(index_spec)

            self.log.info("Load seems to have succeeded!")
        finally:
            self.log.info("Cleaning up by dropping the temp Hive table %s",
                          hive_table)
            hive.run_cli(f"DROP TABLE IF EXISTS {hive_table}")