Example #1
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

        logging.info("Dumping MySQL query results to local file")
        conn = mysql.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        with NamedTemporaryFile("w") as f:
            csv_writer = csv.writer(f, delimiter=self.delimiter)
            field_dict = OrderedDict()
            for field in cursor.description:
                field_dict[field[0]] = self.type_map(field[1])
            csv_writer.writerows(cursor)
            f.flush()
            cursor.close()
            conn.close()
            logging.info("Loading file into Hive")
            hive.load_file(
                f.name,
                self.hive_table,
                field_dict=field_dict,
                create=self.create,
                partition=self.partition,
                delimiter=self.delimiter,
                recreate=self.recreate)
Example #2
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

        logging.info("Dumping MySQL query results to local file")
        conn = mysql.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        with NamedTemporaryFile("wb") as f:
            csv_writer = csv.writer(f,
                                    delimiter=self.delimiter,
                                    encoding="utf-8")
            field_dict = OrderedDict()
            for field in cursor.description:
                field_dict[field[0]] = self.type_map(field[1])
            csv_writer.writerows(cursor)
            f.flush()
            cursor.close()
            conn.close()
            logging.info("Loading file into Hive")
            hive.load_file(f.name,
                           self.hive_table,
                           field_dict=field_dict,
                           create=self.create,
                           partition=self.partition,
                           delimiter=self.delimiter,
                           recreate=self.recreate)
Example #3
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        logging.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str']
        sql = self.sql.strip().strip(';')
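        # Stage the query results as an uncompressed, tab-delimited text table
        # so Druid can index the resulting flat files directly from HDFS.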
        hql = """\
        set mapred.output.compress=false;
        set hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE AS
        {sql};
        """.format(**locals())
        hive.run_cli(hql)

        m = HiveMetastoreHook(self.metastore_conn_id)
        t = m.get_table(hive_table)

        columns = [col.name for col in t.sd.cols]

        hdfs_uri = m.get_table(hive_table).sd.location
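        # Keep only the path portion of the table's HDFS location
        # (drops the hdfs://host:port prefix) for the Druid ingestion spec.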
        pos = hdfs_uri.find('/user')
        static_path = hdfs_uri[pos:]

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
        logging.info("Inserting rows into Druid")
        druid.load_from_hdfs(datasource=self.druid_datasource,
                             intervals=self.intervals,
                             static_path=static_path,
                             ts_dim=self.ts_dim,
                             columns=columns,
                             metric_spec=self.metric_spec)
        logging.info("Load seems to have succeeded!")
Example #4
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        vertica = VerticaHook(vertica_conn_id=self.vertica_conn_id)

        logging.info("Dumping Vertica query results to local file")
        conn = vertica.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        with NamedTemporaryFile("w") as f:
            csv_writer = csv.writer(f, delimiter=self.delimiter, encoding='utf-8')
            field_dict = OrderedDict()
            col_count = 0
            for field in cursor.description:
                col_count += 1
                col_position = "Column{position}".format(position=col_count)
                field_dict[col_position if field[0] == '' else field[0]] = self.type_map(field[1])
            csv_writer.writerows(cursor.iterate())
            f.flush()
            cursor.close()
            conn.close()
            logging.info("Loading file into Hive")
            hive.load_file(
                f.name,
                self.hive_table,
                field_dict=field_dict,
                create=self.create,
                partition=self.partition,
                delimiter=self.delimiter,
                recreate=self.recreate)
Example #5
 def execute(self, context):
     self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
     self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
     logging.info("Downloading S3 file")
     if self.wildcard_match:
         if not self.s3.check_for_wildcard_key(self.s3_key):
             raise AirflowException("No key matches {0}".format(self.s3_key))
         s3_key_object = self.s3.get_wildcard_key(self.s3_key)
     else:
         if not self.s3.check_for_key(self.s3_key):
             raise AirflowException(
                 "The key {0} does not exists".format(self.s3_key))
         s3_key_object = self.s3.get_key(self.s3_key)
     with NamedTemporaryFile("w") as f:
         logging.info("Dumping S3 key {0} contents to local"
                      " file {1}".format(s3_key_object.key, f.name))
         s3_key_object.get_contents_to_file(f)
         f.flush()
         self.s3.connection.close()
         if not self.headers:
             logging.info("Loading file into Hive")
             self.hive.load_file(
                 f.name,
                 self.hive_table,
                 field_dict=self.field_dict,
                 create=self.create,
                 partition=self.partition,
                 delimiter=self.delimiter,
                 recreate=self.recreate)
         else:
             with open(f.name, 'r') as tmpf:
                 if self.check_headers:
                     header_l = tmpf.readline()
                     header_line = header_l.rstrip()
                     header_list = header_line.split(self.delimiter)
                     field_names = list(self.field_dict.keys())
                     test_field_match = [h1.lower() == h2.lower() for h1, h2
                                         in zip(header_list, field_names)]
                     if not all(test_field_match):
                         logging.warning("Headers do not match field names"
                                         "File headers:\n {header_list}\n"
                                         "Field names: \n {field_names}\n"
                                         "".format(**locals()))
                         raise AirflowException("Headers do not match the "
                                         "field_dict keys")
                 with NamedTemporaryFile("w") as f_no_headers:
                     tmpf.seek(0)
                     next(tmpf)
                     for line in tmpf:
                         f_no_headers.write(line)
                     f_no_headers.flush()
                     logging.info("Loading file without headers into Hive")
                     self.hive.load_file(
                         f_no_headers.name,
                         self.hive_table,
                         field_dict=self.field_dict,
                         create=self.create,
                         partition=self.partition,
                         delimiter=self.delimiter,
                         recreate=self.recreate)
Example #6
 def __init__(self,
              s3_key,
              field_dict,
              hive_table,
              delimiter=',',
              create=True,
              recreate=False,
              partition=None,
              headers=False,
              check_headers=False,
              s3_conn_id='s3_default',
              hive_cli_conn_id='hive_cli_default',
              *args,
              **kwargs):
     super(S3ToHiveTransfer, self).__init__(*args, **kwargs)
     self.s3_key = s3_key
     self.field_dict = field_dict
     self.hive_table = hive_table
     self.delimiter = delimiter
     self.create = create
     self.recreate = recreate
     self.partition = partition
     self.headers = headers
     self.check_headers = check_headers
     self.hive = HiveCliHook(hive_cli_conn_id=hive_cli_conn_id)
     self.s3 = S3Hook(s3_conn_id=s3_conn_id)
Example #7
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)

        logging.info("Dumping Microsoft SQL Server query results to local file")
        conn = mssql.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        with NamedTemporaryFile("w") as f:
            csv_writer = csv.writer(f, delimiter=self.delimiter, encoding="utf-8")
            field_dict = OrderedDict()
            col_count = 0
            for field in cursor.description:
                col_count += 1
                col_position = "Column{position}".format(position=col_count)
                field_dict[col_position if field[0] == "" else field[0]] = self.type_map(field[1])
            csv_writer.writerows(cursor)
            f.flush()
            cursor.close()
            conn.close()
            logging.info("Loading file into Hive")
            hive.load_file(
                f.name,
                self.hive_table,
                field_dict=field_dict,
                create=self.create,
                partition=self.partition,
                delimiter=self.delimiter,
                recreate=self.recreate,
            )
Example #8
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        logging.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str']
        sql = self.sql.strip().strip(';')
        hql = """\
        set mapred.output.compress=false;
        set hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = '')
        AS
        {sql}
        """.format(**locals())
        hive.run_cli(hql)
        #hqls = hql.split(';')
        #logging.info(str(hqls))
        #from airflow.hooks import HiveServer2Hook
        #hive = HiveServer2Hook(hiveserver2_conn_id="hiveserver2_silver")
        #hive.get_results(hqls)

        m = HiveMetastoreHook(self.metastore_conn_id)
        t = m.get_table(hive_table)

        columns = [col.name for col in t.sd.cols]

        hdfs_uri = m.get_table(hive_table).sd.location
        pos = hdfs_uri.find('/user')
        static_path = hdfs_uri[pos:]

        schema, table = hive_table.split('.')

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
        logging.info("Inserting rows into Druid")
        logging.info("HDFS path: " + static_path)

        druid.load_from_hdfs(
            datasource=self.druid_datasource,
            intervals=self.intervals,
            static_path=static_path,
            ts_dim=self.ts_dim,
            columns=columns,
            metric_spec=self.metric_spec,
            hadoop_dependency_coordinates=self.hadoop_dependency_coordinates)
        logging.info("Load seems to have succeeded!")

        logging.info("Cleaning up by dropping the temp "
                     "Hive table {}".format(hive_table))
        hql = "DROP TABLE IF EXISTS {}".format(hive_table)
Example #9
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        logging.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str']
        sql = self.sql.strip().strip(';')
        hql = """\
        set mapred.output.compress=false;
        set hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = '')
        AS
        {sql}
        """.format(**locals())
        hive.run_cli(hql)
        #hqls = hql.split(';')
        #logging.info(str(hqls))
        #from airflow.hooks import HiveServer2Hook
        #hive = HiveServer2Hook(hiveserver2_conn_id="hiveserver2_silver")
        #hive.get_results(hqls)


        m = HiveMetastoreHook(self.metastore_conn_id)
        t = m.get_table(hive_table)

        columns = [col.name for col in t.sd.cols]

        hdfs_uri = m.get_table(hive_table).sd.location
        pos = hdfs_uri.find('/user')
        static_path = hdfs_uri[pos:]

        schema, table = hive_table.split('.')

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
        logging.info("Inserting rows into Druid")
        logging.info("HDFS path: " + static_path)

        druid.load_from_hdfs(
            datasource=self.druid_datasource,
            intervals=self.intervals,
            static_path=static_path, ts_dim=self.ts_dim,
            columns=columns, metric_spec=self.metric_spec,
            hadoop_dependency_coordinates=self.hadoop_dependency_coordinates)
        logging.info("Load seems to have succeeded!")

        logging.info(
            "Cleaning up by dropping the temp "
            "Hive table {}".format(hive_table))
        hql = "DROP TABLE IF EXISTS {}".format(hive_table)
Example #10
 def execute(self, context):
     self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
     self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
     logging.info("Downloading S3 file")
     if self.wildcard_match:
         if not self.s3.check_for_wildcard_key(self.s3_key):
             raise AirflowException("No key matches {0}".format(self.s3_key))
         s3_key_object = self.s3.get_wildcard_key(self.s3_key)
     else:
         if not self.s3.check_for_key(self.s3_key):
             raise AirflowException(
                 "The key {0} does not exists".format(self.s3_key))
         s3_key_object = self.s3.get_key(self.s3_key)
     with NamedTemporaryFile("w") as f:
         logging.info("Dumping S3 key {0} contents to local"
                      " file {1}".format(s3_key_object.key, f.name))
         s3_key_object.get_contents_to_file(f)
         f.flush()
         self.s3.connection.close()
         if not self.headers:
             logging.info("Loading file into Hive")
             self.hive.load_file(
                 f.name,
                 self.hive_table,
                 field_dict=self.field_dict,
                 create=self.create,
                 partition=self.partition,
                 delimiter=self.delimiter,
                 recreate=self.recreate)
         else:
             with open(f.name, 'r') as tmpf:
                 if self.check_headers:
                     header_l = tmpf.readline()
                     header_line = header_l.rstrip()
                     header_list = header_line.split(self.delimiter)
                     field_names = list(self.field_dict.keys())
                     test_field_match = [h1.lower() == h2.lower() for h1, h2
                                         in zip(header_list, field_names)]
                     if not all(test_field_match):
                         logging.warning("Headers do not match field names"
                                         "File headers:\n {header_list}\n"
                                         "Field names: \n {field_names}\n"
                                         "".format(**locals()))
                         raise AirflowException("Headers do not match the "
                                         "field_dict keys")
                 with NamedTemporaryFile("w") as f_no_headers:
                     tmpf.seek(0)
                     next(tmpf)
                     for line in tmpf:
                         f_no_headers.write(line)
                     f_no_headers.flush()
                     logging.info("Loading file without headers into Hive")
                     self.hive.load_file(
                         f_no_headers.name,
                         self.hive_table,
                         field_dict=self.field_dict,
                         create=self.create,
                         partition=self.partition,
                         delimiter=self.delimiter,
                         recreate=self.recreate)
Example #11
 def __init__(
         self,
         s3_key,
         field_dict,
         hive_table,
         delimiter=',',
         create=True,
         recreate=False,
         partition=None,
         headers=False,
         check_headers=False,
         s3_conn_id='s3_default',
         hive_cli_conn_id='hive_cli_default',
         *args, **kwargs):
     super(S3ToHiveTransfer, self).__init__(*args, **kwargs)
     self.s3_key = s3_key
     self.field_dict = field_dict
     self.hive_table = hive_table
     self.delimiter = delimiter
     self.create = create
     self.recreate = recreate
     self.partition = partition
     self.headers = headers
     self.check_headers = check_headers
     self.hive = HiveCliHook(hive_cli_conn_id=hive_cli_conn_id)
     self.s3 = S3Hook(s3_conn_id=s3_conn_id)
Example #12
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        logging.info("Extracting data from Hive")
        hive_table = "druid." + context["task_instance_key_str"]
        sql = self.sql.strip().strip(";")
        hql = """\
        set mapred.output.compress=false;
        set hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE AS
        {sql};
        """.format(
            **locals()
        )
        hive.run_cli(hql)

        m = HiveMetastoreHook(self.metastore_conn_id)
        t = m.get_table(hive_table)

        columns = [col.name for col in t.sd.cols]

        hdfs_uri = m.get_table(hive_table).sd.location
        pos = hdfs_uri.find("/user")
        static_path = hdfs_uri[pos:]

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
        logging.info("Inserting rows into Druid")
        druid.load_from_hdfs(
            datasource=self.druid_datasource,
            intervals=self.intervals,
            static_path=static_path,
            ts_dim=self.ts_dim,
            columns=columns,
            metric_spec=self.metric_spec,
        )
        logging.info("Load seems to have succeeded!")
Example #13
 def ddl(self):
     table = request.args.get("table")
     sql = "SHOW CREATE TABLE {table};".format(table=table)
     h = HiveCliHook(HIVE_CLI_CONN_ID)
     return h.run_cli(sql)
Example #14
class S3ToHiveTransfer(BaseOperator):
    """
    Moves data from S3 to Hive. The operator downloads a file from S3
    and stores it locally before loading it into a Hive table.
    If the ``create`` or ``recreate`` arguments are set to ``True``,
    ``CREATE TABLE`` and ``DROP TABLE`` statements are generated.
    Hive data types are inferred from the cursor's metadata.

    Note that the table generated in Hive uses ``STORED AS textfile``
    which isn't the most efficient serialization format. If a
    large amount of data is loaded and/or if the table gets
    queried considerably, you may want to use this operator only to
    stage the data into a temporary table before loading it into its
    final destination using a ``HiveOperator``.

    :param s3_key: The key to be retrieved from S3
    :type s3_key: str
    :param field_dict: A dictionary with the field names in the file
        as keys and their Hive types as values
    :type field_dict: dict
    :param hive_table: target Hive table, use dot notation to target a
        specific database
    :type hive_table: str
    :param create: whether to create the table if it doesn't exist
    :type create: bool
    :param recreate: whether to drop and recreate the table at every
        execution
    :type recreate: bool
    :param partition: target partition as a dict of partition columns
        and values
    :type partition: dict
    :param headers: whether the file contains column names on the first
        line
    :type headers: bool
    :param check_headers: whether the column names on the first line should be
        checked against the keys of field_dict
    :type check_headers: bool
    :param delimiter: field delimiter in the file
    :type delimiter: str
    :param s3_conn_id: source s3 connection
    :type s3_conn_id: str
    :param hive_cli_conn_id: destination hive connection
    :type hive_cli_conn_id: str
    """

    __mapper_args__ = {'polymorphic_identity': 'S3ToHiveOperator'}
    template_fields = ('s3_key', 'partition', 'hive_table')
    template_ext = ()
    ui_color = '#a0e08c'

    @apply_defaults
    def __init__(self,
                 s3_key,
                 field_dict,
                 hive_table,
                 delimiter=',',
                 create=True,
                 recreate=False,
                 partition=None,
                 headers=False,
                 check_headers=False,
                 s3_conn_id='s3_default',
                 hive_cli_conn_id='hive_cli_default',
                 *args,
                 **kwargs):
        super(S3ToHiveTransfer, self).__init__(*args, **kwargs)
        self.s3_key = s3_key
        self.field_dict = field_dict
        self.hive_table = hive_table
        self.delimiter = delimiter
        self.create = create
        self.recreate = recreate
        self.partition = partition
        self.headers = headers
        self.check_headers = check_headers
        self.hive = HiveCliHook(hive_cli_conn_id=hive_cli_conn_id)
        self.s3 = S3Hook(s3_conn_id=s3_conn_id)

    def execute(self, context):
        logging.info("Downloading S3 file")
        if not self.s3.check_for_key(self.s3_key):
            raise Exception("The key {0} does not exists".format(self.s3_key))
        s3_key_object = self.s3.get_key(self.s3_key)
        with NamedTemporaryFile("w") as f:
            logging.info("Dumping S3 file {0} contents to local"
                         " file {1}".format(self.s3_key, f.name))
            s3_key_object.get_contents_to_file(f)
            f.flush()
            self.s3.connection.close()
            if not self.headers:
                logging.info("Loading file into Hive")
                self.hive.load_file(f.name,
                                    self.hive_table,
                                    field_dict=self.field_dict,
                                    create=self.create,
                                    partition=self.partition,
                                    delimiter=self.delimiter,
                                    recreate=self.recreate)
            else:
                with open(f.name, 'r') as tmpf:
                    if self.check_headers:
                        header_l = tmpf.readline()
                        header_line = header_l.rstrip()
                        header_list = header_line.split(self.delimiter)
                        field_names = list(self.field_dict.keys())
                        test_field_match = [
                            h1.lower() == h2.lower()
                            for h1, h2 in zip(header_list, field_names)
                        ]
                        if not all(test_field_match):
                            logging.warning("Headers do not match field names"
                                            "File headers:\n {header_list}\n"
                                            "Field names: \n {field_names}\n"
                                            "".format(**locals()))
                            raise Exception("Headers do not match the "
                                            "field_dict keys")
                    with NamedTemporaryFile("w") as f_no_headers:
                        tmpf.seek(0)
                        next(tmpf)
                        for line in tmpf:
                            f_no_headers.write(line)
                        f_no_headers.flush()
                        logging.info("Loading file without headers into Hive")
                        self.hive.load_file(f_no_headers.name,
                                            self.hive_table,
                                            field_dict=self.field_dict,
                                            create=self.create,
                                            partition=self.partition,
                                            delimiter=self.delimiter,
                                            recreate=self.recreate)
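
A minimal usage sketch for the operator defined above, assuming it is imported into a DAG file; the DAG id, schedule, S3 key, column names, and Hive table are illustrative placeholders, and the connection ids are the defaults declared in __init__.

from collections import OrderedDict
from datetime import datetime

from airflow import DAG

dag = DAG('s3_to_hive_example', start_date=datetime(2015, 1, 1))

load_events = S3ToHiveTransfer(
    task_id='load_events_to_hive',
    s3_key='events/{{ ds }}/events.csv',   # templated via template_fields
    field_dict=OrderedDict([('event_id', 'BIGINT'),
                            ('event_name', 'STRING')]),
    hive_table='staging.events',
    delimiter=',',
    create=True,
    recreate=False,
    s3_conn_id='s3_default',
    hive_cli_conn_id='hive_cli_default',
    dag=dag)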
Example #15
class S3ToHiveTransfer(BaseOperator):
    """
    Moves data from S3 to Hive. The operator downloads a file from S3
    and stores it locally before loading it into a Hive table.
    If the ``create`` or ``recreate`` arguments are set to ``True``,
    ``CREATE TABLE`` and ``DROP TABLE`` statements are generated.
    Hive data types are inferred from the cursor's metadata.

    Note that the table generated in Hive uses ``STORED AS textfile``
    which isn't the most efficient serialization format. If a
    large amount of data is loaded and/or if the table gets
    queried considerably, you may want to use this operator only to
    stage the data into a temporary table before loading it into its
    final destination using a ``HiveOperator``.

    :param s3_key: The key to be retrieved from S3
    :type s3_key: str
    :param field_dict: A dictionary with the field names in the file
        as keys and their Hive types as values
    :type field_dict: dict
    :param hive_table: target Hive table, use dot notation to target a
        specific database
    :type hive_table: str
    :param create: whether to create the table if it doesn't exist
    :type create: bool
    :param recreate: whether to drop and recreate the table at every
        execution
    :type recreate: bool
    :param partition: target partition as a dict of partition columns
        and values
    :type partition: dict
    :param headers: whether the file contains column names on the first
        line
    :type headers: bool
    :param check_headers: whether the column names on the first line should be
        checked against the keys of field_dict
    :type check_headers: bool
    :param wildcard_match: whether the s3_key should be interpreted as a Unix
        wildcard pattern
    :type wildcard_match: bool
    :param delimiter: field delimiter in the file
    :type delimiter: str
    :param s3_conn_id: source s3 connection
    :type s3_conn_id: str
    :param hive_cli_conn_id: destination hive connection
    :type hive_cli_conn_id: str
    """

    template_fields = ('s3_key', 'partition', 'hive_table')
    template_ext = ()
    ui_color = '#a0e08c'

    @apply_defaults
    def __init__(
            self,
            s3_key,
            field_dict,
            hive_table,
            delimiter=',',
            create=True,
            recreate=False,
            partition=None,
            headers=False,
            check_headers=False,
            wildcard_match=False,
            s3_conn_id='s3_default',
            hive_cli_conn_id='hive_cli_default',
            *args, **kwargs):
        super(S3ToHiveTransfer, self).__init__(*args, **kwargs)
        self.s3_key = s3_key
        self.field_dict = field_dict
        self.hive_table = hive_table
        self.delimiter = delimiter
        self.create = create
        self.recreate = recreate
        self.partition = partition
        self.headers = headers
        self.check_headers = check_headers
        self.wildcard_match = wildcard_match
        self.hive_cli_conn_id = hive_cli_conn_id
        self.s3_conn_id = s3_conn_id

    def execute(self, context):
        self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
        logging.info("Downloading S3 file")
        if self.wildcard_match:
            if not self.s3.check_for_wildcard_key(self.s3_key):
                raise AirflowException("No key matches {0}".format(self.s3_key))
            s3_key_object = self.s3.get_wildcard_key(self.s3_key)
        else:
            if not self.s3.check_for_key(self.s3_key):
                raise AirflowException(
                    "The key {0} does not exists".format(self.s3_key))
            s3_key_object = self.s3.get_key(self.s3_key)
        with NamedTemporaryFile("w") as f:
            logging.info("Dumping S3 key {0} contents to local"
                         " file {1}".format(s3_key_object.key, f.name))
            s3_key_object.get_contents_to_file(f)
            f.flush()
            self.s3.connection.close()
            if not self.headers:
                logging.info("Loading file into Hive")
                self.hive.load_file(
                    f.name,
                    self.hive_table,
                    field_dict=self.field_dict,
                    create=self.create,
                    partition=self.partition,
                    delimiter=self.delimiter,
                    recreate=self.recreate)
            else:
                with open(f.name, 'r') as tmpf:
                    if self.check_headers:
                        header_l = tmpf.readline()
                        header_line = header_l.rstrip()
                        header_list = header_line.split(self.delimiter)
                        field_names = list(self.field_dict.keys())
                        test_field_match = [h1.lower() == h2.lower() for h1, h2
                                            in zip(header_list, field_names)]
                        if not all(test_field_match):
                            logging.warning("Headers do not match field names"
                                            "File headers:\n {header_list}\n"
                                            "Field names: \n {field_names}\n"
                                            "".format(**locals()))
                            raise AirflowException("Headers do not match the "
                                            "field_dict keys")
                    with NamedTemporaryFile("w") as f_no_headers:
                        tmpf.seek(0)
                        next(tmpf)
                        for line in tmpf:
                            f_no_headers.write(line)
                        f_no_headers.flush()
                        logging.info("Loading file without headers into Hive")
                        self.hive.load_file(
                            f_no_headers.name,
                            self.hive_table,
                            field_dict=self.field_dict,
                            create=self.create,
                            partition=self.partition,
                            delimiter=self.delimiter,
                            recreate=self.recreate)
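
A hedged sketch of the wildcard_match and header-checking options this variant adds; the S3 layout, column names, and DAG settings are assumptions for illustration, not taken from the source.

from collections import OrderedDict
from datetime import datetime

from airflow import DAG

dag = DAG('s3_wildcard_to_hive_example', start_date=datetime(2015, 1, 1))

stage_export = S3ToHiveTransfer(
    task_id='stage_daily_export',
    s3_key='exports/{{ ds }}/part-*.csv',  # Unix-style pattern resolved via get_wildcard_key
    wildcard_match=True,
    headers=True,        # first line of the file holds column names,
    check_headers=True,  # which must match field_dict keys (case-insensitively)
    field_dict=OrderedDict([('user_id', 'BIGINT'),
                            ('country', 'STRING')]),
    hive_table='staging.daily_export',
    delimiter=',',
    dag=dag)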
Example #16
 def get_hook(self):
     return HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id,
                        run_as=self.run_as)
Example #17
 def ddl(self):
     table = request.args.get("table")
     sql = "SHOW CREATE TABLE {table};".format(table=table)
     h = HiveCliHook(HIVE_CLI_CONN_ID)
     return h.run_cli(sql)