Example #1
    def execute(self, context):
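        # Hooks for the Hive CLI target connection and the source Postgres connection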
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        pg = PostgresHook(postgres_conn_id=self.postgres_conn_id)

        logging.info("Dumping postgres query results to local file")
        conn = pg.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql, self.parameters)
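        # Dump the result set to a temporary CSV file. The encoding argument assumes
        # csv here is unicodecsv; the stdlib csv.writer does not accept encoding.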
        with NamedTemporaryFile("wb") as f:
            csv_writer = csv.writer(f,
                                    delimiter=self.delimiter,
                                    encoding="utf-8")
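            # Map each result column to a Hive type so load_file can create the table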
            field_dict = OrderedDict()
            for field in cursor.description:
                field_dict[field[0]] = self.type_map(field[1])
            csv_writer.writerows(cursor)
            f.flush()
            cursor.close()
            conn.close()
            logging.info("Loading file into Hive")

            hive.load_file(f.name,
                           self.hive_table,
                           field_dict=field_dict,
                           create=self.create,
                           partition=self.partition,
                           delimiter=self.delimiter,
                           recreate=self.recreate)
Example #2
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)

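        # Load Avro files already staged in HDFS into the Hive table; load_avro is
        # presumably a custom extension of HiveCliHook rather than a stock method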
        hive.load_avro(self.hdfs_dir,
                       self.hive_table,
                       schemafile=self.schemafile,
                       create=self.create,
                       recreate=self.recreate,
                       partition=self.partition)
Example #3
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        pg = PostgresHook(postgres_conn_id=self.postgres_conn_id)

        logging.info("Dumping postgres query results to local file")
        conn = pg.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql, self.parameters)
        with NamedTemporaryFile("wb") as f:
            csv_writer = csv.writer(f,
                                    delimiter=self.delimiter,
                                    encoding="utf-8")
            field_dict = OrderedDict()
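            # Columns whose names start with 'hkey_' are tracked by position so their
            # values can be SHA-1 hashed before loading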
            fields_to_hash = set()
            ctr = 0
            for field in cursor.description:
                field_dict[field[0]] = self.type_map(field[0], field[1])
                if field[0].startswith('hkey_'):
                    fields_to_hash.add(ctr)
                ctr += 1

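            # Append audit/metadata columns (data-vault style) to the Hive schema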
            field_dict['record_source'] = 'STRING'
            field_dict['load_dtm'] = 'TIMESTAMP'
            field_dict['seq_num'] = 'BIGINT'

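            # Re-emit each row with the hkey_ columns hashed and the audit columns appended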
            seq = 1
            for row in cursor:
                new_row = []
                for idx, val in enumerate(list(row)):
                    if idx in fields_to_hash:
                        m = hashlib.sha1()
                        # hashlib needs bytes; encode str values before hashing
                        if isinstance(val, str):
                            val = val.encode("utf-8")
                        m.update(val)
                        new_row.append(m.hexdigest().upper())
                    else:
                        new_row.append(val)

                csv_writer.writerow(new_row +
                                    [self.record_source, self.load_dtm, seq])
                seq += 1

            f.flush()
            cursor.close()
            conn.close()
            logging.info("Loading file into Hive")

            hive.load_file(f.name,
                           self.hive_table,
                           field_dict=field_dict,
                           create=self.create,
                           delimiter=self.delimiter,
                           recreate=self.recreate,
                           partition=self.partition)
Example #4
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
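        # FileHook appears to be a project-specific hook that resolves the relative
        # path against a configured base directory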
        file_hook = FileHook(file_conn_id=self.file_conn_id)
        abs_path = file_hook.complete_file_path(self.relative_file_path)

        logging.info("Retrieving file and loading into Hive")
        # schemafile is not an argument of the stock HiveCliHook.load_file, so this
        # presumably relies on a customized hook
        hive.load_file(abs_path,
                       self.hive_table,
                       schemafile=self.schemafile,
                       create=self.create,
                       recreate=self.recreate,
                       partition=self.partition)