def get_table_and_columns(
            self, schema_name,
            table_name) -> Tuple[DataTable, List[DataColumn]]:
        """Build the table record and column list for a single table.

        Existence is checked through the engine dialect's ``has_table``;
        column details are read from ``self._inspect.get_columns``.

        Args:
            schema_name: Schema that contains the table.
            table_name: Name of the table to describe.

        Returns:
            ``(None, [])`` when the dialect reports that the table does not
            exist. Otherwise a ``DataTable`` with only its name filled in
            (every other metadata field is ``None`` and the raw description
            is an empty string) together with one ``DataColumn`` per column
            returned by the inspector.
        """
        table_exists = self._engine.dialect.has_table(
            self._conn, table_name=table_name, schema=schema_name)
        if not table_exists:
            return None, []

        # Only the name is populated here; the remaining fields are left
        # empty.
        table = DataTable(
            name=table_name,
            type=None,
            owner=None,
            table_created_at=None,
            table_updated_by=None,
            table_updated_at=None,
            data_size_bytes=None,
            location=None,
            partitions=None,
            raw_description="",
        )

        inspected_columns = self._inspect.get_columns(
            table_name=table_name, schema=schema_name)
        # The column comment is synthesized from the inspector's default and
        # nullable entries.
        columns = [
            DataColumn(
                name=info["name"],
                type=str(info["type"]),
                comment=
                f"Default:{info['default']} Nullable:{info['nullable']}",
            ) for info in inspected_columns
        ]

        return table, columns
示例#2
0
    def get_table_and_columns(
            self, schema_name,
            table_name) -> Tuple[DataTable, List[DataColumn]]:
        """Extend the parent loader's result with columns from a ``desc`` query.

        The parent implementation supplies the table and an initial column
        list. When a table is returned, ``desc <schema>.<table>`` is run
        through ``self._cursor`` and each distinct column name found in the
        output is appended to that list.

        Args:
            schema_name: Schema that contains the table.
            table_name: Name of the table to describe.

        Returns:
            The ``(table, columns)`` pair from the parent implementation, with
            the described columns appended when ``table`` is truthy.
        """
        table, columns = super(HMSThriftMetastoreLoader,
                               self).get_table_and_columns(
                                   schema_name, table_name)
        if not table:
            return table, columns

        self._cursor.run(f"desc {schema_name}.{table_name}", run_async=False)

        # Row 0 of the result only carries headers, so it is dropped.
        described_names = set()
        for row in self._cursor.get()[1:]:
            col_name = row[0]
            # Blank names and names starting with "#" are skipped
            # (presumably separator/section rows of the desc output --
            # TODO confirm).
            if col_name == "" or col_name.startswith("#"):
                continue
            # Only the first occurrence of a name is appended.
            if col_name in described_names:
                continue

            columns.append(
                DataColumn(name=col_name, type=row[1], comment=row[2]))
            described_names.add(col_name)

        return table, columns
    def get_table_and_columns(
            self, schema_name,
            table_name) -> Tuple[DataTable, List[DataColumn]]:
        """Load table metadata from ``INFORMATION_SCHEMA.TABLES`` plus columns.

        Args:
            schema_name: Schema that contains the table.
            table_name: Name of the table to describe.

        Returns:
            ``(None, [])`` when the metadata query yields no row. Otherwise a
            ``DataTable`` populated with the table type, create/update times
            (passed through ``DATETIME_TO_UTC`` when not ``None``) and
            ``data_length + index_length`` as the size, together with one
            ``DataColumn`` per column returned by ``self._inspect``.
        """
        # NOTE(review): the schema and table names are interpolated straight
        # into the SQL text; if they can ever come from untrusted input this
        # should become a bound-parameter query.
        table_info_query = f"""
            SELECT
                TABLE_TYPE,
                CREATE_TIME,
                UPDATE_TIME,
                data_length + index_length
            FROM
                INFORMATION_SCHEMA.TABLES
            WHERE
                TABLE_SCHEMA="{schema_name}" AND TABLE_NAME="{table_name}"
        """
        # Take the first result row, or None when the result is empty.
        raw_table_info = next(iter(self._engine.execute(table_info_query)),
                              None)
        if not raw_table_info:
            return None, []

        # Positions follow the SELECT list above.
        created_at = raw_table_info[1]
        updated_at = raw_table_info[2]

        table = DataTable(
            name=table_name,
            type=raw_table_info[0],
            owner=None,
            table_created_at=(None if created_at is None else
                              DATETIME_TO_UTC(created_at)),
            table_updated_by=None,
            table_updated_at=(None if updated_at is None else
                              DATETIME_TO_UTC(updated_at)),
            data_size_bytes=raw_table_info[3],
            location=None,
            partitions=None,
            raw_description=ujson.pdumps(list(raw_table_info)),
        )

        inspected_columns = self._inspect.get_columns(
            table_name=table_name, schema=schema_name)
        # The column comment is synthesized from the inspector's default and
        # nullable entries.
        columns = [
            DataColumn(
                name=info["name"],
                type=str(info["type"]),
                comment=
                f"Default:{info['default']} Nullable:{info['nullable']}",
            ) for info in inspected_columns
        ]

        return table, columns
示例#4
0
    def get_table_and_columns(
            self, schema_name: str,
            table_name: str) -> Tuple[DataTable, List[DataColumn]]:
        """Build the table record and column list from a Glue table definition.

        Args:
            schema_name: Glue database that contains the table.
            table_name: Name of the table to describe.

        Returns:
            ``(None, [])`` when the client response carries no ``"Table"``
            entry. Otherwise a ``DataTable`` built from the Glue table fields
            and a list of ``DataColumn`` objects: the storage descriptor
            columns first, followed by the partition keys. Partitions are
            fetched only when ``self.load_partitions`` is truthy.
        """
        glue_table = self.glue_client.get_table(schema_name,
                                                table_name).get("Table")
        if not glue_table:
            # Nothing to describe: report "not found" instead of failing on
            # attribute access against a missing table definition.
            return None, []

        if self.load_partitions:
            partitions = self.glue_client.get_hms_style_partitions(
                schema_name, table_name)
        else:
            partitions = []

        # "StorageDescriptor", its "Columns" and "PartitionKeys" may be
        # absent from the table definition, so each falls back to an empty
        # value rather than raising on None.
        storage_descriptor = glue_table.get("StorageDescriptor") or {}

        # Fallback used when Glue does not report a create/update time. The
        # naive datetime is kept as-is to preserve the existing values.
        epoch = datetime(1970, 1, 1)

        table = DataTable(
            name=glue_table.get("Name"),
            type=glue_table.get("TableType"),
            owner=glue_table.get("Owner"),
            table_created_at=int(
                glue_table.get("CreateTime", epoch).timestamp()),
            table_updated_at=int(
                glue_table.get("UpdateTime", epoch).timestamp()),
            location=storage_descriptor.get("Location"),
            partitions=partitions,
            raw_description=glue_table.get("Description"),
        )

        # Regular columns come first, then the partition keys.
        column_groups = (
            storage_descriptor.get("Columns"),
            glue_table.get("PartitionKeys"),
        )
        columns = [
            DataColumn(col.get("Name"), col.get("Type"), col.get("Comment"))
            for group in column_groups
            for col in group or []
        ]

        return table, columns
示例#5
0
    def get_table_and_columns(
            self, schema_name,
            table_name) -> Tuple[DataTable, List[DataColumn]]:
        """Build the table record and column list from a metastore description.

        Args:
            schema_name: Schema that contains the table.
            table_name: Name of the table to describe.

        Returns:
            ``(None, [])`` when ``get_hive_metastore_table_description``
            returns a falsy value. Otherwise a ``DataTable`` populated from
            the description and its parameters, together with one
            ``DataColumn`` for every storage descriptor column followed by
            every partition key.
        """
        description = get_hive_metastore_table_description(
            self.hmc, schema_name, table_name)
        if not description:
            return None, []

        table_params = description.parameters
        storage = description.sd

        if self.load_partitions:
            partitions = self.get_partitions(schema_name, table_name)
        else:
            partitions = []

        # Both values are converted with int() when present; absent entries
        # stay None.
        raw_modified_time = table_params.get("last_modified_time")
        updated_at = (None if raw_modified_time is None else
                      int(raw_modified_time))

        raw_total_size = table_params.get("totalSize")
        size_bytes = None if raw_total_size is None else int(raw_total_size)

        table = DataTable(
            name=description.tableName,
            type=description.tableType,
            owner=description.owner,
            table_created_at=description.createTime,
            table_updated_by=table_params.get("last_modified_by"),
            table_updated_at=updated_at,
            data_size_bytes=size_bytes,
            location=storage.location,
            partitions=partitions,
            # Objects the serializer cannot handle are dumped via their
            # __dict__.
            raw_description=ujson.pdumps(description,
                                         default=lambda obj: obj.__dict__),
        )

        columns = [
            DataColumn(name=field.name, type=field.type, comment=field.comment)
            for field in storage.cols + description.partitionKeys
        ]
        return table, columns
    def test_get_table_and_columns(self):
        """Check the loader's table and column output for a partitioned table.

        Creates a database, one table and two partitions through
        ``self.client``, then verifies that ``self.loader`` returns the
        expected ``DataTable`` and ``DataColumn`` list.
        """
        self.client.create_database(DatabaseInput={"Name": DB_NAME_A})
        self.client.create_table(DatabaseName=DB_NAME_A,
                                 TableInput=TABLE_INPUT_A_1)
        for partition_input in (PARTITION_INPUT_A_1, PARTITION_INPUT_A_2):
            self.client.create_partition(
                DatabaseName=DB_NAME_A,
                TableName=TABLE_NAME_A_1,
                PartitionInput=partition_input,
            )

        # Same fallback expression the expected table uses for both
        # timestamps.
        epoch_seconds = int(datetime(1970, 1, 1).timestamp())
        expected_table = DataTable(
            name=TABLE_NAME_A_1,
            type=TABLE_INPUT_A_1.get("TableType"),
            owner=TABLE_INPUT_A_1.get("Owner"),
            table_created_at=epoch_seconds,
            table_updated_at=epoch_seconds,
            location=f"s3://mybucket/{DB_NAME_A}/{TABLE_NAME_A_1}",
            partitions=[
                "partition_date=2021-01-01/partition_hour=15",
                "partition_date=2021-03-03/partition_hour=20",
            ],
            raw_description=TABLE_INPUT_A_1.get("Description"),
        )

        storage_columns = TABLE_INPUT_A_1.get("StorageDescriptor").get(
            "Columns")
        partition_keys = TABLE_INPUT_A_1.get("PartitionKeys")

        # The first two storage columns are expected with an explicit comment.
        expected_columns = [
            DataColumn(
                storage_columns[idx].get("Name"),
                storage_columns[idx].get("Type"),
                storage_columns[idx].get("Comment"),
            ) for idx in (0, 1)
        ]
        # The third storage column and both partition keys are expected with
        # the default comment.
        expected_columns.append(
            DataColumn(
                storage_columns[2].get("Name"),
                storage_columns[2].get("Type"),
            ))
        expected_columns.extend(
            DataColumn(
                partition_keys[idx].get("Name"),
                partition_keys[idx].get("Type"),
            ) for idx in (0, 1))

        result_table, result_columns = self.loader.get_table_and_columns(
            DB_NAME_A, TABLE_NAME_A_1)

        self.assertEqual(result_table, expected_table)
        self.assertEqual(result_columns, expected_columns)