예제 #1
0
    def setUpClass(cls):
        """Build the table-metadata fixtures shared by the tests.

        Stores on ``cls`` the raw row lists (``schema``, ``partitions``,
        ``info``, ``storage_info``), two three-column DataFrames glued
        together from those lists (with and without the partition section),
        and the result of ``parse_metadata`` for each DataFrame.
        """
        blank = ('', nan, nan)
        column_header = ('# col_name', 'data_type', 'comment')

        def title(text):
            # Section heading or label row: only the first cell is filled.
            return (text, nan, nan)

        def field(label, value):
            # Row with a name and a value but no comment.
            return (label, value, nan)

        def param(key, value):
            # Key/value row whose first cell is the empty string.
            return ('', key, value)

        def as_frame(sections):
            # Join the sections via the module helper and load the rows.
            records = _glue_lists_spacer(cls.spacer, sections)
            return pd.DataFrame.from_records(
                records, columns=['name', 'type', 'comment'])

        cls.spacer = blank

        cls.schema = [
            column_header,
            blank,
            field('foo', 'int'),
            field('bar', 'tinyint'),
            field('baz', 'bigint'),
        ]

        cls.partitions = [
            title('# Partition Information'),
            column_header,
            blank,
            field('qux', 'bigint'),
        ]

        cls.info = [
            title('# Detailed Table Information'),
            field('Database:', 'tpcds'),
            field('Owner:', 'wesm'),
            field('CreateTime:', 'Sun Nov 08 01:09:42 PST 2015'),
            field('LastAccessTime:', 'UNKNOWN'),
            field('Protect Mode:', 'None'),
            field('Retention:', '0'),
            field('Location:',
                  'hdfs://host-name:20500/my.db/dbname.table_name'),
            field('Table Type:', 'EXTERNAL_TABLE'),
            title('Table Parameters:'),
            param('EXTERNAL', 'TRUE'),
            param('STATS_GENERATED_VIA_STATS_TASK', 'true'),
            param('numRows', '183592'),
            param('transient_lastDdlTime', '1447340941'),
        ]

        cls.storage_info = [
            title('# Storage Information'),
            field('SerDe Library:',
                  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'),
            field('InputFormat:', 'org.apache.hadoop.mapred.TextInputFormat'),
            field(
                'OutputFormat:',
                'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'),
            field('Compressed:', 'No'),
            field('Num Buckets:', '0'),
            field('Bucket Columns:', '[]'),
            field('Sort Columns:', '[]'),
            title('Storage Desc Params:'),
            param('field.delim', '|'),
            param('serialization.format', '|'),
        ]

        # Partitioned variant includes the partition section; the
        # unpartitioned one omits it.
        cls.part_metadata = as_frame(
            [cls.schema, cls.partitions, cls.info, cls.storage_info])

        cls.unpart_metadata = as_frame(
            [cls.schema, cls.info, cls.storage_info])

        cls.parsed_part = parse_metadata(cls.part_metadata)
        cls.parsed_unpart = parse_metadata(cls.unpart_metadata)
예제 #2
0
    def setUpClass(cls):
        """Prepare class-level metadata fixtures for the test case.

        Sets ``spacer``, the four raw row lists (``schema``, ``partitions``,
        ``info``, ``storage_info``), the partitioned and unpartitioned
        metadata DataFrames built from them, and the ``parse_metadata``
        output for each DataFrame.
        """
        frame_columns = ('name', 'type', 'comment')
        col_header = ('# col_name', 'data_type', 'comment')

        def uncommented(pairs):
            # Expand (name, type) pairs into rows whose comment is nan.
            return [(name, kind, nan) for name, kind in pairs]

        def keyed(pairs):
            # Expand (key, value) pairs into rows with an empty first cell.
            return [('', key, value) for key, value in pairs]

        cls.spacer = ('', nan, nan)

        cls.schema = [col_header, cls.spacer] + uncommented([
            ('foo', 'int'),
            ('bar', 'tinyint'),
            ('baz', 'bigint'),
        ])

        cls.partitions = (
            uncommented([('# Partition Information', nan)])
            + [col_header, cls.spacer]
            + uncommented([('qux', 'bigint')])
        )

        cls.info = uncommented([
            ('# Detailed Table Information', nan),
            ('Database:', 'tpcds'),
            ('Owner:', 'wesm'),
            ('CreateTime:', 'Sun Nov 08 01:09:42 PST 2015'),
            ('LastAccessTime:', 'UNKNOWN'),
            ('Protect Mode:', 'None'),
            ('Retention:', '0'),
            ('Location:', 'hdfs://host-name:20500/my.db/dbname.table_name'),
            ('Table Type:', 'EXTERNAL_TABLE'),
            ('Table Parameters:', nan),
        ]) + keyed([
            ('EXTERNAL', 'TRUE'),
            ('STATS_GENERATED_VIA_STATS_TASK', 'true'),
            ('numRows', '183592'),
            ('transient_lastDdlTime', '1447369741'),
        ])

        cls.storage_info = uncommented([
            ('# Storage Information', nan),
            ('SerDe Library:',
             'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'),
            ('InputFormat:', 'org.apache.hadoop.mapred.TextInputFormat'),
            ('OutputFormat:',
             'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'),
            ('Compressed:', 'No'),
            ('Num Buckets:', '0'),
            ('Bucket Columns:', '[]'),
            ('Sort Columns:', '[]'),
            ('Storage Desc Params:', nan),
        ]) + keyed([
            ('field.delim', '|'),
            ('serialization.format', '|'),
        ])

        # With the partition section.
        partitioned_rows = _glue_lists_spacer(
            cls.spacer,
            [cls.schema, cls.partitions, cls.info, cls.storage_info])
        cls.part_metadata = pd.DataFrame.from_records(
            partitioned_rows, columns=list(frame_columns))

        # Without the partition section.
        unpartitioned_rows = _glue_lists_spacer(
            cls.spacer, [cls.schema, cls.info, cls.storage_info])
        cls.unpart_metadata = pd.DataFrame.from_records(
            unpartitioned_rows, columns=list(frame_columns))

        cls.parsed_part = parse_metadata(cls.part_metadata)
        cls.parsed_unpart = parse_metadata(cls.unpart_metadata)