def test_uint64(self):
     """Verify a negative int64 value is reinterpreted as unsigned 64-bit."""
     element = pt.SchemaElement(
         type=pt.Type.INT64,
         name="test",
         converted_type=pt.ConvertedType.UINT_64,
     )
     converted = convert_column([-6884376], element)
     self.assertEqual(converted[0], 18446744073702667240)
 def test_utf8(self):
     """Verify UTF-8 encoded bytes are converted into the matching str."""
     element = pt.SchemaElement(
         type=pt.Type.BYTE_ARRAY,
         name="test",
         converted_type=pt.ConvertedType.UTF8,
     )
     raw = b'foo\xf0\x9f\x91\xbe'
     decoded = convert_column([raw], element)
     self.assertEqual(decoded[0], 'foo👾')
예제 #3
0
 def test_utf8_empty_string(self):
     """Verify empty byte strings convert alongside a non-empty UTF-8 value."""
     element = pt.SchemaElement(
         type=pt.Type.BYTE_ARRAY,
         name="test",
         converted_type=pt.ConvertedType.UTF8,
     )
     raw_values = [b'', b'foo\xf0\x9f\x91\xbe', b'']
     expected = ['', 'foo👾', '']
     self.assertEqual(convert_column(raw_values, element), expected)
 def test_json(self):
     """Verify JSON bytes are parsed into the equivalent Python object."""
     element = pt.SchemaElement(
         type=pt.Type.BYTE_ARRAY,
         name="test",
         converted_type=pt.ConvertedType.JSON,
     )
     payload = b'{"foo": ["bar", "\\ud83d\\udc7e"]}'
     parsed = convert_column([payload], element)
     self.assertEqual(parsed[0], {'foo': ['bar', '👾']})
 def test_date(self):
     """Verify an int32 DATE value is converted into a datetime.date."""
     element = pt.SchemaElement(type=pt.Type.INT32,
                                name="test",
                                converted_type=pt.ConvertedType.DATE)
     expected = datetime.date(2004, 11, 3)
     converted = convert_column([731888], element)
     self.assertEqual(converted[0], expected)
 def test_time_millis(self):
     """Verify an int32 TIME_MILLIS value becomes a timedelta in milliseconds."""
     element = pt.SchemaElement(type=pt.Type.INT32,
                                name="test",
                                converted_type=pt.ConvertedType.TIME_MILLIS)
     millis = 731888
     converted = convert_column([millis], element)
     self.assertEqual(converted[0], datetime.timedelta(milliseconds=731888))
 def test_timestamp_millis(self):
     """Verify an int64 TIMESTAMP_MILLIS value becomes a datetime."""
     element = pt.SchemaElement(type=pt.Type.INT64,
                                name="test",
                                converted_type=pt.ConvertedType.TIMESTAMP_MILLIS)
     expected = datetime.datetime(2004, 11, 3, 19, 53, 45, 14 * 1000)
     converted = convert_column([1099511625014], element)
     self.assertEqual(converted[0], expected)
    def test_int32(self):
        """Verify DECIMAL conversion of a value declared as INT32."""
        element = pt.SchemaElement(
            type=pt.Type.INT32,
            name="test",
            converted_type=pt.ConvertedType.DECIMAL,
            scale=10,
            precision=9,
        )
        # NOTE(review): 9876543210 exceeds the signed 32-bit range, and the
        # expected value equals unscaled * 10**-9 although scale=10; confirm
        # both are intended by the converter under test.
        converted = convert_column([9876543210], element)
        self.assertEqual(converted[0], Decimal('9.87654321'))
 def test_bson(self):
     """Verify a BSON-encoded document is decoded into a dict."""
     element = pt.SchemaElement(
         type=pt.Type.BYTE_ARRAY,
         name="test",
         converted_type=pt.ConvertedType.BSON,
     )
     payload = (
         b'&\x00\x00\x00\x04foo\x00\x1c\x00\x00\x00\x020'
         b'\x00\x04\x00\x00\x00bar\x00\x021\x00\x05\x00\x00\x00\xf0\x9f\x91\xbe\x00\x00\x00'
     )
     decoded = convert_column([payload], element)
     self.assertEqual(decoded[0], {'foo': ['bar', '👾']})
    def test_binary(self):
        """Verify DECIMAL conversion of a value stored as a byte array."""
        element = pt.SchemaElement(
            type=pt.Type.BYTE_ARRAY,
            name="test",
            converted_type=pt.ConvertedType.DECIMAL,
            scale=3,
            precision=13,
        )
        raw = b'\x02\x00\x00\x00\x00\x00\x00\x00\x00\x01'
        # NOTE(review): the expected value equals the big-endian integer
        # times 10**-2 although scale=3; confirm the intended scaling.
        converted = convert_column([raw], element)
        self.assertEqual(converted[0], Decimal('94447329657392904273.93'))
    def test_int64(self):
        """Verify DECIMAL conversion of a value stored as INT64."""
        element = pt.SchemaElement(
            type=pt.Type.INT64,
            name="test",
            converted_type=pt.ConvertedType.DECIMAL,
            scale=3,
            precision=13,
        )
        # NOTE(review): the expected value equals unscaled * 10**-2 although
        # scale=3; confirm the intended scaling.
        converted = convert_column([1099511627776], element)
        self.assertEqual(converted[0], Decimal('10995116277.76'))
예제 #12
0
 def test_json(self):
     """Check that a JSON byte string converts to the equivalent dict."""
     element = pt.SchemaElement(type=pt.Type.BYTE_ARRAY,
                                name="test",
                                converted_type=pt.ConvertedType.JSON)
     payload = b'{"foo": ["bar", "\\ud83d\\udc7e"]}'
     expected = {'foo': ['bar', '👾']}
     self.assertEqual(convert_column([payload], element)[0], expected)
예제 #13
0
 def test_time_millis(self):
     """Check that an int32 TIME_MILLIS value converts to a timedelta."""
     element = pt.SchemaElement(type=pt.Type.INT32,
                                name="test",
                                converted_type=pt.ConvertedType.TIME_MILLIS)
     expected = datetime.timedelta(milliseconds=731888)
     self.assertEqual(convert_column([731888], element)[0], expected)
예제 #14
0
 def test_timestamp_millis(self):
     """Check that an int64 TIMESTAMP_MILLIS value converts to a datetime."""
     element = pt.SchemaElement(type=pt.Type.INT64,
                                name="test",
                                converted_type=pt.ConvertedType.TIMESTAMP_MILLIS)
     expected = datetime.datetime(2004, 11, 3, 19, 53, 45, 14 * 1000)
     self.assertEqual(convert_column([1099511625014], element)[0], expected)
예제 #15
0
 def test_date(self):
     """Check that an int32 DATE value converts to a datetime.date."""
     element = pt.SchemaElement(type=pt.Type.INT32,
                                name="test",
                                converted_type=pt.ConvertedType.DATE)
     expected = datetime.date(2004, 11, 3)
     self.assertEqual(convert_column([731888], element)[0], expected)
    def test_fixedlength(self):
        """Verify DECIMAL conversion of a fixed-length byte array value."""
        element = pt.SchemaElement(
            type=pt.Type.FIXED_LEN_BYTE_ARRAY,
            type_length=3,
            name="test",
            converted_type=pt.ConvertedType.DECIMAL,
            scale=3,
            precision=13,
        )
        # NOTE(review): the expected value equals the big-endian integer
        # (131073) times 10**-2 although scale=3; confirm the intended scaling.
        converted = convert_column([b'\x02\x00\x01'], element)
        self.assertEqual(converted[0], Decimal('1310.73'))
예제 #17
0
 def test_uint16(self):
     """Check that a negative int32 is reinterpreted as an unsigned 16-bit value."""
     element = pt.SchemaElement(type=pt.Type.INT32,
                                name="test",
                                converted_type=pt.ConvertedType.UINT_16)
     converted = convert_column([-3], element)
     self.assertEqual(converted[0], 65533)
예제 #18
0
 def test_uint32(self):
     """Check that a negative int32 is reinterpreted as an unsigned 32-bit value."""
     element = pt.SchemaElement(type=pt.Type.INT32,
                                name="test",
                                converted_type=pt.ConvertedType.UINT_32)
     converted = convert_column([-6884376], element)
     self.assertEqual(converted[0], 4288082920)
예제 #19
0
 def test_uint64(self):
     """Check that a negative int64 is reinterpreted as an unsigned 64-bit value."""
     element = pt.SchemaElement(type=pt.Type.INT64,
                                name="test",
                                converted_type=pt.ConvertedType.UINT_64)
     converted = convert_column([-6884376], element)
     self.assertEqual(converted[0], 18446744073702667240)
예제 #20
0
 def test_utf8(self):
     """Check that UTF-8 encoded bytes convert to the matching str."""
     element = pt.SchemaElement(type=pt.Type.BYTE_ARRAY,
                                name="test",
                                converted_type=pt.ConvertedType.UTF8)
     raw = b'foo\xf0\x9f\x91\xbe'
     decoded = convert_column([raw], element)
     self.assertEqual(decoded[0], 'foo👾')
예제 #21
0
 def test_utf8_empty_string(self):
     """Check UTF-8 conversion of a column that includes empty byte strings."""
     element = pt.SchemaElement(type=pt.Type.BYTE_ARRAY,
                                name="test",
                                converted_type=pt.ConvertedType.UTF8)
     raw_values = [b'', b'foo\xf0\x9f\x91\xbe', b'']
     expected = ['', 'foo👾', '']
     self.assertEqual(convert_column(raw_values, element), expected)
예제 #22
0
 def test_bson(self):
     """Check that a BSON-encoded document converts to the equivalent dict."""
     element = pt.SchemaElement(type=pt.Type.BYTE_ARRAY,
                                name="test",
                                converted_type=pt.ConvertedType.BSON)
     payload = (
         b'&\x00\x00\x00\x04foo\x00\x1c\x00\x00\x00\x020'
         b'\x00\x04\x00\x00\x00bar\x00\x021\x00\x05\x00\x00\x00\xf0\x9f\x91\xbe\x00\x00\x00'
     )
     expected = {'foo': ['bar', '👾']}
     self.assertEqual(convert_column([payload], element)[0], expected)
예제 #23
0
    def test_int32(self):
        """Check DECIMAL conversion of a value declared as INT32."""
        element = pt.SchemaElement(type=pt.Type.INT32,
                                   name="test",
                                   converted_type=pt.ConvertedType.DECIMAL,
                                   scale=10,
                                   precision=9)
        # NOTE(review): 9876543210 exceeds the signed 32-bit range, and the
        # expected value equals unscaled * 10**-9 although scale=10; confirm
        # both are intended by the converter under test.
        converted = convert_column([9876543210], element)
        self.assertEqual(converted[0], Decimal('9.87654321'))
예제 #24
0
    def test_int64(self):
        """Check DECIMAL conversion of a value stored as INT64."""
        element = pt.SchemaElement(type=pt.Type.INT64,
                                   name="test",
                                   converted_type=pt.ConvertedType.DECIMAL,
                                   scale=3,
                                   precision=13)
        # NOTE(review): the expected value equals unscaled * 10**-2 although
        # scale=3; confirm the intended scaling.
        converted = convert_column([1099511627776], element)
        self.assertEqual(converted[0], Decimal('10995116277.76'))
예제 #25
0
    def test_binary(self):
        """Check DECIMAL conversion of a value stored as a byte array."""
        element = pt.SchemaElement(type=pt.Type.BYTE_ARRAY,
                                   name="test",
                                   converted_type=pt.ConvertedType.DECIMAL,
                                   scale=3,
                                   precision=13)
        raw = b'\x02\x00\x00\x00\x00\x00\x00\x00\x00\x01'
        # NOTE(review): the expected value equals the big-endian integer
        # times 10**-2 although scale=3; confirm the intended scaling.
        converted = convert_column([raw], element)
        self.assertEqual(converted[0], Decimal('94447329657392904273.93'))
예제 #26
0
    def test_fixedlength(self):
        """Check DECIMAL conversion of a fixed-length byte array value."""
        element = pt.SchemaElement(type=pt.Type.FIXED_LEN_BYTE_ARRAY,
                                   type_length=3,
                                   name="test",
                                   converted_type=pt.ConvertedType.DECIMAL,
                                   scale=3,
                                   precision=13)
        # NOTE(review): the expected value equals the big-endian integer
        # (131073) times 10**-2 although scale=3; confirm the intended scaling.
        converted = convert_column([b'\x02\x00\x01'], element)
        self.assertEqual(converted[0], Decimal('1310.73'))
예제 #27
0
 def get_columns(self, columns=None):
     """
     Load given columns as a dataframe.

     Walks every row group in ``self.rg``, reads the pages of each
     selected column chunk from ``self.fo`` and accumulates the decoded
     values per column name. Column names are the dot-joined, decoded
     ``path_in_schema`` of each column chunk.

     Columns is either a list (a subset of self.cols), or if None (or
     otherwise falsy), gets all columns in ``self.cols``.

     After loading, every requested column whose schema element has a
     truthy ``converted_type`` is passed through ``convert_column``.

     Returns a pandas DataFrame built from the accumulated values.
     """
     columns = columns or self.cols
     res = defaultdict(list)
     # Alternative to appending values to a list is to make arrays
     # beforehand using the schema, and assign
     for rg in self.rg:
         # Alternative to reading whole file: iterate over row-groups
         # or be able to limit max number of rows returned
         for col in rg.columns:
             cmd = col.meta_data
             name = ".".join(x.decode() for x in cmd.path_in_schema)
             # Filter before touching the schema: unselected columns should
             # neither pay for a schema scan nor fail on a missing entry.
             if name not in columns:
                 continue
             matches = [s for s in self.schema if s.fullname == name]
             width = matches[0].type_length
             offset = parquet._get_offset(cmd)
             self.fo.seek(offset, 0)
             # The page readers receive the type length via the metadata.
             cmd.width = width
             values_seen = 0
             dict_items = []
             while values_seen < rg.num_rows:
                 ph = parquet._read_page_header(self.fo)
                 if ph.type == parquet.PageType.DATA_PAGE:
                     values = parquet.read_data_page(
                         self.fo, self.schema_helper, ph, cmd, dict_items)
                     res[name] += values
                     values_seen += ph.data_page_header.num_values
                 else:
                     # Every non-data page is read as a dictionary page.
                     dict_items = parquet.read_dictionary_page(
                         self.fo, ph, cmd, width)
     out = pd.DataFrame(res)
     for col in columns:
         schemae = [s for s in self.schema if col == s.name.decode()][0]
         # NOTE(review): this is a truthiness test, so a converted_type
         # whose enum value is 0 is skipped as well as None -- confirm
         # whether `is not None` is what is intended here.
         if schemae.converted_type:
             out[col] = convert_column(out[col], schemae)
     return out
 def test_uint16(self):
     """Verify a negative int32 value is reinterpreted as unsigned 16-bit."""
     element = pt.SchemaElement(
         type=pt.Type.INT32,
         name="test",
         converted_type=pt.ConvertedType.UINT_16,
     )
     converted = convert_column([-3], element)
     self.assertEqual(converted[0], 65533)
 def test_uint32(self):
     """Verify a negative int32 value is reinterpreted as unsigned 32-bit."""
     element = pt.SchemaElement(
         type=pt.Type.INT32,
         name="test",
         converted_type=pt.ConvertedType.UINT_32,
     )
     converted = convert_column([-6884376], element)
     self.assertEqual(converted[0], 4288082920)