def test_is_match(self):
    """
    Check that ParquetData.is_match accepts every registered input.

    Each entry of ``self.file_or_buf_list`` supplies a "path" value, and
    ``ParquetData.is_match`` must return a truthy result for it.
    """
    for entry in self.file_or_buf_list:
        candidate = entry["path"]
        self.assertTrue(ParquetData.is_match(candidate))
def test_is_structured(self):
    """
    Check how the ``data_format`` option affects ``is_structured``.

    A default-constructed ParquetData and one using the "dataframe"
    format must report ``is_structured`` as truthy; the "records" and
    "json" formats must report it as falsy.
    """
    # No options supplied: default construction
    self.assertTrue(ParquetData().is_structured)

    # Each explicit data_format paired with the assertion it must satisfy
    expectations = (
        ("dataframe", self.assertTrue),
        ("records", self.assertFalse),
        ("json", self.assertFalse),
    )
    for data_format, check in expectations:
        parquet = ParquetData(options={"data_format": data_format})
        check(parquet.is_structured)
def test_mixed_non_string_col(self):
    """
    Check that a mixed, non-string parquet column is loaded as text.

    Loads ``mixed_datetime_data_col.parquet`` and verifies that rows 1
    and 3 of "col2" are ``str`` instances that do not contain a
    bytes-literal prefix (``b"`` or ``b'``).
    """
    path_parts = (
        test_root_path,
        "data",
        "parquet",
        "mixed_datetime_data_col.parquet",
    )
    parq_data = ParquetData(os.path.join(*path_parts))
    rows_to_check = (1, 3)

    # values must be str, not bytes
    for row in rows_to_check:
        self.assertIsInstance(parq_data.data["col2"][row], str)

    # values must be the bare 'data', not a stringified 'b"data"'
    for row in rows_to_check:
        for bytes_prefix in ('b"', "b'"):
            self.assertNotIn(bytes_prefix, parq_data.data["col2"][row])
def test_file_encoding(self):
    """
    Check that ``file_encoding`` is None for every registered input.

    A ParquetData is built from the "path" of each entry in
    ``self.file_or_buf_list`` and its ``file_encoding`` is inspected.
    """
    for entry in self.file_or_buf_list:
        parquet = ParquetData(entry["path"])
        self.assertIsNone(parquet.file_encoding)