def test_serialize_40mb_dataframe(self):
    """Round-trip a medium-size (~40 MB) CSV blob through serialize_dataframe.

    Downloads the reference blob, serializes the parsed frame as GenericCSV,
    re-parses the output, and checks both data equality and that the
    serialization step completes in under 10 seconds.
    """
    # Arrange: fetch the reference CSV from blob storage and parse it.
    blob_name = settings.storage.medium_size_blob
    source_bytes = self.blob.get_blob_to_bytes(settings.storage.container, blob_name)
    source_frame = pd.read_csv(BytesIO(source_bytes), header=0, sep=',', encoding='utf-8-sig')
    self._write_blob_contents(blob_name, source_bytes)

    # Act: serialize the frame, timing only the serialization call.
    started = datetime.now()
    buffer = BytesIO()
    serialize_dataframe(buffer, DataTypeIds.GenericCSV, source_frame)
    elapsed = datetime.now() - started
    serialized = buffer.getvalue()
    self._write_serialized_frame(blob_name, serialized)

    # Assert: the data survives the round trip and serialization was fast enough.
    round_tripped = pd.read_csv(BytesIO(serialized), header=0, sep=',', encoding='utf-8-sig')
    assert_frame_equal(source_frame, round_tripped)
    self.assertLess(elapsed.total_seconds(), 10)
def test_serialize_to_plain_text(self):
    """PlainText serialization emits one newline-terminated line per row."""
    # Arrange
    lines = ['This is the first', 'This is second line']
    frame = pd.DataFrame(lines)

    # Act
    buffer = BytesIO()
    serialize_dataframe(buffer, DataTypeIds.PlainText, frame)
    output = buffer.getvalue()

    # Assert
    self.assertGreater(len(output), 0)
    self.assertEqual(output, b'This is the first\nThis is second line\n')
def test_serialize_to_tsv(self):
    """GenericTSV output is tab-separated with a header; missing cells are empty."""
    # Arrange: second row introduces column 'c', leaving row one's cell empty.
    rows = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}]
    frame = pd.DataFrame(rows)

    # Act
    buffer = BytesIO()
    serialize_dataframe(buffer, DataTypeIds.GenericTSV, frame)
    output = buffer.getvalue()

    # Assert
    self.assertGreater(len(output), 0)
    self.assertEqual(output, b'a\tb\tc\n1.0\t2.0\t\n5.1\t10.1\t20.1\n')
def test_serialize_to_csv_no_header(self):
    """GenericCSVNoHeader output is comma-separated with no header row."""
    # Arrange: second row introduces column 'c', leaving row one's cell empty.
    rows = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}]
    frame = pd.DataFrame(rows)

    # Act
    buffer = BytesIO()
    serialize_dataframe(buffer, DataTypeIds.GenericCSVNoHeader, frame)
    output = buffer.getvalue()

    # Assert
    self.assertGreater(len(output), 0)
    self.assertEqual(output, b'1.0,2.0,\n5.1,10.1,20.1\n')
def test_serialize_to_csv(self):
    """GenericCSV output is comma-separated with a header; missing cells are empty."""
    # Arrange: second row introduces column 'c', leaving row one's cell empty.
    rows = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}]
    frame = pd.DataFrame(rows)

    # Act
    buffer = BytesIO()
    serialize_dataframe(buffer, DataTypeIds.GenericCSV, frame)
    output = buffer.getvalue()

    # Assert
    self.assertGreater(len(output), 0)
    self.assertEqual(output, b'a,b,c\n1.0,2.0,\n5.1,10.1,20.1\n')
def test_serialize_to_tsv_no_header(self):
    """GenericTSVNoHeader output is tab-separated with no header row."""
    # Arrange: second row introduces column 'c', leaving row one's cell empty.
    rows = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}]
    frame = pd.DataFrame(rows)

    # Act
    buffer = BytesIO()
    serialize_dataframe(buffer, DataTypeIds.GenericTSVNoHeader, frame)
    output = buffer.getvalue()

    # Assert
    self.assertGreater(len(output), 0)
    self.assertEqual(output, b'1.0\t2.0\t\n5.1\t10.1\t20.1\n')
def test_serialize_to_tsv_repeated(self):
    """GenericTSV output is tab-separated with a header; missing cells are empty.

    NOTE(review): this was a second definition of ``test_serialize_to_tsv``,
    which silently shadowed the earlier one so only one of the two ever ran.
    Renamed so both tests are collected by the test runner.
    """
    # Arrange: second row introduces column 'c', leaving row one's cell empty.
    data = [{"a": 1.0, "b": 2.0}, {"a": 5.1, "b": 10.1, "c": 20.1}]
    dataframe = pd.DataFrame(data)

    # Act
    writer = BytesIO()
    serialize_dataframe(writer, DataTypeIds.GenericTSV, dataframe)
    result = writer.getvalue()

    # Assert
    self.assertGreater(len(result), 0)
    self.assertEqual(result, b"a\tb\tc\n1.0\t2.0\t\n5.1\t10.1\t20.1\n")
def test_serialize_to_csv_no_header_repeated(self):
    """GenericCSVNoHeader output is comma-separated with no header row.

    NOTE(review): this was a second definition of
    ``test_serialize_to_csv_no_header``, which silently shadowed the earlier
    one so only one of the two ever ran. Renamed so both tests are collected
    by the test runner.
    """
    # Arrange: second row introduces column 'c', leaving row one's cell empty.
    data = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}]
    dataframe = pd.DataFrame(data)

    # Act
    writer = BytesIO()
    serialize_dataframe(writer, DataTypeIds.GenericCSVNoHeader, dataframe)
    result = writer.getvalue()

    # Assert
    self.assertGreater(len(result), 0)
    self.assertEqual(result, b'1.0,2.0,\n5.1,10.1,20.1\n')
def test_deserialize_from_csv_no_header(self):
    """Headerless CSV deserializes with integer column labels; 'nan'/blank cells become NaN."""
    # Arrange
    raw = b'1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'

    # Act
    frame = deserialize_dataframe(BytesIO(raw), DataTypeIds.GenericCSVNoHeader)

    # Assert: rows omit the NaN cells; pd.DataFrame fills them back in as NaN.
    self.assertIsNotNone(frame)
    expected_rows = [
        {0: 1.0, 1: 2.0},
        {0: 5.1, 1: 10.1, 2: 20.1},
        {0: 50.2, 2: 50.3},
    ]
    assert_frame_equal(pd.DataFrame(expected_rows), frame)
def test_deserialize_from_csv_spaces(self):
    """CSV with spaces after commas and quoted fields deserializes with values trimmed."""
    # Arrange: mixes leading spaces, a quoted multi-word field, and empty/nan cells.
    raw = b'a, b, c\n1.0, two, nan\n5.1, "ten point one", 20.1\n50.2, , 50.3\n'

    # Act
    frame = deserialize_dataframe(BytesIO(raw), DataTypeIds.GenericCSV)

    # Assert: rows omit the NaN cells; pd.DataFrame fills them back in as NaN.
    self.assertIsNotNone(frame)
    expected_rows = [
        {'a': 1.0, 'b': 'two'},
        {'a': 5.1, 'b': 'ten point one', 'c': 20.1},
        {'a': 50.2, 'c': 50.3},
    ]
    assert_frame_equal(pd.DataFrame(expected_rows), frame)
def test_deserialize_from_csv_bom(self):
    """A UTF-8 BOM (EF BB BF) at the start of the CSV is stripped, not read into the first header."""
    # Arrange
    raw = b'\xef\xbb\xbfa,b,c\n1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'

    # Act
    frame = deserialize_dataframe(BytesIO(raw), DataTypeIds.GenericCSV)

    # Assert: column 'a' is clean (no BOM prefix); NaN cells filled by pd.DataFrame.
    self.assertIsNotNone(frame)
    expected_rows = [
        {'a': 1.0, 'b': 2.0},
        {'a': 5.1, 'b': 10.1, 'c': 20.1},
        {'a': 50.2, 'c': 50.3},
    ]
    assert_frame_equal(pd.DataFrame(expected_rows), frame)
def test_deserialize_from_arff(self):
    """ARFF input deserializes using the @ATTRIBUTE names as columns.

    Fix: removed a leftover debug ``print(result)`` between Act and Assert.
    """
    # Arrange
    data = b"""@RELATION Unnamed

@ATTRIBUTE Class NUMERIC
@ATTRIBUTE age NUMERIC
@ATTRIBUTE menopause NUMERIC
@ATTRIBUTE tumor-size NUMERIC

@DATA
0,5,1,1
0,5,4,4
1,4,8,8
"""

    # Act
    reader = BytesIO(data)
    result = deserialize_dataframe(reader, DataTypeIds.ARFF)

    # Assert: NUMERIC attributes come back as floats.
    self.assertIsNotNone(result)
    expected = [
        {'Class': 0., 'age': 5., 'menopause': 1., 'tumor-size': 1.},
        {'Class': 0., 'age': 5., 'menopause': 4., 'tumor-size': 4.},
        {'Class': 1., 'age': 4., 'menopause': 8., 'tumor-size': 8.},
    ]
    assert_frame_equal(pd.DataFrame(expected), result)
def test_deserialize_from_unsupported_data_type_id(self):
    """An unknown data type id raises UnsupportedDatasetTypeError."""
    # Arrange
    raw = b'1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'

    # Act / Assert
    with self.assertRaises(UnsupportedDatasetTypeError):
        deserialize_dataframe(BytesIO(raw), 'Unsupported')
def test_deserialize_from_plain_text_bom(self):
    """PlainText with a UTF-8 BOM deserializes to one row per line, BOM stripped, CRLF handled."""
    # Arrange: BOM prefix plus two lines separated by \r\n, no trailing newline.
    raw = b'\xef\xbb\xbfJohn enjoyed his vacation in California. His personal favorite on the trip was Los Angeles.\r\nMicrosoft announced upgrades to their line of products for information workers. The announcement was made at a partner conference at Boston.'

    # Act
    frame = deserialize_dataframe(BytesIO(raw), DataTypeIds.PlainText)

    # Assert: single integer-labelled column, one row per input line.
    self.assertIsNotNone(frame)
    expected_rows = [
        {0: 'John enjoyed his vacation in California. His personal favorite on the trip was Los Angeles.'},
        {0: 'Microsoft announced upgrades to their line of products for information workers. The announcement was made at a partner conference at Boston.'},
    ]
    assert_frame_equal(pd.DataFrame(expected_rows), frame)
def test_download_blob_then_upload_as_dataframe_then_read_dataset(self):
    """End-to-end round trip for every configured test blob.

    Downloads each blob, parses it with pandas, uploads the frame as a new
    dataset, then reads the dataset back and checks frame equality.
    """

    def data_type_for(header, fmt):
        # Map the blob naming convention onto a DataTypeIds value.
        if fmt == 'csv':
            return DataTypeIds.GenericCSV if header == 'wh' else DataTypeIds.GenericCSVNoHeader
        if fmt == 'tsv':
            return DataTypeIds.GenericTSV if header == 'wh' else DataTypeIds.GenericTSVNoHeader
        if fmt == 'txt':
            return DataTypeIds.PlainText
        self.assertTrue(False, 'Unexpected format')

    def split_blob_name(blob_name):
        # Blob naming convention:
        #   name_<header>.<format>
        #   <header>: WH = with header, NH = no header
        #   <format>: CSV = comma separated, TSV = tab separated,
        #             TXT = newline separated (always treated as no-header)
        base, fmt = blob_name.lower().split('.')
        if fmt != 'txt':
            base, header = base.split('_')
        else:
            header = 'nh'
        return base, fmt, header

    for blob_name in settings.storage.blobs:
        print(blob_name)
        name, fmt, header = split_blob_name(blob_name)

        # Download the reference bytes from blob storage.
        source_bytes = self.blob.get_blob_to_bytes(settings.storage.container, blob_name)
        self._write_blob_contents(blob_name, source_bytes)

        # Parse with pandas using the separator implied by the format.
        source_frame = pd.read_csv(
            BytesIO(source_bytes),
            header=0 if header == 'wh' else None,
            sep=',' if fmt == 'csv' else '\t' if fmt == 'tsv' else '\n',
            encoding='utf-8-sig'
        )

        # Upload the dataframe as a brand-new, disposable dataset.
        dataset_name = 'unittest' + name + id_generator()
        description = 'safe to be deleted - ' + dataset_name
        self.workspace.datasets.add_from_dataframe(
            source_frame,
            data_type_for(header, fmt),
            dataset_name,
            description,
        )

        # Fetch the dataset back.
        dataset = self.workspace.datasets[dataset_name]
        self.assertIsNotNone(dataset)

        # Read it both as raw bytes (saved for inspection) and as a frame.
        raw_result = dataset.read_as_binary()
        self._write_serialized_frame(blob_name, raw_result)
        result_frame = dataset.to_dataframe()

        # The round-tripped frame must equal the original.
        assert_frame_equal(source_frame, result_frame)