Example #1
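These snippets are individual test methods lifted from a unittest-style test class that exercises the dataframe serialization helpers. To keep them close to runnable, a minimal sketch of the imports they rely on is given below; the standard-library and pandas imports are certain, while the project-specific names (serialize_dataframe, deserialize_dataframe, DataTypeIds, UnsupportedDatasetTypeError, settings, id_generator) and the test fixtures (self.blob, self._write_blob_contents, self._write_serialized_frame, self.workspace) are assumed to come from the library and test harness under test, so their import paths are left as a comment rather than guessed.

from datetime import datetime
from io import BytesIO

import pandas as pd
from pandas.testing import assert_frame_equal  # pandas.util.testing.assert_frame_equal in older pandas

# serialize_dataframe, deserialize_dataframe, DataTypeIds, UnsupportedDatasetTypeError,
# settings, id_generator and the blob/workspace fixtures are assumed to be provided by
# the library and test setup; their import paths are omitted here rather than invented.
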
    def test_serialize_40mb_dataframe(self):
        # Arrange
        blob_name = settings.storage.medium_size_blob
        original_data = self.blob.get_blob_to_bytes(settings.storage.container,
                                                    blob_name)
        original_dataframe = pd.read_csv(BytesIO(original_data),
                                         header=0,
                                         sep=",",
                                         encoding='utf-8-sig')

        self._write_blob_contents(blob_name, original_data)

        # Act
        start_time = datetime.now()
        writer = BytesIO()
        serialize_dataframe(writer, DataTypeIds.GenericCSV, original_dataframe)
        elapsed_time = datetime.now() - start_time
        result_data = writer.getvalue()

        self._write_serialized_frame(blob_name, result_data)

        # Assert
        result_dataframe = pd.read_csv(BytesIO(result_data),
                                       header=0,
                                       sep=",",
                                       encoding='utf-8-sig')
        assert_frame_equal(original_dataframe, result_dataframe)
        self.assertLess(elapsed_time.total_seconds(), 10)

    def test_serialize_to_plain_text(self):
        # Arrange
        data = ['This is the first', 'This is second line']
        dataframe = pd.DataFrame(data)

        # Act
        writer = BytesIO()
        serialize_dataframe(writer, DataTypeIds.PlainText, dataframe)
        result = writer.getvalue()

        # Assert
        self.assertGreater(len(result), 0)
        self.assertEqual(result, b'This is the first\nThis is second line\n')

    def test_serialize_to_tsv(self):
        # Arrange
        data = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}]
        dataframe = pd.DataFrame(data)

        # Act
        writer = BytesIO()
        serialize_dataframe(writer, DataTypeIds.GenericTSV, dataframe)
        result = writer.getvalue()

        # Assert
        self.assertGreater(len(result), 0)
        self.assertEqual(result, b'a\tb\tc\n1.0\t2.0\t\n5.1\t10.1\t20.1\n')

    def test_serialize_to_csv_no_header(self):
        # Arrange
        data = [{"a": 1.0, "b": 2.0}, {"a": 5.1, "b": 10.1, "c": 20.1}]
        dataframe = pd.DataFrame(data)

        # Act
        writer = BytesIO()
        serialize_dataframe(writer, DataTypeIds.GenericCSVNoHeader, dataframe)
        result = writer.getvalue()

        # Assert
        self.assertGreater(len(result), 0)
        self.assertEqual(result, b"1.0,2.0,\n5.1,10.1,20.1\n")

    def test_serialize_to_csv(self):
        # Arrange
        data = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}]
        dataframe = pd.DataFrame(data)

        # Act
        writer = BytesIO()
        serialize_dataframe(writer, DataTypeIds.GenericCSV, dataframe)
        result = writer.getvalue()

        # Assert
        self.assertGreater(len(result), 0)
        self.assertEqual(result, b'a,b,c\n1.0,2.0,\n5.1,10.1,20.1\n')

    def test_serialize_to_tsv_no_header(self):
        # Arrange
        data = [{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}]
        dataframe = pd.DataFrame(data)

        # Act
        writer = BytesIO()
        serialize_dataframe(writer, DataTypeIds.GenericTSVNoHeader, dataframe)
        result = writer.getvalue()

        # Assert
        self.assertGreater(len(result), 0)
        self.assertEqual(result, b'1.0\t2.0\t\n5.1\t10.1\t20.1\n')

    def test_deserialize_from_csv_no_header(self):
        # Arrange
        data = b'1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'

        # Act
        reader = BytesIO(data)
        result = deserialize_dataframe(reader, DataTypeIds.GenericCSVNoHeader)

        # Assert
        self.assertIsNotNone(result)
        expected = [
            {
                0: 1.0,
                1: 2.0
            },
            {
                0: 5.1,
                1: 10.1,
                2: 20.1
            },
            {
                0: 50.2,
                2: 50.3
            },
        ]
        assert_frame_equal(pd.DataFrame(expected), result)

    def test_deserialize_from_csv_spaces(self):
        # Arrange
        data = b'a, b, c\n1.0, two, nan\n5.1, "ten point one", 20.1\n50.2, , 50.3\n'

        # Act
        reader = BytesIO(data)
        result = deserialize_dataframe(reader, DataTypeIds.GenericCSV)

        # Assert
        self.assertIsNotNone(result)
        expected = [
            {
                'a': 1.0,
                'b': 'two'
            },
            {
                'a': 5.1,
                'b': 'ten point one',
                'c': 20.1
            },
            {
                'a': 50.2,
                'c': 50.3
            },
        ]
        assert_frame_equal(pd.DataFrame(expected), result)

    def test_deserialize_from_csv_bom(self):
        # Arrange
        data = b'\xef\xbb\xbfa,b,c\n1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'

        # Act
        reader = BytesIO(data)
        result = deserialize_dataframe(reader, DataTypeIds.GenericCSV)

        # Assert
        self.assertIsNotNone(result)
        expected = [
            {
                'a': 1.0,
                'b': 2.0
            },
            {
                'a': 5.1,
                'b': 10.1,
                'c': 20.1
            },
            {
                'a': 50.2,
                'c': 50.3
            },
        ]
        assert_frame_equal(pd.DataFrame(expected), result)
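The serialization tests above write a dataframe into an in-memory BytesIO buffer and compare the raw bytes, while the deserialization tests parse raw bytes back into a dataframe. A minimal round-trip sketch that combines the two helpers, using the same calls and data shapes as the tests above, is shown here; the roundtrip_csv helper is defined only for this sketch and is not part of the test class.

def roundtrip_csv(dataframe):
    # Serialize to CSV bytes, then deserialize the same bytes back into a dataframe.
    writer = BytesIO()
    serialize_dataframe(writer, DataTypeIds.GenericCSV, dataframe)
    reader = BytesIO(writer.getvalue())
    return deserialize_dataframe(reader, DataTypeIds.GenericCSV)

original = pd.DataFrame([{'a': 1.0, 'b': 2.0}, {'a': 5.1, 'b': 10.1, 'c': 20.1}])
assert_frame_equal(original, roundtrip_csv(original))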
Example #13
    def test_deserialize_from_arff(self):
        # Arrange
        data = b"""@RELATION	Unnamed

@ATTRIBUTE	Class	NUMERIC
@ATTRIBUTE	age	NUMERIC
@ATTRIBUTE	menopause	NUMERIC
@ATTRIBUTE	tumor-size	NUMERIC

@DATA
0,5,1,1
0,5,4,4
1,4,8,8

"""

        # Act
        reader = BytesIO(data)
        result = deserialize_dataframe(reader, DataTypeIds.ARFF)
        print(result)

        # Assert
        self.assertIsNotNone(result)
        expected = [
            {'Class': 0., 'age': 5., 'menopause': 1., 'tumor-size': 1.},
            {'Class': 0., 'age': 5., 'menopause': 4., 'tumor-size': 4.},
            {'Class': 1., 'age': 4., 'menopause': 8., 'tumor-size': 8.},
        ]
        assert_frame_equal(pd.DataFrame(expected), result)

    def test_deserialize_from_unsupported_data_type_id(self):
        # Arrange
        data = b'1.0,2.0,nan\n5.1,10.1,20.1\n50.2,,50.3\n'

        # Act
        reader = BytesIO(data)
        with self.assertRaises(UnsupportedDatasetTypeError):
            deserialize_dataframe(reader, 'Unsupported')
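test_deserialize_from_unsupported_data_type_id above pins down that an unrecognized data type id raises UnsupportedDatasetTypeError rather than returning a partial result. A small caller-side sketch of handling that error follows; the try_deserialize helper and its None fallback are assumptions made only for illustration, not behaviour defined by the library.

def try_deserialize(data, data_type_id):
    # Attempt to parse raw bytes; treat unknown dataset types as "no dataframe".
    try:
        return deserialize_dataframe(BytesIO(data), data_type_id)
    except UnsupportedDatasetTypeError:
        return None  # assumed fallback, chosen only for this sketch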
Example #16
    def test_deserialize_from_plain_text_bom(self):
        # Arrange
        data = b'\xef\xbb\xbfJohn enjoyed his vacation in California. His personal favorite on the trip was Los Angeles.\r\nMicrosoft announced upgrades to their line of products for information workers. The announcement was made at a partner conference at Boston.'

        # Act
        reader = BytesIO(data)
        result = deserialize_dataframe(reader, DataTypeIds.PlainText)

        # Assert
        self.assertIsNotNone(result)
        expected = [
            {0: 'John enjoyed his vacation in California. His personal favorite on the trip was Los Angeles.'},
            {0: 'Microsoft announced upgrades to their line of products for information workers. The announcement was made at a partner conference at Boston.'},
        ]
        assert_frame_equal(pd.DataFrame(expected), result)
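The byte strings in the BOM tests start with b'\xef\xbb\xbf', the UTF-8 byte order mark, and the expected dataframes show it stripped from the first value. In plain Python that is exactly what the 'utf-8-sig' codec does, which is also the codec name the tests pass to pd.read_csv as encoding='utf-8-sig':

# 'utf-8-sig' consumes a leading BOM if present; a plain 'utf-8' decode keeps it.
b'\xef\xbb\xbfJohn enjoyed his vacation'.decode('utf-8-sig')  # -> 'John enjoyed his vacation'
b'\xef\xbb\xbfJohn enjoyed his vacation'.decode('utf-8')      # -> '\ufeffJohn enjoyed his vacation'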
Example #17
    def test_download_blob_then_upload_as_dataframe_then_read_dataset(self):
        def datatypeid_from_header_and_format(header, format):
            if format == 'csv':
                if header == 'wh':
                    return DataTypeIds.GenericCSV
                else:
                    return DataTypeIds.GenericCSVNoHeader
            elif format == 'tsv':
                if header == 'wh':
                    return DataTypeIds.GenericTSV
                else:
                    return DataTypeIds.GenericTSVNoHeader
            elif format == 'txt':
                return DataTypeIds.PlainText
            else:
                self.fail('Unexpected format')

        def split_blob_name(blob_name):
            # blob naming convention:
            # name_<header>.<format>
            # <header>: WH: with header
            #           NH: no header
            # <format>: CSV: comma separated
            #           TSV: tab separated
            #           TXT: newline separated
            name, format = blob_name.lower().split('.')
            if format != 'txt':
                name, header = name.split('_')
            else:
                header = 'nh'

            return name, format, header

        for blob_name in settings.storage.blobs:
            print(blob_name)

            name, format, header = split_blob_name(blob_name)

            # Read the data from blob storage
            original_data = self.blob.get_blob_to_bytes(settings.storage.container, blob_name)
            self._write_blob_contents(blob_name, original_data)

            # Parse the data to a dataframe using Pandas
            original_dataframe = pd.read_csv(
                BytesIO(original_data),
                header=0 if header == 'wh' else None,
                sep=',' if format == 'csv' else '\t' if format == 'tsv' else '\n',
                encoding='utf-8-sig'
            )

            # Upload the dataframe as a new dataset
            dataset_name = 'unittest' + name + id_generator()
            description = 'safe to be deleted - ' + dataset_name
            data_type_id = datatypeid_from_header_and_format(header, format)
            self.workspace.datasets.add_from_dataframe(
                original_dataframe,
                data_type_id,
                dataset_name,
                description,
            )

            # Get the new dataset
            dataset = self.workspace.datasets[dataset_name]
            self.assertIsNotNone(dataset)

            # Read the dataset as a dataframe
            result_data = dataset.read_as_binary()
            self._write_serialized_frame(blob_name, result_data)
            result_dataframe = dataset.to_dataframe()

            # Verify that the dataframes are equal
            assert_frame_equal(original_dataframe, result_dataframe)
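For reference, a few hypothetical blob names and how the helpers above would classify them; the names are made up for illustration, and the mapping follows directly from split_blob_name and datatypeid_from_header_and_format:

# 'Sample_WH.csv'  -> ('sample', 'csv', 'wh')  -> DataTypeIds.GenericCSV
# 'sample_nh.tsv'  -> ('sample', 'tsv', 'nh')  -> DataTypeIds.GenericTSVNoHeader
# 'sample.txt'     -> ('sample', 'txt', 'nh')  -> DataTypeIds.PlainText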