def testSubsetOfColumnNamesWithSubsetSchema(self):
    """Reads one column when the schema declares only that same column.

    Both `column_names` and the schema name just "int_feature". The single
    record batch produced by the source is validated against an Arrow schema
    holding only that field.
    """
    selected_columns = ["int_feature"]
    schema_text = """ feature { name: "int_feature" type: INT value_count { min: 0 max: 2 } } """
    subset_schema = text_format.Parse(schema_text, schema_pb2.Schema())
    parquet_io = ParquetTFXIO(
        file_pattern=self._example_file,
        column_names=selected_columns,
        schema=subset_schema)

    def _CheckBatches(batches):
        # Exactly one record batch must come out of the source.
        self.assertLen(batches, 1)
        int_only_schema = pa.schema(
            [pa.field("int_feature", pa.large_list(pa.int64()))])
        self._ValidateRecordBatch(batches[0], int_only_schema)

    with beam.Pipeline() as pipeline:
        batches_pcoll = pipeline | parquet_io.BeamSource(batch_size=_NUM_ROWS)
        beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
def testProjection(self):
    """Checks that Project() narrows a ParquetTFXIO to one tensor.

    A copy of _UNORDERED_SCHEMA is given two tensor representations
    ("string_tensor" and "float_tensor"). The TFXIO is projected down to
    "float_tensor", and the test verifies the projected Arrow schema, the
    record batch emitted by the projected source, the tensors produced by the
    projected TensorAdapter, and the telemetry metrics of the pipeline run.
    """
    dense_string = schema_pb2.TensorRepresentation(
        dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
            column_name="string_feature"))
    sparse_float = schema_pb2.TensorRepresentation(
        sparse_tensor=schema_pb2.TensorRepresentation.SparseTensor(
            dense_shape=schema_pb2.FixedShape(
                dim=[schema_pb2.FixedShape.Dim(size=10)]),
            index_column_names=["int_feature"],
            value_column_name="float_feature"))

    # Work on a copy so the shared module-level schema is left untouched.
    schema_with_tensors = schema_pb2.Schema()
    schema_with_tensors.CopyFrom(_UNORDERED_SCHEMA)
    tensor_representation_util.SetTensorRepresentationsInSchema(
        schema_with_tensors,
        {"string_tensor": dense_string, "float_tensor": sparse_float})

    full_io = ParquetTFXIO(
        file_pattern=self._example_file,
        column_names=_COLUMN_NAMES,
        schema=schema_with_tensors,
        telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
    projected_io = full_io.Project(["float_tensor"])

    # The projected TFXIO must already report the projected Arrow schema.
    reported_schema = projected_io.ArrowSchema()
    self.assertTrue(reported_schema.equals(_EXPECTED_PROJECTED_ARROW_SCHEMA))

    def _CheckBatches(batches):
        self.assertLen(batches, 1)
        batch = batches[0]
        self._ValidateRecordBatch(batch, _EXPECTED_PROJECTED_ARROW_SCHEMA)

        # Compare field names and types separately so that a failure message
        # shows which of the two diverged.
        want_schema = projected_io.ArrowSchema()
        mismatch_template = "actual: {}; expected: {}"
        got_names, want_names = batch.schema.names, want_schema.names
        self.assertListEqual(
            got_names, want_names,
            mismatch_template.format(got_names, want_names))
        got_types, want_types = batch.schema.types, want_schema.types
        self.assertListEqual(
            got_types, want_types,
            mismatch_template.format(got_types, want_types))

        # Only the projected tensor may be produced by the adapter.
        tensors = projected_io.TensorAdapter().ToBatchTensors(batch)
        self.assertLen(tensors, 1)
        self.assertIn("float_tensor", tensors)

    # The pipeline is run explicitly (rather than through a `with` block)
    # because the run result is needed to validate telemetry metrics.
    pipeline = beam.Pipeline()
    batches_pcoll = pipeline | projected_io.BeamSource(batch_size=_NUM_ROWS)
    beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
    run_result = pipeline.run()
    run_result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, run_result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
def testOptionalColumnNamesAndSchema(self):
    """Reads the example file with neither column names nor a schema given.

    Only `file_pattern` is passed to ParquetTFXIO. The single record batch
    produced by the source is validated against _EXPECTED_ARROW_SCHEMA.
    """
    parquet_io = ParquetTFXIO(file_pattern=self._example_file)

    def _CheckBatches(batches):
        # Exactly one record batch must come out of the source.
        self.assertLen(batches, 1)
        self._ValidateRecordBatch(batches[0], _EXPECTED_ARROW_SCHEMA)

    with beam.Pipeline() as pipeline:
        batches_pcoll = pipeline | parquet_io.BeamSource(batch_size=_NUM_ROWS)
        beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
def testUnorderedSchema(self):
    """Reads the example file using _UNORDERED_SCHEMA as the schema.

    The single record batch produced by the source is validated against
    _EXPECTED_ARROW_SCHEMA.
    """
    parquet_io = ParquetTFXIO(
        file_pattern=self._example_file,
        column_names=_COLUMN_NAMES,
        schema=_UNORDERED_SCHEMA)

    def _CheckBatches(batches):
        # Exactly one record batch must come out of the source.
        self.assertLen(batches, 1)
        self._ValidateRecordBatch(batches[0], _EXPECTED_ARROW_SCHEMA)

    with beam.Pipeline() as pipeline:
        batches_pcoll = pipeline | parquet_io.BeamSource(batch_size=_NUM_ROWS)
        beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
def testSubsetOfColumnNamesWithCompleteSchema(self):
    """Reads one column while passing the complete _SCHEMA.

    `column_names` names only "int_feature". The single record batch produced
    by the source is validated against an Arrow schema holding only that
    field.
    """
    selected_columns = ["int_feature"]
    parquet_io = ParquetTFXIO(
        file_pattern=self._example_file,
        column_names=selected_columns,
        schema=_SCHEMA)

    def _CheckBatches(batches):
        # Exactly one record batch must come out of the source.
        self.assertLen(batches, 1)
        int_only_schema = pa.schema(
            [pa.field("int_feature", pa.large_list(pa.int64()))])
        self._ValidateRecordBatch(batches[0], int_only_schema)

    with beam.Pipeline() as pipeline:
        batches_pcoll = pipeline | parquet_io.BeamSource(batch_size=_NUM_ROWS)
        beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
def testImplicitTensorRepresentations(self):
    """Checks tensor representations when the schema does not set any.

    With _UNORDERED_SCHEMA, TensorRepresentations() is expected to return a
    varlen_sparse_tensor representation for each of the three features. The
    emitted record batch must match the TFXIO's Arrow schema and convert to
    three tensors; telemetry metrics of the pipeline run are validated last.
    """
    parquet_io = ParquetTFXIO(
        file_pattern=self._example_file,
        column_names=_COLUMN_NAMES,
        schema=_UNORDERED_SCHEMA,
        telemetry_descriptors=_TELEMETRY_DESCRIPTORS)

    def _Representation(text_proto):
        # Parses a text-format TensorRepresentation proto.
        return text_format.Parse(
            text_proto, schema_pb2.TensorRepresentation())

    expected_representations = {
        "int_feature": _Representation(
            """varlen_sparse_tensor { column_name: "int_feature"}"""),
        "float_feature": _Representation(
            """varlen_sparse_tensor { column_name: "float_feature"}"""),
        "string_feature": _Representation(
            """varlen_sparse_tensor { column_name: "string_feature" }"""),
    }
    self.assertEqual(
        expected_representations, parquet_io.TensorRepresentations())

    def _CheckBatches(batches):
        self.assertLen(batches, 1)
        batch = batches[0]
        self._ValidateRecordBatch(batch, _EXPECTED_ARROW_SCHEMA)
        self.assertTrue(batch.schema.equals(parquet_io.ArrowSchema()))

        # Every feature must surface as its own tensor.
        tensors = parquet_io.TensorAdapter().ToBatchTensors(batch)
        self.assertLen(tensors, 3)
        for tensor_name in ("int_feature", "float_feature", "string_feature"):
            self.assertIn(tensor_name, tensors)

    # The pipeline is run explicitly (rather than through a `with` block)
    # because the run result is needed to validate telemetry metrics.
    pipeline = beam.Pipeline()
    batches_pcoll = pipeline | parquet_io.BeamSource(batch_size=_NUM_ROWS)
    beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
    run_result = pipeline.run()
    run_result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, run_result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
def testOptionalSchema(self):
    """Reads the example file with column names but without a schema.

    ArrowSchema() must equal _EXPECTED_ARROW_SCHEMA, the single record batch
    produced by the source is validated against that same schema, and the
    telemetry metrics of the pipeline run are validated last.
    """
    parquet_io = ParquetTFXIO(
        file_pattern=self._example_file,
        column_names=_COLUMN_NAMES,
        telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
    self.assertEqual(parquet_io.ArrowSchema(), _EXPECTED_ARROW_SCHEMA)

    def _CheckBatches(batches):
        # Exactly one record batch must come out of the source.
        self.assertLen(batches, 1)
        self._ValidateRecordBatch(batches[0], _EXPECTED_ARROW_SCHEMA)

    # The pipeline is run explicitly (rather than through a `with` block)
    # because the run result is needed to validate telemetry metrics.
    pipeline = beam.Pipeline()
    batches_pcoll = pipeline | parquet_io.BeamSource(batch_size=_NUM_ROWS)
    beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
    run_result = pipeline.run()
    run_result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, run_result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
def testExplicitTensorRepresentations(self):
    """Checks that tensor representations set in the schema are returned.

    A copy of _SCHEMA gets a default ("") tensor representation group holding
    one dense tensor, "my_feature", backed by the "string_feature" column.
    TensorRepresentations() must return exactly that mapping.
    """
    dense_text = """ dense_tensor { column_name: "string_feature" shape { dim { size: 1 } } default_value { bytes_value: "abc" } }"""
    explicit_representations = {
        "my_feature": text_format.Parse(
            dense_text, schema_pb2.TensorRepresentation()),
    }

    # Work on a copy so the shared module-level schema is left untouched.
    schema_with_tensors = schema_pb2.Schema()
    schema_with_tensors.CopyFrom(_SCHEMA)
    default_group = schema_pb2.TensorRepresentationGroup(
        tensor_representation=explicit_representations)
    schema_with_tensors.tensor_representation_group[""].CopyFrom(
        default_group)

    parquet_io = ParquetTFXIO(
        file_pattern=self._example_file,
        column_names=_COLUMN_NAMES,
        schema=schema_with_tensors,
        telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
    self.assertEqual(
        explicit_representations, parquet_io.TensorRepresentations())