def testRecordBatchAndTensorAdapter(self):
    """Checks schema, Beam output, tensors and telemetry of RawTfRecordTFXIO."""
    raw_column = "raw_record"
    descriptors = ["some", "component"]
    tfxio = raw_tf_record.RawTfRecordTFXIO(
        self._raw_record_file, raw_column, telemetry_descriptors=descriptors)

    # The expected column type depends on whether large types are produced.
    if _ProducesLargeTypes(tfxio):
        column_type = pa.large_list(pa.large_binary())
    else:
        column_type = pa.list_(pa.binary())

    actual_schema = tfxio.ArrowSchema()
    wanted_schema = pa.schema([pa.field(raw_column, column_type)])
    self.assertTrue(
        actual_schema.equals(wanted_schema), "got: {}".format(actual_schema))

    def _CheckBatches(batches):
        # The source is asked for one batch holding every raw record.
        self.assertLen(batches, 1)
        batch = batches[0]
        self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
        wanted_column = pa.array(
            [[r] for r in _RAW_RECORDS], type=column_type)
        self.assertTrue(batch.columns[0].equals(wanted_column))
        tensors = tfxio.TensorAdapter().ToBatchTensors(batch)
        self.assertLen(tensors, 1)
        self.assertIn(raw_column, tensors)

    pipeline = beam.Pipeline()
    batches_pcoll = pipeline | tfxio.BeamSource(batch_size=len(_RAW_RECORDS))
    beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
    result = pipeline.run()
    result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, result, descriptors, "bytes", "tfrecords_gzip")
def testE2E(self, attach_raw_records):
    """Runs the TFXIO through Beam, optionally attaching a raw-record column."""
    if attach_raw_records:
        raw_column_name = "raw_records"
    else:
        raw_column_name = None
    tfxio = self._MakeTFXIO(_SCHEMA, raw_column_name)

    def _CheckBatches(batches):
        self.assertLen(batches, 1)
        batch = batches[0]
        self._ValidateRecordBatch(tfxio, batch, raw_column_name)
        self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
        tensors = tfxio.TensorAdapter().ToBatchTensors(batch)
        self.assertLen(tensors, 4)
        for feature in ("int_feature", "float_feature",
                        "seq_string_feature", "seq_int_feature"):
            self.assertIn(feature, tensors)

    pipeline = beam.Pipeline()
    batches_pcoll = pipeline | tfxio.BeamSource(batch_size=1000)
    beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
    result = pipeline.run()
    result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, result, _TELEMETRY_DESCRIPTORS, "tf_sequence_example",
        "tfrecords_gzip")
def testImplicitTensorRepresentations(self):
    """Checks the tensor representations reported when only _SCHEMA is given."""
    tfxio = self._MakeTFXIO(_SCHEMA)

    wanted_representations = {
        "int_feature": text_format.Parse(
            """varlen_sparse_tensor { column_name: "int_feature" }""",
            schema_pb2.TensorRepresentation()),
        "float_feature": text_format.Parse(
            """varlen_sparse_tensor { column_name: "float_feature" }""",
            schema_pb2.TensorRepresentation()),
        "string_feature": text_format.Parse(
            """varlen_sparse_tensor { column_name: "string_feature" }""",
            schema_pb2.TensorRepresentation()),
    }
    self.assertEqual(wanted_representations, tfxio.TensorRepresentations())

    def _CheckBatches(batches):
        self.assertLen(batches, 1)
        batch = batches[0]
        self._ValidateRecordBatch(tfxio, batch)
        self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
        tensors = tfxio.TensorAdapter().ToBatchTensors(batch)
        self.assertLen(tensors, 3)
        for feature in ("int_feature", "float_feature", "string_feature"):
            self.assertIn(feature, tensors)

    pipeline = beam.Pipeline()
    batches_pcoll = pipeline | tfxio.BeamSource(batch_size=1000)
    beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
    result = pipeline.run()
    result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, result, _TELEMETRY_DESCRIPTORS, "tf_example", "tfrecords_gzip")
def testProjection(self):
    """Projects a ParquetTFXIO onto "float_tensor" and checks the output."""
    schema = schema_pb2.Schema()
    schema.CopyFrom(_UNORDERED_SCHEMA)

    dense_string = schema_pb2.TensorRepresentation(
        dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
            column_name="string_feature"))
    sparse_float = schema_pb2.TensorRepresentation(
        sparse_tensor=schema_pb2.TensorRepresentation.SparseTensor(
            dense_shape=schema_pb2.FixedShape(
                dim=[schema_pb2.FixedShape.Dim(size=10)]),
            index_column_names=["int_feature"],
            value_column_name="float_feature"))
    tensor_representations = {
        "string_tensor": dense_string,
        "float_tensor": sparse_float,
    }
    tensor_representation_util.SetTensorRepresentationsInSchema(
        schema, tensor_representations)

    tfxio = ParquetTFXIO(
        file_pattern=self._example_file,
        column_names=_COLUMN_NAMES,
        schema=schema,
        telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
    projected_tfxio = tfxio.Project(["float_tensor"])

    # The projection is expected to report the projected Arrow schema.
    projected_schema = projected_tfxio.ArrowSchema()
    self.assertTrue(projected_schema.equals(_EXPECTED_PROJECTED_ARROW_SCHEMA))

    def _CheckBatches(batches):
        self.assertLen(batches, 1)
        batch = batches[0]
        self._ValidateRecordBatch(batch, _EXPECTED_PROJECTED_ARROW_SCHEMA)
        wanted = projected_tfxio.ArrowSchema()
        # Names and types are compared separately, each with its own message.
        actual_names, wanted_names = batch.schema.names, wanted.names
        self.assertListEqual(
            actual_names, wanted_names,
            "actual: {}; expected: {}".format(actual_names, wanted_names))
        actual_types, wanted_types = batch.schema.types, wanted.types
        self.assertListEqual(
            actual_types, wanted_types,
            "actual: {}; expected: {}".format(actual_types, wanted_types))
        tensors = projected_tfxio.TensorAdapter().ToBatchTensors(batch)
        self.assertLen(tensors, 1)
        self.assertIn("float_tensor", tensors)

    p = beam.Pipeline()
    batches_pcoll = p | projected_tfxio.BeamSource(batch_size=_NUM_ROWS)
    beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
    result = p.run()
    result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
def test_simple(self, attach_raw_records):
    """Checks schema, representations, tensors and telemetry of the TFXIO."""
    if attach_raw_records:
        raw_record_column_name = "_raw_records"
    else:
        raw_record_column_name = None
    tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
        self._input_path,
        self._decoder_path,
        _TELEMETRY_DESCRIPTORS,
        raw_record_column_name=raw_record_column_name)

    expected_fields = [
        pa.field(name, pa.list_(pa.binary())) for name in ("st1", "st2")
    ]
    if attach_raw_records:
        # The raw-record column type depends on large-type support.
        if tfxio._can_produce_large_types:
            raw_type = pa.large_list(pa.large_binary())
        else:
            raw_type = pa.list_(pa.binary())
        expected_fields.append(pa.field(raw_record_column_name, raw_type))
    self.assertTrue(
        tfxio.ArrowSchema().equals(pa.schema(expected_fields)),
        tfxio.ArrowSchema())

    actual_representations = tfxio.TensorRepresentations()
    wanted_representations = {
        "st1": text_format.Parse(
            """varlen_sparse_tensor { column_name: "st1" }""",
            schema_pb2.TensorRepresentation()),
        "st2": text_format.Parse(
            """varlen_sparse_tensor { column_name: "st2" }""",
            schema_pb2.TensorRepresentation()),
    }
    self.assertEqual(actual_representations, wanted_representations)

    tensor_adapter = tfxio.TensorAdapter()
    self.assertEqual(
        tensor_adapter.TypeSpecs(), _DecoderForTesting().output_type_specs())

    def _check_batches(batches):
        self.assertLen(batches, 1)
        batch = batches[0]
        self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
        tensors = tensor_adapter.ToBatchTensors(batch)
        self.assertLen(tensors, 2)
        for name in ("st1", "st2"):
            self.assertIn(name, tensors)
            sparse = tensors[name]
            self.assertAllEqual(sparse.values, _RECORDS)
            self.assertAllEqual(sparse.indices, [[0, 0], [1, 0]])
            self.assertAllEqual(sparse.dense_shape, [2, 1])

    pipeline = beam.Pipeline()
    batches_pcoll = pipeline | tfxio.BeamSource(batch_size=len(_RECORDS))
    beam_testing_util.assert_that(batches_pcoll, _check_batches)
    result = pipeline.run()
    result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, result, _TELEMETRY_DESCRIPTORS, "tensor", "tfrecords_gzip")
def testImplicitTensorRepresentations(self, use_beam_record_csv_tfxio):
    """Checks the tensor representations reported by the CSV TFXIO."""
    tfxio = self._MakeTFXIO(
        _COLUMN_NAMES,
        schema=_SCHEMA,
        make_beam_record_tfxio=use_beam_record_csv_tfxio)

    wanted_representations = {
        "int_feature": text_format.Parse(
            """varlen_sparse_tensor { column_name: "int_feature"}""",
            schema_pb2.TensorRepresentation()),
        "float_feature": text_format.Parse(
            """varlen_sparse_tensor { column_name: "float_feature"}""",
            schema_pb2.TensorRepresentation()),
        "string_feature": text_format.Parse(
            """varlen_sparse_tensor { column_name: "string_feature" }""",
            schema_pb2.TensorRepresentation()),
    }
    self.assertEqual(wanted_representations, tfxio.TensorRepresentations())

    def _CheckBatches(batches):
        self.assertLen(batches, 1)
        batch = batches[0]
        self._ValidateRecordBatch(tfxio, batch)
        self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
        tensors = tfxio.TensorAdapter().ToBatchTensors(batch)
        self.assertLen(tensors, 3)
        for feature in ("int_feature", "float_feature", "string_feature"):
            self.assertIn(feature, tensors)

    pipeline = beam.Pipeline()
    source_input = self._MakePipelineInputs(pipeline, use_beam_record_csv_tfxio)
    batches_pcoll = source_input | tfxio.BeamSource(batch_size=len(_ROWS))
    beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
    result = pipeline.run()
    result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, result, _TELEMETRY_DESCRIPTORS, "csv", _EXPECTED_PHYSICAL_FORMAT)
def testImplicitTensorRepresentations(self):
    """Checks the tensor representations reported by ParquetTFXIO."""
    tfxio = ParquetTFXIO(
        file_pattern=self._example_file,
        column_names=_COLUMN_NAMES,
        schema=_UNORDERED_SCHEMA,
        telemetry_descriptors=_TELEMETRY_DESCRIPTORS)

    wanted_representations = {
        "int_feature": text_format.Parse(
            """varlen_sparse_tensor { column_name: "int_feature"}""",
            schema_pb2.TensorRepresentation()),
        "float_feature": text_format.Parse(
            """varlen_sparse_tensor { column_name: "float_feature"}""",
            schema_pb2.TensorRepresentation()),
        "string_feature": text_format.Parse(
            """varlen_sparse_tensor { column_name: "string_feature" }""",
            schema_pb2.TensorRepresentation()),
    }
    self.assertEqual(wanted_representations, tfxio.TensorRepresentations())

    def _CheckBatches(batches):
        self.assertLen(batches, 1)
        batch = batches[0]
        self._ValidateRecordBatch(batch, _EXPECTED_ARROW_SCHEMA)
        self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
        tensors = tfxio.TensorAdapter().ToBatchTensors(batch)
        self.assertLen(tensors, 3)
        for feature in ("int_feature", "float_feature", "string_feature"):
            self.assertIn(feature, tensors)

    p = beam.Pipeline()
    batches_pcoll = p | tfxio.BeamSource(batch_size=_NUM_ROWS)
    beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
    result = p.run()
    result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
def testOptionalSchema(self):
    """Checks ParquetTFXIO output when no schema argument is passed."""
    tfxio = ParquetTFXIO(
        file_pattern=self._example_file,
        column_names=_COLUMN_NAMES,
        telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
    self.assertEqual(tfxio.ArrowSchema(), _EXPECTED_ARROW_SCHEMA)

    def _CheckBatches(batches):
        self.assertLen(batches, 1)
        self._ValidateRecordBatch(batches[0], _EXPECTED_ARROW_SCHEMA)

    p = beam.Pipeline()
    batches_pcoll = p | tfxio.BeamSource(batch_size=_NUM_ROWS)
    beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
    result = p.run()
    result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
def testProjection(self, use_beam_record_csv_tfxio):
    """Projects the CSV TFXIO onto "int_feature" and checks the tensors."""
    tfxio = self._MakeTFXIO(
        _COLUMN_NAMES,
        schema=_SCHEMA,
        make_beam_record_tfxio=use_beam_record_csv_tfxio)
    projected_tfxio = tfxio.Project(["int_feature"])

    # The projected TFXIO is expected to keep the original Arrow schema.
    projected_schema = projected_tfxio.ArrowSchema()
    self.assertTrue(projected_schema.equals(_GetExpectedArrowSchema(tfxio)))

    def _CheckBatches(batches):
        self.assertLen(batches, 1)
        batch = batches[0]
        self._ValidateRecordBatch(tfxio, batch)
        wanted_schema = projected_tfxio.ArrowSchema()
        self.assertTrue(
            batch.schema.equals(wanted_schema),
            "actual: {}; expected: {}".format(batch.schema, wanted_schema))
        tensors = projected_tfxio.TensorAdapter().ToBatchTensors(batch)
        self.assertLen(tensors, 1)
        self.assertIn("int_feature", tensors)

    pipeline = beam.Pipeline()
    source_input = self._MakePipelineInputs(pipeline, use_beam_record_csv_tfxio)
    # The batches are read through the unprojected TFXIO's source.
    batches_pcoll = source_input | tfxio.BeamSource(batch_size=len(_ROWS))
    beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
    result = pipeline.run()
    result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, result, _TELEMETRY_DESCRIPTORS, "csv", _EXPECTED_PHYSICAL_FORMAT)
def test_beam_source_and_tensor_adapter(
        self, attach_raw_records, create_decoder, beam_record_tfxio=False):
    """Checks schema, representations, tensors and telemetry for a decoder.

    Args:
        attach_raw_records: when truthy, a "_raw_records" column is requested
            and expected as the last column of the record batch.
        create_decoder: zero-argument callable returning the decoder to save.
        beam_record_tfxio: when truthy, BeamRecordToTensorTFXIO is used and
            fed from beam.Create; otherwise TFRecordToTensorTFXIO is used.
    """
    decoder = create_decoder()
    if attach_raw_records:
        raw_record_column_name = "_raw_records"
    else:
        raw_record_column_name = None
    decoder_path = _write_decoder(decoder)

    if beam_record_tfxio:
        tfxio = record_to_tensor_tfxio.BeamRecordToTensorTFXIO(
            saved_decoder_path=decoder_path,
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
            physical_format="tfrecords_gzip",
            raw_record_column_name=raw_record_column_name)
    else:
        tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
            self._input_path,
            decoder_path,
            _TELEMETRY_DESCRIPTORS,
            raw_record_column_name=raw_record_column_name)

    expected_tensor_representations = {
        "st1": text_format.Parse(
            """varlen_sparse_tensor { column_name: "st1" }""",
            schema_pb2.TensorRepresentation()),
        "st2": text_format.Parse(
            """varlen_sparse_tensor { column_name: "st2" }""",
            schema_pb2.TensorRepresentation()),
    }

    # Both decoder variants are expected to expose the st1/st2 columns.
    expected_fields = [
        pa.field("st1", pa.large_list(pa.large_binary())),
        pa.field("st2", pa.large_list(pa.large_binary())),
    ]
    if isinstance(decoder, _DecoderForTestingWithRecordIndex):
        # The record-index columns come before st1/st2 in the schema.
        expected_fields = [
            pa.field("ragged_record_index", pa.large_list(pa.int64())),
            pa.field("sparse_record_index", pa.large_list(pa.int64())),
        ] + expected_fields
        expected_tensor_representations["ragged_record_index"] = (
            text_format.Parse(
                """ragged_tensor { feature_path: { step: "ragged_record_index" } row_partition_dtype: INT64 }""",
                schema_pb2.TensorRepresentation()))
        expected_tensor_representations["sparse_record_index"] = (
            text_format.Parse(
                """varlen_sparse_tensor { column_name: "sparse_record_index" }""",
                schema_pb2.TensorRepresentation()))
    if attach_raw_records:
        expected_fields.append(
            pa.field(raw_record_column_name, pa.large_list(pa.large_binary())))

    self.assertTrue(
        tfxio.ArrowSchema().equals(pa.schema(expected_fields)),
        tfxio.ArrowSchema())
    self.assertEqual(
        tfxio.TensorRepresentations(), expected_tensor_representations)
    tensor_adapter = tfxio.TensorAdapter()
    self.assertEqual(tensor_adapter.TypeSpecs(), decoder.output_type_specs())

    def _check_batches(batches):
        self.assertLen(batches, 1)
        batch = batches[0]
        self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
        if attach_raw_records:
            # The raw records are read back from the last column.
            last_column = batch.column(batch.num_columns - 1)
            self.assertEqual(last_column.flatten().to_pylist(), _RECORDS)
        tensors = tensor_adapter.ToBatchTensors(batch)
        for name in ("st1", "st2"):
            self.assertIn(name, tensors)
            sparse = tensors[name]
            self.assertAllEqual(sparse.values, _RECORDS)
            self.assertAllEqual(sparse.indices, [[0, 0], [1, 0]])
            self.assertAllEqual(sparse.dense_shape, [2, 1])

    pipeline = beam.Pipeline()
    if beam_record_tfxio:
        source_input = pipeline | beam.Create(_RECORDS)
    else:
        source_input = pipeline
    batches_pcoll = source_input | tfxio.BeamSource(batch_size=len(_RECORDS))
    beam_testing_util.assert_that(batches_pcoll, _check_batches)
    result = pipeline.run()
    result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, result, _TELEMETRY_DESCRIPTORS, "tensor", "tfrecords_gzip")