Exemplo n.º 1
0
    def testRecordBatchAndTensorAdapter(self):
        """Checks the schema, record batch and tensors of a RawTfRecordTFXIO.

        Builds the TFXIO over `self._raw_record_file`, compares its Arrow
        schema to a single-column schema, runs its Beam source, asserts on
        the one emitted record batch and on the TensorAdapter output, then
        calls `telemetry_test_util.ValidateMetrics` on the pipeline result.
        """
        raw_column = "raw_record"
        descriptors = ["some", "component"]
        tfxio = raw_tf_record.RawTfRecordTFXIO(
            self._raw_record_file, raw_column,
            telemetry_descriptors=descriptors)

        # The expected column type depends on `_ProducesLargeTypes(tfxio)`.
        if _ProducesLargeTypes(tfxio):
            column_type = pa.large_list(pa.large_binary())
        else:
            column_type = pa.list_(pa.binary())

        actual_schema = tfxio.ArrowSchema()
        wanted_schema = pa.schema([pa.field(raw_column, column_type)])
        self.assertTrue(actual_schema.equals(wanted_schema),
                        "got: {}".format(actual_schema))

        def _CheckBatches(batches):
            # The source is asked for one batch as large as _RAW_RECORDS.
            self.assertLen(batches, 1)
            batch = batches[0]
            self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
            wanted_column = pa.array([[rec] for rec in _RAW_RECORDS],
                                     type=column_type)
            self.assertTrue(batch.columns[0].equals(wanted_column))
            batch_tensors = tfxio.TensorAdapter().ToBatchTensors(batch)
            self.assertLen(batch_tensors, 1)
            self.assertIn(raw_column, batch_tensors)

        pipeline = beam.Pipeline()
        source = tfxio.BeamSource(batch_size=len(_RAW_RECORDS))
        beam_testing_util.assert_that(pipeline | source, _CheckBatches)
        # Run explicitly (no context manager) because the result object is
        # needed for the metrics validation below.
        result = pipeline.run()
        result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(
            self, result, descriptors, "bytes", "tfrecords_gzip")
    def testE2E(self, attach_raw_records):
        """Runs the TFXIO end to end: Beam source, record batch, tensors.

        Args:
          attach_raw_records: if truthy, the string "raw_records" is passed
            as the second argument to `_MakeTFXIO` and to
            `_ValidateRecordBatch`; otherwise None is passed.
        """
        raw_column = None
        if attach_raw_records:
            raw_column = "raw_records"
        tfxio = self._MakeTFXIO(_SCHEMA, raw_column)

        def _CheckBatches(batches):
            # Exactly one batch is expected from the source.
            self.assertLen(batches, 1)
            batch = batches[0]
            self._ValidateRecordBatch(tfxio, batch, raw_column)
            self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
            tensors = tfxio.TensorAdapter().ToBatchTensors(batch)
            self.assertLen(tensors, 4)
            for feature in ("int_feature", "float_feature",
                            "seq_string_feature", "seq_int_feature"):
                self.assertIn(feature, tensors)

        pipeline = beam.Pipeline()
        batches_pcoll = pipeline | tfxio.BeamSource(batch_size=1000)
        beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
        # The result object is needed afterwards to validate telemetry.
        result = pipeline.run()
        result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(
            self, result, _TELEMETRY_DESCRIPTORS, "tf_sequence_example",
            "tfrecords_gzip")
Exemplo n.º 3
0
  def testImplicitTensorRepresentations(self):
    """Checks the tensor representations reported by a TFXIO for `_SCHEMA`.

    The three features are expected to be reported as varlen sparse tensors,
    and the TensorAdapter is expected to produce exactly those three tensors
    for the single record batch emitted by the Beam source.
    """
    tfxio = self._MakeTFXIO(_SCHEMA)

    int_rep = text_format.Parse(
        """varlen_sparse_tensor { column_name: "int_feature" }""",
        schema_pb2.TensorRepresentation())
    float_rep = text_format.Parse(
        """varlen_sparse_tensor { column_name: "float_feature" }""",
        schema_pb2.TensorRepresentation())
    string_rep = text_format.Parse(
        """varlen_sparse_tensor { column_name: "string_feature" }""",
        schema_pb2.TensorRepresentation())
    wanted_reps = {
        "int_feature": int_rep,
        "float_feature": float_rep,
        "string_feature": string_rep,
    }
    self.assertEqual(wanted_reps, tfxio.TensorRepresentations())

    def _CheckBatches(batches):
      self.assertLen(batches, 1)
      batch = batches[0]
      self._ValidateRecordBatch(tfxio, batch)
      self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
      tensors = tfxio.TensorAdapter().ToBatchTensors(batch)
      self.assertLen(tensors, 3)
      for feature in ("int_feature", "float_feature", "string_feature"):
        self.assertIn(feature, tensors)

    pipeline = beam.Pipeline()
    batches_pcoll = pipeline | tfxio.BeamSource(batch_size=1000)
    beam_testing_util.assert_that(batches_pcoll, _CheckBatches)
    # The result object is needed afterwards to validate telemetry.
    result = pipeline.run()
    result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, result, _TELEMETRY_DESCRIPTORS, "tf_example", "tfrecords_gzip")
Exemplo n.º 4
0
    def testProjection(self):
        """Projects a ParquetTFXIO onto "float_tensor" and checks the result.

        A copy of `_UNORDERED_SCHEMA` is annotated with a dense and a sparse
        tensor representation; the TFXIO is then projected onto the sparse
        one and its schema, record batch and tensors are verified.
        """
        schema = schema_pb2.Schema()
        schema.CopyFrom(_UNORDERED_SCHEMA)

        dense_string = schema_pb2.TensorRepresentation(
            dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
                column_name="string_feature"))
        sparse_shape = schema_pb2.FixedShape(
            dim=[schema_pb2.FixedShape.Dim(size=10)])
        sparse_float = schema_pb2.TensorRepresentation(
            sparse_tensor=schema_pb2.TensorRepresentation.SparseTensor(
                dense_shape=sparse_shape,
                index_column_names=["int_feature"],
                value_column_name="float_feature"))
        tensor_representation_util.SetTensorRepresentationsInSchema(
            schema,
            {"string_tensor": dense_string, "float_tensor": sparse_float})

        tfxio = ParquetTFXIO(
            file_pattern=self._example_file,
            column_names=_COLUMN_NAMES,
            schema=schema,
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
        projected = tfxio.Project(["float_tensor"])

        # The projected TFXIO must expose the projected schema.
        projected_schema_ok = projected.ArrowSchema().equals(
            _EXPECTED_PROJECTED_ARROW_SCHEMA)
        self.assertTrue(projected_schema_ok)

        def _CheckBatches(batches):
            self.assertLen(batches, 1)
            batch = batches[0]
            self._ValidateRecordBatch(batch,
                                      _EXPECTED_PROJECTED_ARROW_SCHEMA)
            wanted = projected.ArrowSchema()
            # Names and types are compared separately, each with a message.
            got_names, want_names = batch.schema.names, wanted.names
            self.assertListEqual(
                got_names, want_names,
                "actual: {}; expected: {}".format(got_names, want_names))
            got_types, want_types = batch.schema.types, wanted.types
            self.assertListEqual(
                got_types, want_types,
                "actual: {}; expected: {}".format(got_types, want_types))
            tensors = projected.TensorAdapter().ToBatchTensors(batch)
            self.assertLen(tensors, 1)
            self.assertIn("float_tensor", tensors)

        pipeline = beam.Pipeline()
        source = projected.BeamSource(batch_size=_NUM_ROWS)
        beam_testing_util.assert_that(pipeline | source, _CheckBatches)
        # The result object is needed afterwards to validate telemetry.
        result = pipeline.run()
        result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(
            self, result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
    def test_simple(self, attach_raw_records):
        """Checks schema, representations and tensors of TFRecordToTensorTFXIO.

        Args:
          attach_raw_records: if truthy, "_raw_records" is passed as
            `raw_record_column_name` and an extra field with that name is
            expected at the end of the Arrow schema.
        """
        raw_column = None
        if attach_raw_records:
            raw_column = "_raw_records"
        tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
            self._input_path,
            self._decoder_path,
            _TELEMETRY_DESCRIPTORS,
            raw_record_column_name=raw_column)

        wanted_fields = [
            pa.field(name, pa.list_(pa.binary())) for name in ("st1", "st2")
        ]
        if attach_raw_records:
            # The raw-record column type follows the TFXIO's
            # `_can_produce_large_types` flag.
            if tfxio._can_produce_large_types:
                raw_type = pa.large_list(pa.large_binary())
            else:
                raw_type = pa.list_(pa.binary())
            wanted_fields.append(pa.field(raw_column, raw_type))
        schema_matches = tfxio.ArrowSchema().equals(pa.schema(wanted_fields))
        self.assertTrue(schema_matches, tfxio.ArrowSchema())

        st1_rep = text_format.Parse(
            """varlen_sparse_tensor { column_name: "st1" }""",
            schema_pb2.TensorRepresentation())
        st2_rep = text_format.Parse(
            """varlen_sparse_tensor { column_name: "st2" }""",
            schema_pb2.TensorRepresentation())
        self.assertEqual(tfxio.TensorRepresentations(),
                         {"st1": st1_rep, "st2": st2_rep})

        tensor_adapter = tfxio.TensorAdapter()
        self.assertEqual(tensor_adapter.TypeSpecs(),
                         _DecoderForTesting().output_type_specs())

        def _check_batches(batches):
            self.assertLen(batches, 1)
            batch = batches[0]
            self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
            tensors = tensor_adapter.ToBatchTensors(batch)
            self.assertLen(tensors, 2)
            for name in ("st1", "st2"):
                self.assertIn(name, tensors)
                sparse = tensors[name]
                self.assertAllEqual(sparse.values, _RECORDS)
                self.assertAllEqual(sparse.indices, [[0, 0], [1, 0]])
                self.assertAllEqual(sparse.dense_shape, [2, 1])

        pipeline = beam.Pipeline()
        source = tfxio.BeamSource(batch_size=len(_RECORDS))
        beam_testing_util.assert_that(pipeline | source, _check_batches)
        # The result object is needed afterwards to validate telemetry.
        result = pipeline.run()
        result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(
            self, result, _TELEMETRY_DESCRIPTORS, "tensor", "tfrecords_gzip")
Exemplo n.º 6
0
    def testImplicitTensorRepresentations(self, use_beam_record_csv_tfxio):
        """Tests inferring of tensor representation.

        Args:
          use_beam_record_csv_tfxio: forwarded to `_MakeTFXIO` as
            `make_beam_record_tfxio` and to `_MakePipelineInputs`.
        """
        tfxio = self._MakeTFXIO(
            _COLUMN_NAMES,
            schema=_SCHEMA,
            make_beam_record_tfxio=use_beam_record_csv_tfxio)

        int_rep = text_format.Parse(
            """varlen_sparse_tensor { column_name: "int_feature"}""",
            schema_pb2.TensorRepresentation())
        float_rep = text_format.Parse(
            """varlen_sparse_tensor { column_name: "float_feature"}""",
            schema_pb2.TensorRepresentation())
        string_rep = text_format.Parse(
            """varlen_sparse_tensor { column_name: "string_feature" }""",
            schema_pb2.TensorRepresentation())
        wanted_reps = {
            "int_feature": int_rep,
            "float_feature": float_rep,
            "string_feature": string_rep,
        }
        self.assertEqual(wanted_reps, tfxio.TensorRepresentations())

        def _CheckBatches(batches):
            self.assertLen(batches, 1)
            batch = batches[0]
            self._ValidateRecordBatch(tfxio, batch)
            self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
            tensors = tfxio.TensorAdapter().ToBatchTensors(batch)
            self.assertLen(tensors, 3)
            for feature in ("int_feature", "float_feature",
                            "string_feature"):
                self.assertIn(feature, tensors)

        pipeline = beam.Pipeline()
        pipeline_inputs = self._MakePipelineInputs(
            pipeline, use_beam_record_csv_tfxio)
        source = tfxio.BeamSource(batch_size=len(_ROWS))
        beam_testing_util.assert_that(pipeline_inputs | source,
                                      _CheckBatches)
        # The result object is needed afterwards to validate telemetry.
        result = pipeline.run()
        result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(
            self, result, _TELEMETRY_DESCRIPTORS, "csv",
            _EXPECTED_PHYSICAL_FORMAT)
Exemplo n.º 7
0
    def testImplicitTensorRepresentations(self):
        """Tests inferring of tensor representation.

        A ParquetTFXIO built with `_UNORDERED_SCHEMA` is expected to report
        a varlen sparse tensor representation for each of the three features.
        """
        tfxio = ParquetTFXIO(
            file_pattern=self._example_file,
            column_names=_COLUMN_NAMES,
            schema=_UNORDERED_SCHEMA,
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS)

        int_rep = text_format.Parse(
            """varlen_sparse_tensor { column_name: "int_feature"}""",
            schema_pb2.TensorRepresentation())
        float_rep = text_format.Parse(
            """varlen_sparse_tensor { column_name: "float_feature"}""",
            schema_pb2.TensorRepresentation())
        string_rep = text_format.Parse(
            """varlen_sparse_tensor { column_name: "string_feature" }""",
            schema_pb2.TensorRepresentation())
        wanted_reps = {
            "int_feature": int_rep,
            "float_feature": float_rep,
            "string_feature": string_rep,
        }
        self.assertEqual(wanted_reps, tfxio.TensorRepresentations())

        def _CheckBatches(batches):
            self.assertLen(batches, 1)
            batch = batches[0]
            self._ValidateRecordBatch(batch, _EXPECTED_ARROW_SCHEMA)
            self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
            tensors = tfxio.TensorAdapter().ToBatchTensors(batch)
            self.assertLen(tensors, 3)
            for feature in ("int_feature", "float_feature",
                            "string_feature"):
                self.assertIn(feature, tensors)

        beam_pipeline = beam.Pipeline()
        source = tfxio.BeamSource(batch_size=_NUM_ROWS)
        beam_testing_util.assert_that(beam_pipeline | source, _CheckBatches)
        # The result object is needed afterwards to validate telemetry.
        result = beam_pipeline.run()
        result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(
            self, result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
Exemplo n.º 8
0
    def testOptionalSchema(self):
        """Tests when the schema is not provided.

        The ParquetTFXIO is constructed without a `schema` argument and its
        Arrow schema is compared against `_EXPECTED_ARROW_SCHEMA`.
        """
        tfxio = ParquetTFXIO(
            file_pattern=self._example_file,
            column_names=_COLUMN_NAMES,
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS)

        self.assertEqual(tfxio.ArrowSchema(), _EXPECTED_ARROW_SCHEMA)

        def _CheckBatches(batches):
            # A single batch is expected; validate it against the schema.
            self.assertLen(batches, 1)
            self._ValidateRecordBatch(batches[0], _EXPECTED_ARROW_SCHEMA)

        beam_pipeline = beam.Pipeline()
        source = tfxio.BeamSource(batch_size=_NUM_ROWS)
        beam_testing_util.assert_that(beam_pipeline | source, _CheckBatches)
        # The result object is needed afterwards to validate telemetry.
        result = beam_pipeline.run()
        result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(
            self, result, _TELEMETRY_DESCRIPTORS, "parquet", "parquet")
Exemplo n.º 9
0
    def testProjection(self, use_beam_record_csv_tfxio):
        """Test projecting of a TFXIO.

        Args:
          use_beam_record_csv_tfxio: forwarded to `_MakeTFXIO` as
            `make_beam_record_tfxio` and to `_MakePipelineInputs`.
        """
        tfxio = self._MakeTFXIO(
            _COLUMN_NAMES,
            schema=_SCHEMA,
            make_beam_record_tfxio=use_beam_record_csv_tfxio)
        projected = tfxio.Project(["int_feature"])

        # The projected TFXIO is expected to keep the original schema.
        schema_unchanged = projected.ArrowSchema().equals(
            _GetExpectedArrowSchema(tfxio))
        self.assertTrue(schema_unchanged)

        def _CheckBatches(batches):
            self.assertLen(batches, 1)
            batch = batches[0]
            self._ValidateRecordBatch(tfxio, batch)
            wanted_schema = projected.ArrowSchema()
            self.assertTrue(
                batch.schema.equals(wanted_schema),
                "actual: {}; expected: {}".format(batch.schema,
                                                  wanted_schema))
            tensors = projected.TensorAdapter().ToBatchTensors(batch)
            self.assertLen(tensors, 1)
            self.assertIn("int_feature", tensors)

        pipeline = beam.Pipeline()
        pipeline_inputs = self._MakePipelineInputs(
            pipeline, use_beam_record_csv_tfxio)
        # NOTE(review): the source comes from the unprojected `tfxio`, not
        # from `projected` -- confirm this is intentional.
        source = tfxio.BeamSource(batch_size=len(_ROWS))
        beam_testing_util.assert_that(pipeline_inputs | source,
                                      _CheckBatches)
        # The result object is needed afterwards to validate telemetry.
        result = pipeline.run()
        result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(
            self, result, _TELEMETRY_DESCRIPTORS, "csv",
            _EXPECTED_PHYSICAL_FORMAT)
Exemplo n.º 10
0
  def test_beam_source_and_tensor_adapter(
      self, attach_raw_records, create_decoder, beam_record_tfxio=False):
    """Checks schema, representations and tensors of a record-to-tensor TFXIO.

    Args:
      attach_raw_records: if truthy, "_raw_records" is passed as
        `raw_record_column_name` and an extra trailing field is expected.
      create_decoder: zero-argument callable returning the decoder to save.
      beam_record_tfxio: if truthy, a BeamRecordToTensorTFXIO is built and fed
        from `beam.Create(_RECORDS)`; otherwise a TFRecordToTensorTFXIO reads
        `self._input_path`.
    """
    decoder = create_decoder()
    raw_record_column_name = "_raw_records" if attach_raw_records else None
    decoder_path = _write_decoder(decoder)

    if not beam_record_tfxio:
      tfxio = record_to_tensor_tfxio.TFRecordToTensorTFXIO(
          self._input_path,
          decoder_path,
          _TELEMETRY_DESCRIPTORS,
          raw_record_column_name=raw_record_column_name)
    else:
      tfxio = record_to_tensor_tfxio.BeamRecordToTensorTFXIO(
          saved_decoder_path=decoder_path,
          telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
          physical_format="tfrecords_gzip",
          raw_record_column_name=raw_record_column_name)

    expected_tensor_representations = {
        "st1":
            text_format.Parse("""varlen_sparse_tensor { column_name: "st1" }""",
                              schema_pb2.TensorRepresentation()),
        "st2":
            text_format.Parse("""varlen_sparse_tensor { column_name: "st2" }""",
                              schema_pb2.TensorRepresentation())
    }

    # Record-index fields, when present, come before the "st*" fields.
    expected_fields = []
    if isinstance(decoder, _DecoderForTestingWithRecordIndex):
      expected_fields.extend(
          pa.field(index_name, pa.large_list(pa.int64()))
          for index_name in ("ragged_record_index", "sparse_record_index"))
      expected_tensor_representations["ragged_record_index"] = (
          text_format.Parse(
              """ragged_tensor {
                   feature_path: { step: "ragged_record_index" }
                   row_partition_dtype: INT64
                 }""", schema_pb2.TensorRepresentation()))
      expected_tensor_representations["sparse_record_index"] = (
          text_format.Parse(
              """varlen_sparse_tensor { column_name: "sparse_record_index" }""",
              schema_pb2.TensorRepresentation()))
    expected_fields.extend(
        pa.field(st_name, pa.large_list(pa.large_binary()))
        for st_name in ("st1", "st2"))
    if attach_raw_records:
      expected_fields.append(
          pa.field(raw_record_column_name, pa.large_list(pa.large_binary())))

    schema_matches = tfxio.ArrowSchema().equals(pa.schema(expected_fields))
    self.assertTrue(schema_matches, tfxio.ArrowSchema())

    self.assertEqual(
        tfxio.TensorRepresentations(), expected_tensor_representations)

    tensor_adapter = tfxio.TensorAdapter()
    self.assertEqual(tensor_adapter.TypeSpecs(),
                     decoder.output_type_specs())

    def _check_batches(batches):
      self.assertLen(batches, 1)
      batch = batches[0]
      self.assertTrue(batch.schema.equals(tfxio.ArrowSchema()))
      if attach_raw_records:
        # The raw records are expected in the last column of the batch.
        last_column = batch.column(batch.num_columns - 1)
        self.assertEqual(last_column.flatten().to_pylist(), _RECORDS)
      tensors = tensor_adapter.ToBatchTensors(batch)
      for name in ("st1", "st2"):
        self.assertIn(name, tensors)
        sparse = tensors[name]
        self.assertAllEqual(sparse.values, _RECORDS)
        self.assertAllEqual(sparse.indices, [[0, 0], [1, 0]])
        self.assertAllEqual(sparse.dense_shape, [2, 1])

    pipeline = beam.Pipeline()
    if beam_record_tfxio:
      pipeline_input = pipeline | beam.Create(_RECORDS)
    else:
      pipeline_input = pipeline
    source = tfxio.BeamSource(batch_size=len(_RECORDS))
    beam_testing_util.assert_that(pipeline_input | source, _check_batches)
    # The result object is needed afterwards to validate telemetry.
    result = pipeline.run()
    result.wait_until_finish()
    telemetry_test_util.ValidateMetrics(
        self, result, _TELEMETRY_DESCRIPTORS, "tensor", "tfrecords_gzip")