def testLargerBatchSize(self):
        with tf.Session() as sess:
            with tempfile.NamedTemporaryFile(mode='w+t',
                                             delete=False) as vocab:
                raw_data = {
                    'label': ['1', '2'],
                    'text_a':
                    ['The boy jumped into the air.', 'The cat sat on a hat.'],
                    'lang': ['en', 'en'],
                }
                expected_wordpieces = [
                    'The', '[UNK]', 'jumped', 'in', '##to', 'the', 'air', '.',
                    'The', 'cat', 'sat', 'on', 'a', 'h', '##at', '.'
                ]
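                # 'boy' is not in the test vocab, so it maps to [UNK] and its
                # 3 characters count as dropped; 'hat' is split into the
                # subwords 'h' and '##at'.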
                vocab.writelines([word + '\n' for word in self.vocab])
                vocab.flush()
                preprocessing_fn = utils.metrics_preprocessing_fn(
                    vocab.name, 'text_a', 'lang')
                outputs = preprocessing_fn(raw_data)
                tf.tables_initializer().run()
                outputs = sess.run(outputs)

                self.assertSequenceAlmostEqual(outputs['lang'], ['en', 'en'])
                self.assertSequenceAlmostEqual(outputs['num_preserved_chars'],
                                               [20, 16])
                self.assertSequenceAlmostEqual(outputs['num_dropped_chars'],
                                               [3, 0])
                self.assertSequenceAlmostEqual(outputs['wordpieces'].values,
                                               expected_wordpieces)
                self.assertSequenceAlmostEqual(
                    outputs['num_non_unk_wordpieces'], [7, 8])
Example No. 2
  def run_metrics():
    """Creates a pipeline to measure wordpiece vocab metrics over a corpus."""

    metrics_pipeline = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
      # Read raw data and convert to TF Transform encoded dict.
      raw_data = (
          metrics_pipeline
          | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
              data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
          | 'DecodeInputData' >> beam.Map(example_converter.decode))

      # Apply transform to wordpiece-tokenize input.
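      # AnalyzeAndTransformDataset returns ((transformed_data,
      # transformed_metadata), transform_fn); only the transformed data is
      # needed here.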
      (metrics_transformed_data, _), _ = (
          (raw_data, raw_metadata)
          | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
              utils.metrics_preprocessing_fn(FLAGS.vocab_file,
                                             FLAGS.text_key,
                                             FLAGS.language_code_key)))

      # Initialize the CSV coder. Aggregate values for each lang, calculate
      # metrics, and write the output to a CSV file.
      csv_converter = tft.coders.CsvCoder(columns, csv_schema)
      _ = (
          metrics_transformed_data
          | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
          | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
          | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
          | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
          | 'WriteMetrics' >> beam.io.WriteToText(
              metrics_file, shard_name_template='', header=','.join(columns)))
    return metrics_pipeline
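run_metrics only builds the pipeline; a minimal sketch of how a caller might execute it, assuming the names referenced above (data_file, metrics_file, raw_metadata, example_converter, columns, csv_schema) are defined in the enclosing scope:

  # Sketch: run the metrics pipeline and block until it finishes.
  result = run_metrics().run()
  result.wait_until_finish()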
    def testSingleElement(self):
        with tf.Session() as sess:
            with tempfile.NamedTemporaryFile(mode='w+t',
                                             delete=False) as vocab:
                vocab.writelines([word + '\n' for word in self.vocab])
                vocab.flush()
                preprocessing_fn = utils.metrics_preprocessing_fn(
                    vocab.name, 'text_a', 'lang')
                outputs = preprocessing_fn(self.raw_data)
                tf.tables_initializer().run()
                outputs = sess.run(outputs)

                self.assertEqual(outputs['lang'], 'en')
                self.assertEqual(outputs['num_non_unk_wordpieces'], 7)
                self.assertEqual(outputs['num_preserved_chars'], 20)
                self.assertEqual(outputs['num_dropped_chars'], 3)
                self.assertSequenceAlmostEqual(outputs['wordpieces'].values,
                                               self.expected_wordpieces)
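Both tests assume a fixture that is not shown in this snippet (self.vocab, self.raw_data, self.expected_wordpieces). A hypothetical setUp consistent with the assertions above could look like the following; the real test's vocab may differ:

    def setUp(self):
        super().setUp()
        # Hypothetical vocab: it contains the expected pieces but not 'boy'
        # or 'hat', so 'boy' maps to [UNK] and 'hat' splits into 'h' + '##at'.
        self.vocab = [
            '[UNK]', 'The', 'the', 'cat', 'sat', 'on', 'a', 'h', '##at', '.',
            'jumped', 'in', '##to', 'air'
        ]
        self.raw_data = {
            'label': ['1'],
            'text_a': ['The boy jumped into the air.'],
            'lang': ['en'],
        }
        self.expected_wordpieces = [
            'The', '[UNK]', 'jumped', 'in', '##to', 'the', 'air', '.'
        ]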
Example No. 4
def calculate_metrics():
    """Returns a pipeline to compute wordpiece model stats given a vocab and corpus."""

    # Schema of input dataset.
    raw_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'text':
            tf.FixedLenFeature([], tf.string),
            'language_code':
            tf.FixedLenFeature([], tf.string),
        }))
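    # Each decoded record is a dict keyed by these two features, e.g. a raw
    # text string plus a language code such as 'en'.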

    # Schema to format metrics as CSV.
    csv_schema = dataset_schema.from_feature_spec({
        'lang':
        tf.FixedLenFeature([], tf.string),
        'sample_count':
        tf.FixedLenFeature([], tf.int64),
        'micro_drop_char_percent':
        tf.FixedLenFeature([], tf.string),
        'macro_drop_char_percent':
        tf.FixedLenFeature([], tf.string),
        'micro_compress_ratio':
        tf.FixedLenFeature([], tf.string),
        'macro_compress_ratio':
        tf.FixedLenFeature([], tf.string),
        'unweighted_en_wp_overlap_percent':
        tf.FixedLenFeature([], tf.string),
        'weighted_en_wp_overlap_percent':
        tf.FixedLenFeature([], tf.string),
    })

    columns = [
        'lang', 'sample_count', 'micro_drop_char_percent',
        'macro_drop_char_percent', 'micro_compress_ratio',
        'macro_compress_ratio', 'unweighted_en_wp_overlap_percent',
        'weighted_en_wp_overlap_percent'
    ]

    # Create pipeline.
    pipeline = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
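        # serialized=False: the coder works on tf.train.Example protos (as
        # produced by ReadFromTFRecord's ProtoCoder below) rather than on
        # serialized byte strings.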
        example_converter = tft.coders.ExampleProtoCoder(raw_metadata.schema,
                                                         serialized=False)
        csv_converter = tft.coders.CsvCoder(columns, csv_schema)

        # Read raw data and convert to TF Transform encoded dict.
        raw_data = (pipeline
                    | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
                        FLAGS.input_file,
                        coder=beam.coders.ProtoCoder(tf.train.Example))
                    | 'DecodeInputData' >> beam.Map(example_converter.decode))

        # Apply transform to wordpiece-tokenize input.
        (transformed_data, _), _ = (
            (raw_data, raw_metadata)
            | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
                utils.metrics_preprocessing_fn(
                    FLAGS.vocab_file, FLAGS.text_key, FLAGS.language_code_key))
        )

        # Aggregate values for each lang, calculate metrics, and write to output.
        _ = (
            transformed_data
            | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
            | 'CombineStatsForLang' >> beam.CombineGlobally(
                utils.AggregateLang())
            | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
            | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
            | 'WriteMetrics' >> beam.io.WriteToText(
                FLAGS.output_file,
                shard_name_template='',
                header=','.join(columns)))

    return pipeline
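A possible entry point for running the returned pipeline (the absl FLAGS referenced above are assumed to be defined elsewhere in the module):

from absl import app


def main(unused_argv):
    # Build the Beam pipeline and run it to completion.
    result = calculate_metrics().run()
    result.wait_until_finish()


if __name__ == '__main__':
    app.run(main)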