def testLargerBatchSize(self):
  with tf.Session() as sess:
    with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as vocab:
      raw_data = {
          'label': ['1', '2'],
          'text_a': ['The boy jumped into the air.', 'The cat sat on a hat.'],
          'lang': ['en', 'en'],
      }
      expected_wordpieces = [
          'The', '[UNK]', 'jumped', 'in', '##to', 'the', 'air', '.', 'The',
          'cat', 'sat', 'on', 'a', 'h', '##at', '.'
      ]
      vocab.writelines([word + '\n' for word in self.vocab])
      vocab.flush()
      preprocessing_fn = utils.metrics_preprocessing_fn(
          vocab.name, 'text_a', 'lang')
      outputs = preprocessing_fn(raw_data)
      tf.tables_initializer().run()
      outputs = sess.run(outputs)
      self.assertSequenceAlmostEqual(outputs['lang'], ['en', 'en'])
      # 'boy' is not in the vocab, so it maps to [UNK] and its 3 characters
      # count as dropped; the other 20 of 23 non-space characters in the first
      # sentence are preserved. The second sentence is fully covered.
      self.assertSequenceAlmostEqual(outputs['num_preserved_chars'], [20, 16])
      self.assertSequenceAlmostEqual(outputs['num_dropped_chars'], [3, 0])
      self.assertSequenceAlmostEqual(outputs['wordpieces'].values,
                                     expected_wordpieces)
      self.assertSequenceAlmostEqual(outputs['num_non_unk_wordpieces'], [7, 8])
def run_metrics():
  """Creates a pipeline to measure wordpiece vocab metrics over a corpus."""
  metrics_pipeline = beam.Pipeline()
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        metrics_pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(example_converter.decode))

    # Apply transform to wordpiece-tokenize input.
    (metrics_transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
            utils.metrics_preprocessing_fn(FLAGS.vocab_file, FLAGS.text_key,
                                           FLAGS.language_code_key)))

    # Initialize the CSV coder. Aggregate values for each lang, calculate
    # metrics, and write the output to a CSV file.
    csv_converter = tft.coders.CsvCoder(columns, csv_schema)
    _ = (
        metrics_transformed_data
        | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
        | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
        | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
        | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
        | 'WriteMetrics' >> beam.io.WriteToText(
            metrics_file, shard_name_template='', header=','.join(columns)))
  return metrics_pipeline
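Note that `run_metrics()` only builds the pipeline; executing it is left to the caller. A minimal usage sketch follows, assuming `data_file`, `metrics_file`, `raw_metadata`, `example_converter`, `columns`, and `csv_schema` are bound at module level (the function references them without defining them, so this wiring is an assumption):

```python
# Hypothetical invocation; the original code only constructs the pipeline.
pipeline = run_metrics()
result = pipeline.run()        # executes on the configured Beam runner
result.wait_until_finish()     # block until the metrics CSV is written
```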
def testSingleElement(self):
  with tf.Session() as sess:
    with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as vocab:
      vocab.writelines([word + '\n' for word in self.vocab])
      vocab.flush()
      preprocessing_fn = utils.metrics_preprocessing_fn(
          vocab.name, 'text_a', 'lang')
      outputs = preprocessing_fn(self.raw_data)
      tf.tables_initializer().run()
      outputs = sess.run(outputs)
      self.assertEqual(outputs['lang'], 'en')
      self.assertEqual(outputs['num_non_unk_wordpieces'], 7)
      self.assertEqual(outputs['num_preserved_chars'], 20)
      self.assertEqual(outputs['num_dropped_chars'], 3)
      self.assertSequenceAlmostEqual(outputs['wordpieces'].values,
                                     self.expected_wordpieces)
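Both tests rely on `self.vocab`, `self.raw_data`, and `self.expected_wordpieces` from the test fixture, which is not shown here. The sketch below is an illustrative reconstruction inferred from the assertions above (the exact vocabulary and values in the original `setUp` may differ):

```python
def setUp(self):
  super().setUp()
  # Hypothetical fixture, reconstructed from the test assertions.
  # 'boy' is deliberately absent from the vocab so it tokenizes to [UNK].
  self.vocab = [
      'The', 'jumped', 'in', '##to', 'the', 'air', '.', 'cat', 'sat', 'on',
      'a', 'h', '##at', '[UNK]'
  ]
  self.raw_data = {
      'label': ['1'],
      'text_a': ['The boy jumped into the air.'],
      'lang': ['en'],
  }
  self.expected_wordpieces = [
      'The', '[UNK]', 'jumped', 'in', '##to', 'the', 'air', '.'
  ]
```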
def calculate_metrics():
  """Returns a pipeline to compute wordpiece model stats given a vocab and corpus."""
  # Schema of input dataset.
  raw_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'text': tf.FixedLenFeature([], tf.string),
          'language_code': tf.FixedLenFeature([], tf.string),
      }))
  # Schema to format metrics as CSV.
  csv_schema = dataset_schema.from_feature_spec({
      'lang': tf.FixedLenFeature([], tf.string),
      'sample_count': tf.FixedLenFeature([], tf.int64),
      'micro_drop_char_percent': tf.FixedLenFeature([], tf.string),
      'macro_drop_char_percent': tf.FixedLenFeature([], tf.string),
      'micro_compress_ratio': tf.FixedLenFeature([], tf.string),
      'macro_compress_ratio': tf.FixedLenFeature([], tf.string),
      'unweighted_en_wp_overlap_percent': tf.FixedLenFeature([], tf.string),
      'weighted_en_wp_overlap_percent': tf.FixedLenFeature([], tf.string),
  })
  columns = [
      'lang', 'sample_count', 'micro_drop_char_percent',
      'macro_drop_char_percent', 'micro_compress_ratio',
      'macro_compress_ratio', 'unweighted_en_wp_overlap_percent',
      'weighted_en_wp_overlap_percent'
  ]
  # Create pipeline.
  pipeline = beam.Pipeline()
  with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    example_converter = tft.coders.ExampleProtoCoder(
        raw_metadata.schema, serialized=False)
    csv_converter = tft.coders.CsvCoder(columns, csv_schema)

    # Read raw data and convert to TF Transform encoded dict.
    raw_data = (
        pipeline
        | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
            FLAGS.input_file, coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'DecodeInputData' >> beam.Map(example_converter.decode))

    # Apply transform to wordpiece-tokenize input.
    (transformed_data, _), _ = (
        (raw_data, raw_metadata)
        | 'WordpieceTokenizeInput' >> tft_beam.AnalyzeAndTransformDataset(
            utils.metrics_preprocessing_fn(FLAGS.vocab_file, FLAGS.text_key,
                                           FLAGS.language_code_key)))

    # Aggregate values for each lang, calculate metrics, and write to output.
    _ = (
        transformed_data
        | 'CompileTokenInfo' >> beam.ParDo(utils.CompileTokenizationInfo())
        | 'CombineStatsForLang' >> beam.CombineGlobally(utils.AggregateLang())
        | 'CalculateMetrics' >> beam.ParDo(utils.CalculateMetrics())
        | 'EncodeMetrics' >> beam.Map(csv_converter.encode)
        | 'WriteMetrics' >> beam.io.WriteToText(
            FLAGS.output_file, shard_name_template='',
            header=','.join(columns)))
  return pipeline
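`calculate_metrics()` reads `FLAGS.input_file`, `FLAGS.output_file`, `FLAGS.vocab_file`, `FLAGS.text_key`, and `FLAGS.language_code_key`, so the script needs flag definitions and an entry point. A minimal sketch follows; the flag names match the references above, but the `absl` wiring and the defaults for the two key flags are assumptions (the defaults mirror the `raw_metadata` feature spec):

```python
from absl import app
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string('input_file', None,
                    'TFRecord file of tf.Examples to measure.')
flags.DEFINE_string('output_file', None, 'Path for the metrics CSV.')
flags.DEFINE_string('vocab_file', None,
                    'Wordpiece vocab file, one token per line.')
flags.DEFINE_string('text_key', 'text', 'Feature key holding the raw text.')
flags.DEFINE_string('language_code_key', 'language_code',
                    'Feature key holding the language code.')


def main(_):
  # Build the pipeline, run it, and block until the CSV is written.
  calculate_metrics().run().wait_until_finish()


if __name__ == '__main__':
  app.run(main)
```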