def word_count(input_path, output_path, raw_metadata, min_token_frequency=2):
    """Builds a Beam pipeline that counts words and writes sorted counts.

    Args:
      input_path: recordio file to read
      output_path: path in which to write the output
      raw_metadata: metadata of input tf.Examples
      min_token_frequency: the min frequency for a token to be included

    Returns:
      The constructed (not yet run) beam.Pipeline.
    """
    allowed_langs = set(FLAGS.lang_set.split(','))

    pipeline = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        example_decoder = tft.coders.ExampleProtoCoder(
            raw_metadata.schema, serialized=False)

        # Read the TFRecord input and decode each tf.Example into a
        # TF Transform encoded dict.
        decoded_examples = (
            pipeline
            | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
                input_path, coder=beam.coders.ProtoCoder(tf.train.Example))
            | 'DecodeInputData' >> beam.Map(example_decoder.decode))

        # Run TF Transform; keep only the transformed dataset, dropping its
        # metadata and the transform function.
        transformed_dataset, _ = (
            (decoded_examples, raw_metadata)
            | 'FilterLangAndExtractToken' >>
            tft_beam.AnalyzeAndTransformDataset(
                utils.count_preprocessing_fn(
                    FLAGS.text_key, FLAGS.language_code_key)))
        transformed_data, _ = transformed_dataset

        # Keep only tokens whose language is in the allowed set.
        filtered_tokens = (
            transformed_data
            | 'FilterByLang' >> beam.ParDo(
                utils.FilterTokensByLang(allowed_langs)))

        # Compute smoothing coefficients over the whole token stream; the
        # result is fed back in below as a singleton side input.
        smoothing_coeffs = (
            filtered_tokens
            | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
                utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

        # Smooth, aggregate, filter, sort, format and write the counts.
        _ = (
            filtered_tokens
            | 'ApplyExponentialSmoothing' >> beam.ParDo(
                utils.ExponentialSmoothing(),
                beam.pvalue.AsSingleton(smoothing_coeffs))
            | 'SumCounts' >> beam.CombinePerKey(sum)
            | 'FilterLowCounts' >> beam.ParDo(
                utils.FilterByCount(FLAGS.max_word_length,
                                    min_token_frequency))
            | 'MergeAndSortCounts' >> beam.CombineGlobally(utils.SortByCount())
            | 'Flatten' >> beam.FlatMap(lambda counts: counts)
            | 'FormatCounts' >> beam.Map(
                lambda pair: '%s\t%s' % (pair[0], pair[1]))
            | 'WriteSortedCount' >> beam.io.WriteToText(
                output_path, shard_name_template=''))

    return pipeline
 def testLangInLangSet(self):
     """Tokens pass through unchanged when 'en' is in the language set."""
     expected = [('I', 'en'), ('like', 'en'), ('pie', 'en'), ('.', 'en')]
     with TestPipeline() as pipeline:
         token_pcoll = pipeline | beam.Create(self.sample_input)
         filtered = token_pcoll | beam.ParDo(
             utils.FilterTokensByLang({'en'}))
         assert_that(filtered, equal_to(expected))
 def testLangNotInLangSetIncludeOthers(self):
     """Tokens are relabeled 'other' when the include-others flag is set."""
     expected = [('I', 'other'), ('like', 'other'), ('pie', 'other'),
                 ('.', 'other')]
     with TestPipeline() as pipeline:
         source = pipeline | beam.Create(self.sample_input)
         labeled = source | beam.ParDo(
             utils.FilterTokensByLang({'fr'}, True))
         assert_that(labeled, equal_to(expected))
# Example 4
    def run_vocab():
        """Creates a pipeline to generate wordpiece vocab over a corpus.

        Reads tf.Examples, extracts (token, language) pairs via TF Transform,
        filters by language, applies exponential smoothing to counts, learns a
        wordpiece vocabulary, and writes it out. `data_file`,
        `example_converter`, `raw_metadata`, `lang_set`,
        `min_token_frequency`, `params`, and `vocab_file` are all closed over
        from the enclosing scope.

        Returns:
          The constructed (not yet run) beam.Pipeline.
        """

        vocab_pipeline = beam.Pipeline()

        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # Read raw data and convert to TF Transform encoded dict.
            raw_data = (
                vocab_pipeline
                | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
                    data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
                | 'DecodeInputData' >> beam.Map(example_converter.decode))

            # Apply TF Transform. Only the transformed dataset is kept; its
            # metadata and the transform_fn are discarded.
            (transformed_data,
             _), _ = ((raw_data, raw_metadata)
                      | 'FilterLangAndExtractToken' >>
                      tft_beam.AnalyzeAndTransformDataset(
                          utils.count_preprocessing_fn(
                              FLAGS.text_key, FLAGS.language_code_key)))

            # Filter by languages.
            tokens = (transformed_data
                      | 'FilterByLang' >> beam.ParDo(
                          utils.FilterTokensByLang(lang_set)))

            # Calculate smoothing coefficients (used below as a singleton
            # side input to the smoothing step).
            coeffs = (
                tokens
                | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
                    utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

            # Apply smoothing, aggregate counts, and sort words by count.
            _ = (tokens
                 | 'ApplyExponentialSmoothing' >> beam.ParDo(
                     utils.ExponentialSmoothing(),
                     beam.pvalue.AsSingleton(coeffs))
                 | 'SumCounts' >> beam.CombinePerKey(sum)
                 | 'FilterLowCounts' >> beam.ParDo(
                     utils.FilterByCount(FLAGS.max_word_length,
                                         min_token_frequency))
                 | 'MergeAndSortCounts' >> beam.CombineGlobally(
                     utils.SortByCount())
                 | 'LearnVocab' >> beam.ParDo(utils.LearnVocab(params))
                 # NOTE(review): FlatMap over a string yields its individual
                 # characters; combined with append_trailing_newlines=False
                 # below, WriteToText reassembles them verbatim. This
                 # presumably relies on LearnVocab emitting the vocab as one
                 # newline-joined string -- confirm before changing this stage.
                 | 'Flatten' >> beam.FlatMap(lambda x: x + '\n')
                 | 'WriteVocab' >> beam.io.WriteToText(
                     vocab_file,
                     shard_name_template='',
                     append_trailing_newlines=False))
        return vocab_pipeline
 def testLangNotInLangSet(self):
     """No tokens survive when the sample's language is absent from the set."""
     with TestPipeline() as pipeline:
         input_tokens = pipeline | beam.Create(self.sample_input)
         filtered = input_tokens | beam.ParDo(
             utils.FilterTokensByLang({'fr'}))
         assert_that(filtered, equal_to([]))