Example #1
import tempfile

import apache_beam as beam
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam

# `FLAGS` (absl flags) and the project-local `utils` helpers are assumed to be
# defined by the surrounding module.


def word_count(input_path, output_path, raw_metadata, min_token_frequency=2):
    """Returns a pipeline counting words and writing the output.

  Args:
    input_path: recordio file to read
    output_path: path in which to write the output
    raw_metadata: metadata of input tf.Examples
    min_token_frequency: the min frequency for a token to be included
  """

    lang_set = set(FLAGS.lang_set.split(','))

    # Create pipeline.
    pipeline = beam.Pipeline()

    with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
        converter = tft.coders.ExampleProtoCoder(raw_metadata.schema,
                                                 serialized=False)

        # Read raw data and convert to TF Transform encoded dict.
        raw_data = (
            pipeline
            | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
                input_path, coder=beam.coders.ProtoCoder(tf.train.Example))
            | 'DecodeInputData' >> beam.Map(converter.decode))

        # Apply TF Transform.
        (transformed_data, _), _ = (
            (raw_data, raw_metadata)
            | 'FilterLangAndExtractToken' >>
            tft_beam.AnalyzeAndTransformDataset(
                utils.count_preprocessing_fn(FLAGS.text_key,
                                             FLAGS.language_code_key)))

        # Filter by languages.
        tokens = (
            transformed_data
            | 'FilterByLang' >> beam.ParDo(utils.FilterTokensByLang(lang_set)))

        # Calculate smoothing coefficients.
        coeffs = (tokens
                  | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
                      utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

        # Apply smoothing, aggregate counts, and sort words by count.
        _ = (tokens
             | 'ApplyExponentialSmoothing' >> beam.ParDo(
                 utils.ExponentialSmoothing(), beam.pvalue.AsSingleton(coeffs))
             | 'SumCounts' >> beam.CombinePerKey(sum)
             | 'FilterLowCounts' >> beam.ParDo(
                 utils.FilterByCount(FLAGS.max_word_length,
                                     min_token_frequency))
             | 'MergeAndSortCounts' >> beam.CombineGlobally(
                 utils.SortByCount())
             # SortByCount emits one sorted list of (token, count) pairs;
             # flatten it back into individual elements.
             | 'Flatten' >> beam.FlatMap(lambda x: x)
             | 'FormatCounts' >> beam.Map(lambda tc: '%s\t%s' % (tc[0], tc[1]))
             | 'WriteSortedCount' >> beam.io.WriteToText(
                 output_path, shard_name_template=''))

    return pipeline
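
A minimal sketch of driving this pipeline, assuming the module's flags have
been parsed and raw_metadata describes the input tf.Examples (paths are
illustrative):

# Build the pipeline graph, then execute it on the configured runner.
pipeline = word_count('/tmp/examples.tfrecord', '/tmp/word_counts.txt',
                      raw_metadata)
pipeline.run().wait_until_finish()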
Example #2
  def testNotEqual(self):
    with TestPipeline() as p:
      sample_input = [('I', 'en'), ('kind', 'en'), ('of', 'en'), ('like', 'en'),
                      ('to', 'en'), ('eat', 'en'), ('pie', 'en'), ('!', 'en'),
                      ('Je', 'fr'), ('suis', 'fr'), ('une', 'fr'),
                      ('fille', 'fr'), ('.', 'fr')]
      tokens = p | beam.Create(sample_input)
      result = (tokens
                | beam.CombineGlobally(utils.CalculateCoefficients(0.5))
                | beam.ParDo(CompareValues()))
      assert_that(result, equal_to([True]))
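
CompareValues is defined elsewhere in the test module (the snippet also assumes
the usual Beam test imports: TestPipeline, assert_that, equal_to). A plausible
sketch, an assumption rather than the actual source, is a DoFn that emits True
when the per-language coefficients differ, matching the equal_to([True])
assertion:

class CompareValues(beam.DoFn):
  """Hypothetical checker: are the 'en' and 'fr' coefficients unequal?"""

  def process(self, coeffs):
    # coeffs is the single dict produced by CalculateCoefficients,
    # e.g. {'en': ..., 'fr': ...}.
    yield coeffs['en'] != coeffs['fr']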
Example #3
    def run_vocab():
        """Creates a pipeline to generate wordpiece vocab over a corpus."""

        vocab_pipeline = beam.Pipeline()

        with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
            # Read raw data and convert to TF Transform encoded dict.
            raw_data = (
                vocab_pipeline
                | 'ReadInputData' >> beam.io.tfrecordio.ReadFromTFRecord(
                    data_file, coder=beam.coders.ProtoCoder(tf.train.Example))
                | 'DecodeInputData' >> beam.Map(example_converter.decode))

            # Apply TF Transform.
            (transformed_data, _), _ = (
                (raw_data, raw_metadata)
                | 'FilterLangAndExtractToken' >>
                tft_beam.AnalyzeAndTransformDataset(
                    utils.count_preprocessing_fn(FLAGS.text_key,
                                                 FLAGS.language_code_key)))

            # Filter by languages.
            tokens = (transformed_data
                      | 'FilterByLang' >> beam.ParDo(
                          utils.FilterTokensByLang(lang_set)))

            # Calculate smoothing coefficients.
            coeffs = (
                tokens
                | 'CalculateSmoothingCoefficients' >> beam.CombineGlobally(
                    utils.CalculateCoefficients(FLAGS.smoothing_exponent)))

            # Apply smoothing, aggregate counts, and sort words by count.
            _ = (tokens
                 | 'ApplyExponentialSmoothing' >> beam.ParDo(
                     utils.ExponentialSmoothing(),
                     beam.pvalue.AsSingleton(coeffs))
                 | 'SumCounts' >> beam.CombinePerKey(sum)
                 | 'FilterLowCounts' >> beam.ParDo(
                     utils.FilterByCount(FLAGS.max_word_length,
                                         min_token_frequency))
                 | 'MergeAndSortCounts' >> beam.CombineGlobally(
                     utils.SortByCount())
                 | 'LearnVocab' >> beam.ParDo(utils.LearnVocab(params))
                 # FlatMap iterates the returned string, so each vocab entry is
                 # re-emitted character by character with its trailing '\n';
                 # WriteToText below (append_trailing_newlines=False) then
                 # concatenates them back into one entry per line.
                 | 'Flatten' >> beam.FlatMap(lambda x: x + '\n')
                 | 'WriteVocab' >> beam.io.WriteToText(
                     vocab_file,
                     shard_name_template='',
                     append_trailing_newlines=False))
        return vocab_pipeline
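
run_vocab() closes over names bound in its enclosing scope (data_file,
example_converter, raw_metadata, lang_set, params, vocab_file,
min_token_frequency). A minimal sketch of that setup, mirroring Example #1 and
assuming raw_metadata and the wordpiece learner params are already built (paths
are illustrative):

# Hypothetical enclosing-scope setup for run_vocab().
lang_set = set(FLAGS.lang_set.split(','))
example_converter = tft.coders.ExampleProtoCoder(raw_metadata.schema,
                                                 serialized=False)
data_file = '/tmp/examples.tfrecord'   # input TFRecord of tf.Examples
vocab_file = '/tmp/wordpiece.vocab'    # output vocab path

vocab_pipeline = run_vocab()
vocab_pipeline.run().wait_until_finish()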
Example #4
  def testEqual(self):
    with TestPipeline() as p:
      tokens = p | beam.Create(self.sample_input)
      result = tokens | beam.CombineGlobally(utils.CalculateCoefficients(0.5))
      assert_that(result, equal_to([{'en': 1.0, 'fr': 1.0}]))
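
self.sample_input is a fixture defined elsewhere in the test case, most likely
in setUp. By analogy with Example #2 it would be a list of (token, language)
pairs; for the coefficients to come out as {'en': 1.0, 'fr': 1.0}, the token
counts presumably need to be balanced across the two languages. A hypothetical
fixture:

# Hypothetical fixture: equal token counts for 'en' and 'fr', so
# CalculateCoefficients(0.5) should yield 1.0 for both languages.
self.sample_input = [('I', 'en'), ('like', 'en'), ('pie', 'en'),
                     ('Je', 'fr'), ('suis', 'fr'), ('une', 'fr')]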