Example #1
def ReadCSVFilesPlain(
    p: beam.Pipeline,
    file_pattern: str,
    fieldnames: typing.List[str],
):
    return (p
            | "Read files csv files" >> beam_io.ReadFromText(
                file_pattern=file_pattern, skip_header_lines=1)
            | "Chunk for parsing" >> beam.BatchElements()
            | "Parse csv lines to dicts" >> beam.FlatMap(
                lambda x: map(dict, csv.DictReader(x, fieldnames=fieldnames)))
            | "Chunk for processing" >> beam.BatchElements()
            | "Convert chunks to DF" >> beam.Map(_to_dataframe))
Example #2
def RunInferenceImpl(  # pylint: disable=invalid-name
    examples: beam.pvalue.PCollection,
    inference_spec_type: model_spec_pb2.InferenceSpecType
) -> beam.pvalue.PCollection:
  """Implementation of RunInference API.

  Args:
    examples: A PCollection containing examples.
    inference_spec_type: Model inference endpoint.

  Returns:
    A PCollection containing prediction logs.

  Raises:
    ValueError: when operation is not supported.
  """
  logging.info('RunInference on model: %s', inference_spec_type)

  batched_examples = examples | 'BatchExamples' >> beam.BatchElements()
  operation_type = _get_operation_type(inference_spec_type)
  if operation_type == OperationType.CLASSIFICATION:
    return batched_examples | 'Classify' >> _Classify(inference_spec_type)
  elif operation_type == OperationType.REGRESSION:
    return batched_examples | 'Regress' >> _Regress(inference_spec_type)
  elif operation_type == OperationType.PREDICTION:
    return batched_examples | 'Predict' >> _Predict(inference_spec_type)
  elif operation_type == OperationType.MULTIHEAD:
    return (batched_examples
            | 'MultiInference' >> _MultiInference(inference_spec_type))
  else:
    raise ValueError('Unsupported operation_type %s' % operation_type)
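
Called with no arguments, as in `BatchExamples` above, BatchElements sizes batches adaptively (in current Beam releases the defaults are min_batch_size=1 and max_batch_size=10000, and batches never cross bundle boundaries). A small self-contained illustration of what the transform emits:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([1, 2, 3, 4])
         | beam.BatchElements(min_batch_size=2, max_batch_size=2)
         | beam.Map(print))  # typically prints [1, 2] and [3, 4]
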
Example #3
def _ExtractPredictions(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection, eval_config: config.EvalConfig,
    eval_shared_models: List[types.EvalSharedModel]) -> beam.pvalue.PCollection:
  """A PTransform that adds predictions and possibly other tensors to extracts.

  Args:
    extracts: PCollection of extracts containing model inputs keyed by
      tfma.FEATURES_KEY (if model inputs are named) or tfma.INPUTS_KEY (if model
      takes raw tf.Examples as input).
    eval_config: Eval config.
    eval_shared_models: Shared model parameters.

  Returns:
    PCollection of Extracts updated with the predictions.
  """
  batch_args = {}
  if eval_config.options.HasField('desired_batch_size'):
    batch_args = dict(
        min_batch_size=eval_config.options.desired_batch_size.value,
        max_batch_size=eval_config.options.desired_batch_size.value)

  extracts = (
      extracts
      | 'Batch' >> beam.BatchElements(**batch_args)
      | 'Predict' >> beam.ParDo(
          _PredictionDoFn(
              eval_config=eval_config, eval_shared_models=eval_shared_models)))

  return extracts
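
Setting min_batch_size equal to max_batch_size, as above, is the standard idiom for forcing fixed-size batches. Condensed (illustrative):

size = eval_config.options.desired_batch_size.value
batch_args = dict(min_batch_size=size, max_batch_size=size)
# Every emitted batch has exactly `size` elements, apart from a possibly
# smaller flush at the end of each bundle.
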
Example #4
 def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     return (pcoll
             | beam.BatchElements()
             | beam.ParDo(
                 _RunInferenceDoFn(shared.Shared(), self._model_loader,
                                   self._clock))
             | beam.FlatMap(_unbatch))
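
`_unbatch` is not shown here; a plausible minimal version, given that beam.FlatMap expects an iterable per input batch (this sketch is illustrative):

def _unbatch(batched_predictions):
    # Yield each prediction on its own so downstream transforms again see
    # one element per original input.
    for prediction in batched_predictions:
        yield prediction
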
Example #5
 def _PTransformFn(raw_record_pcoll: beam.pvalue.PCollection):
     return (
         raw_record_pcoll
         | 'Batch' >> beam.BatchElements(
             **record_based_tfxio.GetBatchElementsKwargs(batch_size))
         | 'ToRecordBatch' >> beam.Map(_BatchedRecordsToArrow,
                                       self.raw_record_column_name))
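
Several snippets here delegate to a GetBatchElementsKwargs helper. A hedged sketch of its likely behavior, reconstructed rather than copied from the tfx_bsl source: pin both bounds when an explicit batch size is given, otherwise return nothing and let BatchElements autotune:

def GetBatchElementsKwargs(batch_size):
    # Illustrative reconstruction, not the library implementation.
    if batch_size is not None:
        return dict(min_batch_size=batch_size, max_batch_size=batch_size)
    return {}
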
Example #6
def _ExtractTFLitePredictions(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection, eval_config: config.EvalConfig,
    eval_shared_models: Dict[Text, types.EvalSharedModel],
    desired_batch_size: Optional[int]) -> beam.pvalue.PCollection:
  """A PTransform that adds predictions and possibly other tensors to extracts.

  Args:
    extracts: PCollection of extracts containing model inputs keyed by
      tfma.FEATURES_KEY.
    eval_config: Eval config.
    eval_shared_models: Shared model parameters keyed by model name.
    desired_batch_size: Optional batch size.

  Returns:
    PCollection of Extracts updated with the predictions.
  """
  batch_args = {}
  # TODO(b/143484017): Consider removing this option if autotuning is better
  # able to handle batch size selection.
  if desired_batch_size is not None:
    batch_args = dict(
        min_batch_size=desired_batch_size, max_batch_size=desired_batch_size)
  else:
    # TODO(b/155887292): Remove the following and allow dynamic batch sizing
    # once the bug is addressed. Also add unit tests to exercise.
    batch_args = dict(min_batch_size=1, max_batch_size=1)

  return (
      extracts
      | 'Batch' >> beam.BatchElements(**batch_args)
      | 'Predict' >> beam.ParDo(
          _TFLitePredictionDoFn(
              eval_config=eval_config, eval_shared_models=eval_shared_models)))
Example #7
    def expand(self, lines: beam.pvalue.PCollection):
        """Decodes the input CSV records into an in-memory dict representation.

    Args:
      lines: A PCollection of strings representing the lines in the CSV file.

    Returns:
      A PCollection of dicts representing the CSV records.
    """
        csv_lines = (lines | 'ParseCSVLines' >> beam.ParDo(
            csv_decoder.ParseCSVLine(self._delimiter)))

        if self._infer_type_from_schema:
            column_infos = _get_feature_types_from_schema(
                self._schema, self._column_names)
        else:
            # TODO(b/72746442): Consider using a DeepCopy optimization similar to TFT.
            # Do first pass to infer the feature types.
            column_infos = beam.pvalue.AsSingleton(
                csv_lines | 'InferColumnTypes' >> beam.CombineGlobally(
                    csv_decoder.ColumnTypeInferrer(
                        column_names=self._column_names,
                        skip_blank_lines=self._skip_blank_lines)))

        # Do second pass to generate the in-memory dict representation.
        return (
            csv_lines
            | 'BatchCSVLines' >> beam.BatchElements(
                **batch_util.GetBeamBatchKwargs(self._desired_batch_size))
            | 'BatchedCSVRowsToArrow' >> beam.ParDo(
                _BatchedCSVRowsToArrow(
                    skip_blank_lines=self._skip_blank_lines), column_infos))
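
The second pass above receives `column_infos` as a singleton side input via beam.pvalue.AsSingleton. A self-contained illustration of that pattern (the numbers are made up):

import apache_beam as beam

with beam.Pipeline() as p:
    total = (p
             | 'Nums' >> beam.Create([1, 2, 3])
             | 'Sum' >> beam.CombineGlobally(sum))
    _ = (p
         | 'Data' >> beam.Create([6, 12])
         | 'Scale' >> beam.Map(lambda x, t: x / t,
                               t=beam.pvalue.AsSingleton(total))
         | beam.Map(print))  # prints 1.0 and 2.0
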
Example #8
    def test_mini_beam_pipeline_batched(self):
        def test_call_fn(batched_model_input, sr, mod, key, name):
            del sr, mod, key, name
            return np.zeros([batched_model_input.shape[0], 5, 1024],
                            np.float32)

        with beam.Pipeline() as root:
            _ = (root
                 | beam.Create([('k1', make_tfexample(5)),
                                ('k2', make_tfexample(5))])
                 | 'Batch' >> beam.BatchElements(min_batch_size=2,
                                                 max_batch_size=2)
                 | beam.ParDo(
                     beam_dofns.ComputeBatchedChunkedSingleEmbeddings(
                         name='all',
                         module='dummy_mod_loc',
                         output_key=['k1'],
                         audio_key='audio',
                         sample_rate_key='sample_rate',
                         sample_rate=None,
                         chunk_len=2,
                         average_over_time=True,
                         feature_fn=None,
                         setup_fn=lambda _: MockModule(['k1']),
                         module_call_fn=test_call_fn))
                 | beam.Map(data_prep_utils.single_audio_emb_to_tfex,
                            embedding_name='ename',
                            audio_key='audio',
                            embedding_length=1024))
Example #9
def run(argv=None):
    """Main entry point; defines and runs the pipeline."""
    logging.info("Starting pipeline.")

    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project=voterdb-test', '--job_name=news-pipeline',
        '--temp_location=gs://voterdb-test-dataflow-temp/',
        '--staging_location=gs://voterdb-test-dataflow-staging/',
        '--requirements_file=requirements.txt', '--max_num_workers=8',
        '--disk_size_gb=50'
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:

        # Creating an initial PCollection of one value is necessary to ensure
        # the HTTP gets are deferred until the DataflowRunner starts
        # in the cloud.  beam.Create(read_zip_csv()) otherwise creates a pickled
        # Python image which is bigger than the upload limit, and fails.

        raw = (p
               | "beam.Create" >> beam.Create(KEYWORDS)
               | "get_news_items" >> beam.FlatMap(get_news_items)
               | "BatchElements" >> beam.BatchElements()
               | "BatchRunner analyze" >> beam.ParDo(BatchRunner(), analyze)
               | "format_bq" >> beam.Map(format_bq)
               | "News.Semantic" >> beam.io.WriteToBigQuery(
                   table='News.Semantic',
                   schema=gen_schema(SCHEMA_FIELDS),
                   write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                   create_disposition=beam.io.BigQueryDisposition.
                   CREATE_IF_NEEDED))
Example #10
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='Input folder to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output folder to write results to.')
    parser.add_argument('--models',
                        dest='models',
                        help='Input folder to read model parameters.')
    parser.add_argument('--batchsize',
                        dest='batchsize',
                        type=int,
                        help='Batch size for processing')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=pipeline_options) as p:
        filtered_images = (
            p | "Read Images" >> beam.Create(
                glob.glob(known_args.input + '*wms*' + '.png'))
            | "Batch elements" >> beam.BatchElements(20, known_args.batchsize)
            | "Filter Cloudy images" >> beam.ParDo(
                FilterCloudyFn.FilterCloudyFn(known_args.models)))

        filtered_images | "Segment for Land use" >> beam.ParDo(
            UNetInference.UNetInferenceFn(known_args.models,
                                          known_args.output))
Example #11
def _ExtractPredictions(  # pylint: disable=invalid-name
        extracts: beam.pvalue.PCollection,
        eval_shared_model: types.EvalSharedModel,
        desired_batch_size: Optional[int] = None) -> beam.pvalue.PCollection:
    """A PTransform that adds predictions and possibly other tensors to extracts.

  Args:
    extracts: PCollection of extracts containing model inputs keyed by
      tfma.FEATURES_KEY (if model inputs are named) or tfma.INPUTS_KEY (if model
      takes raw tf.Examples as input).
    eval_shared_model: Shared model parameters.
    desired_batch_size: Optional batch size for prediction.

  Returns:
    PCollection of Extracts updated with the predictions.
  """
    batch_args = {}
    if desired_batch_size:
        batch_args = dict(min_batch_size=desired_batch_size,
                          max_batch_size=desired_batch_size)

    extracts = (extracts
                | 'Batch' >> beam.BatchElements(**batch_args)
                | 'Predict' >> beam.ParDo(
                    _PredictionDoFn(eval_shared_model=eval_shared_model)))

    return extracts
Example #12
 def ptransform_fn(raw_records_pcoll: beam.pvalue.PCollection):
     return (raw_records_pcoll
             | "Batch" >> beam.BatchElements(
                 **batch_util.GetBatchElementsKwargs(batch_size))
             | "Decode" >> beam.ParDo(
                 _DecodeBatchExamplesDoFn(self._GetSchemaForDecoding(),
                                          self.raw_record_column_name)))
Example #13
def ModelAgnosticExtract(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    model_agnostic_config: agnostic_predict.ModelAgnosticConfig,
    desired_batch_size: Optional[int] = None) -> beam.pvalue.PCollection:
  """A PTransform that generates features, predictions, labels.

  Args:
    extracts: PCollection of Extracts containing a serialized example to be fed
      to the model.
    model_agnostic_config: A config specifying how to extract
      FeaturesPredictionsLabels from the input Extracts.
    desired_batch_size: Optional batch size for batching in Aggregate.

  Returns:
    PCollection of Extracts, where the extracts contains the features,
    predictions, labels retrieved.
  """
  batch_args = {}
  if desired_batch_size:
    batch_args = dict(
        min_batch_size=desired_batch_size, max_batch_size=desired_batch_size)
  return (extracts
          | 'Batch' >> beam.BatchElements(**batch_args)
          | 'ModelAgnosticExtract' >> beam.ParDo(
              _ModelAgnosticExtractDoFn(
                  model_agnostic_config=model_agnostic_config)))
Example #14
 def _PTransformFn(raw_record_pcoll: beam.pvalue.PCollection):
     return (raw_record_pcoll
             | "Batch" >> beam.BatchElements(
                 **batch_util.GetBatchElementsKwargs(batch_size))
             | "ToRecordBatch" >> beam.Map(
                 _BatchedRecordsToArrow, self.raw_record_column_name,
                 self._can_produce_large_types))
Example #15
def _ExtractPredictions(  # pylint: disable=invalid-name
        extracts: beam.pvalue.PCollection, eval_config: config.EvalConfig,
        eval_shared_models: Dict[Text, types.EvalSharedModel],
        desired_batch_size: Optional[int]) -> beam.pvalue.PCollection:
    """A PTransform that adds predictions and possibly other tensors to extracts.

  Args:
    extracts: PCollection of extracts containing model inputs keyed by
      tfma.FEATURES_KEY (if model inputs are named) or tfma.INPUTS_KEY (if model
      takes raw tf.Examples as input).
    eval_config: Eval config.
    eval_shared_models: Shared model parameters keyed by model name.
    desired_batch_size: Optional batch size.

  Returns:
    PCollection of Extracts updated with the predictions.
  """
    batch_args = {}
    # TODO(b/143484017): Consider removing this option if autotuning is better
    # able to handle batch size selection.
    if desired_batch_size is not None:
        batch_args = dict(min_batch_size=desired_batch_size,
                          max_batch_size=desired_batch_size)

    return (extracts
            | 'Batch' >> beam.BatchElements(**batch_args)
            | 'Predict' >> beam.ParDo(
                _PredictionDoFn(eval_config=eval_config,
                                eval_shared_models=eval_shared_models)))
Example #16
def BatchExamplesToArrowTables(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
    """Batches example dicts into Arrow tables.

  Args:
    examples: A PCollection of example dicts.
    desired_batch_size: Batch size. The output Arrow tables will have as many
      rows as the `desired_batch_size`.

  Returns:
    A PCollection of Arrow tables.
  """
    # DecodedExamplesToTable should be called within a lambda function instead of
    # specifying the function name in beam.Map for the reasons discussed in
    # b/143648957.
    # TODO(b/131315065): Remove the comment above when the CSV decoder no longer
    # uses BatchExamplesToArrowTables.
    return (
        examples
        | "BatchBeamExamples" >>
        beam.BatchElements(**GetBeamBatchKwargs(desired_batch_size))
        | "DecodeExamplesToTable" >>
        # pylint: disable=unnecessary-lambda
        beam.Map(lambda x: decoded_examples_to_arrow.DecodedExamplesToTable(x))
    )
Example #17
 def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
     return (raw_records_pcoll
             | "BatchElements" >> beam.BatchElements(
                 **batch_util.GetBatchElementsKwargs(batch_size))
             | "Decode" >> beam.ParDo(
                 _RecordsToRecordBatch(self._saved_decoder_path,
                                       self.raw_record_column_name,
                                       self._can_produce_large_types)))
Example #18
 def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
     return (raw_records_pcoll
             | "Batch" >> beam.BatchElements(
                 **batch_util.GetBatchElementsKwargs(batch_size))
             | "Decode" >> beam.ParDo(
                 _DecodeBatchExamplesDoFn(
                     self._schema, self.raw_record_column_name,
                     self._can_produce_large_types)))
Example #19
 def _ptransform_fn(raw_records_pcoll: beam.pvalue.PCollection):
     return (
         raw_records_pcoll
         | "Batch" >> beam.BatchElements(
             **record_based_tfxio.GetBatchElementsKwargs(batch_size))
         | "Decode" >> beam.ParDo(
             _DecodeBatchExamplesDoFn(self._schema,
                                      self.raw_record_column_name)))
Example #20
    def expand(self, pcoll: beam.pvalue.PCollection):
        record_batches = (pcoll
                          | beam.BatchElements(
                              min_batch_size=self._desired_batch_size,
                              max_batch_size=self._desired_batch_size)
                          | beam.ParDo(RecordsToTable(), self._column_specs))

        return record_batches
Example #21
 def expand(self, pcoll):
   return (
       pcoll
       | 'BatchElements' >> beam.BatchElements(
           min_batch_size=self.num_threads,
           max_batch_size=self.num_threads,
       )
       | 'ParDo' >> beam.ParDo(self.get_dofn(), *self.args, **self.kwargs)
   )
Example #22
 def _ptransform_fn(pipeline: beam.pvalue.PCollection):
     # TODO(zhuo): collect telemetry from RecordBatches.
     return (
         pipeline
         | "ReadExamples" >> self._SerializedExamplesSource()
         | "Batch" >> beam.BatchElements(
             **_GetBatchElementsKwargs(batch_size))
         | "Decode" >> beam.ParDo(_DecodeBatchExamplesDoFn(self._schema)))
Example #23
def _TFMAPredict(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_shared_models: Dict[str, types.EvalSharedModel],
    desired_batch_size: Optional[int] = None,
    materialize: Optional[bool] = True,
    eval_config: Optional[
        config_pb2.EvalConfig] = None) -> beam.pvalue.PCollection:
  """A PTransform that adds predictions to Extracts.

  Args:
    extracts: PCollection of Extracts containing a serialized example to be fed
      to the model.
    eval_shared_models: Shared model parameters keyed by model name.
    desired_batch_size: Optional. Desired batch size for prediction.
    materialize: True to call the FeatureExtractor to add MaterializedColumn
      entries for the features, predictions, and labels.
    eval_config: Eval config.

  Returns:
    PCollection of Extracts, where the extracts contains the features,
    predictions, labels retrieved.
  """
  if not eval_config:
    batch_args = {}

    # TODO(b/143484017): Consider removing this option if autotuning is better
    # able to handle batch size selection.
    if desired_batch_size:
      batch_args = dict(
          min_batch_size=desired_batch_size, max_batch_size=desired_batch_size)

    extracts = (extracts | 'Batch' >> beam.BatchElements(**batch_args))
  else:
    extracts = (
        extracts
        | 'UnwrapBatchedExtract' >> beam.Map(_unwrap_batched_extract))

  # We don't actually need add_metrics_callbacks to run Predict, but if we
  # want to share the model between Predict and subsequent stages (i.e. the
  # same shared handle is used throughout), the callbacks must be added here;
  # otherwise they won't be present in the model when later stages reuse it.
  extracts = (
      extracts
      | 'Predict' >> beam.ParDo(
          _TFMAPredictionDoFn(
              eval_shared_models=eval_shared_models, eval_config=eval_config)))

  if materialize and not eval_config:
    additional_fetches = []
    for m in eval_shared_models.values():
      if m.additional_fetches:
        additional_fetches.extend(m.additional_fetches)
    return extracts | 'ExtractFeatures' >> legacy_feature_extractor._ExtractFeatures(  # pylint: disable=protected-access
        additional_extracts=additional_fetches or None)

  return extracts
Example #24
 def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
   return (
       raw_records_pcoll
       | "BatchElements" >> beam.BatchElements(
           **batch_util.GetBatchElementsKwargs(batch_size))
       | "Decode" >> beam.ParDo(_RecordsToRecordBatch(
           self._saved_decoder_path,
           self.telemetry_descriptors,
           shared.Shared() if self._use_singleton_decoder else None,
           self.raw_record_column_name,
           self._record_index_column_name)))
Example #25
 def expand(
         self, pcoll: beam.PCollection[ExampleT]
 ) -> beam.PCollection[PredictionT]:
     resource_hints = self._model_handler.get_resource_hints()
     return (
         pcoll
         # TODO(https://github.com/apache/beam/issues/21440): Hook into the
         # batching DoFn APIs.
         | beam.BatchElements(**self._model_handler.batch_elements_kwargs())
         | (beam.ParDo(
             _RunInferenceDoFn(self._model_handler, self._clock),
             self._inference_args).with_resource_hints(**resource_hints)))
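
Here batch_elements_kwargs() lets each ModelHandler tune the batching stage. A sketch of overriding it in a handler subclass (the class below is hypothetical; the other required ModelHandler methods are elided):

class FixedBatchModelHandler(SomeConcreteModelHandler):  # hypothetical base
    def batch_elements_kwargs(self):
        # Forwarded verbatim to beam.BatchElements in expand() above.
        return {'min_batch_size': 8, 'max_batch_size': 64}
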
Example #26
    def _RawRecordsToRecordBatch(pcoll, batch_size):
      batch_size = 1 if not batch_size else batch_size

      class _CreateRBDoFn(beam.DoFn):

        def process(self, examples):
          return [
              pa.RecordBatch.from_arrays([pa.array(examples)], ["column_name"])
          ]

      return (pcoll | beam.BatchElements(batch_size)
              | beam.ParDo(_CreateRBDoFn()))
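
Note that the positional call above binds batch_size to min_batch_size only, leaving max_batch_size at its default (the Beam signature is roughly BatchElements(min_batch_size=1, max_batch_size=10000, ...)). If exact batches were intended, the explicit form would be:

pcoll | beam.BatchElements(min_batch_size=batch_size, max_batch_size=batch_size)
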
Example #27
def durations(root, ds_file, ds_name, reader_type, suffix):
    """Beam pipeline for durations from a particular file or glob."""
    logging.info('Reading from %s: (%s, %s)', reader_type, ds_name, ds_file)
    input_examples = utils.reader_functions[reader_type](root, ds_file,
                                                         f'Read-{suffix}')
    if FLAGS.batch_size:
        input_examples = input_examples | f'Batch-{suffix}' >> beam.BatchElements(
            min_batch_size=FLAGS.batch_size, max_batch_size=FLAGS.batch_size)
        return input_examples | f'Lens-{suffix}' >> beam.FlatMap(
            durations_from_tfexs)
    else:
        return input_examples | f'Lens-{suffix}' >> beam.Map(
            duration_from_tfex)
Example #28
def _Predict(  # pylint: disable=invalid-name
        examples,
        eval_saved_model_path,
        desired_batch_size=None):
    batch_args = {}
    if desired_batch_size:
        batch_args = dict(min_batch_size=desired_batch_size,
                          max_batch_size=desired_batch_size)
    return (examples
            | 'Batch' >> beam.BatchElements(**batch_args)
            | beam.ParDo(
                _PredictionDoFn(eval_saved_model_path=eval_saved_model_path,
                                add_metrics_callbacks=None,
                                shared_handle=shared.Shared())))
Example #29
def run(args=None):
    options = PipelineOptions()
    pipeline = beam.Pipeline(options=options)
    runtime_options = options.view_as(RunTimeOptions)
    gcp_project = options.view_as(GoogleCloudOptions).project
    _ = (pipeline
         | "Read Files" >> beam.io.ReadFromText(runtime_options.input)
         | "Parse Event" >> beam.ParDo(ParseEventFn())
         | "BatchElements" >> beam.BatchElements(max_batch_size=200)
         | "Apply DLP" >> beam.ParDo(
             TokenizationFxn(gcp_project, runtime_options.deIdentiyTemplateId))
         | "Write to BigQuery" >> beam.io.WriteToBigQuery(
             runtime_options.bq,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
    pipeline.run()
Example #30
def run(argv=None):

    pipeline_options = PipelineOptions(argv)
    lda_options = pipeline_options.view_as(LdaOptions)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    with apache_beam.Pipeline(options=pipeline_options) as p:
        articles = (p
                    | "Read Articles" >> apache_beam.Create(
                        glob.glob(lda_options.input + '*.txt'))
                    | apache_beam.Map(load_text)
                    | "Batch elements" >> apache_beam.BatchElements(
                        min_batch_size=lda_options.batchsize,
                        max_batch_size=lda_options.batchsize))
        _ = (articles
             | apache_beam.ParDo(
                 LdaFn(lda_options.K, lda_options.tau0, lda_options.kappa))
             | "Write" >> WriteToText("test.txt"))