Example #1
 def testGetBatchElementsKwargs(self):
     kwargs = batch_util.GetBatchElementsKwargs(batch_size=None)
     self.assertDictEqual(kwargs, {'max_batch_size': 1000})
     kwargs = batch_util.GetBatchElementsKwargs(batch_size=5000)
     self.assertDictEqual(kwargs, {
         'max_batch_size': 5000,
         'min_batch_size': 5000
     })
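
For context, here is a minimal sketch of how these kwargs are typically consumed, assuming Apache Beam is installed and that GetBatchElementsKwargs is importable from tfx_bsl.coders.batch_util (as in recent tfx_bsl releases): the returned dictionary is unpacked straight into beam.BatchElements.

import apache_beam as beam
from tfx_bsl.coders import batch_util

with beam.Pipeline() as p:
  _ = (
      p
      | "Create" >> beam.Create(range(10))
      # With an explicit batch_size, min_batch_size and max_batch_size are
      # pinned to the same value, so every full batch holds exactly 4 elements.
      | "Batch" >> beam.BatchElements(
          **batch_util.GetBatchElementsKwargs(batch_size=4))
      | "Print" >> beam.Map(print))
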
Example #2
 def ptransform_fn(raw_records_pcoll: beam.pvalue.PCollection):
     return (raw_records_pcoll
             | "Batch" >> beam.BatchElements(
                 **batch_util.GetBatchElementsKwargs(batch_size))
             | "Decode" >> beam.ParDo(
                 _DecodeBatchExamplesDoFn(self._GetSchemaForDecoding(),
                                          self.raw_record_column_name)))
Example #3
 def _PTransformFn(raw_record_pcoll: beam.pvalue.PCollection):
     return (raw_record_pcoll
             | "Batch" >> beam.BatchElements(
                 **batch_util.GetBatchElementsKwargs(batch_size))
             | "ToRecordBatch" >> beam.Map(
                 _BatchedRecordsToArrow, self.raw_record_column_name,
                 self._can_produce_large_types))
Example #4
 def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
     return (raw_records_pcoll
             | "BatchElements" >> beam.BatchElements(
                 **batch_util.GetBatchElementsKwargs(batch_size))
             | "Decode" >> beam.ParDo(
                 _RecordsToRecordBatch(self._saved_decoder_path,
                                       self.raw_record_column_name,
                                       self._can_produce_large_types)))
Example #5
 def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
     return (raw_records_pcoll
             | "Batch" >> beam.BatchElements(
                 **batch_util.GetBatchElementsKwargs(batch_size))
             | "Decode" >> beam.ParDo(
                 _DecodeBatchExamplesDoFn(
                     self._schema, self.raw_record_column_name,
                     self._can_produce_large_types)))
Example #6
 def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
   return (
       raw_records_pcoll
       | "BatchElements" >> beam.BatchElements(
           **batch_util.GetBatchElementsKwargs(batch_size))
       | "Decode" >> beam.ParDo(_RecordsToRecordBatch(
           self._saved_decoder_path,
           self.telemetry_descriptors,
           shared.Shared() if self._use_singleton_decoder else None,
           self.raw_record_column_name,
           self._record_index_column_name)))
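
Examples 2 through 6 all follow the same pattern: a nested function takes a PCollection of raw records, groups them with BatchElements using the kwargs from GetBatchElementsKwargs, and hands each batch to a project-specific decoder DoFn. A rough self-contained sketch of that pattern follows; the DoFn is a hypothetical stand-in for decoders such as _DecodeBatchExamplesDoFn or _RecordsToRecordBatch, and the transform name is made up for illustration.

import apache_beam as beam
from tfx_bsl.coders import batch_util


class _CountBatchDoFn(beam.DoFn):
  """Hypothetical stand-in for the decoding DoFns used above."""

  def process(self, batch):
    # BatchElements emits lists of raw records; a real decoder would turn
    # each list into an Arrow RecordBatch here.
    yield len(batch)


@beam.ptransform_fn
def _RawRecordsToBatchSizes(raw_records_pcoll, batch_size):
  return (raw_records_pcoll
          | "Batch" >> beam.BatchElements(
              **batch_util.GetBatchElementsKwargs(batch_size))
          | "Decode" >> beam.ParDo(_CountBatchDoFn()))


with beam.Pipeline() as p:
  _ = (p
       | beam.Create([b"rec1", b"rec2", b"rec3", b"rec4"])
       | _RawRecordsToBatchSizes(batch_size=2)  # pylint: disable=no-value-for-parameter
       | beam.Map(print))  # prints the size of each emitted batch
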
Example #7
def BatchSerializedExamplesToArrowRecordBatches(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants
    .DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
  """Batches serialized examples into Arrow record batches.

  Args:
    examples: A PCollection of serialized tf.Examples.
    desired_batch_size: Batch size. The output Arrow record batches will have as
      many rows as the `desired_batch_size`.

  Returns:
    A PCollection of Arrow record batches.
  """
  return (examples
          | "BatchSerializedExamples" >> beam.BatchElements(
              **batch_util.GetBatchElementsKwargs(desired_batch_size))
          | "BatchDecodeExamples" >> beam.ParDo(_BatchDecodeExamplesDoFn()))
Example #8
def BatchExamplesToArrowRecordBatches(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants
    .DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
  """Batches example dicts into Arrow record batches.

  Args:
    examples: A PCollection of example dicts.
    desired_batch_size: Batch size. The output Arrow record batches will have as
      many rows as the `desired_batch_size`.

  Returns:
    A PCollection of Arrow record batches.
  """
  return (
      examples
      | "BatchBeamExamples" >> beam.BatchElements(
          **batch_util.GetBatchElementsKwargs(desired_batch_size))
      | "DecodeExamplesToRecordBatch" >> beam.Map(
          # pylint: disable=unnecessary-lambda
          lambda x: decoded_examples_to_arrow.DecodedExamplesToRecordBatch(x)))
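
Treating BatchExamplesToArrowRecordBatches exactly as defined above, i.e. as a plain helper taking the PCollection as its first argument (this excerpt does not show whether the original module additionally wraps it with beam.ptransform_fn, in which case it would instead be applied with |), a call might look roughly like this; the feature name and values are made up, and example dicts are assumed to map feature names to numpy arrays:

import apache_beam as beam
import numpy as np

with beam.Pipeline() as p:
  examples = p | "CreateExamples" >> beam.Create([
      {"age": np.array([30], dtype=np.int64)},
      {"age": np.array([40], dtype=np.int64)},
  ])
  # Each element of the result is expected to be a pyarrow.RecordBatch with
  # up to `desired_batch_size` rows.
  record_batches = BatchExamplesToArrowRecordBatches(
      examples, desired_batch_size=1000)
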
Example #9
def CSVToRecordBatch(lines: beam.pvalue.PCollection,
                     column_names: List[Text],
                     desired_batch_size: Optional[int],
                     delimiter: Text = ",",
                     skip_blank_lines: bool = True,
                     schema: Optional[schema_pb2.Schema] = None,
                     multivalent_columns: Optional[List[Union[Text,
                                                              bytes]]] = None,
                     secondary_delimiter: Optional[Union[Text, bytes]] = None,
                     raw_record_column_name: Optional[Text] = None,
                     produce_large_types: bool = False):
  """Decodes CSV records into Arrow RecordBatches.

  Args:
    lines: The pcollection of raw records (csv lines).
    column_names: List of feature names. Order must match the order in the CSV
      file.
    desired_batch_size: Batch size. The output Arrow RecordBatches will have as
      many rows as the `desired_batch_size`. If None, the batch size is
      auto-tuned by Beam.
    delimiter: A one-character string used to separate fields.
    skip_blank_lines: A boolean to indicate whether to skip over blank lines
      rather than interpreting them as missing values.
    schema: An optional schema of the input data. If this is provided, it must
      contain all columns.
    multivalent_columns: Columns that can contain multiple values. If
      secondary_delimiter is provided, this must also be provided.
    secondary_delimiter: Delimiter used for parsing multivalent columns. If
      multivalent_columns is provided, this must also be provided.
    raw_record_column_name: Optional name for a column containing the raw csv
      lines. If this is None, then this column will not be produced. This will
      always be the last column in the record batch.
    produce_large_types: If True, will output record batches with columns that
      are large_list types.

  Returns:
    RecordBatches of the CSV lines.

  Raises:
    ValueError:
      * If the columns do not match the specified csv headers.
      * If the schema has invalid feature types.
      * If the schema does not contain all columns.
      * If raw_record_column_name exists in column_names.
  """
  if (raw_record_column_name is not None and
      raw_record_column_name in column_names):
    raise ValueError(
        "raw_record_column_name: {} is already an existing column name. "
        "Please choose a different name.".format(raw_record_column_name))

  csv_lines_and_raw_records = (
      lines | "ParseCSVLines" >> beam.ParDo(ParseCSVLine(delimiter)))

  if schema is not None:
    column_infos = _GetColumnInfosFromSchema(schema, column_names)
  else:
    # TODO(b/72746442): Consider using a DeepCopy optimization similar to TFT.
    # Do first pass to infer the feature types.
    column_infos = beam.pvalue.AsSingleton(
        csv_lines_and_raw_records
        | "ExtractParsedCSVLines" >> beam.Keys()
        | "InferColumnTypes" >> beam.CombineGlobally(
            ColumnTypeInferrer(
                column_names=column_names,
                skip_blank_lines=skip_blank_lines,
                multivalent_columns=multivalent_columns,
                secondary_delimiter=secondary_delimiter)))

  # Do second pass to generate the RecordBatches.
  return (csv_lines_and_raw_records
          | "BatchCSVLines" >> beam.BatchElements(
              **batch_util.GetBatchElementsKwargs(desired_batch_size))
          | "BatchedCSVRowsToArrow" >> beam.ParDo(
              BatchedCSVRowsToRecordBatch(
                  skip_blank_lines=skip_blank_lines,
                  multivalent_columns=multivalent_columns,
                  secondary_delimiter=secondary_delimiter,
                  raw_record_column_name=raw_record_column_name,
                  produce_large_types=produce_large_types), column_infos))
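
Likewise, treating CSVToRecordBatch exactly as defined above, a minimal invocation that relies on type inference (schema=None) might look like the following; the column names and CSV lines are made up for illustration:

import apache_beam as beam

with beam.Pipeline() as p:
  lines = p | "CreateCSVLines" >> beam.Create(["1,alice", "2,bob"])
  record_batches = CSVToRecordBatch(
      lines,
      column_names=["id", "name"],
      desired_batch_size=1000)  # pass None to let Beam auto-tune batching
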