def testGetBatchElementsKwargs(self):
  """Checks the kwargs produced for auto-tuned and fixed batch sizes."""
  # When no batch size is requested, only an upper bound is imposed and
  # beam auto-tunes the actual batch size.
  auto_tuned_kwargs = batch_util.GetBatchElementsKwargs(batch_size=None)
  self.assertDictEqual(auto_tuned_kwargs, {'max_batch_size': 1000})
  # A fixed batch size pins both the lower and the upper bound.
  fixed_kwargs = batch_util.GetBatchElementsKwargs(batch_size=5000)
  self.assertDictEqual(
      fixed_kwargs, {
          'max_batch_size': 5000,
          'min_batch_size': 5000
      })
def ptransform_fn(raw_records_pcoll: beam.pvalue.PCollection):
  """Batches raw records and decodes each batch into Arrow RecordBatches."""
  # `batch_size` is a free variable captured from the enclosing scope.
  batched_records = raw_records_pcoll | "Batch" >> beam.BatchElements(
      **batch_util.GetBatchElementsKwargs(batch_size))
  decode_do_fn = _DecodeBatchExamplesDoFn(self._GetSchemaForDecoding(),
                                          self.raw_record_column_name)
  return batched_records | "Decode" >> beam.ParDo(decode_do_fn)
def _PTransformFn(raw_record_pcoll: beam.pvalue.PCollection):
  """Batches raw records and maps each batch to an Arrow RecordBatch."""
  # `batch_size` is a free variable captured from the enclosing scope.
  batched_records = raw_record_pcoll | "Batch" >> beam.BatchElements(
      **batch_util.GetBatchElementsKwargs(batch_size))
  return batched_records | "ToRecordBatch" >> beam.Map(
      _BatchedRecordsToArrow, self.raw_record_column_name,
      self._can_produce_large_types)
def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
  """Batches raw records and decodes them with the saved decoder."""
  # Constructing the DoFn up front; `batch_size` comes from the
  # enclosing scope.
  decode_do_fn = _RecordsToRecordBatch(self._saved_decoder_path,
                                       self.raw_record_column_name,
                                       self._can_produce_large_types)
  return (raw_records_pcoll
          | "BatchElements" >> beam.BatchElements(
              **batch_util.GetBatchElementsKwargs(batch_size))
          | "Decode" >> beam.ParDo(decode_do_fn))
def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
  """Batches raw records and decodes each batch via the schema-based DoFn."""
  # `batch_size` is a free variable captured from the enclosing scope.
  decode_do_fn = _DecodeBatchExamplesDoFn(self._schema,
                                          self.raw_record_column_name,
                                          self._can_produce_large_types)
  batched_records = raw_records_pcoll | "Batch" >> beam.BatchElements(
      **batch_util.GetBatchElementsKwargs(batch_size))
  return batched_records | "Decode" >> beam.ParDo(decode_do_fn)
def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
  """Batches raw records and decodes them with the saved decoder DoFn."""
  # A shared handle lets worker processes reuse one decoder instance when
  # the singleton-decoder mode is enabled.
  decoder_shared_handle = (
      shared.Shared() if self._use_singleton_decoder else None)
  decode_do_fn = _RecordsToRecordBatch(self._saved_decoder_path,
                                       self.telemetry_descriptors,
                                       decoder_shared_handle,
                                       self.raw_record_column_name,
                                       self._record_index_column_name)
  return (raw_records_pcoll
          | "BatchElements" >> beam.BatchElements(
              **batch_util.GetBatchElementsKwargs(batch_size))
          | "Decode" >> beam.ParDo(decode_do_fn))
def BatchSerializedExamplesToArrowRecordBatches(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants
    .DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
  """Batches serialized examples into Arrow record batches.

  Args:
    examples: A PCollection of serialized tf.Examples.
    desired_batch_size: Batch size. The output Arrow record batches will have
      as many rows as the `desired_batch_size`.

  Returns:
    A PCollection of Arrow record batches.
  """
  batching_kwargs = batch_util.GetBatchElementsKwargs(desired_batch_size)
  batched_examples = (
      examples
      | "BatchSerializedExamples" >> beam.BatchElements(**batching_kwargs))
  return batched_examples | "BatchDecodeExamples" >> beam.ParDo(
      _BatchDecodeExamplesDoFn())
def BatchExamplesToArrowRecordBatches(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants
    .DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
  """Batches example dicts into Arrow record batches.

  Args:
    examples: A PCollection of example dicts.
    desired_batch_size: Batch size. The output Arrow record batches will have
      as many rows as the `desired_batch_size`.

  Returns:
    A PCollection of Arrow record batches.
  """
  return (
      examples
      | "BatchBeamExamples" >> beam.BatchElements(
          **batch_util.GetBatchElementsKwargs(desired_batch_size))
      # Pass the decoder callable directly; the previous lambda wrapper added
      # nothing and required a pylint unnecessary-lambda suppression.
      | "DecodeExamplesToRecordBatch" >> beam.Map(
          decoded_examples_to_arrow.DecodedExamplesToRecordBatch))
def CSVToRecordBatch(lines: beam.pvalue.PCollection,
                     column_names: List[Text],
                     desired_batch_size: Optional[int],
                     delimiter: Text = ",",
                     skip_blank_lines: bool = True,
                     schema: Optional[schema_pb2.Schema] = None,
                     multivalent_columns: Optional[List[Union[Text,
                                                              bytes]]] = None,
                     secondary_delimiter: Optional[Union[Text, bytes]] = None,
                     raw_record_column_name: Optional[Text] = None,
                     produce_large_types: bool = False):
  """Decodes CSV records into Arrow RecordBatches.

  Args:
    lines: The pcollection of raw records (csv lines).
    column_names: List of feature names. Order must match the order in the
      CSV file.
    desired_batch_size: Batch size. The output Arrow RecordBatches will have
      as many rows as the `desired_batch_size`. If None, the batch size is
      auto tuned by beam.
    delimiter: A one-character string used to separate fields.
    skip_blank_lines: A boolean to indicate whether to skip over blank lines
      rather than interpreting them as missing values.
    schema: An optional schema of the input data. If this is provided, it
      must contain all columns.
    multivalent_columns: Columns that can contain multiple values. If
      secondary_delimiter is provided, this must also be provided.
    secondary_delimiter: Delimiter used for parsing multivalent columns. If
      multivalent_columns is provided, this must also be provided.
    raw_record_column_name: Optional name for a column containing the raw csv
      lines. If this is None, then this column will not be produced. This
      will always be the last column in the record batch.
    produce_large_types: If True, will output record batches with columns
      that are large_list types.

  Returns:
    RecordBatches of the CSV lines.

  Raises:
    ValueError:
      * If the columns do not match the specified csv headers.
      * If the schema has invalid feature types.
      * If the schema does not contain all columns.
      * If raw_record_column_name exists in column_names
  """
  # Guard: the raw-record column must not collide with a data column.
  if (raw_record_column_name is not None and
      raw_record_column_name in column_names):
    raise ValueError(
        "raw_record_column_name: {} is already an existing column name. "
        "Please choose a different name.".format(raw_record_column_name))

  parsed_lines = (
      lines | "ParseCSVLines" >> beam.ParDo(ParseCSVLine(delimiter)))

  if schema is None:
    # TODO(b/72746442): Consider using a DeepCopy optimization similar to TFT.
    # Do first pass to infer the feature types.
    type_inferrer = ColumnTypeInferrer(
        column_names=column_names,
        skip_blank_lines=skip_blank_lines,
        multivalent_columns=multivalent_columns,
        secondary_delimiter=secondary_delimiter)
    column_infos = beam.pvalue.AsSingleton(
        parsed_lines
        | "ExtractParsedCSVLines" >> beam.Keys()
        | "InferColumnTypes" >> beam.CombineGlobally(type_inferrer))
  else:
    column_infos = _GetColumnInfosFromSchema(schema, column_names)

  # Do second pass to generate the RecordBatches.
  rows_to_record_batch = BatchedCSVRowsToRecordBatch(
      skip_blank_lines=skip_blank_lines,
      multivalent_columns=multivalent_columns,
      secondary_delimiter=secondary_delimiter,
      raw_record_column_name=raw_record_column_name,
      produce_large_types=produce_large_types)
  return (parsed_lines
          | "BatchCSVLines" >> beam.BatchElements(
              **batch_util.GetBatchElementsKwargs(desired_batch_size))
          | "BatchedCSVRowsToArrow" >> beam.ParDo(rows_to_record_batch,
                                                  column_infos))