def expand(self, lines: beam.pvalue.PCollection):
  """Decodes the input CSV records into an in-memory Arrow representation.

  Args:
    lines: A PCollection of strings representing the lines in the CSV file.

  Returns:
    A PCollection of Arrow-format batches representing the CSV records.
  """
  csv_lines = (
      lines | 'ParseCSVLines' >> beam.ParDo(
          csv_decoder.ParseCSVLine(self._delimiter)))

  if self._infer_type_from_schema:
    column_infos = _get_feature_types_from_schema(self._schema,
                                                  self._column_names)
  else:
    # TODO(b/72746442): Consider using a DeepCopy optimization similar to TFT.
    # Do first pass to infer the feature types.
    column_infos = beam.pvalue.AsSingleton(
        csv_lines | 'InferColumnTypes' >> beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(
                column_names=self._column_names,
                skip_blank_lines=self._skip_blank_lines)))

  # Do second pass to generate the in-memory Arrow representation.
  return (
      csv_lines
      | 'BatchCSVLines' >> beam.BatchElements(
          **batch_util.GetBeamBatchKwargs(self._desired_batch_size))
      | 'BatchedCSVRowsToArrow' >> beam.ParDo(
          _BatchedCSVRowsToArrow(skip_blank_lines=self._skip_blank_lines),
          column_infos))
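# A minimal usage sketch for the transform above, assuming `expand` belongs
# to a PTransform. The class name `DecodeCSV` and its constructor arguments
# are hypothetical, mirroring the attributes referenced in expand()
# (_delimiter, _column_names, _skip_blank_lines, _desired_batch_size); the
# snippet itself does not show the class definition.
import apache_beam as beam

with beam.Pipeline() as p:
  arrow_batches = (
      p
      | 'ReadLines' >> beam.io.ReadFromText(
          '/tmp/data.csv', skip_header_lines=1)  # illustrative path
      | 'DecodeCSV' >> DecodeCSV(  # hypothetical transform name
          column_names=['f1', 'f2'],
          delimiter=',',
          skip_blank_lines=True,
          desired_batch_size=1000))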
def test_parse_csv_lines(self,
                         input_lines,
                         column_names,
                         expected_csv_cells,
                         expected_types,
                         skip_blank_lines=False,
                         delimiter=','):

  def _check_csv_cells(actual):
    self.assertEqual(expected_csv_cells, actual)

  def _check_types(actual):
    self.assertLen(actual, 1)
    self.assertCountEqual([
        csv_decoder.ColumnInfo(n, t)
        for n, t in zip(column_names, expected_types)
    ], actual[0])

  with beam.Pipeline() as p:
    parsed_csv_cells = (
        p
        | beam.Create(input_lines, reshuffle=False)
        | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=delimiter)))
    inferred_types = parsed_csv_cells | beam.CombineGlobally(
        csv_decoder.ColumnTypeInferrer(
            column_names, skip_blank_lines=skip_blank_lines))

    beam_test_util.assert_that(
        parsed_csv_cells, _check_csv_cells, label='check_parsed_csv_cells')
    beam_test_util.assert_that(
        inferred_types, _check_types, label='check_types')
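# A hedged sketch of one parameter set that could drive the test above. The
# cell values are illustrative, and the expected-type constants assume the
# TFDV-style `statistics_pb2.FeatureNameStatistics` enum; the exact enum
# used depends on the csv_decoder version.
_EXAMPLE_TEST_CASE = dict(
    input_lines=['1,2.0,hello'],
    column_names=['int_feature', 'float_feature', 'str_feature'],
    expected_csv_cells=[[b'1', b'2.0', b'hello']],
    expected_types=[
        statistics_pb2.FeatureNameStatistics.INT,  # assumed enum constants
        statistics_pb2.FeatureNameStatistics.FLOAT,
        statistics_pb2.FeatureNameStatistics.STRING,
    ])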
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.Artifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input_base: input dir that contains csv data. csv files must have
        header line.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
  input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
  csv_pattern = os.path.join(input_base_uri, split_pattern)
  absl.logging.info(
      'Processing input csv data {} to TFExample.'.format(csv_pattern))

  csv_files = tf.io.gfile.glob(csv_pattern)
  if not csv_files:
    raise RuntimeError(
        'Split pattern {} does not match any files.'.format(csv_pattern))

  column_names = io_utils.load_csv_column_names(csv_files[0])
  # Use a distinct loop variable to avoid shadowing the `csv_files` list.
  for csv_file in csv_files[1:]:
    if io_utils.load_csv_column_names(csv_file) != column_names:
      raise RuntimeError(
          'Files in same split {} have different header.'.format(csv_pattern))

  parsed_csv_lines = (
      pipeline
      | 'ReadFromText' >> beam.io.ReadFromText(
          file_pattern=csv_pattern, skip_header_lines=1)
      | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=',')))
  column_infos = beam.pvalue.AsSingleton(
      parsed_csv_lines
      | 'InferColumnTypes' >> beam.CombineGlobally(
          csv_decoder.ColumnTypeInferrer(column_names, skip_blank_lines=True)))

  return (parsed_csv_lines
          | 'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(), column_infos))
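# A hedged invocation sketch for the input_dict-based variant above. The
# artifact type (`standard_artifacts.ExternalArtifact`) and both paths are
# assumptions for illustration only.
import apache_beam as beam

external_input = standard_artifacts.ExternalArtifact()  # assumed type
external_input.uri = '/path/to/csv_dir'  # illustrative path
with beam.Pipeline() as p:
  examples = _CsvToExample(
      pipeline=p,
      input_dict={'input_base': [external_input]},
      exec_properties={},
      split_pattern='train/*')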
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline, exec_properties: Dict[Text, Any],
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
      - input_base: input dir that contains CSV data. CSV must have header
        line.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
  input_base_uri = exec_properties[utils.INPUT_BASE_KEY]
  csv_pattern = os.path.join(input_base_uri, split_pattern)
  logging.info('Processing input csv data %s to TFExample.', csv_pattern)

  csv_files = tf.io.gfile.glob(csv_pattern)
  if not csv_files:
    raise RuntimeError(
        'Split pattern {} does not match any files.'.format(csv_pattern))

  column_names = io_utils.load_csv_column_names(csv_files[0])
  for csv_file in csv_files[1:]:
    if io_utils.load_csv_column_names(csv_file) != column_names:
      raise RuntimeError(
          'Files in same split {} have different header.'.format(csv_pattern))

  parsed_csv_lines = (
      pipeline
      | 'ReadFromText' >> beam.io.ReadFromText(
          file_pattern=csv_pattern, skip_header_lines=1)
      | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=',')))
  # TODO(b/155997704): clean this up once tfx_bsl makes a release.
  if getattr(csv_decoder, 'PARSE_CSV_LINE_YIELDS_RAW_RECORDS', False):
    # Each element is a (parsed_lines, raw_records) tuple; we only want
    # the parsed_lines.
    parsed_csv_lines |= 'ExtractParsedCSVLines' >> beam.Keys()
  column_infos = beam.pvalue.AsSingleton(
      parsed_csv_lines
      | 'InferColumnTypes' >> beam.CombineGlobally(
          csv_decoder.ColumnTypeInferrer(column_names, skip_blank_lines=True)))

  return (parsed_csv_lines
          | 'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(), column_infos))
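# The compatibility branch above relies on beam.Keys() extracting the first
# element of each (parsed_lines, raw_record) pair. A minimal self-contained
# illustration of that behavior (the values are made up):
import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([(['1', '2.0'], '1,2.0')])  # (parsed_cells, raw_record)
      | beam.Keys()  # keeps only the parsed cells, e.g. ['1', '2.0']
      | beam.Map(print))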
def test_invalid_row(self):
  input_lines = ['1,2.0,hello', '5,12.34']
  column_names = ['int_feature', 'float_feature', 'str_feature']
  with self.assertRaisesRegexp(
      ValueError, '.*Columns do not match specified csv headers.*'):
    with beam.Pipeline() as p:
      result = (
          p
          | beam.Create(input_lines, reshuffle=False)
          | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
          | beam.CombineGlobally(
              csv_decoder.ColumnTypeInferrer(
                  column_names, skip_blank_lines=False)))
      beam_test_util.assert_that(result, lambda _: None)
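# For contrast, a hedged sketch of the well-formed case: when every row has
# exactly one cell per entry in `column_names`, the same pipeline runs to
# completion without raising (input values are illustrative; this sketch
# would live in the same test class as test_invalid_row above):
def test_valid_rows_sketch(self):
  input_lines = ['1,2.0,hello', '5,12.34,world']
  column_names = ['int_feature', 'float_feature', 'str_feature']
  with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(input_lines, reshuffle=False)
        | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
        | beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(
                column_names, skip_blank_lines=False)))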
def convert_csv_to_tf_examples(self, csv_path, tfrecords_output_path):
  """Runs a Beam pipeline to convert the CSV file into a TFRecords file.

  This is needed because the conversion is orders of magnitude more
  time-consuming than the functions we want to benchmark, so instead of
  doing the conversion each time, we do it once to generate a converted
  dataset and use that for the benchmark instead.

  Args:
    csv_path: Path to CSV file containing examples.
    tfrecords_output_path: Path to output TFRecords file containing parsed
      examples.
  """
  # Copied from CSV example gen.
  with open(csv_path, "r") as fp:
    column_names = next(fp).strip().split(",")
  with beam.Pipeline() as p:
    parsed_csv_lines = (
        p
        | "ReadFromText" >> beam.io.ReadFromText(
            file_pattern=csv_path, skip_header_lines=1)
        | "ParseCSVLine" >> beam.ParDo(
            csv_decoder.ParseCSVLine(delimiter=",")))
    # TODO(b/155997704): clean this up once tfx_bsl makes a release.
    if getattr(csv_decoder, "PARSE_CSV_LINE_YIELDS_RAW_RECORDS", False):
      # Each element is a (parsed_lines, raw_records) tuple; we only want
      # the parsed_lines.
      parsed_csv_lines |= "ExtractParsedCSVLines" >> beam.Keys()
    column_infos = beam.pvalue.AsSingleton(
        parsed_csv_lines
        | "InferColumnTypes" >> beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(
                column_names, skip_blank_lines=True)))
    _ = (
        parsed_csv_lines
        | "ToTFExample" >> beam.ParDo(
            csv_exgen._ParsedCsvToTfExample(),  # pylint: disable=protected-access
            column_infos)
        | "Serialize" >> beam.Map(lambda x: x.SerializeToString())
        | "WriteToTFRecord" >> beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix=tfrecords_output_path,
            shard_name_template="",
            compression_type=beam.io.filesystem.CompressionTypes.GZIP))
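# Hedged usage sketch for the helper above. The benchmark class name
# `CsvToExampleBenchmark` and both paths are assumptions for illustration.
bm = CsvToExampleBenchmark()  # hypothetical class defining the method above
bm.convert_csv_to_tf_examples(
    csv_path="/tmp/examples.csv",
    tfrecords_output_path="/tmp/examples.tfrecord.gz")
# The output is gzip-compressed, so readers must pass a matching
# compression_type when reading the TFRecord file back.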
def expand(
    self, pipeline: beam.Pipeline
) -> beam.pvalue.PCollection[tf.train.Example]:
  logging.info('Processing input csv data %s to TFExample.',
               self._csv_pattern)
  csv_files = fileio.glob(self._csv_pattern)
  if not csv_files:
    raise RuntimeError(
        'Split pattern {} does not match any files.'.format(
            self._csv_pattern))

  column_names = io_utils.load_csv_column_names(csv_files[0])
  for csv_file in csv_files[1:]:
    if io_utils.load_csv_column_names(csv_file) != column_names:
      raise RuntimeError(
          'Files in same split {} have different header.'.format(
              self._csv_pattern))

  # Read each CSV file while maintaining order. This is done in order to
  # group together multi-line string fields.
  parsed_csv_lines = (
      pipeline
      | 'CreateFilenames' >> beam.Create(csv_files)
      | 'ReadFromText' >> beam.ParDo(_ReadCsvRecordsFromTextFile())
      | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
      | 'ExtractParsedCSVLines' >> beam.Keys())
  column_infos = beam.pvalue.AsSingleton(
      parsed_csv_lines
      | 'InferColumnTypes' >> beam.CombineGlobally(
          csv_decoder.ColumnTypeInferrer(column_names, skip_blank_lines=True)))

  return (parsed_csv_lines
          | 'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(), column_infos))
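# Hedged usage sketch: the enclosing PTransform class is not shown above.
# Assuming it stores the CSV glob as `self._csv_pattern` when constructed,
# applying it would look like this (the class name `CsvToExample` and the
# path are hypothetical):
import apache_beam as beam

with beam.Pipeline() as p:
  examples = p | 'CsvToExample' >> CsvToExample('/data/span-0/train/*')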
def test_parse_csv_lines(self,
                         input_lines,
                         column_names,
                         expected_csv_cells,
                         expected_types,
                         expected_record_batch,
                         skip_blank_lines=False,
                         schema=None,
                         delimiter=',',
                         multivalent_columns=None,
                         secondary_delimiter=None,
                         raw_record_column_name=None):

  def _check_csv_cells(actual):
    for i in range(len(actual)):
      self.assertEqual(expected_csv_cells[i], actual[i][0])
      self.assertEqual(input_lines[i], actual[i][1])

  def _check_types(actual):
    self.assertLen(actual, 1)
    self.assertCountEqual([
        csv_decoder.ColumnInfo(n, t)
        for n, t in zip(column_names, expected_types)
    ], actual[0])

  def _check_record_batches(actual):
    """Compares a list of pa.RecordBatch."""
    if actual:
      self.assertTrue(actual[0].equals(expected_record_batch))
    else:
      self.assertEqual(expected_record_batch, actual)

  def _check_arrow_schema(actual):
    for record_batch in actual:
      expected_arrow_schema = csv_decoder.GetArrowSchema(
          column_names, schema, raw_record_column_name)
      self.assertEqual(record_batch.schema, expected_arrow_schema)

  with beam.Pipeline() as p:
    parsed_csv_cells_and_raw_records = (
        p
        | beam.Create(input_lines, reshuffle=False)
        | beam.ParDo(csv_decoder.ParseCSVLine(delimiter=delimiter)))
    inferred_types = (
        parsed_csv_cells_and_raw_records
        | beam.Keys()
        | beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(
                column_names,
                skip_blank_lines=skip_blank_lines,
                multivalent_columns=multivalent_columns,
                secondary_delimiter=secondary_delimiter)))

    beam_test_util.assert_that(
        parsed_csv_cells_and_raw_records,
        _check_csv_cells,
        label='check_parsed_csv_cells')
    beam_test_util.assert_that(
        inferred_types, _check_types, label='check_types')

    record_batches = (
        parsed_csv_cells_and_raw_records
        | beam.BatchElements(min_batch_size=1000)
        | beam.ParDo(
            csv_decoder.BatchedCSVRowsToRecordBatch(
                skip_blank_lines=skip_blank_lines,
                multivalent_columns=multivalent_columns,
                secondary_delimiter=secondary_delimiter,
                raw_record_column_name=raw_record_column_name),
            beam.pvalue.AsSingleton(inferred_types)))
    beam_test_util.assert_that(
        record_batches, _check_record_batches, label='check_record_batches')
    if schema:
      beam_test_util.assert_that(
          record_batches, _check_arrow_schema, label='check_arrow_schema')

  # Testing CSVToRecordBatch.
  with beam.Pipeline() as p:
    record_batches = (
        p
        | 'CreatingPColl' >> beam.Create(input_lines, reshuffle=False)
        | 'CSVToRecordBatch' >> csv_decoder.CSVToRecordBatch(
            column_names=column_names,
            delimiter=delimiter,
            skip_blank_lines=skip_blank_lines,
            desired_batch_size=1000,
            schema=schema,
            multivalent_columns=multivalent_columns,
            secondary_delimiter=secondary_delimiter,
            raw_record_column_name=raw_record_column_name))
    beam_test_util.assert_that(
        record_batches, _check_record_batches, label='check_record_batches')
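# A hedged sketch of building one `expected_record_batch` for the test above
# with pyarrow. The column names and values are illustrative, and the exact
# Arrow types csv_decoder produces (e.g. list vs. large_list) depend on its
# version, so treat the types below as assumptions.
import pyarrow as pa

_EXPECTED_RECORD_BATCH = pa.RecordBatch.from_arrays([
    pa.array([[1]], type=pa.list_(pa.int64())),
    pa.array([[2.0]], type=pa.list_(pa.float32())),
    pa.array([[b'hello']], type=pa.list_(pa.binary())),
], ['int_feature', 'float_feature', 'str_feature'])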