def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.Artifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input_base: input dir that contains csv data. csv files must have
        header line.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
  input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
  csv_pattern = os.path.join(input_base_uri, split_pattern)
  absl.logging.info(
      'Processing input csv data {} to TFExample.'.format(csv_pattern))

  csv_files = tf.io.gfile.glob(csv_pattern)
  if not csv_files:
    raise RuntimeError(
        'Split pattern {} does not match any files.'.format(csv_pattern))

  # All files in a split must share the first file's header.
  column_names = io_utils.load_csv_column_names(csv_files[0])
  # BUG FIX: the original loop re-bound the name `csv_files` to each element
  # (`for csv_files in csv_files[1:]`), shadowing the file list. Use a
  # distinct loop variable instead.
  for csv_file in csv_files[1:]:
    if io_utils.load_csv_column_names(csv_file) != column_names:
      raise RuntimeError(
          'Files in same split {} have different header.'.format(
              csv_pattern))

  parsed_csv_lines = (
      pipeline
      | 'ReadFromText' >> beam.io.ReadFromText(
          file_pattern=csv_pattern, skip_header_lines=1)
      | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=',')))
  # Column types are inferred over the whole split and passed to the
  # example-conversion DoFn as a singleton side input.
  column_infos = beam.pvalue.AsSingleton(
      parsed_csv_lines
      | 'InferColumnTypes' >> beam.CombineGlobally(
          csv_decoder.ColumnTypeInferrer(column_names,
                                         skip_blank_lines=True)))
  return (parsed_csv_lines
          | 'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(),
                                        column_infos))
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    exec_properties: Dict[Text, Any],
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Reads CSV files and converts their rows into TF examples.

  Each input split is processed by a separate invocation of this function.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
      - input_base: input dir that contains CSV data. CSV must have header
        line.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
  base_dir = exec_properties[utils.INPUT_BASE_KEY]
  pattern = os.path.join(base_dir, split_pattern)
  logging.info('Processing input csv data %s to TFExample.', pattern)

  matched_files = tf.io.gfile.glob(pattern)
  if not matched_files:
    raise RuntimeError(
        'Split pattern {} does not match any files.'.format(pattern))

  # Every file in the split must agree with the first file's header.
  header = io_utils.load_csv_column_names(matched_files[0])
  for path in matched_files[1:]:
    if io_utils.load_csv_column_names(path) != header:
      raise RuntimeError(
          'Files in same split {} have different header.'.format(pattern))

  parsed = (
      pipeline
      | 'ReadFromText' >> beam.io.ReadFromText(
          file_pattern=pattern, skip_header_lines=1)
      | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=',')))

  # TODO(b/155997704) clean this up once tfx_bsl makes a release.
  if getattr(csv_decoder, 'PARSE_CSV_LINE_YIELDS_RAW_RECORDS', False):
    # Newer tfx_bsl yields (parsed_line, raw_record) tuples; keep only the
    # parsed lines.
    parsed |= 'ExtractParsedCSVLines' >> beam.Keys()

  # Inferred column types are fed to the conversion DoFn as a side input.
  inferred_types = beam.pvalue.AsSingleton(
      parsed
      | 'InferColumnTypes' >> beam.CombineGlobally(
          csv_decoder.ColumnTypeInferrer(header, skip_blank_lines=True)))
  return (parsed
          | 'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(),
                                        inferred_types))
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.Artifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
  """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input_base: input dir that contains csv data. csv files must have
        header line.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
  input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
  csv_pattern = os.path.join(input_base_uri, split_pattern)
  tf.logging.info(
      'Processing input csv data {} to TFExample.'.format(csv_pattern))

  csv_files = tf.gfile.Glob(csv_pattern)
  if not csv_files:
    raise RuntimeError(
        'Split pattern {} does not match any files.'.format(csv_pattern))

  # All files in the split must share the first file's header.
  column_names = io_utils.load_csv_column_names(csv_files[0])
  # BUG FIX: the original loop re-bound the name `csv_files` to each element
  # (`for csv_files in csv_files[1:]`), shadowing the file list. Use a
  # distinct loop variable instead.
  for csv_file in csv_files[1:]:
    if io_utils.load_csv_column_names(csv_file) != column_names:
      raise RuntimeError(
          'Files in same split {} have different header.'.format(
              csv_pattern))

  # TODO(pachristopher): Remove this once TFDV 0.14 is released.
  # Pick the decoder API matching the installed TFDV version.
  (major, minor, _) = tfdv.__version__.split('.')
  if int(major) > 0 or int(minor) >= 14:
    decoder = csv_decoder.DecodeCSVToDict
  else:
    decoder = csv_decoder.DecodeCSV
  return (pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(
              file_pattern=csv_pattern, skip_header_lines=1)
          | 'ParseCSV' >> decoder(column_names)
          | 'ToTFExample' >> beam.Map(_dict_to_example))
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline, input_dict, exec_properties):  # pylint: disable=unused-argument
  """Converts the rows of a single CSV file into TF examples.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input-base: input dir that contains csv data. csv files must have
        header line.
    exec_properties: A dict of execution properties.

  Returns:
    PCollection of TF examples.
  """
  input_artifact = types.get_single_instance(input_dict['input-base'])
  # The input dir is expected to hold exactly one CSV file.
  csv_path = io_utils.get_only_uri_in_dir(input_artifact.uri)
  tf.logging.info(
      'Processing input csv data {} to TFExample.'.format(csv_path))

  header = io_utils.load_csv_column_names(csv_path)
  return (pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(
              csv_path, skip_header_lines=1)
          | 'ParseCSV' >> csv_decoder.DecodeCSV(header)
          | 'ToTFExample' >> beam.Map(_dict_to_example))
def load_csv_header(csv_path: Text):
  """Returns the header row of a CSV file as a list of column names.

  Args:
    csv_path (str): Path to csv file.
  """
  # Delegates to the canonical header loader so both entry points agree.
  header_columns = load_csv_column_names(csv_path)
  return header_columns
def _CsvToSerializedExample(  # pylint: disable=invalid-name
    pipeline, csv_uri):
  """Reads a csv file and converts each row to a serialized tf example."""
  # Header is loaded once at pipeline-construction time.
  header = io_utils.load_csv_column_names(csv_uri)
  return (pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(
              csv_uri, skip_header_lines=1)
          | 'ParseCSV' >> csv_decoder.DecodeCSV(header)
          | 'ToSerializedTFExample' >> beam.Map(_dict_to_example))
def expand(
    self, pipeline: beam.Pipeline
) -> beam.pvalue.PCollection[tf.train.Example]:
  """Builds the beam graph converting matched CSV files to tf.train.Examples.

  Raises:
    RuntimeError: if the split matches no files or headers differ.
  """
  logging.info('Processing input csv data %s to TFExample.',
               self._csv_pattern)
  files = fileio.glob(self._csv_pattern)
  if not files:
    raise RuntimeError(
        'Split pattern {} does not match any files.'.format(
            self._csv_pattern))

  # Every file in the split must agree with the first file's header.
  header = io_utils.load_csv_column_names(files[0])
  for path in files[1:]:
    if io_utils.load_csv_column_names(path) != header:
      raise RuntimeError(
          'Files in same split {} have different header.'.format(
              self._csv_pattern))

  # Read each CSV file while maintaining order. This is done in order to
  # group together multi-line string fields.
  parsed = (
      pipeline
      | 'CreateFilenames' >> beam.Create(files)
      | 'ReadFromText' >> beam.ParDo(_ReadCsvRecordsFromTextFile())
      | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
      | 'ExtractParsedCSVLines' >> beam.Keys())
  # Inferred column types flow into the conversion DoFn as a side input.
  types_side_input = beam.pvalue.AsSingleton(
      parsed
      | 'InferColumnTypes' >> beam.CombineGlobally(
          csv_decoder.ColumnTypeInferrer(header, skip_blank_lines=True)))
  return (parsed
          | 'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(),
                                        types_side_input))
def _do_inference(model_handle, examples_file, num_examples, schema):
  """Sends requests to the model and prints the results.

  Args:
    model_handle: handle to the model. This can be either
      "aiplatform:model:version" or "host:port"
    examples_file: path to csv file containing examples, with the first line
      assumed to have the column headers
    num_examples: number of requests to send to the server
    schema: a Schema describing the input data

  Returns:
    Response from model server
  """
  # Drop the label feature from the schema in place; coders below are built
  # from the filtered schema.
  filtered_features = [
      feature for feature in schema.feature if feature.name != _LABEL_KEY
  ]
  del schema.feature[:]
  schema.feature.extend(filtered_features)

  column_names = io_utils.load_csv_column_names(examples_file)
  csv_coder = _make_csv_coder(schema, column_names)
  proto_coder = _make_proto_coder(schema)

  serialized_examples = []
  # BUG FIX: the file handle was previously opened without ever being closed;
  # a context manager releases it even if decoding raises.
  with open(examples_file, 'r') as input_file:
    input_file.readline()  # skip header line

    for _ in range(num_examples):
      one_line = input_file.readline()
      if not one_line:
        print('End of example file reached')
        break
      one_example = csv_coder.decode(one_line)
      serialized_example = proto_coder.encode(one_example)
      serialized_examples.append(serialized_example)

  parsed_model_handle = model_handle.split(':')
  if parsed_model_handle[0] == 'aiplatform':
    _do_aiplatform_inference(
        model=parsed_model_handle[1],
        version=parsed_model_handle[2],
        serialized_examples=serialized_examples)
  else:
    _do_local_inference(
        host=parsed_model_handle[0],
        port=parsed_model_handle[1],
        serialized_examples=serialized_examples)
def testLoadCsvColumnNames(self):
  """Checks that the header row of test.csv is parsed into column names."""
  testdata_dir = os.path.join(os.path.dirname(__file__), 'testdata')
  csv_path = os.path.join(testdata_dir, 'test.csv')
  self.assertListEqual(['a', 'b', 'c', 'd'],
                       io_utils.load_csv_column_names(csv_path))