Example #1
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.Artifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
    """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input_base: input dir that contains csv data. csv files must have header
        line.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
    input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
    csv_pattern = os.path.join(input_base_uri, split_pattern)
    absl.logging.info(
        'Processing input csv data {} to TFExample.'.format(csv_pattern))

    csv_files = tf.io.gfile.glob(csv_pattern)
    if not csv_files:
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(csv_pattern))

    column_names = io_utils.load_csv_column_names(csv_files[0])
    for csv_file in csv_files[1:]:
        if io_utils.load_csv_column_names(csv_file) != column_names:
            raise RuntimeError(
                'Files in same split {} have different header.'.format(
                    csv_pattern))

    parsed_csv_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(file_pattern=csv_pattern,
                                                 skip_header_lines=1)
        | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=',')))
    column_infos = beam.pvalue.AsSingleton(
        parsed_csv_lines
        | 'InferColumnTypes' >> beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(column_names,
                                           skip_blank_lines=True)))

    return (parsed_csv_lines
            | 'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(),
                                          column_infos))
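For context, here is a minimal usage sketch, not part of the original source: it wires a function like the one above into a standalone Beam pipeline and writes the resulting examples out as gzipped TFRecords. `csv_artifact` and the output path are hypothetical placeholders.

# Illustrative sketch only: `csv_artifact` is a hypothetical, already-constructed
# input artifact, and the output path is a placeholder.
import apache_beam as beam

with beam.Pipeline() as pipeline:
    examples = _CsvToExample(
        pipeline=pipeline,
        input_dict={'input_base': [csv_artifact]},
        exec_properties={},
        split_pattern='train/*')
    _ = (examples
         | 'SerializeDeterministically' >> beam.Map(
             lambda e: e.SerializeToString(deterministic=True))
         | 'WriteSplit' >> beam.io.WriteToTFRecord(
             '/tmp/csv_example_gen/train/data_tfrecord',
             file_name_suffix='.gz'))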
Example #2
def _CsvToExample(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline, exec_properties: Dict[Text, Any],
        split_pattern: Text) -> beam.pvalue.PCollection:
    """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    exec_properties: A dict of execution properties.
      - input_base: input dir that contains CSV data. CSV must have header line.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
    input_base_uri = exec_properties[utils.INPUT_BASE_KEY]
    csv_pattern = os.path.join(input_base_uri, split_pattern)
    logging.info('Processing input csv data %s to TFExample.', csv_pattern)

    csv_files = tf.io.gfile.glob(csv_pattern)
    if not csv_files:
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(csv_pattern))

    column_names = io_utils.load_csv_column_names(csv_files[0])
    for csv_file in csv_files[1:]:
        if io_utils.load_csv_column_names(csv_file) != column_names:
            raise RuntimeError(
                'Files in same split {} have different header.'.format(
                    csv_pattern))

    parsed_csv_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(file_pattern=csv_pattern,
                                                 skip_header_lines=1)
        | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=',')))
    # TODO(b/155997704) clean this up once tfx_bsl makes a release.
    if getattr(csv_decoder, 'PARSE_CSV_LINE_YIELDS_RAW_RECORDS', False):
        # parsed_csv_lines is the following tuple (parsed_lines, raw_records)
        # we only want the parsed_lines.
        parsed_csv_lines |= 'ExtractParsedCSVLines' >> beam.Keys()
    column_infos = beam.pvalue.AsSingleton(
        parsed_csv_lines
        | 'InferColumnTypes' >> beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(column_names,
                                           skip_blank_lines=True)))

    return (parsed_csv_lines
            | 'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(),
                                          column_infos))
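The compatibility shim above relies on `beam.Keys()` to drop the raw record from each `(parsed_line, raw_record)` tuple that newer tfx_bsl versions yield. A self-contained illustration of that behavior:

# Minimal demonstration of the beam.Keys() shim: on a PCollection of 2-tuples,
# Keys() keeps only the first element of each pair.
import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create([(['1', 'a'], '1,a'), (['2', 'b'], '2,b')])
         | beam.Keys()  # yields ['1', 'a'] and ['2', 'b']
         | beam.Map(print))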
Example #3
def _CsvToExample(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    input_dict: Dict[Text, List[types.Artifact]],
    exec_properties: Dict[Text, Any],  # pylint: disable=unused-argument
    split_pattern: Text) -> beam.pvalue.PCollection:
    """Read CSV files and transform to TF examples.

  Note that each input split will be transformed by this function separately.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input_base: input dir that contains csv data. csv files must have header
        line.
    exec_properties: A dict of execution properties.
    split_pattern: Split.pattern in Input config, glob relative file pattern
      that maps to input files with root directory given by input_base.

  Returns:
    PCollection of TF examples.

  Raises:
    RuntimeError: if split is empty or csv headers are not equal.
  """
    input_base_uri = artifact_utils.get_single_uri(input_dict['input_base'])
    csv_pattern = os.path.join(input_base_uri, split_pattern)
    tf.logging.info(
        'Processing input csv data {} to TFExample.'.format(csv_pattern))

    csv_files = tf.gfile.Glob(csv_pattern)
    if not csv_files:
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(csv_pattern))

    column_names = io_utils.load_csv_column_names(csv_files[0])
    for csv_file in csv_files[1:]:
        if io_utils.load_csv_column_names(csv_file) != column_names:
            raise RuntimeError(
                'Files in same split {} have different header.'.format(
                    csv_pattern))

    # TODO(pachristopher): Remove this once TFDV 0.14 is released.
    (major, minor, _) = tfdv.__version__.split('.')
    if int(major) > 0 or int(minor) >= 14:
        decoder = csv_decoder.DecodeCSVToDict
    else:
        decoder = csv_decoder.DecodeCSV
    return (pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(file_pattern=csv_pattern,
                                                     skip_header_lines=1)
            | 'ParseCSV' >> decoder(column_names)
            | 'ToTFExample' >> beam.Map(_dict_to_example))
Example #4
def _CsvToExample(  # pylint: disable=invalid-name
        pipeline, input_dict, exec_properties):  # pylint: disable=unused-argument
    """Read CSV file and transform to TF examples.

  Args:
    pipeline: beam pipeline.
    input_dict: Input dict from input key to a list of Artifacts.
      - input-base: input dir that contains csv data. csv files must have header
        line.
    exec_properties: A dict of execution properties.

  Returns:
    PCollection of TF examples.
  """
    input_base = types.get_single_instance(input_dict['input-base'])
    input_base_uri = input_base.uri
    csv_uri = io_utils.get_only_uri_in_dir(input_base_uri)
    tf.logging.info(
        'Processing input csv data {} to TFExample.'.format(csv_uri))

    return (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(csv_uri, skip_header_lines=1)
        | 'ParseCSV' >> csv_decoder.DecodeCSV(
            io_utils.load_csv_column_names(csv_uri))
        | 'ToTFExample' >> beam.Map(_dict_to_example))
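Examples #3, #4, and #6 all reference a `_dict_to_example` helper that is not shown on this page. A plausible minimal sketch, assuming values keyed by column name; the actual TFX implementation may handle more cases:

# Hedged sketch of a dict -> tf.train.Example converter. This is an assumption,
# not the code referenced above.
import tensorflow as tf

def _dict_to_example(instance):
    """Converts a {column_name: value} dict to a tf.train.Example."""
    feature = {}
    for key, value in instance.items():
        if value is None:
            feature[key] = tf.train.Feature()
        elif isinstance(value, int):
            feature[key] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[value]))
        elif isinstance(value, float):
            feature[key] = tf.train.Feature(
                float_list=tf.train.FloatList(value=[value]))
        else:
            feature[key] = tf.train.Feature(
                bytes_list=tf.train.BytesList(
                    value=[str(value).encode('utf-8')]))
    return tf.train.Example(features=tf.train.Features(feature=feature))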
Example #5
def load_csv_header(csv_path: Text):
    """
    Gets header column of csv and returns list.

    Args:
        csv_path (str): Path to csv file.
    """
    return load_csv_column_names(csv_path)
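`load_csv_header` simply delegates to `io_utils.load_csv_column_names`. As a rough mental model, that helper reads the first line of the file and splits it into column names; a minimal sketch under that assumption (the real implementation may use TensorFlow's file APIs so that non-local paths such as GCS also work):

# Hedged sketch, not the actual tfx implementation: read the header row of a
# local CSV file and return the column names.
import csv

def load_csv_column_names(csv_path):
    with open(csv_path, newline='') as f:
        return next(csv.reader(f))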
Example #6
def _CsvToSerializedExample(  # pylint: disable=invalid-name
    pipeline, csv_uri):
  """Read csv file and transform to tf examples."""
  return (pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(csv_uri, skip_header_lines=1)
          | 'ParseCSV' >> csv_decoder.DecodeCSV(
              io_utils.load_csv_column_names(csv_uri))
          | 'ToSerializedTFExample' >> beam.Map(_dict_to_example))
Example #7
    def expand(
            self, pipeline: beam.Pipeline
    ) -> beam.pvalue.PCollection[tf.train.Example]:
        logging.info('Processing input csv data %s to TFExample.',
                     self._csv_pattern)

        csv_files = fileio.glob(self._csv_pattern)
        if not csv_files:
            raise RuntimeError(
                'Split pattern {} does not match any files.'.format(
                    self._csv_pattern))

        column_names = io_utils.load_csv_column_names(csv_files[0])
        for csv_file in csv_files[1:]:
            if io_utils.load_csv_column_names(csv_file) != column_names:
                raise RuntimeError(
                    'Files in same split {} have different header.'.format(
                        self._csv_pattern))

        # Read each CSV file while maintaining order. This is done in order to group
        # together multi-line string fields.
        parsed_csv_lines = (
            pipeline
            | 'CreateFilenames' >> beam.Create(csv_files)
            | 'ReadFromText' >> beam.ParDo(_ReadCsvRecordsFromTextFile())
            | 'ParseCSVLine' >> beam.ParDo(
                csv_decoder.ParseCSVLine(delimiter=','))
            | 'ExtractParsedCSVLines' >> beam.Keys())
        column_infos = beam.pvalue.AsSingleton(
            parsed_csv_lines
            | 'InferColumnTypes' >> beam.CombineGlobally(
                csv_decoder.ColumnTypeInferrer(column_names,
                                               skip_blank_lines=True)))

        return (
            parsed_csv_lines
            | 'ToTFExample' >> beam.ParDo(_ParsedCsvToTfExample(),
                                          column_infos))
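Like Examples #1 and #2, this transform hands `column_infos` to the final ParDo as a singleton side input: a single value computed by the pipeline itself and passed alongside every element. A small self-contained illustration of that Beam pattern, with hypothetical data:

# Minimal illustration of beam.pvalue.AsSingleton: the maximum row width is
# computed once by the pipeline and fed to a Map as an extra argument.
import apache_beam as beam

with beam.Pipeline() as p:
    rows = p | 'Rows' >> beam.Create([['1', 'a'], ['2', 'b', 'x']])
    width = beam.pvalue.AsSingleton(
        rows
        | 'Widths' >> beam.Map(len)
        | 'MaxWidth' >> beam.CombineGlobally(max))
    _ = (rows
         | 'Pad' >> beam.Map(lambda row, n: row + [''] * (n - len(row)), width)
         | 'Print' >> beam.Map(print))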
Example #8
def _do_inference(model_handle, examples_file, num_examples, schema):
  """Sends requests to the model and prints the results.

  Args:
    model_handle: handle to the model. This can be either
     "aiplatform:model:version" or "host:port"
    examples_file: path to csv file containing examples, with the first line
      assumed to have the column headers
    num_examples: number of requests to send to the server
    schema: a Schema describing the input data
  """
  filtered_features = [
      feature for feature in schema.feature if feature.name != _LABEL_KEY
  ]
  del schema.feature[:]
  schema.feature.extend(filtered_features)

  column_names = io_utils.load_csv_column_names(examples_file)
  csv_coder = _make_csv_coder(schema, column_names)
  proto_coder = _make_proto_coder(schema)

  with open(examples_file, 'r') as input_file:
    input_file.readline()  # skip header line

    serialized_examples = []
    for _ in range(num_examples):
      one_line = input_file.readline()
      if not one_line:
        print('End of example file reached')
        break
      one_example = csv_coder.decode(one_line)

      serialized_example = proto_coder.encode(one_example)
      serialized_examples.append(serialized_example)

  parsed_model_handle = model_handle.split(':')
  if parsed_model_handle[0] == 'aiplatform':
    _do_aiplatform_inference(
        model=parsed_model_handle[1],
        version=parsed_model_handle[2],
        serialized_examples=serialized_examples)
  else:
    _do_local_inference(
        host=parsed_model_handle[0],
        port=parsed_model_handle[1],
        serialized_examples=serialized_examples)
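A hypothetical invocation, assuming a locally running TensorFlow Serving endpoint and a schema populated elsewhere; every concrete value below is a placeholder, not something taken from the original code:

# Hypothetical usage; the endpoint, file path, and count are placeholders.
from tensorflow_metadata.proto.v0 import schema_pb2

schema = schema_pb2.Schema()  # assume this is loaded/populated elsewhere
_do_inference(
    model_handle='localhost:8500',
    examples_file='data/eval/data.csv',
    num_examples=5,
    schema=schema)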
Example #9
    def testLoadCsvColumnNames(self):
        source_data_dir = os.path.join(os.path.dirname(__file__), 'testdata')
        test_file = os.path.join(source_data_dir, 'test.csv')
        column_names = io_utils.load_csv_column_names(test_file)
        self.assertListEqual(['a', 'b', 'c', 'd'], column_names)