def ReadExamplesArtifact(self,
                         examples: types.Artifact,
                         num_examples: int,
                         split_name: Optional[Text] = None):
    """Read raw records from one split of an Examples artifact.

    The files found under the selected split directory are read through a
    TFXIO created from the artifact (configured to yield raw records), and the
    resulting dataset is passed to `self._ReadFromDataset`. The artifact's
    payload format is stored on `self._payload_format`.

    Args:
      examples: `Examples` artifact.
      num_examples: Number of examples to read; used as the batch size of the
        dataset. If the specified value is larger than the actual number of
        examples, all examples would be read.
      split_name: Name of the split to read from the Examples artifact. When
        None, the first split listed by the artifact is used.

    Raises:
      RuntimeError: If read twice.
      ValueError: If `num_examples` is smaller than 1, the artifact lists no
        splits, `split_name` is not one of the artifact's splits, or no file
        matches the split's glob pattern.
    """
    if self._records:
        raise RuntimeError('Cannot read records twice.')
    if num_examples < 1:
        raise ValueError('num_examples < 1 (got {})'.format(num_examples))

    splits = artifact_utils.decode_split_names(examples.split_names)
    if not splits:
        raise ValueError(
            'No split_name is available in given Examples artifact.')

    chosen_split = splits[0] if split_name is None else split_name
    if chosen_split not in splits:
        raise ValueError(
            'No split_name {}; available split names: {}'.format(
                chosen_split, ', '.join(splits)))

    # ExampleGen generates artifacts under each split_name directory.
    pattern = os.path.join(examples.uri, chosen_split, '*')

    make_tfxio = tfxio_utils.get_tfxio_factory_from_artifact(
        examples=[examples],
        telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
        schema=None,
        read_as_raw_records=True,
        raw_record_column_name=_RAW_RECORDS_COLUMN)

    # A missing split directory is reported the same way as an empty one.
    try:
        matched_files = fileio.glob(pattern)
    except tf.errors.NotFoundError:
        matched_files = []
    if not matched_files:
        raise ValueError(
            'Unable to find examples matching {}.'.format(pattern))

    self._payload_format = examples_utils.get_payload_format(examples)

    tfxio = make_tfxio(matched_files)
    options = dataset_options.TensorFlowDatasetOptions(
        batch_size=num_examples)
    self._ReadFromDataset(tfxio.TensorFlowDataset(options))
def ReadExamplesArtifact(self,
                         examples: types.Artifact,
                         num_examples: int,
                         split_name: Optional[Text] = None):
    """Read records from one split of an Examples artifact.

    Currently it assumes the Examples artifact contains serialized tf.Example
    in gzipped TFRecord files: only `*.gz` files under the split directory are
    read, and the record format is set to `_LogicalFormat.TF_EXAMPLE`.

    Args:
      examples: `Examples` artifact.
      num_examples: Number of examples to read. If the specified value is
        larger than the actual number of examples, all examples would be read.
      split_name: Name of the split to read from the Examples artifact. When
        None, the first split listed by the artifact is used.

    Raises:
      RuntimeError: If read twice.
      ValueError: If `num_examples` is smaller than 1, the artifact lists no
        splits, `split_name` is not one of the artifact's splits, or no file
        matches the split's glob pattern.
    """
    if self._records:
        raise RuntimeError('Cannot read records twice.')
    if num_examples < 1:
        raise ValueError('num_examples < 1 (got {})'.format(num_examples))

    splits = artifact_utils.decode_split_names(examples.split_names)
    if not splits:
        raise ValueError(
            'No split_name is available in given Examples artifact.')

    chosen_split = splits[0] if split_name is None else split_name
    if chosen_split not in splits:
        raise ValueError(
            'No split_name {}; available split names: {}'.format(
                chosen_split, ', '.join(splits)))

    # ExampleGen generates artifacts under each split_name directory.
    pattern = os.path.join(examples.uri, chosen_split, '*.gz')

    # A missing split directory is reported the same way as an empty one.
    try:
        matched_files = fileio.glob(pattern)
    except tf.errors.NotFoundError:
        matched_files = []
    if not matched_files:
        raise ValueError(
            'Unable to find examples matching {}.'.format(pattern))

    # Assume we have a tf.Example logical format.
    self._record_format = _LogicalFormat.TF_EXAMPLE

    dataset = tf.data.TFRecordDataset(matched_files, compression_type='GZIP')
    self._ReadFromDataset(dataset, num_examples=num_examples)
def _get_results(self, path, file_name, proto_type):
    """Parse all records from gzipped TFRecord shards into protos.

    Args:
      path: Directory containing the shard files.
      file_name: Shard base name; files are matched with the pattern
        `<path>/<file_name>-?????-of-?????.gz`.
      proto_type: Callable returning a fresh message instance into which each
        serialized record is merged.

    Returns:
      A list with one `proto_type` message per record, in the order the
      shards are returned by the glob and the records appear in each shard.
    """
    shard_pattern = os.path.join(path, file_name) + '-?????-of-?????.gz'
    python_io = tf.compat.v1.python_io

    parsed = []
    for shard in fileio.glob(shard_pattern):
        gzip_options = python_io.TFRecordOptions(
            python_io.TFRecordCompressionType.GZIP)
        for serialized in python_io.tf_record_iterator(
                path=shard, options=gzip_options):
            message = proto_type()
            message.MergeFromString(serialized)
            parsed.append(message)
    return parsed
def _CsvToExample(  # pylint: disable=invalid-name
        pipeline: beam.Pipeline, exec_properties: Dict[Text, Any],
        split_pattern: Text) -> beam.pvalue.PCollection:
    """Read CSV files and transform to TF examples.

    Note that each input split will be transformed by this function
    separately.

    Args:
      pipeline: beam pipeline.
      exec_properties: A dict of execution properties.
        - input_base: input dir that contains CSV data. CSV must have header
          line.
      split_pattern: Split.pattern in Input config, glob relative file pattern
        that maps to input files with root directory given by input_base.

    Returns:
      PCollection of TF examples.

    Raises:
      RuntimeError: if split is empty or csv headers are not equal.
    """
    csv_pattern = os.path.join(
        exec_properties[standard_component_specs.INPUT_BASE_KEY],
        split_pattern)
    logging.info('Processing input csv data %s to TFExample.', csv_pattern)

    csv_files = fileio.glob(csv_pattern)
    if not csv_files:
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(csv_pattern))

    # Every file of the split must share the header of the first file.
    first_file, other_files = csv_files[0], csv_files[1:]
    column_names = io_utils.load_csv_column_names(first_file)
    if any(
            io_utils.load_csv_column_names(other) != column_names
            for other in other_files):
        raise RuntimeError(
            'Files in same split {} have different header.'.format(
                csv_pattern))

    raw_lines = pipeline | 'ReadFromText' >> beam.io.ReadFromText(
        file_pattern=csv_pattern, skip_header_lines=1)
    decoded_lines = raw_lines | 'ParseCSVLine' >> beam.ParDo(
        csv_decoder.ParseCSVLine(delimiter=','))
    parsed_csv_lines = decoded_lines | 'ExtractParsedCSVLines' >> beam.Keys()

    # Column types are inferred over the whole split and passed to the
    # conversion step as a singleton side input.
    inferred_types = parsed_csv_lines | 'InferColumnTypes' >> (
        beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(
                column_names, skip_blank_lines=True)))
    column_infos = beam.pvalue.AsSingleton(inferred_types)

    return parsed_csv_lines | 'ToTFExample' >> beam.ParDo(
        _ParsedCsvToTfExample(), column_infos)
def _get_results(self, prediction_log_path):
    """Load every PredictionLog stored under `prediction_log_path`.

    Shards are matched with the pattern
    `<prediction_log_path>/<executor._PREDICTION_LOGS_DIR_NAME>-?????-of-?????.gz`
    and read as gzipped TFRecord files.

    Args:
      prediction_log_path: Directory containing the prediction log shards.

    Returns:
      A list of `prediction_log_pb2.PredictionLog` messages, one per record.
    """
    shard_prefix = os.path.join(prediction_log_path,
                                executor._PREDICTION_LOGS_DIR_NAME)
    shard_pattern = shard_prefix + '-?????-of-?????.gz'
    python_io = tf.compat.v1.python_io

    logs = []
    for shard in fileio.glob(shard_pattern):
        gzip_options = python_io.TFRecordOptions(
            python_io.TFRecordCompressionType.GZIP)
        for serialized in python_io.tf_record_iterator(
                path=shard, options=gzip_options):
            log = prediction_log_pb2.PredictionLog()
            log.MergeFromString(serialized)
            logs.append(log)
    return logs
def setUpClass(cls):
    """Populate both artifact directories with the CSV example-gen data.

    Both artifacts start as a copy of the `csv_example_gen` source data; every
    file of the second artifact is then copied next to itself with a `dup_`
    prefix.
    """
    super(ExecutorTest, cls).setUpClass()

    csv_example_gen_dir = os.path.join(cls._SOURCE_DATA_DIR,
                                       'csv_example_gen')
    for artifact_uri in (cls._ARTIFACT1_URI, cls._ARTIFACT2_URI):
        io_utils.copy_dir(csv_example_gen_dir, artifact_uri)

    # Duplicate the number of train and eval records such that
    # second artifact has twice as many as first.
    for original in fileio.glob(os.path.join(cls._ARTIFACT2_URI, '*', '*')):
        parent_dir, base_name = os.path.split(original)
        io_utils.copy_file(original,
                           os.path.join(parent_dir, 'dup_' + base_name))
def generate_fingerprint(split_name: Text, file_pattern: Text) -> Text:
    """Generates a fingerprint for all files that match the pattern.

    The fingerprint combines the number of matched files, their total size in
    bytes, and two checksums computed over the files' modification times
    truncated to whole seconds.

    Args:
      split_name: Name of the split; embedded as-is in the fingerprint.
      file_pattern: Glob pattern selecting the files to fingerprint.

    Returns:
      A string of the form
      'split:<name>,num_files:<n>,total_bytes:<b>,xor_checksum:<x>,sum_checksum:<s>'.
    """
    files = fileio.glob(file_pattern)
    total_bytes = 0
    # Checksum used here is based on timestamp (mtime).
    # Checksums are xor'ed and sum'ed over the files so that they are order-
    # independent.
    xor_checksum = 0
    sum_checksum = 0
    for f in files:
        stat = fileio.stat(f)
        total_bytes += stat.length
        # Take mtime only up to second-granularity. Floor division keeps the
        # arithmetic exact: nanosecond timestamps are larger than 2**53, so a
        # true division goes through a float and can round a value just below
        # a second boundary up into the next second.
        # NOTE(review): assumes mtime_nsec and NANO_PER_SEC are ints - confirm.
        mtime = int(stat.mtime_nsec // NANO_PER_SEC)
        xor_checksum ^= mtime
        sum_checksum += mtime

    return 'split:%s,num_files:%d,total_bytes:%d,xor_checksum:%d,sum_checksum:%d' % (
        split_name, len(files), total_bytes, xor_checksum, sum_checksum)
def expand(
        self, pipeline: beam.Pipeline
) -> beam.pvalue.PCollection[tf.train.Example]:
    """Build the steps that turn the CSV files of a split into tf.Examples.

    Args:
      pipeline: beam pipeline to attach the steps to.

    Returns:
      PCollection of TF examples.

    Raises:
      RuntimeError: if `self._csv_pattern` matches no file, or the matched
        files do not all have the same header.
    """
    csv_pattern = self._csv_pattern
    logging.info('Processing input csv data %s to TFExample.', csv_pattern)

    csv_files = fileio.glob(csv_pattern)
    if not csv_files:
        raise RuntimeError(
            'Split pattern {} does not match any files.'.format(csv_pattern))

    # Every file of the split must share the header of the first file.
    first_file, other_files = csv_files[0], csv_files[1:]
    column_names = io_utils.load_csv_column_names(first_file)
    if any(
            io_utils.load_csv_column_names(other) != column_names
            for other in other_files):
        raise RuntimeError(
            'Files in same split {} have different header.'.format(
                csv_pattern))

    # Read each CSV file while maintaining order. This is done in order to
    # group together multi-line string fields.
    filenames = pipeline | 'CreateFilenames' >> beam.Create(csv_files)
    records = filenames | 'ReadFromText' >> beam.ParDo(
        _ReadCsvRecordsFromTextFile())
    decoded_lines = records | 'ParseCSVLine' >> beam.ParDo(
        csv_decoder.ParseCSVLine(delimiter=','))
    parsed_csv_lines = decoded_lines | 'ExtractParsedCSVLines' >> beam.Keys()

    # Column types are inferred over the whole split and passed to the
    # conversion step as a singleton side input.
    inferred_types = parsed_csv_lines | 'InferColumnTypes' >> (
        beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(
                column_names, skip_blank_lines=True)))
    column_infos = beam.pvalue.AsSingleton(inferred_types)

    return parsed_csv_lines | 'ToTFExample' >> beam.ParDo(
        _ParsedCsvToTfExample(), column_infos)
def _verify_transform_outputs(self,
                              materialize=True,
                              store_cache=True,
                              multiple_example_inputs=False,
                              compute_statistics=False):
    """Assert that the output directory holds exactly the expected outputs.

    Args:
      materialize: If True, 'transformed_examples' is an expected output and,
        for each verified artifact pair, the train and eval splits must hold
        as many records after the transform as before it.
      store_cache: If True, 'CACHE' is an expected output and the updated
        analyzer cache directory must not be empty.
      multiple_example_inputs: If True, all example artifacts are verified;
        otherwise only the first one.
      compute_statistics: If True, the pre- and post-transform statistics
        must exist; otherwise they must not exist.
    """
    expected_outputs = ['transformed_graph']

    if store_cache:
        expected_outputs.append('CACHE')
        cache_entries = fileio.listdir(
            self._updated_analyzer_cache_artifact.uri)
        self.assertNotEqual(0, len(cache_entries))

    if multiple_example_inputs:
        example_artifacts = self._example_artifacts
        transformed_example_artifacts = self._transformed_example_artifacts
    else:
        example_artifacts = self._example_artifacts[:1]
        transformed_example_artifacts = self._transformed_example_artifacts[:1]

    if materialize:
        expected_outputs.append('transformed_examples')

        assert len(example_artifacts) == len(transformed_example_artifacts)
        for source, transformed in zip(example_artifacts,
                                       transformed_example_artifacts):
            source_train_files = fileio.glob(
                os.path.join(source.uri, 'Split-train', '*'))
            transformed_train_files = fileio.glob(
                os.path.join(transformed.uri, 'Split-train', '*'))
            self.assertGreater(len(transformed_train_files), 0)

            source_eval_files = fileio.glob(
                os.path.join(source.uri, 'Split-eval', '*'))
            transformed_eval_files = fileio.glob(
                os.path.join(transformed.uri, 'Split-eval', '*'))
            self.assertGreater(len(transformed_eval_files), 0)

            # Construct datasets and count number of records in each split.
            source_train_count = _get_dataset_size(source_train_files)
            transformed_train_count = _get_dataset_size(
                transformed_train_files)
            source_eval_count = _get_dataset_size(source_eval_files)
            transformed_eval_count = _get_dataset_size(transformed_eval_files)

            # Check for each split that it contains the same number of records
            # in the input artifact as in the output artifact (i.e 1-to-1
            # mapping is preserved).
            self.assertEqual(source_train_count, transformed_train_count)
            self.assertEqual(source_eval_count, transformed_eval_count)
            self.assertGreater(transformed_train_count, transformed_eval_count)

    transform_output_uri = self._transformed_output.uri
    statistics_paths = [
        os.path.join(
            transform_output_uri,
            tft.TFTransformOutput.PRE_TRANSFORM_FEATURE_STATS_PATH),
        os.path.join(
            transform_output_uri,
            tft.TFTransformOutput.POST_TRANSFORM_FEATURE_STATS_PATH),
    ]
    # Statistics must be present exactly when they were requested.
    check_existence = (
        self.assertTrue if compute_statistics else self.assertFalse)
    for statistics_path in statistics_paths:
        check_existence(fileio.exists(statistics_path))

    # Depending on `materialize` and `store_cache`, check that
    # expected outputs are exactly correct. If either flag is False, its
    # respective output should not be present.
    self.assertCountEqual(expected_outputs,
                          fileio.listdir(self._output_data_dir))

    saved_model_path = os.path.join(
        self._transformed_output.uri,
        tft.TFTransformOutput.TRANSFORM_FN_DIR,
        tf.saved_model.SAVED_MODEL_FILENAME_PB)
    self.assertTrue(fileio.exists(saved_model_path))
def _get_target_span_version(
    uri: str,
    split: example_gen_pb2.Input.Split,
    range_config: Optional[range_config_pb2.RangeConfig] = None
) -> Tuple[Optional[int], Optional[int]]:
    """Retrieves a target span and version for a given split pattern.

    If both Span and Version spec occur in the split pattern, searches for and
    returns both the target Span and Version. If only Span exists in the split
    pattern, searches for the target Span, and Version is returned as None.
    If Version is present, but not Span, an error is raised. If neither Span
    nor Version is present, returns both as None.

    Additionally, supports parsing span number from date stamps using the Date
    specs. Once the calendar date is parsed from the Date specs, it is
    converted into a span number by counting the number of days since
    01/01/1970.

    Args:
      uri: The base path from which files will be searched.
      split: An example_gen_pb2.Input.Split object which contains a split
        pattern, to be searched on.
      range_config: An instance of range_config_pb2.RangeConfig, which
        specifies which spans to consider when finding the most recent span
        and version. If unset, search for latest span number with no
        restrictions.

    Returns:
      Tuple of two ints, Span (optional) and Version (optional). Note that
      this function will update the {SPAN} or Date tags as well as the
      {VERSION} tags in the split config to actual Span and Version numbers.

    Raises:
      ValueError: if any of the following occurs:
        - If either Span or Version spec occurs in the split pattern more
          than once.
        - If Version spec is provided, but Span spec is not present.
        - If Span or Version found is not an integer.
        - If a matching cannot be found for split pattern provided.
    """
    is_match_span, is_match_date, is_match_version = verify_split_pattern_specs(
        split)

    # Without a Span or Date spec there is nothing to resolve; the split
    # pattern is left untouched.
    if not is_match_span and not is_match_date:
        return (None, None)

    split_glob_pattern, split_regex_pattern = _create_matching_glob_and_regex(
        uri=uri,
        split=split,
        is_match_span=is_match_span,
        is_match_date=is_match_date,
        is_match_version=is_match_version,
        range_config=range_config)

    logging.info('Glob pattern for split %s: %s', split.name,
                 split_glob_pattern)
    logging.info('Regex pattern for split %s: %s', split.name,
                 split_regex_pattern)

    # Best candidate seen so far, kept both as the matched string form(s) and
    # as the int used for comparisons.
    latest_span_tokens = None
    latest_span_int = None
    latest_version = None
    latest_version_int = None

    files = fileio.glob(split_glob_pattern)
    for file_path in files:
        match_span_tokens, match_span_int, match_version, match_version_int = (
            _find_matched_span_version_from_path(file_path, split_regex_pattern,
                                                 is_match_span, is_match_date,
                                                 is_match_version))

        if latest_span_int is None or match_span_int > latest_span_int:
            # A strictly newer span replaces the span and its version
            # together.
            # Uses str instead of int because of zero padding digits.
            latest_span_tokens = match_span_tokens
            latest_span_int = match_span_int
            latest_version = match_version
            latest_version_int = match_version_int
        elif (latest_span_int == match_span_int and
              (latest_version is None or
               match_version_int >= latest_version_int)):
            # Same span: only the version can advance. With `>=`, a later
            # file carrying an equal version replaces the recorded one.
            latest_version = match_version
            latest_version_int = match_version_int

    # No file produced a span, or a version was requested but none was
    # recorded.
    if latest_span_int is None or (is_match_version and latest_version is None):
        raise ValueError('Cannot find matching for split %s based on %s' %
                         (split.name, split.pattern))

    # Update split pattern so executor can find the files to ingest.
    if is_match_span:
        split.pattern = re.sub(SPAN_FULL_REGEX, latest_span_tokens[0],
                               split.pattern)
    elif is_match_date:
        # Each Date spec is replaced by the span token at the same position.
        for spec, value in zip(DATE_SPECS, latest_span_tokens):
            split.pattern = split.pattern.replace(spec, value)

    if is_match_version:
        split.pattern = re.sub(VERSION_FULL_REGEX, latest_version,
                               split.pattern)

    return latest_span_int, latest_version_int