def setUp(self):
  self.image_data = test_utils.get_test_df()
  self.split_key = schema.get_key(
      schema.SplitKeyType, schema.image_csv_schema)
  self.label_key = schema.get_key(
      schema.StringLabelType, schema.image_csv_schema)
  self.image_uri_key = schema.get_key(
      schema.ImageUriType, schema.image_csv_schema)
  self.tempfiles = []
  self.tempdir = None
def setUp(self): """Test setup.""" image_height = 40 image_width = 30 image_channels = 3 image_fn = functools.partial(test_utils.make_random_image, image_height, image_width, image_channels) data = test_utils.get_test_data() image_uri_key = schema.get_key(schema.ImageUriType, schema.image_csv_schema) num_records = len(data[image_uri_key]) image_uris = data.pop(image_uri_key) data['image_name'] = [os.path.split(uri)[-1] for uri in image_uris] data.update({ 'image': [beam_image.encode(image_fn()) for _ in range(num_records)], 'image_height': [image_height] * num_records, 'image_width': [image_width] * num_records, 'image_channels': [image_channels] * num_records, }) self.num_records = num_records self.data = data self.dataset = tf.data.Dataset.from_tensor_slices(self.data)
def test_no_get_split_key(self):
  """Tests no split key present."""
  test_schema = dict()
  for k, v in schema.image_csv_schema.items():
    # Brute-force copy because the original is a FrozenOrderedDict.
    if k != 'split':
      test_schema[k] = v
  key = schema.get_key(schema.SplitKeyType, test_schema)
  self.assertIsNone(key)
def get_raw_feature_df() -> pd.DataFrame:
  """Returns a test DataFrame conforming to the raw feature spec schema."""
  df = get_test_df()
  my_raw_schema = schema.get_raw_schema_map(schema.image_csv_schema)
  image_key = schema.get_key(schema.ImageUriType, schema.image_csv_schema)
  df.drop([image_key], axis=1, inplace=True)
  df['image_name'] = 'image_name'
  df['image'] = 'image'
  # Note: The TF Transform parser expects string values in the input. They
  # will be parsed based on the raw feature spec that is passed together
  # with the data.
  df['image_height'] = '48'
  df['image_width'] = '48'
  df['image_channels'] = '3'
  df = df[my_raw_schema.keys()]
  return df
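
# Illustrative usage sketch (an assumption, not part of the original module):
# demonstrates what get_raw_feature_df() is expected to return -- columns
# ordered per the raw schema map, with all values encoded as strings so the
# TF Transform parser can later coerce them against the raw feature spec.
# The helper name `_example_check_raw_feature_df` is hypothetical.
def _example_check_raw_feature_df():
  df = get_raw_feature_df()
  raw_schema_map = schema.get_raw_schema_map(schema.image_csv_schema)
  # Columns follow the raw schema ordering.
  assert list(df.columns) == list(raw_schema_map.keys())
  # Numeric-looking fields are deliberately strings (e.g. '48', '3').
  assert df['image_height'].map(lambda v: isinstance(v, str)).all()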
def test_valid_get_key(self):
  """Tests a valid split key."""
  key = schema.get_key(schema.SplitKeyType, schema.image_csv_schema)
  self.assertEqual(key, 'split')
def setUp(self):
  self.df = test_utils.get_test_df()
  self.schema_map = schema.image_csv_schema
  self.split_key = schema.get_key(schema.SplitKeyType, self.schema_map)
def build_pipeline(
    df: pd.DataFrame,
    job_label: str,
    runner: str,
    project: str,
    region: str,
    output_dir: str,
    compression: str,
    num_shards: int,
    schema_map: Dict[str, collections.namedtuple],
    tfrecorder_wheel: str,
    dataflow_options: Dict[str, Any]) -> beam.Pipeline:
  """Runs the TFRecorder Beam pipeline.

  Args:
    df: Pandas DataFrame.
    job_label: User description for the Beam job.
    runner: Beam runner (e.g. DataflowRunner, DirectRunner).
    project: GCP project ID (if DataflowRunner).
    region: GCP compute region (if DataflowRunner).
    output_dir: GCS or local path for output.
    compression: gzip or None.
    num_shards: Number of shards.
    schema_map: A schema map (dictionary mapping DataFrame columns to types)
      used to derive the input and target schema.
    tfrecorder_wheel: Path to the TFRecorder wheel for Dataflow.
    dataflow_options: Dataflow runner options (optional).

  Returns:
    beam.Pipeline

  Note: These inputs must be validated upstream (by client.create_tfrecord()).
  """

  job_name = _get_job_name(job_label)
  job_dir = _get_job_dir(output_dir, job_name)
  options = _get_pipeline_options(
      runner,
      job_name,
      job_dir,
      project,
      region,
      tfrecorder_wheel,
      dataflow_options)

  p = beam.Pipeline(options=options)
  with tft_beam.Context(temp_dir=os.path.join(job_dir, 'tft_tmp')):

    converter = schema.get_tft_coder(df.columns, schema_map)
    flatten_rows = ToCSVRows()

    # Each element in the data PCollection will be a dict including the
    # image_csv_columns and the image features created by extract_images_fn.
    data = (
        p
        | 'ReadFromDataFrame' >> beam.Create(df.values.tolist())
        | 'ToCSVRows' >> beam.ParDo(flatten_rows)
        | 'DecodeCSV' >> beam.Map(converter.decode)
    )

    # Extract images if an image_uri key exists.
    image_uri_key = schema.get_key(schema.ImageUriType, schema_map)
    if image_uri_key:
      extract_images_fn = beam_image.ExtractImagesDoFn(image_uri_key)
      data = (
          data
          | 'ReadImage' >> beam.ParDo(extract_images_fn)
      )

    # If the schema contains a valid split key, partition the dataset.
    split_key = schema.get_key(schema.SplitKeyType, schema_map)

    # Note: This will not always reflect the actual number of samples per
    # split written as TFRecords. The succeeding `Partition` operation may
    # mark additional samples from other splits as discarded. If a split has
    # all its samples discarded, the pipeline will still generate a TFRecord
    # file for that split, albeit empty.
    split_counts = get_split_counts(df, split_key)

    # Raw metadata is the TFT metadata after image insertion but before the
    # TFT transform, e.g. image columns have been added if necessary.
    raw_metadata = schema.get_raw_metadata(df.columns, schema_map)

    # Require the training set to be present in the input data. The
    # transform_fn and transformed_metadata will be generated from the
    # training set and applied to the other splits, if any.
    assert 'TRAIN' in split_counts

    # Split the dataset into train, validation, and test sets.
    partition_fn = functools.partial(_partition_fn, split_key=split_key)
    train_data, val_data, test_data, discard_data = (
        data | 'SplitDataset' >> beam.Partition(
            partition_fn, len(schema.SplitKeyType.allowed_values)))

    raw_schema_map = schema.get_raw_schema_map(schema_map=schema_map)
    preprocessing_fn = functools.partial(
        _preprocessing_fn, schema_map=raw_schema_map)

    tfr_writer = functools.partial(
        _get_write_to_tfrecord,
        output_dir=job_dir,
        compress=compression,
        num_shards=num_shards)

    transform_fn = _transform_and_write_tfr(
        train_data, tfr_writer,
        preprocessing_fn=preprocessing_fn,
        raw_metadata=raw_metadata,
        label='Train')

    if 'VALIDATION' in split_counts:
      _transform_and_write_tfr(
          val_data, tfr_writer,
          transform_fn=transform_fn,
          raw_metadata=raw_metadata,
          label='Validation')

    if 'TEST' in split_counts:
      _transform_and_write_tfr(
          test_data, tfr_writer,
          transform_fn=transform_fn,
          raw_metadata=raw_metadata,
          label='Test')

    _ = (
        discard_data
        | 'WriteDiscardedData' >> beam.io.WriteToText(
            os.path.join(job_dir, 'discarded-data')))

    # Note: `transform_fn` already contains the transformed metadata.
    _ = (
        transform_fn
        | 'WriteTransformFn' >> tft_beam.WriteTransformFn(job_dir))

  return p
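
# Illustrative usage sketch (an assumption, not part of the original module):
# roughly how a caller such as client.create_tfrecord() might build and run
# the pipeline locally with the DirectRunner. The helper name
# `_example_run_pipeline` and the argument values below are hypothetical.
def _example_run_pipeline(df: pd.DataFrame):
  pipeline = build_pipeline(
      df,
      job_label='example-job',
      runner='DirectRunner',
      project='',
      region='',
      output_dir='/tmp/tfrecorder-output',
      compression='gzip',
      num_shards=2,
      schema_map=schema.image_csv_schema,
      tfrecorder_wheel='',
      dataflow_options={})
  # Run the constructed Beam pipeline and block until it completes.
  result = pipeline.run()
  result.wait_until_finish()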