def file_based_input_dataset_builder(channel,
                                     input_filenames,
                                     pipe_mode,
                                     is_training,
                                     drop_remainder,
                                     batch_size,
                                     epochs,
                                     steps_per_epoch,
                                     max_seq_length):
    """Build a batched ``tf.data`` pipeline of parsed TFRecord examples.

    Args:
        channel: Channel name handed to ``PipeModeDataset`` when
            ``pipe_mode`` is truthy.
        input_filenames: TFRecord file name(s) handed to
            ``tf.data.TFRecordDataset`` when ``pipe_mode`` is falsy.
        pipe_mode: Read from a SageMaker pipe channel instead of files.
        is_training: When truthy, the batched dataset is shuffled.
        drop_remainder: Forwarded to ``Dataset.batch``; drops a final batch
            smaller than ``batch_size``.
        batch_size: Number of records per batch.
        epochs: Multiplied with ``steps_per_epoch`` (and 100) to form the
            repeat count.
        steps_per_epoch: See ``epochs``.
        max_seq_length: Fixed length of the ``input_ids``, ``input_mask``
            and ``segment_ids`` features.

    Returns:
        A dataset yielding dicts with the int64 features ``input_ids``,
        ``input_mask``, ``segment_ids`` (each ``[batch, max_seq_length]``)
        and ``label_ids`` (``[batch]``).
    """
    if pipe_mode:
        print('***** Using pipe_mode with channel {}'.format(channel))
        # Imported lazily so the package is only required in pipe mode.
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel, record_format='TFRecord')
    else:
        print('***** Using input_filenames {}'.format(input_filenames))
        dataset = tf.data.TFRecordDataset(input_filenames)

    dataset = dataset.repeat(epochs * steps_per_epoch * 100)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record):
        """Decodes a serialized record to a dict of feature tensors."""
        # TODO: wip/bert/bert_attention_head_view/train.py
        # Convert input_ids into input_tokens with DistilBert vocabulary
        # if hook.get_collections()['all'].save_config.should_save_step(modes.EVAL, hook.mode_steps[modes.EVAL]):
        #     hook._write_raw_tensor_simple("input_tokens", input_tokens)
        return tf.io.parse_single_example(record, name_to_features)

    # map() followed by batch() is the documented replacement for the
    # deprecated tf.data.experimental.map_and_batch.
    dataset = dataset.map(
        _decode_record,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)

    # NOTE: this function used to call `dataset.cache()` here and discard the
    # result. tf.data transformations return a new dataset, so that call had
    # no effect. It is removed rather than assigned: a cache at this point
    # would sit after repeat() and would have to hold every repeated pass.

    if is_training:
        # Shuffling happens after batching, so whole batches (not individual
        # records) are reordered.
        dataset = dataset.shuffle(seed=42,
                                  buffer_size=100,
                                  reshuffle_each_iteration=True)

    return dataset
def file_based_input_dataset_builder(channel,
                                     input_filenames,
                                     pipe_mode,
                                     is_training,
                                     drop_remainder,
                                     batch_size,
                                     epochs,
                                     steps_per_epoch,
                                     max_seq_length):
    """Build a batched ``tf.data`` pipeline of parsed TFRecord examples.

    Args:
        channel: Channel name handed to ``PipeModeDataset`` when
            ``pipe_mode`` is truthy.
        input_filenames: TFRecord file name(s) handed to
            ``tf.data.TFRecordDataset`` when ``pipe_mode`` is falsy.
        pipe_mode: Read from a SageMaker pipe channel instead of files.
        is_training: When truthy, the batched dataset is shuffled.
        drop_remainder: Forwarded to ``Dataset.batch``; drops a final batch
            smaller than ``batch_size``.
        batch_size: Number of records per batch.
        epochs: Multiplied with ``steps_per_epoch`` to form the repeat
            count.
        steps_per_epoch: See ``epochs``.
        max_seq_length: Fixed length of the ``input_ids``, ``input_mask``
            and ``segment_ids`` features.

    Returns:
        A dataset yielding dicts with the int64 features ``input_ids``,
        ``input_mask``, ``segment_ids`` (each ``[batch, max_seq_length]``)
        and ``label_ids`` (``[batch]``).
    """
    if pipe_mode:
        print('***** Using pipe_mode with channel {}'.format(channel))
        # Imported lazily so the package is only required in pipe mode.
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel, record_format='TFRecord')
    else:
        print('***** Using input_filenames {}'.format(input_filenames))
        dataset = tf.data.TFRecordDataset(input_filenames)

    dataset = dataset.repeat(epochs * steps_per_epoch)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([], tf.int64),
        # "is_real_example": tf.io.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record):
        """Decodes a serialized record to a dict of feature tensors."""
        return tf.io.parse_single_example(record, name_to_features)

    # map() followed by batch() is the documented replacement for the
    # deprecated tf.data.experimental.map_and_batch.
    dataset = dataset.map(
        _decode_record,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)

    # NOTE: this function used to call `dataset.cache()` here and discard the
    # result. tf.data transformations return a new dataset, so that call had
    # no effect. It is removed rather than assigned: a cache at this point
    # would sit after repeat() and would have to hold every repeated pass.

    if is_training:
        # Shuffling happens after batching, so whole batches (not individual
        # records) are reordered.
        dataset = dataset.shuffle(seed=42,
                                  buffer_size=1000,
                                  reshuffle_each_iteration=True)

    return dataset
def _input(args, channel_name):
    """Build the tf.data input pipeline for one data channel.

    The input mode is looked up in
    ``args.data_config[<channel>]['TrainingInputMode']``, where the
    ``'train'`` channel is looked up under the key ``'training'``. If that
    lookup fails the mode falls back to ``'File'``.

    Args:
        args: Object providing ``BATCH_SIZE``, ``train_num_examples`` (read
            for the ``'train'`` channel only) and, optionally,
            ``data_config``.
        channel_name: Name of the channel. ``'train'`` and ``'test'`` get
            mapping and batching; any other name yields the raw record
            dataset with only prefetching applied.

    Returns:
        The prefetched dataset for the channel.
    """
    try:
        mode_channel_name = channel_name + 'ing' if channel_name == 'train' else channel_name
        mode = args.data_config[mode_channel_name]['TrainingInputMode']
    except (AttributeError, LookupError, TypeError):
        # Missing or malformed data_config: default to file mode. Narrowed
        # from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        mode = 'File'

    filenames = get_filenames(args, channel_name)

    logging.info("Running {} in {} mode".format(channel_name, mode))
    if mode == 'Pipe':
        # Imported lazily so the package is only required in pipe mode.
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel_name, record_format='TFRecord')
    else:
        dataset = tf.data.TFRecordDataset(filenames)

    if channel_name == 'train':
        dataset = dataset.map(_load_image_train,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)
        # Ensure that the capacity is sufficiently large to provide good
        # random shuffling.
        buffer_size = int(args.train_num_examples * 0.4) + 3 * args.BATCH_SIZE
        # Cache the decoded examples, then shuffle, batch and repeat
        # indefinitely.
        dataset = dataset.cache().shuffle(buffer_size=buffer_size).batch(
            args.BATCH_SIZE).repeat()
    elif channel_name == 'test':
        dataset = dataset.map(_load_image_test)
        # Export the first (image, mask) pair as a visual sample. Saving
        # inside the loop means an empty dataset is skipped instead of
        # raising NameError on unbound sample variables.
        # NOTE(review): iterating the dataset here presumably relies on
        # eager execution - confirm.
        for sample_image, sample_mask in dataset.take(1):
            _img_save('sample_image.jpg', sample_image)
            _img_save('sample_mask.png', sample_mask)
        dataset = dataset.batch(args.BATCH_SIZE)

    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset