def _map_fn(filename, annotation):
    ## read image
    image = tf.io.read_file(filename)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)

    ## data augmentation for the image only (~0.02 s)
    # image is float32 in [0, 1] after convert_image_dtype, so the brightness
    # delta must be scaled accordingly (63 would be a uint8-range delta)
    image = tf.image.random_brightness(image, max_delta=63.0 / 255.0)
    image = tf.image.random_contrast(image, lower=0.2, upper=1.8)
    # subtract off the mean and divide by the variance of the pixels (optional)
    # image = tf.image.per_image_standardization(image)

    ## data augmentation for image and bounding box
    image, annotation = tf.numpy_function(_data_aug_fn, [image, annotation],
                                          [tf.float32, tf.string])
    return image, annotation
# Shape of the vector extracted from InceptionV3 is (64, 2048).
# These two variables represent that vector shape.
features_shape = 2048
attention_features_shape = 64


# Load the numpy files
def map_func(img_name, cap):
    img_tensor = np.load(img_name.decode('utf-8') + '.npy')
    return img_tensor, cap


dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

# Use map to load the numpy files in parallel
dataset = dataset.map(
    lambda item1, item2: tf.numpy_function(map_func, [item1, item2],
                                           [tf.float32, tf.int32]),
    num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Shuffle and batch
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)


class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # features (CNN_encoder output) shape == (batch_size, 64, embedding_dim)
        # hidden shape == (batch_size, hidden_size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # attention_hidden_layer shape == (batch_size, 64, units)
        attention_hidden_layer = tf.nn.tanh(self.W1(features) +
                                            self.W2(hidden_with_time_axis))

        # score shape == (batch_size, 64, 1)
        score = self.V(attention_hidden_layer)

        # attention_weights shape == (batch_size, 64, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, embedding_dim)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights
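# Sketch (not from the source): tf.numpy_function erases static shape
# information, so after the map above img_tensor and cap have unknown shapes.
# If a downstream layer needs them, they can be restored explicitly; the
# shapes here assume the (64, 2048) InceptionV3 features and rank-1 padded
# captions.
def restore_shapes(img_tensor, cap):
    img_tensor.set_shape([attention_features_shape, features_shape])
    cap.set_shape([None])  # caption length is only known after padding
    return img_tensor, cap

# dataset = dataset.map(restore_shapes)  # would go before shuffle/batch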
def _norm_mean_std_tf(self, x, mean, std):
    x = tf.numpy_function(self._norm_mean_std, [x, mean, std], tf.float32)
    return x
def aug_process(self, image, label):
    """Wraps the numpy-based augmentation function as a TensorFlow op."""
    aug_img = tf.numpy_function(func=self.aug_func, inp=[image], Tout=tf.float32)
    return aug_img, label
def create_dataset(self):
    if self.ques_type in ['c4', 'overall']:
        ques_id_ds = tf.data.Dataset.from_tensor_slices(self.df['Question_Id'])
        image_path_ds = tf.data.Dataset.from_tensor_slices(self.df['image'])
        image_ds = image_path_ds.map(
            self.load_and_preprocess_image,
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        # create the question feature dataset
        ques_path_ds = tf.data.Dataset.from_tensor_slices(self.df['question'])
        ques_ds = ques_path_ds.map(
            lambda x: tf.numpy_function(
                DataLoader.load_question_features, inp=[x], Tout=tf.float32),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        answers = self.df['Answers'].map(lambda x: DataLoader.process_answer(x))
        # use the tokenizer for string-to-index mapping, vocabulary counting
        # and max length
        tokenizer = tf.keras.preprocessing.text.Tokenizer(
            filters="", oov_token="<unk>", lower=True)
        tokenizer.fit_on_texts(answers)
        answers = tokenizer.texts_to_sequences(answers)
        # use 0 as padding
        tokenizer.word_index['<pad>'] = 0
        tokenizer.index_word[0] = '<pad>'
        answers = tf.keras.preprocessing.sequence.pad_sequences(answers,
                                                                padding='post')
        ans_ds = tf.data.Dataset.from_tensor_slices(answers)

        return tf.data.Dataset.zip(
            ((image_ds, ques_ds), ans_ds, ques_id_ds)), tokenizer
    else:
        ques_id_ds = tf.data.Dataset.from_tensor_slices(self.df['Question_Id'])
        image_path_ds = tf.data.Dataset.from_tensor_slices(self.df['image'])
        image_ds = image_path_ds.map(
            self.load_and_preprocess_image,
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        # create the question feature dataset
        ques_path_ds = tf.data.Dataset.from_tensor_slices(self.df['question'])
        ques_ds = ques_path_ds.map(
            lambda x: tf.numpy_function(
                DataLoader.load_question_features, inp=[x], Tout=tf.float32),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        vocab = self.df['Answers'].unique()
        tokenizer = OnehotManager(vocab)
        answers = tf.data.Dataset.from_tensor_slices(self.df['Answers'])
        ans_ds = answers.map(
            lambda x: tf.numpy_function(
                tokenizer.get_index, inp=[x], Tout=tf.int32),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        return tf.data.Dataset.zip(
            ((image_ds, ques_ds), ans_ds, ques_id_ds)), tokenizer
def eval_as_np(fn, y_true, y_pred):
    return tf.numpy_function(fn, [y_true, tf.round(y_pred)], tf.double)
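# Usage sketch (assumption, not from the source): eval_as_np adapts a plain
# numpy/sklearn metric so Keras can call it on tensors. tf.numpy_function has
# no gradient, so this pattern suits metrics, not trainable losses.
from functools import partial
from sklearn.metrics import f1_score

f1 = partial(eval_as_np, lambda t, p: f1_score(t, p, average='micro'))
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[f1])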
def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None,
               stop_token_targets=None, linear_targets=None, targets_lengths=None,
               gta=False, global_step=None, is_training=False, is_evaluating=False,
               split_infos=None):
    """
    Initializes the model for inference. Sets the "mel_outputs" and "alignments"
    fields.

    Args:
        - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in
          is the number of steps in the input time series, and values are
          character IDs
        - input_lengths: int32 Tensor with shape [N] where N is batch size and
          values are the lengths of each sequence in inputs.
        - embed_targets: float32 Tensor with shape [N, E] where E is the speaker
          embedding size.
        - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch
          size, T_out is the number of steps in the output time series, M is
          num_mels, and values are entries in the mel spectrogram. Only needed
          for training.
    """
    if mel_targets is None and stop_token_targets is not None:
        raise ValueError("no mel targets were provided but token_targets were given")
    if mel_targets is not None and stop_token_targets is None and not gta:
        raise ValueError("Mel targets are provided without corresponding token_targets")
    if not gta and self._hparams.predict_linear and linear_targets is None \
            and is_training:
        raise ValueError(
            "Model is set to use post processing to predict linear spectrograms "
            "in training but no linear targets given!")
    if gta and linear_targets is not None:
        raise ValueError("Linear spectrogram prediction is not supported in GTA mode!")
    if is_training and self._hparams.mask_decoder and targets_lengths is None:
        raise RuntimeError(
            "Model set to mask paddings but no targets lengths provided for the mask!")
    if is_training and is_evaluating:
        raise RuntimeError(
            "Model can not be in training and evaluation modes at the same time!")

    split_device = ("/cpu:0" if self._hparams.tacotron_num_gpus > 1
                    or self._hparams.split_on_cpu
                    else "/gpu:{}".format(self._hparams.tacotron_gpu_start_idx))
    with tf.device(split_device):
        hp = self._hparams
        lout_int = [tf.int32] * hp.tacotron_num_gpus
        lout_float = [tf.float32] * hp.tacotron_num_gpus

        tower_input_lengths = tf.split(
            input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
        tower_targets_lengths = (
            tf.split(targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            if targets_lengths is not None else targets_lengths)

        ### SV2TTS ###
        tower_embed_targets = tf.split(
            embed_targets, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
        ##############

        p_inputs = tf.numpy_function(split_func, [inputs, split_infos[:, 0]],
                                     lout_int)
        p_mel_targets = (
            tf.numpy_function(split_func, [mel_targets, split_infos[:, 1]], lout_float)
            if mel_targets is not None else mel_targets)
        p_stop_token_targets = (
            tf.numpy_function(split_func, [stop_token_targets, split_infos[:, 2]],
                              lout_float)
            if stop_token_targets is not None else stop_token_targets)

        tower_inputs = []
        tower_mel_targets = []
        tower_stop_token_targets = []

        batch_size = tf.shape(inputs)[0]
        mel_channels = hp.num_mels
        for i in range(hp.tacotron_num_gpus):
            tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
            if p_mel_targets is not None:
                tower_mel_targets.append(
                    tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
            if p_stop_token_targets is not None:
                tower_stop_token_targets.append(
                    tf.reshape(p_stop_token_targets[i], [batch_size, -1]))

    self.tower_decoder_output = []
    self.tower_alignments = []
    self.tower_stop_token_prediction = []
    self.tower_mel_outputs = []

    tower_embedded_inputs = []
    tower_enc_conv_output_shape = []
    tower_encoder_cond_outputs = []
    tower_residual = []
    tower_projected_residual = []

    # 1. Declare GPU devices
    gpus = ["/gpu:{}".format(i)
            for i in range(hp.tacotron_gpu_start_idx,
                           hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
    for i in range(hp.tacotron_num_gpus):
        with tf.device(tf.compat.v1.train.replica_device_setter(
                ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])):
            with tf.compat.v1.variable_scope("inference") as scope:
                assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled")
                if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training:
                    assert global_step is not None

                # GTA is only used for predicting mels to train the WaveNet
                # vocoder, so we omit post processing when doing GTA synthesis
                post_condition = hp.predict_linear and not gta

                # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                self.embedding_table = tf.compat.v1.get_variable(
                    "inputs_embedding", [len(symbols), hp.embedding_dim],
                    dtype=tf.float32)
                embedded_inputs = tf.nn.embedding_lookup(self.embedding_table,
                                                         tower_inputs[i])

                # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                encoder_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training, hparams=hp,
                                        scope="encoder_convolutions"),
                    EncoderRNN(is_training, size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope="encoder_LSTM"))

                encoder_outputs = encoder_cell(embedded_inputs,
                                               tower_input_lengths[i])

                # For shape visualization purposes
                enc_conv_output_shape = encoder_cell.conv_output_shape

                ### SV2TTS ###
                # Append the speaker embedding to the encoder output at each timestep
                tileable_shape = [-1, 1, self._hparams.speaker_embedding_size]
                tileable_embed_targets = tf.reshape(tower_embed_targets[i],
                                                    tileable_shape)
                tiled_embed_targets = tf.tile(tileable_embed_targets,
                                              [1, tf.shape(encoder_outputs)[1], 1])
                encoder_cond_outputs = tf.concat(
                    (encoder_outputs, tiled_embed_targets), 2)
                ##############

                # Decoder parts
                # Attention decoder prenet
                prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
                                drop_rate=hp.tacotron_dropout_rate,
                                scope="decoder_prenet")
                # Attention mechanism
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_dim, encoder_cond_outputs, hparams=hp,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=tf.reshape(tower_input_lengths[i], [-1]),
                    smoothing=hp.smoothing,
                    cumulate_weights=hp.cumulative_weights)
                # Decoder LSTM cells
                decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                          size=hp.decoder_lstm_units,
                                          zoneout=hp.tacotron_zoneout_rate,
                                          scope="decoder_LSTM")
                # Frames projection layer
                frame_projection = FrameProjection(
                    hp.num_mels * hp.outputs_per_step,
                    scope="linear_transform_projection")
                # <stop_token> projection layer
                stop_projection = StopProjection(is_training or is_evaluating,
                                                 shape=hp.outputs_per_step,
                                                 scope="stop_token_projection")

                # Decoder cell ==> [batch_size, decoder_steps, num_mels * r]
                # (after decoding)
                decoder_cell = TacotronDecoderCell(prenet, attention_mechanism,
                                                   decoder_lstm, frame_projection,
                                                   stop_projection)

                # Define the helper for our decoder
                if is_training or is_evaluating or gta:
                    self.helper = TacoTrainingHelper(batch_size,
                                                     tower_mel_targets[i], hp, gta,
                                                     is_evaluating, global_step)
                else:
                    self.helper = TacoTestHelper(batch_size, hp)

                # Initial decoder state
                decoder_init_state = decoder_cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                # Only use max iterations at synthesis time
                max_iters = hp.max_iters if not (is_training or is_evaluating) else None

                # Decode
                (frames_prediction, stop_token_prediction, _), \
                    final_decoder_state, _ = dynamic_decode(
                        CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                        impute_finished=False,
                        maximum_iterations=max_iters,
                        swap_memory=hp.tacotron_swap_with_cpu)

                # Reshape outputs to be one output per entry
                # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                decoder_output = tf.reshape(frames_prediction,
                                            [batch_size, -1, hp.num_mels])
                stop_token_prediction = tf.reshape(stop_token_prediction,
                                                   [batch_size, -1])

                # Postnet
                postnet = Postnet(is_training, hparams=hp,
                                  scope="postnet_convolutions")

                # Compute residual using the post-net
                # ==> [batch_size, decoder_steps * r, postnet_channels]
                residual = postnet(decoder_output)

                # Project residual to same dimension as mel spectrogram
                # ==> [batch_size, decoder_steps * r, num_mels]
                residual_projection = FrameProjection(hp.num_mels,
                                                      scope="postnet_projection")
                projected_residual = residual_projection(residual)

                # Compute the mel spectrogram
                mel_outputs = decoder_output + projected_residual

                if post_condition:
                    # Add post-processing CBHG. This does a great job at
                    # extracting features from mels before projection to
                    # linear specs.
                    post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels,
                                     hp.cbhg_pool_size,
                                     [hp.cbhg_projection, hp.num_mels],
                                     hp.cbhg_projection_kernel_size,
                                     hp.cbhg_highwaynet_layers,
                                     hp.cbhg_highway_units, hp.cbhg_rnn_units,
                                     is_training, name="CBHG_postnet")

                    # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                    post_outputs = post_cbhg(mel_outputs, None)

                    # Linear projection of extracted features to make the
                    # linear spectrogram
                    linear_specs_projection = FrameProjection(
                        hp.num_freq, scope="cbhg_linear_specs_projection")

                    # [batch_size, decoder_steps(linear_frames), num_freq]
                    linear_outputs = linear_specs_projection(post_outputs)

                # Grab alignments from the final decoder state
                alignments = tf.transpose(
                    final_decoder_state.alignment_history.stack(), [1, 2, 0])

                self.tower_decoder_output.append(decoder_output)
                self.tower_alignments.append(alignments)
                self.tower_stop_token_prediction.append(stop_token_prediction)
                self.tower_mel_outputs.append(mel_outputs)
                tower_embedded_inputs.append(embedded_inputs)
                tower_enc_conv_output_shape.append(enc_conv_output_shape)
                tower_encoder_cond_outputs.append(encoder_cond_outputs)
                tower_residual.append(residual)
                tower_projected_residual.append(projected_residual)

                if post_condition:
                    self.tower_linear_outputs.append(linear_outputs)
        log("initialisation done {}".format(gpus[i]))

    if is_training:
        self.ratio = self.helper._ratio
    self.tower_inputs = tower_inputs
    self.tower_input_lengths = tower_input_lengths
    self.tower_mel_targets = tower_mel_targets
    # self.tower_linear_targets = tower_linear_targets
    self.tower_targets_lengths = tower_targets_lengths
    self.tower_stop_token_targets = tower_stop_token_targets

    self.all_vars = tf.compat.v1.trainable_variables()
= dynamic shape): ") log(" Train mode: {}".format(is_training)) log(" Eval mode: {}".format(is_evaluating)) log(" GTA mode: {}".format(gta)) log(" Synthesis mode: {}".format(not (is_training or is_evaluating))) log(" Input: {}".format(inputs.shape)) for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx): log(" device: {}".format(i)) log(" embedding: {}".format(tower_embedded_inputs[i].shape)) log(" enc conv out: {}".format(tower_enc_conv_output_shape[i])) log(" encoder out (cond): {}".format(tower_encoder_cond_outputs[i].shape)) log(" decoder out: {}".format(self.tower_decoder_output[i].shape)) log(" residual out: {}".format(tower_residual[i].shape)) log(" projected residual out: {}".format(tower_projected_residual[i].shape)) log(" mel out: {}".format(self.tower_mel_outputs[i].shape)) if post_condition: log(" linear out: {}".format(self.tower_linear_outputs[i].shape)) log(" <stop_token> out: {}".format(self.tower_stop_token_prediction[i].shape)) # 1_000_000 is causing syntax problems for some people?! Python please :) log(" Tacotron Parameters {:.3f} Million.".format( np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
def trimBlackMarginsTF(imageTensor):
    withoutMargins = tf.numpy_function(TrimBlackPaddings, [imageTensor],
                                       Tout=tf.uint8)
    return withoutMargins
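# One way TrimBlackPaddings might be implemented (hypothetical sketch; the
# real function is not part of this excerpt, and an HWC uint8 image is
# assumed): find the bounding box of non-black pixels and crop to it.
import numpy as np

def TrimBlackPaddings(image, threshold=0):
    mask = image.max(axis=-1) > threshold        # non-black pixels
    rows = np.flatnonzero(mask.any(axis=1))
    cols = np.flatnonzero(mask.any(axis=0))
    if rows.size == 0 or cols.size == 0:         # fully black image
        return image
    return image[rows[0]:rows[-1] + 1, cols[0]:cols[-1] + 1, :]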
def __init__(self, batch_size=48, train_model='mobilenetv2'):
    _, self.train_filenames, self.train_captions = load_records()
    _, self.val_filenames, self.val_captions = load_records(False)
    del _
    self.batch_size = batch_size

    if train_model == 'mobilenetv2':
        load_fn = self.load_image_mobilenet

    self.transfer_train_dataset = tf.data.Dataset.from_tensor_slices(
        list(self.train_filenames))
    self.transfer_train_dataset = self.transfer_train_dataset.map(
        load_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(batch_size)

    self.transfer_val_dataset = tf.data.Dataset.from_tensor_slices(
        list(self.val_filenames))
    self.transfer_val_dataset = self.transfer_val_dataset.map(
        load_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(batch_size)

    # Create a tokenizer
    self.tokenizer = TokenizerWrapper(self.get_texts())

    # Create a list of filenames, each mapped to its corresponding caption
    _train_filenames, _train_captions = [], []
    for i, captions in enumerate(self.train_captions, start=0):
        # get the path of the train transfer features
        train_path = os.path.join(
            PATHS.TRAIN_TRANSFER_DIR,
            os.path.basename(self.train_filenames[i])) + '.npy'
        for cap in captions:
            _train_filenames.append(train_path)
            _train_captions.append(cap)

    _train_captions = self.tokenizer.texts_to_sequences(_train_captions)
    _train_captions = tf.keras.preprocessing.sequence.pad_sequences(
        _train_captions, padding='post')
    max_len = max([len(cap) for cap in _train_captions])

    self.train_dataset = tf.data.Dataset.from_tensor_slices(
        (_train_filenames, _train_captions)) \
        .map(lambda item1, item2: tf.numpy_function(
            self.map_func, [item1, item2], [tf.float32, tf.int32]),
            num_parallel_calls=tf.data.experimental.AUTOTUNE) \
        .shuffle(1000, reshuffle_each_iteration=True) \
        .batch(self.batch_size) \
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    # Free the memory
    del _train_filenames
    del _train_captions

    _val_filenames, _val_captions = [], []
    for i, captions in enumerate(self.val_captions, start=0):
        # get the path of the validation transfer features
        val_path = os.path.join(
            PATHS.VAL_TRANSFER_DIR,
            os.path.basename(self.val_filenames[i])) + '.npy'
        for cap in captions:
            _val_filenames.append(val_path)
            _val_captions.append(cap)

    _val_captions = self.tokenizer.texts_to_sequences(_val_captions)
    _val_captions = tf.keras.preprocessing.sequence.pad_sequences(
        _val_captions, padding='post')
    max_len = max([len(cap) for cap in _val_captions])

    self.val_dataset = tf.data.Dataset.from_tensor_slices(
        (_val_filenames, _val_captions)) \
        .map(lambda item1, item2: tf.numpy_function(
            self.map_func, [item1, item2], [tf.float32, tf.int32]),
            num_parallel_calls=tf.data.experimental.AUTOTUNE) \
        .shuffle(1000, reshuffle_each_iteration=True) \
        .batch(self.batch_size) \
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    # Free the memory
    del _val_filenames
    del _val_captions
def loadImagePackTF(pathTensor):
    y = tf.numpy_function(loadImagePackNp, [pathTensor], tf.uint8)
    return y
def my_iou_metric(label, pred):
    # TensorFlow version
    return tf.numpy_function(get_iou_vector, [label, pred > 0.5], tf.float64)
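# A minimal numpy sketch of what a helper like get_iou_vector may compute
# (hypothetical; the real implementation is not part of this excerpt): the
# mean intersection-over-union of binary masks across the batch, returned as
# float64 to match the Tout declared above.
import numpy as np

def get_iou_vector(label, pred):
    label = label.astype(bool).reshape(label.shape[0], -1)
    pred = pred.astype(bool).reshape(pred.shape[0], -1)
    intersection = np.logical_and(label, pred).sum(axis=1)
    union = np.logical_or(label, pred).sum(axis=1)
    # count empty-vs-empty mask pairs as a perfect match to avoid divide-by-zero
    iou = np.where(union > 0, intersection / np.maximum(union, 1), 1.0)
    return np.float64(iou.mean())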
def read_parse_single_example(self, serialized_sample, is_training=False):
    """
    Parse a single serialized Example into image and label tensors.
    :param serialized_sample: scalar string tensor holding a serialized Example
    :return: image, per-scale label tensors and ground-truth boxes
    """
    # construct the feature description
    keys_to_features = {
        'image/filename': tf.io.FixedLenFeature([], tf.string, default_value=''),
        'image/encoded': tf.io.FixedLenFeature([], tf.string, default_value=''),
        'image/format': tf.io.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/height': tf.io.FixedLenFeature([], tf.int64),
        'image/width': tf.io.FixedLenFeature([], tf.int64),
        'image/channels': tf.io.FixedLenFeature([], tf.int64),
        'image/shape': tf.io.FixedLenFeature([3], tf.int64),
        'image/object/num_object': tf.io.FixedLenFeature([], tf.int64),
        'image/object/bbox/xmin': tf.io.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin': tf.io.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax': tf.io.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax': tf.io.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/label': tf.io.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/difficult': tf.io.VarLenFeature(dtype=tf.int64),
        'image/object/bbox/truncated': tf.io.VarLenFeature(dtype=tf.int64),
    }
    features = tf.io.parse_single_example(serialized=serialized_sample,
                                          features=keys_to_features)

    # parse the features
    image_name = tf.cast(features['image/filename'], dtype=tf.string)
    num_objects = tf.cast(features['image/object/num_object'], dtype=tf.int32)
    height = tf.cast(features['image/height'], dtype=tf.int32)
    width = tf.cast(features['image/width'], dtype=tf.int32)
    depth = tf.cast(features['image/channels'], dtype=tf.int32)

    # actual data shapes
    image_shape = [height, width, depth]
    bbox_shape = [num_objects, 1]

    image = tf.io.decode_raw(features['image/encoded'], out_type=tf.uint8)
    image = tf.reshape(image, image_shape)

    # parse the ground-truth boxes
    x_min = tf.sparse.to_dense(features['image/object/bbox/xmin'], default_value=0)
    y_min = tf.sparse.to_dense(features['image/object/bbox/ymin'], default_value=0)
    x_max = tf.sparse.to_dense(features['image/object/bbox/xmax'], default_value=0)
    y_max = tf.sparse.to_dense(features['image/object/bbox/ymax'], default_value=0)
    label = tf.sparse.to_dense(features['image/object/bbox/label'], default_value=0)
    x_min = tf.reshape(x_min, bbox_shape)
    y_min = tf.reshape(y_min, bbox_shape)
    x_max = tf.reshape(x_max, bbox_shape)
    y_max = tf.reshape(y_max, bbox_shape)
    label = tf.reshape(label, bbox_shape)
    bboxes = tf.concat([x_min, y_min, x_max, y_max,
                        tf.cast(label, dtype=tf.float32)], axis=-1)
    bboxes = tf.reshape(bboxes, shape=[-1, 5])

    self.train_output_sizes = self.train_input_size // self.strides
    image, bboxes = tf.numpy_function(self.image_processing,
                                      inp=[image, bboxes, is_training],
                                      Tout=[tf.float32, tf.float32])
    image = tf.reshape(image,
                       shape=(self.train_input_size, self.train_input_size, 3))
    bboxes = tf.reshape(bboxes, shape=(-1, 5))

    label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes = \
        tf.numpy_function(self.preprocess_true_boxes, inp=[bboxes],
                          Tout=[tf.float32, tf.float32, tf.float32,
                                tf.float32, tf.float32, tf.float32])
    label_sbbox = tf.reshape(label_sbbox,
                             shape=(self.train_output_sizes[0],
                                    self.train_output_sizes[0],
                                    self.anchor_per_scale, 5 + self.num_classes))
    label_mbbox = tf.reshape(label_mbbox,
                             shape=(self.train_output_sizes[1],
                                    self.train_output_sizes[1],
                                    self.anchor_per_scale, 5 + self.num_classes))
    label_lbbox = tf.reshape(label_lbbox,
                             shape=(self.train_output_sizes[2],
                                    self.train_output_sizes[2],
                                    self.anchor_per_scale, 5 + self.num_classes))
    sbboxes = tf.reshape(sbboxes, shape=(self.max_bbox_per_scale, 4))
    mbboxes = tf.reshape(mbboxes, shape=(self.max_bbox_per_scale, 4))
    lbboxes = tf.reshape(lbboxes, shape=(self.max_bbox_per_scale, 4))

    return image, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes
def grad(dheights):
    return tf.numpy_function(
        numpy_grad_func,
        [heights, dheights],
        DEFAULT_FLOAT_DTYPE_TF,
    )
def eval_update(gt, pred):
    # called for its side effect only (Tout=[]), feeding a stateful evaluator
    tf.numpy_function(evaluator.update_state,
                      [gt, postprocess.transform_detections(pred)], [])
vocab_size = top_k + 1
num_steps = len(img_name_train) // BATCH_SIZE
# Shape of the vector extracted from InceptionV3 is (64, 2048).
# These two variables represent that vector shape.
features_shape = 2048
attention_features_shape = 64


# Load the numpy files
def map_func(img_name, cap):
    img_tensor = np.load(img_name.decode('utf-8') + '.npy')
    return img_tensor, cap


dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

# Use map to load the numpy files in parallel
dataset = dataset.map(
    lambda item1, item2: tf.numpy_function(map_func, [item1, item2],
                                           [tf.float32, tf.int32]),
    num_parallel_calls=tf.data.AUTOTUNE)

# Shuffle and batch
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)


# Model: the decoder is said to be identical to the one in the neural machine
# translation example... though at the code level it differs.
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # features (CNN_encoder output) shape == (batch_size, 64, embedding_dim)
        # hidden shape == (batch_size, hidden_size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # attention_hidden_layer shape == (batch_size, 64, units)
        attention_hidden_layer = tf.nn.tanh(self.W1(features) +
                                            self.W2(hidden_with_time_axis))

        # score shape == (batch_size, 64, 1)
        score = self.V(attention_hidden_layer)

        # attention_weights shape == (batch_size, 64, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, embedding_dim)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights
def parse_example_ctc_attention(self, serial_example):
    norm_h = self.norm_h
    expand_rate = self.expand_rate
    debug = False
    feat_dict = tf.io.parse_single_example(
        serial_example,
        features={
            'img_raw': tf.io.FixedLenFeature([], tf.string),
            'height': tf.io.FixedLenFeature([], tf.int64),
            'width': tf.io.FixedLenFeature([], tf.int64),
            'channel': tf.io.FixedLenFeature([], tf.int64),
            'img_path': tf.io.FixedLenFeature([], tf.string),
            'coord': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.string)
        })
    img_raw = feat_dict['img_raw']
    height = feat_dict['height']
    width = feat_dict['width']
    channel = feat_dict['channel']
    img_path = feat_dict['img_path']
    coord = feat_dict['coord']
    img_text = feat_dict['label']

    ctc_idx, ctc_len, att_idx, att_len = tf.numpy_function(
        self.get_idlst_by_charstr, [img_text],
        [tf.int64, tf.int64, tf.int64, tf.int64])
    ctc_idx = tf.cast(tf.reshape(ctc_idx, [-1]), tf.int32)
    att_idx = tf.cast(tf.reshape(att_idx, [-1]), tf.int32)
    ctc_len = tf.cast(ctc_len, tf.int32)
    att_len = tf.cast(att_len, tf.int32)

    coord_val = tf.strings.split([coord], ',').values
    coord_val = tf.strings.to_number(coord_val, out_type=tf.int32)

    img_raw = tf.io.decode_raw(img_raw, tf.uint8)
    orig_img = tf.reshape(img_raw, (height, width, channel))

    # each augmentation below fires with probability 0.25 in training mode
    prob = tf.random.uniform([])
    invert_flg = tf.logical_and(tf.greater(prob, 0.75),
                                tf.equal(self.mode, 'train'))
    orig_img = tf.cond(
        invert_flg,
        true_fn=lambda: tf.cast(255 - orig_img, dtype=tf.uint8),
        false_fn=lambda: orig_img)

    prob = tf.random.uniform([])
    noise_flg = tf.logical_and(tf.greater(prob, 0.75),
                               tf.equal(self.mode, 'train'))
    noise_idx = tf.random.shuffle(tf.range(2))[0]
    orig_img = tf.cond(
        noise_flg,
        true_fn=lambda: random_noise_static(orig_img, noise_idx),
        false_fn=lambda: random_noise_static(orig_img, -1))

    prob = tf.random.uniform([])
    encode_flg = tf.logical_and(tf.greater(prob, 0.75),
                                tf.equal(self.mode, 'train'))
    encode_idx = tf.random.shuffle(tf.range(4))[0]
    orig_img = tf.cond(
        encode_flg,
        true_fn=lambda: encode_decode_static(orig_img, encode_idx),
        false_fn=lambda: encode_decode_static(orig_img, -1))

    prob = tf.random.uniform([])
    color_flg = tf.logical_and(tf.greater(prob, 0.75),
                               tf.equal(self.mode, 'train'))
    color_idx = tf.random.shuffle(tf.range(6))[0]
    orig_img = tf.cond(
        color_flg,
        true_fn=lambda: distort_color_static(orig_img, color_idx),
        false_fn=lambda: distort_color_static(orig_img, -1))

    prob = tf.random.uniform([])
    coord_flg = tf.logical_and(tf.greater(prob, 0.4),
                               tf.equal(self.mode, 'train'))
    coord_val1 = tf.cond(
        coord_flg,
        true_fn=lambda: coord_augmentation(coord_val, width, height),
        false_fn=lambda: (coord_val[0], coord_val[1], coord_val[2], coord_val[3]))

    offset_w = coord_val1[0]
    offset_h = coord_val1[1]
    target_w = coord_val1[2] - coord_val1[0]
    target_h = coord_val1[3] - coord_val1[1]
    crop_img = tf.image.crop_to_bounding_box(orig_img, offset_h, offset_w,
                                             target_h, target_w)

    ratio = tf.cast(norm_h, tf.float32) / tf.cast(target_h, tf.float32)
    norm_w = tf.cast(tf.cast(target_w, tf.float32) * expand_rate * ratio, tf.int32)
    norm_img = tf.image.resize(crop_img, (norm_h, norm_w))
    if debug:
        norm_img = tf.cast(norm_img, tf.uint8)
    else:
        # convert RGB --> BGR and normalize to [-1, 1]
        mean = [127.5, 127.5, 127.5]
        norm_img = norm_img[:, :, ::-1]
        norm_img = (norm_img - mean) / 127.5
    return img_path, norm_img, img_text, ctc_idx, ctc_len, att_idx, att_len, coord, norm_w
def map_fn(prob):
    return tf.numpy_function(self.perform_greedy, inp=[prob], Tout=tf.string)

return tf.map_fn(map_fn, probs,
                 fn_output_signature=tf.TensorSpec([], dtype=tf.string))
def load_image(image_path, seq0, seq1, seq2, seq3, seq4, seq5, seq6, seq7,
               seq8, matrix_shapes):
    """
    Load image from image_path, resizing it to match the inputs required for
    InceptionV3 - notably a width and height of 299 pixels.
    """
    img = tf.io.read_file(image_path)
    img = tf.image.decode_png(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, seq0, seq1, seq2, seq3, seq4, seq5, seq6, seq7, seq8, \
        matrix_shapes


train_dataset = train_dataset.map(
    lambda item1, item2, item3, item4, item5, item6, item7, item8,
           item9, item10, item11: tf.numpy_function(
        load_image,
        [item1, item2, item3, item4, item5, item6, item7, item8,
         item9, item10, item11],
        [tf.float32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32,
         tf.int32, tf.int32, tf.int32, tf.int32, tf.int32]),
    num_parallel_calls=tf.data.experimental.AUTOTUNE)

validation_dataset = validation_dataset.map(
    lambda item1, item2, item3, item4, item5, item6, item7, item8,
           item9, item10, item11: tf.numpy_function(
        load_image,
        [item1, item2, item3, item4, item5, item6, item7, item8,
         item9, item10, item11],
        [tf.float32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32,
         tf.int32, tf.int32, tf.int32, tf.int32, tf.int32]),
    num_parallel_calls=tf.data.experimental.AUTOTUNE)
def map_fn(prob):
    return tf.numpy_function(self.perform_beam_search, inp=[prob, lm],
                             Tout=tf.string)

# fn_output_signature replaces the deprecated dtype argument of tf.map_fn in TF2
return tf.map_fn(map_fn, probs, fn_output_signature=tf.string)
def __call__(self, inputs):
    return tf.map_fn(
        lambda i: tf.numpy_function(self._policy, [i], tf.int64),
        inputs,
        fn_output_signature=tf.int64,
    )
def attack_as_tf(img, y_true):
    return tf.numpy_function(attack, [img, y_true], tf.double)
def tf_env_step(self, action: tf.Tensor) -> List[tf.Tensor]:
    return tf.numpy_function(self.env_step, [action],
                             [tf.float32, tf.float32, tf.float32])
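# Companion sketch (assumption: a Gym-style environment held on self, as in
# the TF actor-critic tutorial this pattern follows): env_step runs as plain
# Python, and tf.numpy_function above makes it callable from @tf.function
# graphs. The float32 outputs match the Tout list declared in tf_env_step.
import numpy as np
from typing import Tuple

def env_step(self, action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    state, reward, done, _ = self.env.step(action)
    return (state.astype(np.float32),
            np.array(reward, np.float32),
            np.array(done, np.float32))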
def preprocessData(raw):
    print("---preprocessData---")
    global MAX_LENGTH
    global image_features_extract_model
    image_paths, image_path_to_caption = raw

    # Before pre-processing, each image corresponds to multiple captions.
    # We duplicate the images so that we have (image, caption) pairs.
    train_captions = []
    img_name_vector = []
    for image_path in image_paths:
        caption_list = image_path_to_caption[image_path]
        train_captions.extend(caption_list)
        img_name_vector.extend([image_path] * len(caption_list))

    encode_train = sorted(set(img_name_vector))
    image_dataset = tf.data.Dataset.from_tensor_slices(image_paths)
    image_dataset = image_dataset.map(
        load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)

    # pretrained InceptionV3 to extract features from images
    image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                    weights='imagenet')
    new_input = image_model.input
    hidden_layer = image_model.layers[-1].output
    image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

    # Cache the features extracted by InceptionV3 to disk, because RAM is not
    # sufficient to store these features for all images.
    # Note: this only needs to run the first time. Haozhe 11/25/20
    '''for img, path in tqdm(image_dataset):
        batch_features = image_features_extract_model(img)
        batch_features = tf.reshape(
            batch_features,
            (batch_features.shape[0], -1, batch_features.shape[3]))
        for bf, p in zip(batch_features, path):
            path_of_feature = p.numpy().decode("utf-8")
            np.save(path_of_feature, bf.numpy())'''

    # img_name_vector is a list of image file paths and train_captions is a
    # list of corresponding captions; the training and testing sets are split
    # from these below.

    # Preprocess and tokenize the captions.
    # Choose the top 5000 words from the vocabulary.
    tokenizer.fit_on_texts(train_captions)
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'

    # Create the tokenized vectors
    train_seqs = tokenizer.texts_to_sequences(train_captions)
    print("padding: ")
    print(train_seqs[:5])

    # Pad each vector to the max_length of the captions. If you do not provide
    # a max_length value, pad_sequences calculates it automatically.
    cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs,
                                                               padding='post')

    # Calculate max_length, which is used to store the attention weights
    MAX_LENGTH = calc_max_length(train_seqs)

    # Split the data into training and testing
    img_to_cap_vector = collections.defaultdict(list)
    for img, cap in zip(img_name_vector, cap_vector):
        img_to_cap_vector[img].append(cap)

    # Create training and validation sets using a random 80-20 split.
    img_keys = list(img_to_cap_vector.keys())
    random.shuffle(img_keys)
    slice_index = int(len(img_keys) * 0.8)
    img_name_train_keys, img_name_val_keys = (img_keys[:slice_index],
                                              img_keys[slice_index:])

    print("parsing dataset")
    img_name_train = []
    cap_train = []
    for imgt in img_name_train_keys:
        capt_len = len(img_to_cap_vector[imgt])
        img_name_train.extend([imgt] * capt_len)
        cap_train.extend(img_to_cap_vector[imgt])

    img_name_val = []
    cap_val = []
    for imgv in img_name_val_keys:
        capv_len = len(img_to_cap_vector[imgv])
        img_name_val.extend([imgv] * capv_len)
        cap_val.extend(img_to_cap_vector[imgv])

    # Create a tf.data dataset for training
    num_steps = len(img_name_train) // BATCH_SIZE
    # Shape of the vector extracted from InceptionV3 is (64, 2048).
    # These two variables represent that vector shape.
    features_shape = 2048
    attention_features_shape = 64

    dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

    # Use map to load the numpy files in parallel
    dataset = dataset.map(
        lambda item1, item2: tf.numpy_function(map_func, [item1, item2],
                                               [tf.float32, tf.int32]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Shuffle and batch
    dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=False)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset, (img_name_val, cap_val)
def tf_augment_sample(depthmap, targets):
    depthmap_aug = tf.numpy_function(augmentation,
                                     [depthmap, CONFIG.DATA_AUGMENTATION_MODE],
                                     tf.float32)
    # tf.numpy_function loses static shape information, so restore it explicitly
    depthmap_aug.set_shape((CONFIG.IMAGE_TARGET_HEIGHT, CONFIG.IMAGE_TARGET_WIDTH,
                            CONFIG.N_ARTIFACTS))
    targets.set_shape((len(CONFIG.TARGET_INDEXES),))
    return depthmap_aug, targets
def parse_function(image_filename, label_filename):
    # print('--------------{}-------------------'.format(tf.as_string(image_filename)))
    # has the filename become a tensor here?
    img = tf.numpy_function(read_img, [image_filename], tf.float32)
    label = tf.numpy_function(read_label, [label_filename], tf.float32)
    return img, label
def print_string(batch: tf.Tensor):
    tf.numpy_function(lambda x: print(*bytes_to_string(x), sep="\n"), [batch], [])
def process_data(image, label):
    aug_img = tf.numpy_function(func=aug_fn, inp=[image], Tout=tf.float32)
    return aug_img, label
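# The aug_fn referenced above is not part of this excerpt; in the common
# albumentations pattern this snippet resembles, it looks roughly like the
# sketch below (the Compose pipeline named transforms is an assumption).
import albumentations as A
import numpy as np

# hypothetical pipeline; any albumentations Compose would do
transforms = A.Compose([A.HorizontalFlip(p=0.5),
                        A.RandomBrightnessContrast(p=0.2)])

def aug_fn(image):
    data = {"image": image}
    aug_data = transforms(**data)
    aug_img = aug_data["image"]
    return aug_img.astype(np.float32)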
def tf_average_by_duration(x, durs):
    outs = tf.numpy_function(average_by_duration, [x, durs], tf.float32)
    return outs
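# A hedged numpy sketch of what average_by_duration could look like (the
# actual helper is not shown in the source): average a frame-level contour x
# (assumed 1-D, e.g. pitch or energy) over the frames assigned to each
# phoneme by durs, as done for FastSpeech2-style variance targets.
import numpy as np

def average_by_duration(x, durs):
    durs = durs.astype(np.int32)
    ends = np.cumsum(durs)
    starts = ends - durs
    out = np.zeros(len(durs), dtype=np.float32)
    for i, (s, e) in enumerate(zip(starts, ends)):
        values = x[s:e][x[s:e] != 0.0]  # ignore unvoiced/zero frames
        out[i] = values.mean() if len(values) > 0 else 0.0
    return out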
def generate_detections(params, cls_outputs, box_outputs, image_scales,
                        image_ids, flip=False):
    """A legacy interface for generating [id, x, y, w, h, score, class]."""
    _, width = utils.parse_image_size(params['image_size'])
    original_image_widths = tf.expand_dims(image_scales, -1) * width

    if params['nms_configs'].get('pyfunc', True):
        # The numpy-based soft-NMS gives better accuracy than the TensorFlow
        # builtin; the reason why is unknown.
        detections_bs = []
        boxes, scores, classes = pre_nms(params, cls_outputs, box_outputs)
        for index in range(boxes.shape[0]):
            nms_configs = params['nms_configs']
            detections = tf.numpy_function(
                functools.partial(nms_np.per_class_nms, nms_configs=nms_configs),
                [
                    boxes[index],
                    scores[index],
                    classes[index],
                    tf.slice(image_ids, [index], [1]),
                    tf.slice(image_scales, [index], [1]),
                    params['num_classes'],
                    nms_configs['max_output_size'],
                ],
                tf.float32)
            if flip:
                detections = tf.stack([
                    detections[:, 0],
                    # the mirrored location of the left edge is the image width
                    # minus the position of the right edge
                    original_image_widths[index] - detections[:, 3],
                    detections[:, 2],
                    # the mirrored location of the right edge is the image width
                    # minus the position of the left edge
                    original_image_widths[index] - detections[:, 1],
                    detections[:, 4],
                    detections[:, 5],
                    detections[:, 6],
                ], axis=-1)
            detections_bs.append(detections)
        return tf.stack(detections_bs, axis=0, name='detections')

    nms_boxes_bs, nms_scores_bs, nms_classes_bs, _ = postprocess_per_class(
        params, cls_outputs, box_outputs, image_scales)
    image_ids_bs = tf.cast(tf.expand_dims(image_ids, -1), nms_scores_bs.dtype)
    if flip:
        detections_bs = [
            image_ids_bs * tf.ones_like(nms_scores_bs),
            # the mirrored location of the left edge is the image width
            # minus the position of the right edge
            original_image_widths - nms_boxes_bs[:, :, 3],
            nms_boxes_bs[:, :, 0],
            # the mirrored location of the right edge is the image width
            # minus the position of the left edge
            original_image_widths - nms_boxes_bs[:, :, 1],
            nms_boxes_bs[:, :, 2],
            nms_scores_bs,
            nms_classes_bs,
        ]
    else:
        detections_bs = [
            image_ids_bs * tf.ones_like(nms_scores_bs),
            nms_boxes_bs[:, :, 1],
            nms_boxes_bs[:, :, 0],
            nms_boxes_bs[:, :, 3],
            nms_boxes_bs[:, :, 2],
            nms_scores_bs,
            nms_classes_bs,
        ]
    return tf.stack(detections_bs, axis=-1, name='detections')
def load_tensor_from_kaldi_archive(ark_key):
    return tf.numpy_function(_kaldiio_load, [ark_key], tf.float32)
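# A possible implementation of _kaldiio_load (assumption; the helper is not
# shown in the source), using the kaldiio package to read a matrix from a
# Kaldi archive by a "file.ark:offset"-style key:
import kaldiio
import numpy as np

def _kaldiio_load(ark_key):
    mat = kaldiio.load_mat(ark_key.decode('utf-8'))
    return mat.astype(np.float32)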
def parse_example_ctc_attention(self, line):
    norm_h = self.norm_h
    expand_rate = self.expand_rate
    debug = False
    field_delim = ' '
    use_quote_delim = False
    record_defaults = ['', '', '']
    img_path, img_text, coord = tf.io.decode_csv(line, record_defaults,
                                                 field_delim, use_quote_delim)

    ctc_idx, ctc_len, att_idx, att_len = tf.numpy_function(
        self.get_idlst_by_charstr, [img_text],
        [tf.int64, tf.int64, tf.int64, tf.int64])
    ctc_idx = tf.cast(tf.reshape(ctc_idx, [-1]), tf.int32)
    att_idx = tf.cast(tf.reshape(att_idx, [-1]), tf.int32)
    ctc_len = tf.cast(ctc_len, tf.int32)
    att_len = tf.cast(att_len, tf.int32)

    coord_val = tf.strings.split([coord], ',').values
    coord_val = tf.strings.to_number(coord_val, out_type=tf.int32)

    orig_img = tf.image.decode_image(tf.io.read_file(img_path))
    img_shape = tf.shape(orig_img)
    width = img_shape[1]
    height = img_shape[0]

    prob = tf.random.uniform([])
    invert_flg = tf.logical_and(tf.greater(prob, 0.0),
                                tf.equal(self.mode, 'train'))
    orig_img = tf.cond(
        invert_flg,
        true_fn=lambda: tf.cast(255 - orig_img, dtype=tf.uint8),
        false_fn=lambda: orig_img)

    prob = tf.random.uniform([])
    noise_flg = tf.logical_and(tf.greater(prob, 0.0),
                               tf.equal(self.mode, 'train'))
    noise_idx = tf.random.shuffle(tf.range(2))[0]
    orig_img = tf.cond(
        noise_flg,
        true_fn=lambda: random_noise_static(orig_img, noise_idx),
        false_fn=lambda: random_noise_static(orig_img, -1))

    prob = tf.random.uniform([])
    encode_flg = tf.logical_and(tf.greater(0.3, prob),
                                tf.equal(self.mode, 'train'))
    encode_idx = tf.random.shuffle(tf.range(4))[0]
    orig_img = tf.cond(
        encode_flg,
        true_fn=lambda: encode_decode_static(orig_img, encode_idx),
        false_fn=lambda: encode_decode_static(orig_img, -1))

    prob = tf.random.uniform([])
    color_flg = tf.logical_and(tf.greater(prob, 0),
                               tf.equal(self.mode, 'train'))
    color_idx = tf.random.shuffle(tf.range(6))[0]
    orig_img = tf.cond(
        color_flg,
        true_fn=lambda: distort_color_static(orig_img, color_idx),
        false_fn=lambda: distort_color_static(orig_img, -1))

    prob = tf.random.uniform([])
    coord_flg = tf.logical_and(tf.greater(prob, 0),
                               tf.equal(self.mode, 'train'))
    coord_val1 = tf.cond(
        coord_flg,
        true_fn=lambda: coord_augmentation(coord_val, width, height),
        false_fn=lambda: (coord_val[0], coord_val[1], coord_val[2], coord_val[3]))

    offset_w = coord_val1[0]
    offset_h = coord_val1[1]
    target_w = coord_val1[2] - coord_val1[0]
    target_h = coord_val1[3] - coord_val1[1]
    crop_img = tf.image.crop_to_bounding_box(orig_img, offset_h, offset_w,
                                             target_h, target_w)

    ratio = tf.cast(norm_h, tf.float32) / tf.cast(target_h, tf.float32)
    norm_w = tf.cast(tf.cast(target_w, tf.float32) * expand_rate * ratio, tf.int32)
    norm_img = tf.image.resize(crop_img, (norm_h, norm_w))
    if debug:
        norm_img = tf.cast(norm_img, tf.uint8)
    else:
        # convert RGB --> BGR and subtract the mean
        mean = [127.5, 127.5, 127.5]
        norm_img = norm_img[:, :, ::-1]
        norm_img = norm_img - mean
    return img_path, norm_img, img_text, ctc_idx, ctc_len, att_idx, att_len, coord, norm_w