def testNestedOutputs(self):
    """Iterate a nested zip of ranges and check the structure of each element.

    The dataset pairs ``range(4)`` with a zip of two further ``range(4)``
    datasets, so element ``i`` is expected to be ``(i, (i, i))`` and exactly
    four elements are expected in total.
    """
    outer_range = Dataset.range(4)
    inner_pair = Dataset.zip((Dataset.range(4), Dataset.range(4)))
    ds = Dataset.zip((outer_range, inner_pair))

    seen = 0
    # The iterator yields a nested structure of Tensor objects, so the values
    # are extracted with .numpy() before comparing against plain integers.
    for index, element in enumerate(datasets.Iterator(ds)):
        actual = (
            element[0].numpy(),
            (element[1][0].numpy(), element[1][1].numpy()),
        )
        expected = (index, (index, index))
        self.assertEqual(actual, expected)
        seen += 1
    self.assertEqual(4, seen)
def testNestedOutputs(self):
    """Check that a nested ``Dataset.zip`` yields nested elements.

    Element ``i`` of the dataset built here is expected to equal
    ``(i, (i, i))``; the test also verifies that four elements are produced.
    """
    outer_range = Dataset.range(4)
    inner_pair = Dataset.zip((Dataset.range(4), Dataset.range(4)))
    ds = Dataset.zip((outer_range, inner_pair))

    count = 0
    for position, nested in enumerate(datasets.Iterator(ds)):
        # Every leaf is a Tensor object; convert each one with .numpy() so
        # the tuple can be compared against simple integers.
        first = nested[0].numpy()
        second = (nested[1][0].numpy(), nested[1][1].numpy())
        self.assertEqual((first, second), (position, (position, position)))
        count += 1
    self.assertEqual(4, count)
def input_fn():
    """Build the input tensors for the model class captured from the enclosing scope.

    The dataset is selected by ``model_class.__name__`` and zipped with a
    task-name dataset, then read through a one-shot iterator. This function
    relies on ``model_class``, ``labeled``, ``task`` and ``self`` being
    defined by the surrounding scope.

    :return: tuple ``(inputs, labels)`` where ``inputs`` is a dict with the
        keys ``"sequences"``, ``"seq_lens"`` and ``"tasks"``
    :raises ValueError: if the model class name is not one of
        ``ClassifierModel``, ``ClassifierSynthModel`` or ``VAECondModel``
    """
    name = model_class.__name__
    if name == "ClassifierModel":
        dataset = Dataset.zip((labeled(), task("classify")))
    elif name == "ClassifierSynthModel":
        synthetic = self.dataset_synthetic()
        dataset = Dataset.zip((synthetic, task("classify_synth")))
    elif name == "VAECondModel":
        dataset = Dataset.zip((labeled(), task("vae_cond_lab")))
    else:
        # Without this branch an unknown model class left `dataset` unbound
        # and the failure surfaced as an opaque UnboundLocalError below.
        raise ValueError(
            "Unsupported model class for input_fn: {!r}".format(name))

    iterator = dataset.make_one_shot_iterator()
    # Each element is ((sequences, seq_lens, labels), task_name).
    (sequences, seq_lens, labels), tasks = iterator.get_next()
    inputs = {
        "sequences": sequences,
        "seq_lens": seq_lens,
        "tasks": tasks,
    }
    return inputs, labels
def datasets(self):
    """Assemble one dataset per task, each element tagged with its task name.

    Every returned dataset is a zip of a source dataset and an endlessly
    repeated string constant holding the task name.

    :return: dict mapping the task names ``"vae_uncond"``, ``"classify"``,
        ``"vae_cond_lab"``, ``"vae_cond_unlab"`` and ``"classify_synth"`` to
        their zipped datasets
    """
    synthetic = self.dataset_synthetic()
    labeled_raw = self.get_dataset(labeled=True)
    unlabeled_raw = self.get_dataset(labeled=False)

    labeled = self.repeat_and_shuffle(labeled_raw)
    unlabeled = self.repeat_and_shuffle(unlabeled_raw)
    # The unconditional task draws from labeled and unlabeled data together.
    combined = self.repeat_and_shuffle(labeled_raw.concatenate(unlabeled_raw))

    def task(task_name):
        """Return a dataset that yields ``task_name`` as a string tensor forever."""
        return Dataset.from_tensors(tf.constant(task_name, tf.string)).repeat()

    # Listed in the order the zipped datasets are built.
    sources = (
        ("vae_uncond", combined),
        ("classify", labeled),
        ("vae_cond_lab", labeled),
        ("vae_cond_unlab", unlabeled),
        ("classify_synth", synthetic),
    )
    return {
        name: Dataset.zip((source, task(name)))
        for name, source in sources
    }
def construct(dataset, frames, dilation):
    """Zip ``frames`` progressively skipped copies of ``dataset`` together.

    NOTE(review): in the code visible here ``dilation`` is never referenced
    and no value is returned; the zipped result is only bound to the local
    name ``dataset``. The function may be truncated -- confirm against the
    full source before relying on either.

    :param dataset: dataset to window; must support ``skip``
    :param frames: number of shifted copies to zip together
    :param dilation: see the review note above
    """
    # We want to create a dataset that provides a `frames`-sized moving
    # window over the original dataset: e.g., (1,2,3,4), then (2,3,4,5),
    # then (3,4,5,6), and so on for `frames == 4`. We do this by creating
    # `frames` datasets, each of which skips the first `i` elements of the
    # original dataset. Zipping them together then produces the dataset
    # we're looking for. We can additionally apply a dilation, where we
    # would see e.g. for `dilation == 2` (1,2,4,8), (2,3,5,9), ...
    datasets = []
    for i in range(frames):
        datasets.append(dataset.skip(i))
    dataset = Dataset.zip(tuple(datasets))
def dataset_synthetic(self):
    """Build the placeholder dataset that drives synthetic-data generation.

    This does not generate synthetic data itself. It produces one dummy
    example per class: an all-zero sequence of ``max_seq_len`` entries, a
    sequence length equal to ``max_seq_len`` and the class index as label.

    :return: the zipped ``(sequences, seq_lens, labels)`` dataset after
        ``repeat_and_shuffle`` has been applied
    """
    num_classes = self.hparams.num_classes
    max_len = self.hparams.max_seq_len

    zero_seqs = tf.zeros(shape=[num_classes, max_len], dtype=tf.int32)
    full_lens = tf.ones(shape=[num_classes, 1], dtype=tf.int32) * max_len
    class_ids = tf.reshape(
        tf.constant(np.arange(num_classes), dtype=tf.int32),
        [num_classes, 1])

    # Slice along the first axis so each class becomes one dataset element.
    components = tuple(
        Dataset.from_tensor_slices(tensor)
        for tensor in (zero_seqs, full_lens, class_ids)
    )
    return self.repeat_and_shuffle(Dataset.zip(components))
def split_multi_string(self, x): """ Splits a list of strings on whitespaces and casts them to int :param x: list of sequences :return: list of int tensors """ # Split best answers on comma n_best_answers = tf.string_split([x], delimiter=",") # Reformat data to sparse tensor n_best_answers = tf.SparseTensorValue(indices=n_best_answers.indices, values=tf.string_split(n_best_answers.values), dense_shape=n_best_answers.dense_shape) # Get data as sparse tensor up_ba = n_best_answers.values # Sparse tensor to dense Tensor with padding '0' up_ba = tf.sparse_to_dense(up_ba.indices, (up_ba.dense_shape[0], up_ba.dense_shape[1] + 1), up_ba.values, '0') # If n_best_answer not empty, convert every answer to int up_ba = tf.cond(tf.greater(tf.size(up_ba), 0), lambda: tf.map_fn(lambda s: tf.string_to_number(s, out_type=tf.int32), up_ba, dtype=tf.int32), lambda: tf.string_to_number(up_ba, out_type=tf.int32)) # Get length of all sequences seq_len = tf.argmin(tf.to_int32(tf.not_equal(up_ba, 0)), axis=1) # Create filter mask idx = tf.less_equal(seq_len, self.max_seq_len - 1) # Filter sequences and length up_ba = tf.boolean_mask(up_ba, idx) up_ba = up_ba[:, 0:tf.cond(tf.greater(tf.size(up_ba), 0), lambda: self.max_seq_len, lambda: 0)] up_ba = tf.pad(up_ba, [[0, 0], [0, self.max_seq_len - tf.shape(up_ba)[1]]]) seq_len = tf.boolean_mask(seq_len, idx) + 1 # Make datasets sequences_nba = Dataset.from_tensors(up_ba) len_nba = Dataset.from_tensors(seq_len) return Dataset.zip((sequences_nba, len_nba))
def process_target(self, sub, len_sub, cont, len_cont, nba, nba_len):
    """
    Derives the decoder input and output sequences from the answers.

    The target input is the answer matrix shifted right by one column with
    the GO symbol (2) in front; the target output is each answer with the
    EOS symbol (3) written at the position returned by ``tf.argmin``.

    :param sub: subject sequences
    :param len_sub: length of subject sequences
    :param cont: content sequences
    :param len_cont: length of content sequences
    :param nba: n best answers sequences
    :param nba_len: length of n best answers
    :return: Dataset zipping subject, subject length, content, content
        length, target input, target output and answer lengths
    """
    go_symbol = 2
    eos_symbol = 3

    def _mark_end(answer):
        """
        Replaces the entry at the argmin position of one answer with EOS

        :param answer: One answer out of n best answers
        :return: Answer with the EOS symbol inserted
        """
        end_pos = tf.argmin(answer, output_type=tf.int32)
        before = answer[:end_pos]
        after = answer[end_pos + 1:]
        return tf.concat([before, [eos_symbol], after], axis=0)

    # One GO column per answer, followed by the answers without last column
    go_column = tf.fill((tf.size(nba_len), 1), go_symbol)
    target_input = tf.concat([go_column, nba[:, :-1]], axis=1)
    # EOS is set answer by answer
    target_output = tf.map_fn(_mark_end, nba, dtype=tf.int32)

    # Wrap every component in a single-element dataset again
    components = (sub, len_sub, cont, len_cont,
                  target_input, target_output, nba_len)
    return Dataset.zip(
        tuple(Dataset.from_tensors(component) for component in components))
def bucketing(questions, answers, boundaries, batch_size, shuffle,
              shuffle_size):
    '''
    Bucketing questions and answers for training.

    Every (question, answer) pair is assigned to the first bucket whose
    boundaries are not exceeded by either length; pairs of the same bucket
    are then padded and batched together. Pairs that fit no bucket are
    dropped.

    :param questions: dataset of question sequences
    :param answers: dataset of answer sequences
    :param boundaries: sequence of (max question length, max answer length)
        pairs, one per bucket
    :param batch_size: number of pairs per batch; also used as window size
        for the grouping
    :param shuffle: if truthy, the resulting batches are shuffled
    :param shuffle_size: shuffle buffer size, counted in batches
    :return: Tuple of (question length, question, answer length, answer)
    :raises ValueError: if ``boundaries`` is empty
    '''
    boundaries = list(boundaries)
    if not boundaries:
        raise ValueError(
            "boundaries must contain at least one (question, answer) pair")
    # Computed once here instead of on every call of the key function.
    q_max_boundaries, a_max_boundaries = zip(*boundaries)

    def _fits(question_len, answer_len):
        # Boolean vector with one entry per bucket.
        return tf.logical_and(question_len <= q_max_boundaries,
                              answer_len <= a_max_boundaries)

    def _which_bucket(question_len, question, answer_len, answer):
        # Index of the first bucket that can hold the pair.
        which_bucket = tf.reduce_min(
            tf.where(_fits(question_len, answer_len)))
        return tf.to_int64(which_bucket)

    def _fits_any_bucket(question_len, question, answer_len, answer):
        # The previous filter compared against max(boundaries), which orders
        # the pairs lexicographically and therefore only accepted pairs that
        # fit that single bucket. Pairs fitting another bucket were dropped
        # whenever no bucket dominated all others in both dimensions.
        return tf.reduce_any(_fits(question_len, answer_len))

    def _reduce_batch(key, batch):
        return batch.padded_batch(batch_size, ((), (None, ), (), (None, )))

    questions_and_answers = Dataset.zip((
        questions.map(lambda q: tf.size(q)),
        questions,
        answers.map(lambda a: tf.size(a)),
        answers,
    )).filter(_fits_any_bucket)

    questions_and_answers = questions_and_answers.group_by_window(
        _which_bucket, _reduce_batch, batch_size)

    if shuffle:
        questions_and_answers = questions_and_answers.shuffle(shuffle_size)

    return questions_and_answers
def get_seq_len_and_join_ba(self, sub, cont, nba_and_len):
    """
    Adds the sizes of the subject and content sequences and joins them with
    the n best answers

    :param sub: subject sequences
    :param cont: content sequences
    :param nba_and_len: pair of n best answers and their lengths
    :return: Dataset zipping subject, subject length, content, content
        length, n best answers and answer lengths
    """
    # Sizes are taken from the raw tensors before they are wrapped
    sub_size = Dataset.from_tensors(tf.size(sub))
    cont_size = Dataset.from_tensors(tf.size(cont))

    # Wrap the sequences and both parts of the answer pair
    sub_ds = Dataset.from_tensors(sub)
    cont_ds = Dataset.from_tensors(cont)
    answers_ds = Dataset.from_tensors(nba_and_len[0])
    answer_lens_ds = Dataset.from_tensors(nba_and_len[1])

    return Dataset.zip(
        (sub_ds, sub_size, cont_ds, cont_size, answers_ds, answer_lens_ds))
def dataset_with_label(self, label_int, src_pattern):
    """Pair every text line of the matching files with a constant label.

    :param label_int: integer label attached to every line
    :param src_pattern: file pattern passed to ``Dataset.list_files``
    :return: dataset of ``(line, label)`` elements
    """
    label = tf.constant(label_int, tf.int32, name="label")

    def read_lines(file_name):
        return TextLineDataset(file_name)

    matching_files = Dataset.list_files(src_pattern)
    lines = matching_files.flat_map(read_lines)
    # The label dataset repeats indefinitely; the zip is bounded by `lines`.
    constant_labels = Dataset.from_tensors(label).repeat()
    return Dataset.zip((lines, constant_labels))
def model_fn_inner(features, labels, mode, params, config):
    """Build the ``EstimatorSpec`` of the ``AttendTell`` model for ``mode``.

    ``features`` and ``labels`` are handled as datasets here (they are
    zipped, mapped and batched), not as tensors. The iterator initializers
    are published through the graph collections ``"train_initializer"``,
    ``"val_initializer"`` and ``"infer_initializer"``.

    :param features: dataset of inputs; in INFER mode every batch is
        unpacked as ``(image_id, features)``
    :param labels: dataset of captions, only read in TRAIN and EVAL mode --
        presumably vocabulary index sequences, TODO confirm
    :param mode: one of the ``ModeKeys`` values
    :param params: hyper-parameters; reads ``bin_size``, ``batch_size``,
        ``vocab_size``, ``selector``, ``dropout``, ``ctx2out``, ``prev2out``,
        ``hard_attention``, ``use_sampler`` and ``learning_rate``
    :param config: not referenced in this function
    :return: ``EstimatorSpec``; in INFER mode ``predictions`` is a dict with
        the keys ``"image_id"`` and ``"predictions"``
    """
    feat_tensor = cap_idx_tensor = cap_len_tensor = None
    scaffold = None
    if mode == ModeKeys.TRAIN or mode == ModeKeys.EVAL:
        # Caption lengths are derived from the label dataset itself.
        cap_lens = labels.map(lambda t: tf.size(t))
        # Padded shapes for (features, caption, caption length).
        pad_size = ((params.bin_size * params.bin_size, 1536), (None, ), ())
        batches = Dataset.zip((features, labels, cap_lens)) \
            .shuffle(buffer_size=200 * params.batch_size) \
            .padded_batch(params.batch_size, pad_size)
    if mode == ModeKeys.TRAIN:
        # Training repeats the batches indefinitely.
        train_iterator = batches \
            .repeat() \
            .make_initializable_iterator()
        feat_tensor, cap_idx_tensor, cap_len_tensor = \
            train_iterator.get_next()
        tf.add_to_collection("train_initializer", train_iterator.initializer)
    if mode == ModeKeys.EVAL:
        val_iterator = batches \
            .make_initializable_iterator()
        feat_tensor, cap_idx_tensor, cap_len_tensor = \
            val_iterator.get_next()
        tf.add_to_collection("val_initializer", val_iterator.initializer)
        # Only EVAL gets a scaffold; its init_op is the iterator initializer.
        scaffold = tf.train.Scaffold(init_op=val_iterator.initializer)
    if mode == ModeKeys.INFER:
        # For inference the image id has to be carried along with the
        # features, so the feature dataset is batched on its own.
        batches = features.padded_batch(
            params.batch_size,
            ((), (params.bin_size * params.bin_size, 1536)))
        infer_iterator = batches.make_initializable_iterator()
        image_id, feat_tensor = infer_iterator.get_next()
        tf.add_to_collection("infer_initializer", infer_iterator.initializer)
    loss_op = None
    train_op = None
    predictions = None
    model = AttendTell(vocab_size=params.vocab_size,
                       dim_feature=(params.bin_size * params.bin_size, 1536),
                       selector=params.selector,
                       dropout=params.dropout,
                       ctx2out=params.ctx2out,
                       prev2out=params.prev2out,
                       hard_attention=params.hard_attention,
                       mode=mode)
    if mode != ModeKeys.INFER:
        if params.use_sampler:
            outputs = model.build_train(feat_tensor,
                                        cap_idx_tensor,
                                        use_generated_inputs=True)
        else:
            outputs = model.build_train(feat_tensor,
                                        cap_idx_tensor,
                                        use_generated_inputs=False)
        loss_op = create_loss(outputs, cap_idx_tensor, cap_len_tensor)
        # NOTE(review): the train op is created for every non-INFER mode,
        # i.e. for EVAL as well -- confirm this is intended.
        train_op = _get_train_op(loss_op, params.learning_rate,
                                 params.hard_attention)
    else:
        outputs = model.build_infer(feat_tensor)
        predictions = tf.argmax(outputs, axis=-1)
    if mode != ModeKeys.INFER:
        return EstimatorSpec(mode=mode,
                             predictions=predictions,
                             loss=loss_op,
                             train_op=train_op,
                             scaffold=scaffold)
    else:
        # `image_id` is only bound by the INFER branch above.
        return EstimatorSpec(mode=mode,
                             predictions={
                                 "image_id": image_id,
                                 "predictions": predictions
                             },
                             loss=loss_op,
                             train_op=train_op,
                             scaffold=scaffold)
def model_fn(features, labels, mode, params, config):
    """Build the ``EstimatorSpec`` of the ``AttendTell`` model for ``mode``.

    ``features`` is handled as a dataset and ``labels`` as a dict of
    datasets with the keys ``"raw"`` and ``"index"``. The batched inputs are
    passed through ``_extract_feats`` before they reach the model. Iterator
    initializers are published through the graph collections
    ``"train_initializer"``, ``"val_initializer"`` and
    ``"infer_initializer"``.

    :param features: dataset of inputs; padded as ``(None, None, 3)`` in
        TRAIN/EVAL mode, so presumably images -- TODO confirm
    :param labels: dict of datasets, only read in TRAIN and EVAL mode
    :param mode: one of the ``ModeKeys`` values
    :param params: hyper-parameters; reads ``batch_size``, ``vgg_model_path``,
        ``vocab_size``, ``selector``, ``dropout``, ``ctx2out``, ``prev2out``,
        ``hard_attention``, ``use_sampler`` and ``learning_rate``
    :param config: not referenced in this function
    :return: ``EstimatorSpec`` for the given mode
    """
    # NOTE(review): `caption_tensor` is assigned below but never read.
    feat_tensor = caption_tensor = cap_idx_tensor = cap_len_tensor = None
    scaffold = None
    bin_size = 8
    if mode == ModeKeys.TRAIN or mode == ModeKeys.EVAL:
        # Caption lengths are derived from the index captions.
        cap_lens = labels["index"].map(lambda t: tf.size(t))
        # todo: cannot utilize GPU to accelerate input pipeline, so train 1 by 1
        # def extract_feats(image):
        #     with tf.device("/gpu:0"):
        #         _, end_points = vgg.vgg_16(tf.expand_dims(image, 0),
        #                                    is_training=(mode == ModeKeys.TRAIN),
        #                                    spatial_squeeze=False)
        #         final_conv_layer = end_points['vgg_16/conv5/conv5_3']
        #         feats = spatial_pyramid_pooling(final_conv_layer, [bin_size], mode='avg')
        #     return tf.reshape(feats, shape=(bin_size * bin_size, tf.shape(final_conv_layer)[-1]))
        # features = features.map(extract_feats)
        datasets = (features, labels["raw"], labels["index"], cap_lens)
        # todo: 512 is the feature depth, should not hard code here
        # pad_size = ((bin_size * bin_size, 512), (), (None,), ())
        pad_size = ((None, None, 3), (), (None, ), ())
        # todo: cannot utilize GPU to accelerate input pipeline, so train 1 by 1
        # (the batch size is fixed to 1 here; params.batch_size only scales
        # the shuffle buffer)
        batches = Dataset.zip(datasets) \
            .shuffle(buffer_size=200 * params.batch_size) \
            .padded_batch(1, pad_size)
    if mode == ModeKeys.TRAIN:
        # Training repeats the batches indefinitely.
        train_iterator = batches \
            .repeat() \
            .make_initializable_iterator()
        feat_tensor, caption_tensor, cap_idx_tensor, cap_len_tensor = \
            train_iterator.get_next()
        tf.add_to_collection("train_initializer", train_iterator.initializer)
    if mode == ModeKeys.EVAL:
        val_iterator = batches \
            .make_initializable_iterator()
        feat_tensor, caption_tensor, cap_idx_tensor, cap_len_tensor = \
            val_iterator.get_next()
        tf.add_to_collection("val_initializer", val_iterator.initializer)
        scaffold = tf.train.Scaffold(init_op=val_iterator.initializer)
    if mode == ModeKeys.INFER:
        batches = features.batch(params.batch_size)
        infer_iterator = batches.make_initializable_iterator()
        feat_tensor = infer_iterator.get_next()
        tf.add_to_collection("infer_initializer", infer_iterator.initializer)
    # In every mode the batched input is replaced by the extracted features.
    feat_tensor = _extract_feats(bin_size, feat_tensor, mode)
    if mode == ModeKeys.TRAIN:
        # Restore everything except the global step from the checkpoint at
        # params.vgg_model_path.
        variables_to_restore = slim.get_variables_to_restore(
            exclude=['global_step'])
        init_fn = assign_from_checkpoint_fn(params.vgg_model_path,
                                            variables_to_restore)
        # The lambda adapts the two-argument call made on the scaffold's
        # init_fn to the checkpoint init_fn, which is called with the
        # session only.
        scaffold = tf.train.Scaffold(init_fn=lambda _, sess: init_fn(sess))
    loss_op = None
    train_op = None
    predictions = None
    model = AttendTell(vocab_size=params.vocab_size,
                       selector=params.selector,
                       dropout=params.dropout,
                       ctx2out=params.ctx2out,
                       prev2out=params.prev2out,
                       hard_attention=params.hard_attention,
                       mode=mode)
    if mode != ModeKeys.INFER:
        if params.use_sampler:
            outputs = model.build_train(feat_tensor,
                                        cap_idx_tensor,
                                        use_generated_inputs=True)
        else:
            outputs = model.build_train(feat_tensor,
                                        cap_idx_tensor,
                                        use_generated_inputs=False)
        loss_op = create_loss(outputs, cap_idx_tensor, cap_len_tensor)
        # NOTE(review): the train op is created for every non-INFER mode,
        # i.e. for EVAL as well -- confirm this is intended.
        train_op = _get_train_op(loss_op, params.learning_rate,
                                 params.hard_attention)
    else:
        outputs = model.build_infer(feat_tensor)
        predictions = tf.argmax(outputs, axis=-1)
    return EstimatorSpec(mode=mode,
                         predictions=predictions,
                         loss=loss_op,
                         train_op=train_op,
                         scaffold=scaffold)
def input_fn(self, num_threads=1):
    """
    Reads the subject, content and n-best-answer sequence files below
    ``self.data_path`` and turns them into padded training and validation
    batches for the seq2seq system.

    :param num_threads: Number of parallel operations used when mapping the
        subject and content lines
    :return: training iterator for iterating over training data and
        validation iterator for iterating over validation data
    """
    def read_split_lines(file_name):
        # One text line per element, split by self.split_string
        lines = TextLineDataset(self.data_path + file_name)
        return lines.map(self.split_string, num_threads=num_threads)

    def pad_into_batches(data):
        # Same padded shapes for training and validation batches
        return data.padded_batch(
            self.batch_size,
            padded_shapes=([None], [], [None], [],
                           [None, self.max_seq_len],
                           [None, self.max_seq_len],
                           [None]))

    subjects = read_split_lines("sequences_subject.txt")
    contents = read_split_lines("sequences_content.txt")
    answers = TextLineDataset(
        self.data_path + "sequences_n_best_answers.txt"
    ).flat_map(self.split_multi_string)

    # Merge the three sources, attach lengths, drop over-long sequences,
    # add GO/EOS symbols to the targets and pad to a fixed length
    examples = (
        Dataset.zip((subjects, contents, answers))
        .flat_map(self.get_seq_len_and_join_ba)
        .filter(self.filter_by_sequence_length)
        .flat_map(self.process_target)
        .map(self.process_pad)
    )

    # Count of validation data. In the thesis, the value 10000 was used.
    # To work with little data count, the value is set to 100 for now
    n_val_data = 100

    # The first n_val_data elements form the endlessly repeated validation set
    validation_data = pad_into_batches(examples.take(n_val_data).repeat())

    # Everything after them is repeated and shuffled with a buffer of 10000
    # elements for training
    training_data = pad_into_batches(
        examples.skip(n_val_data).repeat().shuffle(10000))

    training_iterator = training_data.make_one_shot_iterator()
    validation_iterator = validation_data.make_one_shot_iterator()
    return training_iterator, validation_iterator