import tensorflow as tf


def preprocessing(dsData: tf.data.Dataset, window_size, batch_size):
    # Slide a window of window_size + 1 elements over the series, one step
    # at a time; drop_remainder keeps every window the same length.
    dsData = dsData.window(window_size + 1, shift=1, drop_remainder=True)
    # Each window is itself a small dataset; batching it into one tensor and
    # flattening yields a dataset of (window_size + 1,)-shaped tensors.
    dsData = dsData.flat_map(lambda w: w.batch(window_size + 1))
    # Split each window into (features, label): the last element is the target.
    dsData = dsData.map(lambda x: (x[:-1], x[-1]))
    dsData = dsData.shuffle(1000)
    dsData = dsData.batch(batch_size).prefetch(1)
    return dsData
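# Minimal usage sketch for preprocessing(); the synthetic series and the
# hyperparameter values below are illustrative, not taken from the original.
series = tf.data.Dataset.from_tensor_slices(tf.range(100, dtype=tf.float32))
train_ds = preprocessing(series, window_size=20, batch_size=32)
for x, y in train_ds.take(1):
    print(x.shape, y.shape)  # (32, 20) (32,)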
def processing(dataset: tf.data.Dataset, window_size, batch_size):
    # `table` is a vocabulary lookup table defined in the enclosing scope;
    # it maps raw tokens to integer ids (see the sketch below).
    dataset = dataset.map(lambda x: table.lookup(x))
    dataset = dataset.unbatch()
    dataset = dataset.window(window_size + 1, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda ds: ds.batch(window_size + 1))
    # Features are the first window_size ids; the label is the final id,
    # shifted down by one so label classes start at 0.
    dataset = dataset.map(lambda x: (x[:-1], x[-1] - 1))
    dataset = dataset.shuffle(10000)
    dataset = dataset.batch(batch_size).prefetch(1)
    return dataset
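# processing() reads `table` from the enclosing scope. A hedged sketch of the
# kind of table it assumes (vocabulary and ids here are purely illustrative):
keys = tf.constant(["the", "cat", "sat"])      # hypothetical vocabulary
vals = tf.constant([1, 2, 3], dtype=tf.int64)  # ids start at 1; 0 marks OOV
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(keys, vals), default_value=0)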
def train(self, dataset: tf.data.Dataset, nr_records: int):
    # transform_train_data expands each record into a sub-dataset of
    # training examples; grad() (defined elsewhere) returns the loss and
    # its gradients w.r.t. the factor matrices U and P.
    dataset = dataset.flat_map(self.transform_train_data).batch(self.batch_size)
    dataset = dataset.shuffle(2048)  # note: this shuffles whole batches
    nr_steps = nr_records // self.batch_size
    for i in range(self.epochs):
        step = 0
        for data in dataset:
            loss_value, grads = grad(self.model, data)
            self.optimizer.apply_gradients(
                zip(grads, [self.model.U, self.model.P]))
            # printProgressBar(step, nr_steps,
            #                  'Epoch {}, loss: {:.3f}'.format(i, loss_value),
            #                  length=80)
            if step % 10 == 0:
                print("\rEpoch #{} Loss at step {}: {:.4f}".format(
                    i, step, loss_value), end='\r')
            step += 1
        print()
def train(self, dataset: tf.data.Dataset, nr_records: int):
    dataset = dataset.flat_map(self.transform_train_data).batch(self.batch_size)
    dataset = dataset.shuffle(4096)  # note: this shuffles whole batches
    nr_steps = nr_records // self.batch_size
    for i in range(self.epochs):
        step = 0
        # A running mean gives a smoother progress readout than the raw
        # per-step loss printed by the previous variant.
        metric_mean = tf.keras.metrics.Mean()
        for data in dataset:
            loss_value, grads = grad(self.model, data)
            self.optimizer.apply_gradients(
                zip(grads, [self.model.U, self.model.P]))
            metric_mean.update_state(loss_value)
            if step % 10 == 0:
                print("\rEpoch #{} Loss at step {}: {:.4f}".format(
                    i, step, metric_mean.result()), end='\r')
            step += 1
        print()
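# Both train() variants above call an external grad() helper. A plausible
# sketch of its shape, assuming the model exposes a loss(data) method and
# factor matrices U and P (all of which are assumptions, not source code):
def grad(model, data):
    with tf.GradientTape() as tape:
        loss_value = model.loss(data)  # hypothetical loss method
    # Gradients are taken w.r.t. the two factor matrices, matching the
    # apply_gradients calls in the loops above.
    return loss_value, tape.gradient(loss_value, [model.U, model.P])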
def transform_to_reader_validate_dataset(
    dataset: tf.data.Dataset, max_sequence_length: int = 256
):
    def _flat_map(element):
        answers = element['answers']
        question = element['question']
        sequence_ids = element['passages/sequence_ids']
        passage_offset = element['passages/passage_offset']
        has_answer = element['has_answer']

        sequence_ids = tf.sparse.to_dense(sequence_ids)

        # Keep only the passages that actually contain an answer.
        filtered_idxs = tf.where(has_answer)
        filtered_sequence_ids = tf.gather_nd(sequence_ids, filtered_idxs)

        # Truncate to max_sequence_length, then right-pad to a fixed width.
        filtered_sequence_ids = filtered_sequence_ids[:, :max_sequence_length]
        filtered_sequence_ids = tf.pad(
            filtered_sequence_ids,
            [[0, 0],
             [0, max_sequence_length - tf.shape(filtered_sequence_ids)[1]]])
        filtered_sequence_ids = tf.reshape(
            filtered_sequence_ids, [-1, max_sequence_length])

        filtered_passage_offset = tf.gather_nd(passage_offset, filtered_idxs)
        filtered_passage_offset = tf.reshape(filtered_passage_offset, [-1])

        # Repeat the question and answers once per retained passage so that
        # from_tensor_slices yields one reader example per passage.
        answers = tf.expand_dims(answers, axis=0)
        answers = tf.tile(
            answers, multiples=[tf.shape(filtered_passage_offset)[0]])
        question = tf.tile(
            question, multiples=[tf.shape(filtered_passage_offset)[0]])

        tensors = {
            "answers": answers,
            "question": question,
            "input_ids": filtered_sequence_ids,
            "passage_offset": filtered_passage_offset,
        }
        return tf.data.Dataset.from_tensor_slices(tensors)

    return dataset.flat_map(_flat_map)
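# A toy element showing the expansion transform_to_reader_validate_dataset()
# performs; every field value below is illustrative. Only the second passage
# has an answer, so exactly one reader example comes out.
toy = {
    'answers': tf.constant('Paris'),
    'question': tf.constant(['Where is the Eiffel Tower?']),
    'passages/sequence_ids': tf.sparse.from_dense(
        tf.constant([[1, 2, 3], [4, 5, 6]])),
    'passages/passage_offset': tf.constant([0, 17]),
    'has_answer': tf.constant([False, True]),
}
reader_ds = transform_to_reader_validate_dataset(
    tf.data.Dataset.from_tensors(toy), max_sequence_length=8)
for ex in reader_ds:
    print(ex['input_ids'].shape, ex['passage_offset'].numpy())  # (8,) 17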
from typing import Dict, Optional
import datetime
import pickle

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import SGD


def train(
    self,
    dataset: tf.data.Dataset,
    class_weights: Optional[Dict[int, float]],
    n_features: int,
    context_size: int,
    word_embeddings: int,
    n_epoch: int = 100,
    from_latest: bool = False,
    name: str = "",
    **kwargs,
):
    # dataset: sequence of (None, n_features), (None, n_labels).
    # seq2seqofcontexts, net_1d and CNNParams are project-local helpers.
    if class_weights is not None:
        class_weights_tensor = tf.convert_to_tensor(
            list(class_weights.values()), dtype="float32")

    def dsmap(x, y):
        nonlocal word_embeddings, context_size
        # With word embeddings enabled, x is a pair of sequences: build a
        # context dataset for each and zip them into one input stream.
        if word_embeddings > 0:
            a = seq2seqofcontexts(x[0], context_size)
            b = seq2seqofcontexts(x[1], context_size)
            inputs = tf.data.Dataset.zip((a, b))
        else:
            inputs = seq2seqofcontexts(x, context_size)
        if class_weights is not None:
            # Scale the one-hot labels so the loss is weighted per class.
            y = y * class_weights_tensor
        y = tf.data.Dataset.from_tensor_slices(y)
        return tf.data.Dataset.zip((inputs, y))

    dataset = dataset.flat_map(dsmap).shuffle(4096).batch(2048).prefetch(3)

    if from_latest:
        print("Reloading from checkpoint. "
              "Checking that parameters haven't changed.")
        assert word_embeddings == self.params.word_embeddings
        assert context_size == self.params.context_size
        assert n_features == self.params.n_features
        self._model = load_model(self.path + "-chk")
    else:
        self._model = net_1d(n_features, context_size,
                             1 + len(self.labels), word_embeddings)

    self.model.summary()
    self.model.compile(
        optimizer=SGD(learning_rate=0.01, momentum=0.9, nesterov=True),
        loss="categorical_crossentropy",
    )

    with open(self.params_path, "wb") as f:
        self._params = CNNParams(context_size, word_embeddings,
                                 class_weights is not None, n_features)
        pickle.dump(self._params, f)

    log_dir = (f"{self.path}/logs/{name}/"
               + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir, histogram_freq=1)

    self.model.fit(
        dataset,
        epochs=n_epoch,
        verbose=1,
        callbacks=[ModelCheckpoint(self.path + "-chk"), tensorboard_callback],
    )
    self.model.save(self.path)
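# seq2seqofcontexts() is project-local and not shown here. A hedged sketch
# consistent with how dsmap uses it: one fixed-width context window per
# sequence position (the padding scheme is an assumption):
def seq2seqofcontexts(seq, context_size):
    # Pad both ends, then slide a (2 * context_size + 1)-step window along
    # the time axis, yielding one context per original position.
    padded = tf.pad(seq, [[context_size, context_size], [0, 0]])
    windows = tf.signal.frame(padded, 2 * context_size + 1, 1, axis=0)
    return tf.data.Dataset.from_tensor_slices(windows)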
def labelled_video_dataset_to_image_dataset(video_dataset: tf.data.Dataset,
                                            take_n: int):
    # Expand each labelled video element into (at most) take_n of its frames.
    dataset = video_dataset.flat_map(
        labelled_video_element_to_frame_dataset_mapper('video', take_n=take_n))
    return dataset
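# labelled_video_element_to_frame_dataset_mapper() is defined elsewhere; a
# plausible sketch of its shape, assuming elements are dicts holding a frames
# tensor under `key` plus a 'label' field (both assumptions):
def labelled_video_element_to_frame_dataset_mapper(key, take_n):
    def _mapper(element):
        # Emit (frame, label) pairs for the first take_n frames of the video.
        frames = tf.data.Dataset.from_tensor_slices(element[key]).take(take_n)
        labels = tf.data.Dataset.from_tensors(element['label']).repeat()
        return tf.data.Dataset.zip((frames, labels))
    return _mapper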
def transform_to_reader_train_dataset(
    dataset: tf.data.Dataset,
    max_sequence_length: int = 256,
    max_answers: int = 10,
):
    def _parse_sparse(serialized, pad_to):
        # Each field stores a serialized SparseTensor as three serialized
        # component tensors: (indices, values, dense_shape).
        components = tf.io.parse_tensor(serialized, out_type=tf.string)
        sparse = tf.sparse.SparseTensor(
            indices=tf.io.parse_tensor(components[0], out_type=tf.int64),
            values=tf.io.parse_tensor(components[1], out_type=tf.int32),
            dense_shape=tf.io.parse_tensor(components[2], out_type=tf.int64),
        )
        dense = tf.sparse.to_dense(sparse)
        # Truncate to the target width, then right-pad to a fixed shape.
        dense = dense[:, :pad_to]
        return tf.pad(dense, [[0, 0], [0, pad_to - tf.shape(dense)[1]]])

    def _process(element):
        return {
            "input_ids": _parse_sparse(
                element['positive_passages/sequence_ids'][0],
                max_sequence_length),
            "start_positions": _parse_sparse(
                element['positive_passages/start_positions'][0], max_answers),
            "end_positions": _parse_sparse(
                element['positive_passages/end_positions'][0], max_answers),
        }

    dataset = dataset.map(
        _process, num_parallel_calls=tf.data.AUTOTUNE, deterministic=True)
    # Each parsed element holds one row per positive passage; flatten so the
    # dataset yields one training example per passage.
    dataset = dataset.flat_map(tf.data.Dataset.from_tensor_slices)
    return dataset
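# For reference, a serialized field that _parse_sparse() can consume may be
# produced like this (a hedged round-trip sketch; the writer side is not in
# the source):
sp = tf.sparse.from_dense(tf.constant([[1, 2, 0], [3, 0, 0]], dtype=tf.int32))
components = tf.stack([
    tf.io.serialize_tensor(sp.indices),      # int64 indices
    tf.io.serialize_tensor(sp.values),       # int32 values
    tf.io.serialize_tensor(sp.dense_shape),  # int64 dense shape
])
serialized = tf.io.serialize_tensor(components)  # triple, serialized again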