Example #1
def preprocessing(dsData: tf.data.Dataset, window_size, batch_size):
    # Slide windows of window_size + 1 consecutive elements over the series (stride 1).
    dsData = dsData.window(window_size + 1, shift=1, drop_remainder=True)
    # Each window is a nested dataset; collapse it into a single tensor per window.
    dsData = dsData.flat_map(lambda w: w.batch(window_size + 1))
    # Split every window into (first window_size values, last value) pairs.
    dsData = dsData.map(lambda x: (x[:-1], x[-1]))
    dsData = dsData.shuffle(1000)
    dsData = dsData.batch(batch_size).prefetch(1)
    return dsData
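
A minimal usage sketch, assuming a 1-D float series as input; the `series` array and the window/batch sizes below are hypothetical:

import numpy as np
import tensorflow as tf

series = np.arange(100, dtype=np.float32)        # hypothetical toy series
ds = tf.data.Dataset.from_tensor_slices(series)  # one scalar time step per element
ds = preprocessing(ds, window_size=20, batch_size=32)
for x, y in ds.take(1):
    print(x.shape, y.shape)  # (32, 20) feature windows, (32,) targets
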
Example #2
def processing(dataset: tf.data.Dataset, window_size, batch_size):
    # Map raw tokens to integer ids via a lookup table defined elsewhere.
    dataset = dataset.map(lambda x: table.lookup(x))
    dataset = dataset.unbatch()
    # Sliding windows of window_size + 1 ids (stride 1), flattened to tensors.
    dataset = dataset.window(window_size + 1, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda ds: ds.batch(window_size + 1))
    # Features are the first window_size ids; the label is the last id shifted down by 1.
    dataset = dataset.map(lambda x: (x[:-1], x[-1] - 1))
    dataset = dataset.shuffle(10000)
    dataset = dataset.batch(batch_size).prefetch(1)
    return dataset
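
`table` is referenced but not defined in this snippet; a minimal sketch of what it might be, assuming a vocabulary table that maps tokens to 1-based integer ids (which would explain the `x[-1] - 1` shift). The vocabulary and parameter values here are hypothetical:

import tensorflow as tf

# Hypothetical vocabulary; `table` is assumed to live at module scope because
# processing() references it directly. Ids start at 1 so the label shift above
# yields 0-based indices.
vocab = ["the", "cat", "sat", "on", "mat"]
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(vocab),
        values=tf.constant(list(range(1, len(vocab) + 1)), dtype=tf.int64)),
    default_value=0)

sentences = tf.data.Dataset.from_tensor_slices([["the", "cat", "sat", "on", "the", "mat"]])
ds = processing(sentences, window_size=4, batch_size=2)
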
Example #3
def train(self, dataset: tf.data.Dataset, nr_records: int):
    # Expand each record into training examples, then batch.
    dataset = dataset.flat_map(self.transform_train_data).batch(
        self.batch_size)
    dataset = dataset.shuffle(2048)
    nr_steps = nr_records // self.batch_size
    for i in range(self.epochs):
        step = 0
        for data in dataset:
            # grad() returns the loss and the gradients w.r.t. the U and P factors.
            loss_value, grads = grad(self.model, data)
            self.optimizer.apply_gradients(
                zip(grads, [self.model.U, self.model.P]))
            # printProgressBar(step, nr_steps, 'Epoch {}, loss:  {:.3f}'.format(i, loss_value), length=80)
            if step % 10 == 0:
                print("\rEpoch #{} Loss at step {}: {:.4f}".format(
                    i, step, loss_value),
                      end='\r')
            step += 1
        print()
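
The `grad` helper is not shown in this snippet; a plausible sketch, assuming a separate `loss_fn(model, data)` computes the training loss over the model's `U` and `P` factors:

def grad(model, data):
    # Record the forward pass and return the loss together with the gradients
    # w.r.t. the two factor matrices updated above.
    with tf.GradientTape() as tape:
        loss_value = loss_fn(model, data)  # loss_fn is an assumption, not shown here
    return loss_value, tape.gradient(loss_value, [model.U, model.P])
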
Example #4
def train(self, dataset: tf.data.Dataset, nr_records: int):
    dataset = dataset.flat_map(self.transform_train_data).batch(
        self.batch_size)
    dataset = dataset.shuffle(4096)
    nr_steps = nr_records // self.batch_size
    for i in range(self.epochs):
        step = 0
        # Running mean of the loss over the current epoch.
        metric_mean = tf.keras.metrics.Mean()
        for data in dataset:
            loss_value, grads = grad(self.model, data)
            self.optimizer.apply_gradients(
                zip(grads, [self.model.U, self.model.P]))
            metric_mean.update_state(loss_value)
            if step % 10 == 0:
                print("\rEpoch #{} Loss at step {}: {:.4f}".format(
                    i, step, metric_mean.result()),
                      end='\r')
            step += 1
        print()
Example #5
def transform_to_reader_validate_dataset(
    dataset: tf.data.Dataset,
    max_sequence_length: int = 256
):
    def _flat_map(element):
        answers = element['answers']
        question = element['question']
        sequence_ids = element['passages/sequence_ids']
        passage_offset = element['passages/passage_offset']
        has_answer = element['has_answer']

        # Keep only the passages flagged as containing an answer.
        sequence_ids = tf.sparse.to_dense(sequence_ids)
        filtered_idxs = tf.where(has_answer)
        filtered_sequence_ids = tf.gather_nd(sequence_ids, filtered_idxs)
        # Truncate and right-pad every kept passage to max_sequence_length tokens.
        filtered_sequence_ids = filtered_sequence_ids[:, :max_sequence_length]
        filtered_sequence_ids = tf.pad(filtered_sequence_ids, [[0, 0], [0, max_sequence_length - tf.shape(filtered_sequence_ids)[1]]])
        filtered_sequence_ids = tf.reshape(filtered_sequence_ids, [-1, max_sequence_length])
        filtered_passage_offset = tf.gather_nd(passage_offset, filtered_idxs)
        filtered_passage_offset = tf.reshape(filtered_passage_offset, [-1])

        # Repeat the answers and the question once per kept passage.
        answers = tf.expand_dims(answers, axis=0)
        answers = tf.tile(answers, multiples=[tf.shape(filtered_passage_offset)[0]])
        question = tf.tile(question, multiples=[tf.shape(filtered_passage_offset)[0]])

        tensors = {
            "answers": answers,
            "question": question,
            "input_ids": filtered_sequence_ids,
            "passage_offset": filtered_passage_offset,
        }

        # One output element per kept passage.
        return tf.data.Dataset.from_tensor_slices(tensors)

    dataset = dataset.flat_map(_flat_map)

    return dataset
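
A brief usage sketch, assuming `raw_ds` already yields dictionaries with the keys read above (`answers`, `question`, the sparse `passages/sequence_ids`, `passages/passage_offset`, `has_answer`):

reader_ds = transform_to_reader_validate_dataset(raw_ds, max_sequence_length=256)
for example in reader_ds.take(1):
    print(example["input_ids"].shape)   # (256,) token ids for one kept passage
    print(example["passage_offset"])    # offset of that passage
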
Example #6
    def train(
        self,
        dataset: tf.data.Dataset,
        class_weights: Optional[Dict[int, float]],
        n_features: int,
        context_size: int,
        word_embeddings: int,
        n_epoch: int = 100,
        from_latest: bool = False,
        name: str = "",
        **kwargs,
    ):
        # dataset: sequence of (None, n_features), (None, n_labels)
        if class_weights is not None:
            class_weights_tensor = tf.convert_to_tensor(
                list(class_weights.values()), dtype="float32")

        def dsmap(x, y):
            nonlocal word_embeddings, context_size

            if word_embeddings > 0:
                a = seq2seqofcontexts(x[0], context_size)
                b = seq2seqofcontexts(x[1], context_size)
                inputs = tf.data.Dataset.zip((a, b))
            else:
                inputs = seq2seqofcontexts(x, context_size)

            if class_weights is not None:
                y = y * class_weights_tensor
            y = tf.data.Dataset.from_tensor_slices(y)

            return tf.data.Dataset.zip((inputs, y))

        dataset = dataset.flat_map(dsmap).shuffle(4096).batch(2048).prefetch(3)

        if from_latest:
            print(
                "Reloading from checkpoint. Checking that parameters haven't changed."
            )
            assert word_embeddings == self.params.word_embeddings
            assert context_size == self.params.context_size
            assert n_features == self.params.n_features

            self._model = load_model(self.path + "-chk")
        else:
            self._model = net_1d(n_features, context_size,
                                 1 + len(self.labels), word_embeddings)
            self.model.summary()

        self.model.compile(
            optimizer=SGD(learning_rate=0.01, momentum=0.9, nesterov=True),
            loss="categorical_crossentropy",
        )

        with open(self.params_path, "wb") as f:
            self._params = CNNParams(context_size, word_embeddings,
                                     class_weights is not None, n_features)
            pickle.dump(self._params, f)

        log_dir = (f"{self.path}/logs/{name}/" +
                   datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                              histogram_freq=1)

        self.model.fit(
            dataset,
            epochs=n_epoch,
            verbose=1,
            callbacks=[
                ModelCheckpoint(self.path + "-chk"), tensorboard_callback
            ],
        )
        self.model.save(self.path)
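
`seq2seqofcontexts` is not part of this snippet; one plausible sketch, assuming it turns a `(sequence_length, n_features)` tensor into a `tf.data.Dataset` with one zero-padded context window per position:

def seq2seqofcontexts(x, context_size):
    # Zero-pad context_size positions on each side, then emit one window of
    # shape (2 * context_size + 1, n_features) per original sequence position.
    padded = tf.pad(x, [[context_size, context_size], [0, 0]])
    windows = tf.signal.frame(padded, frame_length=2 * context_size + 1,
                              frame_step=1, axis=0)
    return tf.data.Dataset.from_tensor_slices(windows)
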
Example #7
def labelled_video_dataset_to_image_dataset(video_dataset: tf.data.Dataset,
                                            take_n: int):
    dataset = video_dataset.flat_map(
        labelled_video_element_to_frame_dataset_mapper('video', take_n=take_n))
    return dataset
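
`labelled_video_element_to_frame_dataset_mapper` is defined elsewhere; a hedged sketch, assuming each element is a dict holding a `(num_frames, H, W, C)` video under the given key and a scalar label under a hypothetical `'label'` key:

def labelled_video_element_to_frame_dataset_mapper(video_key, take_n):
    def _mapper(element):
        # Slice the video into individual frames and pair each frame with the
        # video-level label, keeping at most take_n frames per video.
        frames = tf.data.Dataset.from_tensor_slices(element[video_key])
        labels = tf.data.Dataset.from_tensor_slices(
            tf.repeat(element['label'], tf.shape(element[video_key])[0]))
        return tf.data.Dataset.zip((frames, labels)).take(take_n)
    return _mapper
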
Example #8
def transform_to_reader_train_dataset(
    dataset: tf.data.Dataset,
    max_sequence_length: int = 256,
    max_answers: int = 10
):
    def _process(element):
        # parse positive sequence ids
        positive_sequence_ids_serialized = element['positive_passages/sequence_ids'][0]
        positive_sequence_ids_sparse = tf.io.parse_tensor(positive_sequence_ids_serialized, out_type=tf.string)
        positive_sequence_ids_indices = tf.io.parse_tensor(positive_sequence_ids_sparse[0], out_type=tf.int64)
        positive_sequence_ids_values = tf.io.parse_tensor(positive_sequence_ids_sparse[1], out_type=tf.int32)
        positive_sequence_ids_dense_shape = tf.io.parse_tensor(positive_sequence_ids_sparse[2], out_type=tf.int64)
        positive_sequence_ids = tf.sparse.SparseTensor(
            indices=positive_sequence_ids_indices,
            values=positive_sequence_ids_values,
            dense_shape=positive_sequence_ids_dense_shape
        )
        positive_sequence_ids = tf.sparse.to_dense(positive_sequence_ids)

        # pad positive sequence ids
        positive_sequence_ids = positive_sequence_ids[:, :max_sequence_length] # truncate
        positive_sequence_ids = tf.pad(positive_sequence_ids, [[0, 0], [0, max_sequence_length - tf.shape(positive_sequence_ids)[1]]]) # padding

        # parse start positions
        positive_start_positions_serialized = element['positive_passages/start_positions'][0]
        positive_start_positions_sparse = tf.io.parse_tensor(positive_start_positions_serialized, out_type=tf.string)
        positive_start_positions_indices = tf.io.parse_tensor(positive_start_positions_sparse[0], out_type=tf.int64)
        positive_start_positions_values = tf.io.parse_tensor(positive_start_positions_sparse[1], out_type=tf.int32)
        positive_start_positions_dense_shape = tf.io.parse_tensor(positive_start_positions_sparse[2], out_type=tf.int64)
        positive_start_positions = tf.sparse.SparseTensor(
            indices=positive_start_positions_indices,
            values=positive_start_positions_values,
            dense_shape=positive_start_positions_dense_shape
        )
        positive_start_positions = tf.sparse.to_dense(positive_start_positions)

        # pad start positions
        positive_start_positions = positive_start_positions[:, :max_answers] # truncate
        positive_start_positions = tf.pad(positive_start_positions, [[0, 0], [0, max_answers - tf.shape(positive_start_positions)[1]]]) # padding

        # parse positive end positions
        positive_end_positions_serialized = element['positive_passages/end_positions'][0]
        positive_end_positions_sparse = tf.io.parse_tensor(positive_end_positions_serialized, out_type=tf.string)
        positive_end_positions_indices = tf.io.parse_tensor(positive_end_positions_sparse[0], out_type=tf.int64)
        positive_end_positions_values = tf.io.parse_tensor(positive_end_positions_sparse[1], out_type=tf.int32)
        positive_end_positions_dense_shape = tf.io.parse_tensor(positive_end_positions_sparse[2], out_type=tf.int64)
        positive_end_positions = tf.sparse.SparseTensor(
            indices=positive_end_positions_indices,
            values=positive_end_positions_values,
            dense_shape=positive_end_positions_dense_shape
        )
        positive_end_positions = tf.sparse.to_dense(positive_end_positions)

        # pad end positions
        positive_end_positions = positive_end_positions[:, :max_answers] # truncate
        positive_end_positions = tf.pad(positive_end_positions, [[0, 0], [0, max_answers - tf.shape(positive_end_positions)[1]]])

        return {
            "input_ids": positive_sequence_ids,
            "start_positions": positive_start_positions,
            "end_positions": positive_end_positions
        }

    dataset = dataset.map(
        _process,
        num_parallel_calls=tf.data.AUTOTUNE,
        deterministic=True
    )

    dataset = dataset.flat_map(
        lambda x: tf.data.Dataset.from_tensor_slices(x),
    )

    return dataset
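
A brief usage sketch, assuming `raw_ds` is the upstream dataset of serialized positive-passage records that `_process` expects; the shuffle buffer and batch size here are hypothetical:

train_ds = transform_to_reader_train_dataset(raw_ds, max_sequence_length=256,
                                             max_answers=10)
train_ds = train_ds.shuffle(10_000).batch(16).prefetch(tf.data.AUTOTUNE)
for batch in train_ds.take(1):
    print(batch["input_ids"].shape)        # (16, 256)
    print(batch["start_positions"].shape)  # (16, 10)
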