Example #1
    def _check_state(self, state):
        state_attributes = [
            'manager', 'manager.epoch', 'manager.batch_step',
            'manager.global_iter', 'manager.logs', 'manager.metric_windows',
            'callbacks', 'trainer'
        ]

        for attr in state_attributes:
            v = get_value_at(attr, state, warn_on_failure=False)

            if v is None:
                if attr == "manager.metric_windows":
                    v = get_value_at('manager.training_loss_window',
                                     state,
                                     warn_on_failure=False)
                    if v is not None:
                        self._log.debug(
                            "Legacy state detected (training_loss_window) ... "
                        )
                        continue

                self._log.error(
                    f"Given state does not have a value for {attr}, state is invalid"
                )
                return False

        return True
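
Every example below leans on the nested key-path lookup `get_value_at`. As a rough sketch of the assumed semantics (a hypothetical stand-in, not MLPug's actual implementation): a dotted path such as 'manager.metric_windows' is resolved key by key against nested dicts (falling back to attributes), returning None and optionally warning when a key is missing.

import logging

_log = logging.getLogger(__name__)


def get_value_at(key_path, obj, warn_on_failure=True):
    # Walk the dotted key path through nested dicts (or object attributes)
    value = obj
    for key in key_path.split('.'):
        if isinstance(value, dict) and key in value:
            value = value[key]
        elif hasattr(value, key):
            value = getattr(value, key)
        else:
            if warn_on_failure:
                _log.warning(f"No value found at key path '{key_path}'")
            return None  # missing key; the caller decides how to handle it
    return value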
Example #2
    def _calc_gradients(self, batch_data, training_settings=None):
        """

        :param batch_data:
        :param training_settings:
        :return:

        :raises LossNotAvailableException
        """

        if not self.batch_chunk_size:
            with tf.GradientTape() as tape:
                results = self.evaluate_loss(
                    batch_data,
                    inference_mode=False,
                    evaluate_settings=training_settings)

            if 'loss' not in results:
                raise LossNotAvailableException()

            if self.trainable_variables is None:
                # We now have evaluated the model and the trainable variables should be available
                self._retrieve_trainable_variables()

            loss = results['loss']
            auxiliary_results = get_value_at('auxiliary_results',
                                             results,
                                             warn_on_failure=False)

            gradients = self._back_propagate_from(loss, tape)
        else:
            loss, auxiliary_results, gradients = self._calc_gradients_chunked(
                batch_data, training_settings)

        return loss, auxiliary_results, gradients
Example #3
    def _evaluate_loss(self,
                       batch_data,
                       evaluate_settings=None,
                       inference_mode=None):

        use_teacher_forcing = get_value_at('use_teacher_forcing',
                                           evaluate_settings)
        if use_teacher_forcing is None:
            use_teacher_forcing = True

        padded_input_batch, input_lengths, output_batch, output_mask, max_output_len = batch_data

        padded_input_batch = padded_input_batch.to(device)
        input_lengths = input_lengths.to(device)
        output_batch = output_batch.to(device)
        output_mask = output_mask.to(device)

        batch_size = padded_input_batch.size(1)

        init_decoder_input = torch.tensor(
            [[SOS_token for _ in range(batch_size)]],
            dtype=torch.long,
            device=device)

        results = self.training_model(padded_input_batch, input_lengths,
                                      init_decoder_input, max_output_len,
                                      output_batch, use_teacher_forcing)

        per_sample_loss = results["loss"]

        # loss, loss_sum, num_samples
        return masked_loss(per_sample_loss, output_mask)
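
The comment above indicates that `masked_loss` returns (loss, loss_sum, num_samples). A minimal sketch under the assumption that it averages the per-position loss over the entries selected by `output_mask` (a hypothetical stand-in, not the original helper):

def masked_loss(per_sample_loss, mask):
    # Reduce the loss over the unmasked (valid) positions only
    mask = mask.float()
    num_samples = mask.sum()
    loss_sum = (per_sample_loss * mask).sum()
    loss = loss_sum / num_samples
    return loss, loss_sum, num_samples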
Example #4
    def _get_model_quality(self, current_logs):
        model_quality = get_value_at(self._metric_to_monitor,
                                     current_logs,
                                     warn_on_failure=self._warn_on_model_quality_not_available)

        if type(model_quality) is tuple:
            # use the first value as metric value, the other values are auxiliary results meant for other purposes
            model_quality = model_quality[0]

        return model_quality
Example #5
    def _get_current_evaluate_settings(self, logs):
        # Use custom settings if available, else use default settings

        evaluate_settings = get_value_at('evaluate_settings',
                                         logs,
                                         warn_on_failure=False)
        if evaluate_settings is None:
            evaluate_settings = self._evaluate_settings

        return evaluate_settings
Example #6
    def _get_model_quality(self, current_logs):
        if not self._metric_to_monitor:
            return None

        model_quality = get_value_at(self._metric_to_monitor, current_logs)

        if type(model_quality) is tuple:
            # use the first value as metric value, the other values are auxiliary results meant for other purposes
            model_quality = model_quality[0]

        return model_quality
Example #7
    def _back_propagate_from(self, loss, tape, last_chunk=False):
        gradients = {}
        for optimizer_name in self.optimizers.keys():
            trainable_variables = get_value_at(optimizer_name,
                                               self.trainable_variables,
                                               warn_on_failure=False)

            gradients[optimizer_name] = tape.gradient(loss,
                                                      trainable_variables)

        return gradients
Example #8
    def _update_model_parameters(self, gradients):
        for optimizer_name, optimizer in self.get_optimizers().items():
            trainable_variables = get_value_at(optimizer_name,
                                               self.trainable_variables)
            if trainable_variables is None:
                raise MLPugException(
                    "Unexpected state :  trainable variables not found. Please file an issue."
                )

            optimizer.apply_gradients(
                zip(gradients[optimizer_name], trainable_variables))
Example #9
    def _calc_whole_dataset_metrics(self, logs, log_path):

        current = self._get_logs_base(logs)
        metrics_log = get_value_at(log_path, current)

        evaluate_settings = self._get_current_evaluate_settings(logs)

        return self._metric_evaluator.calc_dataset_metrics_for(
            self._dataset,
            metrics_log,
            evaluate_settings=evaluate_settings,
            dataset_name=self._dataset_name)
Example #10
    def _check_state(self, state):
        state_attributes = ['model_components', 'optimizers']

        for attr in state_attributes:
            v = get_value_at(attr, state, warn_on_failure=False)
            if v is None:
                self._log.error(
                    f"Given state does not have a value for {attr}, state is invalid"
                )
                return False

        return True
Example #11
    def on_batch_training_completed(self, dataset_batch, logs):
        if not self.instance_valid():
            self._log.error(f"{self} is not valid, skipping this hook ... ")
            return False

        if not self._batch_level:
            return True

        if not self._log_condition_func(logs=logs,
                                        dataset_batch=dataset_batch):
            return True

        self._init_logs(logs)

        if self._logging_mode is MetricsLoggingMode.WHOLE_DATASET_METRICS:
            return self._calc_whole_dataset_metrics(
                logs, f"{self._dataset_name}.dataset")
        else:
            current = self._get_logs_base(logs)

            batch_metrics = {}
            if not self._calc_batch_metric_data_from(dataset_batch,
                                                     batch_metrics, logs):
                return False

            base_path = f"{self._dataset_name}.batch"
            dataset_batch_logs = get_value_at(base_path, current)

            # Merge in new batch level results
            dataset_batch_logs = {**dataset_batch_logs, **batch_metrics}
            if self._logging_mode is MetricsLoggingMode.BATCH_AND_WINDOW_AVERAGE_METRICS:
                set_value_at(base_path, current, dataset_batch_logs)

            metric_names = self._metric_evaluator.get_metric_names()
            metric_paths = get_key_paths(
                dataset_batch_logs,
                keys_to_consider=metric_names,
                keys_not_to_consider=["auxiliary_results"])

            self._update_metrics_windows_for(metric_paths,
                                             dataset_batch_logs,
                                             base_path=base_path)

            # gather all window data
            batch_metrics_lists = {
                p: s.window
                for p, s in self._metric_windows.items()
            }
            if not self._reduce(batch_metrics_lists,
                                current[self._dataset_name]['window_average']):
                return False

            return True
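
Besides `get_value_at`, this hook uses `set_value_at` and `get_key_paths`. A hedged sketch of the assumed behaviour (hypothetical, not MLPug's implementation): writing a value at a dotted key path, and listing the dotted paths of selected leaf metrics while skipping excluded subtrees such as 'auxiliary_results'.

def set_value_at(key_path, obj, value):
    # Create intermediate dicts as needed, then set the leaf value
    keys = key_path.split('.')
    for key in keys[:-1]:
        obj = obj.setdefault(key, {})
    obj[keys[-1]] = value


def get_key_paths(data, keys_to_consider=None, keys_not_to_consider=None, _prefix=""):
    # Collect dotted paths to leaf values, filtered by leaf key name
    paths = []
    for key, value in data.items():
        if keys_not_to_consider and key in keys_not_to_consider:
            continue
        path = f"{_prefix}{key}"
        if isinstance(value, dict):
            paths += get_key_paths(value,
                                   keys_to_consider=keys_to_consider,
                                   keys_not_to_consider=keys_not_to_consider,
                                   _prefix=f"{path}.")
        elif keys_to_consider is None or key in keys_to_consider:
            paths.append(path)
    return paths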
Example #12
    def _calc_batch_metric_data_from(self, batch, batch_metrics, logs):
        evaluate_settings = self._get_current_evaluate_settings(logs)

        model_output = None
        if self._dataset is None:
            current = self._get_logs_base(logs)
            loss = get_value_at(f"{self._dataset_name}.batch.loss", current)
            auxiliary_results = get_value_at(
                f"{self._dataset_name}.batch.auxiliary_results",
                current,
                warn_on_failure=False)

            model_output = {
                'loss': loss,
                'auxiliary_results': auxiliary_results
            }

        return self._metric_evaluator.calc_batch_metrics_for(
            batch,
            batch_metrics,
            evaluate_settings=evaluate_settings,
            model_output=model_output)
Example #13
    def _update_window(self, metric_path):
        try:
            window = self._metric_windows[metric_path]
            if window is None:
                return

            value = get_value_at(metric_path, self.logs["current"])

            window.slide(value)
        except Exception as e:
            _.log_exception(
                self._log,
                f"Exception occurred updating sliding window {metric_path}, skipped...",
                e)
Example #14
    def _retrieve_trainable_variables(self):
        if len(self.optimizers) > 1:
            return

        # This only needs to be done once
        # Further, this situation only occurs when there is only one optimizer

        optimizer_name = next(iter(self.optimizers))
        trainable_variables = get_value_at(optimizer_name,
                                           self.trainable_variables,
                                           warn_on_failure=False)
        if trainable_variables is None:
            trainable_variables = self.training_model.trainable_variables

            self.trainable_variables = {optimizer_name: trainable_variables}
Example #15
    def _update_metrics_windows_for(self, metric_paths, batch_metrics,
                                    base_path):
        for metric_path in metric_paths:
            metric_value = get_value_at(metric_path, batch_metrics)

            full_metric_path = f"{base_path}.{metric_path}"
            sliding_window = self._metric_windows[
                metric_path] if metric_path in self._metric_windows else None
            if sliding_window is None:
                self._log.debug(
                    f"Creating sliding window for {full_metric_path}")

                sliding_window = self._sliding_window_factory(
                    length=self._batch_averaging_window, name=full_metric_path)
                self._metric_windows[metric_path] = sliding_window

            sliding_window.slide(metric_value)
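
The window objects produced by `self._sliding_window_factory` are read elsewhere (Example #11 accesses `s.window`, Example #13 calls `window.slide(value)`). A minimal sketch, assuming a fixed-length FIFO of the most recent metric values (a hypothetical stand-in for the actual window class):

class SlidingWindow:

    def __init__(self, length, name=None):
        self.length = length
        self.name = name
        self.window = []  # most recent values, oldest first

    def slide(self, value):
        self.window.append(value)
        if len(self.window) > self.length:
            self.window.pop(0)  # drop the oldest value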
Example #16
    def _set_metric_windows_states(self, state):
        success = True
        try:
            metric_windows_state = get_value_at("manager.metric_windows",
                                                state)
            if metric_windows_state is None:
                return success

            for metric_path, window_state in metric_windows_state.items():
                success &= self._set_metric_window_state(
                    metric_path, window_state)

        except Exception as e:
            _.log_exception(
                self._log, f"Unable to set metric windows state, skipped ...",
                e)
            success = False

        return success
Example #17
    def _update_logs(self, logs):
        current = self._get_logs_base(logs)

        schedule_level = self._get_schedule_level()
        ctp = current['training_params'][schedule_level]

        try:
            current_lr = self._get_current_lr()

            lr = get_value_at('lr', ctp, warn_on_failure=False) or {}

            ctp['lr'] = {**lr, **current_lr}

            return True
        except Exception as e:
            _.log_exception(
                self._log, "An unexpected error occurred, "
                "unable to add current learning rate values to the logs object",
                e)
            return False
Example #18
    def on_batch_training_completed(self, training_batch, logs):
        if not self.batch_level:
            return True

        success = True
        current = self._get_logs_base(logs)
        batch_step = current["batch_step"]

        has_dataset_level_metrics = False
        for set_name in self.set_names:
            dataset_metrics = get_value_at(f"{set_name}.dataset",
                                           current,
                                           warn_on_failure=False)
            has_dataset_level_metrics |= type(dataset_metrics) is dict and len(
                dataset_metrics) > 0

            if has_dataset_level_metrics:
                break

        if batch_step == 0 or batch_step % self.log_period == 0 or has_dataset_level_metrics:
            eta = self._calc_eta(logs)
            average_duration = self._get_average_batch_duration(logs)

            self._write('\nEpoch {:d}/{:d} - ETA: {:s}\tBatch {:d}/{:d} '
                        'Average batch training time {:s}\n'.format(
                            current["epoch"], logs["final_epoch"], eta,
                            current["batch_step"], logs["final_batch_step"],
                            average_duration))

            for metric_level in [
                    'batch', 'window_average', 'dataset', 'epoch'
            ]:
                self._write_metric_logs(metric_level, logs)
                self._write(f'\n')

            self._write(f'\n')

        return success
Example #19
    def _calc_gradients_chunked(self, batch_data, training_settings=None):
        """
        See `train_on` method.

        This method splits `batch_data` into chunks of size `self.batch_chunk_size`. For each chunk the loss
        is calculated and the gradients are accumulated through back propagation.

        :return: loss, auxiliary_results, accumulated_grads
                    loss: weighted average of chunk losses
                    auxiliary_results: list of dicts:
                                        [
                                            ...
                                            {
                                                "results": chunk aux. results,
                                                "num_samples": num samples in chunk
                                            }
                                            ...
                                        ]
                    accumulated_grads: weighted average of chunk gradients
        """

        if not is_chunkable(batch_data):
            raise BatchNotChunkableException()

        auxiliary_results = BatchChunkingResults()

        loss = 0
        # Will be set when we have the trainable variables
        accumulated_grads = None

        batch_size = len(batch_data)
        num_chunks = math.ceil(batch_size / self.batch_chunk_size)
        for chunk_idx in range(num_chunks):
            chunk_start = chunk_idx * self.batch_chunk_size
            chunk_end = min((chunk_idx + 1) * self.batch_chunk_size,
                            batch_size)

            chunk_len = chunk_end - chunk_start

            chunk = batch_data[chunk_start:chunk_end]

            with tf.GradientTape() as tape:
                results = self.evaluate_loss(
                    chunk,
                    inference_mode=False,
                    evaluate_settings=training_settings)

            if 'loss' not in results:
                raise LossNotAvailableException()

            if self.trainable_variables is None:
                # We now have evaluated the model and the trainable variables should be available
                self._retrieve_trainable_variables()

            if accumulated_grads is None:
                if self.trainable_variables is None:
                    raise MLPugException(
                        "Unexpected state :  trainable variables not found. Please file an issue."
                    )

                accumulated_grads = {}
                for optimizer_name, tvs in self.trainable_variables.items():
                    accumulated_grads[optimizer_name] = [
                        tf.zeros_like(tv) for tv in tvs
                    ]

            chunk_loss = results['loss']
            aux_results = get_value_at('auxiliary_results',
                                       results,
                                       warn_on_failure=False)

            # chunk_loss is assumed to be the average over the per-sample losses of the chunk.
            # Weight it by chunk_len / batch_size so that summing over all chunks yields the
            # average loss over the whole batch.
            last_chunk = chunk_idx == (num_chunks - 1)
            chunk_loss = chunk_len * chunk_loss / batch_size
            chunk_gradients = self._back_propagate_from(chunk_loss,
                                                        tape,
                                                        last_chunk=last_chunk)

            loss += chunk_loss

            for optimizer_name, chunk_grads in chunk_gradients.items():
                accu_grads = accumulated_grads[optimizer_name]
                accumulated_grads[optimizer_name] = [
                    (accu_grad + chunk_grad)
                    for accu_grad, chunk_grad in zip(accu_grads, chunk_grads)
                ]

            auxiliary_results += [{
                "results": aux_results,
                "num_samples": chunk_len
            }]

        return loss, auxiliary_results, accumulated_grads
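
A quick numeric check of the weighting used above: scaling each chunk's mean loss by chunk_len / batch_size and summing over chunks reproduces the mean loss over the whole batch, which is why accumulating the per-chunk gradients approximates a single full-batch backward pass (illustrative values only):

losses = [0.5, 0.2, 0.9, 0.4, 0.1]                # made-up per-sample losses
chunks = [losses[0:2], losses[2:4], losses[4:5]]  # batch_chunk_size = 2
batch_size = len(losses)

weighted_sum = sum(
    (len(chunk) / batch_size) * (sum(chunk) / len(chunk))  # chunk_len / batch_size * chunk mean
    for chunk in chunks
)
assert abs(weighted_sum - sum(losses) / batch_size) < 1e-12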
Example #20
    def create_optimizer_weights():
        for optimizer_name, optimizer in self.optimizers.items():
            trainable_variables = get_value_at(optimizer_name,
                                               self.trainable_variables,
                                               warn_on_failure=False)
            optimizer._create_all_weights(trainable_variables)
Example #21
    def get_model_component(self, name):
        return get_value_at(name, self.get_model_components())
Example #22
    def get_optimizer(self, name):
        return get_value_at(name, self.get_optimizers())
Example #23
    def _create_set_metrics_log_for(self, set_name, metric_level, logs):
        current = self._get_logs_base(logs)

        key_path = f"{set_name}.{metric_level}"
        metrics = get_value_at(key_path, current, warn_on_failure=False)
        return self._create_log_for(metrics)