예제 #1
0
    def on_train_begin(self, logs=None):  # pylint: disable=unused-argument
        """Log model hyper-parameters and the model summary at train start.

        Logs the layer count and optimizer name as params, plus the
        learning rate and epsilon when the optimizer exposes them, then
        uploads the Keras model summary as a ``model_summary.txt``
        artifact.

        Args:
            logs (None, optional): Unused; kept for the Keras callback API.
        """
        try_mlflow_log(log_param, 'num_layers', len(self.model.layers))
        try_mlflow_log(log_param, 'optimizer_name',
                       type(self.model.optimizer).__name__)
        if hasattr(self.model.optimizer, 'lr'):
            lr = self.model.optimizer.lr
            # A non-float lr is a backend tensor/variable; evaluate it to
            # a plain number before logging.
            if not isinstance(lr, float):
                lr = keras.backend.eval(lr)
            try_mlflow_log(log_param, 'learning_rate', lr)
        if hasattr(self.model.optimizer, 'epsilon'):
            epsilon = self.model.optimizer.epsilon
            if not isinstance(epsilon, float):
                epsilon = keras.backend.eval(epsilon)
            try_mlflow_log(log_param, 'epsilon', epsilon)

        sum_list = []
        self.model.summary(print_fn=sum_list.append)
        summary = '\n'.join(sum_list)

        # TemporaryDirectory guarantees cleanup even if logging raises.
        with tempfile.TemporaryDirectory() as tempdir:
            summary_file = os.path.join(tempdir, 'model_summary.txt')
            with open(summary_file, 'w') as f:
                f.write(summary)
            try_mlflow_log(log_artifact,
                           key='model_summary.txt',
                           path=summary_file)
예제 #2
0
    def on_epoch_end(self, epoch, logs=None):
        """Log the epoch's metrics at the end of each training epoch.

        Args:
            epoch (TYPE): Index of the epoch that just ended; cached on
                ``self.current_epoch``.
            logs (None, optional): Metrics reported by Keras for the
                epoch. Nothing is logged when empty.

        Returns:
            TYPE: None.
        """
        self.current_epoch = epoch
        if not logs:
            return
        # Defensive copy so downstream logging cannot mutate Keras' dict.
        logs_copy = copy.deepcopy(logs)
        try_mlflow_log(log_metrics,
                       logs_copy,
                       step=self.num_step,
                       epoch=self.current_epoch)
예제 #3
0
    def _save_model(self, epoch, logs):
        """Save the checkpoint via Keras, then upload it as an artifact.

        Directory checkpoints are zipped into a temp archive first;
        single-file checkpoints are uploaded as-is.
        """
        super(ModelCheckpointAndUpload, self)._save_model(epoch, logs)
        checkpoint_path = self._get_file_path(epoch, logs)
        if not os.path.exists(checkpoint_path):
            return

        archive_base = os.path.join(tempfile.gettempdir(),
                                    'checkpoint_segmind_track')
        # Drop any stale archive from a previous save.
        if os.path.isfile(archive_base):
            os.remove(archive_base)

        if os.path.isdir(checkpoint_path):
            # Directory checkpoint: archive it before uploading.
            shutil.make_archive(archive_base, 'zip', checkpoint_path)
            print(f'Uploading checkpoint {archive_base} ...')
            try_mlflow_log(log_artifact,
                           key=os.path.basename(checkpoint_path) + '.zip',
                           path=archive_base + '.zip')
        else:
            print(f'Uploading checkpoint {checkpoint_path} ...')
            try_mlflow_log(log_artifact,
                           key=os.path.basename(checkpoint_path),
                           path=checkpoint_path)
예제 #4
0
    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx,
                           dataloader_idx):
        """Log callback metrics plus system metrics every N train steps.

        Args:
            trainer (TYPE): Lightning trainer; metrics are read from its
                logger connector.
            pl_module (None, optional): Unused here.
            outputs (TYPE): Unused here.
            batch (TYPE): Unused here.
            batch_idx (TYPE): Unused here.
            dataloader_idx (TYPE): Unused here.

        Returns:
            TYPE: None.
        """
        self.num_step += 1
        metrics = trainer.logger_connector.callback_metrics

        should_log = (self.step_logging
                      and self.num_step % self.log_evry_n_step == 0)
        if not should_log:
            return

        # Merge GPU and CPU system metrics into a copy of the logs.
        payload = copy.deepcopy(metrics)
        payload.update(gpu_metrics())
        payload.update(system_metrics())
        try_mlflow_log(log_metrics, payload, step=self.num_step)
예제 #5
0
    def _save_model(self, filepath: str, trainer, pl_module):
        """Save the checkpoint via Lightning, then upload it as an artifact.

        Directory checkpoints are zipped into a temp archive first;
        single-file checkpoints are uploaded as-is.
        """
        super(PytorchModelCheckpointAndUpload, self)._save_model(filepath, trainer, pl_module)  # noqa: E501
        trainer.dev_debugger.track_checkpointing_history(filepath)
        if trainer.is_global_zero:
            self._fs.makedirs(os.path.dirname(filepath), exist_ok=True)
        if self.save_function is not None:
            self.save_function(filepath, self.save_weights_only)

        if not os.path.exists(filepath):
            return

        archive_base = os.path.join(tempfile.gettempdir(),
                                    'checkpoint_segmind_track')
        # Drop any stale archive from a previous save.
        if os.path.isfile(archive_base):
            os.remove(archive_base)

        if os.path.isdir(filepath):
            # Directory checkpoint: archive it before uploading.
            shutil.make_archive(archive_base, 'zip', filepath)
            print(f'Uploading checkpoint {archive_base} ...')
            try_mlflow_log(
                log_artifact,
                key=os.path.basename(filepath) + '.zip',
                path=archive_base + '.zip')
        else:
            print(f'Uploading checkpoint {filepath} ...')
            try_mlflow_log(
                log_artifact,
                key=os.path.basename(filepath),
                path=filepath)
예제 #6
0
    def on_epoch_end(self, trainer, pl_module):
        """Log callback metrics at the end of each Lightning epoch.

        Increments ``self.current_epoch`` (Lightning does not pass the
        epoch index to this hook) and logs the trainer's callback metrics.

        Args:
            trainer (TYPE): Lightning trainer; metrics are read from its
                logger connector.
            pl_module (None, optional): Unused; kept for the callback API.

        Returns:
            TYPE: None.
        """
        logs = trainer.logger_connector.callback_metrics
        self.current_epoch += 1

        try_mlflow_log(log_metrics,
                       logs,
                       step=self.num_step,
                       epoch=self.current_epoch)
예제 #7
0
    def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx,
                          dataloader_idx):
        """Log callback metrics every ``log_evry_n_step`` test batches.

        Args:
            trainer (TYPE): Lightning trainer; metrics are read from its
                logger connector.
            pl_module (None, optional): Unused here.
            outputs (TYPE): Unused here.
            batch (TYPE) : Unused here.
            batch_idx (TYPE) : Unused here.
            dataloader_idx (TYPE) : Unused here.
        Returns:
            TYPE: None.
        """
        self.num_test_step += 1
        logs = trainer.logger_connector.callback_metrics

        if self.step_logging and self.num_test_step % self.log_evry_n_step == 0:  # noqa: E501
            # NOTE(review): logging is gated on num_test_step but the
            # metric is recorded with step=self.num_step (the train-step
            # counter) — confirm whether step=self.num_test_step was
            # intended (compare the Keras on_test_end variant, which uses
            # num_test_step).
            try_mlflow_log(
                log_metrics,
                logs,
                step=self.num_step,
                epoch=self.current_epoch,
            )
예제 #8
0
    def on_test_end(self, logs=None):
        """Count the finished test epoch and log its metrics, if any.

        Args:
            logs (None, optional): Final test metrics; nothing is logged
                when empty.
        """
        self.num_test_epoch += 1
        if not logs:
            return
        try_mlflow_log(log_metrics,
                       logs,
                       step=self.num_test_step,
                       epoch=self.num_test_epoch)
예제 #9
0
    def on_test_end(self, trainer, pl_module):
        """Log the final test metrics once testing has finished.

        Args:
            trainer (TYPE): Lightning trainer; metrics are read from its
                logger connector.
            pl_module (None, optional): Unused; kept for the callback API.

        Returns:
            TYPE: None.
        """
        final_metrics = trainer.logger_connector.callback_metrics
        try_mlflow_log(log_metrics, final_metrics, step=self.num_step)
예제 #10
0
    def callback(env):
        """internal function.

        Logs evaluation results plus GPU/CPU system metrics every
        ``period`` boosting iterations (and at the first and last
        iteration). Only rank 0 logs.
        """
        if env.rank != 0 or (not env.evaluation_result_list) or period is False or period == 0:  # noqa: E501
            return
        step = env.iteration

        if step % period == 0 or step + 1 == env.begin_iteration or step + 1 == env.end_iteration:  # noqa: E501
            # Collect (potentially slow) system metrics only on iterations
            # that will actually be logged, instead of on every call.
            results = {}
            results.update(gpu_metrics())
            results.update(system_metrics())
            for item in env.evaluation_result_list:
                # item is (metric_name, value, ...) per boosting-library
                # convention — only name and value are logged.
                results[item[0]] = item[1]
            try_mlflow_log(log_metrics, results, step=step)
예제 #11
0
    def on_test_batch_end(self, batch, logs=None):
        """Log test metrics plus system metrics every N test batches.

        Args:
            batch (TYPE): Unused here.
            logs (None, optional): Metrics reported by Keras for the
                batch; nothing is logged when empty.

        Returns:
            TYPE: None.
        """
        self.num_test_step += 1
        if not logs:
            return
        should_log = (self.step_logging
                      and self.num_test_step % self.log_evry_n_step == 0)
        if should_log:
            # Merge GPU and CPU system metrics into a copy of the logs.
            payload = copy.deepcopy(logs)
            payload.update(gpu_metrics())
            payload.update(system_metrics())
            # NOTE(review): gated on num_test_step but logged with
            # step=self.num_step — confirm which counter was intended.
            try_mlflow_log(log_metrics, payload, step=self.num_step)
예제 #12
0
    def on_epoch_end(self, epoch, logs=None):
        """Log epoch metrics together with GPU/CPU system metrics.

        Args:
            epoch (TYPE): Index of the epoch that just ended; cached on
                ``self.current_epoch``.
            logs (None, optional): Metrics reported by Keras for the
                epoch; nothing is logged when empty.

        Returns:
            TYPE: None.
        """
        self.current_epoch = epoch
        if not logs:
            return
        # Merge GPU and CPU system metrics into a copy of the logs.
        payload = copy.deepcopy(logs)
        payload.update(gpu_metrics())
        payload.update(system_metrics())
        try_mlflow_log(log_metrics, payload, step=self.num_step)
예제 #13
0
    def on_epoch_end(self, trainer, pl_module):
        """Log epoch metrics together with GPU/CPU system metrics.

        Args:
            trainer (TYPE): Lightning trainer; metrics are read from its
                logger connector.
            pl_module (None, optional): Unused; kept for the callback API.

        Returns:
            TYPE: None.
        """
        logs = trainer.logger_connector.callback_metrics

        # Merge GPU and CPU system metrics into a copy of the logs.
        payload = copy.deepcopy(logs)
        payload.update(gpu_metrics())
        payload.update(system_metrics())

        try_mlflow_log(log_metrics, payload, step=self.num_step)
        print('end of epoch')
예제 #14
0
    def on_train_start(self, trainer, pl_module):  # pylint: disable=unused-arg

        """Log optimizer hyper-parameters and the model summary at train start.

        Args:
            trainer (TYPE): Unused; kept for the Lightning callback API.
            pl_module (None, optional): Lightning module whose optimizer
                and summary are logged.

        Returns:
            TYPE: None.
        """

        optimizer = pl_module.configure_optimizers()
        # NOTE(review): assumes configure_optimizers() returns a single
        # optimizer object — Lightning also allows lists/tuples; confirm
        # against the modules used with this callback.
        try_mlflow_log(log_param, 'optimizer_name',
                       optimizer.__class__.__name__)
        lr = optimizer.param_groups[0]['lr']
        try_mlflow_log(log_param, 'learning_rate', lr)
        print('learning rate value is ', lr)
        # Not every optimizer defines 'eps' (e.g. plain SGD); skip it
        # instead of raising KeyError.
        epsilon = optimizer.param_groups[0].get('eps')
        if epsilon is not None:
            try_mlflow_log(log_param, 'epsilon', epsilon)
            print('epsilon value is ', epsilon)

        summary = str(pl_module.summarize())

        # TemporaryDirectory guarantees cleanup even if logging raises.
        with tempfile.TemporaryDirectory() as tempdir:
            summary_file = os.path.join(tempdir, 'model_summary.txt')
            with open(summary_file, 'w') as f:
                f.write(summary)
            try_mlflow_log(
                log_artifact, key='model_summary.txt', path=summary_file)
예제 #15
0
 def on_test_end(self, logs=None):
     """Log the final test metrics, if any were produced.

     Args:
         logs (None, optional): Final test metrics; nothing is logged
             when empty.
     """
     if logs:
         try_mlflow_log(log_metrics, logs, step=self.num_step)