Example #1
    def _train_internal(self, config, gpu_num, update_config, tensorboard_dir):
        tools_dir = self._get_tools_dir()
        if tensorboard_dir is not None:
            update_config += f' data.tb_log_dir {tensorboard_dir}'

        if get_cuda_device_count() > 0:
            logging.info('Training on GPUs started ...')
            available_gpu_num = get_cuda_device_count()
            if available_gpu_num < gpu_num:
                # Cap the requested GPU count at the number of devices actually available.
                logging.warning(
                    f'available_gpu_num < args.gpu_num: {available_gpu_num} < {gpu_num}'
                )
                logging.warning(
                    f'decreased number of gpu to: {available_gpu_num}')
                gpu_num = available_gpu_num
                sys.stdout.flush()
        else:
            gpu_num = 0
            logging.info('Training on CPU started ...')

        # Launch the training entry point with the resolved GPU count.
        run_with_termination(f'python {tools_dir}/main.py'
                             f' --config-file {config}'
                             f' --gpu-num {gpu_num}'
                             f' {update_config}'.split(' '))

        if get_cuda_device_count() > 0:
            logging.info('... training on GPUs completed.')
        else:
            logging.info('... training on CPU completed.')
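
Both examples call run_with_termination, and Example #1 additionally calls get_cuda_device_count; neither helper is defined in the snippets, which also assume module-level imports of logging and sys (plus os and torch in Example #2). A minimal, hypothetical sketch of what those helpers could look like, assuming torch and subprocess are acceptable dependencies; the project's real run_with_termination may do more (for example, forwarding termination signals to the child process):

import subprocess

import torch


def get_cuda_device_count():
    """Number of CUDA devices visible to this process (0 when CUDA is unavailable)."""
    return torch.cuda.device_count() if torch.cuda.is_available() else 0


def run_with_termination(cmd):
    """Run a command given as a list of arguments and fail loudly on a non-zero exit code."""
    subprocess.run(cmd, check=True)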
Example #2
    def _train_internal(self, config, gpu_num, update_config, tensorboard_dir):
        tools_dir = self._get_tools_dir()
        tensorboard_dir = f' --tensorboard-dir {tensorboard_dir}' if tensorboard_dir is not None else ''

        # Report back how many GPUs were actually used for training.
        training_info = {'training_gpu_num': 0}
        if os.getenv('MASTER_ADDR') is not None and os.getenv(
                'MASTER_PORT') is not None:
            # MASTER_ADDR / MASTER_PORT are set by an external launcher
            # (e.g. Kubeflow's PyTorchJob), so run train.py with the PyTorch
            # distributed launcher.
            logging.info('Distributed training started ...')
            run_with_termination(f'python {tools_dir}/train.py'
                                 f' --launcher=pytorch'
                                 f' {config}'
                                 f'{tensorboard_dir}'
                                 f'{update_config}'.split(' '))
            logging.info('... distributed training completed.')
        elif torch.cuda.is_available():
            logging.info('Training on GPUs started ...')
            available_gpu_num = torch.cuda.device_count()
            if available_gpu_num < gpu_num:
                # Cap the requested GPU count at the number of devices actually available.
                logging.warning(
                    f'available_gpu_num < args.gpu_num: {available_gpu_num} < {gpu_num}'
                )
                logging.warning(
                    f'decreased number of gpu to: {available_gpu_num}')
                gpu_num = available_gpu_num
                sys.stdout.flush()
            run_with_termination(f'{tools_dir}/dist_train.sh'
                                 f' {config}'
                                 f' {gpu_num}'
                                 f'{tensorboard_dir}'
                                 f'{update_config}'.split(' '))
            training_info['training_gpu_num'] = gpu_num
            logging.info('... training on GPUs completed.')
        else:
            logging.info('Training on CPU started ...')
            run_with_termination(f'python {tools_dir}/train.py'
                                 f' {config}'
                                 f'{tensorboard_dir}'
                                 f'{update_config}'.split(' '))
            logging.info('... training on CPU completed.')

        return training_info
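
Example #2 picks one of three launch modes: distributed when the rendezvous environment variables are already set by an external launcher, multi-GPU via dist_train.sh otherwise, and CPU as the fallback. The following illustrative sketch isolates just that branch selection; select_training_mode is a hypothetical helper, and MASTER_ADDR / MASTER_PORT are the standard torch.distributed rendezvous variables the code checks:

import os

import torch


def select_training_mode():
    # Mirrors the branch order in _train_internal: distributed first, then GPU, then CPU.
    if os.getenv('MASTER_ADDR') is not None and os.getenv('MASTER_PORT') is not None:
        return 'distributed'
    if torch.cuda.is_available():
        return 'gpu'
    return 'cpu'


print(select_training_mode())  # e.g. 'cpu' on a machine without CUDA and no launcher variables set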