def _train_internal(self, config, gpu_num, update_config, tensorboard_dir):
    tools_dir = self._get_tools_dir()

    if tensorboard_dir is not None:
        update_config += f' data.tb_log_dir {tensorboard_dir}'

    if get_cuda_device_count() > 0:
        logging.info('Training on GPUs started ...')
        available_gpu_num = get_cuda_device_count()
        # Clamp the requested GPU count to what is actually available.
        if available_gpu_num < gpu_num:
            logging.warning(
                f'available_gpu_num < args.gpu_num: {available_gpu_num} < {gpu_num}')
            logging.warning(f'decreased number of gpu to: {available_gpu_num}')
            gpu_num = available_gpu_num
            sys.stdout.flush()
    else:
        gpu_num = 0
        logging.info('Training on CPU started ...')

    run_with_termination(f'python {tools_dir}/main.py'
                         f' --config-file {config}'
                         f' --gpu-num {gpu_num}'
                         f' {update_config}'.split(' '))

    if get_cuda_device_count() > 0:
        logging.info('... training on GPUs completed.')
    else:
        logging.info('... training on CPU completed.')
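# The two helpers used above, get_cuda_device_count() and run_with_termination(),
# are defined elsewhere in the repo. Below is a minimal, hypothetical sketch of
# what they could look like, assuming torch is optional and that the spawned
# training command must not outlive the parent process; the real
# implementations may differ.
import signal
import subprocess
import sys


def get_cuda_device_count():
    """Hypothetical sketch: report the number of visible CUDA devices."""
    try:
        import torch
        return torch.cuda.device_count()
    except ImportError:
        # Assumption: no torch available means CPU-only training.
        return 0


def run_with_termination(args):
    """Hypothetical sketch: run a command, forwarding SIGINT/SIGTERM to it."""
    process = subprocess.Popen(args)

    def _terminate(signum, frame):
        process.terminate()
        sys.exit(1)

    signal.signal(signal.SIGINT, _terminate)
    signal.signal(signal.SIGTERM, _terminate)
    process.wait()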
def _train_internal(self, config, gpu_num, update_config, tensorboard_dir):
    tools_dir = self._get_tools_dir()
    tensorboard_dir = (f' --tensorboard-dir {tensorboard_dir}'
                       if tensorboard_dir is not None else '')
    training_info = {'training_gpu_num': 0}

    if os.getenv('MASTER_ADDR') is not None and os.getenv('MASTER_PORT') is not None:
        # Distributed training is handled by Kubeflow's PyTorchJob at a higher level.
        logging.info('Distributed training started ...')
        run_with_termination(f'python {tools_dir}/train.py'
                             f' --launcher=pytorch'
                             f' {config}'
                             f'{tensorboard_dir}'
                             f'{update_config}'.split(' '))
        logging.info('... distributed training completed.')
    elif torch.cuda.is_available():
        logging.info('Training on GPUs started ...')
        available_gpu_num = torch.cuda.device_count()
        # Clamp the requested GPU count to what is actually available.
        if available_gpu_num < gpu_num:
            logging.warning(
                f'available_gpu_num < args.gpu_num: {available_gpu_num} < {gpu_num}')
            logging.warning(f'decreased number of gpu to: {available_gpu_num}')
            gpu_num = available_gpu_num
            sys.stdout.flush()
        run_with_termination(f'{tools_dir}/dist_train.sh'
                             f' {config}'
                             f' {gpu_num}'
                             f'{tensorboard_dir}'
                             f'{update_config}'.split(' '))
        training_info['training_gpu_num'] = gpu_num
        logging.info('... training on GPUs completed.')
    else:
        logging.info('Training on CPU started ...')
        run_with_termination(f'python {tools_dir}/train.py'
                             f' {config}'
                             f'{tensorboard_dir}'
                             f'{update_config}'.split(' '))
        logging.info('... training on CPU completed.')

    return training_info
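# A minimal, self-contained sketch of the dispatch logic in the second
# _train_internal variant. select_launch_mode is a name introduced here for
# illustration only; the point is that a Kubeflow PyTorchJob injects
# MASTER_ADDR/MASTER_PORT into every replica, and their presence signals that
# distributed orchestration already happened at a higher level.
import os

import torch


def select_launch_mode():
    if os.getenv('MASTER_ADDR') is not None and os.getenv('MASTER_PORT') is not None:
        return 'distributed'  # train.py --launcher=pytorch
    if torch.cuda.is_available():
        return 'gpu'          # dist_train.sh with the clamped GPU count
    return 'cpu'              # plain train.py on the CPU


if __name__ == '__main__':
    print(select_launch_mode())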