def all_reduce_item(value, op='sum'):
    """
    All-reduces a single scalar value if distributed is in use
    """
    if dist.is_available() and dist.is_initialized():
        if op == 'sum' or op == 'mean':
            dop = dist.ReduceOp.SUM
        elif op == 'min':
            dop = dist.ReduceOp.MIN
        elif op == 'max':
            dop = dist.ReduceOp.MAX
        elif op == 'product':
            dop = dist.ReduceOp.PRODUCT
        else:
            raise RuntimeError('Unsupported reduce op')

        # backend = dist.get_backend()
        # if backend == dist.Backend.NCCL:
        #     device = torch.device('cuda')
        # elif backend == dist.Backend.GLOO:
        #     device = torch.device('cpu')
        # else:
        #     raise RuntimeError('Unsupported distributed backend')

        device = torch.device('cuda')
        tensor = torch.tensor(value, device=device)
        dist.all_reduce(tensor, dop)
        if op == 'mean':
            tensor /= get_world_size()
        ret = tensor.item()
    else:
        ret = value
    return ret
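# Usage sketch (illustrative, not from the original source): average a per-rank
# scalar loss across all workers at the end of an epoch. `running_loss` and
# `num_batches` are hypothetical values computed locally on each rank.
def _log_mean_epoch_loss(running_loss, num_batches):
    epoch_loss = running_loss / num_batches               # local scalar on this rank
    epoch_loss = all_reduce_item(epoch_loss, op='mean')   # averaged over the world size
    if get_rank() == 0:
        print('mean epoch loss across {} workers: {:.4f}'.format(
            get_world_size(), epoch_loss))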
def get_rank():
    """
    Gets distributed rank or returns zero if distributed is not initialized.
    """
    if dist.is_available() and dist.is_initialized():
        rank = dist.get_rank()
    else:
        rank = 0
    return rank
def get_world_size():
    """
    Gets total number of distributed workers or returns one if distributed is not initialized.
    """
    if dist.is_available() and dist.is_initialized():
        world_size = dist.get_world_size()
    else:
        world_size = 1
    return world_size
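# Usage sketch (illustrative): these helpers let the same script run with or
# without torch.distributed. A common pattern is to shard work by rank and log
# only from rank 0. `items` here is a hypothetical list of work units.
def _shard_for_this_rank(items):
    rank = get_rank()
    world_size = get_world_size()
    shard = items[rank::world_size]        # simple round-robin shard per worker
    if rank == 0:
        print('sharding {} items across {} workers'.format(len(items), world_size))
    return shard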
def dist_init(fn, args):
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True

        if cudnn.deterministic:
            warnings.warn('You have chosen to seed training. '
                          'This will turn on the CUDNN deterministic setting, '
                          'which can slow down your training considerably! '
                          'You may see unexpected behavior when restarting '
                          'from checkpoints.')

    args.is_distributed = len(args.hosts) > 1 and args.backend is not None
    args.is_multigpus = args.num_gpus > 1
    args.multigpus_distributed = (args.is_distributed or args.is_multigpus)

    logger.debug("multigpus_distributed - {}".format(args.multigpus_distributed))
    logger.debug("Number of gpus available - {}".format(args.num_gpus))

    # print("######### Start Training #########")

    if args.multigpus_distributed:
        if args.apex:
            # Initialize the distributed environment.
            mp.spawn(fn, nprocs=args.num_gpus, args=(args,))
        else:
            if args.data_parallel and not sdp.is_initialized():
                sdp.init_process_group()
            elif args.model_parallel and not smp.is_initialized():
                smp.init()
            fn(None, args)
            if args.model_parallel:
                smp.barrier()
    else:
        fn(0, args)
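# Usage sketch (illustrative, not from the original source). `dist_init` takes a
# worker function and an argparse-style namespace with the fields referenced
# above (seed, hosts, backend, num_gpus, apex, data_parallel, model_parallel).
# The values below are hypothetical single-node defaults.
from types import SimpleNamespace

def _example_worker(local_rank, args):
    # A real worker would build the model, wrap it for data/model parallelism,
    # and run the training loop.
    logger.debug("worker started with local_rank={}".format(local_rank))

_example_args = SimpleNamespace(
    seed=None, hosts=['algo-1'], backend=None, num_gpus=1,
    apex=False, data_parallel=False, model_parallel=False)
# dist_init(_example_worker, _example_args)  # uncomment to launch the worker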
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time
from argparse import ArgumentParser

import torch
import numpy as np
from torch.optim.lr_scheduler import MultiStepLR

import smdistributed.dataparallel.torch.distributed as herring

if not herring.is_initialized():
    herring.init_process_group()

import torch.utils.data.distributed

from src.model import SSD300, ResNet, Loss
from src.utils import dboxes300_coco, Encoder
from src.logger import Logger, BenchLogger
from src.evaluate import evaluate
from src.train import train_loop, tencent_trick, load_checkpoint, benchmark_train_loop, benchmark_inference_loop
from src.data import get_train_loader, get_val_dataset, get_val_dataloader, get_coco_ground_truth

import dllogger as DLLogger

# Apex imports
try:
from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
from maskrcnn_benchmark.utils.collect_env import collect_env_info
from maskrcnn_benchmark.utils.comm import synchronize, get_rank, is_main_process
from maskrcnn_benchmark.utils.imports import import_file
from maskrcnn_benchmark.utils.logger import setup_logger
from maskrcnn_benchmark.utils.miscellaneous import mkdir
from maskrcnn_benchmark.engine.tester import test

# Import SMDataParallel modules for PyTorch.
from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP
import smdistributed.dataparallel.torch.distributed as dist

# SMDataParallel: Initialize (guarded so the process group is only created once)
if not dist.is_initialized():
    dist.init_process_group()

# See if we can use apex.DistributedDataParallel instead of the torch default,
# and enable mixed-precision via apex.amp
try:
    from apex import amp
    use_amp = True
except ImportError:
    print('APEX not found; install apex to enable mixed precision via apex.amp')
    use_amp = False

# try:
#     from apex.parallel import DistributedDataParallel as DDP
#     use_apex_ddp = True
# except ImportError:
def barrier():
    """
    Call dist.barrier() if distributed is in use
    """
    if dist.is_available() and dist.is_initialized():
        dist.barrier()
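# Usage sketch (illustrative): save a checkpoint from rank 0 only, then hold all
# ranks at the barrier so no worker reads a half-written file. `model` and `path`
# are hypothetical.
def _save_checkpoint_rank0(model, path):
    if get_rank() == 0:
        torch.save(model.state_dict(), path)
    barrier()  # other ranks wait here until rank 0 has finished writing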
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
from torchvision import datasets, transforms

########################################################
#######  1. SageMaker Distributed Data Parallel  #######
#######   - Import Package and Initialization    #######
########################################################
import smdistributed.dataparallel.torch.distributed as smdp
from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as smdpDDP

if not smdp.is_initialized():
    smdp.init_process_group()
#######################################################

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


# Based on https://github.com/pytorch/examples/blob/master/mnist/main.py
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)