Example #1
    def _get_num_workers(self):
        """Check horovod, smdataparallel, and torch.distributed."""
        # Try torch.distributed
        # torch.distributed is empty on Mac on Torch <= 1.2
        if hasattr(dist, "is_initialized") and dist.is_initialized():
            return torch.distributed.get_world_size()
        else:
            # Try horovod
            try:
                import horovod.torch as hvd

                if hvd.size():
                    return hvd.size()
            except (ModuleNotFoundError, ValueError, ImportError):
                pass

            # Try smdataparallel
            # smdistributed.dataparallel should be invoked via `mpirun`.
            # It supports EC2 machines with 8 GPUs per machine.
            if check_smdataparallel_env():
                try:
                    import smdistributed.dataparallel.torch.distributed as smdataparallel

                    if smdataparallel.get_world_size():
                        return smdataparallel.get_world_size()
                except (ModuleNotFoundError, ValueError, ImportError):
                    pass
        # Return default
        return 1
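A minimal usage sketch (hypothetical, not part of the original class: `hook` stands for an instance of the class that defines _get_num_workers, and per_device_batch_size is a placeholder) showing how the detected worker count is typically consumed:

num_workers = hook._get_num_workers()
global_batch_size = per_device_batch_size * num_workers
print("detected {} worker(s); effective global batch size: {}".format(
    num_workers, global_batch_size))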
Example #2
 def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
     warnings.warn(
         "SequentialDistributedSampler is deprecated and will be removed in v5 of Transformers.",
         FutureWarning,
     )
     if num_replicas is None:
         if not dist.is_available():
             raise RuntimeError(
                 "Requires distributed package to be available")
         num_replicas = dist.get_world_size()
     if rank is None:
         if not dist.is_available():
             raise RuntimeError(
                 "Requires distributed package to be available")
         rank = dist.get_rank()
     self.dataset = dataset
     self.num_replicas = num_replicas
     self.rank = rank
     num_samples = len(self.dataset)
     # Add extra samples to make num_samples a multiple of batch_size if passed
     if batch_size is not None:
         self.num_samples = int(
             math.ceil(num_samples /
                       (batch_size * num_replicas))) * batch_size
     else:
         self.num_samples = int(math.ceil(num_samples / num_replicas))
     self.total_size = self.num_samples * self.num_replicas
     self.batch_size = batch_size
Example #3
def get_world_size():
    """
    Gets total number of distributed workers or returns one if distributed is
    not initialized.
    """
    if dist.is_available() and dist.is_initialized():
        world_size = dist.get_world_size()
    else:
        world_size = 1
    return world_size
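A minimal companion sketch, assuming a CUDA/NCCL setup (the helper below is illustrative, not part of the source): get_world_size() lets callers skip collectives when distributed training is not initialized, for example when averaging a scalar metric across workers.

import torch
import torch.distributed as dist

def average_scalar(value):
    world_size = get_world_size()
    if world_size < 2:
        return value  # single process: nothing to reduce
    t = torch.tensor(value, dtype=torch.float64, device="cuda")
    dist.all_reduce(t, op=dist.ReduceOp.SUM)  # sum the value contributed by every rank
    return (t / world_size).item()            # divide by the worker count to average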
Example #4
 def world_size(self):
     """
     The number of processes used in parallel.
     """
     if is_torch_tpu_available():
         return xm.xrt_world_size()
     elif is_sagemaker_distributed_available():
         return sm_dist.get_world_size()
     elif self.local_rank != -1:
         return torch.distributed.get_world_size()
     return 1
Example #5
    def __init__(
        self,
        batch_size: int,
        dataset: Optional[Dataset] = None,
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        seed: int = 0,
        drop_last: bool = False,
        lengths: Optional[List[int]] = None,
        model_input_name: Optional[str] = None,
    ):
        if dataset is None and lengths is None:
            raise ValueError("One of dataset and lengths must be provided.")
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError(
                    "Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError(
                    "Requires distributed package to be available")
            rank = dist.get_rank()

        self.batch_size = batch_size
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        self.drop_last = drop_last

        if lengths is None:
            model_input_name = model_input_name if model_input_name is not None else "input_ids"
            if (not (isinstance(dataset[0], dict)
                     or isinstance(dataset[0], BatchEncoding))
                    or model_input_name not in dataset[0]):
                raise ValueError(
                    "Can only automatically infer lengths for datasets whose items are dictionaries with an "
                    f"'{model_input_name}' key.")
            lengths = [len(feature[model_input_name]) for feature in dataset]
        self.lengths = lengths

        # If the dataset length is evenly divisible by # of replicas, then there
        # is no need to drop any data, since the dataset will be split equally.
        if self.drop_last and len(self.lengths) % self.num_replicas != 0:
            # Split to nearest available length that is evenly divisible.
            # This is to ensure each rank receives the same amount of data when
            # using this Sampler.
            self.num_samples = math.ceil(
                (len(self.lengths) - self.num_replicas) / self.num_replicas)
        else:
            self.num_samples = math.ceil(len(self.lengths) / self.num_replicas)
        self.total_size = self.num_samples * self.num_replicas
        self.seed = seed
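A minimal construction sketch, assuming this __init__ belongs to a distributed length-grouped batch sampler (called DistributedLengthGroupedSampler below purely for illustration) and that `tokenized_dataset` yields dicts containing an "input_ids" key, so lengths are inferred automatically:

sampler = DistributedLengthGroupedSampler(
    batch_size=8,
    dataset=tokenized_dataset,
    num_replicas=dist.get_world_size(),
    rank=dist.get_rank(),
)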
Example #6
def distributed_concat(tensor: "torch.Tensor", num_total_examples: Optional[int] = None) -> torch.Tensor:
    try:
        if isinstance(tensor, (tuple, list)):
            return type(tensor)(distributed_concat(t, num_total_examples) for t in tensor)
        output_tensors = [tensor.clone() for _ in range(dist.get_world_size())]
        dist.all_gather(output_tensors, tensor)
        concat = torch.cat(output_tensors, dim=0)

        # truncate the dummy elements added by SequentialDistributedSampler
        if num_total_examples is not None:
            concat = concat[:num_total_examples]
        return concat
    except AssertionError:
        raise AssertionError("Not currently using distributed training")
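A minimal evaluation sketch (assuming `model`, `eval_dataset`, and `device` are defined, DataLoader is imported from torch.utils.data, and SequentialDistributedSampler from the earlier example shards the data): each rank predicts on its padded shard, then distributed_concat gathers the shards and trims the padding so every rank ends up with exactly len(eval_dataset) predictions.

sampler = SequentialDistributedSampler(
    eval_dataset, num_replicas=dist.get_world_size(), rank=dist.get_rank())
loader = DataLoader(eval_dataset, sampler=sampler, batch_size=32)
preds = []
with torch.no_grad():
    for batch in loader:
        preds.append(model(batch.to(device)))  # placeholder forward pass
preds = distributed_concat(torch.cat(preds, dim=0),
                           num_total_examples=len(eval_dataset))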
Example #7
 def __init__(self, dataset, num_replicas=None, rank=None):
     if num_replicas is None:
         if not dist.is_available():
             raise RuntimeError("Requires distributed package to be available")
         num_replicas = dist.get_world_size()
     if rank is None:
         if not dist.is_available():
             raise RuntimeError("Requires distributed package to be available")
         rank = dist.get_rank()
     self.dataset = dataset
     self.num_replicas = num_replicas
     self.rank = rank
     self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
     self.total_size = self.num_samples * self.num_replicas
Example #8
def dist_setting(args):
    #     args.data_parallel = False

    print("args.data_parallel : {}".format(args.data_parallel))
    print("args.model_parallel : {}".format(args.model_parallel))
    print("args.apex : {}".format(args.apex))

    args.world_size = 1
    args.host_num = args.hosts.index(args.current_host)

    if args.data_parallel:
        args.world_size = sdp.get_world_size()
        args.rank = sdp.get_rank()  # total rank in all hosts
        args.local_rank = sdp.get_local_rank()  # rank per host
    elif args.model_parallel:
        args.world_size = smp.size()
        args.local_rank = smp.local_rank()  # rank per host
        args.rank = smp.rank()
        args.dp_size = smp.dp_size()
        args.dp_rank = smp.dp_rank()
        print(
            "smp.rank() : {}, smp.size() : {}, smp.mp_rank() : {}, smp.local_size() : {}, smp.get_mp_group() : {}, smp.get_dp_group() : {}, smp.local_rank() : {}, smp.dp_size() : {}, smp.dp_rank() : {}"
            .format(smp.rank(), smp.size(), smp.mp_rank(), smp.local_size(),
                    smp.get_mp_group(), smp.get_dp_group(), smp.local_rank(),
                    smp.dp_size(), smp.dp_rank()))
    else:
        args.world_size = len(args.hosts) * args.num_gpus
        if args.local_rank is not None:
            args.rank = args.num_gpus * args.host_num + \
                args.local_rank  # total rank in all hosts

        dist.init_process_group(backend=args.backend,
                                rank=args.rank,
                                world_size=args.world_size)
        logger.info(
            'Initialized the distributed environment: \'{}\' backend on {} nodes. '
            .format(args.backend, dist.get_world_size()) +
            'Current host rank is {}. Number of gpus: {}'.format(
                dist.get_rank(), args.num_gpus))

    print("**** [dist_setting] args.rank : {}".format(args.rank))
    print("args.world_size : {}".format(args.world_size))
    print("Use GPU: {} for training".format(args.local_rank))

    args.lr = args.lr * float(args.world_size)

    args.batch_size //= args.world_size // args.num_gpus
    args.batch_size = max(args.batch_size, 1)

    return args
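A minimal invocation sketch (parse_args is a hypothetical argparse helper that supplies the fields dist_setting reads: hosts, current_host, num_gpus, data_parallel, model_parallel, apex, backend, local_rank, lr, and batch_size):

args = parse_args()
args = dist_setting(args)
print("world_size={}, rank={}, local_rank={}".format(
    args.world_size, args.rank, args.local_rank))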
Example #9
def distributed_broadcast_scalars(
    scalars: List[Union[int, float]], num_total_examples: Optional[int] = None
) -> torch.Tensor:
    try:
        tensorized_scalar = torch.tensor(scalars).cuda()
        output_tensors = [tensorized_scalar.clone() for _ in range(dist.get_world_size())]
        dist.all_gather(output_tensors, tensorized_scalar)
        concat = torch.cat(output_tensors, dim=0)

        # truncate the dummy elements added by SequentialDistributedSampler
        if num_total_examples is not None:
            concat = concat[:num_total_examples]
        return concat
    except AssertionError:
        raise AssertionError("Not currently using distributed training")
Example #10
def get_distributed_worker():
    """Get the rank for horovod or torch distributed. If none of them are being used,
    return None"""
    rank = None
    try:
        import torch.distributed as dist
    except (ImportError, ModuleNotFoundError):
        dist = None
    if dist and hasattr(dist, "is_initialized") and dist.is_initialized():
        rank = dist.get_rank()
    else:
        try:
            import horovod.torch as hvd

            if hvd.size():
                rank = hvd.rank()
        except (ModuleNotFoundError, ValueError, ImportError):
            pass

        try:
            import horovod.tensorflow as hvd

            if hvd.size():
                rank = hvd.rank()
        except (ModuleNotFoundError, ValueError, ImportError):
            pass

        # smdistributed.dataparallel should be invoked via `mpirun`.
        # It supports EC2 machines with 8 GPUs per machine.
        if check_smdataparallel_env():
            try:
                import smdistributed.dataparallel.torch.distributed as smdataparallel

                if smdataparallel.get_world_size():
                    return smdataparallel.get_rank()
            except (ModuleNotFoundError, ValueError, ImportError):
                pass

            try:
                import smdistributed.dataparallel.tensorflow as smdataparallel

                if smdataparallel.size():
                    return smdataparallel.rank()
            except (ModuleNotFoundError, ValueError, ImportError):
                pass
    return rank
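A minimal usage sketch: because get_distributed_worker() returns None outside of distributed training, one guard covers both the single-process and the rank-0 case for work that should happen only once, such as logging or checkpointing.

rank = get_distributed_worker()
if rank is None or rank == 0:
    print("primary process: safe to write logs and checkpoints here")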
Example #11
def get_val_dataloader(dataset, args):
    if args.distributed:
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset,
            num_replicas=herring.get_world_size(),
            rank=herring.get_rank())
    else:
        val_sampler = None

    val_dataloader = DataLoader(
        dataset,
        batch_size=args.eval_batch_size,
        shuffle=False,  # Note: distributed sampler is shuffled :(
        sampler=val_sampler,
        num_workers=args.num_workers)

    return val_dataloader
Example #12
def total_processes_number(local_rank):
    """
    Return the number of processes launched in parallel. Works with `torch.distributed` and TPUs.
    """
    if is_torch_tpu_available():
        import torch_xla.core.xla_model as xm

        return xm.xrt_world_size()
    elif is_sagemaker_dp_enabled():
        import smdistributed.dataparallel.torch.distributed as dist

        return dist.get_world_size()
    elif local_rank != -1 and is_torch_available():
        import torch

        return torch.distributed.get_world_size()
    return 1
Example #13
 def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
     if num_replicas is None:
         if not dist.is_available():
             raise RuntimeError("Requires distributed package to be available")
         num_replicas = dist.get_world_size()
     if rank is None:
         if not dist.is_available():
             raise RuntimeError("Requires distributed package to be available")
         rank = dist.get_rank()
     self.dataset = dataset
     self.num_replicas = num_replicas
     self.rank = rank
     num_samples = len(self.dataset)
     # Add extra samples to make num_samples a multiple of batch_size if passed
     if batch_size is not None:
         self.num_samples = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size
     else:
         self.num_samples = int(math.ceil(num_samples / num_replicas))
     self.total_size = self.num_samples * self.num_replicas
     self.batch_size = batch_size
Example #14
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank
    0 has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k in sorted(loss_dict.keys()):
            loss_names.append(k)
            all_losses.append(loss_dict[k])
        all_losses = torch.stack(all_losses, dim=0)
        # dist.reduce(all_losses, dst=0) would accumulate only on rank 0;
        # herring.all_reduce sums across every rank instead, so each process
        # holds the accumulated losses and divides by world_size to average.
        herring.all_reduce(all_losses)
        all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
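A minimal logging sketch (assuming `loss_dict` maps loss names to scalar tensors and herring exposes get_rank() as in the surrounding examples): reduce once per logging step and print only on rank 0 so the averaged values are reported a single time.

losses_reduced = reduce_loss_dict(loss_dict)
if herring.get_rank() == 0:
    total_loss = sum(loss for loss in losses_reduced.values())
    print("total loss: {:.4f}".format(total_loss.item()))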
Example #15
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=herring.get_local_rank())
    parser.add_argument("--seed",
                        help="manually set random seed for torch",
                        type=int,
                        default=99)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    parser.add_argument(
        "--bucket-cap-mb",
        dest="bucket_cap_mb",
        help="specify bucket size for herring",
        default=25,
        type=int,
    )
    parser.add_argument("--data-dir",
                        dest="data_dir",
                        help="Absolute path of dataset ",
                        type=str,
                        default=None)

    args = parser.parse_args()

    # Set seed to reduce randomness
    random.seed(args.seed + herring.get_local_rank())
    np.random.seed(args.seed + herring.get_local_rank())
    torch.manual_seed(args.seed + herring.get_local_rank())
    torch.cuda.manual_seed(args.seed + herring.get_local_rank())

    # num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    num_gpus = herring.get_world_size()
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        # torch.distributed.init_process_group(
        #     backend="nccl", init_method="env://"
        # )
        #synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, herring.get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args)

    if not args.skip_test:
        if not cfg.PER_EPOCH_EVAL:
            test_model(cfg, model, args)
Example #16
def main():
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=dist.get_local_rank())
    parser.add_argument(
        "--seed",
        help="manually set random seed for torch",
        type=int,
        default=99
    )
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    parser.add_argument(
        "--bucket-cap-mb",
        dest="bucket_cap_mb",
        help="specify bucket size for SMDataParallel",
        default=25,
        type=int,
    )
    parser.add_argument(
        "--data-dir",
        dest="data_dir",
        help="Absolute path of dataset ",
        type=str,
        default=None
    )
    parser.add_argument(
        "--dtype",
        dest="dtype"
    )
    parser.add_argument(
        "--spot_ckpt",
        default=None
    )


    args = parser.parse_args()
    keys = list(os.environ.keys())
    args.data_dir = os.environ['SM_CHANNEL_TRAIN'] if 'SM_CHANNEL_TRAIN' in keys else args.data_dir
    print("dataset dir: ", args.data_dir)


    # Set seed to reduce randomness
    random.seed(args.seed + dist.get_local_rank())
    np.random.seed(args.seed + dist.get_local_rank())
    torch.manual_seed(args.seed + dist.get_local_rank())
    torch.cuda.manual_seed(args.seed + dist.get_local_rank())

    # num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    num_gpus = dist.get_world_size()
    args.distributed = num_gpus > 1

    if args.distributed:
        # SMDataParallel: Pin each GPU to a single SMDataParallel process. 
        torch.cuda.set_device(args.local_rank)
        # torch.distributed.init_process_group(
        #     backend="nccl", init_method="env://"
        # )
        #synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.DTYPE = args.dtype
    # grab the checkpoint file to start from, if a spot checkpoint URI was provided
    if args.spot_ckpt:
        os.system(f"aws s3 cp {args.spot_ckpt} /opt/ml/checkpoints/{args.spot_ckpt.split('/')[-1]}")
        cfg.MODEL.WEIGHT = f"/opt/ml/checkpoints/{args.spot_ckpt.split('/')[-1]}"
    cfg.freeze()
    print("CONFIG")
    print(cfg)

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, dist.get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args)

    if not args.skip_test:
        if not cfg.PER_EPOCH_EVAL:
            test_model(cfg, model, args)
Example #17
def get_world_size():
    # if not dist.is_available():
    #     return 1
    # if not dist.is_initialized():
    #     return 1
    return herring.get_world_size()
Example #18
def _get_data_loader(imgs, trn_df, vld_df):

    import albumentations as A
    from albumentations import (
        Rotate, HorizontalFlip, IAAPerspective, ShiftScaleRotate, CLAHE,
        RandomRotate90, Transpose, Blur, OpticalDistortion,
        GridDistortion, HueSaturationValue, IAAAdditiveGaussianNoise,
        GaussNoise, MotionBlur, MedianBlur, RandomBrightnessContrast,
        IAAPiecewiseAffine, IAASharpen, IAAEmboss, Flip, OneOf, Compose)
    from albumentations.pytorch import ToTensor, ToTensorV2

    train_transforms = A.Compose([
        Rotate(20),
        OneOf([
            IAAAdditiveGaussianNoise(),
            GaussNoise(),
        ], p=0.2),
        OneOf([
            MotionBlur(p=.2),
            MedianBlur(blur_limit=3, p=0.1),
            Blur(blur_limit=3, p=0.1),
        ],
              p=0.2),
        ShiftScaleRotate(
            shift_limit=0.0625, scale_limit=0.2, rotate_limit=45, p=0.2),
        OneOf([
            OpticalDistortion(p=0.3),
            GridDistortion(p=.1),
            IAAPiecewiseAffine(p=0.3),
        ],
              p=0.2),
        OneOf([
            CLAHE(clip_limit=2),
            IAASharpen(),
            IAAEmboss(),
            RandomBrightnessContrast(),
        ],
              p=0.3),
        HueSaturationValue(p=0.3),
        ToTensor()
    ],
                                 p=1.0)

    valid_transforms = A.Compose([ToTensor()])

    from torch.utils.data import Dataset, DataLoader
    trn_dataset = BangaliDataset(imgs=imgs,
                                 label_df=trn_df,
                                 transform=train_transforms)
    vld_dataset = BangaliDataset(imgs=imgs,
                                 label_df=vld_df,
                                 transform=valid_transforms)

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    trn_sampler = torch.utils.data.distributed.DistributedSampler(
        trn_dataset,
        num_replicas=world_size,  # shard the dataset across world_size replicas
        rank=rank)

    trn_loader = DataLoader(trn_dataset,
                            shuffle=False,
                            num_workers=8,
                            pin_memory=True,
                            batch_size=BATCH_SIZE,
                            sampler=trn_sampler)

    vld_loader = DataLoader(vld_dataset,
                            shuffle=False,
                            num_workers=NUM_WORKERS,
                            batch_size=BATCH_SIZE)
    return trn_loader, vld_loader
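A minimal training-loop sketch (NUM_EPOCHS and the batch structure are placeholders): since the train DataLoader disables its own shuffling and relies on the DistributedSampler, set_epoch() should be called once per epoch so each epoch reshuffles the per-rank shard deterministically.

trn_loader, vld_loader = _get_data_loader(imgs, trn_df, vld_df)
for epoch in range(NUM_EPOCHS):
    trn_loader.sampler.set_epoch(epoch)  # vary the shuffling across epochs
    for images, labels in trn_loader:
        ...  # forward / backward / optimizer step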
Example #19
                        default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--model_dir',
                        type=str,
                        default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--num_gpus',
                        type=int,
                        default=os.environ['SM_NUM_GPUS'])

    args = parser.parse_args()
    return args


if __name__ == '__main__':

    #parse arguments
    args = parser_args()

    args.world_size = dist.get_world_size()
    args.rank = dist.get_rank()
    args.local_rank = dist.get_local_rank()
    #print(f"rank={args.rank}, local_rank={args.local_rank}")
    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)

    args.use_cuda = args.num_gpus > 0
    print("args.use_cuda : {} , args.num_gpus : {}".format(
        args.use_cuda, args.num_gpus))
    args.device = torch.device("cuda" if args.use_cuda else "cpu")

    train_model(args)
Example #20
def train(train_loop_func, logger, args):
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda
    train_samples = 118287

    # Setup multi-GPU if necessary
    args.distributed = False
    # if 'WORLD_SIZE' in os.environ:
    #     args.distributed = int(os.environ['WORLD_SIZE']) > 1

    num_gpus = herring.get_world_size()
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        # torch.distributed.init_process_group(backend='nccl', init_method='env://')
        # args.N_gpu = torch.distributed.get_world_size()
        args.N_gpu = herring.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(10000)

    if args.distributed:
        # args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
        args.seed = (args.seed + herring.get_rank()) % 2 ** 32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(seed=args.seed)


    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    train_loader = get_train_loader(args, args.seed - 2**31)

    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
                                    momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
    if args.amp:
        ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2')

    if args.distributed:
        ssd300 = DDP(ssd300, broadcast_buffers=False)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300.module if args.distributed else ssd300, args.checkpoint)
            checkpoint = torch.load(args.checkpoint,
                                    map_location=lambda storage, loc: storage.cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))

        return
    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer, train_loader, val_dataloader, encoder, iteration,
                                    logger, args, mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        if herring.get_rank() == 0:
            throughput = train_samples / end_epoch_time
            logger.update_epoch_time(epoch, end_epoch_time)
            logger.update_throughput_speed(epoch, throughput)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)

            if args.local_rank == 0:
                logger.update_epoch(epoch, acc)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {'epoch': epoch + 1,
                   'iteration': iteration,
                   'optimizer': optimizer.state_dict(),
                   'scheduler': scheduler.state_dict(),
                   'label_map': val_dataset.label_info}
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            save_path = os.path.join(args.save, f'epoch_{epoch}.pt')
            torch.save(obj, save_path)
            logger.log('model path', save_path)
        train_loader.reset()
    if herring.get_rank() == 0:
        DLLogger.log((), { 'Total training time': '%.2f' % total_time + ' secs' })
        logger.log_summary()
Example #21
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=64,
        metavar="N",
        help="input batch size for training (default: 64)",
    )
    parser.add_argument(
        "--test-batch-size",
        type=int,
        default=1000,
        metavar="N",
        help="input batch size for testing (default: 1000)",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=14,
        metavar="N",
        help="number of epochs to train (default: 14)",
    )
    parser.add_argument("--lr",
                        type=float,
                        default=1.0,
                        metavar="LR",
                        help="learning rate (default: 1.0)")
    parser.add_argument(
        "--gamma",
        type=float,
        default=0.7,
        metavar="M",
        help="Learning rate step gamma (default: 0.7)",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    parser.add_argument("--save-model",
                        action="store_true",
                        default=False,
                        help="For Saving the current Model")
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="For displaying smdistributed.dataparallel-specific logs",
    )
    parser.add_argument(
        "--data-path",
        type=str,
        default="/tmp/data",
        help="Path for downloading "
        "the MNIST dataset",
    )

    args = parser.parse_args()
    args.world_size = dist.get_world_size()
    args.rank = rank = dist.get_rank()
    args.local_rank = local_rank = dist.get_local_rank()
    args.lr = 1.0
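    # world_size // 8 is the number of nodes on the 8-GPU instances that
    # smdistributed.dataparallel targets, so the per-GPU batch size shrinks
    # as nodes are added and the global batch size stays roughly constant.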
    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)
    data_path = args.data_path

    if args.verbose:
        print(
            "Hello from rank",
            rank,
            "of local_rank",
            local_rank,
            "in world size of",
            args.world_size,
        )

    if not torch.cuda.is_available():
        raise CUDANotFoundException(
            "Must run smdistributed.dataparallel MNIST example on CUDA-capable devices."
        )

    torch.manual_seed(args.seed)

    device = torch.device("cuda")

    # select a single rank per node to download data
    is_first_local_rank = local_rank == 0
    if is_first_local_rank:
        train_dataset = datasets.MNIST(
            data_path,
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        )
    dist.barrier()  # prevent other ranks from accessing the data early
    if not is_first_local_rank:
        train_dataset = datasets.MNIST(
            data_path,
            train=True,
            download=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        )

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
        sampler=train_sampler,
    )
    if rank == 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(
                data_path,
                train=False,
                transform=transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307, ), (0.3081, ))
                ]),
            ),
            batch_size=args.test_batch_size,
            shuffle=True,
        )

    model = DDP(Net().to(device))
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        if rank == 0:
            test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
Example #22
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=14,
                        metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr',
                        type=float,
                        default=1.0,
                        metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.7,
                        metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    parser.add_argument('--verbose',
                        action='store_true',
                        default=False,
                        help='For displaying SM Data Parallel-specific logs')
    parser.add_argument('--data-path',
                        type=str,
                        default='/tmp/data',
                        help='Path for downloading '
                        'the MNIST dataset')

    args = parser.parse_args()
    args.world_size = dist.get_world_size()
    args.rank = rank = dist.get_rank()
    args.local_rank = local_rank = dist.get_local_rank()
    args.lr = 1.0
    args.batch_size //= args.world_size // 8
    args.batch_size = max(args.batch_size, 1)
    data_path = args.data_path

    if args.verbose:
        print('Hello from rank', rank, 'of local_rank', local_rank,
              'in world size of', args.world_size)

    if not torch.cuda.is_available():
        raise Exception(
            "Must run SM Distributed DataParallel MNIST example on CUDA-capable devices."
        )

    torch.manual_seed(args.seed)

    device = torch.device("cuda")

    if local_rank == 0:
        train_dataset = datasets.MNIST(data_path,
                                       train=True,
                                       download=True,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307, ),
                                                                (0.3081, ))
                                       ]))
    else:
        time.sleep(8)
        train_dataset = datasets.MNIST(data_path,
                                       train=True,
                                       download=False,
                                       transform=transforms.Compose([
                                           transforms.ToTensor(),
                                           transforms.Normalize((0.1307, ),
                                                                (0.3081, ))
                                       ]))

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)
    if rank == 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(data_path,
                           train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307, ), (0.3081, ))
                           ])),
            batch_size=args.test_batch_size,
            shuffle=True)

    model = DDP(Net().to(device))
    torch.cuda.set_device(local_rank)
    model.cuda(local_rank)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        if rank == 0:
            test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
Example #23
def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0, data_dir=None):
    num_gpus = get_world_size()
    if is_train:
        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
        print("batch size ", cfg.SOLVER.IMS_PER_BATCH, "num_gpus, ", num_gpus, images_per_batch % num_gpus)
        assert images_per_batch % num_gpus == 0, (
            "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = True
        num_iters = cfg.SOLVER.MAX_ITER
    else:
        images_per_batch = cfg.TEST.IMS_PER_BATCH
        assert images_per_batch % num_gpus == 0, (
            "TEST.IMS_PER_BATCH ({}) must be divisible by the number "
            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
        )
        images_per_gpu = images_per_batch // num_gpus
        shuffle = is_distributed
        num_iters = None
        start_iter = 0

    if images_per_gpu > 1:
        logger = logging.getLogger(__name__)
        logger.warning(
            "When using more than one image per GPU you may encounter "
            "an out-of-memory (OOM) error if your GPU does not have "
            "sufficient memory. If this happens, you can reduce "
            "SOLVER.IMS_PER_BATCH (for training) or "
            "TEST.IMS_PER_BATCH (for inference). For training, you must "
            "also adjust the learning rate and schedule length according "
            "to the linear scaling rule. See for example: "
            "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
        )

    # group images which have similar aspect ratio. In this case, we only
    # group in two cases: those with width / height > 1, and the other way around,
    # but the code supports more general grouping strategy
    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

    paths_catalog = import_file(
        "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True
    )
    DatasetCatalog = paths_catalog.DatasetCatalog
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST

    transforms = build_transforms(cfg, is_train)
    datasets, epoch_size = build_dataset(dataset_list, transforms, DatasetCatalog, data_dir, is_train)
    print("total_dataset_size: ", epoch_size)

    data_loaders = []
    for dataset in datasets:
        sampler = make_data_sampler(dataset, shuffle, is_distributed)
        batch_sampler = make_batch_data_sampler(
            dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter
        )
        collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
        num_workers = cfg.DATALOADER.NUM_WORKERS
        data_loader = torch.utils.data.DataLoader(
            dataset,
            num_workers=num_workers,
            batch_sampler=batch_sampler,
            collate_fn=collator,
        )
        data_loaders.append(data_loader)
    if is_train:
        # during training, a single (possibly concatenated) data_loader is returned
        assert len(data_loaders) == 1
        iterations_per_epoch = epoch_size // images_per_batch
        return data_loaders[0], iterations_per_epoch
    return data_loaders
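A minimal usage sketch (assuming a maskrcnn-benchmark style `cfg` has been merged and frozen, and `args` carries `distributed` and `data_dir` as in the training scripts above): in training mode the function returns one loader plus the number of iterations per epoch.

train_loader, iters_per_epoch = make_data_loader(
    cfg, is_train=True, is_distributed=args.distributed,
    start_iter=0, data_dir=args.data_dir)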
Example #24
    # SageMaker Container environment
    parser.add_argument('--model-dir', type=str, default='../model')
    parser.add_argument('--data-dir', type=str, default='../data')

    args = parser.parse_args()

    try:
        args.model_dir = os.environ['SM_MODEL_DIR']
        args.data_dir = os.environ['SM_CHANNEL_TRAINING']
    except KeyError:
        print(
            "SageMaker environment variables not found; "
            "training runs on the local host outside of a SageMaker TrainingJob."
        )
        if not os.path.exists(args.model_dir):
            os.makedirs(args.model_dir)

    ########################################################
    ####### 2. SageMaker Distributed Data Parallel   #######
    #######  - Get all number of GPU and rank number #######
    ########################################################

    args.world_size = smdp.get_world_size()  # all number of GPU
    args.rank = smdp.get_rank()  # total rank in all hosts
    args.local_rank = smdp.get_local_rank()  # rank per host

    ########################################################

    train(args)